mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
added stub of oai-pmh importer (not working yet)
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6437 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
77c99e500f
commit
30f108f97d
37
htroot/IndexImportOAIPMH_p.html
Normal file
37
htroot/IndexImportOAIPMH_p.html
Normal file
|
@ -0,0 +1,37 @@
|
|||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
  <head>
    <title>YaCy '#[clientname]#': OAI-PMH Import</title>
    #%env/templates/metas.template%#
    #(import)#::<meta http-equiv="REFRESH" content="10" />#(/import)#
  </head>
  <body id="IndexImportOAIPMH">
    #%env/templates/header.template%#
    #%env/templates/submenuIntegration.template%#
    <h2>OAI-PMH Import</h2>

    #(import)#
    <!-- first alternative: no import is running, offer the form that submits the field "oaipmhurl" -->
    <p>#(status)#No import thread is running, you can start a new thread here::Bad input data: #[message]# #(/status)#</p>
    <form action="IndexImportOAIPMH_p.html" method="get">
      <fieldset>
        <legend>OAI-PMH Import: set an OAI-PMH URL</legend>
        <input name="oaipmhurl" type="text" value="" size="80" />
        <input name="submit" type="submit" value="Import from an OAI-PMH source" />
      </fieldset>
    </form>
    ::
    <!-- second alternative: an import is running, show its progress (the page reloads itself, see the meta tag in the head) -->
    <form><fieldset><legend>Import Process</legend>
      <dl>
        <dt>Thread:</dt><dd>#[thread]#</dd>
        <dt>Source:</dt><dd>#[source]#</dd>
        <dt>Processed:</dt><dd>#[count]# Records</dd>
        <dt>Speed:</dt><dd>#[speed]# records per second</dd>
        <dt>Running Time:</dt><dd>#[runningHours]# hours, #[runningMinutes]# minutes</dd>
        <dt>Remaining Time:</dt><dd>#[remainingHours]# hours, #[remainingMinutes]# minutes</dd>
      </dl>
    </fieldset></form>
    #(/import)#

    #%env/templates/footer.template%#
  </body>
</html>
|
86
htroot/IndexImportOAIPMH_p.java
Normal file
86
htroot/IndexImportOAIPMH_p.java
Normal file
|
@ -0,0 +1,86 @@
|
|||
// IndexImportOAIPMH.java
|
||||
// -------------------------
|
||||
// (C) 2009 by Michael Peter Christen; mc@yacy.net
|
||||
// first published 04.05.2009 on http://yacy.net
|
||||
// Frankfurt, Germany
|
||||
//
|
||||
// $LastChangedDate: 2009-10-11 23:29:18 +0200 (So, 11 Okt 2009) $
|
||||
// $LastChangedRevision: 6400 $
|
||||
// $LastChangedBy: orbiter $
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
import java.io.File;
|
||||
import java.net.MalformedURLException;
|
||||
|
||||
import net.yacy.document.importer.OAIPMHImporter;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
|
||||
import de.anomic.http.server.RequestHeader;
|
||||
import de.anomic.search.Switchboard;
|
||||
import de.anomic.server.serverObjects;
|
||||
import de.anomic.server.serverSwitch;
|
||||
|
||||
public class IndexImportOAIPMH_p {
|
||||
|
||||
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
|
||||
final serverObjects prop = new serverObjects();
|
||||
final Switchboard sb = (Switchboard) env;
|
||||
|
||||
if (OAIPMHImporter.job != null && OAIPMHImporter.job.isAlive()) {
|
||||
// one import is running, no option to insert anything
|
||||
prop.put("import", 1);
|
||||
prop.put("import_thread", "running");
|
||||
prop.put("import_source", OAIPMHImporter.job.source());
|
||||
prop.put("import_count", OAIPMHImporter.job.count());
|
||||
prop.put("import_speed", OAIPMHImporter.job.speed());
|
||||
prop.put("import_runningHours", (OAIPMHImporter.job.runningTime() / 60) / 60);
|
||||
prop.put("import_runningMinutes", (OAIPMHImporter.job.runningTime() / 60) % 60);
|
||||
prop.put("import_remainingHours", (OAIPMHImporter.job.remainingTime() / 60) / 60);
|
||||
prop.put("import_remainingMinutes", (OAIPMHImporter.job.remainingTime() / 60) % 60);
|
||||
} else {
|
||||
prop.put("import", 0);
|
||||
if (post == null) {
|
||||
prop.put("import_status", 0);
|
||||
} else {
|
||||
if (post.containsKey("file")) {
|
||||
String oaipmhurl = post.get("oaipmhurl");
|
||||
DigestURI url = null;
|
||||
try {
|
||||
url = new DigestURI(oaipmhurl, null);
|
||||
OAIPMHImporter.job = new OAIPMHImporter(sb.loader, url);
|
||||
OAIPMHImporter.job.start();
|
||||
prop.put("import", 1);
|
||||
prop.put("import_thread", "started");
|
||||
prop.put("import_dump", OAIPMHImporter.job.source());
|
||||
prop.put("import_count", 0);
|
||||
prop.put("import_speed", 0);
|
||||
prop.put("import_runningHours", 0);
|
||||
prop.put("import_runningMinutes", 0);
|
||||
prop.put("import_remainingHours", 0);
|
||||
prop.put("import_remainingMinutes", 0);
|
||||
} catch (MalformedURLException e) {
|
||||
e.printStackTrace();
|
||||
prop.put("import", 0);
|
||||
prop.put("import_status", 1);
|
||||
prop.put("import_status_message", e.getMessage());
|
||||
}
|
||||
}
|
||||
return prop;
|
||||
}
|
||||
}
|
||||
return prop;
|
||||
}
|
||||
}
|
|
@ -25,11 +25,12 @@
|
|||
import java.io.File;
|
||||
import java.net.MalformedURLException;
|
||||
|
||||
import net.yacy.document.importer.MediawikiImporter;
|
||||
|
||||
import de.anomic.http.server.RequestHeader;
|
||||
import de.anomic.search.Switchboard;
|
||||
import de.anomic.server.serverObjects;
|
||||
import de.anomic.server.serverSwitch;
|
||||
import de.anomic.tools.mediawikiIndex;
|
||||
|
||||
public class IndexImportWikimedia_p {
|
||||
|
||||
|
@ -37,17 +38,17 @@ public class IndexImportWikimedia_p {
|
|||
final serverObjects prop = new serverObjects();
|
||||
final Switchboard sb = (Switchboard) env;
|
||||
|
||||
if (mediawikiIndex.job != null && mediawikiIndex.job.isAlive()) {
|
||||
if (MediawikiImporter.job != null && MediawikiImporter.job.isAlive()) {
|
||||
// one import is running, no option to insert anything
|
||||
prop.put("import", 1);
|
||||
prop.put("import_thread", "running");
|
||||
prop.put("import_dump", mediawikiIndex.job.sourcefile.getName());
|
||||
prop.put("import_count", mediawikiIndex.job.count);
|
||||
prop.put("import_speed", mediawikiIndex.job.speed());
|
||||
prop.put("import_runningHours", (mediawikiIndex.job.runningTime() / 60) / 60);
|
||||
prop.put("import_runningMinutes", (mediawikiIndex.job.runningTime() / 60) % 60);
|
||||
prop.put("import_remainingHours", (mediawikiIndex.job.remainingTime() / 60) / 60);
|
||||
prop.put("import_remainingMinutes", (mediawikiIndex.job.remainingTime() / 60) % 60);
|
||||
prop.put("import_dump", MediawikiImporter.job.source());
|
||||
prop.put("import_count", MediawikiImporter.job.count());
|
||||
prop.put("import_speed", MediawikiImporter.job.speed());
|
||||
prop.put("import_runningHours", (MediawikiImporter.job.runningTime() / 60) / 60);
|
||||
prop.put("import_runningMinutes", (MediawikiImporter.job.runningTime() / 60) % 60);
|
||||
prop.put("import_remainingHours", (MediawikiImporter.job.remainingTime() / 60) / 60);
|
||||
prop.put("import_remainingMinutes", (MediawikiImporter.job.remainingTime() / 60) % 60);
|
||||
} else {
|
||||
prop.put("import", 0);
|
||||
if (post == null) {
|
||||
|
@ -64,11 +65,11 @@ public class IndexImportWikimedia_p {
|
|||
}
|
||||
String lang = name.substring(0, 2);
|
||||
try {
|
||||
mediawikiIndex.job = new mediawikiIndex(sourcefile, sb.surrogatesInPath, "http://" + lang + ".wikipedia.org/wiki/");
|
||||
mediawikiIndex.job.start();
|
||||
MediawikiImporter.job = new MediawikiImporter(sourcefile, sb.surrogatesInPath, "http://" + lang + ".wikipedia.org/wiki/");
|
||||
MediawikiImporter.job.start();
|
||||
prop.put("import", 1);
|
||||
prop.put("import_thread", "started");
|
||||
prop.put("import_dump", mediawikiIndex.job.sourcefile.getName());
|
||||
prop.put("import_dump", MediawikiImporter.job.source());
|
||||
prop.put("import_count", 0);
|
||||
prop.put("import_speed", 0);
|
||||
prop.put("import_runningHours", 0);
|
||||
|
|
|
@ -27,11 +27,12 @@
|
|||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
||||
import net.yacy.document.importer.MediawikiImporter;
|
||||
|
||||
import de.anomic.http.server.RequestHeader;
|
||||
import de.anomic.search.Switchboard;
|
||||
import de.anomic.server.serverObjects;
|
||||
import de.anomic.server.serverSwitch;
|
||||
import de.anomic.tools.mediawikiIndex;
|
||||
|
||||
public class mediawiki_p {
|
||||
|
||||
|
@ -53,12 +54,12 @@ public class mediawiki_p {
|
|||
|
||||
File dumpFile = new File(sb.getRootPath(), "DATA/HTCACHE/mediawiki/" + dump);
|
||||
if (!dumpFile.exists()) return post;
|
||||
mediawikiIndex.checkIndex(dumpFile);
|
||||
mediawikiIndex.wikisourcerecord w = mediawikiIndex.find(title.replaceAll(" ", "_"), mediawikiIndex.idxFromWikimediaXML(dumpFile));
|
||||
MediawikiImporter.checkIndex(dumpFile);
|
||||
MediawikiImporter.wikisourcerecord w = MediawikiImporter.find(title.replaceAll(" ", "_"), MediawikiImporter.idxFromWikimediaXML(dumpFile));
|
||||
if (w == null) {
|
||||
return post;
|
||||
}
|
||||
String page = new String(mediawikiIndex.read(dumpFile, w.start, (int) (w.end - w.start)), "UTF-8");
|
||||
String page = new String(MediawikiImporter.read(dumpFile, w.start, (int) (w.end - w.start)), "UTF-8");
|
||||
int p = page.indexOf("<text");
|
||||
if (p < 0) return prop;
|
||||
p = page.indexOf('>', p);
|
||||
|
|
40
source/net/yacy/document/importer/Importer.java
Normal file
40
source/net/yacy/document/importer/Importer.java
Normal file
|
@ -0,0 +1,40 @@
|
|||
package net.yacy.document.importer;
|
||||
|
||||
public interface Importer extends Runnable {
|
||||
|
||||
|
||||
public String source();
|
||||
|
||||
public int count();
|
||||
|
||||
/**
|
||||
* return the number of articles per second
|
||||
* @return
|
||||
*/
|
||||
public int speed();
|
||||
|
||||
/**
|
||||
* return the time this import is already running
|
||||
* @return
|
||||
*/
|
||||
public long runningTime();
|
||||
|
||||
|
||||
/**
|
||||
* return the remaining seconds for the completion of all records in milliseconds
|
||||
* @return
|
||||
*/
|
||||
public long remainingTime();
|
||||
|
||||
|
||||
|
||||
public boolean isAlive();
|
||||
|
||||
public void start();
|
||||
|
||||
/**
|
||||
* the run method from runnable
|
||||
*/
|
||||
public void run();
|
||||
|
||||
}
|
|
@ -24,7 +24,7 @@
|
|||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package de.anomic.tools;
|
||||
package net.yacy.document.importer;
|
||||
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.document.TextParser;
|
||||
|
@ -71,7 +71,7 @@ import de.anomic.data.wiki.wikiParser;
|
|||
* as referenced with xmlns="http://www.mediawiki.org/xml/export-0.3/"
|
||||
*/
|
||||
|
||||
public class mediawikiIndex extends Thread {
|
||||
public class MediawikiImporter extends Thread implements Importer {
|
||||
|
||||
private static final String textstart = "<text";
|
||||
private static final String textend = "</text>";
|
||||
|
@ -79,6 +79,9 @@ public class mediawikiIndex extends Thread {
|
|||
private static final String pageend = "</page>";
|
||||
private static final byte[] pagestartb = pagestart.getBytes();
|
||||
private static final byte[] pageendb = pageend.getBytes();
|
||||
private static final int docspermbinxmlbz2 = 800; // documents per megabyte in a xml.bz2 wikimedia dump
|
||||
|
||||
public static Importer job; // if started from a servlet, this object is used to store the thread
|
||||
|
||||
protected wikiParser wparser;
|
||||
protected String urlStub;
|
||||
|
@ -89,11 +92,8 @@ public class mediawikiIndex extends Thread {
|
|||
private long docsize;
|
||||
private int approxdocs;
|
||||
|
||||
private static final int docspermbinxmlbz2 = 800; // documents per megabyte in a xml.bz2 wikimedia dump
|
||||
|
||||
public static mediawikiIndex job; // if started from a servlet, this object is used to store the thread
|
||||
|
||||
public mediawikiIndex(File sourcefile, File targetdir, String baseURL) throws MalformedURLException {
|
||||
public MediawikiImporter(File sourcefile, File targetdir, String baseURL) throws MalformedURLException {
|
||||
this.sourcefile = sourcefile;
|
||||
this.docsize = sourcefile.length();
|
||||
this.approxdocs = (int) (this.docsize * (long) docspermbinxmlbz2 / 1024L / 1024L);
|
||||
|
@ -104,6 +104,14 @@ public class mediawikiIndex extends Thread {
|
|||
this.start = 0;
|
||||
}
|
||||
|
||||
public int count() {
|
||||
return this.count;
|
||||
}
|
||||
|
||||
public String source() {
|
||||
return this.sourcefile.getAbsolutePath();
|
||||
}
|
||||
|
||||
/**
|
||||
* return the number of articles per second
|
||||
* @return
|
||||
|
@ -738,7 +746,7 @@ public class mediawikiIndex extends Thread {
|
|||
String urlStub = s[3]; // i.e. http://de.wikipedia.org/wiki/
|
||||
//String language = urlStub.substring(7,9);
|
||||
try {
|
||||
mediawikiIndex mi = new mediawikiIndex(sourcefile, targetdir, urlStub);
|
||||
MediawikiImporter mi = new MediawikiImporter(sourcefile, targetdir, urlStub);
|
||||
mi.start();
|
||||
mi.join();
|
||||
} catch (InterruptedException e) {
|
|
@ -24,7 +24,7 @@
|
|||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package de.anomic.crawler;
|
||||
package net.yacy.document.importer;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
|
@ -35,21 +35,56 @@ import net.yacy.document.content.file.SurrogateReader;
|
|||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.repository.LoaderDispatcher;
|
||||
|
||||
import de.anomic.crawler.CrawlProfile;
|
||||
import de.anomic.crawler.retrieval.HTTPLoader;
|
||||
import de.anomic.crawler.retrieval.Request;
|
||||
import de.anomic.crawler.retrieval.Response;
|
||||
|
||||
public class PMHReader {
|
||||
public class OAIPMHImporter extends Thread implements Importer {
|
||||
|
||||
LoaderDispatcher loader;
|
||||
public static Importer job; // if started from a servlet, this object is used to store the thread
|
||||
|
||||
public PMHReader(LoaderDispatcher loader) {
|
||||
private LoaderDispatcher loader;
|
||||
private DigestURI source;
|
||||
private int count;
|
||||
private long startTime;
|
||||
|
||||
/**
 * Prepares an import job; nothing is loaded before the thread is started.
 *
 * @param loader the loader that run() uses to fetch the source
 * @param source the url of the OAI-PMH source
 */
public OAIPMHImporter(LoaderDispatcher loader, DigestURI source) {
    // the start time is taken at construction, not at start()
    this.startTime = System.currentTimeMillis();
    this.count = 0;
    this.source = source;
    this.loader = loader;
}
|
||||
|
||||
|
||||
public int count() {
|
||||
return this.count;
|
||||
}
|
||||
|
||||
public long remainingTime() {
|
||||
return Long.MAX_VALUE; // we don't know
|
||||
}
|
||||
|
||||
public long runningTime() {
|
||||
return System.currentTimeMillis() - this.startTime;
|
||||
}
|
||||
|
||||
public String source() {
|
||||
return source.toNormalform(true, false);
|
||||
}
|
||||
|
||||
public int speed() {
|
||||
return (int) (1000L * ((long) count()) / runningTime());
|
||||
}
|
||||
|
||||
public void load(DigestURI source) throws IOException {
|
||||
Response response = this.loader.load(source, true, true, CrawlProfile.CACHE_STRATEGY_NOCACHE);
|
||||
load(response);
|
||||
public void run() {
|
||||
Response response;
|
||||
try {
|
||||
response = this.loader.load(source, true, true, CrawlProfile.CACHE_STRATEGY_NOCACHE);
|
||||
load(response);
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
public static void load0(DigestURI source) throws IOException {
|
Loading…
Reference in New Issue
Block a user