mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
added stub of oai-pmh importer (not working yet)
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6437 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
77c99e500f
commit
30f108f97d
37
htroot/IndexImportOAIPMH_p.html
Normal file
37
htroot/IndexImportOAIPMH_p.html
Normal file
|
@ -0,0 +1,37 @@
|
||||||
|
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
  <head>
    <title>YaCy '#[clientname]#': OAI-PMH Import</title>
    #%env/templates/metas.template%#
    #(import)#::<meta http-equiv="REFRESH" content="10" />#(/import)#
  </head>
  <body id="IndexImportOAIPMH">
    #%env/templates/header.template%#
    #%env/templates/submenuIntegration.template%#
    <h2>OAI-PMH Import</h2>

    <!--
      The "import" switch selects one of two views:
        first alternative  : no import is running; shows a status line and the form
                             that submits the "oaipmhurl" parameter back to this page
        second alternative : an import is running; shows its progress (the page head
                             adds an auto-refresh every 10 seconds for this view)
    -->
    #(import)#
    <p>#(status)#No import thread is running, you can start a new thread here::Bad input data: #[message]# #(/status)#</p>
    <form action="IndexImportOAIPMH_p.html" method="get">
      <fieldset>
        <legend>OAI-PMH Import: set an OAI-PMH URL</legend>
        <input name="oaipmhurl" type="text" value="" size="80" />
        <input name="submit" type="submit" value="Import from an OAI-PMH source" />
      </fieldset>
    </form>
    ::
    <form><fieldset><legend>Import Process</legend>
      <dl>
        <dt>Thread:</dt><dd>#[thread]#</dd>
        <dt>Source:</dt><dd>#[source]#</dd>
        <dt>Processed:</dt><dd>#[count]# records</dd>
        <dt>Speed:</dt><dd>#[speed]# records per second</dd>
        <dt>Running Time:</dt><dd>#[runningHours]# hours, #[runningMinutes]# minutes</dd>
        <dt>Remaining Time:</dt><dd>#[remainingHours]# hours, #[remainingMinutes]# minutes</dd>
      </dl>
    </fieldset></form>
    #(/import)#

    #%env/templates/footer.template%#
  </body>
</html>
|
86
htroot/IndexImportOAIPMH_p.java
Normal file
86
htroot/IndexImportOAIPMH_p.java
Normal file
|
@ -0,0 +1,86 @@
|
||||||
|
// IndexImportOAIPMH_p.java
|
||||||
|
// -------------------------
|
||||||
|
// (C) 2009 by Michael Peter Christen; mc@yacy.net
|
||||||
|
// first published 04.05.2009 on http://yacy.net
|
||||||
|
// Frankfurt, Germany
|
||||||
|
//
|
||||||
|
// $LastChangedDate: 2009-10-11 23:29:18 +0200 (So, 11 Okt 2009) $
|
||||||
|
// $LastChangedRevision: 6400 $
|
||||||
|
// $LastChangedBy: orbiter $
|
||||||
|
//
|
||||||
|
// This program is free software; you can redistribute it and/or modify
|
||||||
|
// it under the terms of the GNU General Public License as published by
|
||||||
|
// the Free Software Foundation; either version 2 of the License, or
|
||||||
|
// (at your option) any later version.
|
||||||
|
//
|
||||||
|
// This program is distributed in the hope that it will be useful,
|
||||||
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
// GNU General Public License for more details.
|
||||||
|
//
|
||||||
|
// You should have received a copy of the GNU General Public License
|
||||||
|
// along with this program; if not, write to the Free Software
|
||||||
|
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.net.MalformedURLException;
|
||||||
|
|
||||||
|
import net.yacy.document.importer.OAIPMHImporter;
|
||||||
|
import net.yacy.kelondro.data.meta.DigestURI;
|
||||||
|
|
||||||
|
import de.anomic.http.server.RequestHeader;
|
||||||
|
import de.anomic.search.Switchboard;
|
||||||
|
import de.anomic.server.serverObjects;
|
||||||
|
import de.anomic.server.serverSwitch;
|
||||||
|
|
||||||
|
public class IndexImportOAIPMH_p {
|
||||||
|
|
||||||
|
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
|
||||||
|
final serverObjects prop = new serverObjects();
|
||||||
|
final Switchboard sb = (Switchboard) env;
|
||||||
|
|
||||||
|
if (OAIPMHImporter.job != null && OAIPMHImporter.job.isAlive()) {
|
||||||
|
// one import is running, no option to insert anything
|
||||||
|
prop.put("import", 1);
|
||||||
|
prop.put("import_thread", "running");
|
||||||
|
prop.put("import_source", OAIPMHImporter.job.source());
|
||||||
|
prop.put("import_count", OAIPMHImporter.job.count());
|
||||||
|
prop.put("import_speed", OAIPMHImporter.job.speed());
|
||||||
|
prop.put("import_runningHours", (OAIPMHImporter.job.runningTime() / 60) / 60);
|
||||||
|
prop.put("import_runningMinutes", (OAIPMHImporter.job.runningTime() / 60) % 60);
|
||||||
|
prop.put("import_remainingHours", (OAIPMHImporter.job.remainingTime() / 60) / 60);
|
||||||
|
prop.put("import_remainingMinutes", (OAIPMHImporter.job.remainingTime() / 60) % 60);
|
||||||
|
} else {
|
||||||
|
prop.put("import", 0);
|
||||||
|
if (post == null) {
|
||||||
|
prop.put("import_status", 0);
|
||||||
|
} else {
|
||||||
|
if (post.containsKey("file")) {
|
||||||
|
String oaipmhurl = post.get("oaipmhurl");
|
||||||
|
DigestURI url = null;
|
||||||
|
try {
|
||||||
|
url = new DigestURI(oaipmhurl, null);
|
||||||
|
OAIPMHImporter.job = new OAIPMHImporter(sb.loader, url);
|
||||||
|
OAIPMHImporter.job.start();
|
||||||
|
prop.put("import", 1);
|
||||||
|
prop.put("import_thread", "started");
|
||||||
|
prop.put("import_dump", OAIPMHImporter.job.source());
|
||||||
|
prop.put("import_count", 0);
|
||||||
|
prop.put("import_speed", 0);
|
||||||
|
prop.put("import_runningHours", 0);
|
||||||
|
prop.put("import_runningMinutes", 0);
|
||||||
|
prop.put("import_remainingHours", 0);
|
||||||
|
prop.put("import_remainingMinutes", 0);
|
||||||
|
} catch (MalformedURLException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
prop.put("import", 0);
|
||||||
|
prop.put("import_status", 1);
|
||||||
|
prop.put("import_status_message", e.getMessage());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return prop;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return prop;
|
||||||
|
}
|
||||||
|
}
|
|
@ -25,11 +25,12 @@
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.net.MalformedURLException;
|
import java.net.MalformedURLException;
|
||||||
|
|
||||||
|
import net.yacy.document.importer.MediawikiImporter;
|
||||||
|
|
||||||
import de.anomic.http.server.RequestHeader;
|
import de.anomic.http.server.RequestHeader;
|
||||||
import de.anomic.search.Switchboard;
|
import de.anomic.search.Switchboard;
|
||||||
import de.anomic.server.serverObjects;
|
import de.anomic.server.serverObjects;
|
||||||
import de.anomic.server.serverSwitch;
|
import de.anomic.server.serverSwitch;
|
||||||
import de.anomic.tools.mediawikiIndex;
|
|
||||||
|
|
||||||
public class IndexImportWikimedia_p {
|
public class IndexImportWikimedia_p {
|
||||||
|
|
||||||
|
@ -37,17 +38,17 @@ public class IndexImportWikimedia_p {
|
||||||
final serverObjects prop = new serverObjects();
|
final serverObjects prop = new serverObjects();
|
||||||
final Switchboard sb = (Switchboard) env;
|
final Switchboard sb = (Switchboard) env;
|
||||||
|
|
||||||
if (mediawikiIndex.job != null && mediawikiIndex.job.isAlive()) {
|
if (MediawikiImporter.job != null && MediawikiImporter.job.isAlive()) {
|
||||||
// one import is running, no option to insert anything
|
// one import is running, no option to insert anything
|
||||||
prop.put("import", 1);
|
prop.put("import", 1);
|
||||||
prop.put("import_thread", "running");
|
prop.put("import_thread", "running");
|
||||||
prop.put("import_dump", mediawikiIndex.job.sourcefile.getName());
|
prop.put("import_dump", MediawikiImporter.job.source());
|
||||||
prop.put("import_count", mediawikiIndex.job.count);
|
prop.put("import_count", MediawikiImporter.job.count());
|
||||||
prop.put("import_speed", mediawikiIndex.job.speed());
|
prop.put("import_speed", MediawikiImporter.job.speed());
|
||||||
prop.put("import_runningHours", (mediawikiIndex.job.runningTime() / 60) / 60);
|
prop.put("import_runningHours", (MediawikiImporter.job.runningTime() / 60) / 60);
|
||||||
prop.put("import_runningMinutes", (mediawikiIndex.job.runningTime() / 60) % 60);
|
prop.put("import_runningMinutes", (MediawikiImporter.job.runningTime() / 60) % 60);
|
||||||
prop.put("import_remainingHours", (mediawikiIndex.job.remainingTime() / 60) / 60);
|
prop.put("import_remainingHours", (MediawikiImporter.job.remainingTime() / 60) / 60);
|
||||||
prop.put("import_remainingMinutes", (mediawikiIndex.job.remainingTime() / 60) % 60);
|
prop.put("import_remainingMinutes", (MediawikiImporter.job.remainingTime() / 60) % 60);
|
||||||
} else {
|
} else {
|
||||||
prop.put("import", 0);
|
prop.put("import", 0);
|
||||||
if (post == null) {
|
if (post == null) {
|
||||||
|
@ -64,11 +65,11 @@ public class IndexImportWikimedia_p {
|
||||||
}
|
}
|
||||||
String lang = name.substring(0, 2);
|
String lang = name.substring(0, 2);
|
||||||
try {
|
try {
|
||||||
mediawikiIndex.job = new mediawikiIndex(sourcefile, sb.surrogatesInPath, "http://" + lang + ".wikipedia.org/wiki/");
|
MediawikiImporter.job = new MediawikiImporter(sourcefile, sb.surrogatesInPath, "http://" + lang + ".wikipedia.org/wiki/");
|
||||||
mediawikiIndex.job.start();
|
MediawikiImporter.job.start();
|
||||||
prop.put("import", 1);
|
prop.put("import", 1);
|
||||||
prop.put("import_thread", "started");
|
prop.put("import_thread", "started");
|
||||||
prop.put("import_dump", mediawikiIndex.job.sourcefile.getName());
|
prop.put("import_dump", MediawikiImporter.job.source());
|
||||||
prop.put("import_count", 0);
|
prop.put("import_count", 0);
|
||||||
prop.put("import_speed", 0);
|
prop.put("import_speed", 0);
|
||||||
prop.put("import_runningHours", 0);
|
prop.put("import_runningHours", 0);
|
||||||
|
|
|
@ -27,11 +27,12 @@
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import net.yacy.document.importer.MediawikiImporter;
|
||||||
|
|
||||||
import de.anomic.http.server.RequestHeader;
|
import de.anomic.http.server.RequestHeader;
|
||||||
import de.anomic.search.Switchboard;
|
import de.anomic.search.Switchboard;
|
||||||
import de.anomic.server.serverObjects;
|
import de.anomic.server.serverObjects;
|
||||||
import de.anomic.server.serverSwitch;
|
import de.anomic.server.serverSwitch;
|
||||||
import de.anomic.tools.mediawikiIndex;
|
|
||||||
|
|
||||||
public class mediawiki_p {
|
public class mediawiki_p {
|
||||||
|
|
||||||
|
@ -53,12 +54,12 @@ public class mediawiki_p {
|
||||||
|
|
||||||
File dumpFile = new File(sb.getRootPath(), "DATA/HTCACHE/mediawiki/" + dump);
|
File dumpFile = new File(sb.getRootPath(), "DATA/HTCACHE/mediawiki/" + dump);
|
||||||
if (!dumpFile.exists()) return post;
|
if (!dumpFile.exists()) return post;
|
||||||
mediawikiIndex.checkIndex(dumpFile);
|
MediawikiImporter.checkIndex(dumpFile);
|
||||||
mediawikiIndex.wikisourcerecord w = mediawikiIndex.find(title.replaceAll(" ", "_"), mediawikiIndex.idxFromWikimediaXML(dumpFile));
|
MediawikiImporter.wikisourcerecord w = MediawikiImporter.find(title.replaceAll(" ", "_"), MediawikiImporter.idxFromWikimediaXML(dumpFile));
|
||||||
if (w == null) {
|
if (w == null) {
|
||||||
return post;
|
return post;
|
||||||
}
|
}
|
||||||
String page = new String(mediawikiIndex.read(dumpFile, w.start, (int) (w.end - w.start)), "UTF-8");
|
String page = new String(MediawikiImporter.read(dumpFile, w.start, (int) (w.end - w.start)), "UTF-8");
|
||||||
int p = page.indexOf("<text");
|
int p = page.indexOf("<text");
|
||||||
if (p < 0) return prop;
|
if (p < 0) return prop;
|
||||||
p = page.indexOf('>', p);
|
p = page.indexOf('>', p);
|
||||||
|
|
40
source/net/yacy/document/importer/Importer.java
Normal file
40
source/net/yacy/document/importer/Importer.java
Normal file
|
@ -0,0 +1,40 @@
|
||||||
|
package net.yacy.document.importer;
|
||||||
|
|
||||||
|
/**
 * Common contract of the background import jobs, so that a servlet can start a
 * job and poll its progress without knowing the concrete importer.
 */
public interface Importer extends Runnable {

    /**
     * @return a textual description of the source the import reads from
     */
    String source();

    /**
     * @return the number of entries counted by the import so far
     */
    int count();

    /**
     * @return the number of articles per second
     */
    int speed();

    /**
     * @return the time this import is already running
     *         (NOTE(review): the unit is not stated by this contract - confirm
     *         that all implementations and callers agree on it)
     */
    long runningTime();

    /**
     * @return the remaining time for the completion of all records
     *         (NOTE(review): the original description named both seconds and
     *         milliseconds - confirm the intended unit)
     */
    long remainingTime();

    /**
     * @return whether the import is still alive
     */
    boolean isAlive();

    /**
     * Starts the import.
     */
    void start();

    /**
     * the run method from runnable
     */
    void run();

}
|
|
@ -24,7 +24,7 @@
|
||||||
// along with this program; if not, write to the Free Software
|
// along with this program; if not, write to the Free Software
|
||||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||||
|
|
||||||
package de.anomic.tools;
|
package net.yacy.document.importer;
|
||||||
|
|
||||||
import net.yacy.document.Document;
|
import net.yacy.document.Document;
|
||||||
import net.yacy.document.TextParser;
|
import net.yacy.document.TextParser;
|
||||||
|
@ -71,7 +71,7 @@ import de.anomic.data.wiki.wikiParser;
|
||||||
* as referenced with xmlns="http://www.mediawiki.org/xml/export-0.3/"
|
* as referenced with xmlns="http://www.mediawiki.org/xml/export-0.3/"
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public class mediawikiIndex extends Thread {
|
public class MediawikiImporter extends Thread implements Importer {
|
||||||
|
|
||||||
private static final String textstart = "<text";
|
private static final String textstart = "<text";
|
||||||
private static final String textend = "</text>";
|
private static final String textend = "</text>";
|
||||||
|
@ -79,6 +79,9 @@ public class mediawikiIndex extends Thread {
|
||||||
private static final String pageend = "</page>";
|
private static final String pageend = "</page>";
|
||||||
private static final byte[] pagestartb = pagestart.getBytes();
|
private static final byte[] pagestartb = pagestart.getBytes();
|
||||||
private static final byte[] pageendb = pageend.getBytes();
|
private static final byte[] pageendb = pageend.getBytes();
|
||||||
|
private static final int docspermbinxmlbz2 = 800; // documents per megabyte in a xml.bz2 wikimedia dump
|
||||||
|
|
||||||
|
public static Importer job; // if started from a servlet, this object is used to store the thread
|
||||||
|
|
||||||
protected wikiParser wparser;
|
protected wikiParser wparser;
|
||||||
protected String urlStub;
|
protected String urlStub;
|
||||||
|
@ -89,11 +92,8 @@ public class mediawikiIndex extends Thread {
|
||||||
private long docsize;
|
private long docsize;
|
||||||
private int approxdocs;
|
private int approxdocs;
|
||||||
|
|
||||||
private static final int docspermbinxmlbz2 = 800; // documents per megabyte in a xml.bz2 wikimedia dump
|
|
||||||
|
|
||||||
public static mediawikiIndex job; // if started from a servlet, this object is used to store the thread
|
public MediawikiImporter(File sourcefile, File targetdir, String baseURL) throws MalformedURLException {
|
||||||
|
|
||||||
public mediawikiIndex(File sourcefile, File targetdir, String baseURL) throws MalformedURLException {
|
|
||||||
this.sourcefile = sourcefile;
|
this.sourcefile = sourcefile;
|
||||||
this.docsize = sourcefile.length();
|
this.docsize = sourcefile.length();
|
||||||
this.approxdocs = (int) (this.docsize * (long) docspermbinxmlbz2 / 1024L / 1024L);
|
this.approxdocs = (int) (this.docsize * (long) docspermbinxmlbz2 / 1024L / 1024L);
|
||||||
|
@ -104,6 +104,14 @@ public class mediawikiIndex extends Thread {
|
||||||
this.start = 0;
|
this.start = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public int count() {
|
||||||
|
return this.count;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String source() {
|
||||||
|
return this.sourcefile.getAbsolutePath();
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* return the number of articles per second
|
* return the number of articles per second
|
||||||
* @return
|
* @return
|
||||||
|
@ -738,7 +746,7 @@ public class mediawikiIndex extends Thread {
|
||||||
String urlStub = s[3]; // i.e. http://de.wikipedia.org/wiki/
|
String urlStub = s[3]; // i.e. http://de.wikipedia.org/wiki/
|
||||||
//String language = urlStub.substring(7,9);
|
//String language = urlStub.substring(7,9);
|
||||||
try {
|
try {
|
||||||
mediawikiIndex mi = new mediawikiIndex(sourcefile, targetdir, urlStub);
|
MediawikiImporter mi = new MediawikiImporter(sourcefile, targetdir, urlStub);
|
||||||
mi.start();
|
mi.start();
|
||||||
mi.join();
|
mi.join();
|
||||||
} catch (InterruptedException e) {
|
} catch (InterruptedException e) {
|
|
@ -24,7 +24,7 @@
|
||||||
// along with this program; if not, write to the Free Software
|
// along with this program; if not, write to the Free Software
|
||||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||||
|
|
||||||
package de.anomic.crawler;
|
package net.yacy.document.importer;
|
||||||
|
|
||||||
import java.io.ByteArrayInputStream;
|
import java.io.ByteArrayInputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
@ -35,21 +35,56 @@ import net.yacy.document.content.file.SurrogateReader;
|
||||||
import net.yacy.kelondro.data.meta.DigestURI;
|
import net.yacy.kelondro.data.meta.DigestURI;
|
||||||
import net.yacy.repository.LoaderDispatcher;
|
import net.yacy.repository.LoaderDispatcher;
|
||||||
|
|
||||||
|
import de.anomic.crawler.CrawlProfile;
|
||||||
import de.anomic.crawler.retrieval.HTTPLoader;
|
import de.anomic.crawler.retrieval.HTTPLoader;
|
||||||
import de.anomic.crawler.retrieval.Request;
|
import de.anomic.crawler.retrieval.Request;
|
||||||
import de.anomic.crawler.retrieval.Response;
|
import de.anomic.crawler.retrieval.Response;
|
||||||
|
|
||||||
public class PMHReader {
|
public class OAIPMHImporter extends Thread implements Importer {
|
||||||
|
|
||||||
LoaderDispatcher loader;
|
public static Importer job; // if started from a servlet, this object is used to store the thread
|
||||||
|
|
||||||
public PMHReader(LoaderDispatcher loader) {
|
private LoaderDispatcher loader;
|
||||||
|
private DigestURI source;
|
||||||
|
private int count;
|
||||||
|
private long startTime;
|
||||||
|
|
||||||
|
/**
 * Creates an importer for the given source URL. The thread is not started
 * here; note that the start time used for the running-time computation is
 * taken at construction.
 *
 * @param loader the loader used to fetch the source
 * @param source the URL to import from
 */
public OAIPMHImporter(LoaderDispatcher loader, DigestURI source) {
    this.startTime = System.currentTimeMillis();
    this.count = 0;
    this.source = source;
    this.loader = loader;
}
|
||||||
|
|
||||||
|
|
||||||
|
public int count() {
|
||||||
|
return this.count;
|
||||||
|
}
|
||||||
|
|
||||||
|
public long remainingTime() {
|
||||||
|
return Long.MAX_VALUE; // we don't know
|
||||||
|
}
|
||||||
|
|
||||||
|
public long runningTime() {
|
||||||
|
return System.currentTimeMillis() - this.startTime;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String source() {
|
||||||
|
return source.toNormalform(true, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
public int speed() {
|
||||||
|
return (int) (1000L * ((long) count()) / runningTime());
|
||||||
}
|
}
|
||||||
|
|
||||||
public void load(DigestURI source) throws IOException {
|
public void run() {
|
||||||
Response response = this.loader.load(source, true, true, CrawlProfile.CACHE_STRATEGY_NOCACHE);
|
Response response;
|
||||||
load(response);
|
try {
|
||||||
|
response = this.loader.load(source, true, true, CrawlProfile.CACHE_STRATEGY_NOCACHE);
|
||||||
|
load(response);
|
||||||
|
} catch (IOException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void load0(DigestURI source) throws IOException {
|
public static void load0(DigestURI source) throws IOException {
|
Loading…
Reference in New Issue
Block a user