mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
enhanced and fixed OAI-PMH import
- now importing OAI-PMH server list from two sources - simultaneous import from several servers (even > 2000) - check buttons on OAI-PMH server list to select multiple servers for import start - it is possible to select all servers at once for import - imported XML data is gzipped after import from surrogate reader git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6847 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
c2098f9399
commit
fc5efcc05a
86
.classpath
86
.classpath
|
@ -1,43 +1,43 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<classpath>
|
||||
<classpathentry excluding="env/|htdocsdefault/|proxymsg/|yacy/|env/|yacy/user/|yacy/user/|yacy/ui/|processing/domaingraph/applet/|processing/domaingraph/|api/|api/bookmarks/posts/|api/bookmarks/|api/util/|api/bookmarks/xbel/|api/bookmarks/tags/" kind="src" path="htroot"/>
|
||||
<classpathentry kind="src" path="test"/>
|
||||
<classpathentry excluding="user/|user/|ui/" kind="src" path="htroot/yacy"/>
|
||||
<classpathentry kind="src" path="htroot/env"/>
|
||||
<classpathentry kind="src" path="source"/>
|
||||
<classpathentry kind="src" path="htroot/yacy/ui"/>
|
||||
<classpathentry excluding="bookmarks/posts/|bookmarks/|util/|bookmarks/xbel/|bookmarks/tags/" kind="src" path="htroot/api"/>
|
||||
<classpathentry kind="src" path="htroot/api/bookmarks/posts"/>
|
||||
<classpathentry excluding="posts/|xbel/|tags/" kind="src" path="htroot/api/bookmarks"/>
|
||||
<classpathentry kind="src" path="htroot/api/util"/>
|
||||
<classpathentry kind="src" path="htroot/api/bookmarks/xbel"/>
|
||||
<classpathentry kind="src" path="htroot/api/bookmarks/tags"/>
|
||||
<classpathentry exported="true" kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
|
||||
<classpathentry exported="true" kind="lib" path="lib/commons-httpclient-3.1.jar"/>
|
||||
<classpathentry exported="true" kind="lib" path="lib/commons-logging-1.1.1.jar"/>
|
||||
<classpathentry exported="true" kind="lib" path="lib/commons-io-1.4.jar"/>
|
||||
<classpathentry exported="true" kind="lib" path="lib/commons-fileupload-1.2.1.jar"/>
|
||||
<classpathentry exported="true" kind="lib" path="lib/servlet-api.jar"/>
|
||||
<classpathentry exported="true" kind="lib" path="lib/xerces.jar"/>
|
||||
<classpathentry exported="true" kind="lib" path="lib/bzip2.jar"/>
|
||||
<classpathentry exported="true" kind="lib" path="lib/J7Zip-modified.jar"/>
|
||||
<classpathentry exported="true" kind="lib" path="lib/webcat-0.1-swf.jar"/>
|
||||
<classpathentry exported="true" kind="lib" path="lib/activation.jar"/>
|
||||
<classpathentry exported="true" kind="lib" path="lib/commons-jxpath-1.3.jar"/>
|
||||
<classpathentry exported="true" kind="lib" path="libt/junit-4.7.jar"/>
|
||||
<classpathentry kind="lib" path="lib/fontbox-1.0.0.jar"/>
|
||||
<classpathentry kind="lib" path="lib/pdfbox-1.0.0.jar"/>
|
||||
<classpathentry kind="lib" path="lib/poi-3.6-20091214.jar"/>
|
||||
<classpathentry kind="lib" path="lib/poi-scratchpad-3.6-20091214.jar"/>
|
||||
<classpathentry kind="lib" path="lib/bcmail-jdk15-145.jar"/>
|
||||
<classpathentry kind="lib" path="lib/bcprov-jdk15-145.jar"/>
|
||||
<classpathentry kind="lib" path="lib/jsch-0.1.42.jar"/>
|
||||
<classpathentry kind="lib" path="lib/jakarta-oro-2.0.8.jar"/>
|
||||
<classpathentry kind="lib" path="lib/commons-codec-1.4.jar"/>
|
||||
<classpathentry kind="lib" path="lib/log4j-1.2.15.jar"/>
|
||||
<classpathentry kind="lib" path="lib/mysql-connector-java-5.1.12-bin.jar"/>
|
||||
<classpathentry kind="lib" path="lib/jcifs-1.3.14.jar"/>
|
||||
<classpathentry kind="lib" path="lib/metadata-extractor-2.4.0-beta-1.jar"/>
|
||||
<classpathentry exported="true" kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/>
|
||||
<classpathentry kind="output" path="gen"/>
|
||||
</classpath>
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<classpath>
|
||||
<classpathentry excluding="env/|htdocsdefault/|proxymsg/|yacy/|env/|yacy/user/|yacy/user/|yacy/ui/|processing/domaingraph/applet/|processing/domaingraph/|api/|api/bookmarks/posts/|api/bookmarks/|api/util/|api/bookmarks/xbel/|api/bookmarks/tags/" kind="src" path="htroot"/>
|
||||
<classpathentry kind="src" path="test"/>
|
||||
<classpathentry excluding="user/|user/|ui/" kind="src" path="htroot/yacy"/>
|
||||
<classpathentry kind="src" path="htroot/env"/>
|
||||
<classpathentry kind="src" path="source"/>
|
||||
<classpathentry kind="src" path="htroot/yacy/ui"/>
|
||||
<classpathentry excluding="bookmarks/posts/|bookmarks/|util/|bookmarks/xbel/|bookmarks/tags/" kind="src" path="htroot/api"/>
|
||||
<classpathentry kind="src" path="htroot/api/bookmarks/posts"/>
|
||||
<classpathentry excluding="posts/|xbel/|tags/" kind="src" path="htroot/api/bookmarks"/>
|
||||
<classpathentry kind="src" path="htroot/api/util"/>
|
||||
<classpathentry kind="src" path="htroot/api/bookmarks/xbel"/>
|
||||
<classpathentry kind="src" path="htroot/api/bookmarks/tags"/>
|
||||
<classpathentry exported="true" kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
|
||||
<classpathentry exported="true" kind="lib" path="lib/commons-httpclient-3.1.jar" sourcepath="/commons-httpclient-3.1/src"/>
|
||||
<classpathentry exported="true" kind="lib" path="lib/commons-logging-1.1.1.jar"/>
|
||||
<classpathentry exported="true" kind="lib" path="lib/commons-io-1.4.jar"/>
|
||||
<classpathentry exported="true" kind="lib" path="lib/commons-fileupload-1.2.1.jar"/>
|
||||
<classpathentry exported="true" kind="lib" path="lib/servlet-api.jar"/>
|
||||
<classpathentry exported="true" kind="lib" path="lib/xerces.jar"/>
|
||||
<classpathentry exported="true" kind="lib" path="lib/bzip2.jar"/>
|
||||
<classpathentry exported="true" kind="lib" path="lib/J7Zip-modified.jar"/>
|
||||
<classpathentry exported="true" kind="lib" path="lib/webcat-0.1-swf.jar"/>
|
||||
<classpathentry exported="true" kind="lib" path="lib/activation.jar"/>
|
||||
<classpathentry exported="true" kind="lib" path="lib/commons-jxpath-1.3.jar"/>
|
||||
<classpathentry exported="true" kind="lib" path="libt/junit-4.7.jar"/>
|
||||
<classpathentry kind="lib" path="lib/fontbox-1.0.0.jar"/>
|
||||
<classpathentry kind="lib" path="lib/pdfbox-1.0.0.jar"/>
|
||||
<classpathentry kind="lib" path="lib/poi-3.6-20091214.jar"/>
|
||||
<classpathentry kind="lib" path="lib/poi-scratchpad-3.6-20091214.jar"/>
|
||||
<classpathentry kind="lib" path="lib/bcmail-jdk15-145.jar"/>
|
||||
<classpathentry kind="lib" path="lib/bcprov-jdk15-145.jar"/>
|
||||
<classpathentry kind="lib" path="lib/jsch-0.1.42.jar"/>
|
||||
<classpathentry kind="lib" path="lib/jakarta-oro-2.0.8.jar"/>
|
||||
<classpathentry kind="lib" path="lib/commons-codec-1.4.jar"/>
|
||||
<classpathentry kind="lib" path="lib/log4j-1.2.15.jar"/>
|
||||
<classpathentry kind="lib" path="lib/mysql-connector-java-5.1.12-bin.jar"/>
|
||||
<classpathentry kind="lib" path="lib/jcifs-1.3.14.jar"/>
|
||||
<classpathentry kind="lib" path="lib/metadata-extractor-2.4.0-beta-1.jar"/>
|
||||
<classpathentry exported="true" kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/>
|
||||
<classpathentry kind="output" path="gen"/>
|
||||
</classpath>
|
||||
|
|
|
@ -3,7 +3,7 @@ javacSource=1.5
|
|||
javacTarget=1.5
|
||||
|
||||
# Release Configuration
|
||||
releaseVersion=0.94
|
||||
releaseVersion=0.95
|
||||
stdReleaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
|
||||
sourceReleaseFile=yacy_src_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
|
||||
releaseFileParentDir=yacy
|
||||
|
|
|
@ -187,8 +187,8 @@ public class CrawlResults {
|
|||
entry = i.next();
|
||||
try {
|
||||
urle = sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).load(entry.getKey().getBytes(), null, 0);
|
||||
if(urle == null) {
|
||||
Log.logWarning("PLASMA", "CrawlResults: URL not in index for crawl result "+ i +" with hash "+ entry.getKey());
|
||||
if (urle == null) {
|
||||
Log.logWarning("PLASMA", "CrawlResults: URL not in index with url hash "+ entry.getKey());
|
||||
urlstr = null;
|
||||
urltxt = null;
|
||||
metadata = null;
|
||||
|
|
|
@ -4,20 +4,49 @@
|
|||
<title>YaCy '#[clientname]#': OAI-PMH source import list</title>
|
||||
#%env/templates/metas.template%#
|
||||
#(refresh)#::<meta http-equiv="REFRESH" content="6" />#(/refresh)#
|
||||
<script>
|
||||
<!--
|
||||
// Toggle all "item_<i>" checkboxes of a form at once.
//   name  - the name of the form, looked up with document.forms.namedItem
//   check - requested state; if it is truthy and at least one item is already
//           checked, every item is unchecked instead (so the switch toggles)
// The number of items is taken from the value of the form element "num".
function setall(name, check){
  var selectForm = document.forms.namedItem(name);
  // nothing to do if the form is not part of the page
  if (!selectForm) return;
  var numField = selectForm.elements["num"];
  // the field value is a string; convert it once instead of comparing strings to numbers
  var count = numField ? (parseInt(numField.value, 10) || 0) : 0;
  // declared locally so that the loop index does not leak into the global scope
  var i, item;
  if (check) {
    for (i = 0; i < count; i++) {
      item = selectForm.elements["item_" + i];
      if (item && item.checked) {
        check = false;
        break;
      }
    }
  }
  var state = !!check;
  for (i = 0; i < count; i++) {
    item = selectForm.elements["item_" + i];
    // skip gaps in the numbering instead of aborting with a TypeError
    if (item) item.checked = state;
  }
}
|
||||
-->
|
||||
</script>
|
||||
<script src="/js/sorttable.js"></script>
|
||||
</head>
|
||||
<body>
|
||||
#(source)#::
|
||||
<h3>OAI Source List</h3>
|
||||
<h3>List of #[num]# OAI-PMH Servers</h3>
|
||||
<form action="IndexImportOAIPMH_p.html" target="_top" method="post" enctype="multipart/form-data" accept-charset="UTF-8" name="oaipmhimport">
|
||||
<p>
|
||||
<input type="hidden" name="num" value="#[num]#" />
|
||||
<input type="submit" name="loadrows" value="Load Selected Sources" />
|
||||
</p>
|
||||
<table cellpadding="2" cellspacing="1" >
|
||||
<tr class="TableHeader">
|
||||
<td><input type="checkbox" name="allswitch" onclick="setall(this.form.name, this.value)" /></td>
|
||||
<td>Source</td>
|
||||
</tr>
|
||||
#{table}#
|
||||
<tr class="TableCell#(dark)#Light::Dark#(/dark)#">
|
||||
<td>#[source]#</td>
|
||||
<td align="left"><input type="checkbox" name="item_#[count]#" value="mark_#[source]#" /></td>
|
||||
<td>#[loadurl]#</td>
|
||||
</tr>
|
||||
#{/table}#
|
||||
</table>
|
||||
<p>
|
||||
<input type="submit" name="loadrows" value="Load Selected Sources" />
|
||||
</p>
|
||||
</form>
|
||||
#(/source)#
|
||||
|
||||
#(import)#::
|
||||
|
|
|
@ -25,6 +25,7 @@
|
|||
import java.util.ArrayList;
|
||||
import java.util.Set;
|
||||
|
||||
import net.yacy.document.importer.OAIListFriendsLoader;
|
||||
import net.yacy.document.importer.OAIPMHImporter;
|
||||
|
||||
import de.anomic.http.server.RequestHeader;
|
||||
|
@ -43,39 +44,42 @@ public class IndexImportOAIPMHList_p {
|
|||
prop.put("source", 0);
|
||||
|
||||
if (post != null && post.containsKey("source")) {
|
||||
Set<String> oaiRoots = OAIPMHImporter.getAllListedOAIServer(sb.loader);
|
||||
Set<String> oaiRoots = OAIListFriendsLoader.load(sb.loader).keySet();
|
||||
|
||||
boolean dark = false;
|
||||
int cnt = 0;
|
||||
int count = 0;
|
||||
for (String root: oaiRoots) {
|
||||
prop.put("source_table_" + cnt + "_dark", (dark) ? "1" : "0");
|
||||
prop.put("source_table_" + cnt + "_source", "<a href=\"/IndexImportOAIPMH_p.html?importroot=&urlstartall=" + root + "\" target=\"_top\">" + root+ "</a>");
|
||||
prop.put("source_table_" + count + "_dark", (dark) ? "1" : "0");
|
||||
prop.put("source_table_" + count + "_count", count);
|
||||
prop.put("source_table_" + count + "_source", root);
|
||||
prop.put("source_table_" + count + "_loadurl", "<a href=\"/IndexImportOAIPMH_p.html?urlstart=" + root + "\" target=\"_top\">" + root + "</a>");
|
||||
dark = !dark;
|
||||
cnt++;
|
||||
count++;
|
||||
}
|
||||
prop.put("source_table", cnt);
|
||||
prop.put("source_table", count);
|
||||
prop.put("source_num", count);
|
||||
prop.put("source", 1);
|
||||
}
|
||||
|
||||
if (post != null && post.containsKey("import")) {
|
||||
ArrayList<OAIPMHImporter> jobs = new ArrayList<OAIPMHImporter>();
|
||||
for (OAIPMHImporter job: OAIPMHImporter.runningJobs) jobs.add(job);
|
||||
for (OAIPMHImporter job: OAIPMHImporter.startedJobs) jobs.add(job);
|
||||
for (OAIPMHImporter job: OAIPMHImporter.finishedJobs) jobs.add(job);
|
||||
for (OAIPMHImporter job: OAIPMHImporter.runningJobs.keySet()) jobs.add(job);
|
||||
for (OAIPMHImporter job: OAIPMHImporter.startedJobs.keySet()) jobs.add(job);
|
||||
for (OAIPMHImporter job: OAIPMHImporter.finishedJobs.keySet()) jobs.add(job);
|
||||
|
||||
boolean dark = false;
|
||||
int cnt = 0;
|
||||
int count = 0;
|
||||
for (OAIPMHImporter job: jobs) {
|
||||
prop.put("import_table_" + cnt + "_dark", (dark) ? "1" : "0");
|
||||
prop.put("import_table_" + cnt + "_thread", (job.isAlive()) ? "<img src=\"/env/grafics/loading.gif\" alt=\"running\" />" : "finished");
|
||||
prop.put("import_table_" + cnt + "_source", job.source());
|
||||
prop.put("import_table_" + cnt + "_chunkCount", job.chunkCount());
|
||||
prop.put("import_table_" + cnt + "_recordsCount", job.count());
|
||||
prop.put("import_table_" + cnt + "_speed", job.speed());
|
||||
prop.put("import_table_" + count + "_dark", (dark) ? "1" : "0");
|
||||
prop.put("import_table_" + count + "_thread", (job.isAlive()) ? "<img src=\"/env/grafics/loading.gif\" alt=\"running\" />" : "finished");
|
||||
prop.put("import_table_" + count + "_source", job.source());
|
||||
prop.put("import_table_" + count + "_chunkCount", job.chunkCount());
|
||||
prop.put("import_table_" + count + "_recordsCount", job.count());
|
||||
prop.put("import_table_" + count + "_speed", job.speed());
|
||||
dark = !dark;
|
||||
cnt++;
|
||||
count++;
|
||||
}
|
||||
prop.put("import_table", cnt);
|
||||
prop.put("import_table", count);
|
||||
prop.put("import", 1);
|
||||
prop.put("refresh", 1);
|
||||
}
|
||||
|
|
|
@ -33,7 +33,7 @@
|
|||
<fieldset>
|
||||
<legend>Import all Records from a server</legend>
|
||||
Import all records that follow according to resumption elements into index<br />
|
||||
<input name="urlstartall" type="text" value="" size="80" />
|
||||
<input name="urlstart" type="text" value="" size="80" />
|
||||
<input name="importroot" type="submit" value="import this source" />
|
||||
#(optiongetlist)#::or <input name="getlist" type="submit" value="import from a list" />#(/optiongetlist)#
|
||||
#(status)#::<p>Import started!</p>::<p>Bad input data: #[message]# </p>#(/status)#
|
||||
|
|
|
@ -24,9 +24,13 @@
|
|||
|
||||
import java.io.IOException;
|
||||
import java.net.MalformedURLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Map;
|
||||
import java.util.Random;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import net.yacy.document.importer.OAIPMHImporter;
|
||||
import net.yacy.document.importer.OAIPMHReader;
|
||||
import net.yacy.document.importer.OAIPMHLoader;
|
||||
import net.yacy.document.importer.ResumptionToken;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
|
@ -55,7 +59,7 @@ public class IndexImportOAIPMH_p {
|
|||
DigestURI url = null;
|
||||
try {
|
||||
url = new DigestURI(oaipmhurl, null);
|
||||
OAIPMHReader r = new OAIPMHReader(sb.loader, url, sb.surrogatesInPath, "oaipmh-one");
|
||||
OAIPMHLoader r = new OAIPMHLoader(sb.loader, url, sb.surrogatesInPath, "oaipmh-one");
|
||||
ResumptionToken rt = r.getResumptionToken();
|
||||
prop.put("import-one", 1);
|
||||
prop.put("import-one_count", (rt == null) ? "not available" : Integer.toString(rt.getRecordCounter()));
|
||||
|
@ -83,8 +87,8 @@ public class IndexImportOAIPMH_p {
|
|||
}
|
||||
}
|
||||
|
||||
if (post.containsKey("importroot")) {
|
||||
String oaipmhurl = post.get("urlstartall", "");
|
||||
if (post.get("urlstart", "").length() > 0) {
|
||||
String oaipmhurl = post.get("urlstart", "");
|
||||
DigestURI url = null;
|
||||
try {
|
||||
url = new DigestURI(oaipmhurl, null);
|
||||
|
@ -100,6 +104,38 @@ public class IndexImportOAIPMH_p {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
if (post.get("loadrows", "").length() > 0) {
|
||||
// collect the selected OAI-PMH source urls (form values prefixed with "mark_") in a sorted, duplicate-free set
|
||||
TreeSet<String> sources = new TreeSet<String>();
|
||||
for (Map.Entry<String, String> entry: post.entrySet()) {
|
||||
if (entry.getValue().startsWith("mark_")) {
|
||||
sources.add(entry.getValue().substring(5));
|
||||
}
|
||||
}
|
||||
prop.put("status", 1);
|
||||
prop.put("optiongetlist", 1);
|
||||
prop.put("iframetype", 1);
|
||||
|
||||
// prepare the set for random read from it (to protect the servers at the beginning of the list)
|
||||
ArrayList<String> sourceList = new ArrayList<String>(sources.size());
|
||||
for (String oaipmhurl: sources) sourceList.add(oaipmhurl);
|
||||
Random r = new Random(System.currentTimeMillis());
|
||||
|
||||
// start jobs for the sources
|
||||
DigestURI url = null;
|
||||
while (sourceList.size() > 0) {
|
||||
String oaipmhurl = sourceList.remove(r.nextInt(sourceList.size()));
|
||||
try {
|
||||
url = new DigestURI(oaipmhurl, null);
|
||||
OAIPMHImporter job = new OAIPMHImporter(sb.loader, url);
|
||||
job.start();
|
||||
} catch (MalformedURLException e) {
|
||||
Log.logException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (post.containsKey("getlist")) {
|
||||
prop.put("iframetype", 2);
|
||||
}
|
||||
|
|
|
@ -37,9 +37,16 @@
|
|||
package de.anomic.search;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.BufferedOutputStream;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.net.MalformedURLException;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
|
@ -60,6 +67,10 @@ import java.util.TreeSet;
|
|||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.Semaphore;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
import java.util.zip.GZIPOutputStream;
|
||||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipInputStream;
|
||||
|
||||
import net.yacy.document.Condenser;
|
||||
import net.yacy.document.Document;
|
||||
|
@ -68,6 +79,7 @@ import net.yacy.document.ParserException;
|
|||
import net.yacy.document.content.DCEntry;
|
||||
import net.yacy.document.content.RSSMessage;
|
||||
import net.yacy.document.content.SurrogateReader;
|
||||
import net.yacy.document.importer.OAIListFriendsLoader;
|
||||
import net.yacy.document.parser.html.ImageEntry;
|
||||
import net.yacy.document.parser.xml.RSSFeed;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
|
@ -474,6 +486,7 @@ public final class Switchboard extends serverSwitch {
|
|||
// start a loader
|
||||
log.logConfig("Starting Crawl Loader");
|
||||
this.loader = new LoaderDispatcher(this);
|
||||
OAIListFriendsLoader.init(this.loader);
|
||||
this.crawlQueues = new CrawlQueues(this, queuesRoot);
|
||||
this.crawlQueues.noticeURL.setMinimumDelta(
|
||||
this.getConfigLong("minimumLocalDelta", this.crawlQueues.noticeURL.getMinimumLocalDelta()),
|
||||
|
@ -1236,58 +1249,106 @@ public final class Switchboard extends serverSwitch {
|
|||
}
|
||||
|
||||
public boolean processSurrogate(final String s) {
|
||||
File surrogateFile = new File(this.surrogatesInPath, s);
|
||||
File infile = new File(this.surrogatesInPath, s);
|
||||
if (!infile.exists() || !infile.canWrite() || !infile.canRead()) return false;
|
||||
File outfile = new File(this.surrogatesOutPath, s);
|
||||
if (!surrogateFile.exists() || !surrogateFile.canWrite() || !surrogateFile.canRead()) return false;
|
||||
if (outfile.exists()) return false;
|
||||
boolean moved = false;
|
||||
try {
|
||||
SurrogateReader reader = new SurrogateReader(new BufferedInputStream(new FileInputStream(surrogateFile)), 3);
|
||||
Thread readerThread = new Thread(reader, "Surrogate-Reader " + surrogateFile.getAbsolutePath());
|
||||
readerThread.start();
|
||||
DCEntry surrogate;
|
||||
Response response;
|
||||
while ((surrogate = reader.take()) != DCEntry.poison) {
|
||||
// check if url is in accepted domain
|
||||
assert surrogate != null;
|
||||
assert crawlStacker != null;
|
||||
final String urlRejectReason = crawlStacker.urlInAcceptedDomain(surrogate.getIdentifier());
|
||||
if (urlRejectReason != null) {
|
||||
if (this.log.isFine()) this.log.logInfo("Rejected URL '" + surrogate.getIdentifier() + "': " + urlRejectReason);
|
||||
continue;
|
||||
if (s.endsWith("xml.zip")) {
|
||||
// open the zip file with all the xml files in it
|
||||
try {
|
||||
InputStream is = new BufferedInputStream(new FileInputStream(infile));
|
||||
ZipInputStream zis = new ZipInputStream(is);
|
||||
ZipEntry entry;
|
||||
while ((entry = zis.getNextEntry()) != null) {
|
||||
int size;
|
||||
byte[] buffer = new byte[2048];
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||
while ((size = zis.read(buffer, 0, buffer.length)) != -1) {
|
||||
baos.write(buffer, 0, size);
|
||||
}
|
||||
baos.flush();
|
||||
processSurrogate(new ByteArrayInputStream(baos.toByteArray()), entry.getName());
|
||||
baos.close();
|
||||
}
|
||||
|
||||
// create a queue entry
|
||||
Document document = surrogate.document();
|
||||
Request request = new Request(
|
||||
peers.mySeed().hash.getBytes(),
|
||||
surrogate.getIdentifier(),
|
||||
null,
|
||||
"",
|
||||
new Date(),
|
||||
new Date(),
|
||||
this.crawler.defaultSurrogateProfile.handle(),
|
||||
0,
|
||||
0,
|
||||
0
|
||||
);
|
||||
response = new Response(request, null, null, "200", this.crawler.defaultSurrogateProfile);
|
||||
indexingQueueEntry queueEntry = new indexingQueueEntry(Segments.Process.SURROGATES, response, document, null);
|
||||
|
||||
// place the queue entry into the concurrent process of the condenser (document analysis)
|
||||
try {
|
||||
indexingCondensementProcessor.enQueue(queueEntry);
|
||||
} catch (InterruptedException e) {
|
||||
Log.logException(e);
|
||||
break;
|
||||
} catch (IOException e) {
|
||||
Log.logException(e);
|
||||
} finally {
|
||||
moved = infile.renameTo(outfile);
|
||||
}
|
||||
return moved;
|
||||
} else {
|
||||
try {
|
||||
InputStream is = new BufferedInputStream(new FileInputStream(infile));
|
||||
if (s.endsWith(".gz")) is = new GZIPInputStream(is);
|
||||
processSurrogate(is, infile.getName());
|
||||
} catch (IOException e) {
|
||||
Log.logException(e);
|
||||
} finally {
|
||||
moved = infile.renameTo(outfile);
|
||||
if (moved) {
|
||||
// check if this file is already compressed, if not, compress now
|
||||
if (!outfile.getName().endsWith(".gz")) {
|
||||
String gzname = outfile.getName() + ".gz";
|
||||
File gzfile = new File(outfile.getParentFile(), gzname);
|
||||
try {
|
||||
OutputStream os = new BufferedOutputStream(new GZIPOutputStream(new FileOutputStream(gzfile)));
|
||||
FileUtils.copy(new BufferedInputStream(new FileInputStream(outfile)), os);
|
||||
os.close();
|
||||
if (gzfile.exists()) FileUtils.deletedelete(outfile);
|
||||
} catch (FileNotFoundException e) {
|
||||
Log.logException(e);
|
||||
} catch (IOException e) {
|
||||
Log.logException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
Log.logException(e);
|
||||
} finally {
|
||||
moved = surrogateFile.renameTo(outfile);
|
||||
return moved;
|
||||
}
|
||||
}
|
||||
|
||||
public void processSurrogate(final InputStream is, String name) throws IOException {
|
||||
SurrogateReader reader = new SurrogateReader(is, 3);
|
||||
Thread readerThread = new Thread(reader, name);
|
||||
readerThread.start();
|
||||
DCEntry surrogate;
|
||||
Response response;
|
||||
while ((surrogate = reader.take()) != DCEntry.poison) {
|
||||
// check if url is in accepted domain
|
||||
assert surrogate != null;
|
||||
assert crawlStacker != null;
|
||||
final String urlRejectReason = crawlStacker.urlInAcceptedDomain(surrogate.getIdentifier());
|
||||
if (urlRejectReason != null) {
|
||||
if (this.log.isFine()) this.log.logInfo("Rejected URL '" + surrogate.getIdentifier() + "': " + urlRejectReason);
|
||||
continue;
|
||||
}
|
||||
|
||||
// create a queue entry
|
||||
Document document = surrogate.document();
|
||||
Request request = new Request(
|
||||
peers.mySeed().hash.getBytes(),
|
||||
surrogate.getIdentifier(),
|
||||
null,
|
||||
"",
|
||||
new Date(),
|
||||
new Date(),
|
||||
this.crawler.defaultSurrogateProfile.handle(),
|
||||
0,
|
||||
0,
|
||||
0
|
||||
);
|
||||
response = new Response(request, null, null, "200", this.crawler.defaultSurrogateProfile);
|
||||
indexingQueueEntry queueEntry = new indexingQueueEntry(Segments.Process.SURROGATES, response, document, null);
|
||||
|
||||
// place the queue entry into the concurrent process of the condenser (document analysis)
|
||||
try {
|
||||
indexingCondensementProcessor.enQueue(queueEntry);
|
||||
} catch (InterruptedException e) {
|
||||
Log.logException(e);
|
||||
break;
|
||||
}
|
||||
}
|
||||
return moved;
|
||||
}
|
||||
|
||||
public int surrogateQueueSize() {
|
||||
|
@ -1326,7 +1387,7 @@ public final class Switchboard extends serverSwitch {
|
|||
// check for interruption
|
||||
checkInterruption();
|
||||
|
||||
if (surrogate.endsWith(".xml")) {
|
||||
if (surrogate.endsWith(".xml") || surrogate.endsWith(".xml.gz") || surrogate.endsWith(".xml.zip")) {
|
||||
// read the surrogate file and store entry in index
|
||||
if (processSurrogate(surrogate)) return true;
|
||||
}
|
||||
|
|
|
@ -1,3 +1,25 @@
|
|||
/**
|
||||
* Importer
|
||||
* Copyright 2009 by Michael Peter Christen
|
||||
* First released 29.04.2010 at http://yacy.net
|
||||
*
|
||||
* This is a part of YaCy, a peer-to-peer based web search engine
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file COPYING.LESSER.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package net.yacy.document.importer;
|
||||
|
||||
public interface Importer extends Runnable {
|
||||
|
|
|
@ -1,28 +1,24 @@
|
|||
// mediawikiIndex.java
|
||||
// (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
||||
// first published 20.11.2008 on http://yacy.net
|
||||
//
|
||||
// This is a part of YaCy, a peer-to-peer based web search engine
|
||||
//
|
||||
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
|
||||
// $LastChangedRevision: 1986 $
|
||||
// $LastChangedBy: orbiter $
|
||||
//
|
||||
// LICENSE
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
/**
|
||||
* MediawikiImporter
|
||||
* Copyright 2008 by Michael Peter Christen
|
||||
* First released 20.11.2008 at http://yacy.net
|
||||
*
|
||||
* This is a part of YaCy, a peer-to-peer based web search engine
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file COPYING.LESSER.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package net.yacy.document.importer;
|
||||
|
||||
|
|
203
source/net/yacy/document/importer/OAIListFriendsLoader.java
Normal file
203
source/net/yacy/document/importer/OAIListFriendsLoader.java
Normal file
|
@ -0,0 +1,203 @@
|
|||
/**
|
||||
* OAIListFriendsLoader
|
||||
* Copyright 2010 by Michael Peter Christen
|
||||
* First released 29.04.2010 at http://yacy.net
|
||||
*
|
||||
* This is a part of YaCy, a peer-to-peer based web search engine
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file COPYING.LESSER.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package net.yacy.document.importer;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
import javax.xml.parsers.SAXParser;
|
||||
import javax.xml.parsers.SAXParserFactory;
|
||||
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
import net.yacy.kelondro.util.FileUtils;
|
||||
import net.yacy.repository.LoaderDispatcher;
|
||||
|
||||
import org.xml.sax.Attributes;
|
||||
import org.xml.sax.SAXException;
|
||||
import org.xml.sax.helpers.DefaultHandler;
|
||||
|
||||
import de.anomic.crawler.CrawlProfile;
|
||||
import de.anomic.crawler.retrieval.Response;
|
||||
|
||||
public class OAIListFriendsLoader {
|
||||
|
||||
private static final long serialVersionUID = -8705115274655024604L;
|
||||
|
||||
//private static String url10 = "http://roar.eprints.org/cgi/roar_search/advanced/export_roar_ROAR::ListFriends.xml?screen=ROAR%3A%3AFacetSearch&_action_export=1&output=ROAR%3A%3AListFriends&exp=1|1|-recordcount%2F-date|archive|-|-|eprint_status%3Aeprint_status%3AALL%3AEQ%3Aarchive|metadata_visibility%3Ametadata_visibility%3AALL%3AEX%3Ashow";
|
||||
private static String url10 = "http://roar.eprints.org/cgi/roar_search/advanced/export_roar_ROAR::ListFriends.xml?_action_export=1&output=ROAR%3A%3AListFriends";
|
||||
private static File cache10 = new File("DATA/DICTIONARIES/harvesting/export_roar_ROAR_ListFriends.xml");
|
||||
private static String url20 = "http://www.openarchives.org/Register/ListFriends";
|
||||
private static File cache20 = new File("DATA/DICTIONARIES/harvesting/ListFriends.xml");
|
||||
|
||||
public static void init(LoaderDispatcher loader) {
|
||||
loader.loadIfNotExistBackground(url10, cache10);
|
||||
loader.loadIfNotExistBackground(url20, cache20);
|
||||
}
|
||||
|
||||
public static Map<String, String> load(LoaderDispatcher loader) {
|
||||
Map<String, String> map10;
|
||||
try {
|
||||
map10 = load(null, null, new File("DATA/DICTIONARIES/harvesting/export_roar_ROAR_ListFriends.xml"));
|
||||
} catch (IOException e) {
|
||||
map10 = new TreeMap<String, String>();
|
||||
}
|
||||
|
||||
Map<String, String> map20;
|
||||
try {
|
||||
map20 = load(null, null, new File("DATA/DICTIONARIES/harvesting/ListFriends.xml"));
|
||||
} catch (IOException e) {
|
||||
map20 = new TreeMap<String, String>();
|
||||
}
|
||||
|
||||
map10.putAll(map20);
|
||||
return map10;
|
||||
}
|
||||
|
||||
/**
|
||||
* load a OAI ListFriends file from the net or from a cache location
|
||||
* If the given file does exist, the OAI ListFriends File is loaded and parsed.
|
||||
* The resulting map is a mapping from OAI-PMH start url to a loaction description
|
||||
* @param loader a LoaderDispatcher that loads the file if targetFile does not exist
|
||||
* @param source the source URL for the OAI ListFriends file
|
||||
* @param targetFile the file where the loaded content is stored if it does not exist, the source othervise
|
||||
* @return a Map from OAI-PMH source to source description (which is usually also a URL)
|
||||
* @throws IOException
|
||||
*/
|
||||
private static Map<String, String> load(LoaderDispatcher loader, DigestURI source, File targetFile) throws IOException {
|
||||
|
||||
byte[] b;
|
||||
if (targetFile.exists()) {
|
||||
// load file
|
||||
b = FileUtils.read(targetFile);
|
||||
} else {
|
||||
// load from the net
|
||||
Response response = loader.load(source, false, true, CrawlProfile.CACHE_STRATEGY_NOCACHE);
|
||||
b = response.getContent();
|
||||
FileUtils.copy(b, targetFile);
|
||||
}
|
||||
|
||||
return new Parser(b).map;
|
||||
}
|
||||
|
||||
|
||||
// get a resumption token using a SAX xml parser from am input stream
|
||||
private static class Parser extends DefaultHandler {
|
||||
|
||||
// class variables
|
||||
private final StringBuilder buffer;
|
||||
private boolean parsingValue;
|
||||
private SAXParser saxParser;
|
||||
private InputStream stream;
|
||||
private Attributes atts;
|
||||
private int recordCounter;
|
||||
private TreeMap<String, String> map;
|
||||
|
||||
public Parser(final byte[] b) throws IOException {
|
||||
this.map = new TreeMap<String, String>();
|
||||
this.recordCounter = 0;
|
||||
this.buffer = new StringBuilder();
|
||||
this.parsingValue = false;
|
||||
this.atts = null;
|
||||
final SAXParserFactory factory = SAXParserFactory.newInstance();
|
||||
this.stream = new ByteArrayInputStream(b);
|
||||
try {
|
||||
this.saxParser = factory.newSAXParser();
|
||||
this.saxParser.parse(this.stream, this);
|
||||
} catch (SAXException e) {
|
||||
Log.logException(e);
|
||||
Log.logWarning("OAIListFriendsLoader.Parser", "OAIListFriends was not parsed:\n" + new String(b));
|
||||
} catch (IOException e) {
|
||||
Log.logException(e);
|
||||
Log.logWarning("OAIListFriendsLoader.Parser", "OAIListFriends was not parsed:\n" + new String(b));
|
||||
} catch (ParserConfigurationException e) {
|
||||
Log.logException(e);
|
||||
Log.logWarning("OAIListFriendsLoader.Parser", "OAIListFriends was not parsed:\n" + new String(b));
|
||||
throw new IOException(e.getMessage());
|
||||
} finally {
|
||||
try {
|
||||
this.stream.close();
|
||||
} catch (IOException e) {
|
||||
Log.logException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<BaseURLs>
|
||||
<baseURL id="http://roar.eprints.org/id/eprint/102">http://research.nla.gov.au/oai</baseURL>
|
||||
<baseURL id="http://roar.eprints.org/id/eprint/174">http://oai.bibsys.no/repository</baseURL>
|
||||
<baseURL id="http://roar.eprints.org/id/eprint/1064">http://oai.repec.openlib.org/</baseURL>
|
||||
</BaseURLs>
|
||||
*/
|
||||
|
||||
public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException {
|
||||
if ("baseURL".equals(tag)) {
|
||||
recordCounter++;
|
||||
this.parsingValue = true;
|
||||
this.atts = atts;
|
||||
}
|
||||
}
|
||||
|
||||
public void endElement(final String uri, final String name, final String tag) {
|
||||
if (tag == null) return;
|
||||
if ("baseURL".equals(tag)) {
|
||||
this.map.put(buffer.toString(), this.atts.getValue("id"));
|
||||
this.buffer.setLength(0);
|
||||
this.parsingValue = false;
|
||||
}
|
||||
}
|
||||
|
||||
public void characters(final char ch[], final int start, final int length) {
|
||||
if (parsingValue) {
|
||||
buffer.append(ch, start, length);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
try {
|
||||
Map<String, String> map1 = load(null, null, new File("DATA/DICTIONARIES/harvesting/export_roar_ROAR_ListFriends.xml"));
|
||||
int count1 = map1.size();
|
||||
|
||||
Map<String, String> map2 = load(null, null, new File("DATA/DICTIONARIES/harvesting/ListFriends.xml"));
|
||||
int count2 = map2.size();
|
||||
|
||||
map1.putAll(map2);
|
||||
System.out.println("count1 = " + count1 + ", count2 = " + count2 + ", all = " + map1.size());
|
||||
|
||||
for (Map.Entry<String, String> entry: map1.entrySet()) System.out.println(entry.getKey());
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -1,72 +1,55 @@
|
|||
// OAIPMHImporter
|
||||
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
||||
// first published 30.09.2009 on http://yacy.net
|
||||
//
|
||||
// This is a part of YaCy, a peer-to-peer based web search engine
|
||||
//
|
||||
// $LastChangedDate: 2009-09-23 23:26:14 +0200 (Mi, 23 Sep 2009) $
|
||||
// $LastChangedRevision: 6340 $
|
||||
// $LastChangedBy: low012 $
|
||||
//
|
||||
// LICENSE
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
/**
|
||||
* OAIPMHImporter
|
||||
* Copyright 2009 by Michael Peter Christen
|
||||
* First released 30.09.2009 at http://yacy.net
|
||||
*
|
||||
* This is a part of YaCy, a peer-to-peer based web search engine
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file COPYING.LESSER.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package net.yacy.document.importer;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.net.MalformedURLException;
|
||||
import java.text.ParseException;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
import net.yacy.kelondro.util.DateFormatter;
|
||||
import net.yacy.repository.LoaderDispatcher;
|
||||
import net.yacy.document.parser.csvParser;
|
||||
|
||||
import de.anomic.crawler.CrawlProfile;
|
||||
import de.anomic.search.Switchboard;
|
||||
|
||||
|
||||
// get one server with
|
||||
// http://roar.eprints.org/index.php?action=csv
|
||||
// or
|
||||
// http://www.openarchives.org/Register/BrowseSites
|
||||
// or
|
||||
// http://www.openarchives.org/Register/ListFriends
|
||||
//
|
||||
// list records from oai-pmh like
|
||||
// http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListRecords&metadataPrefix=oai_dc
|
||||
|
||||
|
||||
public class OAIPMHImporter extends Thread implements Importer, Comparable<OAIPMHImporter> {
|
||||
|
||||
private static int importerCounter = Integer.MAX_VALUE;
|
||||
private static Object N = new Object();
|
||||
|
||||
public static TreeSet<OAIPMHImporter> startedJobs = new TreeSet<OAIPMHImporter>();
|
||||
public static TreeSet<OAIPMHImporter> runningJobs = new TreeSet<OAIPMHImporter>();
|
||||
public static TreeSet<OAIPMHImporter> finishedJobs = new TreeSet<OAIPMHImporter>();
|
||||
public static ConcurrentHashMap<OAIPMHImporter, Object> startedJobs = new ConcurrentHashMap<OAIPMHImporter, Object>();
|
||||
public static ConcurrentHashMap<OAIPMHImporter, Object> runningJobs = new ConcurrentHashMap<OAIPMHImporter, Object>();
|
||||
public static ConcurrentHashMap<OAIPMHImporter, Object> finishedJobs = new ConcurrentHashMap<OAIPMHImporter, Object>();
|
||||
|
||||
private final LoaderDispatcher loader;
|
||||
private DigestURI source;
|
||||
|
@ -95,7 +78,7 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable<OAIPM
|
|||
// this should never happen
|
||||
Log.logException(e);
|
||||
}
|
||||
startedJobs.add(this);
|
||||
startedJobs.put(this, N);
|
||||
}
|
||||
|
||||
public int count() {
|
||||
|
@ -131,23 +114,23 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable<OAIPM
|
|||
}
|
||||
|
||||
public void run() {
|
||||
while (runningJobs.size() > 10) {
|
||||
try {Thread.sleep(1000 + 1000 * System.currentTimeMillis() % 6);} catch (InterruptedException e) {}
|
||||
while (runningJobs.size() > 50) {
|
||||
try {Thread.sleep(10000 + 3000 * (System.currentTimeMillis() % 6));} catch (InterruptedException e) {}
|
||||
}
|
||||
startedJobs.remove(this);
|
||||
runningJobs.add(this);
|
||||
runningJobs.put(this, N);
|
||||
this.message = "loading first part of records";
|
||||
while (true) {
|
||||
try {
|
||||
OAIPMHReader reader = new OAIPMHReader(this.loader, this.source, Switchboard.getSwitchboard().surrogatesInPath, filenamePrefix);
|
||||
OAIPMHLoader loader = new OAIPMHLoader(this.loader, this.source, Switchboard.getSwitchboard().surrogatesInPath, filenamePrefix);
|
||||
this.chunkCount++;
|
||||
this.recordsCount += reader.getResumptionToken().getRecordCounter();
|
||||
this.source = reader.getResumptionToken().resumptionURL(this.source);
|
||||
this.recordsCount += loader.getResumptionToken().getRecordCounter();
|
||||
this.source = loader.getResumptionToken().resumptionURL(this.source);
|
||||
if (this.source == null) {
|
||||
this.message = "import terminated with source = null";
|
||||
break;
|
||||
}
|
||||
this.message = "loading next resumption fragment, cursor = " + reader.getResumptionToken().getCursor();
|
||||
this.message = "loading next resumption fragment, cursor = " + loader.getResumptionToken().getCursor();
|
||||
} catch (IOException e) {
|
||||
this.message = e.getMessage();
|
||||
break;
|
||||
|
@ -155,7 +138,7 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable<OAIPM
|
|||
}
|
||||
this.finishTime = System.currentTimeMillis();
|
||||
runningJobs.remove(this);
|
||||
finishedJobs.add(this);
|
||||
finishedJobs.put(this, N);
|
||||
}
|
||||
|
||||
|
||||
|
@ -185,7 +168,7 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable<OAIPM
|
|||
File surrogatesIn,
|
||||
File surrogatesOut,
|
||||
long staleLimit) {
|
||||
Set<String> plainList = getAllListedOAIServer(loader);
|
||||
Set<String> plainList = OAIListFriendsLoader.load(loader).keySet();
|
||||
Map<String, Date> loaded = getLoadedOAIServer(surrogatesIn, surrogatesOut);
|
||||
long limit = System.currentTimeMillis() - staleLimit;
|
||||
for (Map.Entry<String, Date> a: loaded.entrySet()) {
|
||||
|
@ -193,47 +176,6 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable<OAIPM
|
|||
}
|
||||
return plainList;
|
||||
}
|
||||
|
||||
/**
|
||||
* use the list server at http://roar.eprints.org/index.php?action=csv
|
||||
* to produce a list of OAI-PMH sources
|
||||
* @param loader
|
||||
* @return the list of oai-pmh sources
|
||||
*/
|
||||
public static Set<String> getAllListedOAIServer(LoaderDispatcher loader) {
|
||||
TreeSet<String> list = new TreeSet<String>();
|
||||
|
||||
// read roar
|
||||
File roar = new File(Switchboard.getSwitchboard().dictionariesPath, "harvesting/roar.csv");
|
||||
DigestURI roarSource;
|
||||
try {
|
||||
roarSource = new DigestURI("http://roar.eprints.org/index.php?action=csv", null);
|
||||
} catch (MalformedURLException e) {
|
||||
Log.logException(e);
|
||||
roarSource = null;
|
||||
}
|
||||
if (!roar.exists()) try {
|
||||
// load the file from the net
|
||||
loader.load(roarSource, CrawlProfile.CACHE_STRATEGY_NOCACHE, roar);
|
||||
} catch (IOException e) {
|
||||
Log.logException(e);
|
||||
}
|
||||
if (roar.exists()) {
|
||||
csvParser parser = new csvParser();
|
||||
try {
|
||||
List<String[]> table = parser.getTable(roarSource, "", "UTF-8", new FileInputStream(roar));
|
||||
for (String[] row: table) {
|
||||
if (row.length > 2 && (row[2].startsWith("http://") || row[2].startsWith("https://"))) {
|
||||
list.add(row[2]);
|
||||
}
|
||||
}
|
||||
} catch (FileNotFoundException e) {
|
||||
Log.logException(e);
|
||||
}
|
||||
}
|
||||
|
||||
return list;
|
||||
}
|
||||
|
||||
/**
|
||||
* get a map for already loaded oai-pmh servers and their latest access date
|
||||
|
|
|
@ -1,32 +1,27 @@
|
|||
// OAIPMHReader
|
||||
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
||||
// first published 30.09.2009 on http://yacy.net
|
||||
//
|
||||
// This is a part of YaCy, a peer-to-peer based web search engine
|
||||
//
|
||||
// $LastChangedDate: 2009-09-23 23:26:14 +0200 (Mi, 23 Sep 2009) $
|
||||
// $LastChangedRevision: 6340 $
|
||||
// $LastChangedBy: low012 $
|
||||
//
|
||||
// LICENSE
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
/**
|
||||
* OAIPMHLoader
|
||||
* Copyright 2009 by Michael Peter Christen
|
||||
* First released 30.09.2009 at http://yacy.net
|
||||
*
|
||||
* This is a part of YaCy, a peer-to-peer based web search engine
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file COPYING.LESSER.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package net.yacy.document.importer;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
||||
|
@ -44,18 +39,18 @@ import de.anomic.crawler.retrieval.Response;
|
|||
// http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListRecords&metadataPrefix=oai_dc
|
||||
|
||||
|
||||
public class OAIPMHReader {
|
||||
public class OAIPMHLoader {
|
||||
|
||||
private final DigestURI source;
|
||||
private final ResumptionToken resumptionToken;
|
||||
|
||||
public OAIPMHReader(LoaderDispatcher loader, DigestURI source, File targetDir, String filePrefix) throws IOException {
|
||||
public OAIPMHLoader(LoaderDispatcher loader, DigestURI source, File targetDir, String filePrefix) throws IOException {
|
||||
this.source = source;
|
||||
|
||||
// load the file from the net
|
||||
Response response = loader.load(source, false, true, CrawlProfile.CACHE_STRATEGY_NOCACHE);
|
||||
byte[] b = response.getContent();
|
||||
this.resumptionToken = new ResumptionToken(new ByteArrayInputStream(b));
|
||||
this.resumptionToken = new ResumptionToken(b);
|
||||
File f1 = new File(targetDir, OAIPMHImporter.filename4Source(source));
|
||||
File f0 = new File(targetDir, f1.getName() + ".tmp");
|
||||
|
|
@ -1,30 +1,28 @@
|
|||
// ResumptionToken
|
||||
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
||||
// first published 31.10.2009 on http://yacy.net
|
||||
//
|
||||
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
|
||||
// $LastChangedRevision: 1986 $
|
||||
// $LastChangedBy: orbiter $
|
||||
//
|
||||
// LICENSE
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
/**
|
||||
* ResumptionToken
|
||||
* Copyright 2009 by Michael Peter Christen
|
||||
* First released 31.10.2009 at http://yacy.net
|
||||
*
|
||||
* This is a part of YaCy, a peer-to-peer based web search engine
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file COPYING.LESSER.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package net.yacy.document.importer;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.text.Collator;
|
||||
|
@ -45,7 +43,7 @@ import net.yacy.kelondro.data.meta.DigestURI;
|
|||
import net.yacy.kelondro.logging.Log;
|
||||
import net.yacy.kelondro.util.DateFormatter;
|
||||
|
||||
public class ResumptionToken extends TreeMap<String, String> {
|
||||
public class ResumptionToken extends TreeMap<String, String> {
|
||||
|
||||
private static final long serialVersionUID = -8389462290545629792L;
|
||||
|
||||
|
@ -58,10 +56,10 @@ public class ResumptionToken extends TreeMap<String, String> {
|
|||
|
||||
int recordCounter;
|
||||
|
||||
public ResumptionToken(final InputStream stream) throws IOException {
|
||||
public ResumptionToken(final byte[] b) throws IOException {
|
||||
super((Collator) insensitiveCollator.clone());
|
||||
this.recordCounter = 0;
|
||||
new Reader(stream);
|
||||
new Parser(b);
|
||||
}
|
||||
|
||||
public ResumptionToken(
|
||||
|
@ -206,7 +204,7 @@ public class ResumptionToken extends TreeMap<String, String> {
|
|||
}
|
||||
|
||||
// get a resumption token using a SAX xml parser from an input stream
|
||||
private class Reader extends DefaultHandler {
|
||||
private class Parser extends DefaultHandler {
|
||||
|
||||
// class variables
|
||||
private final StringBuilder buffer;
|
||||
|
@ -215,21 +213,24 @@ public class ResumptionToken extends TreeMap<String, String> {
|
|||
private InputStream stream;
|
||||
private Attributes atts;
|
||||
|
||||
public Reader(final InputStream stream) throws IOException {
|
||||
public Parser(final byte[] b) throws IOException {
|
||||
this.buffer = new StringBuilder();
|
||||
this.parsingValue = false;
|
||||
this.stream = stream;
|
||||
this.atts = null;
|
||||
final SAXParserFactory factory = SAXParserFactory.newInstance();
|
||||
this.stream = new ByteArrayInputStream(b);
|
||||
try {
|
||||
this.saxParser = factory.newSAXParser();
|
||||
this.saxParser.parse(this.stream, this);
|
||||
} catch (SAXException e) {
|
||||
Log.logException(e);
|
||||
Log.logWarning("ResumptionToken", "token was not parsed:\n" + new String(b));
|
||||
} catch (IOException e) {
|
||||
Log.logException(e);
|
||||
Log.logWarning("ResumptionToken", "token was not parsed:\n" + new String(b));
|
||||
} catch (ParserConfigurationException e) {
|
||||
Log.logException(e);
|
||||
Log.logWarning("ResumptionToken", "token was not parsed:\n" + new String(b));
|
||||
throw new IOException(e.getMessage());
|
||||
} finally {
|
||||
try {
|
||||
|
|
|
@ -171,6 +171,8 @@ public final class FileUtils {
|
|||
* @see #copy(File source, File dest)
|
||||
*/
|
||||
public static void copy(final InputStream source, final File dest, final long count) throws IOException {
|
||||
String path = dest.getParent();
|
||||
if (path != null && path.length() > 0) new File(path).mkdirs();
|
||||
FileOutputStream fos = null;
|
||||
try {
|
||||
fos = new FileOutputStream(dest);
|
||||
|
|
|
@ -84,7 +84,7 @@ public final class ScoreCluster<E> {
|
|||
public synchronized void shrinkToMinScore(int minScore) {
|
||||
int score;
|
||||
Long key;
|
||||
while (true) {
|
||||
while (keyrefDB.size() > 0) {
|
||||
// find and remove objects where their score is smaller than the demanded minimum score
|
||||
key = keyrefDB.firstKey();
|
||||
if (key == null) break;
|
||||
|
|
|
@ -31,6 +31,7 @@ import java.io.File;
|
|||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.Writer;
|
||||
import java.net.MalformedURLException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Date;
|
||||
import java.util.HashSet;
|
||||
|
@ -454,4 +455,29 @@ public final class LoaderDispatcher {
|
|||
if (System.currentTimeMillis() - e.getValue().longValue() > minDelay) i.remove();
|
||||
}
|
||||
}
|
||||
|
||||
public void loadIfNotExistBackground(String url, File cache) {
|
||||
new Loader(url, cache).start();
|
||||
}
|
||||
|
||||
private class Loader extends Thread {
|
||||
|
||||
private String url;
|
||||
private File cache;
|
||||
|
||||
public Loader(String url, File cache) {
|
||||
this.url = url;
|
||||
this.cache = cache;
|
||||
}
|
||||
|
||||
public void run() {
|
||||
if (this.cache.exists()) return;
|
||||
try {
|
||||
// load from the net
|
||||
Response response = load(new DigestURI(this.url), false, true, CrawlProfile.CACHE_STRATEGY_NOCACHE);
|
||||
byte[] b = response.getContent();
|
||||
FileUtils.copy(b, this.cache);
|
||||
} catch (MalformedURLException e) {} catch (IOException e) {}
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user