enhanced and fixed OAI-PMH import

- now importing OAI-PMH server list from two sources
- simultaneous import from several servers (even > 2000)
- check buttons on OAI-PMH server list to select multiple servers for import start
- it is possible to select all servers at once for import
- imported XML data is gzipped after import from surrogate reader

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6847 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
orbiter 2010-04-30 14:03:51 +00:00
parent c2098f9399
commit fc5efcc05a
17 changed files with 613 additions and 296 deletions

View File

@ -1,43 +1,43 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry excluding="env/|htdocsdefault/|proxymsg/|yacy/|env/|yacy/user/|yacy/user/|yacy/ui/|processing/domaingraph/applet/|processing/domaingraph/|api/|api/bookmarks/posts/|api/bookmarks/|api/util/|api/bookmarks/xbel/|api/bookmarks/tags/" kind="src" path="htroot"/>
<classpathentry kind="src" path="test"/>
<classpathentry excluding="user/|user/|ui/" kind="src" path="htroot/yacy"/>
<classpathentry kind="src" path="htroot/env"/>
<classpathentry kind="src" path="source"/>
<classpathentry kind="src" path="htroot/yacy/ui"/>
<classpathentry excluding="bookmarks/posts/|bookmarks/|util/|bookmarks/xbel/|bookmarks/tags/" kind="src" path="htroot/api"/>
<classpathentry kind="src" path="htroot/api/bookmarks/posts"/>
<classpathentry excluding="posts/|xbel/|tags/" kind="src" path="htroot/api/bookmarks"/>
<classpathentry kind="src" path="htroot/api/util"/>
<classpathentry kind="src" path="htroot/api/bookmarks/xbel"/>
<classpathentry kind="src" path="htroot/api/bookmarks/tags"/>
<classpathentry exported="true" kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
<classpathentry exported="true" kind="lib" path="lib/commons-httpclient-3.1.jar"/>
<classpathentry exported="true" kind="lib" path="lib/commons-logging-1.1.1.jar"/>
<classpathentry exported="true" kind="lib" path="lib/commons-io-1.4.jar"/>
<classpathentry exported="true" kind="lib" path="lib/commons-fileupload-1.2.1.jar"/>
<classpathentry exported="true" kind="lib" path="lib/servlet-api.jar"/>
<classpathentry exported="true" kind="lib" path="lib/xerces.jar"/>
<classpathentry exported="true" kind="lib" path="lib/bzip2.jar"/>
<classpathentry exported="true" kind="lib" path="lib/J7Zip-modified.jar"/>
<classpathentry exported="true" kind="lib" path="lib/webcat-0.1-swf.jar"/>
<classpathentry exported="true" kind="lib" path="lib/activation.jar"/>
<classpathentry exported="true" kind="lib" path="lib/commons-jxpath-1.3.jar"/>
<classpathentry exported="true" kind="lib" path="libt/junit-4.7.jar"/>
<classpathentry kind="lib" path="lib/fontbox-1.0.0.jar"/>
<classpathentry kind="lib" path="lib/pdfbox-1.0.0.jar"/>
<classpathentry kind="lib" path="lib/poi-3.6-20091214.jar"/>
<classpathentry kind="lib" path="lib/poi-scratchpad-3.6-20091214.jar"/>
<classpathentry kind="lib" path="lib/bcmail-jdk15-145.jar"/>
<classpathentry kind="lib" path="lib/bcprov-jdk15-145.jar"/>
<classpathentry kind="lib" path="lib/jsch-0.1.42.jar"/>
<classpathentry kind="lib" path="lib/jakarta-oro-2.0.8.jar"/>
<classpathentry kind="lib" path="lib/commons-codec-1.4.jar"/>
<classpathentry kind="lib" path="lib/log4j-1.2.15.jar"/>
<classpathentry kind="lib" path="lib/mysql-connector-java-5.1.12-bin.jar"/>
<classpathentry kind="lib" path="lib/jcifs-1.3.14.jar"/>
<classpathentry kind="lib" path="lib/metadata-extractor-2.4.0-beta-1.jar"/>
<classpathentry exported="true" kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/>
<classpathentry kind="output" path="gen"/>
</classpath>
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry excluding="env/|htdocsdefault/|proxymsg/|yacy/|env/|yacy/user/|yacy/user/|yacy/ui/|processing/domaingraph/applet/|processing/domaingraph/|api/|api/bookmarks/posts/|api/bookmarks/|api/util/|api/bookmarks/xbel/|api/bookmarks/tags/" kind="src" path="htroot"/>
<classpathentry kind="src" path="test"/>
<classpathentry excluding="user/|user/|ui/" kind="src" path="htroot/yacy"/>
<classpathentry kind="src" path="htroot/env"/>
<classpathentry kind="src" path="source"/>
<classpathentry kind="src" path="htroot/yacy/ui"/>
<classpathentry excluding="bookmarks/posts/|bookmarks/|util/|bookmarks/xbel/|bookmarks/tags/" kind="src" path="htroot/api"/>
<classpathentry kind="src" path="htroot/api/bookmarks/posts"/>
<classpathentry excluding="posts/|xbel/|tags/" kind="src" path="htroot/api/bookmarks"/>
<classpathentry kind="src" path="htroot/api/util"/>
<classpathentry kind="src" path="htroot/api/bookmarks/xbel"/>
<classpathentry kind="src" path="htroot/api/bookmarks/tags"/>
<classpathentry exported="true" kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
<classpathentry exported="true" kind="lib" path="lib/commons-httpclient-3.1.jar" sourcepath="/commons-httpclient-3.1/src"/>
<classpathentry exported="true" kind="lib" path="lib/commons-logging-1.1.1.jar"/>
<classpathentry exported="true" kind="lib" path="lib/commons-io-1.4.jar"/>
<classpathentry exported="true" kind="lib" path="lib/commons-fileupload-1.2.1.jar"/>
<classpathentry exported="true" kind="lib" path="lib/servlet-api.jar"/>
<classpathentry exported="true" kind="lib" path="lib/xerces.jar"/>
<classpathentry exported="true" kind="lib" path="lib/bzip2.jar"/>
<classpathentry exported="true" kind="lib" path="lib/J7Zip-modified.jar"/>
<classpathentry exported="true" kind="lib" path="lib/webcat-0.1-swf.jar"/>
<classpathentry exported="true" kind="lib" path="lib/activation.jar"/>
<classpathentry exported="true" kind="lib" path="lib/commons-jxpath-1.3.jar"/>
<classpathentry exported="true" kind="lib" path="libt/junit-4.7.jar"/>
<classpathentry kind="lib" path="lib/fontbox-1.0.0.jar"/>
<classpathentry kind="lib" path="lib/pdfbox-1.0.0.jar"/>
<classpathentry kind="lib" path="lib/poi-3.6-20091214.jar"/>
<classpathentry kind="lib" path="lib/poi-scratchpad-3.6-20091214.jar"/>
<classpathentry kind="lib" path="lib/bcmail-jdk15-145.jar"/>
<classpathentry kind="lib" path="lib/bcprov-jdk15-145.jar"/>
<classpathentry kind="lib" path="lib/jsch-0.1.42.jar"/>
<classpathentry kind="lib" path="lib/jakarta-oro-2.0.8.jar"/>
<classpathentry kind="lib" path="lib/commons-codec-1.4.jar"/>
<classpathentry kind="lib" path="lib/log4j-1.2.15.jar"/>
<classpathentry kind="lib" path="lib/mysql-connector-java-5.1.12-bin.jar"/>
<classpathentry kind="lib" path="lib/jcifs-1.3.14.jar"/>
<classpathentry kind="lib" path="lib/metadata-extractor-2.4.0-beta-1.jar"/>
<classpathentry exported="true" kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/>
<classpathentry kind="output" path="gen"/>
</classpath>

View File

@ -3,7 +3,7 @@ javacSource=1.5
javacTarget=1.5
# Release Configuration
releaseVersion=0.94
releaseVersion=0.95
stdReleaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
sourceReleaseFile=yacy_src_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
releaseFileParentDir=yacy

View File

@ -187,8 +187,8 @@ public class CrawlResults {
entry = i.next();
try {
urle = sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).load(entry.getKey().getBytes(), null, 0);
if(urle == null) {
Log.logWarning("PLASMA", "CrawlResults: URL not in index for crawl result "+ i +" with hash "+ entry.getKey());
if (urle == null) {
Log.logWarning("PLASMA", "CrawlResults: URL not in index with url hash "+ entry.getKey());
urlstr = null;
urltxt = null;
metadata = null;

View File

@ -4,20 +4,49 @@
<title>YaCy '#[clientname]#': OAI-PMH source import list</title>
#%env/templates/metas.template%#
#(refresh)#::<meta http-equiv="REFRESH" content="6" />#(/refresh)#
<script>
<!--
/**
 * Switch all row check boxes of a form on or off at once.
 *
 * The form must contain an element named "num" holding the number of rows
 * and one check box per row named "item_0" .. "item_(num-1)".
 *
 * If check is truthy but at least one row is already checked, the call
 * clears all rows instead; this makes a single control act as a toggle.
 *
 * @param name  name of the form that holds the check boxes
 * @param check requested state for all check boxes
 */
function setall(name, check) {
    var selectForm = document.forms.namedItem(name);
    var items = selectForm.elements;
    // the row count arrives as text from the hidden "num" field
    var count = parseInt(items["num"].value, 10);
    var i; // declared locally; the loops must not create a global index
    var box;
    if (check) {
        for (i = 0; i < count; i++) {
            box = items["item_" + i];
            if (box && box.checked) {
                // something is selected already: toggle to "clear all"
                check = false;
                break;
            }
        }
    }
    for (i = 0; i < count; i++) {
        box = items["item_" + i];
        // skip rows that are missing from the form instead of failing
        if (box) {
            box.checked = check;
        }
    }
}
-->
</script>
<script src="/js/sorttable.js"></script>
</head>
<body>
#(source)#::
<h3>OAI Source List</h3>
<h3>List of #[num]# OAI-PMH Servers</h3>
<form action="IndexImportOAIPMH_p.html" target="_top" method="post" enctype="multipart/form-data" accept-charset="UTF-8" name="oaipmhimport">
<p>
<input type="hidden" name="num" value="#[num]#" />
<input type="submit" name="loadrows" value="Load Selected Sources" />
</p>
<table cellpadding="2" cellspacing="1" >
<tr class="TableHeader">
<td><input type="checkbox" name="allswitch" onclick="setall(this.form.name, this.value)" /></td>
<td>Source</td>
</tr>
#{table}#
<tr class="TableCell#(dark)#Light::Dark#(/dark)#">
<td>#[source]#</td>
<td align="left"><input type="checkbox" name="item_#[count]#" value="mark_#[source]#" /></td>
<td>#[loadurl]#</td>
</tr>
#{/table}#
</table>
<p>
<input type="submit" name="loadrows" value="Load Selected Sources" />
</p>
</form>
#(/source)#
#(import)#::

View File

@ -25,6 +25,7 @@
import java.util.ArrayList;
import java.util.Set;
import net.yacy.document.importer.OAIListFriendsLoader;
import net.yacy.document.importer.OAIPMHImporter;
import de.anomic.http.server.RequestHeader;
@ -43,39 +44,42 @@ public class IndexImportOAIPMHList_p {
prop.put("source", 0);
if (post != null && post.containsKey("source")) {
Set<String> oaiRoots = OAIPMHImporter.getAllListedOAIServer(sb.loader);
Set<String> oaiRoots = OAIListFriendsLoader.load(sb.loader).keySet();
boolean dark = false;
int cnt = 0;
int count = 0;
for (String root: oaiRoots) {
prop.put("source_table_" + cnt + "_dark", (dark) ? "1" : "0");
prop.put("source_table_" + cnt + "_source", "<a href=\"/IndexImportOAIPMH_p.html?importroot=&urlstartall=" + root + "\" target=\"_top\">" + root+ "</a>");
prop.put("source_table_" + count + "_dark", (dark) ? "1" : "0");
prop.put("source_table_" + count + "_count", count);
prop.put("source_table_" + count + "_source", root);
prop.put("source_table_" + count + "_loadurl", "<a href=\"/IndexImportOAIPMH_p.html?urlstart=" + root + "\" target=\"_top\">" + root + "</a>");
dark = !dark;
cnt++;
count++;
}
prop.put("source_table", cnt);
prop.put("source_table", count);
prop.put("source_num", count);
prop.put("source", 1);
}
if (post != null && post.containsKey("import")) {
ArrayList<OAIPMHImporter> jobs = new ArrayList<OAIPMHImporter>();
for (OAIPMHImporter job: OAIPMHImporter.runningJobs) jobs.add(job);
for (OAIPMHImporter job: OAIPMHImporter.startedJobs) jobs.add(job);
for (OAIPMHImporter job: OAIPMHImporter.finishedJobs) jobs.add(job);
for (OAIPMHImporter job: OAIPMHImporter.runningJobs.keySet()) jobs.add(job);
for (OAIPMHImporter job: OAIPMHImporter.startedJobs.keySet()) jobs.add(job);
for (OAIPMHImporter job: OAIPMHImporter.finishedJobs.keySet()) jobs.add(job);
boolean dark = false;
int cnt = 0;
int count = 0;
for (OAIPMHImporter job: jobs) {
prop.put("import_table_" + cnt + "_dark", (dark) ? "1" : "0");
prop.put("import_table_" + cnt + "_thread", (job.isAlive()) ? "<img src=\"/env/grafics/loading.gif\" alt=\"running\" />" : "finished");
prop.put("import_table_" + cnt + "_source", job.source());
prop.put("import_table_" + cnt + "_chunkCount", job.chunkCount());
prop.put("import_table_" + cnt + "_recordsCount", job.count());
prop.put("import_table_" + cnt + "_speed", job.speed());
prop.put("import_table_" + count + "_dark", (dark) ? "1" : "0");
prop.put("import_table_" + count + "_thread", (job.isAlive()) ? "<img src=\"/env/grafics/loading.gif\" alt=\"running\" />" : "finished");
prop.put("import_table_" + count + "_source", job.source());
prop.put("import_table_" + count + "_chunkCount", job.chunkCount());
prop.put("import_table_" + count + "_recordsCount", job.count());
prop.put("import_table_" + count + "_speed", job.speed());
dark = !dark;
cnt++;
count++;
}
prop.put("import_table", cnt);
prop.put("import_table", count);
prop.put("import", 1);
prop.put("refresh", 1);
}

View File

@ -33,7 +33,7 @@
<fieldset>
<legend>Import all Records from a server</legend>
Import all records that follow according to resumption elements into index<br />
<input name="urlstartall" type="text" value="" size="80" />
<input name="urlstart" type="text" value="" size="80" />
<input name="importroot" type="submit" value="import this source" />
#(optiongetlist)#::or&nbsp;<input name="getlist" type="submit" value="import from a list" />#(/optiongetlist)#
#(status)#::<p>Import started!</p>::<p>Bad input data: #[message]# </p>#(/status)#

View File

@ -24,9 +24,13 @@
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Map;
import java.util.Random;
import java.util.TreeSet;
import net.yacy.document.importer.OAIPMHImporter;
import net.yacy.document.importer.OAIPMHReader;
import net.yacy.document.importer.OAIPMHLoader;
import net.yacy.document.importer.ResumptionToken;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
@ -55,7 +59,7 @@ public class IndexImportOAIPMH_p {
DigestURI url = null;
try {
url = new DigestURI(oaipmhurl, null);
OAIPMHReader r = new OAIPMHReader(sb.loader, url, sb.surrogatesInPath, "oaipmh-one");
OAIPMHLoader r = new OAIPMHLoader(sb.loader, url, sb.surrogatesInPath, "oaipmh-one");
ResumptionToken rt = r.getResumptionToken();
prop.put("import-one", 1);
prop.put("import-one_count", (rt == null) ? "not available" : Integer.toString(rt.getRecordCounter()));
@ -83,8 +87,8 @@ public class IndexImportOAIPMH_p {
}
}
if (post.containsKey("importroot")) {
String oaipmhurl = post.get("urlstartall", "");
if (post.get("urlstart", "").length() > 0) {
String oaipmhurl = post.get("urlstart", "");
DigestURI url = null;
try {
url = new DigestURI(oaipmhurl, null);
@ -100,6 +104,38 @@ public class IndexImportOAIPMH_p {
}
}
if (post.get("loadrows", "").length() > 0) {
// create a time-ordered list of events to execute
TreeSet<String> sources = new TreeSet<String>();
for (Map.Entry<String, String> entry: post.entrySet()) {
if (entry.getValue().startsWith("mark_")) {
sources.add(entry.getValue().substring(5));
}
}
prop.put("status", 1);
prop.put("optiongetlist", 1);
prop.put("iframetype", 1);
// prepare the set for random read from it (to protect the servers at the beginning of the list)
ArrayList<String> sourceList = new ArrayList<String>(sources.size());
for (String oaipmhurl: sources) sourceList.add(oaipmhurl);
Random r = new Random(System.currentTimeMillis());
// start jobs for the sources
DigestURI url = null;
while (sourceList.size() > 0) {
String oaipmhurl = sourceList.remove(r.nextInt(sourceList.size()));
try {
url = new DigestURI(oaipmhurl, null);
OAIPMHImporter job = new OAIPMHImporter(sb.loader, url);
job.start();
} catch (MalformedURLException e) {
Log.logException(e);
}
}
}
if (post.containsKey("getlist")) {
prop.put("iframetype", 2);
}

View File

@ -37,9 +37,16 @@
package de.anomic.search;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.security.NoSuchAlgorithmException;
@ -60,6 +67,10 @@ import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Semaphore;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
@ -68,6 +79,7 @@ import net.yacy.document.ParserException;
import net.yacy.document.content.DCEntry;
import net.yacy.document.content.RSSMessage;
import net.yacy.document.content.SurrogateReader;
import net.yacy.document.importer.OAIListFriendsLoader;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.document.parser.xml.RSSFeed;
import net.yacy.kelondro.data.meta.DigestURI;
@ -474,6 +486,7 @@ public final class Switchboard extends serverSwitch {
// start a loader
log.logConfig("Starting Crawl Loader");
this.loader = new LoaderDispatcher(this);
OAIListFriendsLoader.init(this.loader);
this.crawlQueues = new CrawlQueues(this, queuesRoot);
this.crawlQueues.noticeURL.setMinimumDelta(
this.getConfigLong("minimumLocalDelta", this.crawlQueues.noticeURL.getMinimumLocalDelta()),
@ -1236,58 +1249,106 @@ public final class Switchboard extends serverSwitch {
}
public boolean processSurrogate(final String s) {
File surrogateFile = new File(this.surrogatesInPath, s);
File infile = new File(this.surrogatesInPath, s);
if (!infile.exists() || !infile.canWrite() || !infile.canRead()) return false;
File outfile = new File(this.surrogatesOutPath, s);
if (!surrogateFile.exists() || !surrogateFile.canWrite() || !surrogateFile.canRead()) return false;
if (outfile.exists()) return false;
boolean moved = false;
try {
SurrogateReader reader = new SurrogateReader(new BufferedInputStream(new FileInputStream(surrogateFile)), 3);
Thread readerThread = new Thread(reader, "Surrogate-Reader " + surrogateFile.getAbsolutePath());
readerThread.start();
DCEntry surrogate;
Response response;
while ((surrogate = reader.take()) != DCEntry.poison) {
// check if url is in accepted domain
assert surrogate != null;
assert crawlStacker != null;
final String urlRejectReason = crawlStacker.urlInAcceptedDomain(surrogate.getIdentifier());
if (urlRejectReason != null) {
if (this.log.isFine()) this.log.logInfo("Rejected URL '" + surrogate.getIdentifier() + "': " + urlRejectReason);
continue;
if (s.endsWith("xml.zip")) {
// open the zip file with all the xml files in it
try {
InputStream is = new BufferedInputStream(new FileInputStream(infile));
ZipInputStream zis = new ZipInputStream(is);
ZipEntry entry;
while ((entry = zis.getNextEntry()) != null) {
int size;
byte[] buffer = new byte[2048];
ByteArrayOutputStream baos = new ByteArrayOutputStream();
while ((size = zis.read(buffer, 0, buffer.length)) != -1) {
baos.write(buffer, 0, size);
}
baos.flush();
processSurrogate(new ByteArrayInputStream(baos.toByteArray()), entry.getName());
baos.close();
}
// create a queue entry
Document document = surrogate.document();
Request request = new Request(
peers.mySeed().hash.getBytes(),
surrogate.getIdentifier(),
null,
"",
new Date(),
new Date(),
this.crawler.defaultSurrogateProfile.handle(),
0,
0,
0
);
response = new Response(request, null, null, "200", this.crawler.defaultSurrogateProfile);
indexingQueueEntry queueEntry = new indexingQueueEntry(Segments.Process.SURROGATES, response, document, null);
// place the queue entry into the concurrent process of the condenser (document analysis)
try {
indexingCondensementProcessor.enQueue(queueEntry);
} catch (InterruptedException e) {
Log.logException(e);
break;
} catch (IOException e) {
Log.logException(e);
} finally {
moved = infile.renameTo(outfile);
}
return moved;
} else {
try {
InputStream is = new BufferedInputStream(new FileInputStream(infile));
if (s.endsWith(".gz")) is = new GZIPInputStream(is);
processSurrogate(is, infile.getName());
} catch (IOException e) {
Log.logException(e);
} finally {
moved = infile.renameTo(outfile);
if (moved) {
// check if this file is already compressed, if not, compress now
if (!outfile.getName().endsWith(".gz")) {
String gzname = outfile.getName() + ".gz";
File gzfile = new File(outfile.getParentFile(), gzname);
try {
OutputStream os = new BufferedOutputStream(new GZIPOutputStream(new FileOutputStream(gzfile)));
FileUtils.copy(new BufferedInputStream(new FileInputStream(outfile)), os);
os.close();
if (gzfile.exists()) FileUtils.deletedelete(outfile);
} catch (FileNotFoundException e) {
Log.logException(e);
} catch (IOException e) {
Log.logException(e);
}
}
}
}
} catch (IOException e) {
Log.logException(e);
} finally {
moved = surrogateFile.renameTo(outfile);
return moved;
}
}
public void processSurrogate(final InputStream is, String name) throws IOException {
SurrogateReader reader = new SurrogateReader(is, 3);
Thread readerThread = new Thread(reader, name);
readerThread.start();
DCEntry surrogate;
Response response;
while ((surrogate = reader.take()) != DCEntry.poison) {
// check if url is in accepted domain
assert surrogate != null;
assert crawlStacker != null;
final String urlRejectReason = crawlStacker.urlInAcceptedDomain(surrogate.getIdentifier());
if (urlRejectReason != null) {
if (this.log.isFine()) this.log.logInfo("Rejected URL '" + surrogate.getIdentifier() + "': " + urlRejectReason);
continue;
}
// create a queue entry
Document document = surrogate.document();
Request request = new Request(
peers.mySeed().hash.getBytes(),
surrogate.getIdentifier(),
null,
"",
new Date(),
new Date(),
this.crawler.defaultSurrogateProfile.handle(),
0,
0,
0
);
response = new Response(request, null, null, "200", this.crawler.defaultSurrogateProfile);
indexingQueueEntry queueEntry = new indexingQueueEntry(Segments.Process.SURROGATES, response, document, null);
// place the queue entry into the concurrent process of the condenser (document analysis)
try {
indexingCondensementProcessor.enQueue(queueEntry);
} catch (InterruptedException e) {
Log.logException(e);
break;
}
}
return moved;
}
public int surrogateQueueSize() {
@ -1326,7 +1387,7 @@ public final class Switchboard extends serverSwitch {
// check for interruption
checkInterruption();
if (surrogate.endsWith(".xml")) {
if (surrogate.endsWith(".xml") || surrogate.endsWith(".xml.gz") || surrogate.endsWith(".xml.zip")) {
// read the surrogate file and store entry in index
if (processSurrogate(surrogate)) return true;
}

View File

@ -1,3 +1,25 @@
/**
* Importer
* Copyright 2009 by Michael Peter Christen
* First released 29.04.2010 at http://yacy.net
*
* This is a part of YaCy, a peer-to-peer based web search engine
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file COPYING.LESSER.
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.document.importer;
public interface Importer extends Runnable {

View File

@ -1,28 +1,24 @@
// mediawikiIndex.java
// (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 20.11.2008 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
/**
* MediawikiImporter
* Copyright 2008 by Michael Peter Christen
* First released 20.11.2008 at http://yacy.net
*
* This is a part of YaCy, a peer-to-peer based web search engine
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file COPYING.LESSER.
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.document.importer;

View File

@ -0,0 +1,203 @@
/**
* OAIListFriendsLoader
* Copyright 2010 by Michael Peter Christen
* First released 29.04.2010 at http://yacy.net
*
* This is a part of YaCy, a peer-to-peer based web search engine
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file COPYING.LESSER.
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.document.importer;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.Map;
import java.util.TreeMap;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.repository.LoaderDispatcher;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.Response;
/**
 * Loads and parses OAI "ListFriends" documents, which enumerate the base URLs
 * of OAI-PMH servers. Two lists are used; each one is kept in a local cache file.
 */
public class OAIListFriendsLoader {

    // first list: source URL and the file it is cached in
    //private static String url10 = "http://roar.eprints.org/cgi/roar_search/advanced/export_roar_ROAR::ListFriends.xml?screen=ROAR%3A%3AFacetSearch&_action_export=1&output=ROAR%3A%3AListFriends&exp=1|1|-recordcount%2F-date|archive|-|-|eprint_status%3Aeprint_status%3AALL%3AEQ%3Aarchive|metadata_visibility%3Ametadata_visibility%3AALL%3AEX%3Ashow";
    private static final String url10 = "http://roar.eprints.org/cgi/roar_search/advanced/export_roar_ROAR::ListFriends.xml?_action_export=1&output=ROAR%3A%3AListFriends";
    private static final File cache10 = new File("DATA/DICTIONARIES/harvesting/export_roar_ROAR_ListFriends.xml");

    // second list: source URL and the file it is cached in
    private static final String url20 = "http://www.openarchives.org/Register/ListFriends";
    private static final File cache20 = new File("DATA/DICTIONARIES/harvesting/ListFriends.xml");

    /**
     * Ask the loader to fetch both ListFriends documents into their cache files
     * in the background if the files do not exist.
     * @param loader the loader that performs the background download
     */
    public static void init(LoaderDispatcher loader) {
        loader.loadIfNotExistBackground(url10, cache10);
        loader.loadIfNotExistBackground(url20, cache20);
    }

    /**
     * Read both cached ListFriends documents and merge them into one map.
     * Only the cache files are read; a list whose cache file is missing or
     * unreadable contributes no entries. On equal keys the second list wins.
     * @param loader not used by this method; the lists are read from the cache files only
     * @return a map from OAI-PMH base URL to the id given for it in the list; never null
     */
    public static Map<String, String> load(LoaderDispatcher loader) {
        final Map<String, String> all = loadCachedOrEmpty(cache10);
        all.putAll(loadCachedOrEmpty(cache20));
        return all;
    }

    /**
     * Parse one cache file, mapping every IOException to an empty result.
     * @param cacheFile the cached ListFriends document
     * @return the parsed map, or an empty map if the file could not be loaded
     */
    private static Map<String, String> loadCachedOrEmpty(final File cacheFile) {
        try {
            return load(null, null, cacheFile);
        } catch (IOException e) {
            return new TreeMap<String, String>();
        }
    }

    /**
     * Load an OAI ListFriends file from the net or from a cache location.
     * If the given file exists, it is read and parsed. Otherwise the document
     * is loaded from the source URL, stored in the file and then parsed.
     * The resulting map is a mapping from OAI-PMH start URL to a location description.
     * @param loader a LoaderDispatcher that loads the file if targetFile does not exist; may be null if the file exists
     * @param source the source URL for the OAI ListFriends file; may be null if the file exists
     * @param targetFile the file where the loaded content is stored if it does not exist, the source otherwise
     * @return a Map from OAI-PMH source to source description (which is usually also a URL)
     * @throws IOException if the file does not exist and cannot be loaded from the net
     */
    private static Map<String, String> load(LoaderDispatcher loader, DigestURI source, File targetFile) throws IOException {
        byte[] b;
        if (targetFile.exists()) {
            // load file
            b = FileUtils.read(targetFile);
        } else {
            // without a loader and a source there is nothing to load from;
            // fail with an IOException (which callers handle) instead of a NullPointerException
            if (loader == null || source == null) {
                throw new IOException("OAI ListFriends file " + targetFile + " does not exist and no loader or source was given");
            }
            // load from the net
            Response response = loader.load(source, false, true, CrawlProfile.CACHE_STRATEGY_NOCACHE);
            b = (response == null) ? null : response.getContent();
            if (b == null) {
                throw new IOException("no content loaded from " + source);
            }
            FileUtils.copy(b, targetFile);
        }
        return new Parser(b).map;
    }

    /**
     * SAX handler that collects all baseURL elements of a ListFriends document.
     * The element text becomes the map key, the "id" attribute the map value.
     */
    private static class Parser extends DefaultHandler {

        // text content of the baseURL element that is currently open
        private final StringBuilder buffer;
        // true while inside a baseURL element
        private boolean parsingValue;
        // "id" attribute of the baseURL element that is currently open
        private String currentId;
        // result: base URL -> id
        private final TreeMap<String, String> map;

        /**
         * Parse the given document. SAX and IO errors during parsing are logged
         * and leave the map with the entries collected so far.
         * NOTE(review): the parser is created with default settings; the input may
         * come from a remote server, so consider restricting DTDs/external entities.
         * @param b the raw ListFriends XML document
         * @throws IOException if no SAX parser can be configured
         */
        public Parser(final byte[] b) throws IOException {
            this.map = new TreeMap<String, String>();
            this.buffer = new StringBuilder();
            this.parsingValue = false;
            this.currentId = null;
            final InputStream stream = new ByteArrayInputStream(b);
            try {
                final SAXParser saxParser = SAXParserFactory.newInstance().newSAXParser();
                saxParser.parse(stream, this);
            } catch (SAXException e) {
                logParseFailure(e, b);
            } catch (IOException e) {
                logParseFailure(e, b);
            } catch (ParserConfigurationException e) {
                logParseFailure(e, b);
                // keep the original exception as cause of the IOException
                final IOException ioe = new IOException(e.getMessage());
                ioe.initCause(e);
                throw ioe;
            } finally {
                try {
                    stream.close();
                } catch (IOException e) {
                    Log.logException(e);
                }
            }
        }

        /** Log a parsing failure together with the document that failed. */
        private static void logParseFailure(final Exception e, final byte[] b) {
            Log.logException(e);
            Log.logWarning("OAIListFriendsLoader.Parser", "OAIListFriends was not parsed:\n" + new String(b));
        }

        /*
         <?xml version="1.0" encoding="UTF-8"?>
         <BaseURLs>
         <baseURL id="http://roar.eprints.org/id/eprint/102">http://research.nla.gov.au/oai</baseURL>
         <baseURL id="http://roar.eprints.org/id/eprint/174">http://oai.bibsys.no/repository</baseURL>
         <baseURL id="http://roar.eprints.org/id/eprint/1064">http://oai.repec.openlib.org/</baseURL>
         </BaseURLs>
         */

        @Override
        public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException {
            if ("baseURL".equals(tag)) {
                this.parsingValue = true;
                this.buffer.setLength(0);
                // copy the value now: the Attributes object is only valid during this callback
                this.currentId = atts.getValue("id");
            }
        }

        @Override
        public void endElement(final String uri, final String name, final String tag) {
            if ("baseURL".equals(tag)) {
                this.map.put(this.buffer.toString(), this.currentId);
                this.buffer.setLength(0);
                this.currentId = null;
                this.parsingValue = false;
            }
        }

        @Override
        public void characters(final char ch[], final int start, final int length) {
            if (this.parsingValue) {
                this.buffer.append(ch, start, length);
            }
        }
    }

    /**
     * Command line check: parse both cache files and print the number of entries
     * of each list, the size of the merged map and all merged keys.
     */
    public static void main(String[] args) {
        try {
            Map<String, String> map1 = load(null, null, cache10);
            int count1 = map1.size();
            Map<String, String> map2 = load(null, null, cache20);
            int count2 = map2.size();
            map1.putAll(map2);
            System.out.println("count1 = " + count1 + ", count2 = " + count2 + ", all = " + map1.size());
            for (Map.Entry<String, String> entry: map1.entrySet()) System.out.println(entry.getKey());
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

View File

@ -1,72 +1,55 @@
// OAIPMHImporter
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 30.09.2009 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2009-09-23 23:26:14 +0200 (Mi, 23 Sep 2009) $
// $LastChangedRevision: 6340 $
// $LastChangedBy: low012 $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
/**
* OAIPMHImporter
* Copyright 2009 by Michael Peter Christen
* First released 30.09.2009 at http://yacy.net
*
* This is a part of YaCy, a peer-to-peer based web search engine
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file COPYING.LESSER.
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.document.importer;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.MalformedURLException;
import java.text.ParseException;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.DateFormatter;
import net.yacy.repository.LoaderDispatcher;
import net.yacy.document.parser.csvParser;
import de.anomic.crawler.CrawlProfile;
import de.anomic.search.Switchboard;
// get one server with
// http://roar.eprints.org/index.php?action=csv
// or
// http://www.openarchives.org/Register/BrowseSites
// or
// http://www.openarchives.org/Register/ListFriends
//
// list records from oai-pmh like
// http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListRecords&metadataPrefix=oai_dc
public class OAIPMHImporter extends Thread implements Importer, Comparable<OAIPMHImporter> {
private static int importerCounter = Integer.MAX_VALUE;
private static Object N = new Object();
public static TreeSet<OAIPMHImporter> startedJobs = new TreeSet<OAIPMHImporter>();
public static TreeSet<OAIPMHImporter> runningJobs = new TreeSet<OAIPMHImporter>();
public static TreeSet<OAIPMHImporter> finishedJobs = new TreeSet<OAIPMHImporter>();
public static ConcurrentHashMap<OAIPMHImporter, Object> startedJobs = new ConcurrentHashMap<OAIPMHImporter, Object>();
public static ConcurrentHashMap<OAIPMHImporter, Object> runningJobs = new ConcurrentHashMap<OAIPMHImporter, Object>();
public static ConcurrentHashMap<OAIPMHImporter, Object> finishedJobs = new ConcurrentHashMap<OAIPMHImporter, Object>();
private final LoaderDispatcher loader;
private DigestURI source;
@ -95,7 +78,7 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable<OAIPM
// this should never happen
Log.logException(e);
}
startedJobs.add(this);
startedJobs.put(this, N);
}
public int count() {
@ -131,23 +114,23 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable<OAIPM
}
public void run() {
while (runningJobs.size() > 10) {
try {Thread.sleep(1000 + 1000 * System.currentTimeMillis() % 6);} catch (InterruptedException e) {}
while (runningJobs.size() > 50) {
try {Thread.sleep(10000 + 3000 * (System.currentTimeMillis() % 6));} catch (InterruptedException e) {}
}
startedJobs.remove(this);
runningJobs.add(this);
runningJobs.put(this, N);
this.message = "loading first part of records";
while (true) {
try {
OAIPMHReader reader = new OAIPMHReader(this.loader, this.source, Switchboard.getSwitchboard().surrogatesInPath, filenamePrefix);
OAIPMHLoader loader = new OAIPMHLoader(this.loader, this.source, Switchboard.getSwitchboard().surrogatesInPath, filenamePrefix);
this.chunkCount++;
this.recordsCount += reader.getResumptionToken().getRecordCounter();
this.source = reader.getResumptionToken().resumptionURL(this.source);
this.recordsCount += loader.getResumptionToken().getRecordCounter();
this.source = loader.getResumptionToken().resumptionURL(this.source);
if (this.source == null) {
this.message = "import terminated with source = null";
break;
}
this.message = "loading next resumption fragment, cursor = " + reader.getResumptionToken().getCursor();
this.message = "loading next resumption fragment, cursor = " + loader.getResumptionToken().getCursor();
} catch (IOException e) {
this.message = e.getMessage();
break;
@ -155,7 +138,7 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable<OAIPM
}
this.finishTime = System.currentTimeMillis();
runningJobs.remove(this);
finishedJobs.add(this);
finishedJobs.put(this, N);
}
@ -185,7 +168,7 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable<OAIPM
File surrogatesIn,
File surrogatesOut,
long staleLimit) {
Set<String> plainList = getAllListedOAIServer(loader);
Set<String> plainList = OAIListFriendsLoader.load(loader).keySet();
Map<String, Date> loaded = getLoadedOAIServer(surrogatesIn, surrogatesOut);
long limit = System.currentTimeMillis() - staleLimit;
for (Map.Entry<String, Date> a: loaded.entrySet()) {
@ -193,47 +176,6 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable<OAIPM
}
return plainList;
}
/**
 * Produces the set of OAI-PMH source URLs listed by the ROAR registry
 * (http://roar.eprints.org/index.php?action=csv).
 * <p>
 * The registry CSV is kept as {@code harvesting/roar.csv} below the switchboard's
 * dictionaries path; it is fetched from the net only if that file does not exist yet.
 * Every row whose third column starts with {@code http://} or {@code https://}
 * contributes that column to the result.
 *
 * @param loader the loader used to fetch the CSV when no local copy exists
 * @return the sorted set of OAI-PMH sources; empty if no CSV file is available
 */
public static Set<String> getAllListedOAIServer(LoaderDispatcher loader) {
    TreeSet<String> list = new TreeSet<String>();

    // read roar
    File roar = new File(Switchboard.getSwitchboard().dictionariesPath, "harvesting/roar.csv");
    DigestURI roarSource;
    try {
        roarSource = new DigestURI("http://roar.eprints.org/index.php?action=csv", null);
    } catch (MalformedURLException e) {
        Log.logException(e);
        roarSource = null;
    }

    // load the file from the net, but only if there is a valid source to load from
    if (!roar.exists() && roarSource != null) try {
        loader.load(roarSource, CrawlProfile.CACHE_STRATEGY_NOCACHE, roar);
    } catch (IOException e) {
        Log.logException(e);
    }

    if (roar.exists()) {
        csvParser parser = new csvParser();
        FileInputStream in = null;
        try {
            in = new FileInputStream(roar);
            List<String[]> table = parser.getTable(roarSource, "", "UTF-8", in);
            for (String[] row: table) {
                if (row.length > 2 && (row[2].startsWith("http://") || row[2].startsWith("https://"))) {
                    list.add(row[2]);
                }
            }
        } catch (FileNotFoundException e) {
            Log.logException(e);
        } finally {
            // always release the file handle, whether or not parsing succeeded
            if (in != null) try {
                in.close();
            } catch (IOException e) {
                Log.logException(e);
            }
        }
    }
    return list;
}
/**
* get a map for already loaded oai-pmh servers and their latest access date

View File

@ -1,32 +1,27 @@
// OAIPMHReader
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 30.09.2009 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2009-09-23 23:26:14 +0200 (Mi, 23 Sep 2009) $
// $LastChangedRevision: 6340 $
// $LastChangedBy: low012 $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
/**
* OAIPMHLoader
* Copyright 2009 by Michael Peter Christen
* First released 30.09.2009 at http://yacy.net
*
* This is a part of YaCy, a peer-to-peer based web search engine
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file COPYING.LESSER.
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.document.importer;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
@ -44,18 +39,18 @@ import de.anomic.crawler.retrieval.Response;
// http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListRecords&metadataPrefix=oai_dc
public class OAIPMHReader {
public class OAIPMHLoader {
private final DigestURI source;
private final ResumptionToken resumptionToken;
public OAIPMHReader(LoaderDispatcher loader, DigestURI source, File targetDir, String filePrefix) throws IOException {
public OAIPMHLoader(LoaderDispatcher loader, DigestURI source, File targetDir, String filePrefix) throws IOException {
this.source = source;
// load the file from the net
Response response = loader.load(source, false, true, CrawlProfile.CACHE_STRATEGY_NOCACHE);
byte[] b = response.getContent();
this.resumptionToken = new ResumptionToken(new ByteArrayInputStream(b));
this.resumptionToken = new ResumptionToken(b);
File f1 = new File(targetDir, OAIPMHImporter.filename4Source(source));
File f0 = new File(targetDir, f1.getName() + ".tmp");

View File

@ -1,30 +1,28 @@
// ResumptionToken
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 31.10.2009 on http://yacy.net
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
/**
* ResumptionToken
* Copyright 2009 by Michael Peter Christen
* First released 31.10.2009 at http://yacy.net
*
* This is a part of YaCy, a peer-to-peer based web search engine
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file COPYING.LESSER.
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.document.importer;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.text.Collator;
@ -45,7 +43,7 @@ import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.DateFormatter;
public class ResumptionToken extends TreeMap<String, String> {
public class ResumptionToken extends TreeMap<String, String> {
private static final long serialVersionUID = -8389462290545629792L;
@ -58,10 +56,10 @@ public class ResumptionToken extends TreeMap<String, String> {
int recordCounter;
public ResumptionToken(final InputStream stream) throws IOException {
public ResumptionToken(final byte[] b) throws IOException {
super((Collator) insensitiveCollator.clone());
this.recordCounter = 0;
new Reader(stream);
new Parser(b);
}
public ResumptionToken(
@ -206,7 +204,7 @@ public class ResumptionToken extends TreeMap<String, String> {
}
// get a resumption token using a SAX xml parser from an input stream
private class Reader extends DefaultHandler {
private class Parser extends DefaultHandler {
// class variables
private final StringBuilder buffer;
@ -215,21 +213,24 @@ public class ResumptionToken extends TreeMap<String, String> {
private InputStream stream;
private Attributes atts;
public Reader(final InputStream stream) throws IOException {
public Parser(final byte[] b) throws IOException {
this.buffer = new StringBuilder();
this.parsingValue = false;
this.stream = stream;
this.atts = null;
final SAXParserFactory factory = SAXParserFactory.newInstance();
this.stream = new ByteArrayInputStream(b);
try {
this.saxParser = factory.newSAXParser();
this.saxParser.parse(this.stream, this);
} catch (SAXException e) {
Log.logException(e);
Log.logWarning("ResumptionToken", "token was not parsed:\n" + new String(b));
} catch (IOException e) {
Log.logException(e);
Log.logWarning("ResumptionToken", "token was not parsed:\n" + new String(b));
} catch (ParserConfigurationException e) {
Log.logException(e);
Log.logWarning("ResumptionToken", "token was not parsed:\n" + new String(b));
throw new IOException(e.getMessage());
} finally {
try {

View File

@ -171,6 +171,8 @@ public final class FileUtils {
* @see #copy(File source, File dest)
*/
public static void copy(final InputStream source, final File dest, final long count) throws IOException {
String path = dest.getParent();
if (path != null && path.length() > 0) new File(path).mkdirs();
FileOutputStream fos = null;
try {
fos = new FileOutputStream(dest);

View File

@ -84,7 +84,7 @@ public final class ScoreCluster<E> {
public synchronized void shrinkToMinScore(int minScore) {
int score;
Long key;
while (true) {
while (keyrefDB.size() > 0) {
// find and remove objects where their score is smaller than the demanded minimum score
key = keyrefDB.firstKey();
if (key == null) break;

View File

@ -31,6 +31,7 @@ import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Writer;
import java.net.MalformedURLException;
import java.util.Arrays;
import java.util.Date;
import java.util.HashSet;
@ -454,4 +455,29 @@ public final class LoaderDispatcher {
if (System.currentTimeMillis() - e.getValue().longValue() > minDelay) i.remove();
}
}
/**
 * Starts a new thread that fetches {@code url} into the file {@code cache}
 * unless that file already exists. The call returns right after the thread
 * has been started.
 *
 * @param url   the address to load
 * @param cache the file that receives the loaded content
 */
public void loadIfNotExistBackground(String url, File cache) {
    final Loader backgroundLoader = new Loader(url, cache);
    backgroundLoader.start();
}
/**
 * Thread that downloads one URL into a cache file. Nothing is loaded when the
 * cache file already exists at the time the thread runs.
 */
private class Loader extends Thread {

    private final String url;
    private final File cache;

    /**
     * @param url   the address to load
     * @param cache the file that receives the loaded content
     */
    public Loader(String url, File cache) {
        this.url = url;
        this.cache = cache;
    }

    public void run() {
        if (!this.cache.exists()) {
            try {
                // load from the net
                final DigestURI source = new DigestURI(this.url);
                final Response response = load(source, false, true, CrawlProfile.CACHE_STRATEGY_NOCACHE);
                final byte[] content = response.getContent();
                FileUtils.copy(content, this.cache);
            } catch (IOException e) {
                // covers MalformedURLException too (it is an IOException);
                // a bad url or a failed download/write is dropped without any report
            }
        }
    }
}
}