From fc5efcc05a7817c2d5c2a7e1c1b958eb5bbc3d87 Mon Sep 17 00:00:00 2001 From: orbiter Date: Fri, 30 Apr 2010 14:03:51 +0000 Subject: [PATCH] enhanced and fixed OAI-PMH import - now importing OAI-PMH server list fron two sources - simultanous import from several servers (even > 2000) - check buttons on OAI-PMH server list to select multiple servers for import start - it is possible to select all servers at once for import - imported XML data is gzipped after import from surrogate reader git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6847 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- .classpath | 86 ++++---- build.properties | 2 +- htroot/CrawlResults.java | 4 +- htroot/IndexImportOAIPMHList_p.html | 33 ++- htroot/IndexImportOAIPMHList_p.java | 40 ++-- htroot/IndexImportOAIPMH_p.html | 2 +- htroot/IndexImportOAIPMH_p.java | 44 +++- source/de/anomic/search/Switchboard.java | 153 +++++++++---- .../net/yacy/document/importer/Importer.java | 22 ++ .../document/importer/MediawikiImporter.java | 46 ++-- .../importer/OAIListFriendsLoader.java | 203 ++++++++++++++++++ .../document/importer/OAIPMHImporter.java | 130 ++++------- .../{OAIPMHReader.java => OAIPMHLoader.java} | 53 +++-- .../document/importer/ResumptionToken.java | 61 +++--- source/net/yacy/kelondro/util/FileUtils.java | 2 + .../net/yacy/kelondro/util/ScoreCluster.java | 2 +- .../net/yacy/repository/LoaderDispatcher.java | 26 +++ 17 files changed, 613 insertions(+), 296 deletions(-) create mode 100644 source/net/yacy/document/importer/OAIListFriendsLoader.java rename source/net/yacy/document/importer/{OAIPMHReader.java => OAIPMHLoader.java} (87%) diff --git a/.classpath b/.classpath index 5094c4b77..12926a882 100644 --- a/.classpath +++ b/.classpath @@ -1,43 +1,43 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/build.properties b/build.properties index 9bfc3e391..05bc061e7 100644 --- a/build.properties +++ b/build.properties @@ -3,7 +3,7 @@ javacSource=1.5 javacTarget=1.5 # Release Configuration -releaseVersion=0.94 +releaseVersion=0.95 stdReleaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz sourceReleaseFile=yacy_src_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz releaseFileParentDir=yacy diff --git a/htroot/CrawlResults.java b/htroot/CrawlResults.java index f3c3f2f75..7efe7dd8c 100644 --- a/htroot/CrawlResults.java +++ b/htroot/CrawlResults.java @@ -187,8 +187,8 @@ public class CrawlResults { entry = i.next(); try { urle = sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).load(entry.getKey().getBytes(), null, 0); - if(urle == null) { - Log.logWarning("PLASMA", "CrawlResults: URL not in index for crawl result "+ i +" with hash "+ entry.getKey()); + if (urle == null) { + Log.logWarning("PLASMA", "CrawlResults: URL not in index with url hash "+ entry.getKey()); urlstr = null; urltxt = null; metadata = null; diff --git a/htroot/IndexImportOAIPMHList_p.html b/htroot/IndexImportOAIPMHList_p.html index 99b0756c3..168a1f85d 100644 --- a/htroot/IndexImportOAIPMHList_p.html +++ b/htroot/IndexImportOAIPMHList_p.html @@ -4,20 +4,49 @@ YaCy '#[clientname]#': OAI-PMH source import list #%env/templates/metas.template%# #(refresh)#::#(/refresh)# + + #(source)#:: -

OAI Source List

+

List of #[num]# OAI-PMH Servers

+
+

+ + +

+ #{table}# - + + #{/table}#
Source
#[source]##[loadurl]#
+

+ +

+
#(/source)# #(import)#:: diff --git a/htroot/IndexImportOAIPMHList_p.java b/htroot/IndexImportOAIPMHList_p.java index 9c2dd257b..ceba573a2 100644 --- a/htroot/IndexImportOAIPMHList_p.java +++ b/htroot/IndexImportOAIPMHList_p.java @@ -25,6 +25,7 @@ import java.util.ArrayList; import java.util.Set; +import net.yacy.document.importer.OAIListFriendsLoader; import net.yacy.document.importer.OAIPMHImporter; import de.anomic.http.server.RequestHeader; @@ -43,39 +44,42 @@ public class IndexImportOAIPMHList_p { prop.put("source", 0); if (post != null && post.containsKey("source")) { - Set oaiRoots = OAIPMHImporter.getAllListedOAIServer(sb.loader); + Set oaiRoots = OAIListFriendsLoader.load(sb.loader).keySet(); boolean dark = false; - int cnt = 0; + int count = 0; for (String root: oaiRoots) { - prop.put("source_table_" + cnt + "_dark", (dark) ? "1" : "0"); - prop.put("source_table_" + cnt + "_source", "" + root+ ""); + prop.put("source_table_" + count + "_dark", (dark) ? "1" : "0"); + prop.put("source_table_" + count + "_count", count); + prop.put("source_table_" + count + "_source", root); + prop.put("source_table_" + count + "_loadurl", "" + root + ""); dark = !dark; - cnt++; + count++; } - prop.put("source_table", cnt); + prop.put("source_table", count); + prop.put("source_num", count); prop.put("source", 1); } if (post != null && post.containsKey("import")) { ArrayList jobs = new ArrayList(); - for (OAIPMHImporter job: OAIPMHImporter.runningJobs) jobs.add(job); - for (OAIPMHImporter job: OAIPMHImporter.startedJobs) jobs.add(job); - for (OAIPMHImporter job: OAIPMHImporter.finishedJobs) jobs.add(job); + for (OAIPMHImporter job: OAIPMHImporter.runningJobs.keySet()) jobs.add(job); + for (OAIPMHImporter job: OAIPMHImporter.startedJobs.keySet()) jobs.add(job); + for (OAIPMHImporter job: OAIPMHImporter.finishedJobs.keySet()) jobs.add(job); boolean dark = false; - int cnt = 0; + int count = 0; for (OAIPMHImporter job: jobs) { - prop.put("import_table_" + cnt + "_dark", (dark) ? "1" : "0"); - prop.put("import_table_" + cnt + "_thread", (job.isAlive()) ? "\"running\"" : "finished"); - prop.put("import_table_" + cnt + "_source", job.source()); - prop.put("import_table_" + cnt + "_chunkCount", job.chunkCount()); - prop.put("import_table_" + cnt + "_recordsCount", job.count()); - prop.put("import_table_" + cnt + "_speed", job.speed()); + prop.put("import_table_" + count + "_dark", (dark) ? "1" : "0"); + prop.put("import_table_" + count + "_thread", (job.isAlive()) ? "\"running\"" : "finished"); + prop.put("import_table_" + count + "_source", job.source()); + prop.put("import_table_" + count + "_chunkCount", job.chunkCount()); + prop.put("import_table_" + count + "_recordsCount", job.count()); + prop.put("import_table_" + count + "_speed", job.speed()); dark = !dark; - cnt++; + count++; } - prop.put("import_table", cnt); + prop.put("import_table", count); prop.put("import", 1); prop.put("refresh", 1); } diff --git a/htroot/IndexImportOAIPMH_p.html b/htroot/IndexImportOAIPMH_p.html index 325bedf01..be20d2527 100644 --- a/htroot/IndexImportOAIPMH_p.html +++ b/htroot/IndexImportOAIPMH_p.html @@ -33,7 +33,7 @@
Import all Records from a server Import all records that follow according to resumption elements into index
- + #(optiongetlist)#::or #(/optiongetlist)# #(status)#::

Import started!

::

Bad input data: #[message]#

#(/status)# diff --git a/htroot/IndexImportOAIPMH_p.java b/htroot/IndexImportOAIPMH_p.java index 55a62dcec..dc6985979 100644 --- a/htroot/IndexImportOAIPMH_p.java +++ b/htroot/IndexImportOAIPMH_p.java @@ -24,9 +24,13 @@ import java.io.IOException; import java.net.MalformedURLException; +import java.util.ArrayList; +import java.util.Map; +import java.util.Random; +import java.util.TreeSet; import net.yacy.document.importer.OAIPMHImporter; -import net.yacy.document.importer.OAIPMHReader; +import net.yacy.document.importer.OAIPMHLoader; import net.yacy.document.importer.ResumptionToken; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; @@ -55,7 +59,7 @@ public class IndexImportOAIPMH_p { DigestURI url = null; try { url = new DigestURI(oaipmhurl, null); - OAIPMHReader r = new OAIPMHReader(sb.loader, url, sb.surrogatesInPath, "oaipmh-one"); + OAIPMHLoader r = new OAIPMHLoader(sb.loader, url, sb.surrogatesInPath, "oaipmh-one"); ResumptionToken rt = r.getResumptionToken(); prop.put("import-one", 1); prop.put("import-one_count", (rt == null) ? "not available" : Integer.toString(rt.getRecordCounter())); @@ -83,8 +87,8 @@ public class IndexImportOAIPMH_p { } } - if (post.containsKey("importroot")) { - String oaipmhurl = post.get("urlstartall", ""); + if (post.get("urlstart", "").length() > 0) { + String oaipmhurl = post.get("urlstart", ""); DigestURI url = null; try { url = new DigestURI(oaipmhurl, null); @@ -100,6 +104,38 @@ public class IndexImportOAIPMH_p { } } + + if (post.get("loadrows", "").length() > 0) { + // create a time-ordered list of events to execute + TreeSet sources = new TreeSet(); + for (Map.Entry entry: post.entrySet()) { + if (entry.getValue().startsWith("mark_")) { + sources.add(entry.getValue().substring(5)); + } + } + prop.put("status", 1); + prop.put("optiongetlist", 1); + prop.put("iframetype", 1); + + // prepare the set for random read from it (to protect the servers at the beginning of the list) + ArrayList sourceList = new ArrayList(sources.size()); + for (String oaipmhurl: sources) sourceList.add(oaipmhurl); + Random r = new Random(System.currentTimeMillis()); + + // start jobs for the sources + DigestURI url = null; + while (sourceList.size() > 0) { + String oaipmhurl = sourceList.remove(r.nextInt(sourceList.size())); + try { + url = new DigestURI(oaipmhurl, null); + OAIPMHImporter job = new OAIPMHImporter(sb.loader, url); + job.start(); + } catch (MalformedURLException e) { + Log.logException(e); + } + } + } + if (post.containsKey("getlist")) { prop.put("iframetype", 2); } diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index 9c704452f..12c64ff69 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -37,9 +37,16 @@ package de.anomic.search; import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.security.NoSuchAlgorithmException; @@ -60,6 +67,10 @@ import java.util.TreeSet; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.Semaphore; import java.util.regex.Pattern; +import java.util.zip.GZIPInputStream; +import java.util.zip.GZIPOutputStream; +import java.util.zip.ZipEntry; +import java.util.zip.ZipInputStream; import net.yacy.document.Condenser; import net.yacy.document.Document; @@ -68,6 +79,7 @@ import net.yacy.document.ParserException; import net.yacy.document.content.DCEntry; import net.yacy.document.content.RSSMessage; import net.yacy.document.content.SurrogateReader; +import net.yacy.document.importer.OAIListFriendsLoader; import net.yacy.document.parser.html.ImageEntry; import net.yacy.document.parser.xml.RSSFeed; import net.yacy.kelondro.data.meta.DigestURI; @@ -474,6 +486,7 @@ public final class Switchboard extends serverSwitch { // start a loader log.logConfig("Starting Crawl Loader"); this.loader = new LoaderDispatcher(this); + OAIListFriendsLoader.init(this.loader); this.crawlQueues = new CrawlQueues(this, queuesRoot); this.crawlQueues.noticeURL.setMinimumDelta( this.getConfigLong("minimumLocalDelta", this.crawlQueues.noticeURL.getMinimumLocalDelta()), @@ -1236,58 +1249,106 @@ public final class Switchboard extends serverSwitch { } public boolean processSurrogate(final String s) { - File surrogateFile = new File(this.surrogatesInPath, s); + File infile = new File(this.surrogatesInPath, s); + if (!infile.exists() || !infile.canWrite() || !infile.canRead()) return false; File outfile = new File(this.surrogatesOutPath, s); - if (!surrogateFile.exists() || !surrogateFile.canWrite() || !surrogateFile.canRead()) return false; if (outfile.exists()) return false; boolean moved = false; - try { - SurrogateReader reader = new SurrogateReader(new BufferedInputStream(new FileInputStream(surrogateFile)), 3); - Thread readerThread = new Thread(reader, "Surrogate-Reader " + surrogateFile.getAbsolutePath()); - readerThread.start(); - DCEntry surrogate; - Response response; - while ((surrogate = reader.take()) != DCEntry.poison) { - // check if url is in accepted domain - assert surrogate != null; - assert crawlStacker != null; - final String urlRejectReason = crawlStacker.urlInAcceptedDomain(surrogate.getIdentifier()); - if (urlRejectReason != null) { - if (this.log.isFine()) this.log.logInfo("Rejected URL '" + surrogate.getIdentifier() + "': " + urlRejectReason); - continue; + if (s.endsWith("xml.zip")) { + // open the zip file with all the xml files in it + try { + InputStream is = new BufferedInputStream(new FileInputStream(infile)); + ZipInputStream zis = new ZipInputStream(is); + ZipEntry entry; + while ((entry = zis.getNextEntry()) != null) { + int size; + byte[] buffer = new byte[2048]; + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + while ((size = zis.read(buffer, 0, buffer.length)) != -1) { + baos.write(buffer, 0, size); + } + baos.flush(); + processSurrogate(new ByteArrayInputStream(baos.toByteArray()), entry.getName()); + baos.close(); } - - // create a queue entry - Document document = surrogate.document(); - Request request = new Request( - peers.mySeed().hash.getBytes(), - surrogate.getIdentifier(), - null, - "", - new Date(), - new Date(), - this.crawler.defaultSurrogateProfile.handle(), - 0, - 0, - 0 - ); - response = new Response(request, null, null, "200", this.crawler.defaultSurrogateProfile); - indexingQueueEntry queueEntry = new indexingQueueEntry(Segments.Process.SURROGATES, response, document, null); - - // place the queue entry into the concurrent process of the condenser (document analysis) - try { - indexingCondensementProcessor.enQueue(queueEntry); - } catch (InterruptedException e) { - Log.logException(e); - break; + } catch (IOException e) { + Log.logException(e); + } finally { + moved = infile.renameTo(outfile); + } + return moved; + } else { + try { + InputStream is = new BufferedInputStream(new FileInputStream(infile)); + if (s.endsWith(".gz")) is = new GZIPInputStream(is); + processSurrogate(is, infile.getName()); + } catch (IOException e) { + Log.logException(e); + } finally { + moved = infile.renameTo(outfile); + if (moved) { + // check if this file is already compressed, if not, compress now + if (!outfile.getName().endsWith(".gz")) { + String gzname = outfile.getName() + ".gz"; + File gzfile = new File(outfile.getParentFile(), gzname); + try { + OutputStream os = new BufferedOutputStream(new GZIPOutputStream(new FileOutputStream(gzfile))); + FileUtils.copy(new BufferedInputStream(new FileInputStream(outfile)), os); + os.close(); + if (gzfile.exists()) FileUtils.deletedelete(outfile); + } catch (FileNotFoundException e) { + Log.logException(e); + } catch (IOException e) { + Log.logException(e); + } + } } } - } catch (IOException e) { - Log.logException(e); - } finally { - moved = surrogateFile.renameTo(outfile); + return moved; + } + } + + public void processSurrogate(final InputStream is, String name) throws IOException { + SurrogateReader reader = new SurrogateReader(is, 3); + Thread readerThread = new Thread(reader, name); + readerThread.start(); + DCEntry surrogate; + Response response; + while ((surrogate = reader.take()) != DCEntry.poison) { + // check if url is in accepted domain + assert surrogate != null; + assert crawlStacker != null; + final String urlRejectReason = crawlStacker.urlInAcceptedDomain(surrogate.getIdentifier()); + if (urlRejectReason != null) { + if (this.log.isFine()) this.log.logInfo("Rejected URL '" + surrogate.getIdentifier() + "': " + urlRejectReason); + continue; + } + + // create a queue entry + Document document = surrogate.document(); + Request request = new Request( + peers.mySeed().hash.getBytes(), + surrogate.getIdentifier(), + null, + "", + new Date(), + new Date(), + this.crawler.defaultSurrogateProfile.handle(), + 0, + 0, + 0 + ); + response = new Response(request, null, null, "200", this.crawler.defaultSurrogateProfile); + indexingQueueEntry queueEntry = new indexingQueueEntry(Segments.Process.SURROGATES, response, document, null); + + // place the queue entry into the concurrent process of the condenser (document analysis) + try { + indexingCondensementProcessor.enQueue(queueEntry); + } catch (InterruptedException e) { + Log.logException(e); + break; + } } - return moved; } public int surrogateQueueSize() { @@ -1326,7 +1387,7 @@ public final class Switchboard extends serverSwitch { // check for interruption checkInterruption(); - if (surrogate.endsWith(".xml")) { + if (surrogate.endsWith(".xml") || surrogate.endsWith(".xml.gz") || surrogate.endsWith(".xml.zip")) { // read the surrogate file and store entry in index if (processSurrogate(surrogate)) return true; } diff --git a/source/net/yacy/document/importer/Importer.java b/source/net/yacy/document/importer/Importer.java index 7d997f5d3..3d51bc666 100644 --- a/source/net/yacy/document/importer/Importer.java +++ b/source/net/yacy/document/importer/Importer.java @@ -1,3 +1,25 @@ +/** + * Importer + * Copyright 2009 by Michael Peter Christen + * First released 29.04.2010 at http://yacy.net + * + * This is a part of YaCy, a peer-to-peer based web search engine + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file COPYING.LESSER. + * If not, see . + */ + package net.yacy.document.importer; public interface Importer extends Runnable { diff --git a/source/net/yacy/document/importer/MediawikiImporter.java b/source/net/yacy/document/importer/MediawikiImporter.java index 29366218f..ca4bd5c60 100644 --- a/source/net/yacy/document/importer/MediawikiImporter.java +++ b/source/net/yacy/document/importer/MediawikiImporter.java @@ -1,28 +1,24 @@ -// mediawikiIndex.java -// (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany -// first published 20.11.2008 on http://yacy.net -// -// This is a part of YaCy, a peer-to-peer based web search engine -// -// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ -// $LastChangedRevision: 1986 $ -// $LastChangedBy: orbiter $ -// -// LICENSE -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +/** + * MediawikiImporter + * Copyright 2008 by Michael Peter Christen + * First released 20.11.2008 at http://yacy.net + * + * This is a part of YaCy, a peer-to-peer based web search engine + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file COPYING.LESSER. + * If not, see . + */ package net.yacy.document.importer; diff --git a/source/net/yacy/document/importer/OAIListFriendsLoader.java b/source/net/yacy/document/importer/OAIListFriendsLoader.java new file mode 100644 index 000000000..580a12866 --- /dev/null +++ b/source/net/yacy/document/importer/OAIListFriendsLoader.java @@ -0,0 +1,203 @@ +/** + * OAIListFriendsLoader + * Copyright 2010 by Michael Peter Christen + * First released 29.04.2010 at http://yacy.net + * + * This is a part of YaCy, a peer-to-peer based web search engine + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file COPYING.LESSER. + * If not, see . + */ + +package net.yacy.document.importer; + +import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.util.Map; +import java.util.TreeMap; + +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; + +import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.logging.Log; +import net.yacy.kelondro.util.FileUtils; +import net.yacy.repository.LoaderDispatcher; + +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +import de.anomic.crawler.CrawlProfile; +import de.anomic.crawler.retrieval.Response; + +public class OAIListFriendsLoader { + + private static final long serialVersionUID = -8705115274655024604L; + + //private static String url10 = "http://roar.eprints.org/cgi/roar_search/advanced/export_roar_ROAR::ListFriends.xml?screen=ROAR%3A%3AFacetSearch&_action_export=1&output=ROAR%3A%3AListFriends&exp=1|1|-recordcount%2F-date|archive|-|-|eprint_status%3Aeprint_status%3AALL%3AEQ%3Aarchive|metadata_visibility%3Ametadata_visibility%3AALL%3AEX%3Ashow"; + private static String url10 = "http://roar.eprints.org/cgi/roar_search/advanced/export_roar_ROAR::ListFriends.xml?_action_export=1&output=ROAR%3A%3AListFriends"; + private static File cache10 = new File("DATA/DICTIONARIES/harvesting/export_roar_ROAR_ListFriends.xml"); + private static String url20 = "http://www.openarchives.org/Register/ListFriends"; + private static File cache20 = new File("DATA/DICTIONARIES/harvesting/ListFriends.xml"); + + public static void init(LoaderDispatcher loader) { + loader.loadIfNotExistBackground(url10, cache10); + loader.loadIfNotExistBackground(url20, cache20); + } + + public static Map load(LoaderDispatcher loader) { + Map map10; + try { + map10 = load(null, null, new File("DATA/DICTIONARIES/harvesting/export_roar_ROAR_ListFriends.xml")); + } catch (IOException e) { + map10 = new TreeMap(); + } + + Map map20; + try { + map20 = load(null, null, new File("DATA/DICTIONARIES/harvesting/ListFriends.xml")); + } catch (IOException e) { + map20 = new TreeMap(); + } + + map10.putAll(map20); + return map10; + } + + /** + * load a OAI ListFriends file from the net or from a cache location + * If the given file does exist, the OAI ListFriends File is loaded and parsed. + * The resulting map is a mapping from OAI-PMH start url to a loaction description + * @param loader a LoaderDispatcher that loads the file if targetFile does not exist + * @param source the source URL for the OAI ListFriends file + * @param targetFile the file where the loaded content is stored if it does not exist, the source othervise + * @return a Map from OAI-PMH source to source description (which is usually also a URL) + * @throws IOException + */ + private static Map load(LoaderDispatcher loader, DigestURI source, File targetFile) throws IOException { + + byte[] b; + if (targetFile.exists()) { + // load file + b = FileUtils.read(targetFile); + } else { + // load from the net + Response response = loader.load(source, false, true, CrawlProfile.CACHE_STRATEGY_NOCACHE); + b = response.getContent(); + FileUtils.copy(b, targetFile); + } + + return new Parser(b).map; + } + + + // get a resumption token using a SAX xml parser from am input stream + private static class Parser extends DefaultHandler { + + // class variables + private final StringBuilder buffer; + private boolean parsingValue; + private SAXParser saxParser; + private InputStream stream; + private Attributes atts; + private int recordCounter; + private TreeMap map; + + public Parser(final byte[] b) throws IOException { + this.map = new TreeMap(); + this.recordCounter = 0; + this.buffer = new StringBuilder(); + this.parsingValue = false; + this.atts = null; + final SAXParserFactory factory = SAXParserFactory.newInstance(); + this.stream = new ByteArrayInputStream(b); + try { + this.saxParser = factory.newSAXParser(); + this.saxParser.parse(this.stream, this); + } catch (SAXException e) { + Log.logException(e); + Log.logWarning("OAIListFriendsLoader.Parser", "OAIListFriends was not parsed:\n" + new String(b)); + } catch (IOException e) { + Log.logException(e); + Log.logWarning("OAIListFriendsLoader.Parser", "OAIListFriends was not parsed:\n" + new String(b)); + } catch (ParserConfigurationException e) { + Log.logException(e); + Log.logWarning("OAIListFriendsLoader.Parser", "OAIListFriends was not parsed:\n" + new String(b)); + throw new IOException(e.getMessage()); + } finally { + try { + this.stream.close(); + } catch (IOException e) { + Log.logException(e); + } + } + } + + /* + + + http://research.nla.gov.au/oai + http://oai.bibsys.no/repository + http://oai.repec.openlib.org/ + + */ + + public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException { + if ("baseURL".equals(tag)) { + recordCounter++; + this.parsingValue = true; + this.atts = atts; + } + } + + public void endElement(final String uri, final String name, final String tag) { + if (tag == null) return; + if ("baseURL".equals(tag)) { + this.map.put(buffer.toString(), this.atts.getValue("id")); + this.buffer.setLength(0); + this.parsingValue = false; + } + } + + public void characters(final char ch[], final int start, final int length) { + if (parsingValue) { + buffer.append(ch, start, length); + } + } + + } + + public static void main(String[] args) { + try { + Map map1 = load(null, null, new File("DATA/DICTIONARIES/harvesting/export_roar_ROAR_ListFriends.xml")); + int count1 = map1.size(); + + Map map2 = load(null, null, new File("DATA/DICTIONARIES/harvesting/ListFriends.xml")); + int count2 = map2.size(); + + map1.putAll(map2); + System.out.println("count1 = " + count1 + ", count2 = " + count2 + ", all = " + map1.size()); + + for (Map.Entry entry: map1.entrySet()) System.out.println(entry.getKey()); + } catch (IOException e) { + e.printStackTrace(); + } + + } + +} diff --git a/source/net/yacy/document/importer/OAIPMHImporter.java b/source/net/yacy/document/importer/OAIPMHImporter.java index 66b6a093f..7fc5b453d 100644 --- a/source/net/yacy/document/importer/OAIPMHImporter.java +++ b/source/net/yacy/document/importer/OAIPMHImporter.java @@ -1,72 +1,55 @@ -// OAIPMHImporter -// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany -// first published 30.09.2009 on http://yacy.net -// -// This is a part of YaCy, a peer-to-peer based web search engine -// -// $LastChangedDate: 2009-09-23 23:26:14 +0200 (Mi, 23 Sep 2009) $ -// $LastChangedRevision: 6340 $ -// $LastChangedBy: low012 $ -// -// LICENSE -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +/** + * OAIPMHImporter + * Copyright 2009 by Michael Peter Christen + * First released 30.09.2009 at http://yacy.net + * + * This is a part of YaCy, a peer-to-peer based web search engine + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file COPYING.LESSER. + * If not, see . + */ package net.yacy.document.importer; import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; import java.io.IOException; import java.net.MalformedURLException; import java.text.ParseException; import java.util.Date; import java.util.HashMap; -import java.util.List; import java.util.Map; import java.util.Set; -import java.util.TreeSet; +import java.util.concurrent.ConcurrentHashMap; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.DateFormatter; import net.yacy.repository.LoaderDispatcher; -import net.yacy.document.parser.csvParser; -import de.anomic.crawler.CrawlProfile; import de.anomic.search.Switchboard; - -// get one server with -// http://roar.eprints.org/index.php?action=csv -// or -// http://www.openarchives.org/Register/BrowseSites -// or -// http://www.openarchives.org/Register/ListFriends -// // list records from oai-pmh like // http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListRecords&metadataPrefix=oai_dc - public class OAIPMHImporter extends Thread implements Importer, Comparable { private static int importerCounter = Integer.MAX_VALUE; + private static Object N = new Object(); - public static TreeSet startedJobs = new TreeSet(); - public static TreeSet runningJobs = new TreeSet(); - public static TreeSet finishedJobs = new TreeSet(); + public static ConcurrentHashMap startedJobs = new ConcurrentHashMap(); + public static ConcurrentHashMap runningJobs = new ConcurrentHashMap(); + public static ConcurrentHashMap finishedJobs = new ConcurrentHashMap(); private final LoaderDispatcher loader; private DigestURI source; @@ -95,7 +78,7 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable 10) { - try {Thread.sleep(1000 + 1000 * System.currentTimeMillis() % 6);} catch (InterruptedException e) {} + while (runningJobs.size() > 50) { + try {Thread.sleep(10000 + 3000 * (System.currentTimeMillis() % 6));} catch (InterruptedException e) {} } startedJobs.remove(this); - runningJobs.add(this); + runningJobs.put(this, N); this.message = "loading first part of records"; while (true) { try { - OAIPMHReader reader = new OAIPMHReader(this.loader, this.source, Switchboard.getSwitchboard().surrogatesInPath, filenamePrefix); + OAIPMHLoader loader = new OAIPMHLoader(this.loader, this.source, Switchboard.getSwitchboard().surrogatesInPath, filenamePrefix); this.chunkCount++; - this.recordsCount += reader.getResumptionToken().getRecordCounter(); - this.source = reader.getResumptionToken().resumptionURL(this.source); + this.recordsCount += loader.getResumptionToken().getRecordCounter(); + this.source = loader.getResumptionToken().resumptionURL(this.source); if (this.source == null) { this.message = "import terminated with source = null"; break; } - this.message = "loading next resumption fragment, cursor = " + reader.getResumptionToken().getCursor(); + this.message = "loading next resumption fragment, cursor = " + loader.getResumptionToken().getCursor(); } catch (IOException e) { this.message = e.getMessage(); break; @@ -155,7 +138,7 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable plainList = getAllListedOAIServer(loader); + Set plainList = OAIListFriendsLoader.load(loader).keySet(); Map loaded = getLoadedOAIServer(surrogatesIn, surrogatesOut); long limit = System.currentTimeMillis() - staleLimit; for (Map.Entry a: loaded.entrySet()) { @@ -193,47 +176,6 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable getAllListedOAIServer(LoaderDispatcher loader) { - TreeSet list = new TreeSet(); - - // read roar - File roar = new File(Switchboard.getSwitchboard().dictionariesPath, "harvesting/roar.csv"); - DigestURI roarSource; - try { - roarSource = new DigestURI("http://roar.eprints.org/index.php?action=csv", null); - } catch (MalformedURLException e) { - Log.logException(e); - roarSource = null; - } - if (!roar.exists()) try { - // load the file from the net - loader.load(roarSource, CrawlProfile.CACHE_STRATEGY_NOCACHE, roar); - } catch (IOException e) { - Log.logException(e); - } - if (roar.exists()) { - csvParser parser = new csvParser(); - try { - List table = parser.getTable(roarSource, "", "UTF-8", new FileInputStream(roar)); - for (String[] row: table) { - if (row.length > 2 && (row[2].startsWith("http://") || row[2].startsWith("https://"))) { - list.add(row[2]); - } - } - } catch (FileNotFoundException e) { - Log.logException(e); - } - } - - return list; - } /** * get a map for already loaded oai-pmh servers and their latest access date diff --git a/source/net/yacy/document/importer/OAIPMHReader.java b/source/net/yacy/document/importer/OAIPMHLoader.java similarity index 87% rename from source/net/yacy/document/importer/OAIPMHReader.java rename to source/net/yacy/document/importer/OAIPMHLoader.java index 7bd071625..40800b1bc 100644 --- a/source/net/yacy/document/importer/OAIPMHReader.java +++ b/source/net/yacy/document/importer/OAIPMHLoader.java @@ -1,32 +1,27 @@ -// OAIPMHReader -// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany -// first published 30.09.2009 on http://yacy.net -// -// This is a part of YaCy, a peer-to-peer based web search engine -// -// $LastChangedDate: 2009-09-23 23:26:14 +0200 (Mi, 23 Sep 2009) $ -// $LastChangedRevision: 6340 $ -// $LastChangedBy: low012 $ -// -// LICENSE -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +/** + * OAIPMHLoader + * Copyright 2009 by Michael Peter Christen + * First released 30.09.2009 at http://yacy.net + * + * This is a part of YaCy, a peer-to-peer based web search engine + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file COPYING.LESSER. + * If not, see . + */ package net.yacy.document.importer; -import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; @@ -44,18 +39,18 @@ import de.anomic.crawler.retrieval.Response; // http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListRecords&metadataPrefix=oai_dc -public class OAIPMHReader { +public class OAIPMHLoader { private final DigestURI source; private final ResumptionToken resumptionToken; - public OAIPMHReader(LoaderDispatcher loader, DigestURI source, File targetDir, String filePrefix) throws IOException { + public OAIPMHLoader(LoaderDispatcher loader, DigestURI source, File targetDir, String filePrefix) throws IOException { this.source = source; // load the file from the net Response response = loader.load(source, false, true, CrawlProfile.CACHE_STRATEGY_NOCACHE); byte[] b = response.getContent(); - this.resumptionToken = new ResumptionToken(new ByteArrayInputStream(b)); + this.resumptionToken = new ResumptionToken(b); File f1 = new File(targetDir, OAIPMHImporter.filename4Source(source)); File f0 = new File(targetDir, f1.getName() + ".tmp"); diff --git a/source/net/yacy/document/importer/ResumptionToken.java b/source/net/yacy/document/importer/ResumptionToken.java index a47a3f6f2..7cd2749c6 100644 --- a/source/net/yacy/document/importer/ResumptionToken.java +++ b/source/net/yacy/document/importer/ResumptionToken.java @@ -1,30 +1,28 @@ -// ResumptionToken -// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany -// first published 31.10.2009 on http://yacy.net -// -// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ -// $LastChangedRevision: 1986 $ -// $LastChangedBy: orbiter $ -// -// LICENSE -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - +/** + * ResumptionToken + * Copyright 2009 by Michael Peter Christen + * First released 31.10.2009 at http://yacy.net + * + * This is a part of YaCy, a peer-to-peer based web search engine + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file COPYING.LESSER. + * If not, see . + */ package net.yacy.document.importer; +import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.text.Collator; @@ -45,7 +43,7 @@ import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.DateFormatter; -public class ResumptionToken extends TreeMap { +public class ResumptionToken extends TreeMap { private static final long serialVersionUID = -8389462290545629792L; @@ -58,10 +56,10 @@ public class ResumptionToken extends TreeMap { int recordCounter; - public ResumptionToken(final InputStream stream) throws IOException { + public ResumptionToken(final byte[] b) throws IOException { super((Collator) insensitiveCollator.clone()); this.recordCounter = 0; - new Reader(stream); + new Parser(b); } public ResumptionToken( @@ -206,7 +204,7 @@ public class ResumptionToken extends TreeMap { } // get a resumption token using a SAX xml parser from am input stream - private class Reader extends DefaultHandler { + private class Parser extends DefaultHandler { // class variables private final StringBuilder buffer; @@ -215,21 +213,24 @@ public class ResumptionToken extends TreeMap { private InputStream stream; private Attributes atts; - public Reader(final InputStream stream) throws IOException { + public Parser(final byte[] b) throws IOException { this.buffer = new StringBuilder(); this.parsingValue = false; - this.stream = stream; this.atts = null; final SAXParserFactory factory = SAXParserFactory.newInstance(); + this.stream = new ByteArrayInputStream(b); try { this.saxParser = factory.newSAXParser(); this.saxParser.parse(this.stream, this); } catch (SAXException e) { Log.logException(e); + Log.logWarning("ResumptionToken", "token was not parsed:\n" + new String(b)); } catch (IOException e) { Log.logException(e); + Log.logWarning("ResumptionToken", "token was not parsed:\n" + new String(b)); } catch (ParserConfigurationException e) { Log.logException(e); + Log.logWarning("ResumptionToken", "token was not parsed:\n" + new String(b)); throw new IOException(e.getMessage()); } finally { try { diff --git a/source/net/yacy/kelondro/util/FileUtils.java b/source/net/yacy/kelondro/util/FileUtils.java index c73de4974..84e02bd32 100644 --- a/source/net/yacy/kelondro/util/FileUtils.java +++ b/source/net/yacy/kelondro/util/FileUtils.java @@ -171,6 +171,8 @@ public final class FileUtils { * @see #copy(File source, File dest) */ public static void copy(final InputStream source, final File dest, final long count) throws IOException { + String path = dest.getParent(); + if (path != null && path.length() > 0) new File(path).mkdirs(); FileOutputStream fos = null; try { fos = new FileOutputStream(dest); diff --git a/source/net/yacy/kelondro/util/ScoreCluster.java b/source/net/yacy/kelondro/util/ScoreCluster.java index 994b8c266..202494669 100644 --- a/source/net/yacy/kelondro/util/ScoreCluster.java +++ b/source/net/yacy/kelondro/util/ScoreCluster.java @@ -84,7 +84,7 @@ public final class ScoreCluster { public synchronized void shrinkToMinScore(int minScore) { int score; Long key; - while (true) { + while (keyrefDB.size() > 0) { // find and remove objects where their score is smaller than the demanded minimum score key = keyrefDB.firstKey(); if (key == null) break; diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index e33f01fcf..77d9bc063 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -31,6 +31,7 @@ import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.Writer; +import java.net.MalformedURLException; import java.util.Arrays; import java.util.Date; import java.util.HashSet; @@ -454,4 +455,29 @@ public final class LoaderDispatcher { if (System.currentTimeMillis() - e.getValue().longValue() > minDelay) i.remove(); } } + + public void loadIfNotExistBackground(String url, File cache) { + new Loader(url, cache).start(); + } + + private class Loader extends Thread { + + private String url; + private File cache; + + public Loader(String url, File cache) { + this.url = url; + this.cache = cache; + } + + public void run() { + if (this.cache.exists()) return; + try { + // load from the net + Response response = load(new DigestURI(this.url), false, true, CrawlProfile.CACHE_STRATEGY_NOCACHE); + byte[] b = response.getContent(); + FileUtils.copy(b, this.cache); + } catch (MalformedURLException e) {} catch (IOException e) {} + } + } } \ No newline at end of file