diff --git a/htroot/CrawlResults.html b/htroot/CrawlResults.html index 09c857123..d544f0cb8 100644 --- a/htroot/CrawlResults.html +++ b/htroot/CrawlResults.html @@ -16,6 +16,7 @@
  • (4) Proxy Use
  • (5) Local Crawling
  • (6) Global Crawling
  • +
  • (7) Surrogate Import
  • @@ -28,6 +29,7 @@

    Case (6) is a monitor of the local receipt-generator, the opposite case of (1). It also contains an indexing result monitor but is not considered private since it shows crawl requests from other peers.

    +

    Case (7) occurs if surrogate files are imported

    An illustration of how YaCy works

    The image above illustrates the data flow initiated by web index acquisition. Some processes appear twice to document the complex index migration structure. @@ -70,6 +72,10 @@

    These pages had been indexed by your peer, but the crawl was initiated by a remote peer. This is the 'mirror'-case of process (1).

    Use Case: This list may fill if you check the 'Accept remote crawling requests'-flag on the 'Index Create' page

    + :: +

    (7) Results from surrogates import

    +

    These records had been imported from surrogate files in DATA/SURROGATES/in

    +

    Use Case: place files with Dublin Core metadata content into DATA/SURROGATES/in or use an index import method (e.g. Wikimedia import, OAI-PMH retrieval)

    #(/process)# diff --git a/htroot/IndexImportOAIPMH_p.html b/htroot/IndexImportOAIPMH_p.html index d7943a504..325a1a8e2 100644 --- a/htroot/IndexImportOAIPMH_p.html +++ b/htroot/IndexImportOAIPMH_p.html @@ -10,12 +10,34 @@ #%env/templates/submenuContentIntegration.template%#

    OAI-PMH Import

    - #(import)# -

    #(status)#No import thread is running, you can start a new thread here::Bad input data: #[message]# #(/status)#

    - OAI-PMH Import: set a OAI-PMH URL - + Single request import + This will submit only a single request as given here to an OAI-PMH server and import the returned records into the index + + +
    +
    + #(import-one)# + :: +
    Import Process +
    +
    Source:
    #[source]#
    +
    Processed:
    #[count]# records
    +
    ResumptionToken:
    #[rt]#
    +
    +
    + :: + Import failed: #[error]# + #(/import-one)# + + #(import-all)# +

    #(status)#::Bad input data: #[message]# #(/status)#

    +
    +
    + Import all Records from a server + Import all records that follow according to resumption elements into the index +
    @@ -24,14 +46,13 @@
    Thread:
    #[thread]#
    Source:
    #[source]#
    -
    ResumptionToken:
    #[rt]#
    -
    Processed:
    #[count]# Wiki Entries
    -
    Speed:
    #[speed]# articles per second
    +
    Processed:
    #[count]# records
    +
    Speed:
    #[speed]# records per second
    Running Time:
    #[runningHours]# hours, #[runningMinutes]# minutes
    Remaining Time:
    #[remainingHours]# hours, #[remainingMinutes]# minutes
    - #(/import)# + #(/import-all)# #%env/templates/footer.template%# diff --git a/htroot/IndexImportOAIPMH_p.java b/htroot/IndexImportOAIPMH_p.java index fc7371450..f3e84483d 100644 --- a/htroot/IndexImportOAIPMH_p.java +++ b/htroot/IndexImportOAIPMH_p.java @@ -26,6 +26,7 @@ import java.io.IOException; import java.net.MalformedURLException; import net.yacy.document.importer.OAIPMHImporter; +import net.yacy.document.importer.OAIPMHReader; import net.yacy.document.importer.ResumptionToken; import net.yacy.kelondro.data.meta.DigestURI; @@ -40,62 +41,82 @@ public class IndexImportOAIPMH_p { final serverObjects prop = new serverObjects(); final Switchboard sb = (Switchboard) env; - prop.put("import_defaulturl", ""); + prop.put("import-one", 0); + prop.put("import-all", 0); + prop.put("import-all_status", 0); + prop.put("defaulturl", ""); + + if (OAIPMHImporter.job != null) { - // show result from finished import - try { - ResumptionToken rt = OAIPMHImporter.job.getResumptionToken(); - if (rt != null) prop.put("import_defaulturl", rt.resumptionURL(new DigestURI(OAIPMHImporter.job.source(), null)).toNormalform(true, false)); - } catch (MalformedURLException e) { - prop.put("import_defaulturl", e.getMessage()); - } catch (IOException e) { - // reached end of resumption - prop.put("import_defaulturl", e.getMessage()); - } - } - - if (OAIPMHImporter.job != null && OAIPMHImporter.job.isAlive()) { // one import is running, no option to insert anything - prop.put("import", 1); - prop.put("import_thread", "running"); - prop.put("import_source", OAIPMHImporter.job.source()); - prop.put("import_count", OAIPMHImporter.job.count()); - prop.put("import_speed", OAIPMHImporter.job.speed()); - prop.put("import_runningHours", (OAIPMHImporter.job.runningTime() / 60) / 60); - prop.put("import_runningMinutes", (OAIPMHImporter.job.runningTime() / 60) % 60); - prop.put("import_remainingHours", (OAIPMHImporter.job.remainingTime() / 60) / 60); - prop.put("import_remainingMinutes", 
(OAIPMHImporter.job.remainingTime() / 60) % 60); + prop.put("import-all", 1); + prop.put("import-all_thread", (OAIPMHImporter.job.isAlive()) ? "running" : "finished"); + prop.put("import-all_source", OAIPMHImporter.job.source()); + prop.put("import-all_count", OAIPMHImporter.job.count()); + prop.put("import-all_speed", OAIPMHImporter.job.speed()); + prop.put("import-all_runningHours", (OAIPMHImporter.job.runningTime() / 60) / 60); + prop.put("import-all_runningMinutes", (OAIPMHImporter.job.runningTime() / 60) % 60); + prop.put("import-all_remainingHours", (OAIPMHImporter.job.remainingTime() / 60) / 60); + prop.put("import-all_remainingMinutes", (OAIPMHImporter.job.remainingTime() / 60) % 60); return prop; } - prop.put("import", 0); - if (post == null) { - prop.put("import_status", 0); - return prop; - } - - if (post.containsKey("oaipmhurl")) { - String oaipmhurl = post.get("oaipmhurl"); - DigestURI url = null; - try { - url = new DigestURI(oaipmhurl, null); - OAIPMHImporter.job = new OAIPMHImporter(sb.loader, url); - OAIPMHImporter.job.start(); - prop.put("import", 1); - prop.put("import_thread", "started"); - prop.put("import_source", OAIPMHImporter.job.source()); - prop.put("import_rt", OAIPMHImporter.job.status()); - prop.put("import_count", 0); - prop.put("import_speed", 0); - prop.put("import_runningHours", 0); - prop.put("import_runningMinutes", 0); - prop.put("import_remainingHours", 0); - prop.put("import_remainingMinutes", 0); - } catch (MalformedURLException e) { - e.printStackTrace(); - prop.put("import", 0); - prop.put("import_status", 1); - prop.put("import_status_message", e.getMessage()); + if (post != null) { + if (post.containsKey("urlstartone")) { + String oaipmhurl = post.get("urlstartone"); + DigestURI url = null; + try { + url = new DigestURI(oaipmhurl, null); + OAIPMHReader r = new OAIPMHReader(sb.loader, url, sb.surrogatesInPath, "oaipmh-one"); + ResumptionToken rt = r.getResumptionToken(); + prop.put("import-one", 1); + 
prop.put("import-one_count", (rt == null) ? "not available" : Integer.toString(rt.getRecordCounter())); + prop.put("import-one_source", r.source()); + prop.put("import-one_rt", r.getResumptionToken().toString()); + + // set next default url + try { + DigestURI nexturl = (rt == null) ? null : rt.resumptionURL(url); + if (rt != null) prop.put("defaulturl", (nexturl == null) ? "" : nexturl.toNormalform(true, false)); + } catch (MalformedURLException e) { + prop.put("defaulturl", e.getMessage()); + } catch (IOException e) { + // reached end of resumption + prop.put("defaulturl", e.getMessage()); + } + } catch (MalformedURLException e) { + e.printStackTrace(); + prop.put("import-one", 2); + prop.put("import-one_error", e.getMessage()); + } catch (IOException e) { + e.printStackTrace(); + prop.put("import-one", 2); + prop.put("import-one_error", e.getMessage()); + } + } + + if (post.containsKey("urlstartall")) { + String oaipmhurl = post.get("urlstartall"); + DigestURI url = null; + try { + url = new DigestURI(oaipmhurl, null); + OAIPMHImporter.job = new OAIPMHImporter(sb.loader, url); + OAIPMHImporter.job.start(); + prop.put("import-all", 1); + prop.put("import-all_thread", "started"); + prop.put("import-all_source", OAIPMHImporter.job.source()); + prop.put("import-all_count", 0); + prop.put("import-all_speed", 0); + prop.put("import-all_runningHours", 0); + prop.put("import-all_runningMinutes", 0); + prop.put("import-all_remainingHours", 0); + prop.put("import-all_remainingMinutes", 0); + } catch (MalformedURLException e) { + e.printStackTrace(); + prop.put("import-all", 0); + prop.put("import-all_status", 1); + prop.put("import-all_status_message", e.getMessage()); + } } } return prop; diff --git a/source/de/anomic/crawler/CrawlSwitchboard.java b/source/de/anomic/crawler/CrawlSwitchboard.java index 34ab81b13..0fc7c71aa 100644 --- a/source/de/anomic/crawler/CrawlSwitchboard.java +++ b/source/de/anomic/crawler/CrawlSwitchboard.java @@ -30,7 +30,6 @@ import java.io.File; 
import java.io.IOException; import java.util.Iterator; -import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.kelondroException; @@ -72,7 +71,7 @@ public final class CrawlSwitchboard { final Log log, final File queuesRoot) { - log.logInfo("Initializing Word Index for the network '" + networkName + "', word hash cache size is " + Word.hashCacheSize + "."); + log.logInfo("Initializing Word Index for the network '" + networkName + "'."); if (networkName == null || networkName.length() == 0) { log.logSevere("no network name given - shutting down"); diff --git a/source/de/anomic/crawler/retrieval/EventOrigin.java b/source/de/anomic/crawler/retrieval/EventOrigin.java index 29fea6250..265eaee0a 100644 --- a/source/de/anomic/crawler/retrieval/EventOrigin.java +++ b/source/de/anomic/crawler/retrieval/EventOrigin.java @@ -17,11 +17,12 @@ public enum EventOrigin { DHT_TRANSFER(3), PROXY_LOAD(4), LOCAL_CRAWLING(5), - GLOBAL_CRAWLING(6); + GLOBAL_CRAWLING(6), + SURROGATES(7); protected int code; private static final EventOrigin[] list = { - UNKNOWN, REMOTE_RECEIPTS, QUERIES, DHT_TRANSFER, PROXY_LOAD, LOCAL_CRAWLING, GLOBAL_CRAWLING}; + UNKNOWN, REMOTE_RECEIPTS, QUERIES, DHT_TRANSFER, PROXY_LOAD, LOCAL_CRAWLING, GLOBAL_CRAWLING, SURROGATES}; private EventOrigin(int code) { this.code = code; } diff --git a/source/de/anomic/search/Segment.java b/source/de/anomic/search/Segment.java index e967bc727..97566f2f9 100644 --- a/source/de/anomic/search/Segment.java +++ b/source/de/anomic/search/Segment.java @@ -93,7 +93,7 @@ public class Segment { migrateTextIndex(segmentPath, segmentPath); migrateTextMetadata(segmentPath, segmentPath); - log.logInfo("Initializing Segment '" + segmentPath + "', word hash cache size is " + Word.hashCacheSize + "."); + log.logInfo("Initializing Segment '" + segmentPath + "."); this.log = log; this.segmentPath = segmentPath; diff --git 
a/source/de/anomic/search/Segments.java b/source/de/anomic/search/Segments.java index 14db36224..91f9bc2a9 100644 --- a/source/de/anomic/search/Segments.java +++ b/source/de/anomic/search/Segments.java @@ -56,7 +56,8 @@ public class Segments implements Iterable { PROXY, LOCALCRAWLING, REMOTECRAWLING, - PUBLIC; // includes the index that can be retrieved by the yacy p2p api + PUBLIC, + SURROGATES; // includes the index that can be retrieved by the yacy p2p api public String toString() { throw new UnsupportedOperationException("toString not allowed"); @@ -97,6 +98,7 @@ public class Segments implements Iterable { this.process_assignment.put(Process.LOCALCRAWLING, "default"); this.process_assignment.put(Process.REMOTECRAWLING, "default"); this.process_assignment.put(Process.PUBLIC, "default"); + this.process_assignment.put(Process.SURROGATES, "default"); } public void setSegment(Process process, String segmentName) { diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index 20be1b63e..31ec924b5 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -1237,7 +1237,7 @@ public final class Switchboard extends serverSwitch { 0 ); response = new Response(request, null, null, "200", this.crawler.defaultSurrogateProfile); - indexingQueueEntry queueEntry = new indexingQueueEntry(Segments.Process.LOCALCRAWLING, response, document, null); + indexingQueueEntry queueEntry = new indexingQueueEntry(Segments.Process.SURROGATES, response, document, null); // place the queue entry into the concurrent process of the condenser (document analysis) try { @@ -1717,9 +1717,10 @@ public final class Switchboard extends serverSwitch { // CREATE INDEX final String dc_title = document.dc_title(); final DigestURI referrerURL = queueEntry.referrerURL(); - final EventOrigin processCase = queueEntry.processCase(peers.mySeed().hash); + EventOrigin processCase = queueEntry.processCase(peers.mySeed().hash); + if 
(process == Segments.Process.SURROGATES) processCase = EventOrigin.SURROGATES; - // remove stopwords + // remove stopwords log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + queueEntry.url()); // STORE URL TO LOADED-URL-DB diff --git a/source/net/yacy/document/importer/OAIPMHImporter.java b/source/net/yacy/document/importer/OAIPMHImporter.java index e5e817cf3..de69c017a 100644 --- a/source/net/yacy/document/importer/OAIPMHImporter.java +++ b/source/net/yacy/document/importer/OAIPMHImporter.java @@ -26,21 +26,12 @@ package net.yacy.document.importer; -import java.io.ByteArrayInputStream; -import java.io.File; import java.io.IOException; import java.net.MalformedURLException; -import net.yacy.document.content.DCEntry; -import net.yacy.document.content.SurrogateReader; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.util.FileUtils; import net.yacy.repository.LoaderDispatcher; -import de.anomic.crawler.CrawlProfile; -import de.anomic.crawler.retrieval.HTTPLoader; -import de.anomic.crawler.retrieval.Request; -import de.anomic.crawler.retrieval.Response; import de.anomic.search.Switchboard; @@ -59,22 +50,31 @@ public class OAIPMHImporter extends Thread implements Importer { private int count; private long startTime; private ResumptionToken resumptionToken; + private String message; public OAIPMHImporter(LoaderDispatcher loader, DigestURI source) { this.loader = loader; - this.source = source; this.count = 0; this.startTime = System.currentTimeMillis(); this.resumptionToken = null; + this.message = "import initialized"; + // fix start url + String url = ResumptionToken.truncatedURL(source); + if (!url.endsWith("?")) url = url + "?"; + try { + this.source = new DigestURI(url + "verb=ListRecords&metadataPrefix=oai_dc", null); + } catch (MalformedURLException e) { + // this should never happen + e.printStackTrace(); + } } - public int count() { return this.count; } public String status() { - return (this.resumptionToken 
== null) ? "" : this.resumptionToken.toString(); + return this.message; } public ResumptionToken getResumptionToken() { @@ -98,206 +98,20 @@ public class OAIPMHImporter extends Thread implements Importer { } public void run() { - Response response; - try { - response = this.loader.load(source, true, true, CrawlProfile.CACHE_STRATEGY_NOCACHE); - load(response); - } catch (IOException e) { - e.printStackTrace(); - } - } - - public void load0(DigestURI source) throws IOException { - Response response = HTTPLoader.load(new Request(source, null)); - load(response); - } - - private void load(Response response) throws IOException { - byte[] b = response.getContent(); - this.resumptionToken = new ResumptionToken(new ByteArrayInputStream(b)); - String file = this.source.getHost() + "_" + System.currentTimeMillis(); - File f0 = new File(Switchboard.getSwitchboard().surrogatesInPath, file + ".tmp"); - File f1 = new File(Switchboard.getSwitchboard().surrogatesInPath, file + ".xml"); - FileUtils.copy(b, f0); - f0.renameTo(f1); - - /* - SurrogateReader sr = new SurrogateReader(new ByteArrayInputStream(b), 100); - Thread srt = new Thread(sr); - srt.start(); - DCEntry dce; - while ((dce = sr.take()) != DCEntry.poison) { - System.out.println(dce.toString()); - } - try { - srt.join(); - } catch (InterruptedException e) {} - */ - System.out.println("TOKEN: " + resumptionToken.toString()); - - } - - public static StringBuilder escape(final String s) { - final int len = s.length(); - final StringBuilder sbuf = new StringBuilder(len + 10); - for (int i = 0; i < len; i++) { - final int ch = s.charAt(i); - if ('A' <= ch && ch <= 'Z') { // 'A'..'Z' - sbuf.append((char)ch); - } else if ('a' <= ch && ch <= 'z') { // 'a'..'z' - sbuf.append((char)ch); - } else if ('0' <= ch && ch <= '9') { // '0'..'9' - sbuf.append((char)ch); - } else if (ch == ' ') { // space - sbuf.append("%20"); - } else if (ch == '&' || ch == ':' // unreserved - || ch == '-' || ch == '_' - || ch == '.' || ch == '!' 
- || ch == '~' || ch == '*' - || ch == '\'' || ch == '(' - || ch == ')' || ch == ';') { - sbuf.append((char)ch); + this.message = "loading first part of records"; + while (true) { + try { + OAIPMHReader reader = new OAIPMHReader(this.loader, this.source, Switchboard.getSwitchboard().surrogatesInPath, "oaipmh"); + this.source = reader.getResumptionToken().resumptionURL(this.source); + if (this.source == null) { + this.message = "import terminated with source = null"; + break; + } + this.message = "loading next resumption fragment, cursor = " + reader.getResumptionToken().getCursor(); + } catch (IOException e) { + this.message = e.getMessage(); + break; } } - return sbuf; } - - public static String unescape(final String s) { - final int l = s.length(); - final StringBuilder sbuf = new StringBuilder(l); - int ch = -1; - int b, sumb = 0; - for (int i = 0, more = -1; i < l; i++) { - /* Get next byte b from URL segment s */ - switch (ch = s.charAt(i)) { - case '%': - if (i + 2 < l) { - ch = s.charAt(++i); - int hb = (Character.isDigit ((char) ch) ? ch - '0' : 10 + Character.toLowerCase((char) ch) - 'a') & 0xF; - ch = s.charAt(++i); - int lb = (Character.isDigit ((char) ch) ? 
ch - '0' : 10 + Character.toLowerCase ((char) ch) - 'a') & 0xF; - b = (hb << 4) | lb; - } else { - b = ch; - } - break; - case '+': - b = ' '; - break; - default: - b = ch; - } - } - return sbuf.toString(); - } -} -/* - -http://an.oa.org/OAI-script?verb=GetRecord&identifier=oai:arXiv.org:hep-th/9901001&metadataPrefix=oai_dc - -special characters in URIs must be encoded, the correct form of the above GET request URL is: - -http://an.oa.org/OAI-script?verb=GetRecord&identifier=oai%3AarXiv.org%3Ahep-th%2F9901001&metadataPrefix=oai_dc - -"/","%2F" -"?","%3F" -"#","%23" -"=","%3D" -"&","%26" -":","%3A" -";","%3B" -" ","%20" -"%","%25" -"+","%2B" - -GetRecord -http://arXiv.org/oai2?verb=GetRecord&identifier=oai:arXiv.org:cs/0112017&metadataPrefix=oai_dc -http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=GetRecord&identifier=oai:opus.bsz-bw.de-fhhv:6&metadataPrefix=oai_dc - -Identify -http://memory.loc.gov/cgi-bin/oai?verb=Identify - -ListIdentifiers -http://an.oa.org/OAI-script?verb=ListIdentifiers&from=1998-01-15&metadataPrefix=oldArXiv&set=physics:hep -http://an.oa.org/OAI-script?verb=ListIdentifiers&resumptionToken=xxx45abttyz -http://www.perseus.tufts.edu/cgi-bin/pdataprov?verb=ListIdentifiers&metadataPrefix=olac&from=2001-01-01&until=2001-01-01&set=Perseus:collection:PersInfo - -ListMetadataFormats -http://www.perseus.tufts.edu/cgi-bin/pdataprov?verb=ListMetadataFormats&identifier=oai:perseus.tufts.edu:Perseus:text:1999.02.0119 -http://memory.loc.gov/cgi-bin/oai?verb=ListMetadataFormats -http://memory.loc.gov/cgi-bin/oai?verb=ListMetadataFormats&identifier=oai:lcoa1.loc.gov:loc.rbc/rbpe.00000111 -http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListMetadataFormats - -ListRecords -http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListRecords&metadataPrefix=oai_dc -http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListRecords&resumptionToken=455 -http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListRecords&resumptionToken=890 
-http://an.oa.org/OAI-script?verb=ListRecords&from=1998-01-15&set=physics:hep&metadataPrefix=oai_rfc1807 -http://www.perseus.tufts.edu/cgi-b:in/pdataprov?verb=ListRecords&from=2002-05-01T14:15:00Z&until=2002-05-01T14:20:00Z&metadataPrefix=oai_dc -http://memory.loc.gov/cgi-bin/oai?verb=ListRecords&from=2002-06-01T02:00:00Z&until=2002-06-01T03:00:00Z&metadataPrefix=oai_marc - -ListSets -http://an.oa.org/OAI-script?verb=ListSets -http://purl.org/alcme/etdcat/servlet/OAIHandler?verb=ListSets - -urn identifier koennen ueber den resolver der d-nb aufgeloest werden: -http://nbn-resolving.de/urn:nbn:de:bsz:960-opus-1860 - - - - 2009-10-01T22:20:04Z - http://opus.bsz-bw.de/fhhv/oai2/oai2.php - - -
    - oai:opus.bsz-bw.de-fhhv:1 - 2008-03-04T12:17:33Z - ddc:020 - pub-type:2 - has-source-swb:false -
    - - - Teaching Information Literacy with the Lerninformationssystem - Hauschke, Christian - Ullmann, Nadine - Informationskompetenz - E-Learning - Bibliothek - Informationsvermittlung - Wissenschaftliches Arbeiten - information literacy - e-learning - library - information dissemination - Library and information sciences - A German university has developed a learning information system to improve information literacy among German students. An online tutorial based on this Lerninformationssystem has been developed. The structure of this learning information system is described, an online tutorial based on it is illustrated, and the different learning styles that it supports are indicated. - Fachhochschule Hannover - Sonstige Einrichtungen. Sonstige Einrichtungen - 2006 - Article - application/pdf - urn:nbn:de:bsz:960-opus-10 - http://opus.bsz-bw.de/fhhv/volltexte/2008/1/ - Australian Academic & Research Libraries, 37 (1), S. 55-60 - eng - http://creativecommons.org/licenses/by/2.0/de/deed.de - - -
    - 119 -
    -
    - -*/ \ No newline at end of file +} \ No newline at end of file diff --git a/source/net/yacy/document/importer/OAIPMHReader.java b/source/net/yacy/document/importer/OAIPMHReader.java new file mode 100644 index 000000000..f2eb20415 --- /dev/null +++ b/source/net/yacy/document/importer/OAIPMHReader.java @@ -0,0 +1,256 @@ +// OAIPMHReader +// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany +// first published 30.09.2009 on http://yacy.net +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2009-09-23 23:26:14 +0200 (Mi, 23 Sep 2009) $ +// $LastChangedRevision: 6340 $ +// $LastChangedBy: low012 $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package net.yacy.document.importer; + +import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.IOException; +import java.util.Date; + +import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.util.DateFormatter; +import net.yacy.kelondro.util.FileUtils; +import net.yacy.repository.LoaderDispatcher; + +import de.anomic.crawler.CrawlProfile; +import de.anomic.crawler.retrieval.Response; + + +// get one server with +// http://roar.eprints.org/index.php?action=csv +// list records from oai-pmh like +// http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListRecords&metadataPrefix=oai_dc + + +public class OAIPMHReader { + + private DigestURI source; + private ResumptionToken resumptionToken; + + public OAIPMHReader(LoaderDispatcher loader, DigestURI source, File targetDir, String filePrefix) throws IOException { + this.source = source; + + // load the file from the net + Response response; + response = loader.load(source, true, true, CrawlProfile.CACHE_STRATEGY_NOCACHE); + byte[] b = response.getContent(); + this.resumptionToken = new ResumptionToken(new ByteArrayInputStream(b)); + String file = filePrefix + "_" + this.source.getHost() + "_" + DateFormatter.formatShortMilliSecond(new Date()); + File f0 = new File(targetDir, file + ".tmp"); + File f1 = new File(targetDir, file + ".xml"); + + // transaction-safe writing + FileUtils.copy(b, f0); + f0.renameTo(f1); + + /* + SurrogateReader sr = new SurrogateReader(new ByteArrayInputStream(b), 100); + Thread srt = new Thread(sr); + srt.start(); + DCEntry dce; + while ((dce = sr.take()) != DCEntry.poison) { + System.out.println(dce.toString()); + } + try { + srt.join(); + } catch (InterruptedException e) {} + */ + } + + public ResumptionToken getResumptionToken() { + return 
this.resumptionToken;
+    }
+
+    /**
+     * @return the source url of this reader, rendered with toNormalform(true, false)
+     */
+    public String source() {
+        return source.toNormalform(true, false);
+    }
+
+    /**
+     * Escape a string for use in an OAI-PMH request URL.
+     * ASCII letters, ASCII digits and the characters &amp; : - _ . ! ~ * ' ( ) ;
+     * are copied unchanged and a space is written as "%20".
+     * NOTE(review): every other character (for example '/', '?', '=', '%', '+')
+     * is silently dropped because there is no final else branch; this looks
+     * unintended given the encoding table at the end of this file - confirm
+     * before relying on this method for arbitrary identifiers.
+     * @param s the string to escape
+     * @return a new StringBuilder holding the escaped form of s
+     */
+    public static StringBuilder escape(final String s) {
+        final int len = s.length();
+        final StringBuilder sbuf = new StringBuilder(len + 10);
+        for (int i = 0; i < len; i++) {
+            final int ch = s.charAt(i);
+            if ('A' <= ch && ch <= 'Z') {           // 'A'..'Z'
+                sbuf.append((char)ch);
+            } else if ('a' <= ch && ch <= 'z') {    // 'a'..'z'
+                sbuf.append((char)ch);
+            } else if ('0' <= ch && ch <= '9') {    // '0'..'9'
+                sbuf.append((char)ch);
+            } else if (ch == ' ') {                 // space
+                sbuf.append("%20");
+            } else if (ch == '&' || ch == ':'       // unreserved
+                    || ch == '-' || ch == '_'
+                    || ch == '.' || ch == '!'
+                    || ch == '~' || ch == '*'
+                    || ch == '\'' || ch == '('
+                    || ch == ')' || ch == ';') {
+                sbuf.append((char)ch);
+            }
+        }
+        return sbuf;
+    }
+
+    /**
+     * Reverse the escaping of a URL segment.
+     * A '+' becomes a space, a '%' that is followed by at least two more
+     * characters is combined with them into one value (two hex digits), and
+     * every other character is copied unchanged. A '%' too close to the end
+     * of the string is copied literally.
+     * Each decoded value is appended as exactly one char, so multi-byte
+     * UTF-8 sequences are not reassembled into a single character.
+     * @param s the escaped string
+     * @return the unescaped string
+     */
+    public static String unescape(final String s) {
+        final int l = s.length();
+        final StringBuilder sbuf = new StringBuilder(l);
+        int ch = -1;
+        int b;
+        for (int i = 0; i < l; i++) {
+            /* Get next byte b from URL segment s */
+            switch (ch = s.charAt(i)) {
+                case '%':
+                    if (i + 2 < l) {
+                        ch = s.charAt(++i);
+                        int hb = (Character.isDigit ((char) ch) ? ch - '0' : 10 + Character.toLowerCase((char) ch) - 'a') & 0xF;
+                        ch = s.charAt(++i);
+                        int lb = (Character.isDigit ((char) ch) ? ch - '0' : 10 + Character.toLowerCase ((char) ch) - 'a') & 0xF;
+                        b = (hb << 4) | lb;
+                    } else {
+                        b = ch;
+                    }
+                    break;
+                case '+':
+                    b = ' ';
+                    break;
+                default:
+                    b = ch;
+            }
+            // b is an int: without the cast StringBuilder.append(int) would be
+            // selected and the decimal number (e.g. "97" for 'a') would be written
+            sbuf.append((char) b);
+        }
+        return sbuf.toString();
+    }
+}
+/*
+
+http://an.oa.org/OAI-script?verb=GetRecord&identifier=oai:arXiv.org:hep-th/9901001&metadataPrefix=oai_dc
+
+special characters in URIs must be encoded, the correct form of the above GET request URL is:
+
+http://an.oa.org/OAI-script?verb=GetRecord&identifier=oai%3AarXiv.org%3Ahep-th%2F9901001&metadataPrefix=oai_dc
+
+"/","%2F"
+"?","%3F"
+"#","%23"
+"=","%3D"
+"&","%26"
+":","%3A"
+";","%3B"
+" ","%20"
+"%","%25"
+"+","%2B"
+
+GetRecord
+http://arXiv.org/oai2?verb=GetRecord&identifier=oai:arXiv.org:cs/0112017&metadataPrefix=oai_dc
+http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=GetRecord&identifier=oai:opus.bsz-bw.de-fhhv:6&metadataPrefix=oai_dc
+
+Identify
+http://memory.loc.gov/cgi-bin/oai?verb=Identify
+
+ListIdentifiers
+http://an.oa.org/OAI-script?verb=ListIdentifiers&from=1998-01-15&metadataPrefix=oldArXiv&set=physics:hep
+http://an.oa.org/OAI-script?verb=ListIdentifiers&resumptionToken=xxx45abttyz
+http://www.perseus.tufts.edu/cgi-bin/pdataprov?verb=ListIdentifiers&metadataPrefix=olac&from=2001-01-01&until=2001-01-01&set=Perseus:collection:PersInfo
+
+ListMetadataFormats
+http://www.perseus.tufts.edu/cgi-bin/pdataprov?verb=ListMetadataFormats&identifier=oai:perseus.tufts.edu:Perseus:text:1999.02.0119
+http://memory.loc.gov/cgi-bin/oai?verb=ListMetadataFormats
+http://memory.loc.gov/cgi-bin/oai?verb=ListMetadataFormats&identifier=oai:lcoa1.loc.gov:loc.rbc/rbpe.00000111
+http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListMetadataFormats
+
+ListRecords
+http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListRecords&metadataPrefix=oai_dc
+http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListRecords&resumptionToken=455
+http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListRecords&resumptionToken=890
+http://an.oa.org/OAI-script?verb=ListRecords&from=1998-01-15&set=physics:hep&metadataPrefix=oai_rfc1807 +http://www.perseus.tufts.edu/cgi-b:in/pdataprov?verb=ListRecords&from=2002-05-01T14:15:00Z&until=2002-05-01T14:20:00Z&metadataPrefix=oai_dc +http://memory.loc.gov/cgi-bin/oai?verb=ListRecords&from=2002-06-01T02:00:00Z&until=2002-06-01T03:00:00Z&metadataPrefix=oai_marc + +ListSets +http://an.oa.org/OAI-script?verb=ListSets +http://purl.org/alcme/etdcat/servlet/OAIHandler?verb=ListSets + +urn identifier koennen ueber den resolver der d-nb aufgeloest werden: +http://nbn-resolving.de/urn:nbn:de:bsz:960-opus-1860 + + + + 2009-10-01T22:20:04Z + http://opus.bsz-bw.de/fhhv/oai2/oai2.php + + +
    + oai:opus.bsz-bw.de-fhhv:1 + 2008-03-04T12:17:33Z + ddc:020 + pub-type:2 + has-source-swb:false +
    + + + Teaching Information Literacy with the Lerninformationssystem + Hauschke, Christian + Ullmann, Nadine + Informationskompetenz + E-Learning + Bibliothek + Informationsvermittlung + Wissenschaftliches Arbeiten + information literacy + e-learning + library + information dissemination + Library and information sciences + A German university has developed a learning information system to improve information literacy among German students. An online tutorial based on this Lerninformationssystem has been developed. The structure of this learning information system is described, an online tutorial based on it is illustrated, and the different learning styles that it supports are indicated. + Fachhochschule Hannover + Sonstige Einrichtungen. Sonstige Einrichtungen + 2006 + Article + application/pdf + urn:nbn:de:bsz:960-opus-10 + http://opus.bsz-bw.de/fhhv/volltexte/2008/1/ + Australian Academic & Research Libraries, 37 (1), S. 55-60 + eng + http://creativecommons.org/licenses/by/2.0/de/deed.de + + +
    + 119 +
    +
    + +*/ \ No newline at end of file diff --git a/source/net/yacy/document/importer/ResumptionToken.java b/source/net/yacy/document/importer/ResumptionToken.java index 9ba7de4cb..0bd5ac670 100644 --- a/source/net/yacy/document/importer/ResumptionToken.java +++ b/source/net/yacy/document/importer/ResumptionToken.java @@ -55,8 +55,11 @@ public class ResumptionToken extends TreeMap { insensitiveCollator.setDecomposition(Collator.NO_DECOMPOSITION); } + int recordCounter; + public ResumptionToken(final InputStream stream) throws IOException { super((Collator) insensitiveCollator.clone()); + this.recordCounter = 0; new Reader(stream); } @@ -67,6 +70,7 @@ public class ResumptionToken extends TreeMap { String token ) { super((Collator) insensitiveCollator.clone()); + this.recordCounter = 0; this.put("expirationDate", DateFormatter.formatISO8601(expirationDate)); this.put("completeListSize", Integer.toString(completeListSize)); this.put("cursor", Integer.toString(cursor)); @@ -80,12 +84,33 @@ public class ResumptionToken extends TreeMap { String token ) { super((Collator) insensitiveCollator.clone()); + this.recordCounter = 0; this.put("expirationDate", expirationDate); this.put("completeListSize", Integer.toString(completeListSize)); this.put("cursor", Integer.toString(cursor)); this.put("token", token); } + /** + * truncate the given url at the '?' + * @param url + * @return a string containing the url up to and including the '?' 
+ */ + public static String truncatedURL(DigestURI url) { + String u = url.toNormalform(true, true); + int i = u.indexOf('?'); + if (i > 0) u = u.substring(0, i + 1); + return u; + } + + /** + * while parsing the resumption token, also all records are counted + * @return the result from counting the records + */ + public int getRecordCounter() { + return this.recordCounter; + } + /** * compute a url that can be used to resume the retrieval from the OAI-PMH resource * @param givenURL @@ -97,9 +122,7 @@ public class ResumptionToken extends TreeMap { String token = this.getToken(); if (token == null || token.length() == 0) throw new IOException("end of resumption reached"); - String url = givenURL.toNormalform(true, true); - int i = url.indexOf('?'); - if (i > 0) url = url.substring(0, i + 1); + String url = truncatedURL(givenURL); // encoded state if (token.indexOf("from=") >= 0) { @@ -225,6 +248,9 @@ public class ResumptionToken extends TreeMap { */ public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException { + if ("record".equals(tag)) { + recordCounter++; + } if ("resumptionToken".equals(tag)) { this.parsingValue = true; this.atts = atts; diff --git a/source/net/yacy/kelondro/data/word/Word.java b/source/net/yacy/kelondro/data/word/Word.java index 1cfc89e31..c82bf27ab 100644 --- a/source/net/yacy/kelondro/data/word/Word.java +++ b/source/net/yacy/kelondro/data/word/Word.java @@ -52,7 +52,7 @@ public class Word { */ public static final int commonHashLength = 12; - public static final int hashCacheSize = Math.max(2048, Math.min(100000, (int) (MemoryControl.available() / 20000L))); + private static final int hashCacheSize = Math.max(2048, Math.min(100000, (int) (MemoryControl.available() / 20000L))); private static final ARC hashCache = new ConcurrentARC(hashCacheSize, Runtime.getRuntime().availableProcessors()); // object carries statistics for words and sentences