- added a function to the OAI-PMH reader that can pull all records from a server by evaluating the resumption token to obtain the URL for retrieving the remaining records

- added monitoring for retrieved records
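
A minimal sketch of the retrieval loop this adds (it mirrors the new OAIPMHImporter.run() shown in the diff below; loader, targetDir and startURL are placeholders for the importer's actual fields):

    // fetch-all sketch: each OAIPMHReader call stores one response fragment in the
    // surrogate directory; its ResumptionToken yields the URL of the next fragment.
    // resumptionURL() throws an IOException once the end of the resumption list is reached.
    DigestURI url = startURL; // e.g. ...?verb=ListRecords&metadataPrefix=oai_dc
    while (url != null) {
        try {
            OAIPMHReader reader = new OAIPMHReader(loader, url, targetDir, "oaipmh");
            url = reader.getResumptionToken().resumptionURL(url);
        } catch (IOException e) {
            break; // end of resumption reached (or a transport error)
        }
    }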

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6444 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 2009-11-02 11:53:14 +00:00
parent 350d13e153
commit b0b7a4f9a5
12 changed files with 431 additions and 284 deletions

View File

@@ -16,6 +16,7 @@
<li><a href="/CrawlResults.html?process=4" class="MenuItemLink lock">(4) Proxy Use</a></li>
<li><a href="/CrawlResults.html?process=5" class="MenuItemLink lock">(5) Local Crawling</a></li>
<li><a href="/CrawlResults.html?process=6" class="MenuItemLink">(6) Global Crawling</a></li>
<li><a href="/CrawlResults.html?process=7" class="MenuItemLink lock">(7) Surrogate Import</a></li>
</ul>
</div>
@@ -28,6 +29,7 @@
<p>Case (6) is a monitor of the local receipt-generator, the opposite case of (1). It also contains an indexing result monitor but is not considered private
since it shows crawl requests from other peers.
</p>
<p>Case (7) occurs if surrogate files are imported.</p>
<p><img src="/env/grafics/indexmonitor.png" alt="An illustration how yacy works" /></p>
<p>The image above illustrates the data flow initiated by web index acquisition.
Some processes appear twice to document the complex index migration structure.
@@ -70,6 +72,10 @@
<p>These pages had been indexed by your peer, but the crawl was initiated by a remote peer.
This is the 'mirror'-case of process (1).</p>
<p><em>Use Case:</em> This list may fill up if you check the 'Accept remote crawling requests' flag on the 'Index Create' page.</p>
::
<h2>(7) Results from surrogate import</h2>
<p>These records have been imported from surrogate files in DATA/SURROGATES/in.</p>
<p><em>Use Case:</em> place files with Dublin Core metadata content into DATA/SURROGATES/in or use an index import method (e.g. Wikimedia import, OAI-PMH retrieval).</p>
#(/process)#

View File

@@ -10,12 +10,34 @@
#%env/templates/submenuContentIntegration.template%#
<h2>OAI-PMH Import</h2>
#(import)#
<p>#(status)#No import thread is running, you can start a new thread here::Bad input data: #[message]# #(/status)#</p>
<form action="IndexImportOAIPMH_p.html" method="get">
<fieldset>
<legend>OAI-PMH Import: set a OAI-PMH URL</legend>
<input name="oaipmhurl" type="text" value="#[defaulturl]#" size="100" />
<legend>Single request import</legend>
This will submit only a single request, as given here, to an OAI-PMH server and import the records into the index.
<input name="urlstartone" type="text" value="#[defaulturl]#" size="100" />
<input name="submit" type="submit" value="Import OAI-PMH source" />
</fieldset>
</form>
#(import-one)#
::
<form><fieldset><legend>Import Process</legend>
<dl>
<dt>Source:</dt><dd>#[source]#</dd>
<dt>Processed:</dt><dd>#[count]# records</dd>
<dt>ResumptionToken:</dt><dd>#[rt]#</dd>
</dl>
</fieldset></form>
::
Import failed: #[error]#
#(/import-one)#
#(import-all)#
<p>#(status)#::Bad input data: #[message]# #(/status)#</p>
<form action="IndexImportOAIPMH_p.html" method="get">
<fieldset>
<legend>Import all Records from a server</legend>
Import all records that follow according to the resumption elements into the index.
<input name="urlstartall" type="text" value="" size="100" />
<input name="submit" type="submit" value="Import OAI-PMH source" />
</fieldset>
</form>
@@ -24,14 +46,13 @@
<dl>
<dt>Thread:</dt><dd>#[thread]#</dd>
<dt>Source:</dt><dd>#[source]#</dd>
<dt>ResumptionToken:</dt><dd>#[rt]#</dd>
<dt>Processed:</dt><dd>#[count]# Wiki Entries</dd>
<dt>Speed:</dt><dd>#[speed]# articles per second</dd>
<dt>Processed:</dt><dd>#[count]# records</dd>
<dt>Speed:</dt><dd>#[speed]# records per second</dd>
<dt>Running Time:</dt><dd>#[runningHours]# hours, #[runningMinutes]# minutes</dd>
<dt>Remaining Time:</dt><dd>#[remainingHours]# hours, #[remainingMinutes]# minutes</dd>
</dl>
</fieldset></form>
#(/import)#
#(/import-all)#
#%env/templates/footer.template%#
</body>

View File

@@ -26,6 +26,7 @@ import java.io.IOException;
import java.net.MalformedURLException;
import net.yacy.document.importer.OAIPMHImporter;
import net.yacy.document.importer.OAIPMHReader;
import net.yacy.document.importer.ResumptionToken;
import net.yacy.kelondro.data.meta.DigestURI;
@@ -40,62 +41,82 @@ public class IndexImportOAIPMH_p {
final serverObjects prop = new serverObjects();
final Switchboard sb = (Switchboard) env;
prop.put("import_defaulturl", "");
prop.put("import-one", 0);
prop.put("import-all", 0);
prop.put("import-all_status", 0);
prop.put("defaulturl", "");
if (OAIPMHImporter.job != null) {
// show result from finished import
try {
ResumptionToken rt = OAIPMHImporter.job.getResumptionToken();
if (rt != null) prop.put("import_defaulturl", rt.resumptionURL(new DigestURI(OAIPMHImporter.job.source(), null)).toNormalform(true, false));
} catch (MalformedURLException e) {
prop.put("import_defaulturl", e.getMessage());
} catch (IOException e) {
// reached end of resumption
prop.put("import_defaulturl", e.getMessage());
}
}
if (OAIPMHImporter.job != null && OAIPMHImporter.job.isAlive()) {
// one import is running, no option to insert anything
prop.put("import", 1);
prop.put("import_thread", "running");
prop.put("import_source", OAIPMHImporter.job.source());
prop.put("import_count", OAIPMHImporter.job.count());
prop.put("import_speed", OAIPMHImporter.job.speed());
prop.put("import_runningHours", (OAIPMHImporter.job.runningTime() / 60) / 60);
prop.put("import_runningMinutes", (OAIPMHImporter.job.runningTime() / 60) % 60);
prop.put("import_remainingHours", (OAIPMHImporter.job.remainingTime() / 60) / 60);
prop.put("import_remainingMinutes", (OAIPMHImporter.job.remainingTime() / 60) % 60);
prop.put("import-all", 1);
prop.put("import-all_thread", (OAIPMHImporter.job.isAlive()) ? "running" : "finished");
prop.put("import-all_source", OAIPMHImporter.job.source());
prop.put("import-all_count", OAIPMHImporter.job.count());
prop.put("import-all_speed", OAIPMHImporter.job.speed());
prop.put("import-all_runningHours", (OAIPMHImporter.job.runningTime() / 60) / 60);
prop.put("import-all_runningMinutes", (OAIPMHImporter.job.runningTime() / 60) % 60);
prop.put("import-all_remainingHours", (OAIPMHImporter.job.remainingTime() / 60) / 60);
prop.put("import-all_remainingMinutes", (OAIPMHImporter.job.remainingTime() / 60) % 60);
return prop;
}
prop.put("import", 0);
if (post == null) {
prop.put("import_status", 0);
return prop;
}
if (post.containsKey("oaipmhurl")) {
String oaipmhurl = post.get("oaipmhurl");
DigestURI url = null;
try {
url = new DigestURI(oaipmhurl, null);
OAIPMHImporter.job = new OAIPMHImporter(sb.loader, url);
OAIPMHImporter.job.start();
prop.put("import", 1);
prop.put("import_thread", "started");
prop.put("import_source", OAIPMHImporter.job.source());
prop.put("import_rt", OAIPMHImporter.job.status());
prop.put("import_count", 0);
prop.put("import_speed", 0);
prop.put("import_runningHours", 0);
prop.put("import_runningMinutes", 0);
prop.put("import_remainingHours", 0);
prop.put("import_remainingMinutes", 0);
} catch (MalformedURLException e) {
e.printStackTrace();
prop.put("import", 0);
prop.put("import_status", 1);
prop.put("import_status_message", e.getMessage());
if (post != null) {
if (post.containsKey("urlstartone")) {
String oaipmhurl = post.get("urlstartone");
DigestURI url = null;
try {
url = new DigestURI(oaipmhurl, null);
OAIPMHReader r = new OAIPMHReader(sb.loader, url, sb.surrogatesInPath, "oaipmh-one");
ResumptionToken rt = r.getResumptionToken();
prop.put("import-one", 1);
prop.put("import-one_count", (rt == null) ? "not available" : Integer.toString(rt.getRecordCounter()));
prop.put("import-one_source", r.source());
prop.put("import-one_rt", r.getResumptionToken().toString());
// set next default url
try {
DigestURI nexturl = (rt == null) ? null : rt.resumptionURL(url);
if (rt != null) prop.put("defaulturl", (nexturl == null) ? "" : nexturl.toNormalform(true, false));
} catch (MalformedURLException e) {
prop.put("defaulturl", e.getMessage());
} catch (IOException e) {
// reached end of resumption
prop.put("defaulturl", e.getMessage());
}
} catch (MalformedURLException e) {
e.printStackTrace();
prop.put("import-one", 2);
prop.put("import-one_error", e.getMessage());
} catch (IOException e) {
e.printStackTrace();
prop.put("import-one", 2);
prop.put("import-one_error", e.getMessage());
}
}
if (post.containsKey("urlstartall")) {
String oaipmhurl = post.get("urlstartall");
DigestURI url = null;
try {
url = new DigestURI(oaipmhurl, null);
OAIPMHImporter.job = new OAIPMHImporter(sb.loader, url);
OAIPMHImporter.job.start();
prop.put("import-all", 1);
prop.put("import-all_thread", "started");
prop.put("import-all_source", OAIPMHImporter.job.source());
prop.put("import-all_count", 0);
prop.put("import-all_speed", 0);
prop.put("import-all_runningHours", 0);
prop.put("import-all_runningMinutes", 0);
prop.put("import-all_remainingHours", 0);
prop.put("import-all_remainingMinutes", 0);
} catch (MalformedURLException e) {
e.printStackTrace();
prop.put("import-all", 0);
prop.put("import-all_status", 1);
prop.put("import-all_status_message", e.getMessage());
}
}
}
return prop;

View File

@@ -30,7 +30,6 @@ import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.kelondroException;
@@ -72,7 +71,7 @@ public final class CrawlSwitchboard {
final Log log,
final File queuesRoot) {
log.logInfo("Initializing Word Index for the network '" + networkName + "', word hash cache size is " + Word.hashCacheSize + ".");
log.logInfo("Initializing Word Index for the network '" + networkName + "'.");
if (networkName == null || networkName.length() == 0) {
log.logSevere("no network name given - shutting down");

View File

@@ -17,11 +17,12 @@ public enum EventOrigin {
DHT_TRANSFER(3),
PROXY_LOAD(4),
LOCAL_CRAWLING(5),
GLOBAL_CRAWLING(6);
GLOBAL_CRAWLING(6),
SURROGATES(7);
protected int code;
private static final EventOrigin[] list = {
UNKNOWN, REMOTE_RECEIPTS, QUERIES, DHT_TRANSFER, PROXY_LOAD, LOCAL_CRAWLING, GLOBAL_CRAWLING};
UNKNOWN, REMOTE_RECEIPTS, QUERIES, DHT_TRANSFER, PROXY_LOAD, LOCAL_CRAWLING, GLOBAL_CRAWLING, SURROGATES};
private EventOrigin(int code) {
this.code = code;
}

View File

@@ -93,7 +93,7 @@ public class Segment {
migrateTextIndex(segmentPath, segmentPath);
migrateTextMetadata(segmentPath, segmentPath);
log.logInfo("Initializing Segment '" + segmentPath + "', word hash cache size is " + Word.hashCacheSize + ".");
log.logInfo("Initializing Segment '" + segmentPath + ".");
this.log = log;
this.segmentPath = segmentPath;

View File

@@ -56,7 +56,8 @@ public class Segments implements Iterable<Segment> {
PROXY,
LOCALCRAWLING,
REMOTECRAWLING,
PUBLIC; // includes the index that can be retrieved by the yacy p2p api
PUBLIC,
SURROGATES; // includes the index that can be retrieved by the yacy p2p api
public String toString() {
throw new UnsupportedOperationException("toString not allowed");
@@ -97,6 +98,7 @@ public class Segments implements Iterable<Segment> {
this.process_assignment.put(Process.LOCALCRAWLING, "default");
this.process_assignment.put(Process.REMOTECRAWLING, "default");
this.process_assignment.put(Process.PUBLIC, "default");
this.process_assignment.put(Process.SURROGATES, "default");
}
public void setSegment(Process process, String segmentName) {

View File

@@ -1237,7 +1237,7 @@ public final class Switchboard extends serverSwitch {
0
);
response = new Response(request, null, null, "200", this.crawler.defaultSurrogateProfile);
indexingQueueEntry queueEntry = new indexingQueueEntry(Segments.Process.LOCALCRAWLING, response, document, null);
indexingQueueEntry queueEntry = new indexingQueueEntry(Segments.Process.SURROGATES, response, document, null);
// place the queue entry into the concurrent process of the condenser (document analysis)
try {
@@ -1717,9 +1717,10 @@ public final class Switchboard extends serverSwitch {
// CREATE INDEX
final String dc_title = document.dc_title();
final DigestURI referrerURL = queueEntry.referrerURL();
final EventOrigin processCase = queueEntry.processCase(peers.mySeed().hash);
EventOrigin processCase = queueEntry.processCase(peers.mySeed().hash);
if (process == Segments.Process.SURROGATES) processCase = EventOrigin.SURROGATES;
// remove stopwords
// remove stopwords
log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + queueEntry.url());
// STORE URL TO LOADED-URL-DB

View File

@@ -26,21 +26,12 @@
package net.yacy.document.importer;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import net.yacy.document.content.DCEntry;
import net.yacy.document.content.SurrogateReader;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.retrieval.Response;
import de.anomic.search.Switchboard;
@@ -59,22 +50,31 @@ public class OAIPMHImporter extends Thread implements Importer {
private int count;
private long startTime;
private ResumptionToken resumptionToken;
private String message;
public OAIPMHImporter(LoaderDispatcher loader, DigestURI source) {
this.loader = loader;
this.source = source;
this.count = 0;
this.startTime = System.currentTimeMillis();
this.resumptionToken = null;
this.message = "import initialized";
// fix start url
String url = ResumptionToken.truncatedURL(source);
if (!url.endsWith("?")) url = url + "?";
try {
this.source = new DigestURI(url + "verb=ListRecords&metadataPrefix=oai_dc", null);
} catch (MalformedURLException e) {
// this should never happen
e.printStackTrace();
}
}
public int count() {
return this.count;
}
public String status() {
return (this.resumptionToken == null) ? "" : this.resumptionToken.toString();
return this.message;
}
public ResumptionToken getResumptionToken() {
@@ -98,206 +98,20 @@ public class OAIPMHImporter extends Thread implements Importer {
}
public void run() {
Response response;
try {
response = this.loader.load(source, true, true, CrawlProfile.CACHE_STRATEGY_NOCACHE);
load(response);
} catch (IOException e) {
e.printStackTrace();
}
}
public void load0(DigestURI source) throws IOException {
Response response = HTTPLoader.load(new Request(source, null));
load(response);
}
private void load(Response response) throws IOException {
byte[] b = response.getContent();
this.resumptionToken = new ResumptionToken(new ByteArrayInputStream(b));
String file = this.source.getHost() + "_" + System.currentTimeMillis();
File f0 = new File(Switchboard.getSwitchboard().surrogatesInPath, file + ".tmp");
File f1 = new File(Switchboard.getSwitchboard().surrogatesInPath, file + ".xml");
FileUtils.copy(b, f0);
f0.renameTo(f1);
/*
SurrogateReader sr = new SurrogateReader(new ByteArrayInputStream(b), 100);
Thread srt = new Thread(sr);
srt.start();
DCEntry dce;
while ((dce = sr.take()) != DCEntry.poison) {
System.out.println(dce.toString());
}
try {
srt.join();
} catch (InterruptedException e) {}
*/
System.out.println("TOKEN: " + resumptionToken.toString());
}
public static StringBuilder escape(final String s) {
final int len = s.length();
final StringBuilder sbuf = new StringBuilder(len + 10);
for (int i = 0; i < len; i++) {
final int ch = s.charAt(i);
if ('A' <= ch && ch <= 'Z') { // 'A'..'Z'
sbuf.append((char)ch);
} else if ('a' <= ch && ch <= 'z') { // 'a'..'z'
sbuf.append((char)ch);
} else if ('0' <= ch && ch <= '9') { // '0'..'9'
sbuf.append((char)ch);
} else if (ch == ' ') { // space
sbuf.append("%20");
} else if (ch == '&' || ch == ':' // unreserved
|| ch == '-' || ch == '_'
|| ch == '.' || ch == '!'
|| ch == '~' || ch == '*'
|| ch == '\'' || ch == '('
|| ch == ')' || ch == ';') {
sbuf.append((char)ch);
this.message = "loading first part of records";
while (true) {
try {
OAIPMHReader reader = new OAIPMHReader(this.loader, this.source, Switchboard.getSwitchboard().surrogatesInPath, "oaipmh");
this.source = reader.getResumptionToken().resumptionURL(this.source);
if (this.source == null) {
this.message = "import terminated with source = null";
break;
}
this.message = "loading next resumption fragment, cursor = " + reader.getResumptionToken().getCursor();
} catch (IOException e) {
this.message = e.getMessage();
break;
}
}
return sbuf;
}
public static String unescape(final String s) {
final int l = s.length();
final StringBuilder sbuf = new StringBuilder(l);
int ch = -1;
int b, sumb = 0;
for (int i = 0, more = -1; i < l; i++) {
/* Get next byte b from URL segment s */
switch (ch = s.charAt(i)) {
case '%':
if (i + 2 < l) {
ch = s.charAt(++i);
int hb = (Character.isDigit ((char) ch) ? ch - '0' : 10 + Character.toLowerCase((char) ch) - 'a') & 0xF;
ch = s.charAt(++i);
int lb = (Character.isDigit ((char) ch) ? ch - '0' : 10 + Character.toLowerCase ((char) ch) - 'a') & 0xF;
b = (hb << 4) | lb;
} else {
b = ch;
}
break;
case '+':
b = ' ';
break;
default:
b = ch;
}
}
return sbuf.toString();
}
}
/*
http://an.oa.org/OAI-script?verb=GetRecord&identifier=oai:arXiv.org:hep-th/9901001&metadataPrefix=oai_dc
special characters in URIs must be encoded, the correct form of the above GET request URL is:
http://an.oa.org/OAI-script?verb=GetRecord&identifier=oai%3AarXiv.org%3Ahep-th%2F9901001&metadataPrefix=oai_dc
"/","%2F"
"?","%3F"
"#","%23"
"=","%3D"
"&","%26"
":","%3A"
";","%3B"
" ","%20"
"%","%25"
"+","%2B"
GetRecord
http://arXiv.org/oai2?verb=GetRecord&identifier=oai:arXiv.org:cs/0112017&metadataPrefix=oai_dc
http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=GetRecord&identifier=oai:opus.bsz-bw.de-fhhv:6&metadataPrefix=oai_dc
Identify
http://memory.loc.gov/cgi-bin/oai?verb=Identify
ListIdentifiers
http://an.oa.org/OAI-script?verb=ListIdentifiers&from=1998-01-15&metadataPrefix=oldArXiv&set=physics:hep
http://an.oa.org/OAI-script?verb=ListIdentifiers&resumptionToken=xxx45abttyz
http://www.perseus.tufts.edu/cgi-bin/pdataprov?verb=ListIdentifiers&metadataPrefix=olac&from=2001-01-01&until=2001-01-01&set=Perseus:collection:PersInfo
ListMetadataFormats
http://www.perseus.tufts.edu/cgi-bin/pdataprov?verb=ListMetadataFormats&identifier=oai:perseus.tufts.edu:Perseus:text:1999.02.0119
http://memory.loc.gov/cgi-bin/oai?verb=ListMetadataFormats
http://memory.loc.gov/cgi-bin/oai?verb=ListMetadataFormats&identifier=oai:lcoa1.loc.gov:loc.rbc/rbpe.00000111
http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListMetadataFormats
ListRecords
http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListRecords&metadataPrefix=oai_dc
http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListRecords&resumptionToken=455
http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListRecords&resumptionToken=890
http://an.oa.org/OAI-script?verb=ListRecords&from=1998-01-15&set=physics:hep&metadataPrefix=oai_rfc1807
http://www.perseus.tufts.edu/cgi-bin/pdataprov?verb=ListRecords&from=2002-05-01T14:15:00Z&until=2002-05-01T14:20:00Z&metadataPrefix=oai_dc
http://memory.loc.gov/cgi-bin/oai?verb=ListRecords&from=2002-06-01T02:00:00Z&until=2002-06-01T03:00:00Z&metadataPrefix=oai_marc
ListSets
http://an.oa.org/OAI-script?verb=ListSets
http://purl.org/alcme/etdcat/servlet/OAIHandler?verb=ListSets
URN identifiers can be resolved via the resolver of the German National Library (d-nb):
http://nbn-resolving.de/urn:nbn:de:bsz:960-opus-1860
<?xml version="1.0" encoding="UTF-8"?>
<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/
http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd">
<responseDate>2009-10-01T22:20:04Z</responseDate>
<request verb="ListRecords" metadataPrefix="oai_dc">http://opus.bsz-bw.de/fhhv/oai2/oai2.php</request>
<ListRecords>
<record>
<header>
<identifier>oai:opus.bsz-bw.de-fhhv:1</identifier>
<datestamp>2008-03-04T12:17:33Z</datestamp>
<setSpec>ddc:020</setSpec>
<setSpec>pub-type:2</setSpec>
<setSpec>has-source-swb:false</setSpec>
</header>
<metadata>
<oai_dc:dc
xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/
http://www.openarchives.org/OAI/2.0/oai_dc.xsd">
<dc:title>Teaching Information Literacy with the Lerninformationssystem</dc:title>
<dc:creator>Hauschke, Christian</dc:creator>
<dc:creator>Ullmann, Nadine</dc:creator>
<dc:subject>Informationskompetenz</dc:subject>
<dc:subject>E-Learning</dc:subject>
<dc:subject>Bibliothek</dc:subject>
<dc:subject>Informationsvermittlung</dc:subject>
<dc:subject>Wissenschaftliches Arbeiten</dc:subject>
<dc:subject>information literacy</dc:subject>
<dc:subject>e-learning</dc:subject>
<dc:subject>library</dc:subject>
<dc:subject>information dissemination</dc:subject>
<dc:subject>Library and information sciences</dc:subject>
<dc:description>A German university has developed a learning information system to improve information literacy among German students. An online tutorial based on this Lerninformationssystem has been developed. The structure of this learning information system is described, an online tutorial based on it is illustrated, and the different learning styles that it supports are indicated.</dc:description>
<dc:publisher>Fachhochschule Hannover</dc:publisher>
<dc:publisher>Sonstige Einrichtungen. Sonstige Einrichtungen</dc:publisher>
<dc:date>2006</dc:date>
<dc:type>Article</dc:type>
<dc:format>application/pdf</dc:format>
<dc:identifier>urn:nbn:de:bsz:960-opus-10</dc:identifier>
<dc:identifier>http://opus.bsz-bw.de/fhhv/volltexte/2008/1/</dc:identifier>
<dc:source>Australian Academic &amp; Research Libraries, 37 (1), S. 55-60</dc:source>
<dc:language>eng</dc:language>
<dc:rights>http://creativecommons.org/licenses/by/2.0/de/deed.de</dc:rights>
</oai_dc:dc>
</metadata>
</record>
<resumptionToken
expirationDate="2009-10-02T20:20:04Z"
completeListSize="226"
cursor="0">119</resumptionToken>
</ListRecords>
</OAI-PMH>
*/
}

View File

@@ -0,0 +1,256 @@
// OAIPMHReader
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 30.09.2009 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2009-09-23 23:26:14 +0200 (Mi, 23 Sep 2009) $
// $LastChangedRevision: 6340 $
// $LastChangedBy: low012 $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.document.importer;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.util.Date;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.DateFormatter;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.Response;
// get one server with
// http://roar.eprints.org/index.php?action=csv
// list records from oai-pmh like
// http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListRecords&metadataPrefix=oai_dc
public class OAIPMHReader {
private DigestURI source;
private ResumptionToken resumptionToken;
public OAIPMHReader(LoaderDispatcher loader, DigestURI source, File targetDir, String filePrefix) throws IOException {
this.source = source;
// load the file from the net
Response response;
response = loader.load(source, true, true, CrawlProfile.CACHE_STRATEGY_NOCACHE);
byte[] b = response.getContent();
this.resumptionToken = new ResumptionToken(new ByteArrayInputStream(b));
String file = filePrefix + "_" + this.source.getHost() + "_" + DateFormatter.formatShortMilliSecond(new Date());
File f0 = new File(targetDir, file + ".tmp");
File f1 = new File(targetDir, file + ".xml");
// transaction-safe writing
FileUtils.copy(b, f0);
f0.renameTo(f1);
/*
SurrogateReader sr = new SurrogateReader(new ByteArrayInputStream(b), 100);
Thread srt = new Thread(sr);
srt.start();
DCEntry dce;
while ((dce = sr.take()) != DCEntry.poison) {
System.out.println(dce.toString());
}
try {
srt.join();
} catch (InterruptedException e) {}
*/
}
public ResumptionToken getResumptionToken() {
return this.resumptionToken;
}
public String source() {
return source.toNormalform(true, false);
}
public static StringBuilder escape(final String s) {
final int len = s.length();
final StringBuilder sbuf = new StringBuilder(len + 10);
for (int i = 0; i < len; i++) {
final int ch = s.charAt(i);
if ('A' <= ch && ch <= 'Z') { // 'A'..'Z'
sbuf.append((char)ch);
} else if ('a' <= ch && ch <= 'z') { // 'a'..'z'
sbuf.append((char)ch);
} else if ('0' <= ch && ch <= '9') { // '0'..'9'
sbuf.append((char)ch);
} else if (ch == ' ') { // space
sbuf.append("%20");
} else if (ch == '&' || ch == ':' // unreserved
|| ch == '-' || ch == '_'
|| ch == '.' || ch == '!'
|| ch == '~' || ch == '*'
|| ch == '\'' || ch == '('
|| ch == ')' || ch == ';') {
sbuf.append((char)ch);
}
}
return sbuf;
}
public static String unescape(final String s) {
final int l = s.length();
final StringBuilder sbuf = new StringBuilder(l);
int ch = -1;
int b;
for (int i = 0; i < l; i++) {
/* Get next byte b from URL segment s */
switch (ch = s.charAt(i)) {
case '%':
if (i + 2 < l) {
ch = s.charAt(++i);
int hb = (Character.isDigit ((char) ch) ? ch - '0' : 10 + Character.toLowerCase((char) ch) - 'a') & 0xF;
ch = s.charAt(++i);
int lb = (Character.isDigit ((char) ch) ? ch - '0' : 10 + Character.toLowerCase ((char) ch) - 'a') & 0xF;
b = (hb << 4) | lb;
} else {
b = ch;
}
break;
case '+':
b = ' ';
break;
default:
b = ch;
}
sbuf.append(b);
}
return sbuf.toString();
}
}
/*
http://an.oa.org/OAI-script?verb=GetRecord&identifier=oai:arXiv.org:hep-th/9901001&metadataPrefix=oai_dc
special characters in URIs must be encoded, the correct form of the above GET request URL is:
http://an.oa.org/OAI-script?verb=GetRecord&identifier=oai%3AarXiv.org%3Ahep-th%2F9901001&metadataPrefix=oai_dc
"/","%2F"
"?","%3F"
"#","%23"
"=","%3D"
"&","%26"
":","%3A"
";","%3B"
" ","%20"
"%","%25"
"+","%2B"
GetRecord
http://arXiv.org/oai2?verb=GetRecord&identifier=oai:arXiv.org:cs/0112017&metadataPrefix=oai_dc
http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=GetRecord&identifier=oai:opus.bsz-bw.de-fhhv:6&metadataPrefix=oai_dc
Identify
http://memory.loc.gov/cgi-bin/oai?verb=Identify
ListIdentifiers
http://an.oa.org/OAI-script?verb=ListIdentifiers&from=1998-01-15&metadataPrefix=oldArXiv&set=physics:hep
http://an.oa.org/OAI-script?verb=ListIdentifiers&resumptionToken=xxx45abttyz
http://www.perseus.tufts.edu/cgi-bin/pdataprov?verb=ListIdentifiers&metadataPrefix=olac&from=2001-01-01&until=2001-01-01&set=Perseus:collection:PersInfo
ListMetadataFormats
http://www.perseus.tufts.edu/cgi-bin/pdataprov?verb=ListMetadataFormats&identifier=oai:perseus.tufts.edu:Perseus:text:1999.02.0119
http://memory.loc.gov/cgi-bin/oai?verb=ListMetadataFormats
http://memory.loc.gov/cgi-bin/oai?verb=ListMetadataFormats&identifier=oai:lcoa1.loc.gov:loc.rbc/rbpe.00000111
http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListMetadataFormats
ListRecords
http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListRecords&metadataPrefix=oai_dc
http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListRecords&resumptionToken=455
http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListRecords&resumptionToken=890
http://an.oa.org/OAI-script?verb=ListRecords&from=1998-01-15&set=physics:hep&metadataPrefix=oai_rfc1807
http://www.perseus.tufts.edu/cgi-bin/pdataprov?verb=ListRecords&from=2002-05-01T14:15:00Z&until=2002-05-01T14:20:00Z&metadataPrefix=oai_dc
http://memory.loc.gov/cgi-bin/oai?verb=ListRecords&from=2002-06-01T02:00:00Z&until=2002-06-01T03:00:00Z&metadataPrefix=oai_marc
ListSets
http://an.oa.org/OAI-script?verb=ListSets
http://purl.org/alcme/etdcat/servlet/OAIHandler?verb=ListSets
URN identifiers can be resolved via the resolver of the German National Library (d-nb):
http://nbn-resolving.de/urn:nbn:de:bsz:960-opus-1860
<?xml version="1.0" encoding="UTF-8"?>
<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/
http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd">
<responseDate>2009-10-01T22:20:04Z</responseDate>
<request verb="ListRecords" metadataPrefix="oai_dc">http://opus.bsz-bw.de/fhhv/oai2/oai2.php</request>
<ListRecords>
<record>
<header>
<identifier>oai:opus.bsz-bw.de-fhhv:1</identifier>
<datestamp>2008-03-04T12:17:33Z</datestamp>
<setSpec>ddc:020</setSpec>
<setSpec>pub-type:2</setSpec>
<setSpec>has-source-swb:false</setSpec>
</header>
<metadata>
<oai_dc:dc
xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/
http://www.openarchives.org/OAI/2.0/oai_dc.xsd">
<dc:title>Teaching Information Literacy with the Lerninformationssystem</dc:title>
<dc:creator>Hauschke, Christian</dc:creator>
<dc:creator>Ullmann, Nadine</dc:creator>
<dc:subject>Informationskompetenz</dc:subject>
<dc:subject>E-Learning</dc:subject>
<dc:subject>Bibliothek</dc:subject>
<dc:subject>Informationsvermittlung</dc:subject>
<dc:subject>Wissenschaftliches Arbeiten</dc:subject>
<dc:subject>information literacy</dc:subject>
<dc:subject>e-learning</dc:subject>
<dc:subject>library</dc:subject>
<dc:subject>information dissemination</dc:subject>
<dc:subject>Library and information sciences</dc:subject>
<dc:description>A German university has developed a learning information system to improve information literacy among German students. An online tutorial based on this Lerninformationssystem has been developed. The structure of this learning information system is described, an online tutorial based on it is illustrated, and the different learning styles that it supports are indicated.</dc:description>
<dc:publisher>Fachhochschule Hannover</dc:publisher>
<dc:publisher>Sonstige Einrichtungen. Sonstige Einrichtungen</dc:publisher>
<dc:date>2006</dc:date>
<dc:type>Article</dc:type>
<dc:format>application/pdf</dc:format>
<dc:identifier>urn:nbn:de:bsz:960-opus-10</dc:identifier>
<dc:identifier>http://opus.bsz-bw.de/fhhv/volltexte/2008/1/</dc:identifier>
<dc:source>Australian Academic &amp; Research Libraries, 37 (1), S. 55-60</dc:source>
<dc:language>eng</dc:language>
<dc:rights>http://creativecommons.org/licenses/by/2.0/de/deed.de</dc:rights>
</oai_dc:dc>
</metadata>
</record>
<resumptionToken
expirationDate="2009-10-02T20:20:04Z"
completeListSize="226"
cursor="0">119</resumptionToken>
</ListRecords>
</OAI-PMH>
*/

View File

@@ -55,8 +55,11 @@ public class ResumptionToken extends TreeMap<String, String> {
insensitiveCollator.setDecomposition(Collator.NO_DECOMPOSITION);
}
int recordCounter;
public ResumptionToken(final InputStream stream) throws IOException {
super((Collator) insensitiveCollator.clone());
this.recordCounter = 0;
new Reader(stream);
}
@@ -67,6 +70,7 @@ public class ResumptionToken extends TreeMap<String, String> {
String token
) {
super((Collator) insensitiveCollator.clone());
this.recordCounter = 0;
this.put("expirationDate", DateFormatter.formatISO8601(expirationDate));
this.put("completeListSize", Integer.toString(completeListSize));
this.put("cursor", Integer.toString(cursor));
@@ -80,12 +84,33 @@ public class ResumptionToken extends TreeMap<String, String> {
String token
) {
super((Collator) insensitiveCollator.clone());
this.recordCounter = 0;
this.put("expirationDate", expirationDate);
this.put("completeListSize", Integer.toString(completeListSize));
this.put("cursor", Integer.toString(cursor));
this.put("token", token);
}
/**
* truncate the given url at the '?'
* @param url
* @return a string containing the url up to and including the '?'
*/
public static String truncatedURL(DigestURI url) {
String u = url.toNormalform(true, true);
int i = u.indexOf('?');
if (i > 0) u = u.substring(0, i + 1);
return u;
}
/**
* while parsing the resumption token, all records are counted as well
* @return the result from counting the records
*/
public int getRecordCounter() {
return this.recordCounter;
}
/**
* compute a url that can be used to resume the retrieval from the OAI-PMH resource
* @param givenURL
@@ -97,9 +122,7 @@ public class ResumptionToken extends TreeMap<String, String> {
String token = this.getToken();
if (token == null || token.length() == 0) throw new IOException("end of resumption reached");
String url = givenURL.toNormalform(true, true);
int i = url.indexOf('?');
if (i > 0) url = url.substring(0, i + 1);
String url = truncatedURL(givenURL);
// encoded state
if (token.indexOf("from=") >= 0) {
@@ -225,6 +248,9 @@
*/
public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException {
if ("record".equals(tag)) {
recordCounter++;
}
if ("resumptionToken".equals(tag)) {
this.parsingValue = true;
this.atts = atts;

View File

@@ -52,7 +52,7 @@ public class Word {
*/
public static final int commonHashLength = 12;
public static final int hashCacheSize = Math.max(2048, Math.min(100000, (int) (MemoryControl.available() / 20000L)));
private static final int hashCacheSize = Math.max(2048, Math.min(100000, (int) (MemoryControl.available() / 20000L)));
private static final ARC<String, byte[]> hashCache = new ConcurrentARC<String, byte[]>(hashCacheSize, Runtime.getRuntime().availableProcessors());
// object carries statistics for words and sentences