enhanced and fixed OAI-PMH import

- now importing OAI-PMH server list from two sources - simultaneous import from several servers (even > 2000) - check buttons on OAI-PMH server list to select multiple servers for import start - it is possible to select all servers at once for import - imported XML data is gzipped after import from surrogate reader git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6847 6c8d7289-2bf4-0310-a012-ef5d649a1542
2024-09-19 00:01:41 +02:00 · 2010-04-30 14:03:51 +00:00 · 2010-04-30 14:03:51 +00:00 · fc5efcc05a
commit fc5efcc05a
parent c2098f9399
17 changed files with 613 additions and 296 deletions
--- a/.classpath
+++ b/.classpath
@ -1,43 +1,43 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<classpath>
-	<classpathentry excluding="env/|htdocsdefault/|proxymsg/|yacy/|env/|yacy/user/|yacy/user/|yacy/ui/|processing/domaingraph/applet/|processing/domaingraph/|api/|api/bookmarks/posts/|api/bookmarks/|api/util/|api/bookmarks/xbel/|api/bookmarks/tags/" kind="src" path="htroot"/>
-	<classpathentry kind="src" path="test"/>
-	<classpathentry excluding="user/|user/|ui/" kind="src" path="htroot/yacy"/>
-	<classpathentry kind="src" path="htroot/env"/>
-	<classpathentry kind="src" path="source"/>
-	<classpathentry kind="src" path="htroot/yacy/ui"/>
-	<classpathentry excluding="bookmarks/posts/|bookmarks/|util/|bookmarks/xbel/|bookmarks/tags/" kind="src" path="htroot/api"/>
-	<classpathentry kind="src" path="htroot/api/bookmarks/posts"/>
-	<classpathentry excluding="posts/|xbel/|tags/" kind="src" path="htroot/api/bookmarks"/>
-	<classpathentry kind="src" path="htroot/api/util"/>
-	<classpathentry kind="src" path="htroot/api/bookmarks/xbel"/>
-	<classpathentry kind="src" path="htroot/api/bookmarks/tags"/>
-	<classpathentry exported="true" kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
-	<classpathentry exported="true" kind="lib" path="lib/commons-httpclient-3.1.jar"/>
-	<classpathentry exported="true" kind="lib" path="lib/commons-logging-1.1.1.jar"/>
-	<classpathentry exported="true" kind="lib" path="lib/commons-io-1.4.jar"/>
-	<classpathentry exported="true" kind="lib" path="lib/commons-fileupload-1.2.1.jar"/>
-	<classpathentry exported="true" kind="lib" path="lib/servlet-api.jar"/>
-	<classpathentry exported="true" kind="lib" path="lib/xerces.jar"/>
-	<classpathentry exported="true" kind="lib" path="lib/bzip2.jar"/>
-	<classpathentry exported="true" kind="lib" path="lib/J7Zip-modified.jar"/>
-	<classpathentry exported="true" kind="lib" path="lib/webcat-0.1-swf.jar"/>
-	<classpathentry exported="true" kind="lib" path="lib/activation.jar"/>
-	<classpathentry exported="true" kind="lib" path="lib/commons-jxpath-1.3.jar"/>
-	<classpathentry exported="true" kind="lib" path="libt/junit-4.7.jar"/>
-	<classpathentry kind="lib" path="lib/fontbox-1.0.0.jar"/>
-	<classpathentry kind="lib" path="lib/pdfbox-1.0.0.jar"/>
-	<classpathentry kind="lib" path="lib/poi-3.6-20091214.jar"/>
-	<classpathentry kind="lib" path="lib/poi-scratchpad-3.6-20091214.jar"/>
-	<classpathentry kind="lib" path="lib/bcmail-jdk15-145.jar"/>
-	<classpathentry kind="lib" path="lib/bcprov-jdk15-145.jar"/>
-	<classpathentry kind="lib" path="lib/jsch-0.1.42.jar"/>
-	<classpathentry kind="lib" path="lib/jakarta-oro-2.0.8.jar"/>
-	<classpathentry kind="lib" path="lib/commons-codec-1.4.jar"/>
-	<classpathentry kind="lib" path="lib/log4j-1.2.15.jar"/>
-	<classpathentry kind="lib" path="lib/mysql-connector-java-5.1.12-bin.jar"/>
-	<classpathentry kind="lib" path="lib/jcifs-1.3.14.jar"/>
-	<classpathentry kind="lib" path="lib/metadata-extractor-2.4.0-beta-1.jar"/>
-	<classpathentry exported="true" kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/>
-	<classpathentry kind="output" path="gen"/>
-</classpath>
+<?xml version="1.0" encoding="UTF-8"?>
+<classpath>
+	<classpathentry excluding="env/|htdocsdefault/|proxymsg/|yacy/|env/|yacy/user/|yacy/user/|yacy/ui/|processing/domaingraph/applet/|processing/domaingraph/|api/|api/bookmarks/posts/|api/bookmarks/|api/util/|api/bookmarks/xbel/|api/bookmarks/tags/" kind="src" path="htroot"/>
+	<classpathentry kind="src" path="test"/>
+	<classpathentry excluding="user/|user/|ui/" kind="src" path="htroot/yacy"/>
+	<classpathentry kind="src" path="htroot/env"/>
+	<classpathentry kind="src" path="source"/>
+	<classpathentry kind="src" path="htroot/yacy/ui"/>
+	<classpathentry excluding="bookmarks/posts/|bookmarks/|util/|bookmarks/xbel/|bookmarks/tags/" kind="src" path="htroot/api"/>
+	<classpathentry kind="src" path="htroot/api/bookmarks/posts"/>
+	<classpathentry excluding="posts/|xbel/|tags/" kind="src" path="htroot/api/bookmarks"/>
+	<classpathentry kind="src" path="htroot/api/util"/>
+	<classpathentry kind="src" path="htroot/api/bookmarks/xbel"/>
+	<classpathentry kind="src" path="htroot/api/bookmarks/tags"/>
+	<classpathentry exported="true" kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
+	<classpathentry exported="true" kind="lib" path="lib/commons-httpclient-3.1.jar" sourcepath="/commons-httpclient-3.1/src"/>
+	<classpathentry exported="true" kind="lib" path="lib/commons-logging-1.1.1.jar"/>
+	<classpathentry exported="true" kind="lib" path="lib/commons-io-1.4.jar"/>
+	<classpathentry exported="true" kind="lib" path="lib/commons-fileupload-1.2.1.jar"/>
+	<classpathentry exported="true" kind="lib" path="lib/servlet-api.jar"/>
+	<classpathentry exported="true" kind="lib" path="lib/xerces.jar"/>
+	<classpathentry exported="true" kind="lib" path="lib/bzip2.jar"/>
+	<classpathentry exported="true" kind="lib" path="lib/J7Zip-modified.jar"/>
+	<classpathentry exported="true" kind="lib" path="lib/webcat-0.1-swf.jar"/>
+	<classpathentry exported="true" kind="lib" path="lib/activation.jar"/>
+	<classpathentry exported="true" kind="lib" path="lib/commons-jxpath-1.3.jar"/>
+	<classpathentry exported="true" kind="lib" path="libt/junit-4.7.jar"/>
+	<classpathentry kind="lib" path="lib/fontbox-1.0.0.jar"/>
+	<classpathentry kind="lib" path="lib/pdfbox-1.0.0.jar"/>
+	<classpathentry kind="lib" path="lib/poi-3.6-20091214.jar"/>
+	<classpathentry kind="lib" path="lib/poi-scratchpad-3.6-20091214.jar"/>
+	<classpathentry kind="lib" path="lib/bcmail-jdk15-145.jar"/>
+	<classpathentry kind="lib" path="lib/bcprov-jdk15-145.jar"/>
+	<classpathentry kind="lib" path="lib/jsch-0.1.42.jar"/>
+	<classpathentry kind="lib" path="lib/jakarta-oro-2.0.8.jar"/>
+	<classpathentry kind="lib" path="lib/commons-codec-1.4.jar"/>
+	<classpathentry kind="lib" path="lib/log4j-1.2.15.jar"/>
+	<classpathentry kind="lib" path="lib/mysql-connector-java-5.1.12-bin.jar"/>
+	<classpathentry kind="lib" path="lib/jcifs-1.3.14.jar"/>
+	<classpathentry kind="lib" path="lib/metadata-extractor-2.4.0-beta-1.jar"/>
+	<classpathentry exported="true" kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/>
+	<classpathentry kind="output" path="gen"/>
+</classpath>
--- a/build.properties
+++ b/build.properties
@ -3,7 +3,7 @@ javacSource=1.5
 javacTarget=1.5

 # Release Configuration
-releaseVersion=0.94
+releaseVersion=0.95
 stdReleaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
 sourceReleaseFile=yacy_src_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
 releaseFileParentDir=yacy
--- a/htroot/CrawlResults.java
+++ b/htroot/CrawlResults.java
@ -187,8 +187,8 @@ public class CrawlResults {
                entry = i.next();
                try {
                    urle = sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).load(entry.getKey().getBytes(), null, 0);
-                    if(urle == null) {
-                        Log.logWarning("PLASMA", "CrawlResults: URL not in index for crawl result "+ i +" with hash "+ entry.getKey());
+                    if (urle == null) {
+                        Log.logWarning("PLASMA", "CrawlResults: URL not in index with url hash "+ entry.getKey());
                        urlstr = null;
                        urltxt = null;
                        metadata = null;
--- a/htroot/IndexImportOAIPMHList_p.html
+++ b/htroot/IndexImportOAIPMHList_p.html
@ -4,20 +4,49 @@
    <title>YaCy '#[clientname]#': OAI-PMH source import list</title>
    #%env/templates/metas.template%#
    #(refresh)#::<meta http-equiv="REFRESH" content="6" />#(/refresh)#
+    <script>
+    <!--
+    function setall(name, check){
+        var selectForm = document.forms.namedItem(name);
+        var count = selectForm.elements["num"].value;
+        if (check) for(i = 0; i < count; i++) {
+            if (selectForm.elements["item_" + i].checked) {
+                check = false;
+                break;
+            }
+        }
+        for(i = 0; i < count; i++){
+    	    selectForm.elements["item_" + i].checked = check;
+        }
+    }
+    -->
+    </script>
+    <script src="/js/sorttable.js"></script>
  </head>
  <body>
    #(source)#::
-    <h3>OAI Source List</h3>
+    <h3>List of #[num]# OAI-PMH Servers</h3>
+    <form action="IndexImportOAIPMH_p.html" target="_top" method="post" enctype="multipart/form-data" accept-charset="UTF-8" name="oaipmhimport">
+    <p>
+    <input type="hidden" name="num" value="#[num]#" />
+    <input type="submit" name="loadrows" value="Load Selected Sources" />
+    </p>
    <table cellpadding="2" cellspacing="1" >
      <tr class="TableHeader">
+        <td><input type="checkbox" name="allswitch" onclick="setall(this.form.name, this.value)" /></td>
        <td>Source</td>
      </tr>
      #{table}#
      <tr class="TableCell#(dark)#Light::Dark#(/dark)#">
-        <td>#[source]#</td>
+        <td align="left"><input type="checkbox" name="item_#[count]#" value="mark_#[source]#" /></td>
+        <td>#[loadurl]#</td>
      </tr>
      #{/table}#
    </table>
+    <p>
+    <input type="submit" name="loadrows" value="Load Selected Sources" />
+    </p>
+    </form>
    #(/source)#
    
    #(import)#::
--- a/htroot/IndexImportOAIPMHList_p.java
+++ b/htroot/IndexImportOAIPMHList_p.java
@ -25,6 +25,7 @@
 import java.util.ArrayList;
 import java.util.Set;

+import net.yacy.document.importer.OAIListFriendsLoader;
 import net.yacy.document.importer.OAIPMHImporter;

 import de.anomic.http.server.RequestHeader;
@ -43,39 +44,42 @@ public class IndexImportOAIPMHList_p {
        prop.put("source", 0);
        
        if (post != null && post.containsKey("source")) {
-            Set<String> oaiRoots = OAIPMHImporter.getAllListedOAIServer(sb.loader);
+            Set<String> oaiRoots = OAIListFriendsLoader.load(sb.loader).keySet();
            
            boolean dark = false;
-            int cnt = 0;
+            int count = 0;
            for (String root: oaiRoots) {
-                prop.put("source_table_" + cnt + "_dark", (dark) ? "1" : "0");
-                prop.put("source_table_" + cnt + "_source", "<a href=\"/IndexImportOAIPMH_p.html?importroot=&urlstartall=" + root + "\" target=\"_top\">" + root+ "</a>");
+                prop.put("source_table_" + count + "_dark", (dark) ? "1" : "0");
+                prop.put("source_table_" + count + "_count", count);
+                prop.put("source_table_" + count + "_source", root);
+                prop.put("source_table_" + count + "_loadurl", "<a href=\"/IndexImportOAIPMH_p.html?urlstart=" + root + "\" target=\"_top\">" + root + "</a>");
                dark = !dark;
-                cnt++;
+                count++;
            }
-            prop.put("source_table", cnt);
+            prop.put("source_table", count);
+            prop.put("source_num", count);
            prop.put("source", 1);
        }
        
        if (post != null && post.containsKey("import")) {
            ArrayList<OAIPMHImporter> jobs = new ArrayList<OAIPMHImporter>();
-            for (OAIPMHImporter job: OAIPMHImporter.runningJobs) jobs.add(job);
-            for (OAIPMHImporter job: OAIPMHImporter.startedJobs) jobs.add(job);
-            for (OAIPMHImporter job: OAIPMHImporter.finishedJobs) jobs.add(job);
+            for (OAIPMHImporter job: OAIPMHImporter.runningJobs.keySet()) jobs.add(job);
+            for (OAIPMHImporter job: OAIPMHImporter.startedJobs.keySet()) jobs.add(job);
+            for (OAIPMHImporter job: OAIPMHImporter.finishedJobs.keySet()) jobs.add(job);
            
            boolean dark = false;
-            int cnt = 0;
+            int count = 0;
            for (OAIPMHImporter job: jobs) {
-                prop.put("import_table_" + cnt + "_dark", (dark) ? "1" : "0");
-                prop.put("import_table_" + cnt + "_thread", (job.isAlive()) ? "<img src=\"/env/grafics/loading.gif\" alt=\"running\" />" : "finished");
-                prop.put("import_table_" + cnt + "_source", job.source());
-                prop.put("import_table_" + cnt + "_chunkCount", job.chunkCount());
-                prop.put("import_table_" + cnt + "_recordsCount", job.count());
-                prop.put("import_table_" + cnt + "_speed", job.speed());
+                prop.put("import_table_" + count + "_dark", (dark) ? "1" : "0");
+                prop.put("import_table_" + count + "_thread", (job.isAlive()) ? "<img src=\"/env/grafics/loading.gif\" alt=\"running\" />" : "finished");
+                prop.put("import_table_" + count + "_source", job.source());
+                prop.put("import_table_" + count + "_chunkCount", job.chunkCount());
+                prop.put("import_table_" + count + "_recordsCount", job.count());
+                prop.put("import_table_" + count + "_speed", job.speed());
                dark = !dark;
-                cnt++;
+                count++;
            }
-            prop.put("import_table", cnt);
+            prop.put("import_table", count);
            prop.put("import", 1);
            prop.put("refresh", 1);
        }
--- a/htroot/IndexImportOAIPMH_p.html
+++ b/htroot/IndexImportOAIPMH_p.html
@ -33,7 +33,7 @@
        <fieldset>
          <legend>Import all Records from a server</legend>
          Import all records that follow according to resumption elements into index<br />
-          <input name="urlstartall" type="text" value="" size="80" />
+          <input name="urlstart" type="text" value="" size="80" />
          <input name="importroot" type="submit" value="import this source" />
          #(optiongetlist)#::or&nbsp;<input name="getlist" type="submit" value="import from a list" />#(/optiongetlist)#
          #(status)#::<p>Import started!</p>::<p>Bad input data: #[message]# </p>#(/status)#
--- a/htroot/IndexImportOAIPMH_p.java
+++ b/htroot/IndexImportOAIPMH_p.java
@ -24,9 +24,13 @@

 import java.io.IOException;
 import java.net.MalformedURLException;
+import java.util.ArrayList;
+import java.util.Map;
+import java.util.Random;
+import java.util.TreeSet;

 import net.yacy.document.importer.OAIPMHImporter;
-import net.yacy.document.importer.OAIPMHReader;
+import net.yacy.document.importer.OAIPMHLoader;
 import net.yacy.document.importer.ResumptionToken;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.logging.Log;
@ -55,7 +59,7 @@ public class IndexImportOAIPMH_p {
                DigestURI url = null;
                try {
                    url = new DigestURI(oaipmhurl, null);
-                    OAIPMHReader r = new OAIPMHReader(sb.loader, url, sb.surrogatesInPath, "oaipmh-one");
+                    OAIPMHLoader r = new OAIPMHLoader(sb.loader, url, sb.surrogatesInPath, "oaipmh-one");
                    ResumptionToken rt = r.getResumptionToken();
                    prop.put("import-one", 1);
                    prop.put("import-one_count", (rt == null) ? "not available" : Integer.toString(rt.getRecordCounter()));
@ -83,8 +87,8 @@ public class IndexImportOAIPMH_p {
                }
            }
            
-            if (post.containsKey("importroot")) {
-                String oaipmhurl = post.get("urlstartall", "");
+            if (post.get("urlstart", "").length() > 0) {
+                String oaipmhurl = post.get("urlstart", "");
                DigestURI url = null;
                try {
                    url = new DigestURI(oaipmhurl, null);
@ -100,6 +104,38 @@ public class IndexImportOAIPMH_p {
                }
            }
            
+            
+            if (post.get("loadrows", "").length() > 0) {
+                // create a time-ordered list of events to execute
+                TreeSet<String> sources = new TreeSet<String>();
+                for (Map.Entry<String, String> entry: post.entrySet()) {
+                    if (entry.getValue().startsWith("mark_")) {
+                        sources.add(entry.getValue().substring(5));
+                    }
+                }
+                prop.put("status", 1);
+                prop.put("optiongetlist", 1);
+                prop.put("iframetype", 1);
+                
+                // prepare the set for random read from it (to protect the servers at the beginning of the list)
+                ArrayList<String> sourceList = new ArrayList<String>(sources.size());
+                for (String oaipmhurl: sources) sourceList.add(oaipmhurl);
+                Random r = new Random(System.currentTimeMillis());
+                
+                // start jobs for the sources
+                DigestURI url = null;
+                while (sourceList.size() > 0) {
+                    String oaipmhurl = sourceList.remove(r.nextInt(sourceList.size()));
+                    try {
+                        url = new DigestURI(oaipmhurl, null);
+                        OAIPMHImporter job = new OAIPMHImporter(sb.loader, url);
+                        job.start();
+                    } catch (MalformedURLException e) {
+                        Log.logException(e);
+                    }
+                }
+            }
+            
            if (post.containsKey("getlist")) {
                prop.put("iframetype", 2);
            }
--- a/source/de/anomic/search/Switchboard.java
+++ b/source/de/anomic/search/Switchboard.java
@ -37,9 +37,16 @@
 package de.anomic.search;

 import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
 import java.io.File;
 import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
 import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
 import java.io.UnsupportedEncodingException;
 import java.net.MalformedURLException;
 import java.security.NoSuchAlgorithmException;
@ -60,6 +67,10 @@ import java.util.TreeSet;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.Semaphore;
 import java.util.regex.Pattern;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.GZIPOutputStream;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipInputStream;

 import net.yacy.document.Condenser;
 import net.yacy.document.Document;
@ -68,6 +79,7 @@ import net.yacy.document.ParserException;
 import net.yacy.document.content.DCEntry;
 import net.yacy.document.content.RSSMessage;
 import net.yacy.document.content.SurrogateReader;
+import net.yacy.document.importer.OAIListFriendsLoader;
 import net.yacy.document.parser.html.ImageEntry;
 import net.yacy.document.parser.xml.RSSFeed;
 import net.yacy.kelondro.data.meta.DigestURI;
@ -474,6 +486,7 @@ public final class Switchboard extends serverSwitch {
        // start a loader
        log.logConfig("Starting Crawl Loader");
        this.loader = new LoaderDispatcher(this);
+        OAIListFriendsLoader.init(this.loader);
        this.crawlQueues = new CrawlQueues(this, queuesRoot);
        this.crawlQueues.noticeURL.setMinimumDelta(
                this.getConfigLong("minimumLocalDelta", this.crawlQueues.noticeURL.getMinimumLocalDelta()),
@ -1236,58 +1249,106 @@ public final class Switchboard extends serverSwitch {
    }
    
    public boolean processSurrogate(final String s) {
-        File surrogateFile = new File(this.surrogatesInPath, s);
+        File infile = new File(this.surrogatesInPath, s);
+        if (!infile.exists() || !infile.canWrite() || !infile.canRead()) return false;
        File outfile = new File(this.surrogatesOutPath, s);
-        if (!surrogateFile.exists() || !surrogateFile.canWrite() || !surrogateFile.canRead()) return false;
        if (outfile.exists()) return false;
        boolean moved = false;
-        try {
-            SurrogateReader reader = new SurrogateReader(new BufferedInputStream(new FileInputStream(surrogateFile)), 3);
-            Thread readerThread = new Thread(reader, "Surrogate-Reader " + surrogateFile.getAbsolutePath());
-            readerThread.start();
-            DCEntry surrogate;
-            Response response;
-            while ((surrogate = reader.take()) != DCEntry.poison) {
-                // check if url is in accepted domain
-                assert surrogate != null;
-                assert crawlStacker != null;
-                final String urlRejectReason = crawlStacker.urlInAcceptedDomain(surrogate.getIdentifier());
-                if (urlRejectReason != null) {
-                    if (this.log.isFine()) this.log.logInfo("Rejected URL '" + surrogate.getIdentifier() + "': " + urlRejectReason);
-                    continue;
+        if (s.endsWith("xml.zip")) {
+            // open the zip file with all the xml files in it
+            try {
+                InputStream is = new BufferedInputStream(new FileInputStream(infile));
+                ZipInputStream zis = new ZipInputStream(is);
+                ZipEntry entry;
+                while ((entry = zis.getNextEntry()) != null) {
+                    int size;
+                    byte[] buffer = new byte[2048];
+                    ByteArrayOutputStream baos = new ByteArrayOutputStream();
+                    while ((size = zis.read(buffer, 0, buffer.length)) != -1) {
+                        baos.write(buffer, 0, size);
+                    }
+                    baos.flush();
+                    processSurrogate(new ByteArrayInputStream(baos.toByteArray()), entry.getName());
+                    baos.close();
                }
-                
-                // create a queue entry
-                Document document = surrogate.document();
-                Request request = new Request(
-                        peers.mySeed().hash.getBytes(), 
-                        surrogate.getIdentifier(), 
-                        null, 
-                        "", 
-                        new Date(),
-                        new Date(),
-                        this.crawler.defaultSurrogateProfile.handle(),
-                        0, 
-                        0, 
-                        0        
-                );
-                response = new Response(request, null, null, "200", this.crawler.defaultSurrogateProfile);
-                indexingQueueEntry queueEntry = new indexingQueueEntry(Segments.Process.SURROGATES, response, document, null);
-                
-                // place the queue entry into the concurrent process of the condenser (document analysis)
-                try {
-                    indexingCondensementProcessor.enQueue(queueEntry);
-                } catch (InterruptedException e) {
-                    Log.logException(e);
-                    break;
+            } catch (IOException e) {
+                Log.logException(e);
+            } finally {
+                moved = infile.renameTo(outfile);
+            }
+            return moved;
+        } else {
+            try {
+                InputStream is = new BufferedInputStream(new FileInputStream(infile));
+                if (s.endsWith(".gz")) is = new GZIPInputStream(is);
+                processSurrogate(is, infile.getName());
+            } catch (IOException e) {
+                Log.logException(e);
+            } finally {
+                moved = infile.renameTo(outfile);
+                if (moved) {
+                    // check if this file is already compressed, if not, compress now
+                    if (!outfile.getName().endsWith(".gz")) {
+                        String gzname = outfile.getName() + ".gz";
+                        File gzfile = new File(outfile.getParentFile(), gzname);
+                        try {
+                            OutputStream os = new BufferedOutputStream(new GZIPOutputStream(new FileOutputStream(gzfile)));
+                            FileUtils.copy(new BufferedInputStream(new FileInputStream(outfile)), os);
+                            os.close();
+                            if (gzfile.exists()) FileUtils.deletedelete(outfile);
+                        } catch (FileNotFoundException e) {
+                            Log.logException(e);
+                        } catch (IOException e) {
+                            Log.logException(e);
+                        }
+                    }
                }
            }
-        } catch (IOException e) {
-            Log.logException(e);
-        } finally {
-            moved = surrogateFile.renameTo(outfile);
+            return moved;
+        }
+    }
+    
+    public void processSurrogate(final InputStream is, String name) throws IOException {
+        SurrogateReader reader = new SurrogateReader(is, 3);
+        Thread readerThread = new Thread(reader, name);
+        readerThread.start();
+        DCEntry surrogate;
+        Response response;
+        while ((surrogate = reader.take()) != DCEntry.poison) {
+            // check if url is in accepted domain
+            assert surrogate != null;
+            assert crawlStacker != null;
+            final String urlRejectReason = crawlStacker.urlInAcceptedDomain(surrogate.getIdentifier());
+            if (urlRejectReason != null) {
+                if (this.log.isFine()) this.log.logInfo("Rejected URL '" + surrogate.getIdentifier() + "': " + urlRejectReason);
+                continue;
+            }
+            
+            // create a queue entry
+            Document document = surrogate.document();
+            Request request = new Request(
+                    peers.mySeed().hash.getBytes(), 
+                    surrogate.getIdentifier(), 
+                    null, 
+                    "", 
+                    new Date(),
+                    new Date(),
+                    this.crawler.defaultSurrogateProfile.handle(),
+                    0, 
+                    0, 
+                    0        
+            );
+            response = new Response(request, null, null, "200", this.crawler.defaultSurrogateProfile);
+            indexingQueueEntry queueEntry = new indexingQueueEntry(Segments.Process.SURROGATES, response, document, null);
+            
+            // place the queue entry into the concurrent process of the condenser (document analysis)
+            try {
+                indexingCondensementProcessor.enQueue(queueEntry);
+            } catch (InterruptedException e) {
+                Log.logException(e);
+                break;
+            }
        }
-        return moved;
    }

    public int surrogateQueueSize() {
@ -1326,7 +1387,7 @@ public final class Switchboard extends serverSwitch {
                    // check for interruption
                    checkInterruption();

-                    if (surrogate.endsWith(".xml")) {
+                    if (surrogate.endsWith(".xml") || surrogate.endsWith(".xml.gz") || surrogate.endsWith(".xml.zip")) {
                        // read the surrogate file and store entry in index
                        if (processSurrogate(surrogate)) return true;
                    }
--- a/source/net/yacy/document/importer/Importer.java
+++ b/source/net/yacy/document/importer/Importer.java
@ -1,3 +1,25 @@
+/**
+ *  Importer
+ *  Copyright 2009 by Michael Peter Christen
+ *  First released 29.04.2010 at http://yacy.net
+ *  
+ *  This is a part of YaCy, a peer-to-peer based web search engine
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU Lesser General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program in the file COPYING.LESSER.
+ *  If not, see <http://www.gnu.org/licenses/>.
+ */
+
 package net.yacy.document.importer;

 public interface Importer extends Runnable {
--- a/source/net/yacy/document/importer/MediawikiImporter.java
+++ b/source/net/yacy/document/importer/MediawikiImporter.java
@ -1,28 +1,24 @@
-// mediawikiIndex.java
-// (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
-// first published 20.11.2008 on http://yacy.net
-//
-// This is a part of YaCy, a peer-to-peer based web search engine
-//
-// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
-// $LastChangedRevision: 1986 $
-// $LastChangedBy: orbiter $
-//
-// LICENSE
-// 
-// This program is free software; you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation; either version 2 of the License, or
-// (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with this program; if not, write to the Free Software
-// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+/**
+ *  MediawikiImporter
+ *  Copyright 2008 by Michael Peter Christen
+ *  First released 20.11.2008 at http://yacy.net
+ *  
+ *  This is a part of YaCy, a peer-to-peer based web search engine
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU Lesser General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program in the file COPYING.LESSER.
+ *  If not, see <http://www.gnu.org/licenses/>.
+ */

 package net.yacy.document.importer;

--- a/source/net/yacy/document/importer/OAIListFriendsLoader.java
+++ b/source/net/yacy/document/importer/OAIListFriendsLoader.java
@ -0,0 +1,203 @@
+/**
+ *  OAIListFriendsLoader
+ *  Copyright 2010 by Michael Peter Christen
+ *  First released 29.04.2010 at http://yacy.net
+ *  
+ *  This is a part of YaCy, a peer-to-peer based web search engine
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU Lesser General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program in the file COPYING.LESSER.
+ *  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+package net.yacy.document.importer;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Map;
+import java.util.TreeMap;
+
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import net.yacy.kelondro.data.meta.DigestURI;
+import net.yacy.kelondro.logging.Log;
+import net.yacy.kelondro.util.FileUtils;
+import net.yacy.repository.LoaderDispatcher;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import de.anomic.crawler.CrawlProfile;
+import de.anomic.crawler.retrieval.Response;
+
+/**
+ * Loads lists of OAI-PMH servers ("ListFriends" documents) from two sources
+ * and merges them into one map.
+ * The documents are fetched in the background by {@link #init(LoaderDispatcher)}
+ * into cache files; {@link #load(LoaderDispatcher)} only reads those cache files.
+ */
+public class OAIListFriendsLoader {
+
+    // first source: ROAR export and its local cache file
+    //private static String url10 = "http://roar.eprints.org/cgi/roar_search/advanced/export_roar_ROAR::ListFriends.xml?screen=ROAR%3A%3AFacetSearch&_action_export=1&output=ROAR%3A%3AListFriends&exp=1|1|-recordcount%2F-date|archive|-|-|eprint_status%3Aeprint_status%3AALL%3AEQ%3Aarchive|metadata_visibility%3Ametadata_visibility%3AALL%3AEX%3Ashow";
+    private static final String url10 = "http://roar.eprints.org/cgi/roar_search/advanced/export_roar_ROAR::ListFriends.xml?_action_export=1&output=ROAR%3A%3AListFriends";
+    private static final File cache10 = new File("DATA/DICTIONARIES/harvesting/export_roar_ROAR_ListFriends.xml");
+
+    // second source: openarchives.org register and its local cache file
+    private static final String url20 = "http://www.openarchives.org/Register/ListFriends";
+    private static final File cache20 = new File("DATA/DICTIONARIES/harvesting/ListFriends.xml");
+
+    /**
+     * Trigger background loading of both ListFriends documents into their cache files.
+     * @param loader the LoaderDispatcher that performs the background loading
+     */
+    public static void init(LoaderDispatcher loader) {
+        loader.loadIfNotExistBackground(url10, cache10);
+        loader.loadIfNotExistBackground(url20, cache20);
+    }
+
+    /**
+     * Read both cached ListFriends files and merge their entries.
+     * A cache file that is missing or cannot be read contributes no entries;
+     * entries of the second file overwrite entries of the first with the same key.
+     * @param loader currently unused, the lists are read from the cache files only
+     * @return a map from OAI-PMH start url to the id attribute of its list entry
+     */
+    public static Map<String, String> load(LoaderDispatcher loader) {
+        final Map<String, String> servers = loadCached(cache10);
+        servers.putAll(loadCached(cache20));
+        return servers;
+    }
+
+    /**
+     * Read one cached ListFriends file.
+     * @param cacheFile the cache file to parse
+     * @return the parsed entries, or an empty map if the file could not be read
+     */
+    private static Map<String, String> loadCached(final File cacheFile) {
+        try {
+            return load(null, null, cacheFile);
+        } catch (IOException e) {
+            // no usable cache: behave as if the list is empty
+            return new TreeMap<String, String>();
+        }
+    }
+
+    /**
+     * Load an OAI ListFriends file from the net or from a cache location.
+     * If the given file exists, it is read and parsed. Otherwise the document is
+     * loaded from the source, stored in the file and then parsed.
+     * The resulting map is a mapping from OAI-PMH start url to a location description.
+     * @param loader a LoaderDispatcher that loads the document if targetFile does not exist; may be null
+     * @param source the source URL of the OAI ListFriends document; may be null
+     * @param targetFile the file the content is read from if it exists, and written to otherwise
+     * @return a Map from OAI-PMH source to source description (which is usually also a URL)
+     * @throws IOException if the file does not exist and cannot be fetched
+     *         (including the case that no loader or source was given), or if reading fails
+     */
+    private static Map<String, String> load(LoaderDispatcher loader, DigestURI source, File targetFile) throws IOException {
+        final byte[] b;
+        if (targetFile.exists()) {
+            // load file
+            b = FileUtils.read(targetFile);
+        } else {
+            // without loader or source there is nothing to fall back to; fail with an
+            // IOException instead of running into a NullPointerException
+            if (loader == null || source == null) {
+                throw new IOException("OAI ListFriends file " + targetFile + " does not exist and no loader/source is available to fetch it");
+            }
+            // load from the net
+            final Response response = loader.load(source, false, true, CrawlProfile.CACHE_STRATEGY_NOCACHE);
+            if (response == null || response.getContent() == null) {
+                throw new IOException("no content received from " + source);
+            }
+            b = response.getContent();
+            FileUtils.copy(b, targetFile);
+        }
+
+        return new Parser(b).map;
+    }
+
+
+    /**
+     * SAX handler that collects all baseURL elements of a ListFriends document.
+     * The element text becomes the map key, the value of the id attribute the map value.
+     */
+    private static class Parser extends DefaultHandler {
+
+        private final StringBuilder buffer;       // text of the baseURL element being read
+        private boolean parsingValue;             // true while inside a baseURL element
+        private String id;                        // id attribute of the baseURL element being read
+        private final TreeMap<String, String> map;
+
+        /**
+         * Parse the given document; the result is available in the map field afterwards.
+         * SAX and IO errors during parsing are logged and leave the map with the
+         * entries collected so far.
+         * @param b the raw XML document
+         * @throws IOException if no SAX parser could be configured
+         */
+        public Parser(final byte[] b) throws IOException {
+            this.map = new TreeMap<String, String>();
+            this.buffer = new StringBuilder();
+            this.parsingValue = false;
+            this.id = null;
+            final SAXParserFactory factory = SAXParserFactory.newInstance();
+            final InputStream stream = new ByteArrayInputStream(b);
+            try {
+                final SAXParser saxParser = factory.newSAXParser();
+                saxParser.parse(stream, this);
+            } catch (SAXException e) {
+                Log.logException(e);
+                Log.logWarning("OAIListFriendsLoader.Parser", "OAIListFriends was not parsed:\n" + new String(b));
+            } catch (IOException e) {
+                Log.logException(e);
+                Log.logWarning("OAIListFriendsLoader.Parser", "OAIListFriends was not parsed:\n" + new String(b));
+            } catch (ParserConfigurationException e) {
+                Log.logException(e);
+                Log.logWarning("OAIListFriendsLoader.Parser", "OAIListFriends was not parsed:\n" + new String(b));
+                // keep the original exception as cause
+                final IOException ioe = new IOException(e.getMessage());
+                ioe.initCause(e);
+                throw ioe;
+            } finally {
+                try {
+                    stream.close();
+                } catch (IOException e) {
+                    Log.logException(e);
+                }
+            }
+        }
+
+        /*
+         <?xml version="1.0" encoding="UTF-8"?>
+         <BaseURLs>
+         <baseURL id="http://roar.eprints.org/id/eprint/102">http://research.nla.gov.au/oai</baseURL>
+         <baseURL id="http://roar.eprints.org/id/eprint/174">http://oai.bibsys.no/repository</baseURL>
+         <baseURL id="http://roar.eprints.org/id/eprint/1064">http://oai.repec.openlib.org/</baseURL>
+         </BaseURLs>
+         */
+
+        @Override
+        public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException {
+            if ("baseURL".equals(tag)) {
+                this.buffer.setLength(0);
+                this.parsingValue = true;
+                // read the attribute value now: the Attributes object must not be
+                // kept beyond this call
+                this.id = atts.getValue("id");
+            }
+        }
+
+        @Override
+        public void endElement(final String uri, final String name, final String tag) {
+            if ("baseURL".equals(tag)) {
+                final String url = this.buffer.toString().trim();
+                // an element without text carries no server address
+                if (url.length() > 0) this.map.put(url, this.id);
+                this.buffer.setLength(0);
+                this.parsingValue = false;
+                this.id = null;
+            }
+        }
+
+        @Override
+        public void characters(final char ch[], final int start, final int length) {
+            if (this.parsingValue) {
+                this.buffer.append(ch, start, length);
+            }
+        }
+
+    }
+
+    /**
+     * Test method: parse both cache files, print the number of entries of each
+     * and of the merged map, followed by all keys of the merged map.
+     */
+    public static void main(String[] args) {
+        try {
+            Map<String, String> map1 = load(null, null, cache10);
+            int count1 = map1.size();
+
+            Map<String, String> map2 = load(null, null, cache20);
+            int count2 = map2.size();
+
+            map1.putAll(map2);
+            System.out.println("count1 = " + count1 + ", count2 = " + count2 + ", all = " + map1.size());
+
+            for (Map.Entry<String, String> entry: map1.entrySet()) System.out.println(entry.getKey());
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+
+    }
+
+}
--- a/source/net/yacy/document/importer/OAIPMHImporter.java
+++ b/source/net/yacy/document/importer/OAIPMHImporter.java
@ -1,72 +1,55 @@
-// OAIPMHImporter
-// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
-// first published 30.09.2009 on http://yacy.net
-//
-// This is a part of YaCy, a peer-to-peer based web search engine
-//
-// $LastChangedDate: 2009-09-23 23:26:14 +0200 (Mi, 23 Sep 2009) $
-// $LastChangedRevision: 6340 $
-// $LastChangedBy: low012 $
-//
-// LICENSE
-// 
-// This program is free software; you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation; either version 2 of the License, or
-// (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with this program; if not, write to the Free Software
-// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+/**
+ *  OAIPMHImporter
+ *  Copyright 2009 by Michael Peter Christen
+ *  First released 30.09.2009 at http://yacy.net
+ *  
+ *  This is a part of YaCy, a peer-to-peer based web search engine
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU Lesser General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program in the file COPYING.LESSER.
+ *  If not, see <http://www.gnu.org/licenses/>.
+ */

 package net.yacy.document.importer;

 import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.net.MalformedURLException;
 import java.text.ParseException;
 import java.util.Date;
 import java.util.HashMap;
-import java.util.List;
 import java.util.Map;
 import java.util.Set;
-import java.util.TreeSet;
+import java.util.concurrent.ConcurrentHashMap;

 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.util.DateFormatter;
 import net.yacy.repository.LoaderDispatcher;
-import net.yacy.document.parser.csvParser;

-import de.anomic.crawler.CrawlProfile;
 import de.anomic.search.Switchboard;

-
-// get one server with
-// http://roar.eprints.org/index.php?action=csv
-// or
-// http://www.openarchives.org/Register/BrowseSites
-// or
-// http://www.openarchives.org/Register/ListFriends
-//
 // list records from oai-pmh like
 // http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListRecords&metadataPrefix=oai_dc

-
 public class OAIPMHImporter extends Thread implements Importer, Comparable<OAIPMHImporter> {

    private static int importerCounter = Integer.MAX_VALUE;
+    private static Object N = new Object();
    
-    public static TreeSet<OAIPMHImporter> startedJobs = new TreeSet<OAIPMHImporter>();
-    public static TreeSet<OAIPMHImporter> runningJobs = new TreeSet<OAIPMHImporter>();
-    public static TreeSet<OAIPMHImporter> finishedJobs = new TreeSet<OAIPMHImporter>();
+    public static ConcurrentHashMap<OAIPMHImporter, Object> startedJobs = new ConcurrentHashMap<OAIPMHImporter, Object>();
+    public static ConcurrentHashMap<OAIPMHImporter, Object> runningJobs = new ConcurrentHashMap<OAIPMHImporter, Object>();
+    public static ConcurrentHashMap<OAIPMHImporter, Object> finishedJobs = new ConcurrentHashMap<OAIPMHImporter, Object>();
    
    private final LoaderDispatcher loader;
    private DigestURI source;
@ -95,7 +78,7 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable<OAIPM
            // this should never happen
            Log.logException(e);
        }
-        startedJobs.add(this);
+        startedJobs.put(this, N);
    }

    public int count() {
@ -131,23 +114,23 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable<OAIPM
    }
    
    public void run() {
-        while (runningJobs.size() > 10) {
-            try {Thread.sleep(1000 + 1000 * System.currentTimeMillis() % 6);} catch (InterruptedException e) {}
+        while (runningJobs.size() > 50) {
+            try {Thread.sleep(10000 + 3000 * (System.currentTimeMillis() % 6));} catch (InterruptedException e) {}
        }
        startedJobs.remove(this);
-        runningJobs.add(this);
+        runningJobs.put(this, N);
        this.message = "loading first part of records";
        while (true) {
            try {
-                OAIPMHReader reader = new OAIPMHReader(this.loader, this.source, Switchboard.getSwitchboard().surrogatesInPath, filenamePrefix);
+                OAIPMHLoader loader = new OAIPMHLoader(this.loader, this.source, Switchboard.getSwitchboard().surrogatesInPath, filenamePrefix);
                this.chunkCount++;
-                this.recordsCount += reader.getResumptionToken().getRecordCounter();
-                this.source = reader.getResumptionToken().resumptionURL(this.source);
+                this.recordsCount += loader.getResumptionToken().getRecordCounter();
+                this.source = loader.getResumptionToken().resumptionURL(this.source);
                if (this.source == null) {
                    this.message = "import terminated with source = null";
                    break;
                }
-                this.message = "loading next resumption fragment, cursor = " + reader.getResumptionToken().getCursor();
+                this.message = "loading next resumption fragment, cursor = " + loader.getResumptionToken().getCursor();
            } catch (IOException e) {
                this.message = e.getMessage();
                break;
@ -155,7 +138,7 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable<OAIPM
        }
        this.finishTime = System.currentTimeMillis();
        runningJobs.remove(this);
-        finishedJobs.add(this);
+        finishedJobs.put(this, N);
    }
    
    
@ -185,7 +168,7 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable<OAIPM
            File surrogatesIn,
            File surrogatesOut,
            long staleLimit) {
-        Set<String> plainList = getAllListedOAIServer(loader);
+        Set<String> plainList = OAIListFriendsLoader.load(loader).keySet();
        Map<String, Date> loaded = getLoadedOAIServer(surrogatesIn, surrogatesOut);
        long limit = System.currentTimeMillis() - staleLimit;
        for (Map.Entry<String, Date> a: loaded.entrySet()) {
@ -193,47 +176,6 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable<OAIPM
        }
        return plainList;
    }
-    
-    /**
-     * use the list server at http://roar.eprints.org/index.php?action=csv
-     * to produce a list of OAI-PMH sources
-     * @param loader
-     * @return the list of oai-pmh sources
-     */
-    public static Set<String> getAllListedOAIServer(LoaderDispatcher loader) {
-        TreeSet<String> list = new TreeSet<String>();
-
-        // read roar
-        File roar = new File(Switchboard.getSwitchboard().dictionariesPath, "harvesting/roar.csv");
-        DigestURI roarSource;
-        try {
-            roarSource = new DigestURI("http://roar.eprints.org/index.php?action=csv", null);
-        } catch (MalformedURLException e) {
-            Log.logException(e);
-            roarSource = null;
-        }
-        if (!roar.exists()) try {
-            // load the file from the net
-            loader.load(roarSource, CrawlProfile.CACHE_STRATEGY_NOCACHE, roar);
-        } catch (IOException e) {
-            Log.logException(e);
-        }
-        if (roar.exists()) {
-            csvParser parser = new csvParser();
-            try {
-                List<String[]> table = parser.getTable(roarSource, "", "UTF-8", new FileInputStream(roar));
-                for (String[] row: table) {
-                    if (row.length > 2 && (row[2].startsWith("http://") || row[2].startsWith("https://"))) {
-                        list.add(row[2]);
-                    }
-                }
-            } catch (FileNotFoundException e) {
-                Log.logException(e);
-            }
-        }
-        
-        return list;
-    }

    /**
     * get a map for already loaded oai-pmh servers and their latest access date
--- a/source/net/yacy/document/importer/OAIPMHLoader.java
+++ b/source/net/yacy/document/importer/OAIPMHLoader.java
@ -1,32 +1,27 @@
-// OAIPMHReader
-// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
-// first published 30.09.2009 on http://yacy.net
-//
-// This is a part of YaCy, a peer-to-peer based web search engine
-//
-// $LastChangedDate: 2009-09-23 23:26:14 +0200 (Mi, 23 Sep 2009) $
-// $LastChangedRevision: 6340 $
-// $LastChangedBy: low012 $
-//
-// LICENSE
-// 
-// This program is free software; you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation; either version 2 of the License, or
-// (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with this program; if not, write to the Free Software
-// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+/**
+ *  OAIPMHLoader
+ *  Copyright 2009 by Michael Peter Christen
+ *  First released 30.09.2009 at http://yacy.net
+ *  
+ *  This is a part of YaCy, a peer-to-peer based web search engine
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU Lesser General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program in the file COPYING.LESSER.
+ *  If not, see <http://www.gnu.org/licenses/>.
+ */

 package net.yacy.document.importer;

-import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.IOException;

@ -44,18 +39,18 @@ import de.anomic.crawler.retrieval.Response;
 // http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListRecords&metadataPrefix=oai_dc


-public class OAIPMHReader {
+public class OAIPMHLoader {

    private final DigestURI source;
    private final ResumptionToken resumptionToken;
    
-    public OAIPMHReader(LoaderDispatcher loader, DigestURI source, File targetDir, String filePrefix) throws IOException {
+    public OAIPMHLoader(LoaderDispatcher loader, DigestURI source, File targetDir, String filePrefix) throws IOException {
        this.source = source;
        
        // load the file from the net
        Response response = loader.load(source, false, true, CrawlProfile.CACHE_STRATEGY_NOCACHE);
        byte[] b = response.getContent();
-        this.resumptionToken = new ResumptionToken(new ByteArrayInputStream(b));
+        this.resumptionToken = new ResumptionToken(b);
        File f1 = new File(targetDir, OAIPMHImporter.filename4Source(source));
        File f0 = new File(targetDir, f1.getName() + ".tmp");
        
--- a/source/net/yacy/document/importer/ResumptionToken.java
+++ b/source/net/yacy/document/importer/ResumptionToken.java
@ -1,30 +1,28 @@
-// ResumptionToken
-// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
-// first published 31.10.2009 on http://yacy.net
-//
-// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
-// $LastChangedRevision: 1986 $
-// $LastChangedBy: orbiter $
-//
-// LICENSE
-// 
-// This program is free software; you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation; either version 2 of the License, or
-// (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with this program; if not, write to the Free Software
-// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-
+/**
+ *  ResumptionToken
+ *  Copyright 2009 by Michael Peter Christen
+ *  First released 31.10.2009 at http://yacy.net
+ *  
+ *  This is a part of YaCy, a peer-to-peer based web search engine
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU Lesser General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program in the file COPYING.LESSER.
+ *  If not, see <http://www.gnu.org/licenses/>.
+ */

 package net.yacy.document.importer;

+import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.text.Collator;
@ -45,7 +43,7 @@ import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.util.DateFormatter;

-public class ResumptionToken  extends TreeMap<String, String> {
+public class ResumptionToken extends TreeMap<String, String> {
    
    private static final long serialVersionUID = -8389462290545629792L;

@ -58,10 +56,10 @@ public class ResumptionToken  extends TreeMap<String, String> {
    
    int recordCounter;
    
-    public ResumptionToken(final InputStream stream) throws IOException {
+    public ResumptionToken(final byte[] b) throws IOException {
        super((Collator) insensitiveCollator.clone());
        this.recordCounter = 0;
-        new Reader(stream);
+        new Parser(b);
    }
    
    public ResumptionToken(
@ -206,7 +204,7 @@ public class ResumptionToken  extends TreeMap<String, String> {
    }
    
    // get a resumption token using a SAX xml parser from am input stream
-    private class Reader extends DefaultHandler {
+    private class Parser extends DefaultHandler {

        // class variables
        private final StringBuilder buffer;
@ -215,21 +213,24 @@ public class ResumptionToken  extends TreeMap<String, String> {
        private InputStream stream;
        private Attributes atts;

-        public Reader(final InputStream stream) throws IOException {
+        public Parser(final byte[] b) throws IOException {
            this.buffer = new StringBuilder();
            this.parsingValue = false;
-            this.stream = stream;
            this.atts = null;
            final SAXParserFactory factory = SAXParserFactory.newInstance();
+            this.stream = new ByteArrayInputStream(b);
            try {
                this.saxParser = factory.newSAXParser();
                this.saxParser.parse(this.stream, this);
            } catch (SAXException e) {
                Log.logException(e);
+                Log.logWarning("ResumptionToken", "token was not parsed:\n" + new String(b));
            } catch (IOException e) {
                Log.logException(e);
+                Log.logWarning("ResumptionToken", "token was not parsed:\n" + new String(b));
            } catch (ParserConfigurationException e) {
                Log.logException(e);
+                Log.logWarning("ResumptionToken", "token was not parsed:\n" + new String(b));
                throw new IOException(e.getMessage());
            } finally {
                try {
--- a/source/net/yacy/kelondro/util/FileUtils.java
+++ b/source/net/yacy/kelondro/util/FileUtils.java
@ -171,6 +171,8 @@ public final class FileUtils {
     * @see #copy(File source, File dest)
     */
    public static void copy(final InputStream source, final File dest, final long count) throws IOException {
+        String path = dest.getParent();
+        if (path != null && path.length() > 0) new File(path).mkdirs();
        FileOutputStream fos = null;
        try {
            fos = new FileOutputStream(dest);
--- a/source/net/yacy/kelondro/util/ScoreCluster.java
+++ b/source/net/yacy/kelondro/util/ScoreCluster.java
@ -84,7 +84,7 @@ public final class ScoreCluster<E> {
    public synchronized void shrinkToMinScore(int minScore) {
        int score;
        Long key;
-        while (true) {
+        while (keyrefDB.size() > 0) {
            // find and remove objects where their score is smaller than the demanded minimum score
            key = keyrefDB.firstKey();
            if (key == null) break;
--- a/source/net/yacy/repository/LoaderDispatcher.java
+++ b/source/net/yacy/repository/LoaderDispatcher.java
@ -31,6 +31,7 @@ import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.Writer;
+import java.net.MalformedURLException;
 import java.util.Arrays;
 import java.util.Date;
 import java.util.HashSet;
@ -454,4 +455,29 @@ public final class LoaderDispatcher {
            if (System.currentTimeMillis() - e.getValue().longValue() > minDelay) i.remove();
        }
    }
+    
+    /**
+     * Start a background thread that loads the given url into the cache file
+     * if that file does not exist yet. This method returns immediately.
+     * @param url the address of the document to load
+     * @param cache the file where the loaded content is stored
+     */
+    public void loadIfNotExistBackground(String url, File cache) {
+        final Loader backgroundJob = new Loader(url, cache);
+        backgroundJob.start();
+    }
+    
+    /**
+     * Background thread that loads a document from a url and stores it in a
+     * cache file, unless that file already exists when the thread runs.
+     * Failures are logged as warnings; nothing is written in that case.
+     */
+    private class Loader extends Thread {
+
+        private final String url;
+        private final File cache;
+
+        /**
+         * @param url the address of the document to load
+         * @param cache the file where the loaded content is stored
+         */
+        public Loader(String url, File cache) {
+            this.url = url;
+            this.cache = cache;
+        }
+
+        public void run() {
+            if (this.cache.exists()) return;
+            try {
+                // load from the net
+                Response response = load(new DigestURI(this.url), false, true, CrawlProfile.CACHE_STRATEGY_NOCACHE);
+                byte[] b = (response == null) ? null : response.getContent();
+                if (b == null) {
+                    net.yacy.kelondro.logging.Log.logWarning("LoaderDispatcher", "background load of " + this.url + " returned no content");
+                    return;
+                }
+                FileUtils.copy(b, this.cache);
+            } catch (MalformedURLException e) {
+                // do not fail silently: report why the cache file was not created
+                net.yacy.kelondro.logging.Log.logWarning("LoaderDispatcher", "background load failed, bad url " + this.url + ": " + e.getMessage());
+            } catch (IOException e) {
+                net.yacy.kelondro.logging.Log.logWarning("LoaderDispatcher", "background load of " + this.url + " into " + this.cache + " failed: " + e.getMessage());
+            }
+        }
+    }
 }