more refactoring for search

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6263 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 2009-08-25 21:27:01 +00:00
parent becb30fa12
commit d8ca6e6bf1
7 changed files with 428 additions and 326 deletions

View File: search.java

@@ -302,7 +302,7 @@ public final class search {
} else {
joincount = theSearch.getRankingResult().getLocalResourceSize();
prop.put("joincount", Integer.toString(joincount));
accu = theSearch.completeResults(3000);
accu = theSearch.snippets.completeResults(3000);
}
// generate compressed index for maxcounthash
@@ -373,8 +373,8 @@ public final class search {
theQuery.remotepeer = sb.peers.lookupByIP(natLib.getInetAddress(client), true, false, false);
theQuery.resultcount = (theSearch == null) ? 0 : theSearch.getRankingResult().getLocalResourceSize() + theSearch.getRankingResult().getRemoteResourceSize();
theQuery.searchtime = System.currentTimeMillis() - timestamp;
theQuery.urlretrievaltime = (theSearch == null) ? 0 : theSearch.getURLRetrievalTime();
theQuery.snippetcomputationtime = (theSearch == null) ? 0 : theSearch.getSnippetComputationTime();
theQuery.urlretrievaltime = (theSearch == null) ? 0 : theSearch.snippets.getURLRetrievalTime();
theQuery.snippetcomputationtime = (theSearch == null) ? 0 : theSearch.snippets.getSnippetComputationTime();
sb.remoteSearches.add(theQuery);
// update the search tracker

View File: yacysearch.java

@@ -472,8 +472,8 @@ public class yacysearch {
// prepare search statistics
theQuery.resultcount = theSearch.getRankingResult().getLocalResourceSize() + theSearch.getRankingResult().getRemoteResourceSize();
theQuery.searchtime = System.currentTimeMillis() - timestamp;
theQuery.urlretrievaltime = theSearch.getURLRetrievalTime();
theQuery.snippetcomputationtime = theSearch.getSnippetComputationTime();
theQuery.urlretrievaltime = theSearch.snippets.getURLRetrievalTime();
theQuery.snippetcomputationtime = theSearch.snippets.getSnippetComputationTime();
sb.localSearches.add(theQuery);
// check suggestions

View File: yacysearchitem.java

@@ -158,7 +158,7 @@ public class yacysearchitem {
// image search; shows thumbnails
prop.put("content", theQuery.contentdom + 1); // switch on specific content
final SnippetCache.MediaSnippet ms = theSearch.oneImage(item);
final SnippetCache.MediaSnippet ms = theSearch.snippets.oneImage(item);
if (ms == null) {
prop.put("content_items", "0");
} else {

View File: RankingProcess.java

@@ -376,6 +376,18 @@ public final class RankingProcess extends Thread {
return null;
}
public URLMetadataRow bestURL(final boolean skipDoubleDom, long timeout) {
timeout += System.currentTimeMillis();
long wait = 10;
while (System.currentTimeMillis() < timeout) {
URLMetadataRow row = bestURL(skipDoubleDom);
if (row != null) return row;
try {Thread.sleep(wait);} catch (final InterruptedException e1) {}
wait = wait * 2;
}
return null;
}
public int size() {
//assert sortedRWIEntries.size() == urlhashes.size() : "sortedRWIEntries.size() = " + sortedRWIEntries.size() + ", urlhashes.size() = " + urlhashes.size();
int c = stack.size();
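The overload added above converts a relative timeout into an absolute deadline and polls the non-blocking bestURL(skipDoubleDom) with an exponentially growing sleep (10, 20, 40, ... ms). A minimal standalone sketch of the same pattern, assuming a generic non-blocking source; the Source interface and TimedPoll class are illustrative names, not part of YaCy:

// Generic timed polling with exponential backoff, mirroring the new
// bestURL(skipDoubleDom, timeout) overload in RankingProcess.
interface Source<T> { T poll(); } // non-blocking; returns null if nothing is ready

final class TimedPoll {
    static <T> T pollWithBackoff(final Source<T> source, long timeout) {
        timeout += System.currentTimeMillis();   // duration -> absolute deadline
        long wait = 10;                          // first sleep: 10 ms
        while (System.currentTimeMillis() < timeout) {
            final T value = source.poll();       // try without blocking
            if (value != null) return value;
            try { Thread.sleep(wait); } catch (final InterruptedException e) { return null; }
            wait = wait * 2;                     // back off: 10, 20, 40, ...
        }
        return null;                             // deadline passed, give up
    }
}

The worker loop in the new SnippetFetcher relies on this overload: it calls rankedCache.bestURL(true, 10000) and simply breaks when null comes back, replacing the old sleep-and-retry loop.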

View File: SearchEvent.java

@@ -26,27 +26,19 @@
package de.anomic.search;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;
import de.anomic.crawler.ResultURLs;
import de.anomic.document.Condenser;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.text.ReferenceContainer;
import de.anomic.kelondro.text.Segment;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.text.referencePrototype.WordReference;
import de.anomic.kelondro.util.MemoryControl;
import de.anomic.kelondro.util.SetTools;
import de.anomic.kelondro.util.SortStack;
import de.anomic.kelondro.util.SortStore;
import de.anomic.search.RankingProcess.NavigatorEntry;
import de.anomic.search.SnippetCache.MediaSnippet;
import de.anomic.server.serverProfiling;
import de.anomic.yacy.yacySearch;
import de.anomic.yacy.yacySeedDB;
@@ -81,14 +73,8 @@ public final class SearchEvent {
public TreeMap<byte[], String> IAResults;
public TreeMap<byte[], Integer> IACount;
public byte[] IAmaxcounthash, IAneardhthash;
protected SnippetFetcher[] workerThreads;
protected SortStore<ResultEntry> result;
protected SortStore<SnippetCache.MediaSnippet> images; // container to sort images by size
protected HashMap<String, String> failedURLs; // a mapping from a urlhash to a fail reason string
protected TreeSet<byte[]> snippetFetchWordHashes; // a set of word hashes that are used to match with the snippets
long urlRetrievalAllTime;
long snippetComputationAllTime;
public ResultURLs crawlResults;
public SnippetFetcher snippets;
@SuppressWarnings("unchecked") SearchEvent(final QueryParams query,
final Segment indexSegment,
@@ -109,21 +95,7 @@ public final class SearchEvent {
this.IACount = new TreeMap<byte[], Integer>(Base64Order.enhancedCoder);
this.IAmaxcounthash = null;
this.IAneardhthash = null;
this.urlRetrievalAllTime = 0;
this.snippetComputationAllTime = 0;
this.workerThreads = null;
this.localSearchThread = null;
this.result = new SortStore<ResultEntry>(-1); // this is the result, enriched with snippets, ranked and ordered by ranking
this.images = new SortStore<SnippetCache.MediaSnippet>(-1);
this.failedURLs = new HashMap<String, String>(); // maps the url hash of each entry that a worker thread failed to process to the reason for the failure
// snippets do not need to match the complete set of query hashes,
// only the query minus the stopwords, which were not used for the search
final TreeSet<byte[]> filtered = SetTools.joinConstructive(query.queryHashes, Switchboard.stopwordHashes);
this.snippetFetchWordHashes = (TreeSet<byte[]>) query.queryHashes.clone();
if ((filtered != null) && (filtered.size() > 0)) {
SetTools.excludeDestructive(this.snippetFetchWordHashes, Switchboard.stopwordHashes);
}
final long start = System.currentTimeMillis();
if ((query.domType == QueryParams.SEARCHDOM_GLOBALDHT) ||
@@ -200,13 +172,8 @@ public final class SearchEvent {
}
// start worker threads to fetch urls and snippets
this.workerThreads = new SnippetFetcher[(query.onlineSnippetFetch) ? workerThreadCount : 1];
for (int i = 0; i < this.workerThreads.length; i++) {
this.workerThreads[i] = new SnippetFetcher(i, 10000, (query.onlineSnippetFetch) ? 2 : 0);
this.workerThreads[i].start();
}
serverProfiling.update("SEARCH", new ProfilingGraph.searchEvent(query.id(true), this.workerThreads.length + " online snippet fetch threads started", 0, 0), false);
this.snippets = new SnippetFetcher(rankedCache, query, indexSegment, peers);
// clean up events
SearchEventCache.cleanupEvents(false);
serverProfiling.update("SEARCH", new ProfilingGraph.searchEvent(query.id(true), "event-cleanup", 0, 0), false);
@@ -216,132 +183,6 @@ public final class SearchEvent {
lastEventID = query.id(false);
SearchEventCache.lastEvents.put(lastEventID, this);
}
ResultEntry obtainResultEntry(final URLMetadataRow page, final int snippetFetchMode) {
// producing a ResultEntry from a search result takes several steps:
// - check if the url entry exists in the LURL-db
// - check exclusions, constraints, masks, media-domains
// - load the snippet (see if the page exists) and check whether it contains a searched word
// snippet fetching can run in 3 modes:
// 0 - do not fetch snippets
// 1 - fetch snippets offline only
// 2 - online snippet fetch
// load only urls if there was not yet a root url of that hash
// find the url entry
long startTime = System.currentTimeMillis();
final URLMetadataRow.Components metadata = page.metadata();
final String pagetitle = metadata.dc_title().toLowerCase();
if (metadata.url() == null) {
registerFailure(page.hash(), "url corrupted (null)");
return null; // rare case where the url is corrupted
}
final String pageurl = metadata.url().toString().toLowerCase();
final String pageauthor = metadata.dc_creator().toLowerCase();
final long dbRetrievalTime = System.currentTimeMillis() - startTime;
// check exclusion
if ((QueryParams.matches(pagetitle, query.excludeHashes)) ||
(QueryParams.matches(pageurl, query.excludeHashes)) ||
(QueryParams.matches(pageauthor, query.excludeHashes))) {
return null;
}
// check url mask
if (!(pageurl.matches(query.urlMask))) {
return null;
}
// check constraints
if ((query.constraint != null) &&
(query.constraint.get(Condenser.flag_cat_indexof)) &&
(!(metadata.dc_title().startsWith("Index of")))) {
final Iterator<byte[]> wi = query.queryHashes.iterator();
while (wi.hasNext()) try { indexSegment.termIndex().remove(wi.next(), page.hash()); } catch (IOException e) {}
registerFailure(page.hash(), "index-of constraint not fullfilled");
return null;
}
if ((query.contentdom == QueryParams.CONTENTDOM_AUDIO) && (page.laudio() == 0)) {
registerFailure(page.hash(), "contentdom-audio constraint not fullfilled");
return null;
}
if ((query.contentdom == QueryParams.CONTENTDOM_VIDEO) && (page.lvideo() == 0)) {
registerFailure(page.hash(), "contentdom-video constraint not fullfilled");
return null;
}
if ((query.contentdom == QueryParams.CONTENTDOM_IMAGE) && (page.limage() == 0)) {
registerFailure(page.hash(), "contentdom-image constraint not fullfilled");
return null;
}
if ((query.contentdom == QueryParams.CONTENTDOM_APP) && (page.lapp() == 0)) {
registerFailure(page.hash(), "contentdom-app constraint not fullfilled");
return null;
}
if (snippetFetchMode == 0) {
return new ResultEntry(page, indexSegment, peers, null, null, dbRetrievalTime, 0); // result without snippet
}
// load snippet
if (query.contentdom == QueryParams.CONTENTDOM_TEXT) {
// attach text snippet
startTime = System.currentTimeMillis();
final SnippetCache.TextSnippet snippet = SnippetCache.retrieveTextSnippet(metadata, snippetFetchWordHashes, (snippetFetchMode == 2), ((query.constraint != null) && (query.constraint.get(Condenser.flag_cat_indexof))), 180, (snippetFetchMode == 2) ? Integer.MAX_VALUE : 30000, query.isGlobal());
final long snippetComputationTime = System.currentTimeMillis() - startTime;
Log.logInfo("SEARCH_EVENT", "text snippet load time for " + metadata.url() + ": " + snippetComputationTime + ", " + ((snippet.getErrorCode() < 11) ? "snippet found" : ("no snippet found (" + snippet.getError() + ")")));
if (snippet.getErrorCode() < 11) {
// we loaded the file and found the snippet
return new ResultEntry(page, indexSegment, peers, snippet, null, dbRetrievalTime, snippetComputationTime); // result with snippet attached
} else if (snippetFetchMode == 1) {
// we did not demand online loading, therefore a failure does not mean that the missing snippet causes a rejection of this result
// this may happen during a remote search, because snippet loading is omitted to retrieve results faster
return new ResultEntry(page, indexSegment, peers, null, null, dbRetrievalTime, snippetComputationTime); // result without snippet
} else {
// problems with snippet fetch
registerFailure(page.hash(), "no text snippet for URL " + metadata.url());
if (!peers.mySeed().isVirgin())
try {
SnippetCache.failConsequences(snippet, query.id(false));
} catch (IOException e) {
e.printStackTrace();
}
return null;
}
} else {
// attach media information
startTime = System.currentTimeMillis();
final ArrayList<MediaSnippet> mediaSnippets = SnippetCache.retrieveMediaSnippets(metadata.url(), snippetFetchWordHashes, query.contentdom, (snippetFetchMode == 2), 6000, query.isGlobal());
final long snippetComputationTime = System.currentTimeMillis() - startTime;
Log.logInfo("SEARCH_EVENT", "media snippet load time for " + metadata.url() + ": " + snippetComputationTime);
if ((mediaSnippets != null) && (mediaSnippets.size() > 0)) {
// found media snippets, return entry
return new ResultEntry(page, indexSegment, peers, null, mediaSnippets, dbRetrievalTime, snippetComputationTime);
} else if (snippetFetchMode == 1) {
return new ResultEntry(page, indexSegment, peers, null, null, dbRetrievalTime, snippetComputationTime);
} else {
// problems with snippet fetch
registerFailure(page.hash(), "no media snippet for URL " + metadata.url());
return null;
}
}
// finished, no more actions possible here
}
boolean anyWorkerAlive() {
if (this.workerThreads == null) return false;
for (int i = 0; i < this.workerThreads.length; i++) {
if ((this.workerThreads[i] != null) &&
(this.workerThreads[i].isAlive()) &&
(this.workerThreads[i].busytime() < 3000)) return true;
}
return false;
}
boolean anyRemoteSearchAlive() {
// check primary search threads
@@ -385,86 +226,7 @@ public final class SearchEvent {
public RankingProcess getRankingResult() {
return this.rankedCache;
}
public long getURLRetrievalTime() {
return this.urlRetrievalAllTime;
}
public long getSnippetComputationTime() {
return this.snippetComputationAllTime;
}
protected class SnippetFetcher extends Thread {
private final long timeout; // the deadline until which this thread should keep working
private long lastLifeSign; // the last time the run() loop was executed
private final int id;
private int snippetMode;
public SnippetFetcher(final int id, final long maxlifetime, int snippetMode) {
this.id = id;
this.snippetMode = snippetMode;
this.lastLifeSign = System.currentTimeMillis();
this.timeout = System.currentTimeMillis() + Math.max(1000, maxlifetime);
}
public void run() {
// start fetching urls and snippets
URLMetadataRow page;
final int fetchAhead = snippetMode == 0 ? 0 : 10;
boolean nav_topics = query.navigators.equals("all") || query.navigators.indexOf("topics") >= 0;
try {
while (System.currentTimeMillis() < this.timeout) {
this.lastLifeSign = System.currentTimeMillis();
// check if we have enough
if ((query.contentdom == QueryParams.CONTENTDOM_IMAGE) && (images.size() >= query.neededResults() + fetchAhead)) break;
if ((query.contentdom != QueryParams.CONTENTDOM_IMAGE) && (result.size() >= query.neededResults() + fetchAhead)) break;
// get next entry
page = rankedCache.bestURL(true);
if (page == null) {
if (!anyRemoteSearchAlive()) break; // we cannot expect more results
// if we did not get another entry, sleep some time and try again
try {Thread.sleep(10);} catch (final InterruptedException e1) {}
continue;
}
if (result.exists(page.hash().hashCode())) continue;
if (failedURLs.get(page.hash()) != null) continue;
// try secondary search
prepareSecondarySearch(); // will be executed only once
final ResultEntry resultEntry = obtainResultEntry(page, snippetMode);
if (resultEntry == null) continue; // the entry had some problems, cannot be used
urlRetrievalAllTime += resultEntry.dbRetrievalTime;
snippetComputationAllTime += resultEntry.snippetComputationTime;
//System.out.println("+++DEBUG-resultWorker+++ fetched " + resultEntry.urlstring());
// place the result to the result vector
if (!result.exists(resultEntry)) {
result.push(resultEntry, Long.valueOf(rankedCache.getOrder().cardinal(resultEntry.word())));
if (nav_topics) rankedCache.addTopics(resultEntry);
}
//System.out.println("DEBUG SNIPPET_LOADING: thread " + id + " got " + resultEntry.url());
}
} catch (final Exception e) {
e.printStackTrace();
}
Log.logInfo("SEARCH", "resultWorker thread " + id + " terminated");
}
public long busytime() {
return System.currentTimeMillis() - this.lastLifeSign;
}
}
private void registerFailure(final String urlhash, final String reason) {
this.failedURLs.put(urlhash, reason);
Log.logInfo("search", "sorted out hash " + urlhash + " during search: " + reason);
}
public ArrayList<NavigatorEntry> getHostNavigator(int maxentries) {
return this.rankedCache.getHostNavigator(maxentries);
}
@@ -480,13 +242,6 @@ public final class SearchEvent {
}
public ResultEntry oneResult(final int item) {
// check if we already retrieved this item (happens if a search
// pages is accessed a second time)
serverProfiling.update("SEARCH", new ProfilingGraph.searchEvent(query.id(true), "obtain one result entry - start", 0, 0), false);
if (this.result.sizeStore() > item) {
// we have the wanted result already in the result array .. return that
return this.result.element(item).element;
}
if ((query.domType == QueryParams.SEARCHDOM_GLOBALDHT) ||
(query.domType == QueryParams.SEARCHDOM_CLUSTERALL)) {
// this is a search using remote search threads. Also the local
@@ -500,67 +255,12 @@ public final class SearchEvent {
// want to display results
while (this.primarySearchThreads != null &&
this.primarySearchThreads.length > item &&
anyWorkerAlive() &&
(result.size() <= item || countFinishedRemoteSearch() <= item)) {
this.snippets.anyWorkerAlive() &&
(this.snippets.resultCount() <= item || countFinishedRemoteSearch() <= item)) {
try {Thread.sleep(item * 50L);} catch (final InterruptedException e) {}
}
}
// finally wait until enough results have been produced by the
// snippet fetch process
while ((anyWorkerAlive()) && (result.size() <= item)) {
try {Thread.sleep(item * 50L);} catch (final InterruptedException e) {}
}
// finally, if there is something, return the result
if (this.result.size() <= item) return null;
return this.result.element(item).element;
}
private int resultCounter = 0;
public ResultEntry nextResult() {
final ResultEntry re = oneResult(resultCounter);
resultCounter++;
return re;
}
public SnippetCache.MediaSnippet oneImage(final int item) {
// check if we already retrieved this item (happens if a search page is accessed a second time)
if (this.images.sizeStore() > item) {
// we have the wanted result already in the result array .. return that
return this.images.element(item).element;
}
// feed some results from the result stack into the image stack
final int count = Math.min(5, Math.max(1, 10 * this.result.size() / (item + 1)));
for (int i = 0; i < count; i++) {
// generate result object
final ResultEntry result = nextResult();
SnippetCache.MediaSnippet ms;
if (result != null) {
// iterate over all images in the result
final ArrayList<SnippetCache.MediaSnippet> imagemedia = result.mediaSnippets();
if (imagemedia != null) {
for (int j = 0; j < imagemedia.size(); j++) {
ms = imagemedia.get(j);
images.push(ms, Long.valueOf(ms.ranking));
}
}
}
}
// now take the specific item from the image stack
if (this.images.size() <= item) return null;
return this.images.element(item).element;
}
public ArrayList<SortStack<ResultEntry>.stackElement> completeResults(final long waitingtime) {
final long timeout = System.currentTimeMillis() + waitingtime;
while ((result.size() < query.neededResults()) && (anyWorkerAlive()) && (System.currentTimeMillis() < timeout)) {
try {Thread.sleep(100);} catch (final InterruptedException e) {}
//System.out.println("+++DEBUG-completeResults+++ sleeping " + 200);
}
return this.result.list(this.result.size());
return this.snippets.oneResult(item);
}
boolean secondarySearchStartet = false;
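The net effect in this file: SearchEvent sheds the worker threads, the result and image stores, the failure map and the snippet word hashes, keeping only query and ranking state; everything snippet-related is reached through the new public snippets field. A compressed sketch of the resulting shape, with names taken from the diff and bodies elided to the delegation calls:

public final class SearchEvent {
    public SnippetFetcher snippets;      // owns workers, results, images, failed URLs
    private RankingProcess rankedCache;  // ordered search results (unchanged)

    public RankingProcess getRankingResult() {
        return this.rankedCache;                 // ranking is still answered here
    }

    public ResultEntry oneResult(final int item) {
        // the remote-search waiting logic stays in SearchEvent (omitted), then:
        return this.snippets.oneResult(item);    // fetching and caching delegated
    }
}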

View File: SearchEventCache.java

@@ -34,7 +34,6 @@ import java.util.concurrent.ConcurrentHashMap;
import de.anomic.crawler.ResultURLs;
import de.anomic.kelondro.text.Segment;
import de.anomic.search.SearchEvent.SnippetFetcher;
import de.anomic.yacy.yacySeedDB;
import de.anomic.yacy.logging.Log;
@@ -51,7 +50,7 @@ public class SearchEventCache {
cleanEvent = i.next();
if ((all) || (cleanEvent.eventTime + eventLifetime < System.currentTimeMillis())) {
// execute deletion of failed words
int rw = cleanEvent.failedURLs.size();
int rw = cleanEvent.snippets.failedURLs.size();
if (rw > 0) {
final TreeSet<byte[]> removeWords = cleanEvent.query.queryHashes;
removeWords.addAll(cleanEvent.query.excludeHashes);
@@ -59,7 +58,7 @@ public class SearchEventCache {
final Iterator<byte[]> j = removeWords.iterator();
// remove the same url hashes for multiple words
while (j.hasNext()) {
cleanEvent.indexSegment.termIndex().remove(j.next(), cleanEvent.failedURLs.keySet());
cleanEvent.indexSegment.termIndex().remove(j.next(), cleanEvent.snippets.failedURLs.keySet());
}
} catch (IOException e) {
e.printStackTrace();
@@ -106,21 +105,15 @@ public class SearchEventCache {
event = new SearchEvent(query, indexSegment, peers, crawlResults, preselectedPeerHashes, generateAbstracts);
} else {
// if worker threads had been alive, but did not succeed, start them again to fetch missing links
if ((!event.anyWorkerAlive()) &&
(((query.contentdom == QueryParams.CONTENTDOM_IMAGE) && (event.images.size() + 30 < query.neededResults())) ||
(event.result.size() < query.neededResults() + 10)) &&
if ((!event.snippets.anyWorkerAlive()) &&
(((query.contentdom == QueryParams.CONTENTDOM_IMAGE) && (event.snippets.images.size() + 30 < query.neededResults())) ||
(event.snippets.result.size() < query.neededResults() + 10)) &&
//(event.query.onlineSnippetFetch) &&
(event.getRankingResult().getLocalResourceSize() + event.getRankingResult().getRemoteResourceSize() > event.result.size())) {
(event.getRankingResult().getLocalResourceSize() + event.getRankingResult().getRemoteResourceSize() > event.snippets.result.size())) {
// set new timeout
event.eventTime = System.currentTimeMillis();
// start worker threads to fetch urls and snippets
event.workerThreads = new SnippetFetcher[SearchEvent.workerThreadCount];
SnippetFetcher worker;
for (int i = 0; i < event.workerThreads.length; i++) {
worker = event.new SnippetFetcher(i, 6000, (query.onlineSnippetFetch) ? 2 : 0);
worker.start();
event.workerThreads[i] = worker;
}
event.snippets.restartWorker();
}
}
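Reduced to its logic, the restart guard above fires only when no worker is alive, the result (or image) store is still short of what the query needs, and the ranking process holds more resources than have been consumed so far. A paraphrase as a boolean helper; the method shouldRestart is illustrative, the calls on event and query are the ones from the diff:

// Illustrative condensation of the worker-restart condition in SearchEventCache.
boolean shouldRestart(final SearchEvent event, final QueryParams query) {
    final boolean imageSearch = query.contentdom == QueryParams.CONTENTDOM_IMAGE;
    final boolean tooFewResults = imageSearch
            ? event.snippets.images.size() + 30 < query.neededResults()
            : event.snippets.result.size() < query.neededResults() + 10;
    final boolean moreAvailable =
            event.getRankingResult().getLocalResourceSize()
          + event.getRankingResult().getRemoteResourceSize()
          > event.snippets.result.size();
    return !event.snippets.anyWorkerAlive() && tooFewResults && moreAvailable;
}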

View File: SnippetFetcher.java (new file)

@@ -0,0 +1,397 @@
// SnippetFetcher.java
// (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 10.10.2005 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.search;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.TreeSet;
import de.anomic.document.Condenser;
import de.anomic.kelondro.text.Segment;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.util.SetTools;
import de.anomic.kelondro.util.SortStack;
import de.anomic.kelondro.util.SortStore;
import de.anomic.search.RankingProcess.NavigatorEntry;
import de.anomic.search.SnippetCache.MediaSnippet;
import de.anomic.server.serverProfiling;
import de.anomic.yacy.yacySeedDB;
import de.anomic.yacy.logging.Log;
import de.anomic.ymage.ProfilingGraph;
public class SnippetFetcher {
protected final static int workerThreadCount = 10;
// input values
private final RankingProcess rankedCache; // ordered search results, grows dynamically as all the query threads enrich this container
private final QueryParams query;
private final Segment indexSegment;
private final yacySeedDB peers;
// result values
protected Worker[] workerThreads;
protected final SortStore<ResultEntry> result;
protected final SortStore<SnippetCache.MediaSnippet> images; // container to sort images by size
protected final HashMap<String, String> failedURLs; // a mapping from a urlhash to a fail reason string
protected final TreeSet<byte[]> snippetFetchWordHashes; // a set of word hashes that are used to match with the snippets
long urlRetrievalAllTime;
long snippetComputationAllTime;
@SuppressWarnings("unchecked")
SnippetFetcher(
RankingProcess rankedCache,
final QueryParams query,
final Segment indexSegment,
final yacySeedDB peers) {
this.rankedCache = rankedCache;
this.query = query;
this.indexSegment = indexSegment;
this.peers = peers;
this.urlRetrievalAllTime = 0;
this.snippetComputationAllTime = 0;
this.result = new SortStore<ResultEntry>(-1); // this is the result, enriched with snippets, ranked and ordered by ranking
this.images = new SortStore<SnippetCache.MediaSnippet>(-1);
this.failedURLs = new HashMap<String, String>(); // maps the url hash of each entry that a worker thread failed to process to the reason for the failure
// snippets do not need to match the complete set of query hashes,
// only the query minus the stopwords, which were not used for the search
final TreeSet<byte[]> filtered = SetTools.joinConstructive(query.queryHashes, Switchboard.stopwordHashes);
this.snippetFetchWordHashes = (TreeSet<byte[]>) query.queryHashes.clone();
if ((filtered != null) && (filtered.size() > 0)) {
SetTools.excludeDestructive(this.snippetFetchWordHashes, Switchboard.stopwordHashes);
}
// start worker threads to fetch urls and snippets
this.workerThreads = new Worker[(query.onlineSnippetFetch) ? workerThreadCount : 1];
for (int i = 0; i < this.workerThreads.length; i++) {
this.workerThreads[i] = new Worker(i, 10000, (query.onlineSnippetFetch) ? 2 : 0);
this.workerThreads[i].start();
}
serverProfiling.update("SEARCH", new ProfilingGraph.searchEvent(query.id(true), this.workerThreads.length + " online snippet fetch threads started", 0, 0), false);
}
public void restartWorker() {
if (anyWorkerAlive()) return;
this.workerThreads = new Worker[SearchEvent.workerThreadCount];
Worker worker;
for (int i = 0; i < workerThreads.length; i++) {
worker = new Worker(i, 6000, (query.onlineSnippetFetch) ? 2 : 0);
worker.start();
workerThreads[i] = worker;
}
}
ResultEntry obtainResultEntry(final URLMetadataRow page, final int snippetFetchMode) {
// producing a ResultEntry from a search result takes several steps:
// - check if the url entry exists in the LURL-db
// - check exclusions, constraints, masks, media-domains
// - load the snippet (see if the page exists) and check whether it contains a searched word
// snippet fetching can run in 3 modes:
// 0 - do not fetch snippets
// 1 - fetch snippets offline only
// 2 - online snippet fetch
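// An illustrative naming of these three modes (a suggestion, not part of this commit):
// static final int SNIPPET_MODE_NONE = 0; // skip snippet fetching entirely
// static final int SNIPPET_MODE_OFFLINE = 1; // use only locally cached pages
// static final int SNIPPET_MODE_ONLINE = 2; // may load missing pages from the web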
// load only urls if there was not yet a root url of that hash
// find the url entry
long startTime = System.currentTimeMillis();
final URLMetadataRow.Components metadata = page.metadata();
final String pagetitle = metadata.dc_title().toLowerCase();
if (metadata.url() == null) {
registerFailure(page.hash(), "url corrupted (null)");
return null; // rare case where the url is corrupted
}
final String pageurl = metadata.url().toString().toLowerCase();
final String pageauthor = metadata.dc_creator().toLowerCase();
final long dbRetrievalTime = System.currentTimeMillis() - startTime;
// check exclusion
if ((QueryParams.matches(pagetitle, query.excludeHashes)) ||
(QueryParams.matches(pageurl, query.excludeHashes)) ||
(QueryParams.matches(pageauthor, query.excludeHashes))) {
return null;
}
// check url mask
if (!(pageurl.matches(query.urlMask))) {
return null;
}
// check constraints
if ((query.constraint != null) &&
(query.constraint.get(Condenser.flag_cat_indexof)) &&
(!(metadata.dc_title().startsWith("Index of")))) {
final Iterator<byte[]> wi = query.queryHashes.iterator();
while (wi.hasNext()) try { indexSegment.termIndex().remove(wi.next(), page.hash()); } catch (IOException e) {}
registerFailure(page.hash(), "index-of constraint not fullfilled");
return null;
}
if ((query.contentdom == QueryParams.CONTENTDOM_AUDIO) && (page.laudio() == 0)) {
registerFailure(page.hash(), "contentdom-audio constraint not fullfilled");
return null;
}
if ((query.contentdom == QueryParams.CONTENTDOM_VIDEO) && (page.lvideo() == 0)) {
registerFailure(page.hash(), "contentdom-video constraint not fullfilled");
return null;
}
if ((query.contentdom == QueryParams.CONTENTDOM_IMAGE) && (page.limage() == 0)) {
registerFailure(page.hash(), "contentdom-image constraint not fullfilled");
return null;
}
if ((query.contentdom == QueryParams.CONTENTDOM_APP) && (page.lapp() == 0)) {
registerFailure(page.hash(), "contentdom-app constraint not fullfilled");
return null;
}
if (snippetFetchMode == 0) {
return new ResultEntry(page, indexSegment, peers, null, null, dbRetrievalTime, 0); // result without snippet
}
// load snippet
if (query.contentdom == QueryParams.CONTENTDOM_TEXT) {
// attach text snippet
startTime = System.currentTimeMillis();
final SnippetCache.TextSnippet snippet = SnippetCache.retrieveTextSnippet(metadata, snippetFetchWordHashes, (snippetFetchMode == 2), ((query.constraint != null) && (query.constraint.get(Condenser.flag_cat_indexof))), 180, (snippetFetchMode == 2) ? Integer.MAX_VALUE : 30000, query.isGlobal());
final long snippetComputationTime = System.currentTimeMillis() - startTime;
Log.logInfo("SEARCH_EVENT", "text snippet load time for " + metadata.url() + ": " + snippetComputationTime + ", " + ((snippet.getErrorCode() < 11) ? "snippet found" : ("no snippet found (" + snippet.getError() + ")")));
if (snippet.getErrorCode() < 11) {
// we loaded the file and found the snippet
return new ResultEntry(page, indexSegment, peers, snippet, null, dbRetrievalTime, snippetComputationTime); // result with snippet attached
} else if (snippetFetchMode == 1) {
// we did not demand online loading, therefore a failure does not mean that the missing snippet causes a rejection of this result
// this may happen during a remote search, because snippet loading is omitted to retrieve results faster
return new ResultEntry(page, indexSegment, peers, null, null, dbRetrievalTime, snippetComputationTime); // result without snippet
} else {
// problems with snippet fetch
registerFailure(page.hash(), "no text snippet for URL " + metadata.url());
if (!peers.mySeed().isVirgin())
try {
SnippetCache.failConsequences(snippet, query.id(false));
} catch (IOException e) {
e.printStackTrace();
}
return null;
}
} else {
// attach media information
startTime = System.currentTimeMillis();
final ArrayList<MediaSnippet> mediaSnippets = SnippetCache.retrieveMediaSnippets(metadata.url(), snippetFetchWordHashes, query.contentdom, (snippetFetchMode == 2), 6000, query.isGlobal());
final long snippetComputationTime = System.currentTimeMillis() - startTime;
Log.logInfo("SEARCH_EVENT", "media snippet load time for " + metadata.url() + ": " + snippetComputationTime);
if ((mediaSnippets != null) && (mediaSnippets.size() > 0)) {
// found media snippets, return entry
return new ResultEntry(page, indexSegment, peers, null, mediaSnippets, dbRetrievalTime, snippetComputationTime);
} else if (snippetFetchMode == 1) {
return new ResultEntry(page, indexSegment, peers, null, null, dbRetrievalTime, snippetComputationTime);
} else {
// problems with snippet fetch
registerFailure(page.hash(), "no media snippet for URL " + metadata.url());
return null;
}
}
// finished, no more actions possible here
}
boolean anyWorkerAlive() {
if (this.workerThreads == null) return false;
for (int i = 0; i < this.workerThreads.length; i++) {
if ((this.workerThreads[i] != null) &&
(this.workerThreads[i].isAlive()) &&
(this.workerThreads[i].busytime() < 3000)) return true;
}
return false;
}
public long getURLRetrievalTime() {
return this.urlRetrievalAllTime;
}
public long getSnippetComputationTime() {
return this.snippetComputationAllTime;
}
protected class Worker extends Thread {
private final long timeout; // the deadline until which this thread should keep working
private long lastLifeSign; // the last time the run() loop was executed
private final int id;
private int snippetMode;
public Worker(final int id, final long maxlifetime, int snippetMode) {
this.id = id;
this.snippetMode = snippetMode;
this.lastLifeSign = System.currentTimeMillis();
this.timeout = System.currentTimeMillis() + Math.max(1000, maxlifetime);
}
public void run() {
// start fetching urls and snippets
URLMetadataRow page;
final int fetchAhead = snippetMode == 0 ? 0 : 10;
boolean nav_topics = query.navigators.equals("all") || query.navigators.indexOf("topics") >= 0;
try {
while (System.currentTimeMillis() < this.timeout) {
this.lastLifeSign = System.currentTimeMillis();
// check if we have enough
if ((query.contentdom == QueryParams.CONTENTDOM_IMAGE) && (images.size() >= query.neededResults() + fetchAhead)) break;
if ((query.contentdom != QueryParams.CONTENTDOM_IMAGE) && (result.size() >= query.neededResults() + fetchAhead)) break;
// get next entry
page = rankedCache.bestURL(true, 10000);
if (page == null) break;
if (result.exists(page.hash().hashCode())) continue;
if (failedURLs.get(page.hash()) != null) continue;
final ResultEntry resultEntry = obtainResultEntry(page, snippetMode);
if (resultEntry == null) continue; // the entry had some problems, cannot be used
urlRetrievalAllTime += resultEntry.dbRetrievalTime;
snippetComputationAllTime += resultEntry.snippetComputationTime;
//System.out.println("+++DEBUG-resultWorker+++ fetched " + resultEntry.urlstring());
// place the result to the result vector
if (!result.exists(resultEntry)) {
result.push(resultEntry, Long.valueOf(rankedCache.getOrder().cardinal(resultEntry.word())));
if (nav_topics) rankedCache.addTopics(resultEntry);
}
//System.out.println("DEBUG SNIPPET_LOADING: thread " + id + " got " + resultEntry.url());
}
} catch (final Exception e) {
e.printStackTrace();
}
Log.logInfo("SEARCH", "resultWorker thread " + id + " terminated");
}
public long busytime() {
return System.currentTimeMillis() - this.lastLifeSign;
}
}
private void registerFailure(final String urlhash, final String reason) {
this.failedURLs.put(urlhash, reason);
Log.logInfo("search", "sorted out hash " + urlhash + " during search: " + reason);
}
public ArrayList<NavigatorEntry> getHostNavigator(int maxentries) {
return this.rankedCache.getHostNavigator(maxentries);
}
public ArrayList<NavigatorEntry> getTopicNavigator(final int maxentries) {
// returns a set of words that are computed as toplist
return this.rankedCache.getTopicNavigator(maxentries);
}
public ArrayList<NavigatorEntry> getAuthorNavigator(final int maxentries) {
// returns a list of authors so far seen on result set
return this.rankedCache.getAuthorNavigator(maxentries);
}
public int resultCount() {
return this.result.size();
}
public ResultEntry oneResult(final int item) {
// check if we already retrieved this item
// (happens if a search page is accessed a second time)
serverProfiling.update("SEARCH", new ProfilingGraph.searchEvent(query.id(true), "obtain one result entry - start", 0, 0), false);
if (this.result.sizeStore() > item) {
// we have the wanted result already in the result array .. return that
return this.result.element(item).element;
}
// finally wait until enough results have been produced by the
// snippet fetch process
while ((anyWorkerAlive()) && (result.size() <= item)) {
try {Thread.sleep(item * 50L);} catch (final InterruptedException e) {}
}
// finally, if there is something, return the result
if (this.result.size() <= item) return null;
return this.result.element(item).element;
}
private int resultCounter = 0;
public ResultEntry nextResult() {
final ResultEntry re = oneResult(resultCounter);
resultCounter++;
return re;
}
public SnippetCache.MediaSnippet oneImage(final int item) {
// check if we already retrieved this item (happens if a search page is accessed a second time)
if (this.images.sizeStore() > item) {
// we have the wanted result already in the result array .. return that
return this.images.element(item).element;
}
// feed some results from the result stack into the image stack
final int count = Math.min(5, Math.max(1, 10 * this.result.size() / (item + 1)));
for (int i = 0; i < count; i++) {
// generate result object
final ResultEntry result = nextResult();
SnippetCache.MediaSnippet ms;
if (result != null) {
// iterate over all images in the result
final ArrayList<SnippetCache.MediaSnippet> imagemedia = result.mediaSnippets();
if (imagemedia != null) {
for (int j = 0; j < imagemedia.size(); j++) {
ms = imagemedia.get(j);
images.push(ms, Long.valueOf(ms.ranking));
}
}
}
}
// now take the specific item from the image stack
if (this.images.size() <= item) return null;
return this.images.element(item).element;
}
public ArrayList<SortStack<ResultEntry>.stackElement> completeResults(final long waitingtime) {
final long timeout = System.currentTimeMillis() + waitingtime;
while ((result.size() < query.neededResults()) && (anyWorkerAlive()) && (System.currentTimeMillis() < timeout)) {
try {Thread.sleep(100);} catch (final InterruptedException e) {}
//System.out.println("+++DEBUG-completeResults+++ sleeping " + 200);
}
return this.result.list(this.result.size());
}
}
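Taken together, the call sites in this commit drive the new class roughly as follows. A hypothetical caller condensed from search.java, yacysearchitem.java and SearchEventCache.java; rankedCache, query, indexSegment and peers stand for the values that SearchEvent already holds:

// construction, as done in the SearchEvent constructor
SnippetFetcher snippets = new SnippetFetcher(rankedCache, query, indexSegment, peers);

// search.java: wait up to 3 seconds for snippet-enriched results
ArrayList<SortStack<ResultEntry>.stackElement> accu = snippets.completeResults(3000);

// yacysearchitem.java: pull a single result or image by position
ResultEntry entry = snippets.oneResult(0);
SnippetCache.MediaSnippet image = snippets.oneImage(0);

// SearchEventCache.java: if all workers died but results are still missing,
// give the event a fresh set of worker threads
if (!snippets.anyWorkerAlive() && snippets.resultCount() < query.neededResults()) {
    snippets.restartWorker();
}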