yacy_search_server/source/de/anomic/plasma/plasmaSnippetCache.java

// plasmaSnippetCache.java
// -----------------------
// part of YaCy
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2005
// last major change: 07.06.2005
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.


package de.anomic.plasma;

import java.util.*;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.server.serverFileUtils;
import de.anomic.server.logging.serverLog;
import de.anomic.http.httpHeader;
import de.anomic.yacy.yacySearch;

/**
 * Produces short text snippets for search results and keeps them in a small
 * in-memory cache.
 *
 * A snippet is one sentence of a document that contains at least one of the
 * queried word hashes. Documents are read from the HTCache directory of the
 * given {@link plasmaHTCache}; optionally a missing document is loaded from
 * the web first.
 *
 * All accesses to the internal snippet cache are synchronized on this instance.
 */
public class plasmaSnippetCache {

    /** Upper bound for the number of snippets held in the in-memory cache. */
    private static final int maxCache = 500;

    /** Sentences with more characters than this are never used as a snippet. */
    private static final int maxSnippetLength = 140;

    // insertion counter; used as the score of a cache key so that eviction
    // can ask the score cluster for the entry with the minimum score
    private int                   snippetsScoreCounter;
    private kelondroMScoreCluster snippetsScore;
    // maps (urlhash + wordhashes) -> snippet
    private HashMap               snippetsCache;
    private final plasmaHTCache   cacheManager;
    private final plasmaParser    parser;
    private final serverLog       log;
    private final String          remoteProxyHost;
    private final int             remoteProxyPort;
    private final boolean         remoteProxyUse;

    /**
     * Creates an empty snippet cache.
     *
     * @param cacheManager    the HTCache that holds cached resources and their response headers
     * @param parser          parser used to turn a cached resource into a document
     * @param remoteProxyHost passed through to the crawl worker when a resource is loaded online
     * @param remoteProxyPort passed through to the crawl worker when a resource is loaded online
     * @param remoteProxyUse  passed through to the crawl worker when a resource is loaded online
     * @param log             logger handed to the crawl worker
     */
    public plasmaSnippetCache(plasmaHTCache cacheManager, plasmaParser parser,
                              String remoteProxyHost, int remoteProxyPort, boolean remoteProxyUse,
                              serverLog log) {
        this.cacheManager = cacheManager;
        this.parser = parser;
        this.log = log;
        this.remoteProxyHost = remoteProxyHost;
        this.remoteProxyPort = remoteProxyPort;
        this.remoteProxyUse = remoteProxyUse;
        this.snippetsScoreCounter = 0;
        this.snippetsScore = new kelondroMScoreCluster();
        this.snippetsCache = new HashMap();
    }

    /**
     * Remembers a snippet for the combination of a url and a set of word hashes.
     * Nothing happens if a snippet for that combination is already cached.
     * When the cache holds more than {@link #maxCache} entries, entries are
     * removed in the order reported by the score cluster's minimum
     * (the scores are insertion counters).
     *
     * @param wordhashes the string form of the queried word hashes
     * @param urlhash    the hash of the url the snippet belongs to
     * @param snippet    the snippet text
     */
    public synchronized void store(String wordhashes, String urlhash, String snippet) {
        // generate key
        String key = urlhash + wordhashes;

        // do nothing if snippet is known
        if (snippetsCache.containsKey(key)) return;

        // care for counter: start from scratch before the counter can overflow.
        // This is done BEFORE the insertion so that the new snippet is not
        // thrown away together with the old cache content.
        if (snippetsScoreCounter == java.lang.Integer.MAX_VALUE) {
            snippetsScoreCounter = 0;
            snippetsScore = new kelondroMScoreCluster();
            snippetsCache = new HashMap();
        }

        // learn new snippet
        snippetsScore.addScore(key, snippetsScoreCounter++);
        snippetsCache.put(key, snippet);

        // flush cache if cache is full
        while (snippetsCache.size() > maxCache) {
            String evictKey = (String) snippetsScore.getMinObject();
            // no candidate reported: stop instead of looping forever
            if (evictKey == null) break;
            snippetsScore.deleteScore(evictKey);
            snippetsCache.remove(evictKey);
        }
    }

    /**
     * Looks up a snippet in the in-memory cache only.
     * Synchronized because {@link #store} modifies and replaces the map.
     *
     * @return the cached snippet or null if there is none
     */
    private synchronized String retrieve(String wordhashes, String urlhash) {
        // generate key
        String key = urlhash + wordhashes;
        return (String) snippetsCache.get(key);
    }

    /**
     * Returns a snippet for the given url and query.
     * The in-memory cache is asked first; on a miss the document is obtained
     * with {@link #getDocument} and the best matching sentence of at most
     * {@link #maxSnippetLength} characters is selected, cached and returned.
     *
     * @param url            the url of the document
     * @param fetchOnline    if true, a document missing in the HTCache is loaded from the web
     * @param query          the query; word hashes if queryAreHashes is true,
     *                       otherwise it is passed through plasmaSearch.words2hashes first
     * @param queryAreHashes tells whether query already contains hashes
     * @return the snippet, or null if the query is null or empty, the document
     *         is not available or not parseable, or no suitable sentence
     *         contains one of the query hashes
     */
    public String retrieve(java.net.URL url, boolean fetchOnline, Set query, boolean queryAreHashes) {
        if ((query == null) || (query.size() == 0)) return null;
        if (!(queryAreHashes)) query = plasmaSearch.words2hashes(query);
        String urlhash = plasmaURL.urlHash(url);

        // try to get snippet from snippetCache
        String wordhashes = yacySearch.set2string(query);
        String snippet = retrieve(wordhashes, urlhash);
        if (snippet != null) return snippet;

        // if the snippet is not in the cache, we can try to get it from the htcache
        plasmaParserDocument document = getDocument(url, fetchOnline);
        if (document == null) return null;
        String[] sentences = document.getSentences();
        if ((sentences == null) || (sentences.length == 0)) return null;

        // we have found a parseable non-empty file: use the lines
        snippet = selectSnippet(sentences, query);
        if (snippet == null) return null;

        // finally store this snippet in our own cache
        store(wordhashes, urlhash, snippet);
        return snippet;
    }

    /**
     * Picks the sentence with the highest hit score. Every query hash that is
     * contained in a sentence adds the length of that sentence to its score.
     * Sentences longer than {@link #maxSnippetLength} are left out of the
     * scoring; otherwise a single over-long sentence would win the scoring
     * and hide shorter sentences that are usable as a snippet.
     *
     * @return the selected sentence or null if no eligible sentence contains a query hash
     */
    private String selectSnippet(String[] sentences, Set queryHashes) {
        TreeMap sentencematrix = hashMatrix(sentences);
        kelondroMScoreCluster hitTable = new kelondroMScoreCluster();
        Iterator hashes = queryHashes.iterator();
        while (hashes.hasNext()) {
            String hash = (String) hashes.next();
            Iterator rows = sentencematrix.entrySet().iterator();
            while (rows.hasNext()) {
                Map.Entry row = (Map.Entry) rows.next();
                Integer sentencenumber = (Integer) row.getKey();
                int sentenceLength = sentences[sentencenumber.intValue()].length();
                if (sentenceLength > maxSnippetLength) continue; // too long for a snippet
                if (((HashSet) row.getValue()).contains(hash)) hitTable.addScore(sentencenumber, sentenceLength);
            }
        }
        Integer maxLine = (Integer) hitTable.getMaxObject();
        if (maxLine == null) return null;
        return sentences[maxLine.intValue()];
    }

    /**
     * Builds a map from the sentence index (Integer) to the set of word hashes
     * (HashSet) of the words that the tokenizer finds in that sentence.
     */
    private TreeMap hashMatrix(String[] sentences) {
        TreeMap map = new TreeMap();
        for (int i = 0; i < sentences.length; i++) {
            HashSet set = new HashSet();
            Enumeration words = plasmaCondenser.wordTokenizer(sentences[i]);
            while (words.hasMoreElements()) {
                set.add(plasmaWordIndexEntry.word2hash((String) words.nextElement()));
            }
            map.put(new Integer(i), set);
        }
        return map;
    }

    /**
     * Returns the content of the url from the HTCache. If it is not there and
     * fetchOnline is true, the url is loaded from the web (5000 is passed as
     * socket timeout) and the cache is read again.
     *
     * @return the resource content or null if it is not available or loading failed
     */
    private byte[] getResource(URL url, boolean fetchOnline) {
        try {
            byte[] resource = getResourceFromCache(url);
            if ((fetchOnline) && (resource == null)) {
                loadResourceFromWeb(url, 5000);
                resource = getResourceFromCache(url);
            }
            return resource;
        } catch (IOException e) {
            // best effort: a resource that cannot be loaded simply yields no snippet
            return null;
        }
    }

    /**
     * Reads the cached file for the url from below cacheManager.cachePath.
     *
     * @return the file content or null if the file does not exist or cannot be read
     */
    private byte[] getResourceFromCache(URL url) {
        // The relative file path is the normalized url without its first six characters.
        // NOTE(review): presumably this strips the protocol prefix so that the
        // remainder maps to host/path below the cache directory - confirm against urlNormalform
        String path = htmlFilterContentScraper.urlNormalform(url).substring(6);
        File f = new File(cacheManager.cachePath, path);
        if (!f.exists()) return null;
        try {
            return serverFileUtils.read(f);
        } catch (IOException e) {
            // unreadable cache file is treated like a missing one
            return null;
        }
    }

    /**
     * Hands the url to plasmaCrawlWorker.load together with the proxy settings,
     * the cache manager and the logger of this instance.
     * NOTE(review): the meaning of the null/0 arguments is defined by
     * plasmaCrawlWorker.load and not visible here.
     *
     * @throws IOException if the crawl worker reports a failure
     */
    private void loadResourceFromWeb(URL url, int socketTimeout) throws IOException {
        plasmaCrawlWorker.load(
            url,
            null,
            null,
            0,
            null,
            socketTimeout,
            remoteProxyHost,
            remoteProxyPort,
            remoteProxyUse,
            cacheManager,
            log);
    }

    /**
     * Obtains the resource for the url (see {@link #getResource}) and parses it
     * according to the mime type of the cached response header.
     *
     * @param url         the url of the document
     * @param fetchOnline if true, a resource missing in the HTCache is loaded from the web
     * @return the parsed document, or null if the resource or its cached
     *         response header is not available or the mime type is not supported
     */
    public plasmaParserDocument getDocument(URL url, boolean fetchOnline) {
        byte[] resource = getResource(url, fetchOnline);
        if (resource == null) return null;

        httpHeader header;
        try {
            header = cacheManager.getCachedResponse(plasmaURL.urlHash(url));
        } catch (IOException e) {
            // without the response header the mime type is unknown
            return null;
        }
        if (header == null) return null;

        if (!plasmaParser.supportedMimeTypesContains(header.mime())) return null;
        return parser.parseSource(url, header.mime(), resource);
    }
}
added snippet-routines (not yet finished) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@218 6c8d7289-2bf4-0310-a012-ef5d649a1542 2005-06-08 02:52:24 +02:00			`// plasmaSnippetCache.java`
			`// -----------------------`
			`// part of YaCy`
			`// (C) by Michael Peter Christen; mc@anomic.de`
			`// first published on http://www.anomic.de`
			`// Frankfurt, Germany, 2005`
			`// last major change: 07.06.2005`
			`//`
			`// This program is free software; you can redistribute it and/or modify`
			`// it under the terms of the GNU General Public License as published by`
			`// the Free Software Foundation; either version 2 of the License, or`
			`// (at your option) any later version.`
			`//`
			`// This program is distributed in the hope that it will be useful,`
			`// but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`// GNU General Public License for more details.`
			`//`
			`// You should have received a copy of the GNU General Public License`
			`// along with this program; if not, write to the Free Software`
			`// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA`
			`//`
			`// Using this software in any meaning (reading, learning, copying, compiling,`
			`// running) means that you agree that the Author(s) is (are) not responsible`
			`// for cost, loss of data or any harm that may be caused directly or indirectly`
			`// by usage of this softare or this documentation. The usage of this software`
			`// is on your own risk. The installation and usage (starting/running) of this`
			`// software may allow other people or application to access your computer and`
			`// any attached devices and is highly dependent on the configuration of the`
			`// software which must be done by the user of the software; the author(s) is`
			`// (are) also not responsible for proper configuration and usage of the`
			`// software, even if provoked by documentation provided together with`
			`// the software.`
			`//`
			`// Any changes to this file according to the GPL as documented in the file`
			`// gpl.txt aside this file in the shipment you received can be done to the`
			`// lines that follows this copyright notice here, but changes must not be`
			`// done inside the copyright notive above. A re-distribution must contain`
			`// the intact and unchanged copyright notice.`
			`// Contributions and changes to the program code must be marked as such.`


			`package de.anomic.plasma;`

			`import java.util.*;`
			`import java.io.File;`
			`import java.io.IOException;`
			`import java.net.URL;`
			`import de.anomic.htmlFilter.htmlFilterContentScraper;`
			`import de.anomic.kelondro.kelondroMScoreCluster;`
			`import de.anomic.server.serverFileUtils;`
*) changing references to logger git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@248 6c8d7289-2bf4-0310-a012-ef5d649a1542 2005-06-09 12:34:20 +02:00			`import de.anomic.server.logging.serverLog;`
added snippet-routines (not yet finished) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@218 6c8d7289-2bf4-0310-a012-ef5d649a1542 2005-06-08 02:52:24 +02:00			`import de.anomic.http.httpHeader;`
			`import de.anomic.yacy.yacySearch;`

			`public class plasmaSnippetCache {`

			`private static final int maxCache = 500;`

			`private int snippetsScoreCounter;`
			`private kelondroMScoreCluster snippetsScore;`
			`private HashMap snippetsCache;`
			`private plasmaHTCache cacheManager;`
			`private plasmaParser parser;`
			`private serverLog log;`
			`private String remoteProxyHost;`
			`private int remoteProxyPort;`
			`private boolean remoteProxyUse;`

			`public plasmaSnippetCache(plasmaHTCache cacheManager, plasmaParser parser,`
			`String remoteProxyHost, int remoteProxyPort, boolean remoteProxyUse,`
			`serverLog log) {`
			`this.cacheManager = cacheManager;`
			`this.parser = parser;`
			`this.log = log;`
			`this.remoteProxyHost = remoteProxyHost;`
			`this.remoteProxyPort = remoteProxyPort;`
			`this.remoteProxyUse = remoteProxyUse;`
			`this.snippetsScoreCounter = 0;`
			`this.snippetsScore = new kelondroMScoreCluster();`
			`this.snippetsCache = new HashMap();`
			`}`


			`public synchronized void store(String wordhashes, String urlhash, String snippet) {`
			`// generate key`
			`String key = urlhash + wordhashes;`

			`// do nothing if snippet is known`
			`if (snippetsCache.containsKey(key)) return;`

			`// learn new snippet`
			`snippetsScore.addScore(key, snippetsScoreCounter++);`
			`snippetsCache.put(key, snippet);`

			`// care for counter`
			`if (snippetsScoreCounter == java.lang.Integer.MAX_VALUE) {`
			`snippetsScoreCounter = 0;`
			`snippetsScore = new kelondroMScoreCluster();`
			`snippetsCache = new HashMap();`
			`}`

			`// flush cache if cache is full`
			`while (snippetsCache.size() > maxCache) {`
			`key = (String) snippetsScore.getMinObject();`
			`snippetsScore.deleteScore(key);`
			`snippetsCache.remove(key);`
			`}`
			`}`

			`private String retrieve(String wordhashes, String urlhash) {`
			`// generate key`
			`String key = urlhash + wordhashes;`
			`return (String) snippetsCache.get(key);`
			`}`

			`public String retrieve(java.net.URL url, boolean fetchOnline, Set query, boolean queryAreHashes) {`
			`if (query.size() == 0) return null;`
			`if (!(queryAreHashes)) query = plasmaSearch.words2hashes(query);`
			`String urlhash = plasmaURL.urlHash(url);`

			`// try to get snippet from snippetCache`
			`String wordhashes = yacySearch.set2string(query);`
			`String snippet = retrieve(wordhashes, urlhash);`
			`if (snippet != null) return snippet;`

			`// if the snippet is not in the cache, we can try to get it from the htcache`
			`plasmaParserDocument document = getDocument(url, fetchOnline);`
			`if (document == null) return null;`
			`String[] sentences = document.getSentences();`
			`//System.out.println("----" + url.toString()); for (int l = 0; l < sentences.length; l++) System.out.println(sentences[l]);`
			`if ((sentences == null) \|\| (sentences.length == 0)) return null;`

			`// we have found a parseable non-empty file: use the lines`
			`TreeMap sentencematrix = hashMatrix(sentences);`
			`Iterator i = query.iterator();`
			`String hash;`
			`kelondroMScoreCluster hitTable = new kelondroMScoreCluster();`
			`Iterator j;`
			`Integer sentencenumber;`
			`Map.Entry entry;`
			`while (i.hasNext()) {`
			`hash = (String) i.next();`
			`j = sentencematrix.entrySet().iterator();`
			`while (j.hasNext()) {`
			`entry = (Map.Entry) j.next();`
			`sentencenumber = (Integer) entry.getKey();`
			`if (((HashSet) entry.getValue()).contains(hash)) hitTable.addScore(sentencenumber, sentences[sentencenumber.intValue()].length());`
			`}`
			`}`
			`Integer maxLine = (Integer) hitTable.getMaxObject();`
			`if (maxLine == null) return null;`
			`snippet = sentences[maxLine.intValue()];`
			`if (snippet.length() > 140) return null;`

			`// finally store this snippet in our own cache`
			`store(wordhashes, urlhash, snippet);`
			`return snippet;`
			`}`

			`private TreeMap hashMatrix(String[] sentences) {`
			`TreeMap map = new TreeMap();`
			`HashSet set;`
			`Enumeration words;`
			`for (int i = 0; i < sentences.length; i++) {`
			`set = new HashSet();`
			`words = plasmaCondenser.wordTokenizer(sentences[i]);`
			`while (words.hasMoreElements()) set.add(plasmaWordIndexEntry.word2hash((String) words.nextElement()));`
			`map.put(new Integer(i), set);`
			`}`
			`return map;`
			`}`

			`private byte[] getResource(URL url, boolean fetchOnline) {`
			`// load the url as resource from the web`
			`try {`
			`//return httpc.singleGET(url, 5000, null, null, remoteProxyHost, remoteProxyPort);`
			`byte[] resource = getResourceFromCache(url);`
			`if ((fetchOnline) && (resource == null)) {`
			`loadResourceFromWeb(url, 5000);`
			`resource = getResourceFromCache(url);`
			`}`
			`return resource;`
			`} catch (IOException e) {`
			`return null;`
			`}`
			`}`

			`private byte[] getResourceFromCache(URL url) {`
			`// load the url as resource from the cache`
			`String path = htmlFilterContentScraper.urlNormalform(url).substring(6);`
			`File cache = cacheManager.cachePath;`
			`File f = new File(cache, path);`
			`if (f.exists()) try {`
			`return serverFileUtils.read(f);`
			`} catch (IOException e) {`
			`return null;`
			`} else {`
			`return null;`
			`}`
			`}`

			`private void loadResourceFromWeb(URL url, int socketTimeout) throws IOException {`
			`plasmaCrawlWorker.load(`
			`url,`
			`null,`
			`null,`
			`0,`
			`null,`
			`socketTimeout,`
			`remoteProxyHost,`
			`remoteProxyPort,`
			`remoteProxyUse,`
			`cacheManager,`
			`log);`
			`}`

			`public plasmaParserDocument getDocument(URL url, boolean fetchOnline) {`
			`byte[] resource = getResource(url, fetchOnline);`
			`if (resource == null) return null;`
			`httpHeader header = null;`
			`try {`
			`header = cacheManager.getCachedResponse(plasmaURL.urlHash(url));`
			`} catch (IOException e) {`
			`return null;`
			`}`
			`if (header == null) return null;`
			`if (plasmaParser.supportedMimeTypesContains(header.mime())) {`
			`return parser.parseSource(url, header.mime(), resource);`
			`} else {`
			`return null;`
			`}`
			`}`
			`}`