// plasmaCrawlLURL.java
// -----------------------
// part of YaCy
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://yacy.net
// Frankfurt, Germany, 2004
//
// $LastChangedDate: 2008-03-16 23:31:54 +0100 (So, 16 Mrz 2008) $
// $LastChangedRevision: 4575 $
// $LastChangedBy: orbiter $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

/*
   This class provides storage functions for the plasma search engine.
   - the url-specific properties, including condenser results
   - the text content of the url
   Both entities are accessed with a hash, which is based on the MD5
   algorithm. The MD5 is not encoded as a hex value, but a b64 value.
*/

package de.anomic.crawler;

import java.net.MalformedURLException;
import java.util.Date;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

import de.anomic.kelondro.order.Bitfield;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.util.ScoreCluster;
import de.anomic.kelondro.util.Log;
import de.anomic.yacy.yacySeedDB;
import de.anomic.yacy.yacyURL;

/**
 * Keeps six in-memory result stacks (ids 1 to 6) and, for each of them, a
 * domain score statistic.
 *
 * <p>Every stack entry is one string built as
 * {@code urlHash + initiatorHash + executorHash}; the individual hashes are
 * recovered by slicing the entry in pieces of
 * {@link yacySeedDB#commonHashLength} characters.</p>
 *
 * <p>All methods that read or modify a stack are {@code synchronized} on this
 * instance. {@link #domains(int)} and {@link #domainCount(int, String)} are not.</p>
 */
public final class ResultURLs {

    // result stacks;
    // these have all entries of form
    // strings: urlHash + initiatorHash + ExecutorHash
    private final LinkedList<String> externResultStack; // 1 - remote index: retrieved by other peer
    private final LinkedList<String> searchResultStack; // 2 - partly remote/local index: result of search queries
    private final LinkedList<String> transfResultStack; // 3 - partly remote/local index: result of index transfer
    private final LinkedList<String> proxyResultStack;  // 4 - local index: result of proxy fetch/prefetch
    private final LinkedList<String> lcrawlResultStack; // 5 - local index: result of local crawling
    private final LinkedList<String> gcrawlResultStack; // 6 - local index: triggered external

    // NOTE(review): ScoreCluster is declared outside this file and is used here
    // without type arguments; if it is a generic type it should be parameterized
    // with String - confirm against its declaration.
    private final ScoreCluster externResultDomains;
    private final ScoreCluster searchResultDomains;
    private final ScoreCluster transfResultDomains;
    private final ScoreCluster proxyResultDomains;
    private final ScoreCluster lcrawlResultDomains;
    private final ScoreCluster gcrawlResultDomains;

    /**
     * Creates the six empty result stacks and their empty domain statistics.
     */
    public ResultURLs() {
        // init result stacks
        externResultStack = new LinkedList<String>();
        searchResultStack = new LinkedList<String>();
        transfResultStack = new LinkedList<String>();
        proxyResultStack = new LinkedList<String>();
        lcrawlResultStack = new LinkedList<String>();
        gcrawlResultStack = new LinkedList<String>();
        // init result domain statistics
        externResultDomains = new ScoreCluster();
        searchResultDomains = new ScoreCluster();
        transfResultDomains = new ScoreCluster();
        proxyResultDomains = new ScoreCluster();
        lcrawlResultDomains = new ScoreCluster();
        gcrawlResultDomains = new ScoreCluster();
    }

    /**
     * Appends an entry for the given url to a result stack and increases the
     * score of the url's host in the matching domain statistic.
     *
     * <p>Nothing happens if {@code e} is null. If the stack id is unknown,
     * neither the stack nor the statistic is touched. Exceptions raised while
     * adding are logged and swallowed; if adding to the stack fails, the
     * domain statistic is not updated.</p>
     *
     * @param e             metadata of the url to record, may be null
     * @param initiatorHash hash of the initiator, must not be null
     * @param executorHash  hash of the executor, must not be null
     * @param stackType     id of the result stack (1 to 6)
     */
    public synchronized void stack(final URLMetadataRow e, final String initiatorHash, final String executorHash, final int stackType) {
        assert initiatorHash != null;
        assert executorHash != null;
        if (e == null) {
            return;
        }
        try {
            final List<String> resultStack = getStack(stackType);
            if (resultStack != null) {
                resultStack.add(e.hash() + initiatorHash + executorHash);
            }
        } catch (final Exception ex) {
            Log.logSevere("ResultURLs", "INTERNAL ERROR in newEntry/2: " + ex.toString());
            return;
        }
        try {
            final ScoreCluster domains = getDomains(stackType);
            if (domains != null) {
                domains.incScore(e.metadata().url().getHost());
            }
        } catch (final Exception ex) {
            Log.logSevere("ResultURLs", "INTERNAL ERROR in newEntry/3: " + ex.toString());
            return;
        }
    }

    /**
     * @param stack id of the result stack
     * @return number of entries in the stack, 0 if the stack id is unknown
     */
    public synchronized int getStackSize(final int stack) {
        final List<String> resultStack = getStack(stack);
        if (resultStack == null) return 0;
        return resultStack.size();
    }

    /**
     * @param stack id of the result stack
     * @return size of the domain statistic of the stack, 0 if the stack id is unknown
     */
    public synchronized int getDomainListSize(final int stack) {
        final ScoreCluster domains = getDomains(stack);
        if (domains == null) return 0;
        return domains.size();
    }

    /**
     * @return the url hash (hash no. 0) of the entry at pos in stack, see {@link #getHashNo(int, int, int)}
     */
    public synchronized String getUrlHash(final int stack, final int pos) {
        return getHashNo(stack, pos, 0);
    }

    /**
     * @return the initiator hash (hash no. 1) of the entry at pos in stack, see {@link #getHashNo(int, int, int)}
     */
    public synchronized String getInitiatorHash(final int stack, final int pos) {
        return getHashNo(stack, pos, 1);
    }

    /**
     * @return the executor hash (hash no. 2) of the entry at pos in stack, see {@link #getHashNo(int, int, int)}
     */
    public synchronized String getExecutorHash(final int stack, final int pos) {
        return getHashNo(stack, pos, 2);
    }

    /**
     * gets the hash at index in element at pos in stack (based on {@link yacySeedDB#commonHashLength})
     *
     * <p>simplified example with {@link yacySeedDB#commonHashLength} = 3:</p>
     * <pre>
     * String[][] stacks[1][0] = "123456789";
     * System.out.println(getHashNo(1, 0, 0));
     * System.out.println(getHashNo(1, 0, 1));
     * System.out.println(getHashNo(1, 0, 2));
     * </pre>
     * <p>Output:</p>
     * <pre>
     * 123
     * 456
     * 789
     * </pre>
     *
     * <p>An entry shorter than three hashes is logged as an error; the part of
     * the requested hash that is present is still returned.</p>
     *
     * @param stack id of the result stack
     * @param pos   position of the entry in the stack
     * @param index number of the hash inside the entry, starting at 0
     * @return the requested hash (possibly truncated if the entry is too short),
     *         or null if the stack or the entry does not exist or the entry
     *         holds no characters at the requested index
     */
    public synchronized String getHashNo(final int stack, final int pos, final int index) {
        final String result = getResultStackAt(stack, pos);
        if (result == null) {
            if (isValidStack(stack)) {
                Log.logSevere("ResultURLs", "unexpected error: result of stack is null: "+ stack +","+ pos);
            }
            return null;
        }

        final int hashLength = yacySeedDB.commonHashLength;
        if (result.length() < hashLength * 3) {
            Log.logSevere("ResultURLs", "unexpected error: result of stack is too short: "+ result.length());
        }

        // slice at the requested index; never read past the end of the entry
        final int start = hashLength * index;
        if (start >= result.length()) {
            return null;
        }
        // return what is there
        return result.substring(start, Math.min(start + hashLength, result.length()));
    }

    /**
     * gets the element at pos in stack
     *
     * @param stack id of the result stack
     * @param pos   position of the entry, must not be negative
     * @return null if either stack or element do not exist
     */
    private String getResultStackAt(final int stack, final int pos) {
        assert pos >= 0 : "precondition violated: " + pos + " >= 0";

        final List<String> resultStack = getStack(stack);
        if (resultStack == null) {
            return null;
        }
        assert pos < resultStack.size() : "pos = " + pos + ", resultStack.size() = " + resultStack.size();
        if (pos >= resultStack.size()) {
            Log.logSevere("ResultURLs", "unexpected error: Index out of Bounds "+ pos +" of "+ resultStack.size());
            return null;
        }
        return resultStack.get(pos);
    }

    /**
     * iterate all domains in the result domain statistic
     *
     * @param stack id of the result stack, must be a known id
     * @return iterator of domains in reverse order (downwards)
     */
    public Iterator domains(final int stack) {
        assert getDomains(stack) != null : "getDomains(" + stack + ") = null";
        return getDomains(stack).scores(false);
    }

    /**
     * Removes every entry of the stack whose url hash, from character offset 6
     * onward, equals {@code hosthash}, and deletes the score of {@code host}
     * from the domain statistic of the stack.
     *
     * @param stack    id of the result stack, must be a known id
     * @param host     host name whose score is deleted, must not be null
     * @param hosthash host hash to match against the url hashes, 6 characters long
     * @return the value returned by {@code deleteScore(host)} of the domain statistic
     */
    public synchronized int deleteDomain(final int stack, String host, String hosthash) {
        assert host != null : "host = null";
        assert hosthash.length() == 6;

        final List<String> resultStack = getStack(stack);
        if (resultStack != null) {
            // one pass with an iterator: removing by index would re-walk the linked list each time
            final Iterator<String> entries = resultStack.iterator();
            while (entries.hasNext()) {
                final String entry = entries.next();
                // the url hash is the first commonHashLength characters of the entry
                final int urlHashEnd = Math.min(entry.length(), yacySeedDB.commonHashLength);
                if (urlHashEnd > 6 && entry.substring(6, urlHashEnd).equals(hosthash)) {
                    entries.remove();
                }
            }
        }

        assert getDomains(stack) != null : "getDomains(" + stack + ") = null";
        return getDomains(stack).deleteScore(host);
    }

    /**
     * return the count of the domain
     *
     * @param stack  type
     * @param domain name
     * @return the number of occurrences of the domain in the stack statistics
     */
    public int domainCount(final int stack, String domain) {
        assert domain != null : "domain = null";
        assert getDomains(stack) != null : "getDomains(" + stack + ") = null";
        return getDomains(stack).getScore(domain);
    }

    /**
     * returns the stack identified by the id stack
     *
     * @param stack id of resultStack
     * @return null if stack does not exist (id is unknown)
     */
    private List<String> getStack(final int stack) {
        switch (stack) {
            case 1: return externResultStack;
            case 2: return searchResultStack;
            case 3: return transfResultStack;
            case 4: return proxyResultStack;
            case 5: return lcrawlResultStack;
            case 6: return gcrawlResultStack;
            default:
                return null;
        }
    }

    /**
     * returns the domain statistic that belongs to the stack with the given id
     *
     * @param stack id of resultStack
     * @return null if the id is unknown
     */
    private ScoreCluster getDomains(final int stack) {
        switch (stack) {
            case 1: return externResultDomains;
            case 2: return searchResultDomains;
            case 3: return transfResultDomains;
            case 4: return proxyResultDomains;
            case 5: return lcrawlResultDomains;
            case 6: return gcrawlResultDomains;
            default:
                return null;
        }
    }

    /**
     * tests if a stack with id stack exists
     *
     * @param stack id of resultStack
     * @return true if the id is known
     */
    private boolean isValidStack(final int stack) {
        return getStack(stack) != null;
    }

    /**
     * Removes the entry at pos from the stack.
     *
     * @param stack id of the result stack
     * @param pos   position of the entry to remove
     * @return true if an entry was removed; false if the stack id is unknown
     *         or pos is outside the stack
     */
    public synchronized boolean removeStack(final int stack, final int pos) {
        final List<String> resultStack = getStack(stack);
        if (resultStack == null) {
            return false;
        }
        // report a bad position as "nothing removed" instead of throwing
        if (pos < 0 || pos >= resultStack.size()) {
            return false;
        }
        return resultStack.remove(pos) != null;
    }

    /**
     * Removes all entries from the stack and shrinks its domain statistic.
     *
     * @param stack id of the result stack; unknown ids are ignored
     */
    public synchronized void clearStack(final int stack) {
        final List<String> resultStack = getStack(stack);
        if (resultStack != null) resultStack.clear();
        final ScoreCluster resultDomains = getDomains(stack);
        if (resultDomains != null) {
            // we do not clear this completely, just remove most of the less important entries
            resultDomains.shrinkToMaxSize(100);
            resultDomains.shrinkToMinScore(2);
        }
    }

    /**
     * Searches the stacks 1 to 6, each from its last entry to its first, and
     * removes the first entry found whose url hash equals {@code urlHash}.
     *
     * <p>NOTE(review): the method returns true even when no entry matched, and
     * it stops after the first match, so further entries with the same url hash
     * stay in place. This is kept as found - confirm the contract callers expect.</p>
     *
     * @param urlHash url hash to remove
     * @return false if urlHash is null, true otherwise
     */
    public synchronized boolean remove(final String urlHash) {
        if (urlHash == null) return false;
        String hash;
        for (int stack = 1; stack <= 6; stack++) {
            for (int i = getStackSize(stack) - 1; i >= 0; i--) {
                hash = getUrlHash(stack, i);
                if (hash != null && hash.equals(urlHash)) {
                    removeStack(stack, i);
                    return true;
                }
            }
        }
        return true;
    }

    /**
     * test and benchmark
     *
     * @param args not used
     */
    public static void main(final String[] args) {
        final ResultURLs results = new ResultURLs();
        try {
            final yacyURL url = new yacyURL("http", "www.yacy.net", 80, "/");
            final URLMetadataRow urlRef = new URLMetadataRow(url, "YaCy Homepage", "", "", "", new Date(), new Date(), new Date(), "", new byte[] {}, 123, 42, '?', new Bitfield(), "de", 0, 0, 0, 0, 0, 0);
            int stackNo = 1;
            System.out.println("valid test:\n=======");
            // add
            results.stack(urlRef, urlRef.hash(), url.hash(), stackNo);
            // size
            System.out.println("size of stack:\t"+ results.getStackSize(stackNo));
            // get
            System.out.println("url hash:\t"+ results.getUrlHash(stackNo, 0));
            System.out.println("executor hash:\t"+ results.getExecutorHash(stackNo, 0));
            System.out.println("initiator hash:\t"+ results.getInitiatorHash(stackNo, 0));
            // test errors
            System.out.println("invalid test:\n=======");
            // get
            System.out.println("url hash:\t"+ results.getUrlHash(stackNo, 1));
            System.out.println("executor hash:\t"+ results.getExecutorHash(stackNo, 1));
            System.out.println("initiator hash:\t"+ results.getInitiatorHash(stackNo, 1));
            stackNo = 42;
            System.out.println("size of stack:\t"+ results.getStackSize(stackNo));
            // get
            System.out.println("url hash:\t"+ results.getUrlHash(stackNo, 0));
            System.out.println("executor hash:\t"+ results.getExecutorHash(stackNo, 0));
            System.out.println("initiator hash:\t"+ results.getInitiatorHash(stackNo, 0));

            // benchmark
            final long start = System.currentTimeMillis();
            for (int i = 0; i < 1000000; i++) {
                // valid stack ids are 1 to 6
                stackNo = 1 + (i % 6);
                // add
                results.stack(urlRef, urlRef.hash(), url.hash(), stackNo);
                // size
                results.getStackSize(stackNo);
                // get
                for (int j = 0; j < 10; j++) {
                    results.getUrlHash(stackNo, i / 6);
                    results.getExecutorHash(stackNo, i / 6);
                    results.getInitiatorHash(stackNo, i / 6);
                }
            }
            System.out.println("benschmark: "+ (System.currentTimeMillis() - start) + " ms");
        } catch (final MalformedURLException e) {
            e.printStackTrace();
        }
    }

}