From cd103709925b27045f0b6f122a9a528707f77797 Mon Sep 17 00:00:00 2001 From: orbiter Date: Sun, 14 Aug 2005 00:57:30 +0000 Subject: [PATCH] several bugfixes and dht selection / logging improvement git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@531 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- build.properties | 2 +- htroot/yacy/query.java | 38 ++++++++----------- htroot/yacy/transferRWI.java | 7 +++- source/de/anomic/plasma/plasmaHTCache.java | 2 +- .../de/anomic/plasma/plasmaSwitchboard.java | 9 +---- .../anomic/plasma/plasmaWordIndexCache.java | 6 +-- .../plasma/plasmaWordIndexDistribution.java | 22 +++++++++-- source/de/anomic/yacy/yacyClient.java | 2 +- source/de/anomic/yacy/yacyDHTAction.java | 34 +++++++++++++++++ 9 files changed, 80 insertions(+), 42 deletions(-) diff --git a/build.properties b/build.properties index 62e0b5517..71d11ced3 100644 --- a/build.properties +++ b/build.properties @@ -3,7 +3,7 @@ javacSource=1.4 javacTarget=1.4 # Release Configuration -releaseVersion=0.395 +releaseVersion=0.396 releaseFile=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz #releaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz releaseDir=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr} diff --git a/htroot/yacy/query.java b/htroot/yacy/query.java index a1820044a..456dd1bcb 100644 --- a/htroot/yacy/query.java +++ b/htroot/yacy/query.java @@ -44,6 +44,7 @@ // if the shell's current path is HTROOT import java.util.Hashtable; +import java.io.IOException; import de.anomic.http.httpHeader; import de.anomic.plasma.plasmaSwitchboard; @@ -79,40 +80,31 @@ public class query { // requests about environment - if (obj.equals("wordcount")) { - // the total number of different words in the rwi is returned - prop.put("response", "0"); // dummy response - return prop; + if (obj.equals("rwiurlcount")) { + // the total number of different urls in the rwi is returned + // shall contain a word hash, the number of assigned lurls to this hash is returned + try { + de.anomic.plasma.plasmaWordIndexEntity entity = switchboard.wordIndex.getEntity(env, true); + prop.put("response", entity.size()); + entity.close(); + } catch (IOException e) { + prop.put("response", -1); + } + return prop; } if (obj.equals("rwicount")) { - // return the number of available word indexes - // shall contain a word hash, the number of assigned lurls to this hash is returned - prop.put("response", "0"); // dummy response + // return the total number of available word indexes + prop.put("response", switchboard.wordIndex.size()); return prop; } if (obj.equals("lurlcount")) { // return the number of all available l-url's - Hashtable result = switchboard.action("urlcount", null); - //System.out.println("URLCOUNT result = " + ((result == null) ? "NULL" : result.toString())); - prop.put("response", ((result == null) ? "-1" : (String) result.get("urls"))); + prop.put("response", switchboard.urlPool.loadedURL.size()); return prop; } - if (obj.equals("purlcount")) { - // return number of stacked prefetch urls - prop.put("response", "0"); // dummy response - return prop; - } - - if (obj.equals("seedcount")) { - // return number of stacked prefetch urls - prop.put("response", "0"); // dummy response - return prop; - } - - // requests about requirements if (obj.equals("wantedlurls")) { diff --git a/htroot/yacy/transferRWI.java b/htroot/yacy/transferRWI.java index af70a30cf..9ddf62a62 100644 --- a/htroot/yacy/transferRWI.java +++ b/htroot/yacy/transferRWI.java @@ -55,6 +55,7 @@ import de.anomic.plasma.plasmaWordIndexEntryContainer; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.yacy.yacyCore; +import de.anomic.yacy.yacyDHTAction; public class transferRWI { @@ -124,8 +125,10 @@ public class transferRWI { if (unknownURLs.length() > 0) unknownURLs = unknownURLs.substring(1); if (wordhashes.length == 0) switchboard.getLog().logInfo("Received 0 Words from peer " + iam + ", requested " + unknownURL.size() + " URLs"); - else - switchboard.getLog().logInfo("Received " + received + " Words [" + wordhashes[0] + " .. " + wordhashes[wordhashes.length - 1] + "] from peer " + iam + ", requested " + unknownURL.size() + " URLs"); + else { + double avdist = (yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, wordhashes[0]) + yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, wordhashes[wordhashes.length - 1])) / 2.0; + switchboard.getLog().logInfo("Received " + received + " Words [" + wordhashes[0] + " .. " + wordhashes[wordhashes.length - 1] + "]/" + avdist + " from peer " + iam + ", requested " + unknownURL.size() + " URLs"); + } result = "ok"; } else { result = "error_not_granted"; diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java index 8c7bfd9ff..0c2dee486 100644 --- a/source/de/anomic/plasma/plasmaHTCache.java +++ b/source/de/anomic/plasma/plasmaHTCache.java @@ -416,7 +416,7 @@ public final class plasmaHTCache { return ((ls.indexOf(".cgi") >= 0) || (ls.indexOf(".exe") >= 0) || (ls.indexOf(";jsessionid=") >= 0) || - (ls.indexOf("SESSIONID/") >= 0)); + (ls.indexOf("sessionid/") >= 0)); } public Entry newEntry(Date initDate, int depth, URL url, String name, diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 47eb2c69c..f984f85f2 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -1515,15 +1515,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser public serverObjects action(String actionName, serverObjects actionInput) { - // perform an action. + // perform an action. (not used) - if (actionName.equals("urlcount")) { - serverObjects result = new serverObjects(); - result.put("urls", Integer.toString(urlPool.loadedURL.size())); - return result; - } - - // not a correct query return null; } diff --git a/source/de/anomic/plasma/plasmaWordIndexCache.java b/source/de/anomic/plasma/plasmaWordIndexCache.java index 1e763a2cf..bef6b7e23 100644 --- a/source/de/anomic/plasma/plasmaWordIndexCache.java +++ b/source/de/anomic/plasma/plasmaWordIndexCache.java @@ -517,11 +517,11 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { // check cache space if (cache.size() > 0) try { - // pause until space is in the cache - while (cache.size() >= this.maxWords) Thread.sleep(1000); + // pause to get space in the cache (while it is flushed) + if (cache.size() + 1000 >= this.maxWords) Thread.sleep(java.lang.Math.min(1000, cache.size() - this.maxWords + 1000)); // slow down if we reach cache limit - long pausetime = java.lang.Math.min(10, 3 * cache.size() / (maxWords + 1)); + long pausetime = java.lang.Math.min(10, 2 * cache.size() / (maxWords + 1)); //System.out.println("Pausetime=" + pausetime); Thread.sleep(pausetime); } catch (InterruptedException e) {} diff --git a/source/de/anomic/plasma/plasmaWordIndexDistribution.java b/source/de/anomic/plasma/plasmaWordIndexDistribution.java index ebeb8940c..73cde0595 100644 --- a/source/de/anomic/plasma/plasmaWordIndexDistribution.java +++ b/source/de/anomic/plasma/plasmaWordIndexDistribution.java @@ -13,6 +13,7 @@ import de.anomic.yacy.yacyCore; import de.anomic.yacy.yacySeed; import de.anomic.yacy.yacySeedDB; import de.anomic.yacy.yacyClient; +import de.anomic.yacy.yacyDHTAction; import de.anomic.server.serverCodings; import de.anomic.server.logging.serverLog; import de.anomic.kelondro.kelondroException; @@ -133,8 +134,8 @@ public class plasmaWordIndexDistribution { if ((yacyCore.seedDB == null) || (yacyCore.seedDB.sizeConnected() == 0)) return -1; // collect index - String startPointHash = yacyCore.seedDB.mySeed.hash; - //String startPointHash = serverCodings.encodeMD5B64("" + System.currentTimeMillis(), true).substring(0, yacySeedDB.commonHashLength); + String startPointHash = selectTransferStart(); + log.logDebug("Selected hash " + startPointHash + " as start point for index distribution, distance = " + yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, startPointHash)); Object[] selectResult = selectTransferIndexes(startPointHash, indexCount); plasmaWordIndexEntity[] indexEntities = (plasmaWordIndexEntity[]) selectResult[0]; HashMap urlCache = (HashMap) selectResult[1]; // String (url-hash) / plasmaCrawlLURL.Entry @@ -157,6 +158,7 @@ public class plasmaWordIndexDistribution { Enumeration e = yacyCore.dhtAgent.getAcceptRemoteIndexSeeds(keyhash); String error; String peerNames = ""; + double avdist; while ((e.hasMoreElements()) && (hc < peerCount)) { if (closed) { log.logError("Index distribution interrupted by close, nothing deleted locally."); @@ -166,7 +168,8 @@ public class plasmaWordIndexDistribution { if (seed != null) { error = yacyClient.transferIndex(seed, indexEntities, urlCache); if (error == null) { - log.logInfo("Index transfer of " + indexCount + " words [" + indexEntities[0].wordHash() + " .. " + indexEntities[indexEntities.length-1].wordHash() + "] to peer " + seed.getName() + ":" + seed.hash + " successfull"); + avdist = (yacyDHTAction.dhtDistance(seed.hash, indexEntities[0].wordHash()) + yacyDHTAction.dhtDistance(seed.hash, indexEntities[indexEntities.length-1].wordHash())) / 2.0; + log.logInfo("Index transfer of " + indexCount + " words [" + indexEntities[0].wordHash() + " .. " + indexEntities[indexEntities.length-1].wordHash() + "]/" + avdist + " to peer " + seed.getName() + ":" + seed.hash + " successfull"); peerNames += ", " + seed.getName(); hc++; } else { @@ -207,8 +210,21 @@ public class plasmaWordIndexDistribution { } } + private String selectTransferStart() { + String startPointHash; + // first try to select with increasing probality a good start point + for (int i = 9; i > 0; i--) { + startPointHash = serverCodings.encodeMD5B64(Long.toString(i + System.currentTimeMillis()), true).substring(2, 2 + yacySeedDB.commonHashLength); + if (yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, startPointHash) > ((double) i / (double) 10)) return startPointHash; + } + // if that fails, take simply the best start point (this is usually avoided, since that leads to always the same target peers) + startPointHash = yacyCore.seedDB.mySeed.hash.substring(0, 11) + "z"; + return startPointHash; + } + private Object[] /* of {plasmaWordIndexEntity[], HashMap(String, plasmaCrawlLURL.Entry)}*/ selectTransferIndexes(String hash, int count) { + // the hash is a start hash from where the indexes are picked Vector tmpEntities = new Vector(); String nexthash = ""; try { diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index 1876c2d12..7eff54231 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -380,7 +380,7 @@ public class yacyClient { } catch (NumberFormatException e) { searchtime = totalrequesttime; } - yacyCore.log.logDebug("yacyClient.search: processed " + results + " links from peer " + targetPeer.hash + ", score " + targetPeer.selectscore + "; duetime=" + duetime + ", searchtime=" + searchtime + ", netdelay=" + (totalrequesttime - searchtime) + ", references=" + result.get("references")); + yacyCore.log.logDebug("yacyClient.search: processed " + results + " links from peer " + targetPeer.hash + ", score=" + targetPeer.selectscore + ", DHTdist=" + yacyDHTAction.dhtDistance(targetPeer.hash, wordhashes) + ", duetime=" + duetime + ", searchtime=" + searchtime + ", netdelay=" + (totalrequesttime - searchtime) + ", references=" + result.get("references")); return results; } catch (Exception e) { yacyCore.log.logError("yacyClient.search error: '" + targetPeer.get("Name", "anonymous") + "' failed - " + e); diff --git a/source/de/anomic/yacy/yacyDHTAction.java b/source/de/anomic/yacy/yacyDHTAction.java index 92a2200b5..0bb429599 100644 --- a/source/de/anomic/yacy/yacyDHTAction.java +++ b/source/de/anomic/yacy/yacyDHTAction.java @@ -226,4 +226,38 @@ public class yacyDHTAction implements yacyPeerAction { public void processPeerPing(yacySeed peer) { } + + + + public static double dhtDistance(String peer, String word) { + // the dht distance is a positive value between 0 and 1 + // if the distance is small, the word more probably belongs to the peer + double d = hashDistance(peer, word); + if (d > 0) { + return d; // case where the word is 'before' the peer + } else { + return 1 + d; // wrap-around case + } + } + + private static double hashDistance(String from, String to) { + // computes the distance between two hashes. + // the maximum distance between two hashes is 1, the minimum -1 + // this can be used like "from - to" + // the result is positive if from > to + if ((from == null) || (to == null) || + (from.length() == 0) || (to.length() == 0) || + (from.length() != to.length())) return (double) 0.0; + return hashDistance(from.charAt(0), to.charAt(0)) + hashDistance(from.substring(1), to.substring(1)) / maxAtomarDistance; + } + + private static final double maxAtomarDistance = (double) (1+ ((byte) 'z') - ((byte) '-')); + + private static double hashDistance(char from, char to) { + // the distance is a little bit fuzzy, since not all characters are used in a hash. + if (from < to) + return -hashDistance(to, from); + else + return ((double) (((byte) from) - ((byte) to))) / maxAtomarDistance; + } }