yacy_search_server/source/de/anomic/yacy/dht/VerticalWordPartitionScheme.java

167 lines
8.1 KiB
Java
Raw Normal View History

Replaced the old DHT transmission method with a new method. Many things have changed! Some of them: - after an index selection is made, the index is split into its vertical components - from different index selections the split components can be accumulated before they are placed into the transmission queue - each split chunk gets its own transmission thread - multiple transmission threads are started concurrently - the process can be monitored with the blocking queue servlet To implement that, a new package de.anomic.yacy.dht was created. Some old files have been removed. The new index distribution model using a vertical DHT was implemented. An abstraction of this model is implemented in the new dht package as an interface. The freeworld network now has a configuration of two vertical partitions; sixteen partitions are planned and will be configured if the process is bug-free. This modification has three main targets: - enhance the DHT transmission speed - with a vertical DHT, a search will speed up. With two partitions, two times. With sixteen, sixteen times. - the vertical DHT will apply a semi-DHT for URLs, and peers will receive a fraction of the overall URLs they received before. With two partitions, the fraction will be one half. With sixteen partitions, 1/16 of the previous number of URLs. BE CAREFUL, THIS IS A MAJOR CODE CHANGE, POSSIBLY FULL OF BUGS AND HARMFUL THINGS. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5586 6c8d7289-2bf4-0310-a012-ef5d649a1542
2009-02-10 01:06:59 +01:00
// VerticalWordPartitionScheme.java
// --------------------------------
// part of YaCy
// (C) 2009 by Michael Peter Christen; mc@yacy.net
// first published on http://yacy.net
// Frankfurt, Germany, 28.01.2009
//
// $LastChangedDate: 2009-01-23 16:32:27 +0100 (Fr, 23 Jan 2009) $
// $LastChangedRevision: 5514 $
// $LastChangedBy: orbiter $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.yacy.dht;
import de.anomic.yacy.yacySeed;
/**
 * A {@link PartitionScheme} that divides the DHT position space into
 * {@code 2^partitionExponent} vertical partitions.
 * <p>
 * A position is assembled from two hashes: the low bits come from the flat
 * position of the word hash, the high bits from the flat position of the url
 * hash. All positions are computed by delegating to
 * {@code FlatWordPartitionScheme.std}; with an exponent below 1 the scheme
 * degrades to that flat computation.
 * <p>
 * NOTE(review): the bit arithmetic below assumes that flat positions are
 * non-negative longs (sign bit clear) and that the exponent is small enough
 * for {@code 1 << partitionExponent} to fit into an int - confirm against
 * FlatWordPartitionScheme and the callers.
 */
public class VerticalWordPartitionScheme implements PartitionScheme {

    /** Binary logarithm of the number of vertical partitions. */
    int partitionExponent;

    /**
     * @param partitionExponent the number of vertical partitions is 2 to the power of this value
     */
    public VerticalWordPartitionScheme(int partitionExponent) {
        this.partitionExponent = partitionExponent;
    }

    /**
     * @return the number of vertical partitions, {@code 1 << partitionExponent}
     */
    public int verticalPartitions() {
        return 1 << partitionExponent;
    }

    /**
     * calculate the DHT position for horizontal and vertical performance scaling:
     * horizontal: scale with number of words
     * vertical: scale with number of references for every word
     * The vertical scaling is selected using the corresponding reference hash, the url hash.
     * This has the effect that every vertical position accumulates references for the same url
     * and the urls are not spread over all positions of the DHT. To use this effect, the
     * horizontal DHT position must be normed to a 'rest' value of a partition size.
     * This method is compatible to the classic DHT computation as always one of the vertical
     * DHT positions corresponds to the classic position.
     * If the url hash is null or the partition exponent is smaller than 1, the flat
     * position of the word hash is returned.
     * @param wordHash the hash of the RWI
     * @param urlHash the hash of a reference
     * @return the DHT position as a long value
     */
    public final long dhtPosition(final String wordHash, final String urlHash) {
        // this creates 2^e different positions for the same word hash (according to url hash)
        assert wordHash != null;
        assert urlHash != null;
        if (urlHash == null || partitionExponent < 1) {
            return flatPosition(wordHash);
        }
        // the partition size is (Long.MAX + 1) / 2 ** e == 2 ** (63 - e)
        final long partitionMask = partitionMask();
        // compute the position using a specific fragment of the word hash and the url hash:
        // - from the word hash take the (63 - <partitionExponent>) lower bits
        // - from the url hash take all remaining higher bits, i.e. the top
        //   <partitionExponent> bits below the sign bit
        // in case that the partitionExponent is 1, only one bit is taken from the urlHash,
        // which means that the partition is in two parts.
        // With partitionExponent = 2 it is divided in four parts and so on.
        return (flatPosition(wordHash) & partitionMask) | (flatPosition(urlHash) & ~partitionMask);
    }

    /**
     * calculate the DHT position of a word inside a given vertical partition
     * If the partition exponent is smaller than 1, the flat position of the word hash
     * is returned and the vertical position is ignored.
     * @param wordHash the hash of the word
     * @param verticalPosition the number of the vertical partition
     * @return the DHT position as a long value
     */
    public final long dhtPosition(final String wordHash, final int verticalPosition) {
        assert wordHash != null;
        // same guard as in the other methods: without vertical partitions use the flat position
        if (partitionExponent < 1) {
            return flatPosition(wordHash);
        }
        // don't remove the cast! without it the shift is done on an int, which gives a wrong result.
        final long verticalMask = ((long) verticalPosition) << partitionShift();
        return (flatPosition(wordHash) & partitionMask()) | verticalMask;
    }

    /**
     * calculate the number of the vertical partition that a url hash selects
     * @param urlHash the hash of a reference
     * @return the vertical position; 0 if the url hash is null or the partition exponent is smaller than 1
     */
    public final int verticalPosition(final String urlHash) {
        assert urlHash != null;
        if (urlHash == null || partitionExponent < 1) {
            return 0;
        }
        // take only the top-<partitionExponent> bits
        return (int) (flatPosition(urlHash) >> partitionShift());
    }

    /**
     * compute all vertical DHT positions for a given word
     * This is used when a word is searched and the peers holding the word must be computed
     * @param wordHash the hash of the word
     * @return an array of long values, the possible DHT positions, one for each vertical partition
     */
    public final long[] dhtPositions(final String wordHash) {
        assert wordHash != null;
        final int partitions = 1 << partitionExponent;
        final long[] positions = new long[partitions];
        final long partitionSize = 1L << partitionShift();
        // this is the lowest possible position
        positions[0] = flatPosition(wordHash) & (partitionSize - 1L);
        for (int i = 1; i < partitions; i++) {
            // no overflow, because we started with the lowest
            positions[i] = positions[i - 1] + partitionSize;
        }
        return positions;
    }

    /**
     * compute the DHT distance between a word reference and a peer
     * @param word the hash of the word
     * @param urlHash the hash of a reference
     * @param peer the peer; its hash is used as target position
     * @return the distance as computed by FlatWordPartitionScheme.dhtDistance
     */
    public final long dhtDistance(final String word, final String urlHash, final yacySeed peer) {
        return dhtDistance(word, urlHash, peer.hash);
    }

    /**
     * compute the DHT distance from the vertical position of (from, urlHash)
     * to the flat position of the hash 'to'
     */
    private final long dhtDistance(final String from, final String urlHash, final String to) {
        // if the distance is small, the word more probably belongs to the peer
        assert to != null;
        assert from != null;
        final long toPos = flatPosition(to);
        final long fromPos = dhtPosition(from, urlHash);
        return FlatWordPartitionScheme.dhtDistance(fromPos, toPos);
    }

    /** the classic (non-vertical) DHT position of a hash */
    private static long flatPosition(final String hash) {
        return FlatWordPartitionScheme.std.dhtPosition(hash, null);
    }

    /** number of low bits that are taken from the word hash: 63 - partitionExponent */
    private int partitionShift() {
        return Long.SIZE - 1 - partitionExponent;
    }

    /** bit mask that selects the low bits which are taken from the word hash */
    private long partitionMask() {
        return (1L << partitionShift()) - 1L;
    }

    /**
     * test the DHT position calculation
     * @param args either {wordHash} or {wordHash, urlHash, partitionExponent}
     */
    public static void main(String[] args) {
        // java -classpath classes de.anomic.yacy.dht.VerticalWordPartitionScheme hHJBztzcFn76
        // java -classpath classes de.anomic.yacy.dht.VerticalWordPartitionScheme hHJBztzcFG76 M8hgtrHG6g12 3
        if (args.length < 1) {
            System.err.println("usage: VerticalWordPartitionScheme <wordHash> [<urlHash> <partitionExponent>]");
            return;
        }
        final String wordHash = args[0];
        final boolean vertical = args.length == 3;

        // the exponent must be known before the scheme is created,
        // otherwise all computations below would silently use a flat scheme
        final int exponent = vertical ? Integer.parseInt(args[2]) : 0;
        final VerticalWordPartitionScheme partition = new VerticalWordPartitionScheme(exponent);

        final long dhtl;
        if (vertical) {
            // the horizontal and vertical position calculation
            dhtl = partition.dhtPosition(wordHash, args[1]);
        } else {
            // only a horizontal position calculation
            dhtl = FlatWordPartitionScheme.std.dhtPosition(wordHash, null);
        }

        System.out.println("DHT Long = " + dhtl);
        System.out.println("DHT as Double from Long = " + ((double) dhtl) / ((double) Long.MAX_VALUE));
        System.out.println("DHT as b64 from Long = " + FlatWordPartitionScheme.positionToHash(dhtl));

        System.out.print("all " + (1 << exponent) + " DHT positions from long : ");
        final long[] positions = partition.dhtPositions(wordHash);
        for (int i = 0; i < positions.length; i++) {
            if (i > 0) System.out.print(", ");
            System.out.print(FlatWordPartitionScheme.positionToHash(positions[i]));
        }
        System.out.println();
    }
}