yacy_search_server/source/de/anomic/yacy/dht/VerticalWordPartitionScheme.java
orbiter c8624903c6 full redesign of index access data model:
terms (words) are not any more retrieved by their word hash string, but by a byte[] containing the word hash.
this has strong advantages when RWIs are sorted in the ReferenceContainer Cache and compared with the sun.java TreeMap method, which needed getBytes() and new String() transformations before.
Many thousands of such conversions are now omitted every second, which increases the indexing speed by a factor of two.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5812 6c8d7289-2bf4-0310-a012-ef5d649a1542
2009-04-16 15:29:00 +00:00

167 lines
8.2 KiB
Java
Executable File

// VerticalWordPartitionScheme.java
// --------------------------------
// part of YaCy
// (C) 2009 by Michael Peter Christen; mc@yacy.net
// first published on http://yacy.net
// Frankfurt, Germany, 28.01.2009
//
// $LastChangedDate: 2009-01-23 16:32:27 +0100 (Fr, 23 Jan 2009) $
// $LastChangedRevision: 5514 $
// $LastChangedBy: orbiter $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.yacy.dht;
import de.anomic.yacy.yacySeed;
public class VerticalWordPartitionScheme implements PartitionScheme {
int partitionExponent;
public VerticalWordPartitionScheme(int partitionExponent) {
this.partitionExponent = partitionExponent;
}
public int verticalPartitions() {
return 1 << partitionExponent;
}
/**
* calculate the DHT position for horizontal and vertical performance scaling:
* horizontal: scale with number of words
* vertical: scale with number of references for every word
* The vertical scaling is selected using the corresponding reference hash, the url hash
* This has the effect that every vertical position accumulates references for the same url
* and the urls are not spread over all positions of the DHT. To use this effect, the
* horizontal DHT position must be normed to a 'rest' value of a partition size
* This method is compatible to the classic DHT computation as always one of the vertical
* DHT position corresponds to the classic position.
* @param wordHash, the hash of the RWI
* @param partitions, the number of partitions should be computed with partitions = 2**n, n = scaling factor
* @param urlHash, the hash of a reference
* @return a double in the range 0 .. 1.0 (including 0, excluding 1.0), the DHT position
*/
public final long dhtPosition(final byte[] wordHash, final String urlHash) {
// this creates 1^^e different positions for the same word hash (according to url hash)
assert wordHash != null;
assert urlHash != null;
if (urlHash == null || partitionExponent < 1) return FlatWordPartitionScheme.std.dhtPosition(wordHash, null);
// the partition size is (Long.MAX + 1) / 2 ** e == 2 ** (63 - e)
assert partitionExponent > 0;
long partitionMask = (1L << (Long.SIZE - 1 - partitionExponent)) - 1L;
// compute the position using a specific fragment of the word hash and the url hash:
// - from the word hash take the (63 - <partitionExponent>) lower bits
// - from the url hash take the (63 - <partitionExponent>) higher bits
// in case that the partitionExpoent is 1, only one bit is taken from the urlHash,
// which means that the partition is in two parts.
// With partitionExponent = 2 it is divided in four parts and so on.
return (FlatWordPartitionScheme.std.dhtPosition(wordHash, null) & partitionMask) | (FlatWordPartitionScheme.std.dhtPosition(urlHash.getBytes(), null) & ~partitionMask);
}
public final long dhtPosition(final byte[] wordHash, final int verticalPosition) {
assert wordHash != null;
if (partitionExponent == 0) return FlatWordPartitionScheme.std.dhtPosition(wordHash, null);
long partitionMask = (1L << (Long.SIZE - 1 - partitionExponent)) - 1L;
long verticalMask = ((long) verticalPosition) << (Long.SIZE - 1 - partitionExponent); // don't remove the cast! it will become an integer result which is wrong.
return (FlatWordPartitionScheme.std.dhtPosition(wordHash, null) & partitionMask) | verticalMask;
}
public final int verticalPosition(final String urlHash) {
assert urlHash != null;
if (urlHash == null || partitionExponent < 1) return 0;
assert partitionExponent > 0;
return (int) (FlatWordPartitionScheme.std.dhtPosition(urlHash.getBytes(), null) >> (Long.SIZE - 1 - partitionExponent)); // take only the top-<partitionExponent> bits
}
/**
* compute all vertical DHT positions for a given word
* This is used when a word is searched and the peers holding the word must be computed
* @param wordHash, the hash of the word
* @param partitions, the number of partitions of the DHT
* @return a vector of long values, the possible DHT positions
*/
public final long[] dhtPositions(final byte[] wordHash) {
assert wordHash != null;
int partitions = 1 << partitionExponent;
long[] l = new long[partitions];
long partitionSize = 1L << (Long.SIZE - 1 - partitionExponent);
l[0] = FlatWordPartitionScheme.std.dhtPosition(wordHash, null) & (partitionSize - 1L); // this is the lowest possible position
for (int i = 1; i < partitions; i++) {
l[i] = l[i - 1] + partitionSize; // no overflow, because we started with the lowest
}
return l;
}
public final long dhtDistance(final byte[] word, final String urlHash, final yacySeed peer) {
return dhtDistance(word, urlHash, peer.hash.getBytes());
}
private final long dhtDistance(final byte[] from, final String urlHash, final byte[] to) {
// the dht distance is a positive value between 0 and 1
// if the distance is small, the word more probably belongs to the peer
assert to != null;
assert from != null;
final long toPos = FlatWordPartitionScheme.std.dhtPosition(to, null);
final long fromPos = dhtPosition(from, urlHash);
return FlatWordPartitionScheme.dhtDistance(fromPos, toPos);
}
public static void main(String[] args) {
// java -classpath classes de.anomic.yacy.yacySeed hHJBztzcFn76
// java -classpath classes de.anomic.yacy.yacySeed hHJBztzcFG76 M8hgtrHG6g12 3
// test the DHT position calculation
String wordHash = args[0];
//double dhtd;
long dhtl;
int partitionExponent = 0;
VerticalWordPartitionScheme partition = new VerticalWordPartitionScheme(0);
if (args.length == 3) {
// the horizontal and vertical position calculation
String urlHash = args[1];
partitionExponent = Integer.parseInt(args[2]);
dhtl = partition.dhtPosition(wordHash.getBytes(), urlHash);
} else {
// only a horizontal position calculation
dhtl = FlatWordPartitionScheme.std.dhtPosition(wordHash.getBytes(), null);
}
//System.out.println("DHT Double = " + dhtd);
System.out.println("DHT Long = " + dhtl);
System.out.println("DHT as Double from Long = " + ((double) dhtl) / ((double) Long.MAX_VALUE));
//System.out.println("DHT as Long from Double = " + (long) (Long.MAX_VALUE * dhtd));
//System.out.println("DHT as b64 from Double = " + positionToHash(dhtd));
System.out.println("DHT as b64 from Long = " + FlatWordPartitionScheme.positionToHash(dhtl));
System.out.print("all " + (1 << partitionExponent) + " DHT positions from doubles: ");
/*
double[] d = dhtPositionsDouble(wordHash, partitionExponent);
for (int i = 0; i < d.length; i++) {
if (i > 0) System.out.print(", ");
System.out.print(positionToHash(d[i]));
}
System.out.println();
*/
System.out.print("all " + (1 << partitionExponent) + " DHT positions from long : ");
long[] l = partition.dhtPositions(wordHash.getBytes());
for (int i = 0; i < l.length; i++) {
if (i > 0) System.out.print(", ");
System.out.print(FlatWordPartitionScheme.positionToHash(l[i]));
}
System.out.println();
}
}