yacy_search_server/source/de/anomic/yacy/dht/VerticalWordPartitionScheme.java

// VerticalWordPartitionScheme.java 
// --------------------------------
// part of YaCy
// (C) 2009 by Michael Peter Christen; mc@yacy.net
// first published on http://yacy.net
// Frankfurt, Germany, 28.01.2009
//
// $LastChangedDate: 2009-01-23 16:32:27 +0100 (Fr, 23 Jan 2009) $
// $LastChangedRevision: 5514 $
// $LastChangedBy: orbiter $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

package de.anomic.yacy.dht;

import de.anomic.yacy.yacySeed;

/**
 * Partition scheme that splits the word DHT into 2^partitionExponent vertical partitions.
 * The lower bits of a DHT position are taken from the word hash (horizontal position),
 * the upper bits are taken from the url hash (vertical position), so that references
 * to the same url accumulate in the same vertical partition.
 * All flat (non-partitioned) positions are obtained from FlatWordPartitionScheme.std.
 */
public class VerticalWordPartitionScheme implements PartitionScheme {
    
    /** number of position bits that select the vertical partition; values below 1 disable partitioning */
    int partitionExponent;
    
    /**
     * @param partitionExponent the scaling factor n; the scheme then has 2**n vertical partitions
     */
    public VerticalWordPartitionScheme(int partitionExponent) {
        this.partitionExponent = partitionExponent;
    }

    /**
     * @return the number of vertical partitions, computed as 2**partitionExponent
     */
    public int verticalPartitions() {
        return 1 << partitionExponent;
    }
    
    /**
     * number of low-order position bits that belong to the horizontal (word) part:
     * a position has Long.SIZE - 1 usable bits, the top partitionExponent of them
     * select the vertical partition
     */
    private final int partitionShift() {
        return Long.SIZE - 1 - partitionExponent;
    }
    
    /**
     * calculate the DHT position for horizontal and vertical performance scaling:
     * horizontal: scale with number of words
     * vertical: scale with number of references for every word
     * The vertical scaling is selected using the corresponding reference hash, the url hash
     * This has the effect that every vertical position accumulates references for the same url
     * and the urls are not spread over all positions of the DHT. To use this effect, the
     * horizontal DHT position must be normed to a 'rest' value of a partition size
     * This method is compatible to the classic DHT computation as always one of the vertical
     * DHT position corresponds to the classic position.
     * If the url hash is null or the partition exponent is smaller than 1, the flat
     * position of the word hash is returned.
     * @param wordHash, the hash of the RWI
     * @param urlHash, the hash of a reference
     * @return the DHT position as long value
     */
    public final long dhtPosition(final String wordHash, final String urlHash) {
        // this creates 2^e different positions for the same word hash (according to url hash)
        assert wordHash != null;
        assert urlHash != null;
        if (urlHash == null || partitionExponent < 1) return FlatWordPartitionScheme.std.dhtPosition(wordHash, null);
        // the partition size is (Long.MAX + 1) / 2 ** e == 2 ** (63 - e)
        final long partitionMask = (1L << partitionShift()) - 1L;
        // compute the position using a specific fragment of the word hash and the url hash:
        // - from the word hash take the (63 - <partitionExponent>) lower bits
        // - from the url hash take the remaining higher bits (everything above the partition mask)
        // in case that the partitionExponent is 1, only one bit is taken from the urlHash,
        // which means that the partition is in two parts.
        // With partitionExponent = 2 it is divided in four parts and so on.
        // NOTE(review): this presumes that flat positions are never negative - confirm in FlatWordPartitionScheme
        return (FlatWordPartitionScheme.std.dhtPosition(wordHash, null) & partitionMask) | (FlatWordPartitionScheme.std.dhtPosition(urlHash, null) & ~partitionMask);
    }
    
    /**
     * calculate the DHT position of a word within a given vertical partition
     * If the partition exponent is smaller than 1, the flat position of the word hash is returned.
     * @param wordHash, the hash of the word
     * @param verticalPosition, the number of the vertical partition; it is shifted into the
     *        upper bits without range check, so it is expected to be in 0 .. verticalPartitions() - 1
     * @return the DHT position as long value
     */
    public final long dhtPosition(final String wordHash, final int verticalPosition) {
        assert wordHash != null;
        // same guard as in the other position methods
        if (partitionExponent < 1) return FlatWordPartitionScheme.std.dhtPosition(wordHash, null);
        final long partitionMask = (1L << partitionShift()) - 1L;
        final long verticalMask = ((long) verticalPosition) << partitionShift(); // don't remove the cast! it will become an integer result which is wrong.
        return (FlatWordPartitionScheme.std.dhtPosition(wordHash, null) & partitionMask) | verticalMask;
    }
    
    /**
     * compute the number of the vertical partition that a url hash belongs to
     * @param urlHash, the hash of a reference
     * @return the flat position of the url hash shifted right so that only the top bits remain;
     *         0 if the url hash is null or the partition exponent is smaller than 1
     */
    public final int verticalPosition(final String urlHash) {
        assert urlHash != null;
        if (urlHash == null || partitionExponent < 1) return 0;
        return (int) (FlatWordPartitionScheme.std.dhtPosition(urlHash, null) >> partitionShift()); // take only the top-<partitionExponent> bits
    }
    
    /**
     * compute all vertical DHT positions for a given word
     * This is used when a word is searched and the peers holding the word must be computed
     * @param wordHash, the hash of the word
     * @return an array of long values with one entry for every vertical partition, the possible DHT positions
     */
    public final long[] dhtPositions(final String wordHash) {
        assert wordHash != null;
        final int partitions = 1 << partitionExponent;
        final long[] positions = new long[partitions];
        final long partitionSize = 1L << partitionShift();
        positions[0] = FlatWordPartitionScheme.std.dhtPosition(wordHash, null) & (partitionSize - 1L); // this is the lowest possible position
        for (int i = 1; i < partitions; i++) {
            positions[i] = positions[i - 1] + partitionSize; // no overflow, because we started with the lowest
        }
        return positions;
    }
 
    /**
     * compute the DHT distance from a word reference to a peer
     * @param word, the hash of the word
     * @param urlHash, the hash of the reference
     * @param peer, the target peer; its hash is used as target position
     * @return the distance as computed by FlatWordPartitionScheme.dhtDistance
     */
    public final long dhtDistance(final String word, final String urlHash, final yacySeed peer) {
        return dhtDistance(word, urlHash, peer.hash);
    }
    
    private final long dhtDistance(final String from, final String urlHash, final String to) {
        // the distance is computed between the vertical position of the word reference (from)
        // and the flat position of the target hash (to)
        // if the distance is small, the word more probably belongs to the peer
        assert to != null;
        assert from != null;
        final long toPos = FlatWordPartitionScheme.std.dhtPosition(to, null);
        final long fromPos = dhtPosition(from, urlHash);
        return FlatWordPartitionScheme.dhtDistance(fromPos, toPos);
    }

    /**
     * test the DHT position calculation
     * @param args, either {wordHash} or {wordHash, urlHash, partitionExponent}
     */
    public static void main(String[] args) {
        // java -classpath classes de.anomic.yacy.dht.VerticalWordPartitionScheme hHJBztzcFn76
        // java -classpath classes de.anomic.yacy.dht.VerticalWordPartitionScheme hHJBztzcFG76 M8hgtrHG6g12 3
        if (args.length < 1) {
            System.err.println("usage: VerticalWordPartitionScheme <wordHash> [<urlHash> <partitionExponent>]");
            return;
        }
        final String wordHash = args[0];
        final boolean vertical = args.length == 3;
        // the exponent must be known before the scheme is created, otherwise the scheme
        // would always compute flat positions
        final int partitionExponent = vertical ? Integer.parseInt(args[2]) : 0;
        final VerticalWordPartitionScheme partition = new VerticalWordPartitionScheme(partitionExponent);
        final long dhtl;
        if (vertical) {
            // the horizontal and vertical position calculation
            dhtl = partition.dhtPosition(wordHash, args[1]);
        } else {
            // only a horizontal position calculation
            dhtl = FlatWordPartitionScheme.std.dhtPosition(wordHash, null);
        }
        System.out.println("DHT Long                = " + dhtl);
        System.out.println("DHT as Double from Long = " + ((double) dhtl) / ((double) Long.MAX_VALUE));
        System.out.println("DHT as b64 from Long    = " + FlatWordPartitionScheme.positionToHash(dhtl));
        
        System.out.print("all " + (1 << partitionExponent) + " DHT positions from long   : ");
        final long[] positions = partition.dhtPositions(wordHash);
        for (int i = 0; i < positions.length; i++) {
            if (i > 0) System.out.print(", ");
            System.out.print(FlatWordPartitionScheme.positionToHash(positions[i]));
        }
        System.out.println();
    }

}
Replaced old DHT transmission method with new method. Many things have changed! Some of them: - after an index selection is made, the index is split into its vertical components - from different index selections the split components can be accumulated before they are placed into the transmission queue - each split chunk gets its own transmission thread - multiple transmission threads are started concurrently - the process can be monitored with the blocking queue servlet To implement that, a new package de.anomic.yacy.dht was created. Some old files have been removed. The new index distribution model using a vertical DHT was implemented. An abstraction of this model is implemented in the new dht package as an interface. The freeworld network now has a configuration of two vertical partitions; sixteen partitions are planned and will be configured if the process is bug-free. This modification has three main targets: - enhance the DHT transmission speed - with a vertical DHT, a search will speed up. With two partitions, two times. With sixteen, sixteen times. - the vertical DHT will apply a semi-dht for URLs, and peers will receive a fraction of the overall URLs they received before. With two partitions, the fraction will be one half. With sixteen partitions, 1/16 of the previous number of URLs. BE CAREFUL, THIS IS A MAJOR CODE CHANGE, POSSIBLY FULL OF BUGS AND HARMFUL THINGS. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5586 6c8d7289-2bf4-0310-a012-ef5d649a1542 2009-02-10 01:06:59 +01:00			`// VerticalWordPartitionScheme.java`
			`// --------------------------------`
			`// part of YaCy`
			`// (C) 2009 by Michael Peter Christen; mc@yacy.net`
			`// first published on http://yacy.net`
			`// Frankfurt, Germany, 28.01.2009`
			`//`
			`// $LastChangedDate: 2009-01-23 16:32:27 +0100 (Fr, 23 Jan 2009) $`
			`// $LastChangedRevision: 5514 $`
			`// $LastChangedBy: orbiter $`
			`//`
			`// This program is free software; you can redistribute it and/or modify`
			`// it under the terms of the GNU General Public License as published by`
			`// the Free Software Foundation; either version 2 of the License, or`
			`// (at your option) any later version.`
			`//`
			`// This program is distributed in the hope that it will be useful,`
			`// but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`// GNU General Public License for more details.`
			`//`
			`// You should have received a copy of the GNU General Public License`
			`// along with this program; if not, write to the Free Software`
			`// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA`

			`package de.anomic.yacy.dht;`

			`import de.anomic.yacy.yacySeed;`

			`public class VerticalWordPartitionScheme implements PartitionScheme {`

			`int partitionExponent;`

			`public VerticalWordPartitionScheme(int partitionExponent) {`
			`this.partitionExponent = partitionExponent;`
			`}`

			`public int verticalPartitions() {`
			`return 1 << partitionExponent;`
			`}`

			`/**`
			`* calculate the DHT position for horizontal and vertical performance scaling:`
			`* horizontal: scale with number of words`
			`* vertical: scale with number of references for every word`
			`* The vertical scaling is selected using the corresponding reference hash, the url hash`
			`* This has the effect that every vertical position accumulates references for the same url`
			`* and the urls are not spread over all positions of the DHT. To use this effect, the`
			`* horizontal DHT position must be normed to a 'rest' value of a partition size`
			`* This method is compatible to the classic DHT computation as always one of the vertical`
			`* DHT position corresponds to the classic position.`
			`* @param wordHash, the hash of the RWI`
			`* @param partitions, the number of partitions should be computed with partitions = 2**n, n = scaling factor`
			`* @param urlHash, the hash of a reference`
			`* @return a double in the range 0 .. 1.0 (including 0, excluding 1.0), the DHT position`
			`*/`
			`public final long dhtPosition(final String wordHash, final String urlHash) {`
			`// this creates 1^^e different positions for the same word hash (according to url hash)`
			`assert wordHash != null;`
			`assert urlHash != null;`
			`if (urlHash == null \|\| partitionExponent < 1) return FlatWordPartitionScheme.std.dhtPosition(wordHash, null);`
			`// the partition size is (Long.MAX + 1) / 2 e == 2 (63 - e)`
			`assert partitionExponent > 0;`
			`long partitionMask = (1L << (Long.SIZE - 1 - partitionExponent)) - 1L;`
			`// compute the position using a specific fragment of the word hash and the url hash:`
			`// - from the word hash take the (63 - <partitionExponent>) lower bits`
			`// - from the url hash take the (63 - <partitionExponent>) higher bits`
			`// in case that the partitionExpoent is 1, only one bit is taken from the urlHash,`
			`// which means that the partition is in two parts.`
			`// With partitionExponent = 2 it is divided in four parts and so on.`
			`return (FlatWordPartitionScheme.std.dhtPosition(wordHash, null) & partitionMask) \| (FlatWordPartitionScheme.std.dhtPosition(urlHash, null) & ~partitionMask);`
			`}`

			`public final long dhtPosition(final String wordHash, final int verticalPosition) {`
			`assert wordHash != null;`
			`if (partitionExponent == 0) return FlatWordPartitionScheme.std.dhtPosition(wordHash, null);`
			`long partitionMask = (1L << (Long.SIZE - 1 - partitionExponent)) - 1L;`
			`long verticalMask = ((long) verticalPosition) << (Long.SIZE - 1 - partitionExponent); // don't remove the cast! it will become an integer result which is wrong.`
			`return (FlatWordPartitionScheme.std.dhtPosition(wordHash, null) & partitionMask) \| verticalMask;`
			`}`

			`public final int verticalPosition(final String urlHash) {`
			`assert urlHash != null;`
			`if (urlHash == null \|\| partitionExponent < 1) return 0;`
			`assert partitionExponent > 0;`
			`return (int) (FlatWordPartitionScheme.std.dhtPosition(urlHash, null) >> (Long.SIZE - 1 - partitionExponent)); // take only the top-<partitionExponent> bits`
			`}`

			`/**`
			`* compute all vertical DHT positions for a given word`
			`* This is used when a word is searched and the peers holding the word must be computed`
			`* @param wordHash, the hash of the word`
			`* @param partitions, the number of partitions of the DHT`
			`* @return a vector of long values, the possible DHT positions`
			`*/`
			`public final long[] dhtPositions(final String wordHash) {`
			`assert wordHash != null;`
			`int partitions = 1 << partitionExponent;`
			`long[] l = new long[partitions];`
			`long partitionSize = 1L << (Long.SIZE - 1 - partitionExponent);`
			`l[0] = FlatWordPartitionScheme.std.dhtPosition(wordHash, null) & (partitionSize - 1L); // this is the lowest possible position`
			`for (int i = 1; i < partitions; i++) {`
			`l[i] = l[i - 1] + partitionSize; // no overflow, because we started with the lowest`
			`}`
			`return l;`
			`}`

			`public final long dhtDistance(final String word, final String urlHash, final yacySeed peer) {`
			`return dhtDistance(word, urlHash, peer.hash);`
			`}`

			`private final long dhtDistance(final String from, final String urlHash, final String to) {`
			`// the dht distance is a positive value between 0 and 1`
			`// if the distance is small, the word more probably belongs to the peer`
			`assert to != null;`
			`assert from != null;`
			`final long toPos = FlatWordPartitionScheme.std.dhtPosition(to, null);`
			`final long fromPos = dhtPosition(from, urlHash);`
			`return FlatWordPartitionScheme.dhtDistance(fromPos, toPos);`
			`}`

			`public static void main(String[] args) {`
			`// java -classpath classes de.anomic.yacy.yacySeed hHJBztzcFn76`
			`// java -classpath classes de.anomic.yacy.yacySeed hHJBztzcFG76 M8hgtrHG6g12 3`
			`// test the DHT position calculation`
			`String wordHash = args[0];`
			`//double dhtd;`
			`long dhtl;`
			`int partitionExponent = 0;`
			`VerticalWordPartitionScheme partition = new VerticalWordPartitionScheme(0);`
			`if (args.length == 3) {`
			`// the horizontal and vertical position calculation`
			`String urlHash = args[1];`
			`partitionExponent = Integer.parseInt(args[2]);`
			`dhtl = partition.dhtPosition(wordHash, urlHash);`
			`} else {`
			`// only a horizontal position calculation`
			`dhtl = FlatWordPartitionScheme.std.dhtPosition(wordHash, null);`
			`}`
			`//System.out.println("DHT Double = " + dhtd);`
			`System.out.println("DHT Long = " + dhtl);`
			`System.out.println("DHT as Double from Long = " + ((double) dhtl) / ((double) Long.MAX_VALUE));`
			`//System.out.println("DHT as Long from Double = " + (long) (Long.MAX_VALUE * dhtd));`
			`//System.out.println("DHT as b64 from Double = " + positionToHash(dhtd));`
			`System.out.println("DHT as b64 from Long = " + FlatWordPartitionScheme.positionToHash(dhtl));`

			`System.out.print("all " + (1 << partitionExponent) + " DHT positions from doubles: ");`
			`/*`

			`double[] d = dhtPositionsDouble(wordHash, partitionExponent);`
			`for (int i = 0; i < d.length; i++) {`
			`if (i > 0) System.out.print(", ");`
			`System.out.print(positionToHash(d[i]));`
			`}`
			`System.out.println();`
			`*/`
			`System.out.print("all " + (1 << partitionExponent) + " DHT positions from long : ");`
			`long[] l = partition.dhtPositions(wordHash);`
			`for (int i = 0; i < l.length; i++) {`
			`if (i > 0) System.out.print(", ");`
			`System.out.print(FlatWordPartitionScheme.positionToHash(l[i]));`
			`}`
			`System.out.println();`
			`}`

			`}`