yacy_search_server/source/de/anomic/kelondro/text/AbstractIndex.java

// AbstractIndex.java
// -----------------------------
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 15.3.2009 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
// 
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

package de.anomic.kelondro.text;

import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeSet;

import de.anomic.kelondro.order.Order;

public abstract class AbstractIndex <ReferenceType extends Reference> implements Index<ReferenceType> {
    
    /** factory that is handed to the container order and to the empty-container constructor in {@link #references(byte[], boolean, int)} */
    final protected ReferenceFactory<ReferenceType> factory;

    /**
     * @param factory the reference factory that is stored in {@link #factory}
     */
    public AbstractIndex(final ReferenceFactory<ReferenceType> factory) {
        this.factory = factory;
    }
    
    /**
     * Remove the same url hash for multiple words.
     * This is mainly used when correcting an index after a search.
     * @param termHashes the hashes of the terms that are visited
     * @param urlHash the url hash that is passed to the single-term remove of every visited term
     * @return the number of terms for which the single-term remove returned true
     * @throws IOException if one of the single-term removes fails; terms after the failing one are not visited
     */
    public int remove(final TreeSet<byte[]> termHashes, final String urlHash) throws IOException {
        int removed = 0;
        for (final byte[] termHash : termHashes) {
            if (remove(termHash, urlHash)) {
                removed++;
            }
        }
        return removed;
    }
    
    /**
     * Remove the same set of url hashes for multiple words.
     * This is mainly used when correcting an index after a search.
     * @param termHashes the hashes of the terms that are visited
     * @param urlHashes the url hashes that are passed to the single-term remove of every visited term
     * @throws IOException if one of the single-term removes fails; terms after the failing one are not visited
     */
    public void remove(final TreeSet<byte[]> termHashes, final Set<String> urlHashes) throws IOException {
        for (final byte[] termHash : termHashes) {
            remove(termHash, urlHashes);
        }
    }
    
    /**
     * Creates a set of index containers, starting at a given term hash.
     * This does not use the cache.
     * @param startHash the hash that is passed to the underlying container iterator and used to rotate the result order
     * @param rot passed to the underlying container iterator; if true that iterator rotates and never terminates on its own
     * @param count the maximum number of elements that are taken from the underlying iterator
     * @return the non-empty containers that were found, sorted by the container order;
     *         this may be fewer containers than demanded
     * @throws IOException if the underlying container iterator cannot be created
     */
    public synchronized TreeSet<ReferenceContainer<ReferenceType>> references(final byte[] startHash, final boolean rot, int count) throws IOException {
        // the result set is sorted by a clone of this index' ordering,
        // which is rotated using an empty container made for the start hash
        final Order<ReferenceContainer<ReferenceType>> containerOrder = new ReferenceContainerOrder<ReferenceType>(factory, this.ordering().clone());
        final ReferenceContainer<ReferenceType> emptyContainer = ReferenceContainer.emptyContainer(factory, startHash, 0);
        containerOrder.rotate(emptyContainer);
        final TreeSet<ReferenceContainer<ReferenceType>> containers = new TreeSet<ReferenceContainer<ReferenceType>>(containerOrder);
        final Iterator<ReferenceContainer<ReferenceType>> i = references(startHash, rot);
        ReferenceContainer<ReferenceType> container;
        // this loop does not terminate using the i.hasNext() predicate when rot == true
        // because then the underlying iterator is a rotating iterator without termination;
        // in this case termination must be ensured with a counter,
        // and the counter must be decreased in every single loop pass
        while ((count > 0) && (i.hasNext())) {
            container = i.next();
            if ((container != null) && (container.size() > 0)) {
                containers.add(container);
            }
            count--; // decrease counter even if the container was null or empty to ensure termination
        }
        return containers; // this may return fewer containers than demanded
    }
    
    
    // methods to search in the index
    
    /**
     * Collect containers for given word hashes. This collection stops if a single container
     * does not contain any references or cannot be read from the index.
     * In that case only an empty result is returned, never a subset of the requested words.
     * <p>
     * NOTE(review): the keys of the returned map are byte arrays, which a HashMap compares by identity;
     * a lookup therefore only succeeds with the very same array instance that was in wordHashes.
     * @param wordHashes the hashes of all words that must have references
     * @param urlselection passed to the container retrieval of every word
     * @return map of wordhash:indexContainer with one entry for every word hash, or an empty map
     */
    public HashMap<byte[], ReferenceContainer<ReferenceType>> searchConjunction(final TreeSet<byte[]> wordHashes, final Set<String> urlselection) {
        final HashMap<byte[], ReferenceContainer<ReferenceType>> containers = new HashMap<byte[], ReferenceContainer<ReferenceType>>(wordHashes.size());
        for (final byte[] wordHash : wordHashes) {
            // retrieve index
            ReferenceContainer<ReferenceType> container;
            try {
                container = this.get(wordHash, urlselection);
            } catch (final IOException e) {
                // a word whose container cannot be read cannot take part in the conjunction;
                // skipping it would silently return a result for fewer words than requested
                e.printStackTrace();
                return emptyResult();
            }
        
            // check result
            if (container == null || container.size() == 0) {
                return emptyResult();
            }
        
            containers.put(wordHash, container);
        }
        return containers;
    }
    
    /**
     * Search for the set of query hashes and collect the containers of the exclusion hashes.
     * @param queryHashes the hashes of the words that must all have references
     * @param excludeHashes the hashes that are looked up for exclusion; only evaluated if the inclusion result is not empty
     * @param urlselection passed to the container retrieval of every word
     * @return an array with two maps of wordhash:indexContainer:
     *         the inclusion containers at index 0 and the exclusion containers at index 1
     */
    @SuppressWarnings("unchecked")
    public HashMap<byte[], ReferenceContainer<ReferenceType>>[] searchTerm(
            final TreeSet<byte[]> queryHashes, 
            final TreeSet<byte[]> excludeHashes, 
            final Set<String> urlselection) {
        // retrieve entities that belong to the query hashes
        HashMap<byte[], ReferenceContainer<ReferenceType>> inclusionContainers;
        if (queryHashes.size() == 0) {
            inclusionContainers = emptyResult();
        } else {
            inclusionContainers = this.searchConjunction(queryHashes, urlselection);
        }
        
        // prevent that only a subset is returned (searchConjunction may be overridden)
        if ((inclusionContainers.size() != 0) && (inclusionContainers.size() < queryHashes.size())) {
            inclusionContainers = emptyResult();
        }
        
        // without any inclusion there is nothing to exclude from
        final HashMap<byte[], ReferenceContainer<ReferenceType>> exclusionContainers;
        if (inclusionContainers.size() == 0) {
            exclusionContainers = emptyResult();
        } else {
            exclusionContainers = this.searchConjunction(excludeHashes, urlselection);
        }
        
        // generic array creation is not possible, therefore the raw type and the suppressed warning
        return new HashMap[]{inclusionContainers, exclusionContainers};
    }
    
    /**
     * @return a new, empty and mutable container map; every call creates a separate instance
     */
    private HashMap<byte[], ReferenceContainer<ReferenceType>> emptyResult() {
        return new HashMap<byte[], ReferenceContainer<ReferenceType>>(0);
    }
    
}
refactoring to integrate indexCell data structures git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5718 6c8d7289-2bf4-0310-a012-ef5d649a1542 2009-03-16 01:18:37 +01:00			`// AbstractIndex.java`
			`// -----------------------------`
			`// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany`
			`// first published 15.3.2009 on http://yacy.net`
			`//`
			`// This is a part of YaCy, a peer-to-peer based web search engine`
			`//`
			`// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $`
			`// $LastChangedRevision: 1986 $`
			`// $LastChangedBy: orbiter $`
			`//`
			`// LICENSE`
			`//`
			`// This program is free software; you can redistribute it and/or modify`
			`// it under the terms of the GNU General Public License as published by`
			`// the Free Software Foundation; either version 2 of the License, or`
			`// (at your option) any later version.`
			`//`
			`// This program is distributed in the hope that it will be useful,`
			`// but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`// GNU General Public License for more details.`
			`//`
			`// You should have received a copy of the GNU General Public License`
			`// along with this program; if not, write to the Free Software`
			`// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA`

			`package de.anomic.kelondro.text;`

			`import java.io.IOException;`
some refactoring of search methods git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5988 6c8d7289-2bf4-0310-a012-ef5d649a1542 2009-05-28 01:51:34 +02:00			`import java.util.HashMap;`
refactoring to integrate indexCell data structures git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5718 6c8d7289-2bf4-0310-a012-ef5d649a1542 2009-03-16 01:18:37 +01:00			`import java.util.Iterator;`
			`import java.util.Set;`
more refactoring git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5722 6c8d7289-2bf4-0310-a012-ef5d649a1542 2009-03-16 17:24:53 +01:00			`import java.util.TreeSet;`

			`import de.anomic.kelondro.order.Order;`
refactoring to integrate indexCell data structures git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5718 6c8d7289-2bf4-0310-a012-ef5d649a1542 2009-03-16 01:18:37 +01:00
- full abstraction of index content type: the kelondro full text index may now also contain indexes about other content than text, i.e. navigation indexes or reverse linking indexes. - during index joins all word positions are maintained: better ranking for word distance possible; exact phrase match can be implemented soundly git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5804 6c8d7289-2bf4-0310-a012-ef5d649a1542 2009-04-15 08:34:27 +02:00			`public abstract class AbstractIndex <ReferenceType extends Reference> implements Index<ReferenceType> {`

			`final protected ReferenceFactory<ReferenceType> factory;`

			`public AbstractIndex(final ReferenceFactory<ReferenceType> factory) {`
			`this.factory = factory;`
			`}`
refactoring to integrate indexCell data structures git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5718 6c8d7289-2bf4-0310-a012-ef5d649a1542 2009-03-16 01:18:37 +01:00
full redesign of index access data model: terms (words) are not any more retrieved by their word hash string, but by a byte[] containing the word hash. this has strong advantages when RWIs are sorted in the ReferenceContainer Cache and compared with the sun.java TreeMap method, which needed getBytes() and new String() transformations before. Many thousands of such conversions are now omitted every second, which increases the indexing speed by a factor of two. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5812 6c8d7289-2bf4-0310-a012-ef5d649a1542 2009-04-16 17:29:00 +02:00			`public int remove(final TreeSet<byte[]> termHashes, final String urlHash) throws IOException {`
refactoring to integrate indexCell data structures git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5718 6c8d7289-2bf4-0310-a012-ef5d649a1542 2009-03-16 01:18:37 +01:00			`// remove the same url hashes for multiple words`
			`// this is mainly used when correcting a index after a search`
full redesign of index access data model: terms (words) are not any more retrieved by their word hash string, but by a byte[] containing the word hash. this has strong advantages when RWIs are sorted in the ReferenceContainer Cache and compared with the sun.java TreeMap method, which needed getBytes() and new String() transformations before. Many thousands of such conversions are now omitted every second, which increases the indexing speed by a factor of two. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5812 6c8d7289-2bf4-0310-a012-ef5d649a1542 2009-04-16 17:29:00 +02:00			`final Iterator<byte[]> i = termHashes.iterator();`
refactoring to integrate indexCell data structures git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5718 6c8d7289-2bf4-0310-a012-ef5d649a1542 2009-03-16 01:18:37 +01:00			`int c = 0;`
			`while (i.hasNext()) {`
more refactoring git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5722 6c8d7289-2bf4-0310-a012-ef5d649a1542 2009-03-16 17:24:53 +01:00			`if (remove(i.next(), urlHash)) c++;`
refactoring to integrate indexCell data structures git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5718 6c8d7289-2bf4-0310-a012-ef5d649a1542 2009-03-16 01:18:37 +01:00			`}`
			`return c;`
			`}`

full redesign of index access data model: terms (words) are not any more retrieved by their word hash string, but by a byte[] containing the word hash. this has strong advantages when RWIs are sorted in the ReferenceContainer Cache and compared with the sun.java TreeMap method, which needed getBytes() and new String() transformations before. Many thousands of such conversions are now omitted every second, which increases the indexing speed by a factor of two. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5812 6c8d7289-2bf4-0310-a012-ef5d649a1542 2009-04-16 17:29:00 +02:00			`public void remove(final TreeSet<byte[]> termHashes, final Set<String> urlHashes) throws IOException {`
refactoring to integrate indexCell data structures git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5718 6c8d7289-2bf4-0310-a012-ef5d649a1542 2009-03-16 01:18:37 +01:00			`// remove the same url hashes for multiple words`
			`// this is mainly used when correcting a index after a search`
full redesign of index access data model: terms (words) are not any more retrieved by their word hash string, but by a byte[] containing the word hash. this has strong advantages when RWIs are sorted in the ReferenceContainer Cache and compared with the sun.java TreeMap method, which needed getBytes() and new String() transformations before. Many thousands of such conversions are now omitted every second, which increases the indexing speed by a factor of two. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5812 6c8d7289-2bf4-0310-a012-ef5d649a1542 2009-04-16 17:29:00 +02:00			`final Iterator<byte[]> i = termHashes.iterator();`
refactoring to integrate indexCell data structures git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5718 6c8d7289-2bf4-0310-a012-ef5d649a1542 2009-03-16 01:18:37 +01:00			`while (i.hasNext()) {`
more refactoring git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5722 6c8d7289-2bf4-0310-a012-ef5d649a1542 2009-03-16 17:24:53 +01:00			`remove(i.next(), urlHashes);`
refactoring to integrate indexCell data structures git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5718 6c8d7289-2bf4-0310-a012-ef5d649a1542 2009-03-16 01:18:37 +01:00			`}`
			`}`

full redesign of index access data model: terms (words) are not any more retrieved by their word hash string, but by a byte[] containing the word hash. this has strong advantages when RWIs are sorted in the ReferenceContainer Cache and compared with the sun.java TreeMap method, which needed getBytes() and new String() transformations before. Many thousands of such conversions are now omitted every second, which increases the indexing speed by a factor of two. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5812 6c8d7289-2bf4-0310-a012-ef5d649a1542 2009-04-16 17:29:00 +02:00			`public synchronized TreeSet<ReferenceContainer<ReferenceType>> references(final byte[] startHash, final boolean rot, int count) throws IOException {`
more refactoring git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5722 6c8d7289-2bf4-0310-a012-ef5d649a1542 2009-03-16 17:24:53 +01:00			`// creates a set of indexContainers`
			`// this does not use the cache`
- full abstraction of index content type: the kelondro full text index may now also contain indexes about other content than text, i.e. navigation indexes or reverse linking indexes. - during index joins all word positions are maintained: better ranking for word distance possible; exact phrase match can be implemented soundly git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5804 6c8d7289-2bf4-0310-a012-ef5d649a1542 2009-04-15 08:34:27 +02:00			`final Order<ReferenceContainer<ReferenceType>> containerOrder = new ReferenceContainerOrder<ReferenceType>(factory, this.ordering().clone());`
			`ReferenceContainer<ReferenceType> emptyContainer = ReferenceContainer.emptyContainer(factory, startHash, 0);`
			`containerOrder.rotate(emptyContainer);`
			`final TreeSet<ReferenceContainer<ReferenceType>> containers = new TreeSet<ReferenceContainer<ReferenceType>>(containerOrder);`
			`final Iterator<ReferenceContainer<ReferenceType>> i = references(startHash, rot);`
more refactoring git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5722 6c8d7289-2bf4-0310-a012-ef5d649a1542 2009-03-16 17:24:53 +01:00			`//if (ram) count = Math.min(size(), count);`
- full abstraction of index content type: the kelondro full text index may now also contain indexes about other content than text, i.e. navigation indexes or reverse linking indexes. - during index joins all word positions are maintained: better ranking for word distance possible; exact phrase match can be implemented soundly git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5804 6c8d7289-2bf4-0310-a012-ef5d649a1542 2009-04-15 08:34:27 +02:00			`ReferenceContainer<ReferenceType> container;`
more refactoring git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5722 6c8d7289-2bf4-0310-a012-ef5d649a1542 2009-03-16 17:24:53 +01:00			`// this loop does not terminate using the i.hasNex() predicate when rot == true`
			`// because then the underlying iterator is a rotating iterator without termination`
			`// in this case a termination must be ensured with a counter`
			`// It must also be ensured that the counter is in/decreased every loop`
			`while ((count > 0) && (i.hasNext())) {`
			`container = i.next();`
			`if ((container != null) && (container.size() > 0)) {`
			`containers.add(container);`
			`}`
			`count--; // decrease counter even if the container was null or empty to ensure termination`
			`}`
			`return containers; // this may return less containers as demanded`
			`}`
some refactoring of search methods git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5988 6c8d7289-2bf4-0310-a012-ef5d649a1542 2009-05-28 01:51:34 +02:00

			`// methods to search in the index`

			`/**`
			`* collect containers for given word hashes. This collection stops if a single container does not contain any references.`
			`* In that case only a empty result is returned.`
			`* @param wordHashes`
			`* @param urlselection`
			`* @return map of wordhash:indexContainer`
			`*/`
			`public HashMap<byte[], ReferenceContainer<ReferenceType>> searchConjunction(final TreeSet<byte[]> wordHashes, final Set<String> urlselection) {`
			`// first check if there is any entry that has no match; this uses only operations in ram`
			`/*`
			`Iterator<byte[]> i = wordHashes.iterator();`
			`while (i.hasNext()) {`
			`if (!this.has(i.next())); return new HashMap<byte[], ReferenceContainer<ReferenceType>>(0);`
			`}`
			`*/`
			`// retrieve entities that belong to the hashes`
			`final HashMap<byte[], ReferenceContainer<ReferenceType>> containers = new HashMap<byte[], ReferenceContainer<ReferenceType>>(wordHashes.size());`
			`byte[] singleHash;`
			`ReferenceContainer<ReferenceType> singleContainer;`
			`Iterator<byte[]> i = wordHashes.iterator();`
			`while (i.hasNext()) {`

			`// get next word hash:`
			`singleHash = i.next();`

			`// retrieve index`
			`try {`
			`singleContainer = this.get(singleHash, urlselection);`
			`} catch (IOException e) {`
			`e.printStackTrace();`
			`continue;`
			`}`

			`// check result`
			`if ((singleContainer == null \|\| singleContainer.size() == 0)) return new HashMap<byte[], ReferenceContainer<ReferenceType>>(0);`

			`containers.put(singleHash, singleContainer);`
			`}`
			`return containers;`
			`}`

			`@SuppressWarnings("unchecked")`
			`public HashMap<byte[], ReferenceContainer<ReferenceType>>[] searchTerm(`
			`final TreeSet<byte[]> queryHashes,`
			`final TreeSet<byte[]> excludeHashes,`
			`final Set<String> urlselection) {`
			`// search for the set of hashes and return a map of of wordhash:indexContainer containing the seach result`

			`// retrieve entities that belong to the hashes`
			`HashMap<byte[], ReferenceContainer<ReferenceType>> inclusionContainers =`
			`(queryHashes.size() == 0) ?`
			`new HashMap<byte[], ReferenceContainer<ReferenceType>>(0) :`
			`this.searchConjunction(queryHashes, urlselection);`
			`if ((inclusionContainers.size() != 0) && (inclusionContainers.size() < queryHashes.size())) inclusionContainers = new HashMap<byte[], ReferenceContainer<ReferenceType>>(0); // prevent that only a subset is returned`
			`final HashMap<byte[], ReferenceContainer<ReferenceType>> exclusionContainers =`
			`(inclusionContainers.size() == 0) ?`
			`new HashMap<byte[], ReferenceContainer<ReferenceType>>(0) :`
			`this.searchConjunction(excludeHashes, urlselection);`
			`return new HashMap[]{inclusionContainers, exclusionContainers};`
			`}`

refactoring to integrate indexCell data structures git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5718 6c8d7289-2bf4-0310-a012-ef5d649a1542 2009-03-16 01:18:37 +01:00			`}`