- enhanced the DidYouMean computation by using a faster count on index entries; this allows results to be ranked better

- added limitations on DidYouMean result sets according to input and output string length

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7246 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
orbiter 2010-10-12 22:02:10 +00:00
parent beb65437d2
commit 7cd9d9d22a
8 changed files with 107 additions and 40 deletions

View File

@ -14,7 +14,7 @@
<script type="text/javascript" src="/yacy/ui/js/jquery.autocomplete.js"></script>
<script type="text/javascript">
$(document).ready(function() {
$('#search').autocomplete('/suggest.json', {parse: opensearch, delay: 0, selectFirst: false, scroll: false});
$('#search').autocomplete('/suggest.json', {parse: opensearch, delay: 0, selectFirst: false, scroll: false, max: 20});
function opensearch(data) {
var parsed = [];
data = eval('({"suggest":' + data + '})');

View File

@ -42,7 +42,7 @@
<script type="text/javascript" src="/yacy/ui/js/jquery.autocomplete.js"></script>
<script type="text/javascript">
$(document).ready(function() {
$('#search').autocomplete('/suggest.json', {parse: opensearch, delay: 0, selectFirst: false, scroll: false});
$('#search').autocomplete('/suggest.json', {parse: opensearch, delay: 0, selectFirst: false, scroll: false, max: 30});
function opensearch(data) {
var parsed = [];
data = eval('({"suggest":' + data + '})');

View File

@ -31,6 +31,9 @@ import net.yacy.kelondro.util.ScoreCluster;
*/
public class DidYouMean {
private static final int MinimumInputWordLength = 2;
private static final int MinimumOutputWordLength = 4;
private static final char[] ALPHABET_LATIN = {
'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p',
'q','r','s','t','u','v','w','x','y','z',
@ -118,8 +121,10 @@ public class DidYouMean {
* @return
*/
public SortedSet<String> getSuggestions(long timeout, int preSortSelection) {
if (this.word.length() < MinimumInputWordLength) return this.resultSet; // return nothing if input is too short
long startTime = System.currentTimeMillis();
long timelimit = startTime + timeout;
if (this.word.indexOf(' ') > 0) return getSuggestions(this.word.split(" "), timeout, preSortSelection, this.index);
long timelimit = System.currentTimeMillis() + timeout;
SortedSet<String> preSorted = getSuggestions(timeout);
if (System.currentTimeMillis() > timelimit) return preSorted;
ScoreCluster<String> scored = new ScoreCluster<String>();
@ -129,22 +134,18 @@ public class DidYouMean {
scored.addScore(s, index.count(Word.word2hash(s)));
}
SortedSet<String> countSorted = Collections.synchronizedSortedSet(new TreeSet<String>(new indexSizeComparator()));
if (System.currentTimeMillis() > timelimit) {
while (scored.size() > 0) {
if (countSorted.size() >= preSortSelection) break;
String s = scored.getMaxObject();
scored.deleteScore(s);
countSorted.add(s);
}
} else {
int wc = index.count(Word.word2hash(this.word)); // all counts must be greater than this
while (scored.size() > 0) {
if (countSorted.size() >= preSortSelection) break;
while (scored.size() > 0 && countSorted.size() < preSortSelection) {
String s = scored.getMaxObject();
int score = scored.deleteScore(s);
if (score > wc) countSorted.add(s);
}
if (s.length() >= MinimumOutputWordLength && score > wc) countSorted.add(s);
if (System.currentTimeMillis() > timelimit) break;
}
// finished
Log.logInfo("DidYouMean", "found " + preSorted.size() + " terms, returned " + countSorted.size() + " suggestions; execution time: "
+ (System.currentTimeMillis() - startTime) + "ms" + " - remaining queue size: " + guessLib.size());
return countSorted;
}
@ -243,10 +244,6 @@ public class DidYouMean {
// we don't want the given word in the result
this.resultSet.remove(this.word);
// finished
Log.logInfo("DidYouMean", "found "+this.resultSet.size()+" terms; execution time: "
+(System.currentTimeMillis()-startTime)+"ms"+ " - remaining queue size: "+guessLib.size());
return this.resultSet;
}
@ -347,7 +344,7 @@ public class DidYouMean {
String s;
try {
while (!(s = guessLib.take()).equals(POISON_STRING)) {
if (index.has(Word.word2hash(s))) resultSet.add(s);
if (s.length() >= MinimumOutputWordLength && index.has(Word.word2hash(s))) resultSet.add(s);
if (System.currentTimeMillis() > timeLimit) return;
}
} catch (InterruptedException e) {}

View File

@ -662,6 +662,43 @@ public class ArrayStack implements BLOB {
return -1;
}
/**
 * get the lengths reported for the given key by the BLOBs in this array.
 * this is useful when it is not clear if an entry is unique in all BLOBs in this array.
 * the returned object is a BlobLengths instance created for the given key.
 * @param key the key that is handed over to the BlobLengths iterator
 * @return an Iterable over the lengths, as Long values
 * @throws IOException declared in the signature; the visible body does not throw it itself
 */
public Iterable<Long> lengthAll(byte[] key) throws IOException {
return new BlobLengths(key);
}
/**
 * Iterator over the lengths that the BLOBs of the enclosing array report for one key.
 * Items without a BLOB and BLOBs that report a negative length are skipped
 * (presumably a negative length means that the key is not present there; TODO confirm).
 */
public class BlobLengths extends LookAheadIterator<Long> {
    private final Iterator<blobItem> bii;
    private final byte[] key;

    /**
     * @param key the key whose length is requested from every BLOB
     */
    public BlobLengths(byte[] key) {
        this.key = key;
        this.bii = blobs.iterator();
    }

    /**
     * Advances to the next BLOB that reports a non-negative length for the key.
     * @return that length, or null when no BLOB is left or an IOException occurred
     */
    protected Long next0() {
        try {
            while (this.bii.hasNext()) {
                final BLOB candidate = this.bii.next().blob;
                if (candidate != null) {
                    final long len = candidate.length(this.key);
                    if (len >= 0) return Long.valueOf(len);
                }
            }
        } catch (IOException e) {
            // an IO problem is logged and ends the iteration
            Log.logSevere("ArrayStack", "", e);
        }
        return null;
    }
}
/**
* retrieve the sizes of all BLOB
* @param key

View File

@ -121,7 +121,7 @@ public class RowCollection implements Iterable<Row.Entry>, Cloneable {
}
protected RowCollection(final Row rowdef, final Row.Entry exportedCollectionRowEnvironment) {
final int chunkcachelength = exportedCollectionRowEnvironment.cellwidth(1) - exportOverheadSize;
final int chunkcachelength = exportedCollectionRowEnvironment.cellwidth(1) - (int) exportOverheadSize;
final Row.Entry exportedCollection = exportRow(chunkcachelength).newEntry(exportedCollectionRowEnvironment, 1);
this.rowdef = rowdef;
@ -192,7 +192,7 @@ public class RowCollection implements Iterable<Row.Entry>, Cloneable {
private static Column exportColumn0, exportColumn1, exportColumn2, exportColumn3, exportColumn4;
protected static final int exportOverheadSize = 14;
protected static final long exportOverheadSize = 14;
private static Row exportRow(final int chunkcachelength) {
/*

View File

@ -79,16 +79,25 @@ public class RowSet extends RowCollection implements Index, Iterable<Row.Entry>
if (orderbound < 0) return new RowSet(rowdef); // error
long alloc = ((long) size) * ((long) rowdef.objectsize);
assert alloc <= Integer.MAX_VALUE : "alloc = " + alloc;
assert alloc == b.length - exportOverheadSize;
final byte[] chunkcache = new byte[(int) alloc];
//assert b.length - exportOverheadSize == size * rowdef.objectsize : "b.length = " + b.length + ", size * rowdef.objectsize = " + size * rowdef.objectsize;
if (b.length - exportOverheadSize != alloc) {
Log.logSevere("RowSet", "exportOverheadSize wrong: b.length = " + b.length + ", size * rowdef.objectsize = " + size * rowdef.objectsize);
return new RowSet(rowdef);
}
System.arraycopy(b, exportOverheadSize, chunkcache, 0, chunkcache.length);
System.arraycopy(b, (int) exportOverheadSize, chunkcache, 0, chunkcache.length);
return new RowSet(rowdef, size, chunkcache, orderbound);
}
/**
 * Derives a row count from a byte length alone: the length minus exportOverheadSize,
 * divided by the object size of the given row definition.
 * @param blength the byte length (presumably of an exported row set; TODO confirm against callers)
 * @param rowdef the row definition; only its objectsize is read
 * @return the computed number of rows, or 0 if blength is smaller than exportOverheadSize
 */
public final static int importRowCount(final long blength, final Row rowdef) {
    assert blength >= exportOverheadSize : "blength = " + blength;
    if (blength < exportOverheadSize) {
        // too short to hold anything beyond the fixed overhead
        return 0;
    }
    final long payloadLength = blength - exportOverheadSize;
    final int rows = (int) (payloadLength / (long) rowdef.objectsize);
    assert rows >= 0;
    return rows;
}
// private constructor: hands all arguments over unchanged to the superclass constructor
private RowSet(Row rowdef, byte[] chunkcache, int chunkcount, int sortBound, long lastTimeWrote) {
super(rowdef, chunkcache, chunkcount, sortBound, lastTimeWrote);
}

View File

@ -30,6 +30,7 @@ import java.io.File;
import java.io.IOException;
import java.util.concurrent.Semaphore;
import net.yacy.cora.storage.ComparableARC;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.index.Row;
@ -62,6 +63,7 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
// class variables
private final ReferenceContainerArray<ReferenceType> array;
private ReferenceContainerCache<ReferenceType> ram;
private final ComparableARC<byte[], Integer> countCache;
private int maxRamEntries;
private final IODispatcher merger;
private long lastCleanup, lastDump;
@ -86,6 +88,7 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
this.array = new ReferenceContainerArray<ReferenceType>(cellPath, prefix, factory, termOrder, payloadrow, merger);
this.ram = new ReferenceContainerCache<ReferenceType>(factory, payloadrow, termOrder);
this.countCache = new ComparableARC<byte[], Integer>(100, termOrder);
this.maxRamEntries = maxRamEntries;
this.merger = merger;
this.lastCleanup = System.currentTimeMillis();
@ -147,29 +150,27 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
/**
* count number of references for a given term
* this method may cause strong IO load if called too frequently, because it is
* necessary to read the corresponding reference containers from the files and
* count the resulting index object.
* To reduce the IO load caused by processes that frequently need access to the same
* term objects, the counts are kept in an ARC cache.
* this method may cause strong IO load if called too frequently.
*/
public int count(byte[] termHash) {
Integer cachedCount = this.countCache.get(termHash);
if (cachedCount != null) return cachedCount.intValue();
int countFile;
int countFile = 0;
// read fresh values from file
ReferenceContainer<ReferenceType> c1;
try {
c1 = this.array.get(termHash);
countFile = this.array.count(termHash);
} catch (Exception e) {
Log.logException(e);
c1 = null;
}
countFile = (c1 == null) ? 0 : c1.size();
assert countFile >= 0;
// count from container in ram
ReferenceContainer<ReferenceType> countRam = this.ram.get(termHash, null);
return (countRam == null) ? countFile : countFile + countRam.size();
assert countRam == null || countRam.size() >= 0;
int c = countRam == null ? countFile : countFile + countRam.size();
this.countCache.put(termHash, c);
return c;
}
/**

View File

@ -217,7 +217,6 @@ public final class ReferenceContainerArray<ReferenceType extends Reference> {
if (System.currentTimeMillis() > timeout) {
Log.logWarning("ReferenceContainerArray", "timout in index retrieval (1): " + k + " tables searched. timeout = 3000");
return c;
// timeout = Long.MAX_VALUE; // to prevent that the warning is shown again
}
while (entries.hasNext()) {
c = c.merge(new ReferenceContainer<ReferenceType>(this.factory, termHash, RowSet.importRowSet(entries.next(), payloadrow)));
@ -225,12 +224,36 @@ public final class ReferenceContainerArray<ReferenceType extends Reference> {
if (System.currentTimeMillis() > timeout) {
Log.logWarning("ReferenceContainerArray", "timout in index retrieval (2): " + k + " tables searched. timeout = 3000");
return c;
// timeout = Long.MAX_VALUE; // to prevent that the warning is shown again
}
}
return c;
}
/**
 * Counts the references stored for a term by summing up the row counts that
 * RowSet.importRowCount computes from each length delivered by array.lengthAll.
 * The loop gives up after 3000 milliseconds and then returns the partial sum
 * collected so far, after writing a warning to the log.
 * @param termHash the hash of the term
 * @return the (possibly partial) sum of row counts; 0 if no length was delivered
 * @throws IOException
 */
public int count(final byte[] termHash) throws IOException {
    final long timeout = System.currentTimeMillis() + 3000;
    final Iterator<Long> lengths = this.array.lengthAll(termHash).iterator();
    if (lengths == null || !lengths.hasNext()) return 0;
    int total = 0;
    int tables = 0;
    do {
        total += RowSet.importRowCount(lengths.next(), payloadrow);
        assert total >= 0;
        tables++;
        if (System.currentTimeMillis() > timeout) {
            // the first table is reported as phase (1), every later one as phase (2)
            final String phase = (tables == 1) ? "timout in index retrieval (1): " : "timout in index retrieval (2): ";
            Log.logWarning("ReferenceContainerArray", phase + tables + " tables searched. timeout = 3000");
            return total;
        }
    } while (lengths.hasNext());
    assert total >= 0;
    return total;
}
/**
* calculate an upper limit for a ranking number of the container size
* the returned number is not a counter. It can only be used to compare the