mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
- enhanced DidYouMean computation using a faster count on index entries; this allows the results to be ranked better
- added limitations on DidYouMean result sets according to input and output string length
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7246 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
beb65437d2
commit
7cd9d9d22a
|
@ -14,7 +14,7 @@
|
|||
<script type="text/javascript" src="/yacy/ui/js/jquery.autocomplete.js"></script>
|
||||
<script type="text/javascript">
|
||||
$(document).ready(function() {
|
||||
$('#search').autocomplete('/suggest.json', {parse: opensearch, delay: 0, selectFirst: false, scroll: false});
|
||||
$('#search').autocomplete('/suggest.json', {parse: opensearch, delay: 0, selectFirst: false, scroll: false, max: 20});
|
||||
function opensearch(data) {
|
||||
var parsed = [];
|
||||
data = eval('({"suggest":' + data + '})');
|
||||
|
|
|
@ -42,7 +42,7 @@
|
|||
<script type="text/javascript" src="/yacy/ui/js/jquery.autocomplete.js"></script>
|
||||
<script type="text/javascript">
|
||||
$(document).ready(function() {
|
||||
$('#search').autocomplete('/suggest.json', {parse: opensearch, delay: 0, selectFirst: false, scroll: false});
|
||||
$('#search').autocomplete('/suggest.json', {parse: opensearch, delay: 0, selectFirst: false, scroll: false, max: 30});
|
||||
function opensearch(data) {
|
||||
var parsed = [];
|
||||
data = eval('({"suggest":' + data + '})');
|
||||
|
|
|
@ -31,6 +31,9 @@ import net.yacy.kelondro.util.ScoreCluster;
|
|||
*/
|
||||
public class DidYouMean {
|
||||
|
||||
private static final int MinimumInputWordLength = 2;
|
||||
private static final int MinimumOutputWordLength = 4;
|
||||
|
||||
private static final char[] ALPHABET_LATIN = {
|
||||
'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p',
|
||||
'q','r','s','t','u','v','w','x','y','z',
|
||||
|
@ -118,8 +121,10 @@ public class DidYouMean {
|
|||
* @return
|
||||
*/
|
||||
public SortedSet<String> getSuggestions(long timeout, int preSortSelection) {
|
||||
if (this.word.length() < MinimumInputWordLength) return this.resultSet; // return nothing if input is too short
|
||||
long startTime = System.currentTimeMillis();
|
||||
long timelimit = startTime + timeout;
|
||||
if (this.word.indexOf(' ') > 0) return getSuggestions(this.word.split(" "), timeout, preSortSelection, this.index);
|
||||
long timelimit = System.currentTimeMillis() + timeout;
|
||||
SortedSet<String> preSorted = getSuggestions(timeout);
|
||||
if (System.currentTimeMillis() > timelimit) return preSorted;
|
||||
ScoreCluster<String> scored = new ScoreCluster<String>();
|
||||
|
@ -129,22 +134,18 @@ public class DidYouMean {
|
|||
scored.addScore(s, index.count(Word.word2hash(s)));
|
||||
}
|
||||
SortedSet<String> countSorted = Collections.synchronizedSortedSet(new TreeSet<String>(new indexSizeComparator()));
|
||||
if (System.currentTimeMillis() > timelimit) {
|
||||
while (scored.size() > 0) {
|
||||
if (countSorted.size() >= preSortSelection) break;
|
||||
String s = scored.getMaxObject();
|
||||
scored.deleteScore(s);
|
||||
countSorted.add(s);
|
||||
}
|
||||
} else {
|
||||
int wc = index.count(Word.word2hash(this.word)); // all counts must be greater than this
|
||||
while (scored.size() > 0) {
|
||||
if (countSorted.size() >= preSortSelection) break;
|
||||
while (scored.size() > 0 && countSorted.size() < preSortSelection) {
|
||||
String s = scored.getMaxObject();
|
||||
int score = scored.deleteScore(s);
|
||||
if (score > wc) countSorted.add(s);
|
||||
}
|
||||
if (s.length() >= MinimumOutputWordLength && score > wc) countSorted.add(s);
|
||||
if (System.currentTimeMillis() > timelimit) break;
|
||||
}
|
||||
|
||||
// finished
|
||||
Log.logInfo("DidYouMean", "found " + preSorted.size() + " terms, returned " + countSorted.size() + " suggestions; execution time: "
|
||||
+ (System.currentTimeMillis() - startTime) + "ms" + " - remaining queue size: " + guessLib.size());
|
||||
|
||||
return countSorted;
|
||||
}
|
||||
|
||||
|
@ -243,10 +244,6 @@ public class DidYouMean {
|
|||
// we don't want the given word in the result
|
||||
this.resultSet.remove(this.word);
|
||||
|
||||
// finished
|
||||
Log.logInfo("DidYouMean", "found "+this.resultSet.size()+" terms; execution time: "
|
||||
+(System.currentTimeMillis()-startTime)+"ms"+ " - remaining queue size: "+guessLib.size());
|
||||
|
||||
return this.resultSet;
|
||||
|
||||
}
|
||||
|
@ -347,7 +344,7 @@ public class DidYouMean {
|
|||
String s;
|
||||
try {
|
||||
while (!(s = guessLib.take()).equals(POISON_STRING)) {
|
||||
if (index.has(Word.word2hash(s))) resultSet.add(s);
|
||||
if (s.length() >= MinimumOutputWordLength && index.has(Word.word2hash(s))) resultSet.add(s);
|
||||
if (System.currentTimeMillis() > timeLimit) return;
|
||||
}
|
||||
} catch (InterruptedException e) {}
|
||||
|
|
|
@ -662,6 +662,43 @@ public class ArrayStack implements BLOB {
|
|||
return -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* get all BLOBs in the array.
|
||||
* this is useful when it is not clear if an entry is unique in all BLOBs in this array.
|
||||
* @param key
|
||||
* @return
|
||||
* @throws IOException
|
||||
*/
|
||||
public Iterable<Long> lengthAll(byte[] key) throws IOException {
|
||||
return new BlobLengths(key);
|
||||
}
|
||||
|
||||
public class BlobLengths extends LookAheadIterator<Long> {
|
||||
|
||||
private final Iterator<blobItem> bii;
|
||||
private final byte[] key;
|
||||
|
||||
public BlobLengths(byte[] key) {
|
||||
this.bii = blobs.iterator();
|
||||
this.key = key;
|
||||
}
|
||||
|
||||
protected Long next0() {
|
||||
while (this.bii.hasNext()) {
|
||||
BLOB b = this.bii.next().blob;
|
||||
if (b == null) continue;
|
||||
try {
|
||||
long l = b.length(key);
|
||||
if (l >= 0) return Long.valueOf(l);
|
||||
} catch (IOException e) {
|
||||
Log.logSevere("ArrayStack", "", e);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* retrieve the sizes of all BLOB
|
||||
* @param key
|
||||
|
|
|
@ -121,7 +121,7 @@ public class RowCollection implements Iterable<Row.Entry>, Cloneable {
|
|||
}
|
||||
|
||||
protected RowCollection(final Row rowdef, final Row.Entry exportedCollectionRowEnvironment) {
|
||||
final int chunkcachelength = exportedCollectionRowEnvironment.cellwidth(1) - exportOverheadSize;
|
||||
final int chunkcachelength = exportedCollectionRowEnvironment.cellwidth(1) - (int) exportOverheadSize;
|
||||
final Row.Entry exportedCollection = exportRow(chunkcachelength).newEntry(exportedCollectionRowEnvironment, 1);
|
||||
|
||||
this.rowdef = rowdef;
|
||||
|
@ -192,7 +192,7 @@ public class RowCollection implements Iterable<Row.Entry>, Cloneable {
|
|||
|
||||
private static Column exportColumn0, exportColumn1, exportColumn2, exportColumn3, exportColumn4;
|
||||
|
||||
protected static final int exportOverheadSize = 14;
|
||||
protected static final long exportOverheadSize = 14;
|
||||
|
||||
private static Row exportRow(final int chunkcachelength) {
|
||||
/*
|
||||
|
|
|
@ -79,16 +79,25 @@ public class RowSet extends RowCollection implements Index, Iterable<Row.Entry>
|
|||
if (orderbound < 0) return new RowSet(rowdef); // error
|
||||
long alloc = ((long) size) * ((long) rowdef.objectsize);
|
||||
assert alloc <= Integer.MAX_VALUE : "alloc = " + alloc;
|
||||
assert alloc == b.length - exportOverheadSize;
|
||||
final byte[] chunkcache = new byte[(int) alloc];
|
||||
//assert b.length - exportOverheadSize == size * rowdef.objectsize : "b.length = " + b.length + ", size * rowdef.objectsize = " + size * rowdef.objectsize;
|
||||
if (b.length - exportOverheadSize != alloc) {
|
||||
Log.logSevere("RowSet", "exportOverheadSize wrong: b.length = " + b.length + ", size * rowdef.objectsize = " + size * rowdef.objectsize);
|
||||
return new RowSet(rowdef);
|
||||
}
|
||||
System.arraycopy(b, exportOverheadSize, chunkcache, 0, chunkcache.length);
|
||||
System.arraycopy(b, (int) exportOverheadSize, chunkcache, 0, chunkcache.length);
|
||||
return new RowSet(rowdef, size, chunkcache, orderbound);
|
||||
}
|
||||
|
||||
public final static int importRowCount(final long blength, final Row rowdef) {
|
||||
assert blength >= exportOverheadSize : "blength = " + blength;
|
||||
if (blength < exportOverheadSize) return 0;
|
||||
int c = (int) ((blength - exportOverheadSize) / (long) rowdef.objectsize);
|
||||
assert c >= 0;
|
||||
return c;
|
||||
}
|
||||
|
||||
/**
 * Private constructor that hands all of its arguments, unchanged and in the same
 * order, to the matching superclass (RowCollection) constructor.
 */
private RowSet(Row rowdef, byte[] chunkcache, int chunkcount, int sortBound, long lastTimeWrote) {
|
||||
super(rowdef, chunkcache, chunkcount, sortBound, lastTimeWrote);
|
||||
}
|
||||
|
|
|
@ -30,6 +30,7 @@ import java.io.File;
|
|||
import java.io.IOException;
|
||||
import java.util.concurrent.Semaphore;
|
||||
|
||||
import net.yacy.cora.storage.ComparableARC;
|
||||
import net.yacy.kelondro.data.meta.URIMetadataRow;
|
||||
import net.yacy.kelondro.index.HandleSet;
|
||||
import net.yacy.kelondro.index.Row;
|
||||
|
@ -62,6 +63,7 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
|
|||
// class variables
|
||||
private final ReferenceContainerArray<ReferenceType> array;
|
||||
private ReferenceContainerCache<ReferenceType> ram;
|
||||
private final ComparableARC<byte[], Integer> countCache;
|
||||
private int maxRamEntries;
|
||||
private final IODispatcher merger;
|
||||
private long lastCleanup, lastDump;
|
||||
|
@ -86,6 +88,7 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
|
|||
|
||||
this.array = new ReferenceContainerArray<ReferenceType>(cellPath, prefix, factory, termOrder, payloadrow, merger);
|
||||
this.ram = new ReferenceContainerCache<ReferenceType>(factory, payloadrow, termOrder);
|
||||
this.countCache = new ComparableARC<byte[], Integer>(100, termOrder);
|
||||
this.maxRamEntries = maxRamEntries;
|
||||
this.merger = merger;
|
||||
this.lastCleanup = System.currentTimeMillis();
|
||||
|
@ -147,29 +150,27 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
|
|||
|
||||
/**
|
||||
* count number of references for a given term
|
||||
* this method may cause strong IO load if called too frequently, because it is
|
||||
* necessary to read the corresponding reference containers from the files and
|
||||
* count the resulting index object.
|
||||
* To reduce the load for processes that frequently need access to the same
|
||||
* term objects, a ARC cache is here to reduce IO load.
|
||||
* this method may cause strong IO load if called too frequently.
|
||||
*/
|
||||
public int count(byte[] termHash) {
|
||||
Integer cachedCount = this.countCache.get(termHash);
|
||||
if (cachedCount != null) return cachedCount.intValue();
|
||||
|
||||
int countFile;
|
||||
int countFile = 0;
|
||||
// read fresh values from file
|
||||
ReferenceContainer<ReferenceType> c1;
|
||||
try {
|
||||
c1 = this.array.get(termHash);
|
||||
countFile = this.array.count(termHash);
|
||||
} catch (Exception e) {
|
||||
Log.logException(e);
|
||||
c1 = null;
|
||||
}
|
||||
countFile = (c1 == null) ? 0 : c1.size();
|
||||
assert countFile >= 0;
|
||||
|
||||
// count from container in ram
|
||||
ReferenceContainer<ReferenceType> countRam = this.ram.get(termHash, null);
|
||||
|
||||
return (countRam == null) ? countFile : countFile + countRam.size();
|
||||
assert countRam == null || countRam.size() >= 0;
|
||||
int c = countRam == null ? countFile : countFile + countRam.size();
|
||||
this.countCache.put(termHash, c);
|
||||
return c;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -217,7 +217,6 @@ public final class ReferenceContainerArray<ReferenceType extends Reference> {
|
|||
if (System.currentTimeMillis() > timeout) {
|
||||
Log.logWarning("ReferenceContainerArray", "timout in index retrieval (1): " + k + " tables searched. timeout = 3000");
|
||||
return c;
|
||||
// timeout = Long.MAX_VALUE; // to prevent that the warning is shown again
|
||||
}
|
||||
while (entries.hasNext()) {
|
||||
c = c.merge(new ReferenceContainer<ReferenceType>(this.factory, termHash, RowSet.importRowSet(entries.next(), payloadrow)));
|
||||
|
@ -225,12 +224,36 @@ public final class ReferenceContainerArray<ReferenceType extends Reference> {
|
|||
if (System.currentTimeMillis() > timeout) {
|
||||
Log.logWarning("ReferenceContainerArray", "timout in index retrieval (2): " + k + " tables searched. timeout = 3000");
|
||||
return c;
|
||||
// timeout = Long.MAX_VALUE; // to prevent that the warning is shown again
|
||||
}
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
public int count(final byte[] termHash) throws IOException {
|
||||
long timeout = System.currentTimeMillis() + 3000;
|
||||
Iterator<Long> entries = this.array.lengthAll(termHash).iterator();
|
||||
if (entries == null || !entries.hasNext()) return 0;
|
||||
Long a = entries.next();
|
||||
int k = 1;
|
||||
int c = RowSet.importRowCount(a, payloadrow);
|
||||
assert c >= 0;
|
||||
if (System.currentTimeMillis() > timeout) {
|
||||
Log.logWarning("ReferenceContainerArray", "timout in index retrieval (1): " + k + " tables searched. timeout = 3000");
|
||||
return c;
|
||||
}
|
||||
while (entries.hasNext()) {
|
||||
c += RowSet.importRowCount(entries.next(), payloadrow);
|
||||
assert c >= 0;
|
||||
k++;
|
||||
if (System.currentTimeMillis() > timeout) {
|
||||
Log.logWarning("ReferenceContainerArray", "timout in index retrieval (2): " + k + " tables searched. timeout = 3000");
|
||||
return c;
|
||||
}
|
||||
}
|
||||
assert c >= 0;
|
||||
return c;
|
||||
}
|
||||
|
||||
/**
|
||||
* calculate an upper limit for a ranking number of the container size
|
||||
* the returned number is not a counter. It can only be used to compare the
|
||||
|
|
Loading…
Reference in New Issue
Block a user