yacy_search_server/source/net/yacy/document/SnippetExtractor.java
orbiter 0d363a94d7 more performance hacks
this makes YaCy search results VERY fast for all verify=false search cases
and it enhances the search speed also for all other snippet-fetch cases.
With this change my peer performed 100 Queries Per Second (!!!) while doing 10 queries simultanously (!!!)
in an intranet index of 20000 URLs on my 16-core Mac

Check this yourself by doing:
cd bin
./searchtestmulti.sh
after finishing the run, divide 1000 by the given time per query (which is the qps for one thread)
and then multiply again by 10 (because 10 search threads has been started)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7231 6c8d7289-2bf4-0310-a012-ef5d649a1542
2010-10-09 08:55:57 +00:00

200 lines
9.6 KiB
Java

/**
* SnippetExtractor
* Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt a. M., Germany
* First released 22.10.2010 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.document;
import java.util.Collection;
import java.util.Iterator;
import java.util.TreeMap;
import java.util.TreeSet;
import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
public class SnippetExtractor {
String snippetString;
HandleSet remainingHashes;
public SnippetExtractor(final Collection<StringBuilder> sentences, final HandleSet queryhashes, int maxLength) throws UnsupportedOperationException {
if (sentences == null) throw new UnsupportedOperationException("sentence == null");
if (queryhashes == null || queryhashes.isEmpty()) throw new UnsupportedOperationException("queryhashes == null");
TreeMap<byte[], Integer> hs;
final TreeMap<Long, StringBuilder> order = new TreeMap<Long, StringBuilder>();
long uniqCounter = 999L;
Integer pos;
TreeSet<Integer> positions;
int linenumber = 0;
int fullmatchcounter = 0;
lookup: for (StringBuilder sentence: sentences) {
hs = Condenser.hashSentence(sentence.toString());
positions = new TreeSet<Integer>();
for (byte[] word: queryhashes) {
pos = hs.get(word);
if (pos != null) {
positions.add(pos);
}
}
int worddistance = positions.size() > 1 ? positions.last() - positions.first() : 0;
// sort by
// - 1st order: number of matching words
// - 2nd order: word distance
// - 3th order: line length (not too short and not too long)
// - 4rd order: line number
if (positions.size() > 0) {
order.put(Long.valueOf(-100000000L * (linenumber == 0 ? 1 : 0) + 10000000L * positions.size() + 1000000L * worddistance + 100000L * linelengthKey(sentence.length(), maxLength) - 10000L * linenumber + uniqCounter--), sentence);
if (order.size() > 5) order.remove(order.firstEntry().getKey());
if (positions.size() == queryhashes.size()) fullmatchcounter++;
if (fullmatchcounter >= 3) break lookup;
}
linenumber++;
}
StringBuilder sentence;
SnippetExtractor tsr;
while (!order.isEmpty()) {
sentence = order.remove(order.lastKey()); // sentence with the biggest score
try {
tsr = new SnippetExtractor(sentence.toString(), queryhashes, maxLength);
} catch (UnsupportedOperationException e) {
continue;
}
snippetString = tsr.snippetString;
if (snippetString != null && snippetString.length() > 0) {
remainingHashes = tsr.remainingHashes;
if (remainingHashes.isEmpty()) {
// we have found the snippet
return; // finished!
} else if (remainingHashes.size() < queryhashes.size()) {
// the result has not all words in it.
// find another sentence that represents the missing other words
// and find recursively more sentences
maxLength = maxLength - snippetString.length();
if (maxLength < 20) maxLength = 20;
try {
tsr = new SnippetExtractor(order.values(), remainingHashes, maxLength);
} catch (UnsupportedOperationException e) {
throw e;
}
final String nextSnippet = tsr.snippetString;
if (nextSnippet == null) return;
snippetString = snippetString + (" / " + nextSnippet);
remainingHashes = tsr.remainingHashes;
return;
} else {
// error
//assert remaininghashes.size() < queryhashes.size() : "remaininghashes.size() = " + remaininghashes.size() + ", queryhashes.size() = " + queryhashes.size() + ", sentence = '" + sentence + "', result = '" + result + "'";
continue;
}
}
}
throw new UnsupportedOperationException("no snippet computed");
}
private static int linelengthKey(int givenlength, int maxlength) {
if (givenlength > maxlength) return 1;
if (givenlength >= maxlength / 2 && givenlength < maxlength) return 7;
if (givenlength >= maxlength / 4 && givenlength < maxlength / 2) return 5;
if (givenlength >= maxlength / 8 && givenlength < maxlength / 4) return 3;
return 0;
}
private SnippetExtractor(String sentence, final HandleSet queryhashes, final int maxLength) throws UnsupportedOperationException {
try {
if (sentence == null) throw new UnsupportedOperationException("no sentence given");
if (queryhashes == null || queryhashes.isEmpty()) throw new UnsupportedOperationException("queryhashes == null");
byte[] hash;
// find all hashes that appear in the sentence
final TreeMap<byte[], Integer> hs = Condenser.hashSentence(sentence);
final Iterator<byte[]> j = queryhashes.iterator();
Integer pos;
int p, minpos = sentence.length(), maxpos = -1;
final HandleSet remainingHashes = new HandleSet(queryhashes.row().primaryKeyLength, queryhashes.comparator(), 0);
while (j.hasNext()) {
hash = j.next();
pos = hs.get(hash);
if (pos == null) {
try {
remainingHashes.put(hash);
} catch (RowSpaceExceededException e) {
Log.logException(e);
}
} else {
p = pos.intValue();
if (p > maxpos) maxpos = p;
if (p < minpos) minpos = p;
}
}
// check result size
maxpos = maxpos + 10;
if (maxpos > sentence.length()) maxpos = sentence.length();
if (minpos < 0) minpos = 0;
// we have a result, but is it short enough?
if (maxpos - minpos + 10 > maxLength) {
// the string is too long, even if we cut at both ends
// so cut here in the middle of the string
final int lenb = sentence.length();
sentence = sentence.substring(0, (minpos + 20 > sentence.length()) ? sentence.length() : minpos + 20).trim() +
" [..] " +
sentence.substring((maxpos + 26 > sentence.length()) ? sentence.length() : maxpos + 26).trim();
maxpos = maxpos + lenb - sentence.length() + 6;
}
if (maxpos > maxLength) {
// the string is too long, even if we cut it at the end
// so cut it here at both ends at once
assert maxpos >= minpos;
final int newlen = Math.max(10, maxpos - minpos + 10);
final int around = (maxLength - newlen) / 2;
assert minpos - around < sentence.length() : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length();
//assert ((maxpos + around) <= sentence.length()) && ((maxpos + around) <= sentence.length()) : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length();
sentence = "[..] " + sentence.substring(minpos - around, ((maxpos + around) > sentence.length()) ? sentence.length() : (maxpos + around)).trim() + " [..]";
minpos = around;
maxpos = sentence.length() - around - 5;
}
if (sentence.length() > maxLength) {
// trim sentence, 1st step (cut at right side)
sentence = sentence.substring(0, maxpos).trim() + " [..]";
}
if (sentence.length() > maxLength) {
// trim sentence, 2nd step (cut at left side)
sentence = "[..] " + sentence.substring(minpos).trim();
}
if (sentence.length() > maxLength) {
// trim sentence, 3rd step (cut in the middle)
sentence = sentence.substring(6, 20).trim() + " [..] " + sentence.substring(sentence.length() - 26, sentence.length() - 6).trim();
}
this.snippetString = sentence;
this.remainingHashes = remainingHashes;
} catch (final IndexOutOfBoundsException e) {
throw new UnsupportedOperationException(e.getMessage());
}
}
public String getSnippet() {
return this.snippetString;
}
public HandleSet getRemainingWords() {
return this.remainingHashes;
}
}