performance hack

This commit is contained in:
Michael Peter Christen 2012-01-25 12:48:48 +01:00
parent 41536eb4a2
commit ef78f22ee1
5 changed files with 44 additions and 37 deletions

View File

@ -47,7 +47,7 @@ public class SnippetExtractor {
int linenumber = 0; int linenumber = 0;
int fullmatchcounter = 0; int fullmatchcounter = 0;
lookup: for (final StringBuilder sentence: sentences) { lookup: for (final StringBuilder sentence: sentences) {
hs = WordTokenizer.hashSentence(sentence.toString(), null); hs = WordTokenizer.hashSentence(sentence.toString(), null, 100);
positions = new TreeSet<Integer>(); positions = new TreeSet<Integer>();
for (final byte[] word: queryhashes) { for (final byte[] word: queryhashes) {
pos = hs.get(word); pos = hs.get(word);
@ -79,27 +79,27 @@ public class SnippetExtractor {
} catch (UnsupportedOperationException e) { } catch (UnsupportedOperationException e) {
continue; continue;
} }
snippetString = tsr.snippetString; this.snippetString = tsr.snippetString;
if (snippetString != null && snippetString.length() > 0) { if (this.snippetString != null && this.snippetString.length() > 0) {
remainingHashes = tsr.remainingHashes; this.remainingHashes = tsr.remainingHashes;
if (remainingHashes.isEmpty()) { if (this.remainingHashes.isEmpty()) {
// we have found the snippet // we have found the snippet
return; // finished! return; // finished!
} else if (remainingHashes.size() < queryhashes.size()) { } else if (this.remainingHashes.size() < queryhashes.size()) {
// the result has not all words in it. // the result has not all words in it.
// find another sentence that represents the missing other words // find another sentence that represents the missing other words
// and find recursively more sentences // and find recursively more sentences
maxLength = maxLength - snippetString.length(); maxLength = maxLength - this.snippetString.length();
if (maxLength < 20) maxLength = 20; if (maxLength < 20) maxLength = 20;
try { try {
tsr = new SnippetExtractor(order.values(), remainingHashes, maxLength); tsr = new SnippetExtractor(order.values(), this.remainingHashes, maxLength);
} catch (UnsupportedOperationException e) { } catch (UnsupportedOperationException e) {
throw e; throw e;
} }
final String nextSnippet = tsr.snippetString; final String nextSnippet = tsr.snippetString;
if (nextSnippet == null) return; if (nextSnippet == null) return;
snippetString = snippetString + (" / " + nextSnippet); this.snippetString = this.snippetString + (" / " + nextSnippet);
remainingHashes = tsr.remainingHashes; this.remainingHashes = tsr.remainingHashes;
return; return;
} else { } else {
// error // error
@ -126,7 +126,7 @@ public class SnippetExtractor {
byte[] hash; byte[] hash;
// find all hashes that appear in the sentence // find all hashes that appear in the sentence
final Map<byte[], Integer> hs = WordTokenizer.hashSentence(sentence, null); final Map<byte[], Integer> hs = WordTokenizer.hashSentence(sentence, null, 100);
final Iterator<byte[]> j = queryhashes.iterator(); final Iterator<byte[]> j = queryhashes.iterator();
Integer pos; Integer pos;
int p, minpos = sentence.length(), maxpos = -1; int p, minpos = sentence.length(), maxpos = -1;

View File

@ -68,10 +68,12 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
return null; return null;
} }
@Override
public boolean hasMoreElements() { public boolean hasMoreElements() {
return this.buffer != null; return this.buffer != null;
} }
@Override
public StringBuilder nextElement() { public StringBuilder nextElement() {
final StringBuilder r = (this.buffer == null) ? null : this.buffer; final StringBuilder r = (this.buffer == null) ? null : this.buffer;
this.buffer = nextElement0(); this.buffer = nextElement0();
@ -81,7 +83,7 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
} }
public void close() { public void close() {
e.close(); this.e.close();
} }
private static class unsievedWordsEnum implements Enumeration<StringBuilder> { private static class unsievedWordsEnum implements Enumeration<StringBuilder> {
@ -139,10 +141,12 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
return r; return r;
} }
@Override
public boolean hasMoreElements() { public boolean hasMoreElements() {
return this.buffer != null; return this.buffer != null;
} }
@Override
public StringBuilder nextElement() { public StringBuilder nextElement() {
final StringBuilder r = this.buffer; final StringBuilder r = this.buffer;
this.buffer = nextElement0(); this.buffer = nextElement0();
@ -150,7 +154,7 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
} }
public void close() { public void close() {
e.close(); this.e.close();
} }
} }
@ -177,7 +181,7 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
* @param sentence the sentence to be tokenized * @param sentence the sentence to be tokenized
* @return an ordered map containing word hashes as keys and positions as values. The map is ordered by the hash ordering * @return an ordered map containing word hashes as keys and positions as values. The map is ordered by the hash ordering
*/ */
public static SortedMap<byte[], Integer> hashSentence(final String sentence, final WordCache meaningLib) { public static SortedMap<byte[], Integer> hashSentence(final String sentence, final WordCache meaningLib, int maxlength) {
final SortedMap<byte[], Integer> map = new TreeMap<byte[], Integer>(Base64Order.enhancedCoder); final SortedMap<byte[], Integer> map = new TreeMap<byte[], Integer>(Base64Order.enhancedCoder);
final WordTokenizer words = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(sentence)), meaningLib); final WordTokenizer words = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(sentence)), meaningLib);
try { try {
@ -185,7 +189,7 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
StringBuilder word; StringBuilder word;
byte[] hash; byte[] hash;
Integer oldpos; Integer oldpos;
while (words.hasMoreElements()) { while (words.hasMoreElements() && maxlength-- > 0) {
word = words.nextElement(); word = words.nextElement();
hash = Word.word2hash(word); hash = Word.word2hash(word);

View File

@ -129,12 +129,14 @@ public class Digest {
digest.reset(); digest.reset();
} catch (final NoSuchAlgorithmException e) { } catch (final NoSuchAlgorithmException e) {
} }
} else {
digest.reset(); // pooled digests should already have been reset, but resetting again is safe
} }
byte[] keyBytes; byte[] keyBytes;
keyBytes = UTF8.getBytes(key); keyBytes = UTF8.getBytes(key);
digest.update(keyBytes); digest.update(keyBytes);
final byte[] result = digest.digest(); final byte[] result = digest.digest();
digest.reset(); digest.reset(); // to be prepared for next
try { try {
digestPool.put(digest); digestPool.put(digest);
//System.out.println("Digest Pool size = " + digestPool.size()); //System.out.println("Digest Pool size = " + digestPool.size());

View File

@ -51,8 +51,8 @@ import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.util.ByteArray; import net.yacy.kelondro.util.ByteArray;
import net.yacy.repository.Blacklist; import net.yacy.repository.Blacklist;
import net.yacy.search.Switchboard; import net.yacy.search.Switchboard;
import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.ZURL.FailCategory; import de.anomic.crawler.ZURL.FailCategory;
import de.anomic.crawler.retrieval.Request;
public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaSnippet> { public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaSnippet> {
@ -117,10 +117,12 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
return Base64Order.enhancedCoder.equal(this.href.hash(), other.href.hash()); return Base64Order.enhancedCoder.equal(this.href.hash(), other.href.hash());
} }
@Override
public int compareTo(final MediaSnippet o) { public int compareTo(final MediaSnippet o) {
return Base64Order.enhancedCoder.compare(this.href.hash(), o.href.hash()); return Base64Order.enhancedCoder.compare(this.href.hash(), o.href.hash());
} }
@Override
public int compare(final MediaSnippet o1, final MediaSnippet o2) { public int compare(final MediaSnippet o1, final MediaSnippet o2) {
return o1.compareTo(o2); return o1.compareTo(o2);
} }
@ -217,7 +219,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
private static HandleSet removeAppearanceHashes(final String sentence, final HandleSet queryhashes) { private static HandleSet removeAppearanceHashes(final String sentence, final HandleSet queryhashes) {
// remove all hashes that appear in the sentence // remove all hashes that appear in the sentence
if (sentence == null) return queryhashes; if (sentence == null) return queryhashes;
final SortedMap<byte[], Integer> hs = WordTokenizer.hashSentence(sentence, null); final SortedMap<byte[], Integer> hs = WordTokenizer.hashSentence(sentence, null, 100);
final Iterator<byte[]> j = queryhashes.iterator(); final Iterator<byte[]> j = queryhashes.iterator();
byte[] hash; byte[] hash;
Integer pos; Integer pos;

View File

@ -497,8 +497,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
private static boolean containsAllHashes( private static boolean containsAllHashes(
final String sentence, final HandleSet queryhashes) { final String sentence, final HandleSet queryhashes) {
final SortedMap<byte[], Integer> m = final SortedMap<byte[], Integer> m = WordTokenizer.hashSentence(sentence, null, 100);
WordTokenizer.hashSentence(sentence, null);
for (final byte[] b : queryhashes) { for (final byte[] b : queryhashes) {
if (!(m.containsKey(b))) { if (!(m.containsKey(b))) {
return false; return false;