added snippet extraction with synonym matching

This commit is contained in:
Michael Peter Christen 2024-08-26 23:44:42 +02:00
parent d181b9e89b
commit 3944984840

View File

@ -28,6 +28,8 @@ import java.util.SortedMap;
import java.util.TreeMap; import java.util.TreeMap;
import java.util.TreeSet; import java.util.TreeSet;
import net.yacy.cora.language.synonyms.SynonymLibrary;
public class SnippetExtractor { public class SnippetExtractor {
private String snippetString; private String snippetString;
@ -37,32 +39,42 @@ public class SnippetExtractor {
public SnippetExtractor(final Iterable<StringBuilder> sentences, final Set<String> queryTerms, int maxLength) throws UnsupportedOperationException { public SnippetExtractor(final Iterable<StringBuilder> sentences, final Set<String> queryTerms, int maxLength) throws UnsupportedOperationException {
if (sentences == null) throw new UnsupportedOperationException("sentences == null"); if (sentences == null) throw new UnsupportedOperationException("sentences == null");
if (queryTerms == null || queryTerms.isEmpty()) throw new UnsupportedOperationException("queryTerms == null"); if (queryTerms == null || queryTerms.isEmpty()) throw new UnsupportedOperationException("queryTerms == null");
SortedMap<String, Integer> hs; final TreeMap<Long, StringBuilder> sentences_candidates = new TreeMap<Long, StringBuilder>();
final TreeMap<Long, StringBuilder> order = new TreeMap<Long, StringBuilder>();
long uniqCounter = 999L; long uniqCounter = 999L;
Integer pos; Integer pos;
TreeSet<Integer> positions;
int linenumber = 0; int linenumber = 0;
int fullmatchcounter = 0; int fullmatchcounter = 0;
lookup: for(final StringBuilder sentence : sentences) { lookup: for(final StringBuilder sentence : sentences) {
hs = WordTokenizer.tokenizeSentence(sentence.toString(), 100); SortedMap<String, Integer> positions_in_sentence = WordTokenizer.tokenizeSentence(sentence.toString(), 100);
positions = new TreeSet<Integer>(); TreeSet<Integer> found_positions = new TreeSet<Integer>(); // the positions of the query terms in the sentence
for (final String word: queryTerms) { for (final String word: queryTerms) {
pos = hs.get(word); pos = positions_in_sentence.get(word);
if (pos != null) { if (pos != null) {
positions.add(pos); found_positions.add(pos);
} else {
// try to find synonyms
Set<String> syms = SynonymLibrary.getSynonyms(word);
if (syms != null && syms.size() > 0) {
symsearch: for (String sym: syms) {
pos = positions_in_sentence.get(sym);
if (pos != null) {
found_positions.add(pos);
break symsearch;
} }
} }
int worddistance = positions.size() > 1 ? positions.last() - positions.first() : 0; }
}
}
int worddistance = found_positions.size() > 1 ? found_positions.last() - found_positions.first() : 0;
// sort by // sort by
// - 1st order: number of matching words // - 1st order: number of matching words
// - 2nd order: word distance // - 2nd order: word distance
// - 3rd order: line length (not too short and not too long) // - 3rd order: line length (not too short and not too long)
// - 4th order: line number // - 4th order: line number
if (!positions.isEmpty()) { if (!found_positions.isEmpty()) {
order.put(Long.valueOf(-100000000L * (linenumber == 0 ? 1 : 0) + 10000000L * positions.size() + 1000000L * worddistance + 100000L * linelengthKey(sentence.length(), maxLength) - 10000L * linenumber + uniqCounter--), sentence); sentences_candidates.put(Long.valueOf(-100000000L * (linenumber == 0 ? 1 : 0) + 10000000L * found_positions.size() + 1000000L * worddistance + 100000L * linelengthKey(sentence.length(), maxLength) - 10000L * linenumber + uniqCounter--), sentence);
if (order.size() > 5) order.remove(order.firstEntry().getKey()); if (sentences_candidates.size() > 5) sentences_candidates.remove(sentences_candidates.firstEntry().getKey());
if (positions.size() == queryTerms.size()) fullmatchcounter++; if (found_positions.size() == queryTerms.size()) fullmatchcounter++;
if (fullmatchcounter >= 3) break lookup; if (fullmatchcounter >= 3) break lookup;
} }
linenumber++; linenumber++;
@ -70,8 +82,8 @@ public class SnippetExtractor {
StringBuilder sentence; StringBuilder sentence;
SnippetExtractor tsr; SnippetExtractor tsr;
while (!order.isEmpty()) { while (!sentences_candidates.isEmpty()) {
sentence = order.remove(order.lastKey()); // sentence with the biggest score sentence = sentences_candidates.remove(sentences_candidates.lastKey()); // sentence with the biggest score
try { try {
tsr = new SnippetExtractor(sentence.toString(), queryTerms, maxLength); tsr = new SnippetExtractor(sentence.toString(), queryTerms, maxLength);
} catch (final UnsupportedOperationException e) { } catch (final UnsupportedOperationException e) {
@ -90,7 +102,7 @@ public class SnippetExtractor {
maxLength = maxLength - this.snippetString.length(); maxLength = maxLength - this.snippetString.length();
if (maxLength < 20) maxLength = 20; if (maxLength < 20) maxLength = 20;
try { try {
tsr = new SnippetExtractor(order.values(), this.remainingTerms, maxLength); tsr = new SnippetExtractor(sentences_candidates.values(), this.remainingTerms, maxLength);
} catch (final UnsupportedOperationException e) { } catch (final UnsupportedOperationException e) {
throw e; throw e;
} }
@ -106,6 +118,7 @@ public class SnippetExtractor {
} }
} }
} }
throw new UnsupportedOperationException("no snippet computed"); throw new UnsupportedOperationException("no snippet computed");
} }
@ -134,7 +147,22 @@ public class SnippetExtractor {
term = j.next(); term = j.next();
pos = hs.get(term); pos = hs.get(term);
if (pos == null) { if (pos == null) {
remainingTerms.add(term); // try to find synonyms
Set<String> syms = SynonymLibrary.getSynonyms(term);
boolean found = false;
if (syms != null && syms.size() > 0) {
symsearch: for (String sym : syms) {
pos = hs.get(sym);
if (pos != null) {
p = pos.intValue();
if (p > maxpos) maxpos = p;
if (p < minpos) minpos = p;
found = true;
break symsearch;
}
}
}
if (!found) remainingTerms.add(term);
} else { } else {
p = pos.intValue(); p = pos.intValue();
if (p > maxpos) maxpos = p; if (p > maxpos) maxpos = p;