mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
Enhanced snippet fetch: fixed a bug that caused documents to be
parsed even if a Solr text was available.
This commit is contained in:
parent
18f989dfb1
commit
9bece5ac5f
|
@ -99,6 +99,10 @@ public class URIMetadataNode implements URIMetadata {
|
|||
this.ranking = ranking;
|
||||
}
|
||||
|
||||
public SolrDocument getDocument() {
|
||||
return this.doc;
|
||||
}
|
||||
|
||||
private int getInt(YaCySchema field) {
|
||||
Integer x = (Integer) this.doc.getFieldValue(field.name());
|
||||
if (x == null) return 0;
|
||||
|
|
|
@ -35,7 +35,6 @@ import net.yacy.cora.document.ASCII;
|
|||
import net.yacy.cora.document.Classification;
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.cora.protocol.ResponseHeader;
|
||||
import net.yacy.cora.services.federated.solr.SolrConnector;
|
||||
import net.yacy.cora.services.federated.yacy.CacheStrategy;
|
||||
import net.yacy.cora.sorting.ScoreMap;
|
||||
import net.yacy.cora.sorting.WeakPriorityBlockingQueue;
|
||||
|
@ -45,6 +44,7 @@ import net.yacy.cora.storage.HandleSet;
|
|||
import net.yacy.cora.util.SpaceExceededException;
|
||||
import net.yacy.document.Condenser;
|
||||
import net.yacy.kelondro.data.meta.URIMetadata;
|
||||
import net.yacy.kelondro.data.meta.URIMetadataNode;
|
||||
import net.yacy.kelondro.data.word.Word;
|
||||
import net.yacy.kelondro.index.RowHandleSet;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
|
@ -54,6 +54,7 @@ import net.yacy.peers.graphics.ProfilingGraph;
|
|||
import net.yacy.repository.LoaderDispatcher;
|
||||
import net.yacy.search.EventTracker;
|
||||
import net.yacy.search.Switchboard;
|
||||
import net.yacy.search.index.MetadataRepository;
|
||||
import net.yacy.search.index.Segment;
|
||||
import net.yacy.search.snippet.MediaSnippet;
|
||||
import net.yacy.search.snippet.ResultEntry;
|
||||
|
@ -439,7 +440,7 @@ public class SnippetProcess {
|
|||
private final CacheStrategy cacheStrategy;
|
||||
private final int neededResults;
|
||||
private boolean shallrun;
|
||||
private final SolrConnector solr;
|
||||
private final MetadataRepository metadata;
|
||||
|
||||
public Worker(final long maxlifetime, final CacheStrategy cacheStrategy, final int neededResults) {
|
||||
this.cacheStrategy = cacheStrategy;
|
||||
|
@ -447,7 +448,7 @@ public class SnippetProcess {
|
|||
this.timeout = System.currentTimeMillis() + Math.max(1000, maxlifetime);
|
||||
this.neededResults = neededResults;
|
||||
this.shallrun = true;
|
||||
this.solr = SnippetProcess.this.rankingProcess.getQuery().getSegment().getSolr();
|
||||
this.metadata = SnippetProcess.this.rankingProcess.getQuery().getSegment().urlMetadata();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -496,16 +497,18 @@ public class SnippetProcess {
|
|||
|
||||
// in case that we have an attached solr, we load also the solr document
|
||||
String solrContent = null;
|
||||
if (this.solr != null) {
|
||||
SolrDocument sd = null;
|
||||
if (page instanceof URIMetadataNode) {
|
||||
sd = ((URIMetadataNode) page).getDocument();
|
||||
} else {
|
||||
try {
|
||||
sd = this.solr.get(ASCII.String(page.hash()));
|
||||
sd = this.metadata.getSolr().get(ASCII.String(page.hash()));
|
||||
} catch (IOException e) {
|
||||
Log.logException(e);
|
||||
}
|
||||
if (sd != null) {
|
||||
solrContent = Switchboard.getSwitchboard().index.getSolrScheme().solrGetText(sd);
|
||||
}
|
||||
if (sd != null) {
|
||||
solrContent = this.metadata.getSolrScheme().solrGetText(sd);
|
||||
}
|
||||
|
||||
resultEntry = fetchSnippet(page, solrContent, this.cacheStrategy); // does not fetch snippets if snippetMode == 0
|
||||
|
|
|
@ -179,7 +179,16 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
|
|||
{ //encapsulate potential expensive sentences
|
||||
Collection<StringBuilder> sentences = null;
|
||||
|
||||
// try the solr text first
|
||||
// try to get the snippet from metadata
|
||||
removeMatchingHashes(row.dc_title(), remainingHashes);
|
||||
removeMatchingHashes(row.dc_creator(), remainingHashes);
|
||||
removeMatchingHashes(row.dc_subject(), remainingHashes);
|
||||
removeMatchingHashes(row.url().toNormalform(true, true).replace('-', ' '), remainingHashes);
|
||||
|
||||
if (!remainingHashes.isEmpty()) {
|
||||
// we did not find everything in the metadata, look further into the document itself.
|
||||
|
||||
// first acquire the sentences:
|
||||
if (solrText != null) {
|
||||
// compute sentences from solr query
|
||||
final SentenceReader sr = new SentenceReader(solrText, pre);
|
||||
|
@ -187,29 +196,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
|
|||
while (sr.hasNext()) {
|
||||
sentences.add(sr.next());
|
||||
}
|
||||
|
||||
if (sentences != null) {
|
||||
try {
|
||||
final SnippetExtractor tsr = new SnippetExtractor(sentences, remainingHashes, snippetMaxLength);
|
||||
textline = tsr.getSnippet();
|
||||
remainingHashes = tsr.getRemainingWords();
|
||||
} catch (final UnsupportedOperationException e) {
|
||||
init(url.hash(), null, ResultClass.ERROR_NO_MATCH, "no matching snippet found");
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// try to get the snippet from metadata
|
||||
removeMatchingHashes(row.dc_title(), remainingHashes);
|
||||
removeMatchingHashes(row.dc_creator(), remainingHashes);
|
||||
removeMatchingHashes(row.dc_subject(), remainingHashes);
|
||||
removeMatchingHashes(row.url().toNormalform(true, true).replace('-', ' '), remainingHashes);
|
||||
|
||||
if (remainingHashes.isEmpty()) {
|
||||
// the snippet is fully inside the metadata!
|
||||
|
||||
if (de.anomic.crawler.Cache.has(url.hash())) {
|
||||
} else if (de.anomic.crawler.Cache.has(url.hash())) {
|
||||
// get the sentences from the cache
|
||||
final Request request = loader == null ? null : loader.request(url, true, reindexing);
|
||||
Response response;
|
||||
|
@ -227,11 +214,30 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (sentences == null) {
|
||||
// not found the snippet
|
||||
init(url.hash(), null, ResultClass.SOURCE_METADATA, null);
|
||||
return;
|
||||
}
|
||||
|
||||
if (sentences.size() > 0) {
|
||||
try {
|
||||
final SnippetExtractor tsr = new SnippetExtractor(sentences, remainingHashes, snippetMaxLength);
|
||||
textline = tsr.getSnippet();
|
||||
remainingHashes = tsr.getRemainingWords();
|
||||
} catch (final UnsupportedOperationException e) {
|
||||
init(url.hash(), null, ResultClass.ERROR_NO_MATCH, "no matching snippet found");
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (remainingHashes.isEmpty()) {
|
||||
// we found the snippet
|
||||
if (textline == null) {
|
||||
if (sentences == null) {
|
||||
textline = row.dc_subject();
|
||||
} else {
|
||||
// use the first lines from the text as snippet
|
||||
final StringBuilder s = new StringBuilder(snippetMaxLength);
|
||||
for (final StringBuilder t: sentences) {
|
||||
|
@ -239,7 +245,10 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
|
|||
if (s.length() >= snippetMaxLength / 4 * 3) break;
|
||||
}
|
||||
if (s.length() > snippetMaxLength) { s.setLength(snippetMaxLength); s.trimToSize(); }
|
||||
init(url.hash(), s.length() > 0 ? s.toString() : this.line, ResultClass.SOURCE_METADATA, null);
|
||||
textline = s.toString();
|
||||
}
|
||||
}
|
||||
init(url.hash(), textline.length() > 0 ? textline : this.line, ResultClass.SOURCE_METADATA, null);
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -313,7 +322,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
|
|||
//if (videoline != null) line += (line.isEmpty()) ? videoline : "<br />" + videoline;
|
||||
//if (appline != null) line += (line.isEmpty()) ? appline : "<br />" + appline;
|
||||
//if (hrefline != null) line += (line.isEmpty()) ? hrefline : "<br />" + hrefline;
|
||||
if (textline != null) snippetLine += (snippetLine.isEmpty()) ? textline : "<br />" + textline;
|
||||
//if (textline != null) snippetLine += (snippetLine.isEmpty()) ? textline : "<br />" + textline;
|
||||
|
||||
if (snippetLine == null || !remainingHashes.isEmpty()) {
|
||||
init(url.hash(), null, ResultClass.ERROR_NO_MATCH, "no matching snippet found");
|
||||
|
|
Loading…
Reference in New Issue
Block a user