identified and fixed search performance problem caused by

snippet loading. Some accesses to the header-db were performed twice, and
in some cases even more often. Snippet resource loading fixed.
Furthermore, snippet loading on the remote peer during a remote search
has been disabled by default, but the requesting peer can switch it on
with the new flag 'includesnippet=true'

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2688 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
orbiter 2006-10-02 01:15:02 +00:00
parent 4d9e1b43dd
commit 00746ca232
3 changed files with 28 additions and 21 deletions

View File

@ -91,6 +91,7 @@ public final class search {
final int maxdist= post.getInt("maxdist", Integer.MAX_VALUE);
final String prefer = post.get("prefer", "");
final String filter = post.get("filter", ".*");
final boolean includesnippet = post.get("includesnippet", "false").equals("true");
// final boolean global = ((String) post.get("resource", "global")).equals("global"); // if true, then result may consist of answers from other peers
// Date remoteTime = yacyCore.parseUniversalDate((String) post.get(yacySeed.MYTIME)); // read remote time
@ -200,11 +201,15 @@ public final class search {
plasmaSnippetCache.Snippet snippet;
while ((acc.hasMoreElements()) && (i < squery.wantedResults)) {
urlentry = acc.nextElement();
snippet = sb.snippetCache.retrieveSnippet(urlentry.url(), squery.queryHashes, false, 260);
if (snippet.getSource() == plasmaSnippetCache.ERROR_NO_MATCH) {
if (includesnippet) {
snippet = sb.snippetCache.retrieveSnippet(urlentry.url(), squery.queryHashes, false, 260);
} else {
snippet = null;
}
if ((snippet != null) && (snippet.getSource() == plasmaSnippetCache.ERROR_NO_MATCH)) {
// suppress line: there is no match in that resource
} else {
if (snippet.exists()) {
if ((snippet != null) && (snippet.exists())) {
resource = urlentry.toString(snippet.getLineRaw());
} else {
resource = urlentry.toString();

View File

@ -192,7 +192,6 @@ public class plasmaSnippetCache {
try {
// trying to load the resource from the cache
resource = this.cacheManager.loadResourceContent(url);
docInfo = this.cacheManager.loadResourceInfo(url);
// if not found try to download it
if ((resource == null) && (fetchOnline)) {
@ -200,22 +199,21 @@ public class plasmaSnippetCache {
plasmaHTCache.Entry entry = loadResourceFromWeb(url, 5000);
// getting resource metadata (e.g. the http headers for http resources)
if (entry != null) {
docInfo = entry.getDocumentInfo();
}
if (entry != null) docInfo = entry.getDocumentInfo();
// now the resource should be stored in the cache, load body
resource = this.cacheManager.loadResourceContent(url);
// read resource body
resource = entry.cacheArray();
if (resource == null) {
//System.out.println("cannot load document for URL " + url);
return new Snippet(null, ERROR_RESOURCE_LOADING, "error loading resource from web, cacheManager returned NULL");
return new Snippet(null, ERROR_RESOURCE_LOADING, "error loading resource, plasmaHTCache.Entry cache is NULL");
}
source = SOURCE_WEB;
}
} catch (Exception e) {
if (!(e instanceof plasmaCrawlerException)) e.printStackTrace();
return new Snippet(null, ERROR_SOURCE_LOADING, "error loading resource from web: " + e.getMessage());
return new Snippet(null, ERROR_SOURCE_LOADING, "error loading resource: " + e.getMessage());
}
if (resource == null) return new Snippet(null, ERROR_SOURCE_LOADING, "no resource available");
/* ===========================================================================
* PARSING RESOURCE
@ -459,11 +457,12 @@ public class plasmaSnippetCache {
docInfo = this.cacheManager.loadResourceInfo(url);
} catch (Exception e) {
// ignore this. resource info loading failed
}
}
}
// TODO: we need a better solution here
// encapsulate this in the crawlLoader class
if (url.getProtocol().startsWith("http")) {
if ((docInfo == null) && (url.getProtocol().startsWith("http"))) {
// getting URL mimeType
try {
httpHeader header = httpc.whead(url, url.getHost(), 10000, null, null, this.sb.remoteProxyConfig);
@ -472,8 +471,6 @@ public class plasmaSnippetCache {
// ignore this. http header download failed
}
}
}
if (docInfo == null) {
String filename = this.cacheManager.getCachePath(url).getName();

View File

@ -2033,9 +2033,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String host, hash, address, descr = "";
yacySeed seed;
plasmaSnippetCache.Snippet snippet;
boolean includeSnippets = false;
String formerSearch = query.words(" ");
long targetTime = timestamp + query.maximumTime;
if (targetTime < System.currentTimeMillis()) targetTime = System.currentTimeMillis() + 5000;
if (targetTime < System.currentTimeMillis()) targetTime = System.currentTimeMillis() + 1000;
while ((acc.hasMoreElements()) && (i < query.wantedResults) && (System.currentTimeMillis() < targetTime)) {
urlentry = acc.nextElement();
url = urlentry.url();
@ -2076,8 +2077,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
//addScoreForked(ref, gs, urlstring.split("/"));
URL wordURL;
if (urlstring.matches(query.urlMask)) { //.* is default
snippet = snippetCache.retrieveSnippet(url, query.queryHashes, false, 260);
if (snippet.getSource() == plasmaSnippetCache.ERROR_NO_MATCH) {
if (includeSnippets) {
snippet = snippetCache.retrieveSnippet(url, query.queryHashes, false, 260);
} else {
snippet = null;
}
if ((snippet != null) && (snippet.getSource() == plasmaSnippetCache.ERROR_NO_MATCH)) {
// suppress line: there is no match in that resource
} else {
prop.put("type_results_" + i + "_recommend", (yacyCore.newsPool.getSpecific(yacyNewsPool.OUTGOING_DB, "stippadd", "url", urlstring) == null) ? 1 : 0);
@ -2097,7 +2102,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
((indexURL.probablyRootURL(urlhash)) ? ", probablyRootURL" : "") +
(((wordURL = indexURL.probablyWordURL(urlhash, query.words(""))) != null) ? ", probablyWordURL=" + wordURL.toNormalform() : ""));
// adding snippet if available
if (snippet.exists()) {
if ((snippet != null) && (snippet.exists())) {
prop.put("type_results_" + i + "_snippet", 1);
prop.put("type_results_" + i + "_snippet_text", snippet.getLineMarked(query.queryHashes));
} else {