more performance hacks

this makes YaCy search results VERY fast for all verify=false search cases
and it enhances the search speed also for all other snippet-fetch cases.
With this change my peer performed 100 Queries Per Second (!!!) while doing 10 queries simultaneously (!!!)
in an intranet index of 20000 URLs on my 16-core Mac

Check this yourself by doing:
cd bin
./searchtestmulti.sh
after finishing the run, divide 1000 by the given time per query (which is the qps for one thread)
and then multiply again by 10 (because 10 search threads have been started)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7231 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
orbiter 2010-10-09 08:55:57 +00:00
parent b8aee6d402
commit 0d363a94d7
15 changed files with 128 additions and 71 deletions

View File

@ -100,7 +100,6 @@ public class yacysearch {
String originalquerystring = (post == null) ? "" : post.get("query", post.get("search", "")).trim(); String originalquerystring = (post == null) ? "" : post.get("query", post.get("search", "")).trim();
String querystring = originalquerystring.replace('+', ' '); String querystring = originalquerystring.replace('+', ' ');
CrawlProfile.CacheStrategy snippetFetchStrategy = (post != null && post.get("verify", "false").equals("true")) ? CrawlProfile.CacheStrategy.IFFRESH : CrawlProfile.CacheStrategy.parse(post.get("verify", "cacheonly")); CrawlProfile.CacheStrategy snippetFetchStrategy = (post != null && post.get("verify", "false").equals("true")) ? CrawlProfile.CacheStrategy.IFFRESH : CrawlProfile.CacheStrategy.parse(post.get("verify", "cacheonly"));
if (snippetFetchStrategy == null) snippetFetchStrategy = CrawlProfile.CacheStrategy.CACHEONLY;
final serverObjects prop = new serverObjects(); final serverObjects prop = new serverObjects();
// get segment // get segment
@ -167,7 +166,7 @@ public class yacysearch {
// collect search attributes // collect search attributes
boolean newsearch = post.hasValue("query") && post.hasValue("former") && !post.get("query","").equalsIgnoreCase(post.get("former","")); //new search term boolean newsearch = post.hasValue("query") && post.hasValue("former") && !post.get("query","").equalsIgnoreCase(post.get("former","")); //new search term
int itemsPerPage = Math.min((authenticated) ? (snippetFetchStrategy.isAllowedToFetchOnline() ? 100 : 1000) : (snippetFetchStrategy.isAllowedToFetchOnline() ? 10 : 100), post.getInt("maximumRecords", post.getInt("count", 10))); // SRU syntax with old property as alternative int itemsPerPage = Math.min((authenticated) ? (snippetFetchStrategy != null && snippetFetchStrategy.isAllowedToFetchOnline() ? 100 : 1000) : (snippetFetchStrategy != null && snippetFetchStrategy.isAllowedToFetchOnline() ? 10 : 100), post.getInt("maximumRecords", post.getInt("count", 10))); // SRU syntax with old property as alternative
int offset = (newsearch) ? 0 : post.getInt("startRecord", post.getInt("offset", 0)); int offset = (newsearch) ? 0 : post.getInt("startRecord", post.getInt("offset", 0));
int newcount; int newcount;
@ -234,7 +233,7 @@ public class yacysearch {
boolean block = false; boolean block = false;
if (Domains.matchesList(client, sb.networkBlacklist)) { if (Domains.matchesList(client, sb.networkBlacklist)) {
global = false; global = false;
snippetFetchStrategy = CrawlProfile.CacheStrategy.CACHEONLY; if (snippetFetchStrategy != null) snippetFetchStrategy = null;
block = true; block = true;
Log.logWarning("LOCAL_SEARCH", "ACCESS CONTROL: BLACKLISTED CLIENT FROM " + client + " gets no permission to search"); Log.logWarning("LOCAL_SEARCH", "ACCESS CONTROL: BLACKLISTED CLIENT FROM " + client + " gets no permission to search");
} else if (Domains.matchesList(client, sb.networkWhitelist)) { } else if (Domains.matchesList(client, sb.networkWhitelist)) {
@ -254,9 +253,9 @@ public class yacysearch {
} }
} }
// protection against too many remote server snippet loads (protects traffic on server) // protection against too many remote server snippet loads (protects traffic on server)
if (snippetFetchStrategy.isAllowedToFetchOnline()) { if (snippetFetchStrategy != null && snippetFetchStrategy.isAllowedToFetchOnline()) {
if (accInTenMinutes >= 20 || accInOneMinute >= 4 || accInThreeSeconds >= 1) { if (accInTenMinutes >= 20 || accInOneMinute >= 4 || accInThreeSeconds >= 1) {
snippetFetchStrategy = CrawlProfile.CacheStrategy.CACHEONLY; snippetFetchStrategy = null;
Log.logWarning("LOCAL_SEARCH", "ACCESS CONTROL: CLIENT FROM " + client + ": " + accInOneSecond + "/1s, " + accInThreeSeconds + "/3s, " + accInOneMinute + "/60s, " + accInTenMinutes + "/600s, " + " requests, disallowed remote snippet loading"); Log.logWarning("LOCAL_SEARCH", "ACCESS CONTROL: CLIENT FROM " + client + ": " + accInOneSecond + "/1s, " + accInThreeSeconds + "/3s, " + accInOneMinute + "/60s, " + accInTenMinutes + "/600s, " + " requests, disallowed remote snippet loading");
} }
} }
@ -554,19 +553,7 @@ public class yacysearch {
suggestion = meanIt.next(); suggestion = meanIt.next();
prop.put("didYouMean_suggestions_"+meanCount+"_word", suggestion); prop.put("didYouMean_suggestions_"+meanCount+"_word", suggestion);
prop.put("didYouMean_suggestions_"+meanCount+"_url", prop.put("didYouMean_suggestions_"+meanCount+"_url",
"/yacysearch.html" + "?display=" + display + QueryParams.navurl("html", 0, display, theQuery, suggestion, originalUrlMask.toString(), theQuery.navigators)
"&query=" + suggestion +
"&maximumRecords="+ theQuery.displayResults() +
"&startRecord=" + (0 * theQuery.displayResults()) +
"&resource=" + ((theQuery.isLocal()) ? "local" : "global") +
"&verify=" + (theQuery.snippetCacheStrategy.mustBeOffline() ? "false" : "true") +
"&nav=" + theQuery.navigators +
"&urlmaskfilter=" + originalUrlMask.toString() +
"&prefermaskfilter=" + theQuery.prefer.toString() +
"&cat=href&constraint=" + ((theQuery.constraint == null) ? "" : theQuery.constraint.exportB64()) +
"&contentdom=" + theQuery.contentdom() +
"&former=" + theQuery.queryString(true) +
"&meanCount=" + meanMax
); );
prop.put("didYouMean_suggestions_"+meanCount+"_sep","|"); prop.put("didYouMean_suggestions_"+meanCount+"_sep","|");
meanCount++; meanCount++;
@ -624,7 +611,7 @@ public class yacysearch {
resnav.append("<img src=\"env/grafics/navdl.gif\" alt=\"arrowleft\" width=\"16\" height=\"16\" />&nbsp;"); resnav.append("<img src=\"env/grafics/navdl.gif\" alt=\"arrowleft\" width=\"16\" height=\"16\" />&nbsp;");
} else { } else {
resnav.append("<a id=\"prevpage\" href=\""); resnav.append("<a id=\"prevpage\" href=\"");
resnav.append(QueryParams.navurl("html", thispage - 1, display, theQuery, originalUrlMask, null, navigation)); resnav.append(QueryParams.navurl("html", thispage - 1, display, theQuery, null, originalUrlMask, navigation));
resnav.append("\"><img src=\"env/grafics/navdl.gif\" alt=\"arrowleft\" width=\"16\" height=\"16\" /></a>&nbsp;"); resnav.append("\"><img src=\"env/grafics/navdl.gif\" alt=\"arrowleft\" width=\"16\" height=\"16\" /></a>&nbsp;");
} }
final int numberofpages = Math.min(10, Math.max(1 + thispage, 1 + ((theSearch.getRankingResult().getLocalIndexCount() < 11) ? Math.max(30, theSearch.getRankingResult().getLocalResourceSize() + theSearch.getRankingResult().getRemoteResourceSize()) : theSearch.getRankingResult().getLocalIndexCount()) / theQuery.displayResults())); final int numberofpages = Math.min(10, Math.max(1 + thispage, 1 + ((theSearch.getRankingResult().getLocalIndexCount() < 11) ? Math.max(30, theSearch.getRankingResult().getLocalResourceSize() + theSearch.getRankingResult().getRemoteResourceSize()) : theSearch.getRankingResult().getLocalIndexCount()) / theQuery.displayResults()));
@ -637,7 +624,7 @@ public class yacysearch {
resnav.append("\" width=\"16\" height=\"16\" />&nbsp;"); resnav.append("\" width=\"16\" height=\"16\" />&nbsp;");
} else { } else {
resnav.append("<a href=\""); resnav.append("<a href=\"");
resnav.append(QueryParams.navurl("html", i, display, theQuery, originalUrlMask, null, navigation)); resnav.append(QueryParams.navurl("html", i, display, theQuery, null, originalUrlMask, navigation));
resnav.append("\"><img src=\"env/grafics/navd"); resnav.append("\"><img src=\"env/grafics/navd");
resnav.append(i + 1); resnav.append(i + 1);
resnav.append(".gif\" alt=\"page"); resnav.append(".gif\" alt=\"page");
@ -649,7 +636,7 @@ public class yacysearch {
resnav.append("<img src=\"env/grafics/navdr.gif\" alt=\"arrowright\" width=\"16\" height=\"16\" />"); resnav.append("<img src=\"env/grafics/navdr.gif\" alt=\"arrowright\" width=\"16\" height=\"16\" />");
} else { } else {
resnav.append("<a id=\"nextpage\" href=\""); resnav.append("<a id=\"nextpage\" href=\"");
resnav.append(QueryParams.navurl("html", thispage + 1, display, theQuery, originalUrlMask, null, navigation)); resnav.append(QueryParams.navurl("html", thispage + 1, display, theQuery, null, originalUrlMask, navigation));
resnav.append("\"><img src=\"env/grafics/navdr.gif\" alt=\"arrowright\" width=\"16\" height=\"16\" /></a>"); resnav.append("\"><img src=\"env/grafics/navdr.gif\" alt=\"arrowright\" width=\"16\" height=\"16\" /></a>");
} }
String resnavs = resnav.toString(); String resnavs = resnav.toString();
@ -705,7 +692,7 @@ public class yacysearch {
prop.putHTML("prefermaskfilter", prefermask); prop.putHTML("prefermaskfilter", prefermask);
prop.put("indexof", (indexof) ? "on" : "off"); prop.put("indexof", (indexof) ? "on" : "off");
prop.put("constraint", (constraint == null) ? "" : constraint.exportB64()); prop.put("constraint", (constraint == null) ? "" : constraint.exportB64());
prop.put("verify", snippetFetchStrategy.toName()); prop.put("verify", snippetFetchStrategy == null ? "false" : snippetFetchStrategy.toName());
prop.put("contentdom", (post == null ? "text" : post.get("contentdom", "text"))); prop.put("contentdom", (post == null ? "text" : post.get("contentdom", "text")));
prop.put("searchdomswitches", sb.getConfigBool("search.text", true) || sb.getConfigBool("search.audio", true) || sb.getConfigBool("search.video", true) || sb.getConfigBool("search.image", true) || sb.getConfigBool("search.app", true) ? 1 : 0); prop.put("searchdomswitches", sb.getConfigBool("search.text", true) || sb.getConfigBool("search.audio", true) || sb.getConfigBool("search.video", true) || sb.getConfigBool("search.image", true) || sb.getConfigBool("search.app", true) ? 1 : 0);
prop.put("searchdomswitches_searchtext", sb.getConfigBool("search.text", true) ? 1 : 0); prop.put("searchdomswitches_searchtext", sb.getConfigBool("search.text", true) ? 1 : 0);

View File

@ -72,8 +72,8 @@ public class yacysearchtrailer {
for (i = 0; i < Math.min(10, namespaceNavigator.size()); i++) { for (i = 0; i < Math.min(10, namespaceNavigator.size()); i++) {
entry = namespaceNavigator.get(i); entry = namespaceNavigator.get(i);
prop.put("nav-namespace_element_" + i + "_name", entry.name); prop.put("nav-namespace_element_" + i + "_name", entry.name);
prop.put("nav-namespace_element_" + i + "_url", "<a href=\"" + QueryParams.navurl("html", 0, display, theQuery, theQuery.urlMask.toString(), "inurl:" + entry.name, theQuery.navigators) + "\">" + entry.name + " (" + entry.count + ")</a>"); prop.put("nav-namespace_element_" + i + "_url", "<a href=\"" + QueryParams.navurl("html", 0, display, theQuery, theQuery.queryStringForUrl() + "+" + "inurl:" + entry.name, theQuery.urlMask.toString(), theQuery.navigators) + "\">" + entry.name + " (" + entry.count + ")</a>");
prop.putJSON("nav-namespace_element_" + i + "_url-json", QueryParams.navurl("json", 0, display, theQuery, theQuery.urlMask.toString(), "inurl:" + entry.name, theQuery.navigators)); prop.putJSON("nav-namespace_element_" + i + "_url-json", QueryParams.navurl("json", 0, display, theQuery, theQuery.queryStringForUrl() + "+" + "inurl:" + entry.name, theQuery.urlMask.toString(), theQuery.navigators));
prop.put("nav-namespace_element_" + i + "_count", entry.count); prop.put("nav-namespace_element_" + i + "_count", entry.count);
prop.put("nav-namespace_element_" + i + "_modifier", "inurl:" + entry.name); prop.put("nav-namespace_element_" + i + "_modifier", "inurl:" + entry.name);
prop.put("nav-namespace_element_" + i + "_nl", 1); prop.put("nav-namespace_element_" + i + "_nl", 1);
@ -94,8 +94,8 @@ public class yacysearchtrailer {
for (i = 0; i < Math.min(10, hostNavigator.size()); i++) { for (i = 0; i < Math.min(10, hostNavigator.size()); i++) {
entry = hostNavigator.get(i); entry = hostNavigator.get(i);
prop.put("nav-domains_element_" + i + "_name", entry.name); prop.put("nav-domains_element_" + i + "_name", entry.name);
prop.put("nav-domains_element_" + i + "_url", "<a href=\"" + QueryParams.navurl("html", 0, display, theQuery, theQuery.urlMask.toString(), "site:" + entry.name, theQuery.navigators) + "\">" + entry.name + " (" + entry.count + ")</a>"); prop.put("nav-domains_element_" + i + "_url", "<a href=\"" + QueryParams.navurl("html", 0, display, theQuery, theQuery.queryStringForUrl() + "+" + "site:" + entry.name, theQuery.urlMask.toString(), theQuery.navigators) + "\">" + entry.name + " (" + entry.count + ")</a>");
prop.putJSON("nav-domains_element_" + i + "_url-json", QueryParams.navurl("json", 0, display, theQuery, theQuery.urlMask.toString(), "site:" + entry.name, theQuery.navigators)); prop.putJSON("nav-domains_element_" + i + "_url-json", QueryParams.navurl("json", 0, display, theQuery, theQuery.queryStringForUrl() + "+" + "site:" + entry.name, theQuery.urlMask.toString(), theQuery.navigators));
prop.put("nav-domains_element_" + i + "_count", entry.count); prop.put("nav-domains_element_" + i + "_count", entry.count);
prop.put("nav-domains_element_" + i + "_modifier", "site:" + entry.name); prop.put("nav-domains_element_" + i + "_modifier", "site:" + entry.name);
prop.put("nav-domains_element_" + i + "_nl", 1); prop.put("nav-domains_element_" + i + "_nl", 1);
@ -118,8 +118,8 @@ public class yacysearchtrailer {
entry = authorNavigator.get(i); entry = authorNavigator.get(i);
anav = (entry.name.indexOf(' ') < 0) ? "author:" + entry.name : "author:'" + entry.name.replace(" ", "+") + "'"; anav = (entry.name.indexOf(' ') < 0) ? "author:" + entry.name : "author:'" + entry.name.replace(" ", "+") + "'";
prop.put("nav-authors_element_" + i + "_name", entry.name); prop.put("nav-authors_element_" + i + "_name", entry.name);
prop.put("nav-authors_element_" + i + "_url", "<a href=\"" + QueryParams.navurl("html", 0, display, theQuery, theQuery.urlMask.toString(), anav, theQuery.navigators) + "\">" + entry.name + " (" + entry.count + ")</a>"); prop.put("nav-authors_element_" + i + "_url", "<a href=\"" + QueryParams.navurl("html", 0, display, theQuery, theQuery.queryStringForUrl() + "+" + anav, theQuery.urlMask.toString(), theQuery.navigators) + "\">" + entry.name + " (" + entry.count + ")</a>");
prop.putJSON("nav-authors_element_" + i + "_url-json", QueryParams.navurl("json", 0, display, theQuery, theQuery.urlMask.toString(), anav, theQuery.navigators)); prop.putJSON("nav-authors_element_" + i + "_url-json", QueryParams.navurl("json", 0, display, theQuery, theQuery.queryStringForUrl() + "+" + anav, theQuery.urlMask.toString(), theQuery.navigators));
prop.put("nav-authors_element_" + i + "_count", entry.count); prop.put("nav-authors_element_" + i + "_count", entry.count);
prop.put("nav-authors_element_" + i + "_modifier", "author:'" + entry.name + "'"); prop.put("nav-authors_element_" + i + "_modifier", "author:'" + entry.name + "'");
prop.put("nav-authors_element_" + i + "_nl", 1); prop.put("nav-authors_element_" + i + "_nl", 1);
@ -144,8 +144,8 @@ public class yacysearchtrailer {
if (/*(theQuery == null) ||*/ (theQuery.queryString == null)) break; if (/*(theQuery == null) ||*/ (theQuery.queryString == null)) break;
if (e != null && e.name != null) { if (e != null && e.name != null) {
prop.putHTML("nav-topics_element_" + i + "_name", e.name); prop.putHTML("nav-topics_element_" + i + "_name", e.name);
prop.put("nav-topics_element_" + i + "_url", "<a href=\"" + QueryParams.navurl("html", 0, display, theQuery, theQuery.urlMask.toString(), e.name, theQuery.navigators) + "\">" + e.name + " (" + e.count + ")</a>"); prop.put("nav-topics_element_" + i + "_url", "<a href=\"" + QueryParams.navurl("html", 0, display, theQuery, theQuery.queryStringForUrl() + "+" + e.name, theQuery.urlMask.toString(), theQuery.navigators) + "\">" + e.name + " (" + e.count + ")</a>");
prop.putJSON("nav-topics_element_" + i + "_url-json", QueryParams.navurl("json", 0, display, theQuery, theQuery.urlMask.toString(), e.name, theQuery.navigators)); prop.putJSON("nav-topics_element_" + i + "_url-json", QueryParams.navurl("json", 0, display, theQuery, theQuery.queryStringForUrl() + "+" + e.name, theQuery.urlMask.toString(), theQuery.navigators));
prop.put("nav-topics_element_" + i + "_count", e.count); prop.put("nav-topics_element_" + i + "_count", e.count);
prop.put("nav-topics_element_" + i + "_modifier", e.name); prop.put("nav-topics_element_" + i + "_modifier", e.name);
prop.put("nav-topics_element_" + i + "_nl", (iter.hasNext() && i < MAX_TOPWORDS) ? 1 : 0); prop.put("nav-topics_element_" + i + "_nl", (iter.hasNext() && i < MAX_TOPWORDS) ? 1 : 0);

View File

@ -316,12 +316,13 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return NOCACHE; return NOCACHE;
} }
public static CacheStrategy parse(String name) { public static CacheStrategy parse(String name) {
if (name == null) return null;
if (name.equals("nocache")) return NOCACHE; if (name.equals("nocache")) return NOCACHE;
if (name.equals("iffresh")) return IFFRESH; if (name.equals("iffresh")) return IFFRESH;
if (name.equals("ifexist")) return IFEXIST; if (name.equals("ifexist")) return IFEXIST;
if (name.equals("cacheonly")) return CACHEONLY; if (name.equals("cacheonly")) return CACHEONLY;
if (name.equals("true")) return IFFRESH; if (name.equals("true")) return IFFRESH;
if (name.equals("false")) return CACHEONLY; if (name.equals("false")) return null;
return null; return null;
} }
public String toName() { public String toName() {

View File

@ -60,6 +60,8 @@ public final class QueryParams {
public static final int SEARCHDOM_GLOBALDHT = 3; public static final int SEARCHDOM_GLOBALDHT = 3;
public static final int SEARCHDOM_GLOBALALL = 4; public static final int SEARCHDOM_GLOBALALL = 4;
private static final String ampersand = "&amp;";
public static enum FetchMode { public static enum FetchMode {
NO_FETCH_NO_VERIFY, NO_FETCH_NO_VERIFY,
FETCH_BUT_ACCEPT_OFFLINE_OR_USE_CACHE, FETCH_BUT_ACCEPT_OFFLINE_OR_USE_CACHE,
@ -137,7 +139,7 @@ public final class QueryParams {
this.domMaxTargets = 0; this.domMaxTargets = 0;
this.constraint = constraint; this.constraint = constraint;
this.allofconstraint = false; this.allofconstraint = false;
this.snippetCacheStrategy = CrawlProfile.CacheStrategy.CACHEONLY; this.snippetCacheStrategy = null;
this.host = null; this.host = null;
this.sitehash = null; this.sitehash = null;
this.authorhash = null; this.authorhash = null;
@ -453,9 +455,9 @@ public final class QueryParams {
* @param addToQuery * @param addToQuery
* @return * @return
*/ */
public static String navurl(final String ext, final int page, final int display, final QueryParams theQuery, final String originalUrlMask, final String addToQuery, final String nav) { public static String navurl(
final String ext, final int page, final int display, final QueryParams theQuery,
final String ampersand = "&amp;"; String newQueryString, final String originalUrlMask, final String nav) {
final StringBuilder sb = new StringBuilder(); final StringBuilder sb = new StringBuilder();
sb.append("/yacysearch."); sb.append("/yacysearch.");
@ -465,8 +467,7 @@ public final class QueryParams {
sb.append(ampersand); sb.append(ampersand);
sb.append("query="); sb.append("query=");
sb.append(theQuery.queryStringForUrl()); sb.append(newQueryString == null ? theQuery.queryStringForUrl() : newQueryString);
sb.append((addToQuery == null) ? "" : "+" + addToQuery);
sb.append(ampersand); sb.append(ampersand);
sb.append("maximumRecords="); sb.append("maximumRecords=");
@ -482,7 +483,7 @@ public final class QueryParams {
sb.append(ampersand); sb.append(ampersand);
sb.append("verify="); sb.append("verify=");
sb.append(theQuery.snippetCacheStrategy.mustBeOffline() ? "false" : "true"); sb.append(theQuery.snippetCacheStrategy == null ? "false" : theQuery.snippetCacheStrategy.toName());
sb.append(ampersand); sb.append(ampersand);
sb.append("nav="); sb.append("nav=");

View File

@ -34,6 +34,7 @@ import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.Semaphore; import java.util.concurrent.Semaphore;
import net.yacy.document.Condenser; import net.yacy.document.Condenser;
import net.yacy.document.LargeNumberCache;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceRow; import net.yacy.kelondro.data.word.WordReferenceRow;
@ -155,7 +156,7 @@ public class ReferenceOrder {
if (count == null) { if (count == null) {
doms0.put(dom, int1); doms0.put(dom, int1);
} else { } else {
doms0.put(dom, Integer.valueOf(count.intValue() + 1)); doms0.put(dom, LargeNumberCache.valueOf(count.intValue() + 1));
} }
} }

View File

@ -43,7 +43,6 @@ import net.yacy.kelondro.util.EventTracker;
import net.yacy.repository.LoaderDispatcher; import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.CrawlProfile.CacheStrategy;
import de.anomic.search.MediaSnippet; import de.anomic.search.MediaSnippet;
import de.anomic.yacy.yacySeedDB; import de.anomic.yacy.yacySeedDB;
import de.anomic.yacy.graphics.ProfilingGraph; import de.anomic.yacy.graphics.ProfilingGraph;
@ -187,7 +186,7 @@ public class ResultFetcher {
if (failedURLs.has(page.hash())) continue; if (failedURLs.has(page.hash())) continue;
loops++; loops++;
final ResultEntry resultEntry = fetchSnippet(page, query.sitehash == null ? cacheStrategy : CacheStrategy.CACHEONLY); // does not fetch snippets if snippetMode == 0 final ResultEntry resultEntry = fetchSnippet(page, cacheStrategy); // does not fetch snippets if snippetMode == 0
if (resultEntry == null) continue; // the entry had some problems, cannot be used if (resultEntry == null) continue; // the entry had some problems, cannot be used
//if (result.contains(resultEntry)) continue; //if (result.contains(resultEntry)) continue;

View File

@ -36,6 +36,7 @@ import java.util.TreeSet;
import java.util.concurrent.Semaphore; import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import net.yacy.document.LargeNumberCache;
import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.index.HandleSet; import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.index.RowSpaceExceededException;
@ -178,7 +179,7 @@ public final class SearchEvent {
mindhtdistance = l; mindhtdistance = l;
IAneardhthash = wordhash; IAneardhthash = wordhash;
} }
IACount.put(wordhash, Integer.valueOf(container.size())); IACount.put(wordhash, LargeNumberCache.valueOf(container.size()));
IAResults.put(wordhash, ReferenceContainer.compressIndex(container, null, 1000).toString()); IAResults.put(wordhash, ReferenceContainer.compressIndex(container, null, 1000).toString());
} }
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(query.id(true), Type.ABSTRACTS, "", this.rankingProcess.searchContainerMap().size(), System.currentTimeMillis() - timer), false); EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(query.id(true), Type.ABSTRACTS, "", this.rankingProcess.searchContainerMap().size(), System.currentTimeMillis() - timer), false);

View File

@ -762,16 +762,22 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
defaultPort = true; defaultPort = true;
} }
final String urlPath = this.getFile(excludeReference, removeSessionID); final String urlPath = this.getFile(excludeReference, removeSessionID);
StringBuilder u = new StringBuilder(80);
if (defaultPort) { u.append(this.protocol);
return u.append("://");
this.protocol + "://" + if (this.getHost() != null) {
((this.getHost() == null) ? "" : ((this.userInfo != null) ? (this.userInfo + "@") : ("")) + this.getHost().toLowerCase()) + if (this.userInfo != null) {
urlPath; u.append(this.userInfo);
u.append("@");
} }
return this.protocol + "://" + u.append(this.getHost().toLowerCase());
((this.userInfo != null) ? (this.userInfo + "@") : ("")) + }
this.getHost().toLowerCase() + ((defaultPort) ? ("") : (":" + this.port)) + urlPath; if (!defaultPort) {
u.append(":");
u.append(this.port);
}
u.append(urlPath);
return u.toString();
} }
public int hashCode() { public int hashCode() {

View File

@ -30,6 +30,7 @@ import java.io.InputStream;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.io.Reader; import java.io.Reader;
import java.io.UnsupportedEncodingException; import java.io.UnsupportedEncodingException;
import java.text.NumberFormat;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Enumeration; import java.util.Enumeration;
import java.util.HashMap; import java.util.HashMap;
@ -87,6 +88,11 @@ public final class Condenser {
public static final int flag_cat_hasapp = 23; // the page refers to (at least one) application file public static final int flag_cat_hasapp = 23; // the page refers to (at least one) application file
private final static int numlength = 5; private final static int numlength = 5;
private final static NumberFormat intStringFormatter = NumberFormat.getIntegerInstance();
static {
intStringFormatter.setMinimumIntegerDigits(numlength);
intStringFormatter.setMaximumIntegerDigits(numlength);
}
//private Properties analysis; //private Properties analysis;
private Map<String, Word> words; // a string (the words) to (indexWord) - relation private Map<String, Word> words; // a string (the words) to (indexWord) - relation
@ -97,7 +103,7 @@ public final class Condenser {
public int RESULT_NUMB_SENTENCES = -1; public int RESULT_NUMB_SENTENCES = -1;
public int RESULT_DIFF_SENTENCES = -1; public int RESULT_DIFF_SENTENCES = -1;
public Bitfield RESULT_FLAGS = new Bitfield(4); public Bitfield RESULT_FLAGS = new Bitfield(4);
Identificator languageIdentificator; private Identificator languageIdentificator;
public Condenser( public Condenser(
final Document document, final Document document,
@ -268,12 +274,6 @@ public final class Condenser {
return this.languageIdentificator.getLanguage(); return this.languageIdentificator.getLanguage();
} }
public String intString(final int number, final int length) {
String s = Integer.toString(number);
while (s.length() < length) s = "0" + s;
return s;
}
private void createCondensement(final InputStream is) throws UnsupportedEncodingException { private void createCondensement(final InputStream is) throws UnsupportedEncodingException {
final HashSet<String> currsentwords = new HashSet<String>(); final HashSet<String> currsentwords = new HashSet<String>();
StringBuilder sentence = new StringBuilder(100); StringBuilder sentence = new StringBuilder(100);
@ -357,7 +357,7 @@ public final class Condenser {
} }
words.put(word, wsp); words.put(word, wsp);
// we now have the unique handle of the word, put it into the sentence: // we now have the unique handle of the word, put it into the sentence:
sentence.append(intString(wordHandle, numlength)); sentence.append(intStringFormatter.format(wordHandle));
wordInSentenceCounter++; wordInSentenceCounter++;
} }
} }
@ -389,7 +389,7 @@ public final class Condenser {
wc = (sentence.length() - 1) / numlength; wc = (sentence.length() - 1) / numlength;
s = new String[wc + 2]; s = new String[wc + 2];
psp = sentences.get(sentence); psp = sentences.get(sentence);
s[0] = intString(psp.occurrences(), numlength); // number of occurrences of this sentence s[0] = intStringFormatter.format(psp.occurrences()); // number of occurrences of this sentence
s[1] = sentence.substring(0, 1); // the termination symbol of this sentence s[1] = sentence.substring(0, 1); // the termination symbol of this sentence
for (int i = 0; i < wc; i++) { for (int i = 0; i < wc; i++) {
k = sentence.substring(i * numlength + 1, (i + 1) * numlength + 1); k = sentence.substring(i * numlength + 1, (i + 1) * numlength + 1);
@ -422,8 +422,8 @@ public final class Condenser {
idx = it1.next().intValue(); // number of a sentence idx = it1.next().intValue(); // number of a sentence
s = (String[]) orderedSentences[idx]; s = (String[]) orderedSentences[idx];
for (int j = 2; j < s.length; j++) { for (int j = 2; j < s.length; j++) {
if (s[j].equals(intString(wsp.posInText, numlength))) if (s[j].equals(intStringFormatter.format(wsp.posInText)))
s[j] = intString(wsp1.posInText, numlength); s[j] = intStringFormatter.format(wsp1.posInText);
} }
orderedSentences[idx] = s; orderedSentences[idx] = s;
} }
@ -479,7 +479,7 @@ public final class Condenser {
hash = Word.word2hash(word.toString()); hash = Word.word2hash(word.toString());
// don't overwrite old values, that leads to too far word distances // don't overwrite old values, that leads to too far word distances
oldpos = map.put(hash, Integer.valueOf(pos)); oldpos = map.put(hash, LargeNumberCache.valueOf(pos));
if (oldpos != null) map.put(hash, oldpos); if (oldpos != null) map.put(hash, oldpos);
pos += word.length() + 1; pos += word.length() + 1;

View File

@ -0,0 +1,57 @@
/**
* LargeNumberCache.java
* Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
* First released 09.10.2010 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.document;
/**
 * A LargeNumberCache can be used wherever Integer.valueOf(int i) would be used.
 * The Integer java class provides a cache for values from -128 to +127
 * which is not enough for the parser to organize word positions in texts.
 * Using this larger cache the parser has a lower memory allocation and is faster.
 */
public class LargeNumberCache {

    /** Exclusive upper bound of the cached range: the values 0 .. integerCacheLimit - 1 are pre-allocated. */
    private static final int integerCacheLimit = 3000;

    /** Pre-allocated Integer instances; the element at index i holds the value i. */
    private static final Integer[] integerCache;

    // fill the cache once when the class is loaded
    static {
        integerCache = new Integer[integerCacheLimit];
        for (int i = 0; i < integerCache.length; i++) {
            // Integer.valueOf replaces the deprecated Integer(int) constructor
            integerCache[i] = Integer.valueOf(i);
        }
    }

    /**
     * Returns an Integer instance representing the specified int value.
     * Values in the range 0 .. integerCacheLimit - 1 are served from the
     * pre-allocated cache of this class, so repeated calls with the same
     * argument return the same instance. All other values (negative ones
     * and those at or above the cache limit) are delegated to
     * {@link Integer#valueOf(int)}.
     *
     * @param i an int value.
     * @return an Integer instance representing i.
     */
    public static final Integer valueOf(final int i) {
        if (i < 0 || i >= integerCacheLimit) {
            // outside of our own cache: let the JDK decide whether to cache or allocate
            return Integer.valueOf(i);
        }
        return integerCache[i];
    }
}

View File

@ -48,7 +48,7 @@ public class Phrase {
} }
public void check(final int i) { public void check(final int i) {
hash.add(Integer.valueOf(i)); hash.add(LargeNumberCache.valueOf(i));
} }

View File

@ -43,7 +43,8 @@ public class SnippetExtractor {
Integer pos; Integer pos;
TreeSet<Integer> positions; TreeSet<Integer> positions;
int linenumber = 0; int linenumber = 0;
for (StringBuilder sentence: sentences) { int fullmatchcounter = 0;
lookup: for (StringBuilder sentence: sentences) {
hs = Condenser.hashSentence(sentence.toString()); hs = Condenser.hashSentence(sentence.toString());
positions = new TreeSet<Integer>(); positions = new TreeSet<Integer>();
for (byte[] word: queryhashes) { for (byte[] word: queryhashes) {
@ -61,6 +62,8 @@ public class SnippetExtractor {
if (positions.size() > 0) { if (positions.size() > 0) {
order.put(Long.valueOf(-100000000L * (linenumber == 0 ? 1 : 0) + 10000000L * positions.size() + 1000000L * worddistance + 100000L * linelengthKey(sentence.length(), maxLength) - 10000L * linenumber + uniqCounter--), sentence); order.put(Long.valueOf(-100000000L * (linenumber == 0 ? 1 : 0) + 10000000L * positions.size() + 1000000L * worddistance + 100000L * linelengthKey(sentence.length(), maxLength) - 10000L * linenumber + uniqCounter--), sentence);
if (order.size() > 5) order.remove(order.firstEntry().getKey()); if (order.size() > 5) order.remove(order.firstEntry().getKey());
if (positions.size() == queryhashes.size()) fullmatchcounter++;
if (fullmatchcounter >= 3) break lookup;
} }
linenumber++; linenumber++;
} }

View File

@ -92,7 +92,7 @@ public class swfParser extends AbstractParser implements Parser {
while ((urlStart = contents.indexOf("http://",urlEnd)) >= 0){ while ((urlStart = contents.indexOf("http://",urlEnd)) >= 0){
urlEnd = contents.indexOf(linebreak,urlStart); urlEnd = contents.indexOf(linebreak,urlStart);
url = contents.substring(urlStart,urlEnd); url = contents.substring(urlStart,urlEnd);
urlnr = (Integer.valueOf(++urls)).toString(); urlnr = Integer.toString(++urls).toString();
anchors.put(new MultiProtocolURI(url), urlnr); anchors.put(new MultiProtocolURI(url), urlnr);
contents = contents.substring(0,urlStart)+contents.substring(urlEnd); contents = contents.substring(0,urlStart)+contents.substring(urlEnd);
} }

View File

@ -33,6 +33,7 @@ import java.util.Set;
import net.yacy.cora.storage.ARC; import net.yacy.cora.storage.ARC;
import net.yacy.cora.storage.ConcurrentARC; import net.yacy.cora.storage.ConcurrentARC;
import net.yacy.document.LargeNumberCache;
import net.yacy.kelondro.index.HandleSet; import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
@ -54,7 +55,7 @@ public class Word {
*/ */
public static final int commonHashLength = 12; public static final int commonHashLength = 12;
private static final int hashCacheSize = Math.max(10000, Math.min(100000, (int) (MemoryControl.available() / 20000L))); private static final int hashCacheSize = Math.max(100000, Math.min(10000000, (int) (MemoryControl.available() / 20000L)));
private static final ARC<String, byte[]> hashCache = new ConcurrentARC<String, byte[]>(hashCacheSize, Runtime.getRuntime().availableProcessors() + 1); private static final ARC<String, byte[]> hashCache = new ConcurrentARC<String, byte[]>(hashCacheSize, Runtime.getRuntime().availableProcessors() + 1);
// object carries statistics for words and sentences // object carries statistics for words and sentences
@ -83,7 +84,7 @@ public class Word {
} }
public void check(final int i) { public void check(final int i) {
phrases.add(Integer.valueOf(i)); phrases.add(LargeNumberCache.valueOf(i));
} }
public Iterator<Integer> phrases() { public Iterator<Integer> phrases() {

View File

@ -50,7 +50,7 @@ import net.yacy.kelondro.logging.Log;
public class Digest { public class Digest {
private final static int digestThreads = Runtime.getRuntime().availableProcessors() + 1; private final static int digestThreads = Runtime.getRuntime().availableProcessors() * 2 + 1;
public static BlockingQueue<MessageDigest> digestPool = new ArrayBlockingQueue<MessageDigest>(digestThreads); public static BlockingQueue<MessageDigest> digestPool = new ArrayBlockingQueue<MessageDigest>(digestThreads);
static { static {
for (int i = 0; i < digestThreads; i++) for (int i = 0; i < digestThreads; i++)