removed exist() retrieval functions from the error cache and replaced them

with metadata retrieval from the connectors directly. This should improve
usage of the cache. Also automatically increase the metadata cache size when
more memory is available.
This commit is contained in:
Michael Peter Christen 2014-07-11 19:52:25 +02:00
parent 62c72360ee
commit b5fc2b63ea
6 changed files with 32 additions and 19 deletions

View File

@ -59,7 +59,6 @@ import net.yacy.search.index.Fulltext;
import net.yacy.search.index.Segment.ReferenceReport; import net.yacy.search.index.Segment.ReferenceReport;
import net.yacy.search.index.Segment.ReferenceReportCache; import net.yacy.search.index.Segment.ReferenceReportCache;
import net.yacy.search.query.QueryParams; import net.yacy.search.query.QueryParams;
import net.yacy.search.schema.CollectionConfiguration.FailDoc;
import net.yacy.search.schema.CollectionSchema; import net.yacy.search.schema.CollectionSchema;
import net.yacy.server.serverObjects; import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch; import net.yacy.server.serverSwitch;
@ -466,7 +465,7 @@ public class HostBrowser {
prop.putHTML("files_list_" + c + "_type_admin", admin ? "true" : "false"); prop.putHTML("files_list_" + c + "_type_admin", admin ? "true" : "false");
StoreType type = (StoreType) entry.getValue(); StoreType type = (StoreType) entry.getValue();
try {uri = new DigestURL(entry.getKey());} catch (final MalformedURLException e) {uri = null;} try {uri = new DigestURL(entry.getKey());} catch (final MalformedURLException e) {uri = null;}
HarvestProcess process = uri == null ? null : sb.crawlQueues.exists(uri.hash(), true); HarvestProcess process = uri == null ? null : sb.crawlQueues.exists(uri.hash()); // todo: cannot identify errors
boolean loading = load.equals(entry.getKey()) || (process != null && process != HarvestProcess.ERRORS); boolean loading = load.equals(entry.getKey()) || (process != null && process != HarvestProcess.ERRORS);
boolean error = process == HarvestProcess.ERRORS || type == StoreType.EXCLUDED || type == StoreType.FAILED; boolean error = process == HarvestProcess.ERRORS || type == StoreType.EXCLUDED || type == StoreType.FAILED;
boolean dc = type != StoreType.INDEX && !error && !loading && list.containsKey(entry.getKey() + "/"); boolean dc = type != StoreType.INDEX && !error && !loading && list.containsKey(entry.getKey() + "/");
@ -482,8 +481,8 @@ public class HostBrowser {
FailType failType = errorDocs.get(entry.getKey()); FailType failType = errorDocs.get(entry.getKey());
if (failType == null) { if (failType == null) {
// maybe this is only in the errorURL // maybe this is only in the errorURL
FailDoc faildoc = sb.crawlQueues.errorURL.get(ASCII.String(uri.hash())); //Metadata faildoc = sb.index.fulltext().getDefaultConnector().getMetadata(ASCII.String(uri.hash()));
prop.putHTML("files_list_" + c + "_type_stored_error", process == HarvestProcess.ERRORS && faildoc != null ? faildoc.getFailReason() : "unknown error"); prop.putHTML("files_list_" + c + "_type_stored_error", "unknown error");
} else { } else {
String ids = ASCII.String(uri.hash()); String ids = ASCII.String(uri.hash());
InfoCacheEntry ice = infoCache.get(ids); InfoCacheEntry ice = infoCache.get(ids);

View File

@ -32,6 +32,7 @@ import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector;
import net.yacy.cora.federate.solr.connector.MirrorSolrConnector; import net.yacy.cora.federate.solr.connector.MirrorSolrConnector;
import net.yacy.cora.federate.solr.connector.RemoteSolrConnector; import net.yacy.cora.federate.solr.connector.RemoteSolrConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector; import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.kelondro.util.MemoryControl;
public class InstanceMirror { public class InstanceMirror {
@ -165,7 +166,8 @@ public class InstanceMirror {
if (msc != null) return msc; if (msc != null) return msc;
EmbeddedSolrConnector esc = getEmbeddedConnector(corename); EmbeddedSolrConnector esc = getEmbeddedConnector(corename);
RemoteSolrConnector rsc = getRemoteConnector(corename); RemoteSolrConnector rsc = getRemoteConnector(corename);
msc = new ConcurrentUpdateSolrConnector(new MirrorSolrConnector(esc, rsc), RemoteInstance.queueSizeByMemory(), 10000, Runtime.getRuntime().availableProcessors()); int cacheSize = (int) (MemoryControl.available() / 30000); // will return about 10000 for standard ram size
msc = new ConcurrentUpdateSolrConnector(new MirrorSolrConnector(esc, rsc), RemoteInstance.queueSizeByMemory(), cacheSize, Runtime.getRuntime().availableProcessors());
//msc = new MirrorSolrConnector(esc, rsc); //msc = new MirrorSolrConnector(esc, rsc);
this.mirrorConnectorCache.put(corename, msc); this.mirrorConnectorCache.put(corename, msc);
return msc; return msc;

View File

@ -43,6 +43,7 @@ import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.FailCategory; import net.yacy.cora.federate.solr.FailCategory;
import net.yacy.cora.federate.solr.connector.SolrConnector.Metadata;
import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.ftp.FTPClient; import net.yacy.cora.protocol.ftp.FTPClient;
@ -60,7 +61,6 @@ import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.repository.FilterEngine; import net.yacy.repository.FilterEngine;
import net.yacy.search.Switchboard; import net.yacy.search.Switchboard;
import net.yacy.search.index.Segment; import net.yacy.search.index.Segment;
import net.yacy.search.schema.CollectionConfiguration;
public final class CrawlStacker { public final class CrawlStacker {
@ -379,22 +379,27 @@ public final class CrawlStacker {
public String checkAcceptanceInitially(final DigestURL url, final CrawlProfile profile) { public String checkAcceptanceInitially(final DigestURL url, final CrawlProfile profile) {
// check if the url is double registered // check if the url is double registered
final HarvestProcess dbocc = this.nextQueue.exists(url.hash(), false); // returns the name of the queue if entry exists final HarvestProcess dbocc = this.nextQueue.exists(url.hash()); // returns the name of the queue if entry exists
if (dbocc != null) { if (dbocc != null) {
return "double in: " + dbocc.name(); return "double in: " + dbocc.name();
} }
String urlhash = ASCII.String(url.hash()); String urlhash = ASCII.String(url.hash());
final CollectionConfiguration.FailDoc errorEntry = this.nextQueue.errorURL.get(urlhash); Metadata oldEntry = null;
final Date oldDate = errorEntry == null ? null : errorEntry.getFailDate(); try {
oldEntry = this.indexSegment.fulltext().getDefaultConnector().getMetadata(urlhash);
} catch (IOException e) {
ConcurrentLog.logException(e);
}
final Long oldDate = oldEntry == null ? null : oldEntry.date;
if (oldDate == null) { if (oldDate == null) {
return null; // no evidence that we know that url return null; // no evidence that we know that url
} }
final boolean recrawl = profile.recrawlIfOlder() > oldDate.getTime(); final boolean recrawl = profile.recrawlIfOlder() > oldDate.longValue();
final String urlstring = url.toString(); final String urlstring = url.toString();
if (recrawl) { if (recrawl) {
if (CrawlStacker.log.isInfo()) if (CrawlStacker.log.isInfo())
CrawlStacker.log.info("RE-CRAWL of URL '" + urlstring + "': this url was crawled " + CrawlStacker.log.info("RE-CRAWL of URL '" + urlstring + "': this url was crawled " +
((System.currentTimeMillis() - oldDate.getTime()) / 60000 / 60 / 24) + " days ago."); ((System.currentTimeMillis() - oldDate.longValue()) / 60000 / 60 / 24) + " days ago.");
} else { } else {
return "double in: LURL-DB, oldDate = " + oldDate.toString(); return "double in: LURL-DB, oldDate = " + oldDate.toString();
} }

View File

@ -143,7 +143,7 @@ public class CrawlQueues {
* @param hash * @param hash
* @return if the hash exists, the name of the database is returned, otherwise null is returned * @return if the hash exists, the name of the database is returned, otherwise null is returned
*/ */
public HarvestProcess exists(final byte[] hash, final boolean checkErrorCache) { public HarvestProcess exists(final byte[] hash) {
if (this.delegatedURL.containsKey(ASCII.String(hash))) { if (this.delegatedURL.containsKey(ASCII.String(hash))) {
return HarvestProcess.DELEGATED; return HarvestProcess.DELEGATED;
} }
@ -155,9 +155,6 @@ public class CrawlQueues {
return HarvestProcess.WORKER; return HarvestProcess.WORKER;
} }
} }
if (checkErrorCache && this.errorURL.exists(hash)) {
return HarvestProcess.ERRORS;
}
return null; return null;
} }

View File

@ -98,6 +98,7 @@ import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.FailCategory; import net.yacy.cora.federate.solr.FailCategory;
import net.yacy.cora.federate.solr.Ranking; import net.yacy.cora.federate.solr.Ranking;
import net.yacy.cora.federate.solr.SchemaConfiguration; import net.yacy.cora.federate.solr.SchemaConfiguration;
import net.yacy.cora.federate.solr.connector.SolrConnector.Metadata;
import net.yacy.cora.federate.solr.instance.RemoteInstance; import net.yacy.cora.federate.solr.instance.RemoteInstance;
import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.Base64Order;
@ -1616,7 +1617,16 @@ public final class Switchboard extends serverSwitch {
*/ */
public HarvestProcess urlExists(final String hash) { public HarvestProcess urlExists(final String hash) {
if (this.index.getLoadTime(hash) >= 0) return HarvestProcess.LOADED; if (this.index.getLoadTime(hash) >= 0) return HarvestProcess.LOADED;
return this.crawlQueues.exists(ASCII.getBytes(hash), true); HarvestProcess hp = this.crawlQueues.exists(ASCII.getBytes(hash));
if (hp != null) return hp;
try {
Metadata md = this.index.fulltext().getDefaultConnector().getMetadata(hash);
if (md == null) return null;
return HarvestProcess.LOADED; // todo: can also be in error
} catch (IOException e) {
ConcurrentLog.logException(e);
return null;
}
} }
public void urlRemove(final Segment segment, final byte[] hash) { public void urlRemove(final Segment segment, final byte[] hash) {

View File

@ -168,6 +168,7 @@ public class ErrorCache {
return l; return l;
} }
/*
public CollectionConfiguration.FailDoc get(final String urlhash) { public CollectionConfiguration.FailDoc get(final String urlhash) {
CollectionConfiguration.FailDoc failDoc = null; CollectionConfiguration.FailDoc failDoc = null;
synchronized (this.cache) { synchronized (this.cache) {
@ -185,7 +186,6 @@ public class ErrorCache {
return null; return null;
} }
} }
public boolean exists(final byte[] urlHash) { public boolean exists(final byte[] urlHash) {
String urlHashString = ASCII.String(urlHash); String urlHashString = ASCII.String(urlHash);
try { try {
@ -200,7 +200,7 @@ public class ErrorCache {
return false; return false;
} }
} }
*/
public void clearStack() { public void clearStack() {
synchronized (this.cache) { synchronized (this.cache) {
this.cache.clear(); this.cache.clear();