Removed the exists() retrieval functions from the error cache and
replaced them with metadata retrieval directly from the connectors.
This should make better use of the cache. The metadata cache is now
enlarged automatically when more memory is available.
Michael Peter Christen 2014-07-11 19:52:25 +02:00
parent 62c72360ee
commit b5fc2b63ea
6 changed files with 32 additions and 19 deletions
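
In short: the double-check no longer probes the in-memory error cache but asks the Solr connector for the url's metadata. A minimal sketch of the new lookup pattern, using only types that appear in the diff below (not a drop-in; it assumes the YaCy classpath):

```java
// Sketch of the new double-check pattern (types as in the diff below).
// Before: HarvestProcess p = crawlQueues.exists(urlhash, true); // queues + error cache
HarvestProcess p = crawlQueues.exists(urlhash);                  // queues only
if (p == null) {
    // the url is not in any queue; ask the index directly
    Metadata md = index.fulltext().getDefaultConnector().getMetadata(ASCII.String(urlhash));
    if (md != null) {
        // known to the index: loaded, or possibly a stored error document
    }
}
```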

HostBrowser.java

@@ -59,7 +59,6 @@ import net.yacy.search.index.Fulltext;
 import net.yacy.search.index.Segment.ReferenceReport;
 import net.yacy.search.index.Segment.ReferenceReportCache;
 import net.yacy.search.query.QueryParams;
-import net.yacy.search.schema.CollectionConfiguration.FailDoc;
 import net.yacy.search.schema.CollectionSchema;
 import net.yacy.server.serverObjects;
 import net.yacy.server.serverSwitch;
@@ -466,7 +465,7 @@ public class HostBrowser {
             prop.putHTML("files_list_" + c + "_type_admin", admin ? "true" : "false");
             StoreType type = (StoreType) entry.getValue();
             try {uri = new DigestURL(entry.getKey());} catch (final MalformedURLException e) {uri = null;}
-            HarvestProcess process = uri == null ? null : sb.crawlQueues.exists(uri.hash(), true);
+            HarvestProcess process = uri == null ? null : sb.crawlQueues.exists(uri.hash()); // todo: cannot identify errors
             boolean loading = load.equals(entry.getKey()) || (process != null && process != HarvestProcess.ERRORS);
             boolean error = process == HarvestProcess.ERRORS || type == StoreType.EXCLUDED || type == StoreType.FAILED;
             boolean dc = type != StoreType.INDEX && !error && !loading && list.containsKey(entry.getKey() + "/");
@@ -482,8 +481,8 @@ public class HostBrowser {
                 FailType failType = errorDocs.get(entry.getKey());
                 if (failType == null) {
                     // maybe this is only in the errorURL
-                    FailDoc faildoc = sb.crawlQueues.errorURL.get(ASCII.String(uri.hash()));
-                    prop.putHTML("files_list_" + c + "_type_stored_error", process == HarvestProcess.ERRORS && faildoc != null ? faildoc.getFailReason() : "unknown error");
+                    //Metadata faildoc = sb.index.fulltext().getDefaultConnector().getMetadata(ASCII.String(uri.hash()));
+                    prop.putHTML("files_list_" + c + "_type_stored_error", "unknown error");
                 } else {
                     String ids = ASCII.String(uri.hash());
                     InfoCacheEntry ice = infoCache.get(ids);
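
With ErrorCache.get() gone, the host browser can only report "unknown error" here. A hedged sketch of how the fail reason could still be recovered from the collection index; the field name failreason_s and its presence on error documents are assumptions, not confirmed by this diff:

```java
// Hypothetical recovery of the fail reason from the collection index.
// Assumption: error documents carry a fail-reason field there; the name
// failreason_s is taken from later YaCy schemas, not from this commit.
String field = CollectionSchema.failreason_s.getSolrFieldName();
try {
    SolrDocument doc = sb.index.fulltext().getDefaultConnector()
            .getDocumentById(ASCII.String(uri.hash()), field);
    String reason = doc == null ? null : (String) doc.getFieldValue(field);
    prop.putHTML("files_list_" + c + "_type_stored_error",
            reason == null || reason.isEmpty() ? "unknown error" : reason);
} catch (IOException e) {
    prop.putHTML("files_list_" + c + "_type_stored_error", "unknown error");
}
```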

InstanceMirror.java

@@ -32,6 +32,7 @@ import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector;
 import net.yacy.cora.federate.solr.connector.MirrorSolrConnector;
 import net.yacy.cora.federate.solr.connector.RemoteSolrConnector;
 import net.yacy.cora.federate.solr.connector.SolrConnector;
+import net.yacy.kelondro.util.MemoryControl;
 public class InstanceMirror {
@@ -165,7 +166,8 @@ public class InstanceMirror {
         if (msc != null) return msc;
         EmbeddedSolrConnector esc = getEmbeddedConnector(corename);
         RemoteSolrConnector rsc = getRemoteConnector(corename);
-        msc = new ConcurrentUpdateSolrConnector(new MirrorSolrConnector(esc, rsc), RemoteInstance.queueSizeByMemory(), 10000, Runtime.getRuntime().availableProcessors());
+        int cacheSize = (int) (MemoryControl.available() / 30000); // will return about 10000 for standard ram size
+        msc = new ConcurrentUpdateSolrConnector(new MirrorSolrConnector(esc, rsc), RemoteInstance.queueSizeByMemory(), cacheSize, Runtime.getRuntime().availableProcessors());
         //msc = new MirrorSolrConnector(esc, rsc);
         this.mirrorConnectorCache.put(corename, msc);
         return msc;
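
The metadata cache of the ConcurrentUpdateSolrConnector, previously hard-coded to 10000 entries, now scales at one entry per 30 kB of available heap. A standalone check of the "about 10000" comment; the headroom figures are assumptions for typical setups:

```java
// Standalone check of the sizing formula: one cache entry per 30,000 bytes
// of available memory. The listed headroom values are assumptions.
public class CacheSizeCheck {
    public static void main(String[] args) {
        long[] availableBytes = {300_000_000L, 600_000_000L, 1_200_000_000L};
        for (long available : availableBytes) {
            int cacheSize = (int) (available / 30000);
            System.out.println((available / 1_000_000) + " MB free -> cacheSize = " + cacheSize);
        }
        // 300 MB free -> 10000, matching the "standard ram size" comment above.
    }
}
```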

CrawlStacker.java

@@ -43,6 +43,7 @@ import net.yacy.cora.document.id.AnchorURL;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.document.id.MultiProtocolURL;
 import net.yacy.cora.federate.solr.FailCategory;
+import net.yacy.cora.federate.solr.connector.SolrConnector.Metadata;
 import net.yacy.cora.order.Base64Order;
 import net.yacy.cora.protocol.Domains;
 import net.yacy.cora.protocol.ftp.FTPClient;
@@ -60,7 +61,6 @@ import net.yacy.repository.Blacklist.BlacklistType;
 import net.yacy.repository.FilterEngine;
 import net.yacy.search.Switchboard;
 import net.yacy.search.index.Segment;
-import net.yacy.search.schema.CollectionConfiguration;
 public final class CrawlStacker {
@@ -379,22 +379,27 @@ public final class CrawlStacker {
     public String checkAcceptanceInitially(final DigestURL url, final CrawlProfile profile) {
         // check if the url is double registered
-        final HarvestProcess dbocc = this.nextQueue.exists(url.hash(), false); // returns the name of the queue if entry exists
+        final HarvestProcess dbocc = this.nextQueue.exists(url.hash()); // returns the name of the queue if entry exists
         if (dbocc != null) {
             return "double in: " + dbocc.name();
         }
         String urlhash = ASCII.String(url.hash());
-        final CollectionConfiguration.FailDoc errorEntry = this.nextQueue.errorURL.get(urlhash);
-        final Date oldDate = errorEntry == null ? null : errorEntry.getFailDate();
+        Metadata oldEntry = null;
+        try {
+            oldEntry = this.indexSegment.fulltext().getDefaultConnector().getMetadata(urlhash);
+        } catch (IOException e) {
+            ConcurrentLog.logException(e);
+        }
+        final Long oldDate = oldEntry == null ? null : oldEntry.date;
         if (oldDate == null) {
             return null; // no evidence that we know that url
         }
-        final boolean recrawl = profile.recrawlIfOlder() > oldDate.getTime();
+        final boolean recrawl = profile.recrawlIfOlder() > oldDate.longValue();
         final String urlstring = url.toString();
         if (recrawl) {
             if (CrawlStacker.log.isInfo())
                 CrawlStacker.log.info("RE-CRAWL of URL '" + urlstring + "': this url was crawled " +
-                    ((System.currentTimeMillis() - oldDate.getTime()) / 60000 / 60 / 24) + " days ago.");
+                    ((System.currentTimeMillis() - oldDate.longValue()) / 60000 / 60 / 24) + " days ago.");
         } else {
             return "double in: LURL-DB, oldDate = " + oldDate.toString();
         }
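
recrawlIfOlder() is a cutoff date in epoch milliseconds: the url is fetched again when its stored load date lies before that cutoff. A self-contained sketch of the decision and of the days-ago arithmetic in the log line (the 7- and 10-day values are illustrative):

```java
// Illustrative recrawl decision; mirrors the logic in the hunk above.
public class RecrawlCheck {
    public static void main(String[] args) {
        final long day = 24L * 60 * 60 * 1000;
        long now = System.currentTimeMillis();
        long recrawlIfOlder = now - 7 * day; // cutoff: load dates before this trigger a recrawl
        long oldDate = now - 10 * day;       // metadata date: url was loaded 10 days ago
        boolean recrawl = recrawlIfOlder > oldDate;       // true here
        long daysAgo = (now - oldDate) / 60000 / 60 / 24; // same arithmetic as the log line: 10
        System.out.println("recrawl=" + recrawl + ", crawled " + daysAgo + " days ago");
    }
}
```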

CrawlQueues.java

@@ -143,7 +143,7 @@ public class CrawlQueues {
      * @param hash
      * @return if the hash exists, the name of the database is returned, otherwise null is returned
      */
-    public HarvestProcess exists(final byte[] hash, final boolean checkErrorCache) {
+    public HarvestProcess exists(final byte[] hash) {
         if (this.delegatedURL.containsKey(ASCII.String(hash))) {
             return HarvestProcess.DELEGATED;
         }
@@ -155,9 +155,6 @@ public class CrawlQueues {
                 return HarvestProcess.WORKER;
             }
         }
-        if (checkErrorCache && this.errorURL.exists(hash)) {
-            return HarvestProcess.ERRORS;
-        }
         return null;
     }
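
After this change a null return only means "not in any queue"; the url may still be loaded or be an error document, and callers must resolve that themselves. A short sketch of the narrowed contract (caller-side; see the Switchboard hunk below for the real migration):

```java
// Caller-side view of the narrowed contract (sketch, not a drop-in).
HarvestProcess p = crawlQueues.exists(hash);
if (p != null) {
    // url is in the crawl pipeline, e.g. HarvestProcess.DELEGATED or WORKER
} else {
    // not queued: possibly loaded or failed; callers now consult the
    // connector metadata instead of the removed error-cache probe
}
```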

Switchboard.java

@@ -98,6 +98,7 @@ import net.yacy.cora.document.id.MultiProtocolURL;
 import net.yacy.cora.federate.solr.FailCategory;
 import net.yacy.cora.federate.solr.Ranking;
 import net.yacy.cora.federate.solr.SchemaConfiguration;
+import net.yacy.cora.federate.solr.connector.SolrConnector.Metadata;
 import net.yacy.cora.federate.solr.instance.RemoteInstance;
 import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.order.Base64Order;
@@ -1616,7 +1617,16 @@ public final class Switchboard extends serverSwitch {
      */
     public HarvestProcess urlExists(final String hash) {
         if (this.index.getLoadTime(hash) >= 0) return HarvestProcess.LOADED;
-        return this.crawlQueues.exists(ASCII.getBytes(hash), true);
+        HarvestProcess hp = this.crawlQueues.exists(ASCII.getBytes(hash));
+        if (hp != null) return hp;
+        try {
+            Metadata md = this.index.fulltext().getDefaultConnector().getMetadata(hash);
+            if (md == null) return null;
+            return HarvestProcess.LOADED; // todo: can also be in error
+        } catch (IOException e) {
+            ConcurrentLog.logException(e);
+            return null;
+        }
     }
     public void urlRemove(final Segment segment, final byte[] hash) {
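
A usage sketch of the rewritten urlExists(); note the todo above: a metadata hit is reported as LOADED even when the stored document is actually an error entry:

```java
// Usage sketch for the new urlExists() (hash is a url-hash string).
HarvestProcess state = sb.urlExists(hash);
if (state == HarvestProcess.LOADED) {
    // in the index -- or, per the todo above, possibly a stored error document
} else if (state != null) {
    // still in one of the crawl queues (DELEGATED, WORKER, ...)
} else {
    // completely unknown url
}
```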

ErrorCache.java

@@ -168,6 +168,7 @@ public class ErrorCache {
         return l;
     }
+    /*
     public CollectionConfiguration.FailDoc get(final String urlhash) {
         CollectionConfiguration.FailDoc failDoc = null;
         synchronized (this.cache) {
@@ -185,7 +186,6 @@ public class ErrorCache {
             return null;
         }
     }
     public boolean exists(final byte[] urlHash) {
         String urlHashString = ASCII.String(urlHash);
         try {
@@ -200,7 +200,7 @@ public class ErrorCache {
             return false;
         }
     }
+    */
     public void clearStack() {
         synchronized (this.cache) {
             this.cache.clear();