/**
 *  ErrorCache
 *  Copyright 2013 by Michael Peter Christen
 *  First released 17.10.2013 at http://yacy.net
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package net.yacy.search.index;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrQuery.SortClause;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.CommonParams;

import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.FailCategory;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.search.schema.CollectionConfiguration;
import net.yacy.search.schema.CollectionSchema;

public class ErrorCache {

    private static final ConcurrentLog log = new ConcurrentLog("REJECTED");
    private static final int maxStackSize = 1000;

    // the class object
    private final Map<String, CollectionConfiguration.FailDoc> cache;
    private final Fulltext fulltext;

    public ErrorCache(final Fulltext fulltext) {
        this.fulltext = fulltext;
        this.cache = new LinkedHashMap<String, CollectionConfiguration.FailDoc>();
        // concurrently fill stack with latest values
        new Thread() {
            @Override
            public void run() {
                final SolrQuery params = new SolrQuery();
                params.setParam("defType", "edismax");
                params.setStart(0);
                params.setRows(1000);
                params.setFacet(false);
                params.setSort(new SortClause(CollectionSchema.load_date_dt.getSolrFieldName(), SolrQuery.ORDER.desc)); // load_date_dt = faildate
                params.setFields(CollectionSchema.id.getSolrFieldName());
                params.setQuery(CollectionSchema.failreason_s.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM);
                params.set(CommonParams.DF, CollectionSchema.id.getSolrFieldName()); // DisMaxParams.QF or CommonParams.DF must be given
                SolrDocumentList docList;
                try {
                    docList = fulltext.getDefaultConnector().getDocumentListByParams(params);
                    if (docList != null) for (int i = docList.size() - 1; i >= 0; i--) {
                        SolrDocument doc = docList.get(i);
                        String hash = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
                        cache.put(hash, null);
                    }
                } catch (IOException e) {
                    ConcurrentLog.logException(e);
                }
            }
        }.start();
    }
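
    /* A sketch of the warm-up request issued by the constructor thread above,
     * assuming AbstractSolrConnector.CATCHALL_DTERM expands to ":[* TO *]"
     * (an assumption about that constant, not verified in this file):
     *
     *   q    = failreason_s:[* TO *]
     *   df   = id
     *   fl   = id
     *   sort = load_date_dt desc
     *   rows = 1000
     *
     * Only document ids (url hashes) are fetched and stored as cache keys with
     * null values; the FailDoc bodies are resolved lazily from Solr in list().
     */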

    public void clearCache() {
        if (this.cache != null) synchronized (this.cache) {this.cache.clear();}
    }

    public void clear() throws IOException {
        clearCache();
        this.fulltext.getDefaultConnector().deleteByQuery(CollectionSchema.failreason_s.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM);
    }

    public void removeHosts(final Set<String> hosthashes) {
        if (hosthashes == null || hosthashes.size() == 0) return;
        this.fulltext.deleteDomainErrors(hosthashes);
        synchronized (this.cache) {
            Iterator<String> i = ErrorCache.this.cache.keySet().iterator();
            while (i.hasNext()) {
                String b = i.next();
                if (hosthashes.contains(b)) i.remove();
            }
        }
    }

    /**
     * Adds an error document to the Solr index (marked as failed by httpstatus_i <> 200)
     * and caches recently added failed docs (up to maxStackSize = 1000)
     *
     * @param url failed url
     * @param crawldepth info crawldepth
     * @param profile info of collection
     * @param failCategory .store to index otherwise cache only
     * @param anycause info cause-string
     * @param httpcode http response code
     */
    public void push(final DigestURL url, final int crawldepth, final CrawlProfile profile,
            final FailCategory failCategory, String anycause, final int httpcode) {
        // assert executor != null; // null == proxy !
        assert failCategory.store || httpcode == -1 : "failCategory=" + failCategory.name();
        if (anycause == null) anycause = "unknown";
        final String reason = anycause + ((httpcode >= 0) ? " (http return code = " + httpcode + ")" : "");
        if (!reason.startsWith("double")) log.info(url.toNormalform(true) + " - " + reason);
        if (!this.cache.containsKey(ASCII.String(url.hash()))) { // no further action if in error-cache
            CollectionConfiguration.FailDoc failDoc = new CollectionConfiguration.FailDoc(
                    url, profile == null ? null : profile.collections(),
                    failCategory.name() + " " + reason, failCategory.failType,
                    httpcode, crawldepth);
            if (this.fulltext.getDefaultConnector() != null && failCategory.store && !RobotsTxt.isRobotsURL(url)) {
                // send the error to solr
                try {
                    // do not overwrite error reports with error reports
                    SolrDocument olddoc = this.fulltext.getDefaultConnector().getDocumentById(
                            ASCII.String(failDoc.getDigestURL().hash()), CollectionSchema.httpstatus_i.getSolrFieldName());
                    if (olddoc == null ||
                        olddoc.getFieldValue(CollectionSchema.httpstatus_i.getSolrFieldName()) == null ||
                        ((Integer) olddoc.getFieldValue(CollectionSchema.httpstatus_i.getSolrFieldName())) == 200) {
                        SolrInputDocument errorDoc = failDoc.toSolr(this.fulltext.getDefaultConfiguration());
                        this.fulltext.getDefaultConnector().add(errorDoc);
                    }
                } catch (final IOException e) {
                    ConcurrentLog.warn("SOLR", "failed to send error " + url.toNormalform(true) + " to solr: " + e.getMessage());
                }
            }
            synchronized (this.cache) {
                this.cache.put(ASCII.String(url.hash()), failDoc);
            }
            checkStackSize();
        }
    }
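
    /* A minimal sketch of a typical call site for push(); the profile and
     * FailCategory values here are illustrative, not taken from this file:
     *
     *   errorCache.push(url, 0, profile, FailCategory.FINAL_LOAD_CONTEXT,
     *           "load failed: connection refused", -1);
     *
     * With a storing FailCategory the FailDoc is written to Solr unless an
     * existing document already carries a non-200 httpstatus_i (error reports
     * do not overwrite error reports); in every case the FailDoc is put into
     * the in-memory cache, which checkStackSize() then trims to maxStackSize.
     */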

    private void checkStackSize() {
        synchronized (this.cache) {
            int dc = this.cache.size() - maxStackSize;
            if (dc > 0) {
                Collection<String> d = new ArrayList<String>();
                Iterator<String> i = this.cache.keySet().iterator();
                while (dc-- > 0 && i.hasNext()) d.add(i.next());
                for (String s: d) this.cache.remove(s);
            }
        }
    }

    public ArrayList<CollectionConfiguration.FailDoc> list(int max) {
        final ArrayList<CollectionConfiguration.FailDoc> l = new ArrayList<CollectionConfiguration.FailDoc>();
        synchronized (this.cache) {
            Iterator<Map.Entry<String, CollectionConfiguration.FailDoc>> hi = this.cache.entrySet().iterator();
            for (int i = 0; i < this.cache.size() - max; i++) hi.next();
            while (hi.hasNext()) {
                try {
                    Map.Entry<String, CollectionConfiguration.FailDoc> entry = hi.next();
                    String hash = entry.getKey();
                    CollectionConfiguration.FailDoc failDoc = entry.getValue();
                    if (failDoc == null) {
                        SolrDocument doc = this.fulltext.getDefaultConnector().getDocumentById(hash);
                        if (doc != null) failDoc = new CollectionConfiguration.FailDoc(doc);
                    }
                    if (failDoc != null) l.add(failDoc);
                } catch (IOException e) {
                    // a failed Solr lookup only means this entry cannot be resolved; skip it
                }
            }
        }
        return l;
    }

    /*
    public CollectionConfiguration.FailDoc get(final String urlhash) {
        CollectionConfiguration.FailDoc failDoc = null;
        synchronized (this.cache) {
            failDoc = this.cache.get(urlhash);
        }
        if (failDoc != null) return failDoc;
        try {
            final SolrDocument doc = this.fulltext.getDefaultConnector().getDocumentById(urlhash);
            if (doc == null) return null;
            Object failreason = doc.getFieldValue(CollectionSchema.failreason_s.getSolrFieldName());
            if (failreason == null || failreason.toString().length() == 0) return null;
            return new CollectionConfiguration.FailDoc(doc);
        } catch (final IOException e) {
            ConcurrentLog.logException(e);
            return null;
        }
    }

    public boolean exists(final byte[] urlHash) {
        String urlHashString = ASCII.String(urlHash);
        try {
            // load the fail reason, if exists
            final SolrDocument doc = this.fulltext.getDefaultConnector().getDocumentById(
                    urlHashString, CollectionSchema.failreason_s.getSolrFieldName());
            if (doc == null) return false;
            // check if the document contains a value in the field CollectionSchema.failreason_s
            Object failreason = doc.getFieldValue(CollectionSchema.failreason_s.getSolrFieldName());
            return failreason != null && failreason.toString().length() > 0;
        } catch (IOException e) {
            return false;
        }
    }
    */

    public void clearStack() {
        synchronized (this.cache) {
            this.cache.clear();
        }
    }

    public int stackSize() {
        synchronized (this.cache) {
            return this.cache.size();
        }
    }

}
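
/* Reading back recent failures, as a minimal usage sketch (the surrounding
 * objects are assumed, e.g. an ErrorCache built from a Fulltext instance):
 *
 *   ArrayList<CollectionConfiguration.FailDoc> recent = errorCache.list(100);
 *   for (CollectionConfiguration.FailDoc fail : recent) {
 *       System.out.println(fail.getDigestURL().toNormalform(true));
 *   }
 *
 * list(max) iterates the LinkedHashMap in insertion order, skips all but the
 * last max entries, and resolves null placeholders (left by the warm-up
 * thread) with a by-id Solr lookup.
 */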