yacy_search_server/source/net/yacy/search/index/ErrorCache.java
Michael Peter Christen 2702d9e56b - added a SolrQueryResponse2SolrDocumentList method which is able to
work around the unfolding process in Solr's BinaryResponseWriter.
This was a huge performance bottleneck in the embedded solr connector
and the problem is actually on the Solr side, but we now have a workaround.
- This made it possible to abstract a high-performance index access
method which is implemented as method getDocumentListByParams. That
method is also implemented in the SolrServerConnector and provides a
very efficient access to a solr index if the index is embedded.
- a popular use of the document list retrieval is a result count which
can now also make use of the new method, via getDocumentCountByParams.
- enhanced the Error cache which now does not store error documents
within the ram cache if the document is also written to solr. When
documents are retrieved from the cache, they are partly read from the
ram cache and if not existent there, from the Solr index.
2013-12-13 15:56:29 +01:00

205 lines
8.5 KiB
Java

/**
* ErrorCache
* Copyright 2013 by Michael Peter Christen
* First released 17.10.2013 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.search.index;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrQuery.SortClause;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.CommonParams;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.FailCategory;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.search.index.Fulltext;
import net.yacy.search.schema.CollectionConfiguration;
import net.yacy.search.schema.CollectionSchema;
public class ErrorCache {
private static final ConcurrentLog log = new ConcurrentLog("REJECTED");
private static final int maxStackSize = 1000;
// the class object
private final Map<String, CollectionConfiguration.FailDoc> cache;
private final Fulltext fulltext;
public ErrorCache(final Fulltext fulltext) {
this.fulltext = fulltext;
this.cache = new LinkedHashMap<String, CollectionConfiguration.FailDoc>();
try {
// fill stack with latest values
final SolrQuery params = new SolrQuery();
params.setParam("defType", "edismax");
params.setStart(0);
params.setRows(100);
params.setFacet(false);
params.setSort(new SortClause(CollectionSchema.last_modified.getSolrFieldName(), SolrQuery.ORDER.desc));
params.setFields(CollectionSchema.id.getSolrFieldName());
params.setQuery(CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]");
params.set(CommonParams.DF, CollectionSchema.id.getSolrFieldName()); // DisMaxParams.QF or CommonParams.DF must be given
SolrDocumentList docList = fulltext.getDefaultConnector().getDocumentListByParams(params);
if (docList != null) for (int i = docList.size() - 1; i >= 0; i--) {
SolrDocument doc = docList.get(i);
String hash = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
this.cache.put(hash, null);
}
} catch (final Throwable e) {
}
}
public void clear() throws IOException {
if (this.cache != null) synchronized (this.cache) {this.cache.clear();}
this.fulltext.getDefaultConnector().deleteByQuery(CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]");
}
public void removeHosts(final Set<String> hosthashes) {
if (hosthashes == null || hosthashes.size() == 0) return;
this.fulltext.deleteDomainErrors(hosthashes);
synchronized (this.cache) {
Iterator<String> i = ErrorCache.this.cache.keySet().iterator();
while (i.hasNext()) {
String b = i.next();
if (hosthashes.contains(b)) i.remove();
}
}
}
public void push(final DigestURL url, final CrawlProfile profile, final FailCategory failCategory, String anycause, final int httpcode) {
// assert executor != null; // null == proxy !
assert failCategory.store || httpcode == -1 : "failCategory=" + failCategory.name();
if (exists(url.hash()))
return; // don't insert double causes
if (anycause == null) anycause = "unknown";
final String reason = anycause + ((httpcode >= 0) ? " (http return code = " + httpcode + ")" : "");
if (!reason.startsWith("double")) log.info(url.toNormalform(true) + " - " + reason);
CollectionConfiguration.FailDoc failDoc = new CollectionConfiguration.FailDoc(
url, profile == null ? null : profile.collections(),
failCategory.name() + " " + reason, failCategory.failType,
httpcode);
if (this.fulltext.getDefaultConnector() != null && failCategory.store) {
// send the error to solr
try {
SolrInputDocument errorDoc = failDoc.toSolr(this.fulltext.getDefaultConfiguration());
this.fulltext.getDefaultConnector().add(errorDoc);
} catch (final IOException e) {
ConcurrentLog.warn("SOLR", "failed to send error " + url.toNormalform(true) + " to solr: " + e.getMessage());
}
synchronized (this.cache) {
this.cache.put(ASCII.String(url.hash()), null);
}
} else {
synchronized (this.cache) {
this.cache.put(ASCII.String(url.hash()), failDoc);
}
}
checkStackSize();
}
private void checkStackSize() {
synchronized (this.cache) {
int dc = this.cache.size() - maxStackSize;
if (dc > 0) {
Collection<String> d = new ArrayList<String>();
Iterator<String> i = this.cache.keySet().iterator();
while (dc-- > 0 && i.hasNext()) d.add(i.next());
for (String s: d) this.cache.remove(s);
}
}
}
public ArrayList<CollectionConfiguration.FailDoc> list(int max) {
final ArrayList<CollectionConfiguration.FailDoc> l = new ArrayList<CollectionConfiguration.FailDoc>();
synchronized (this.cache) {
Iterator<Map.Entry<String, CollectionConfiguration.FailDoc>> hi = this.cache.entrySet().iterator();
for (int i = 0; i < this.cache.size() - max; i++) hi.next();
while (hi.hasNext()) {
try {
Map.Entry<String, CollectionConfiguration.FailDoc> entry = hi.next();
String hash = entry.getKey();
CollectionConfiguration.FailDoc failDoc = entry.getValue();
if (failDoc == null) {
SolrDocument doc = this.fulltext.getDefaultConnector().getDocumentById(hash);
if (doc != null) failDoc = new CollectionConfiguration.FailDoc(doc);
}
if (failDoc != null) l.add(failDoc);
} catch (IOException e) {
}
}
}
return l;
}
public CollectionConfiguration.FailDoc get(final String urlhash) {
CollectionConfiguration.FailDoc failDoc = null;
synchronized (this.cache) {
failDoc = this.cache.get(urlhash);
}
if (failDoc != null) return failDoc;
try {
SolrDocument doc = this.fulltext.getDefaultConnector().getDocumentById(urlhash);
if (doc == null) return null;
return new CollectionConfiguration.FailDoc(doc);
} catch (final IOException e) {
ConcurrentLog.logException(e);
return null;
}
}
public boolean exists(final byte[] urlHash) {
try {
final SolrDocument doc = this.fulltext.getDefaultConnector().getDocumentById(ASCII.String(urlHash), CollectionSchema.failreason_s.getSolrFieldName());
if (doc == null) return false;
// check if the document contains a value in the field CollectionSchema.failreason_s
Object failreason = doc.getFieldValue(CollectionSchema.failreason_s.getSolrFieldName());
return failreason != null && failreason.toString().length() > 0;
} catch (IOException e) {
return false;
}
}
public void clearStack() {
synchronized (this.cache) {
this.cache.clear();
}
}
public int stackSize() {
synchronized (this.cache) {
return this.cache.size();
}
}
}