Added a new fail type attribute for the index to distinguish two

separate fail types: network failure and forced exclusion (e.g. by robots
or forwarding rules).
This commit is contained in:
Michael Peter Christen 2012-11-23 14:00:30 +01:00
parent 5e182a566f
commit efd2c4622d
5 changed files with 47 additions and 11 deletions

View File

@ -56,13 +56,15 @@ process_s
## fail reason if a page was not loaded. if the page was loaded then this field is empty, text (mandatory field)
failreason_t
## fail type if a page was not loaded. This field is either empty, 'excl' or 'fail'
failtype_s
## HTTP status return code (e.g. "200" for ok), -1 if not loaded (see content of failreason_t for this case), int (mandatory field)
httpstatus_i
## redirect url if the error code is 299 < httpstatus_i < 310
#httpstatus_redirect_s
### optional but highly recommended values, part of the index distribution process
## time when resource was loaded

View File

@ -0,0 +1,28 @@
/**
* FailType
* Copyright 2012 by Michael Peter Christen
* First released 23.11.2012 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.federate.solr;
/**
 * Coarse classification of the reason why a page was not loaded.
 * <p>
 * The constant names are intentionally lowercase: the value of
 * {@link #name()} is what gets written into the {@code failtype_s}
 * index field, so renaming a constant changes the indexed value.
 */
public enum FailType {

    /** Loading failed because of a network failure. */
    fail,

    /** Loading failed because the content had to be excluded. */
    excl
}

View File

@ -27,7 +27,7 @@ import java.util.List;
import org.apache.solr.common.SolrInputDocument;
public enum YaCySchema implements Schema {
// mandatory
id(SolrType.string, true, true, false, "primary key of document, the URL hash **mandatory field**"),
sku(SolrType.text_en_splitting_tight, true, true, false, true, "url of document"),
@ -44,6 +44,7 @@ public enum YaCySchema implements Schema {
size_i(SolrType.num_integer, true, true, false, "the size of the raw source"),// int size();
process_s(SolrType.string, true, true, false, "index creation comment"),
failreason_t(SolrType.text_general, true, true, false, "fail reason if a page was not loaded. if the page was loaded then this field is empty"),
failtype_s(SolrType.string, true, true, false, "fail type if a page was not loaded. This field is either empty, 'excl' or 'fail'"),
httpstatus_i(SolrType.num_integer, true, true, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"),
httpstatus_redirect_s(SolrType.num_integer, true, true, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"),
@ -192,7 +193,7 @@ public enum YaCySchema implements Schema {
ext_tracker_val(SolrType.num_integer, true, true, true, "number of attribute counts in ext_tracker_txt"),
ext_title_txt(SolrType.text_general, true, true, true, "names matching title expressions"),
ext_title_val(SolrType.num_integer, true, true, true, "number of matching title expressions");
private String solrFieldName = null; // solr field name in custom solr schema, defaults to solcell schema field name (= same as this.name() )
private final SolrType type;
private final boolean indexed, stored;

View File

@ -38,6 +38,7 @@ import java.util.concurrent.LinkedBlockingQueue;
import org.apache.solr.common.SolrInputDocument;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.federate.solr.FailType;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.order.NaturalOrder;
@ -63,16 +64,18 @@ public class ZURL implements Iterable<ZURL.Entry> {
/**
 * Categories of URL load/processing failures.
 * Each category carries a flag telling whether the failure should be
 * stored, and a {@link FailType} that coarsely classifies it as a
 * network failure or as an exclusion.
 */
public enum FailCategory {
// TEMPORARY categories are such failure cases that should be tried again
// FINAL categories are such failure cases that are final and should not be tried again
TEMPORARY_NETWORK_FAILURE(true), // an entity could not be loaded
FINAL_PROCESS_CONTEXT(false), // because of a processing context we do not want that url again (i.e. remote crawling)
FINAL_LOAD_CONTEXT(false), // the crawler configuration does not want to load the entity
FINAL_ROBOTS_RULE(true), // a remote server denies indexing or loading
FINAL_REDIRECT_RULE(true); // the remote server redirects this page, thus disallowing reading of content
TEMPORARY_NETWORK_FAILURE(true, FailType.fail), // an entity could not be loaded
FINAL_PROCESS_CONTEXT(false, FailType.excl), // because of a processing context we do not want that url again (i.e. remote crawling)
FINAL_LOAD_CONTEXT(false, FailType.excl), // the crawler configuration does not want to load the entity
FINAL_ROBOTS_RULE(true, FailType.excl), // a remote server denies indexing or loading
FINAL_REDIRECT_RULE(true, FailType.excl); // the remote server redirects this page, thus disallowing reading of content
// true if a failure of this category shall be sent to the solr index as an error document
public final boolean store;
// coarse fail type (network failure vs. exclusion) that is handed over when the error document is created
public final FailType failType;
private FailCategory(boolean store) {
private FailCategory(boolean store, FailType failType) {
this.store = store;
this.failType = failType;
}
}
@ -180,7 +183,7 @@ public class ZURL implements Iterable<ZURL.Entry> {
if (this.solrConnector != null && failCategory.store) {
// send the error to solr
try {
SolrInputDocument errorDoc = this.solrConfiguration.err(bentry.url(), failCategory.name() + " " + reason, httpcode);
SolrInputDocument errorDoc = this.solrConfiguration.err(bentry.url(), failCategory.name() + " " + reason, failCategory.failType, httpcode);
this.solrConnector.add(errorDoc);
} catch (final IOException e) {
Log.logWarning("SOLR", "failed to send error " + bentry.url().toNormalform(true) + " to solr: " + e.getMessage());

View File

@ -42,6 +42,7 @@ import java.util.Set;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.federate.solr.FailType;
import net.yacy.cora.federate.solr.YaCySchema;
import net.yacy.cora.federate.yacy.ConfigurationSet;
import net.yacy.cora.protocol.Domains;
@ -822,7 +823,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
* @param httpstatus
* @throws IOException
*/
public SolrInputDocument err(final DigestURI digestURI, final String failReason, final int httpstatus) throws IOException {
public SolrInputDocument err(final DigestURI digestURI, final String failReason, final FailType failType, final int httpstatus) throws IOException {
final SolrInputDocument solrdoc = new SolrInputDocument();
add(solrdoc, YaCySchema.id, ASCII.String(digestURI.hash()));
add(solrdoc, YaCySchema.sku, digestURI.toNormalform(true));
@ -836,6 +837,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
// fail reason and status
if (contains(YaCySchema.failreason_t)) add(solrdoc, YaCySchema.failreason_t, failReason);
if (contains(YaCySchema.failtype_s)) add(solrdoc, YaCySchema.failtype_s, failType.name());
if (contains(YaCySchema.httpstatus_i)) add(solrdoc, YaCySchema.httpstatus_i, httpstatus);
return solrdoc;
}