mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
Added a new fail type attribute to the index to distinguish two
separate fail types: network failure and forced exclusion (e.g. by robots or forwarding rules).
This commit is contained in:
parent
5e182a566f
commit
efd2c4622d
|
@ -56,13 +56,15 @@ process_s
|
|||
## fail reason if a page was not loaded. if the page was loaded then this field is empty, text (mandatory field)
|
||||
failreason_t
|
||||
|
||||
## fail type if a page was not loaded. This field is either empty, 'excl' or 'fail'
|
||||
failtype_s
|
||||
|
||||
## HTTP status return code (e.g. "200" for ok), -1 if not loaded (see content of failreason_t for this case), int (mandatory field)
|
||||
httpstatus_i
|
||||
|
||||
## redirect url if the error code is 299 < httpstatus_i < 310
|
||||
#httpstatus_redirect_s
|
||||
|
||||
|
||||
### optional but highly recommended values, part of the index distribution process
|
||||
|
||||
## time when resource was loaded
|
||||
|
|
28
source/net/yacy/cora/federate/solr/FailType.java
Normal file
28
source/net/yacy/cora/federate/solr/FailType.java
Normal file
|
@ -0,0 +1,28 @@
|
|||
/**
|
||||
* FailType
|
||||
* Copyright 2012 by Michael Peter Christen
|
||||
* First released 23.11.2012 at http://yacy.net
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file lgpl21.txt
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package net.yacy.cora.federate.solr;
|
||||
|
||||
/**
 * Coarse classification of why a document was not loaded.
 * The constant name is the value written to the failtype_s index field,
 * so renaming a constant changes what is stored in the index.
 */
public enum FailType {
|
||||
|
||||
fail, // the document could not be loaded because of a network failure
|
||||
excl; // the document was deliberately excluded (e.g. by robots or redirect rules, or by the crawler/process context)
|
||||
|
||||
}
|
|
@ -27,7 +27,7 @@ import java.util.List;
|
|||
import org.apache.solr.common.SolrInputDocument;
|
||||
|
||||
public enum YaCySchema implements Schema {
|
||||
|
||||
|
||||
// mandatory
|
||||
id(SolrType.string, true, true, false, "primary key of document, the URL hash **mandatory field**"),
|
||||
sku(SolrType.text_en_splitting_tight, true, true, false, true, "url of document"),
|
||||
|
@ -44,6 +44,7 @@ public enum YaCySchema implements Schema {
|
|||
size_i(SolrType.num_integer, true, true, false, "the size of the raw source"),// int size();
|
||||
process_s(SolrType.string, true, true, false, "index creation comment"),
|
||||
failreason_t(SolrType.text_general, true, true, false, "fail reason if a page was not loaded. if the page was loaded then this field is empty"),
|
||||
failtype_s(SolrType.string, true, true, false, "fail type if a page was not loaded. This field is either empty, 'excl' or 'fail'"),
|
||||
httpstatus_i(SolrType.num_integer, true, true, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"),
|
||||
httpstatus_redirect_s(SolrType.num_integer, true, true, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"),
|
||||
|
||||
|
@ -192,7 +193,7 @@ public enum YaCySchema implements Schema {
|
|||
ext_tracker_val(SolrType.num_integer, true, true, true, "number of attribute counts in ext_tracker_txt"),
|
||||
ext_title_txt(SolrType.text_general, true, true, true, "names matching title expressions"),
|
||||
ext_title_val(SolrType.num_integer, true, true, true, "number of matching title expressions");
|
||||
|
||||
|
||||
private String solrFieldName = null; // solr field name in custom solr schema, defaults to solcell schema field name (= same as this.name() )
|
||||
private final SolrType type;
|
||||
private final boolean indexed, stored;
|
||||
|
|
|
@ -38,6 +38,7 @@ import java.util.concurrent.LinkedBlockingQueue;
|
|||
import org.apache.solr.common.SolrInputDocument;
|
||||
|
||||
import net.yacy.cora.document.UTF8;
|
||||
import net.yacy.cora.federate.solr.FailType;
|
||||
import net.yacy.cora.federate.solr.connector.SolrConnector;
|
||||
import net.yacy.cora.order.Base64Order;
|
||||
import net.yacy.cora.order.NaturalOrder;
|
||||
|
@ -63,16 +64,18 @@ public class ZURL implements Iterable<ZURL.Entry> {
|
|||
/**
 * Categories of load failures. Each category states whether the failure is
 * reported to the solr index (store) and which coarse FailType it maps to.
 */
public enum FailCategory {
|
||||
// TEMPORARY categories are failure cases that should be tried again
|
||||
// FINAL categories are failure cases that are final and should not be tried again
|
||||
TEMPORARY_NETWORK_FAILURE(true), // an entity could not be loaded
|
||||
FINAL_PROCESS_CONTEXT(false), // because of a processing context we do not want that url again (i.e. remote crawling)
|
||||
FINAL_LOAD_CONTEXT(false), // the crawler configuration does not want to load the entity
|
||||
FINAL_ROBOTS_RULE(true), // a remote server denies indexing or loading
|
||||
FINAL_REDIRECT_RULE(true); // the remote server redirects this page, thus disallowing reading of content
|
||||
TEMPORARY_NETWORK_FAILURE(true, FailType.fail), // an entity could not be loaded
|
||||
FINAL_PROCESS_CONTEXT(false, FailType.excl), // because of a processing context we do not want that url again (i.e. remote crawling)
|
||||
FINAL_LOAD_CONTEXT(false, FailType.excl), // the crawler configuration does not want to load the entity
|
||||
FINAL_ROBOTS_RULE(true, FailType.excl), // a remote server denies indexing or loading
|
||||
FINAL_REDIRECT_RULE(true, FailType.excl); // the remote server redirects this page, thus disallowing reading of content
|
||||
|
||||
// true if failures of this category are sent to the solr index as error documents
public final boolean store;
|
||||
// coarse fail type (network failure vs. exclusion) passed along with the error document
public final FailType failType;
|
||||
|
||||
private FailCategory(boolean store) {
|
||||
private FailCategory(boolean store, FailType failType) {
|
||||
this.store = store;
|
||||
this.failType = failType;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -180,7 +183,7 @@ public class ZURL implements Iterable<ZURL.Entry> {
|
|||
if (this.solrConnector != null && failCategory.store) {
|
||||
// send the error to solr
|
||||
try {
|
||||
SolrInputDocument errorDoc = this.solrConfiguration.err(bentry.url(), failCategory.name() + " " + reason, httpcode);
|
||||
SolrInputDocument errorDoc = this.solrConfiguration.err(bentry.url(), failCategory.name() + " " + reason, failCategory.failType, httpcode);
|
||||
this.solrConnector.add(errorDoc);
|
||||
} catch (final IOException e) {
|
||||
Log.logWarning("SOLR", "failed to send error " + bentry.url().toNormalform(true) + " to solr: " + e.getMessage());
|
||||
|
|
|
@ -42,6 +42,7 @@ import java.util.Set;
|
|||
import net.yacy.cora.document.ASCII;
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.cora.document.UTF8;
|
||||
import net.yacy.cora.federate.solr.FailType;
|
||||
import net.yacy.cora.federate.solr.YaCySchema;
|
||||
import net.yacy.cora.federate.yacy.ConfigurationSet;
|
||||
import net.yacy.cora.protocol.Domains;
|
||||
|
@ -822,7 +823,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
|
|||
* @param httpstatus
|
||||
* @throws IOException
|
||||
*/
|
||||
public SolrInputDocument err(final DigestURI digestURI, final String failReason, final int httpstatus) throws IOException {
|
||||
public SolrInputDocument err(final DigestURI digestURI, final String failReason, final FailType failType, final int httpstatus) throws IOException {
|
||||
final SolrInputDocument solrdoc = new SolrInputDocument();
|
||||
add(solrdoc, YaCySchema.id, ASCII.String(digestURI.hash()));
|
||||
add(solrdoc, YaCySchema.sku, digestURI.toNormalform(true));
|
||||
|
@ -836,6 +837,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
|
|||
|
||||
// fail reason and status
|
||||
if (contains(YaCySchema.failreason_t)) add(solrdoc, YaCySchema.failreason_t, failReason);
|
||||
if (contains(YaCySchema.failtype_s)) add(solrdoc, YaCySchema.failtype_s, failType.name());
|
||||
if (contains(YaCySchema.httpstatus_i)) add(solrdoc, YaCySchema.httpstatus_i, httpstatus);
|
||||
return solrdoc;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user