2013-09-17 15:27:02 +02:00
/**
 *  ErrorCache
 *  Copyright 2013 by Michael Peter Christen
 *  First released 17.10.2013 at http://yacy.net
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */
package net.yacy.search.index ;
import java.io.IOException ;
import java.util.ArrayList ;
2013-09-21 10:20:13 +02:00
import java.util.Collection ;
2013-09-17 15:27:02 +02:00
import java.util.Iterator ;
2013-09-21 10:20:13 +02:00
import java.util.LinkedHashMap ;
import java.util.Map ;
2013-10-24 16:20:20 +02:00
import java.util.Set ;
2013-09-17 15:27:02 +02:00
import org.apache.solr.client.solrj.SolrQuery ;
import org.apache.solr.client.solrj.SolrQuery.SortClause ;
import org.apache.solr.common.SolrDocument ;
import org.apache.solr.common.SolrDocumentList ;
import org.apache.solr.common.SolrInputDocument ;
2013-12-13 15:56:29 +01:00
import org.apache.solr.common.params.CommonParams ;
2013-09-17 15:27:02 +02:00
import net.yacy.cora.document.encoding.ASCII ;
import net.yacy.cora.document.id.DigestURL ;
import net.yacy.cora.federate.solr.FailCategory ;
2014-02-26 14:30:48 +01:00
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector ;
2013-09-17 15:27:02 +02:00
import net.yacy.cora.util.ConcurrentLog ;
import net.yacy.crawler.data.CrawlProfile ;
import net.yacy.search.index.Fulltext ;
import net.yacy.search.schema.CollectionConfiguration ;
import net.yacy.search.schema.CollectionSchema ;
public class ErrorCache {
2013-11-13 06:18:48 +01:00
private static final ConcurrentLog log = new ConcurrentLog ( " REJECTED " ) ;
2013-09-17 15:27:02 +02:00
private static final int maxStackSize = 1000 ;
// the class object
2013-12-13 15:56:29 +01:00
private final Map < String , CollectionConfiguration . FailDoc > cache ;
2013-09-17 15:27:02 +02:00
private final Fulltext fulltext ;
public ErrorCache ( final Fulltext fulltext ) {
this . fulltext = fulltext ;
2013-12-13 15:56:29 +01:00
this . cache = new LinkedHashMap < String , CollectionConfiguration . FailDoc > ( ) ;
2013-09-17 15:27:02 +02:00
try {
// fill stack with latest values
final SolrQuery params = new SolrQuery ( ) ;
params . setParam ( " defType " , " edismax " ) ;
params . setStart ( 0 ) ;
params . setRows ( 100 ) ;
params . setFacet ( false ) ;
params . setSort ( new SortClause ( CollectionSchema . last_modified . getSolrFieldName ( ) , SolrQuery . ORDER . desc ) ) ;
2013-12-13 15:56:29 +01:00
params . setFields ( CollectionSchema . id . getSolrFieldName ( ) ) ;
2014-02-26 14:30:48 +01:00
params . setQuery ( CollectionSchema . failreason_s . getSolrFieldName ( ) + AbstractSolrConnector . CATCHALL_DTERM ) ;
2013-12-13 15:56:29 +01:00
params . set ( CommonParams . DF , CollectionSchema . id . getSolrFieldName ( ) ) ; // DisMaxParams.QF or CommonParams.DF must be given
SolrDocumentList docList = fulltext . getDefaultConnector ( ) . getDocumentListByParams ( params ) ;
2013-09-17 15:27:02 +02:00
if ( docList ! = null ) for ( int i = docList . size ( ) - 1 ; i > = 0 ; i - - ) {
2013-12-13 15:56:29 +01:00
SolrDocument doc = docList . get ( i ) ;
String hash = ( String ) doc . getFieldValue ( CollectionSchema . id . getSolrFieldName ( ) ) ;
this . cache . put ( hash , null ) ;
2013-09-17 15:27:02 +02:00
}
} catch ( final Throwable e ) {
}
}
2014-04-04 14:43:35 +02:00
public void clearCache ( ) {
2013-12-13 15:56:29 +01:00
if ( this . cache ! = null ) synchronized ( this . cache ) { this . cache . clear ( ) ; }
2014-04-04 14:43:35 +02:00
}
public void clear ( ) throws IOException {
clearCache ( ) ;
2014-02-26 14:30:48 +01:00
this . fulltext . getDefaultConnector ( ) . deleteByQuery ( CollectionSchema . failreason_s . getSolrFieldName ( ) + AbstractSolrConnector . CATCHALL_DTERM ) ;
2013-09-17 15:27:02 +02:00
}
2013-10-24 16:20:20 +02:00
public void removeHosts ( final Set < String > hosthashes ) {
if ( hosthashes = = null | | hosthashes . size ( ) = = 0 ) return ;
this . fulltext . deleteDomainErrors ( hosthashes ) ;
2013-12-13 15:56:29 +01:00
synchronized ( this . cache ) {
Iterator < String > i = ErrorCache . this . cache . keySet ( ) . iterator ( ) ;
2013-10-24 16:20:20 +02:00
while ( i . hasNext ( ) ) {
String b = i . next ( ) ;
if ( hosthashes . contains ( b ) ) i . remove ( ) ;
2013-09-17 15:27:02 +02:00
}
}
}
2014-04-17 13:21:43 +02:00
public void push ( final DigestURL url , final int crawldepth , final CrawlProfile profile , final FailCategory failCategory , String anycause , final int httpcode ) {
2013-09-17 15:27:02 +02:00
// assert executor != null; // null == proxy !
assert failCategory . store | | httpcode = = - 1 : " failCategory= " + failCategory . name ( ) ;
if ( anycause = = null ) anycause = " unknown " ;
final String reason = anycause + ( ( httpcode > = 0 ) ? " (http return code = " + httpcode + " ) " : " " ) ;
if ( ! reason . startsWith ( " double " ) ) log . info ( url . toNormalform ( true ) + " - " + reason ) ;
CollectionConfiguration . FailDoc failDoc = new CollectionConfiguration . FailDoc (
url , profile = = null ? null : profile . collections ( ) ,
failCategory . name ( ) + " " + reason , failCategory . failType ,
2014-04-17 13:21:43 +02:00
httpcode , crawldepth ) ;
2014-07-11 16:26:52 +02:00
if ( this . fulltext . getDefaultConnector ( ) ! = null & & failCategory . store & & ! exists ( url . hash ( ) ) ) {
2013-09-17 15:27:02 +02:00
// send the error to solr
try {
2014-04-10 09:08:59 +02:00
// do not overwrite error reports with error reports
SolrDocument olddoc = this . fulltext . getDefaultConnector ( ) . getDocumentById ( ASCII . String ( failDoc . getDigestURL ( ) . hash ( ) ) , CollectionSchema . httpstatus_i . getSolrFieldName ( ) ) ;
if ( olddoc = = null | |
olddoc . getFieldValue ( CollectionSchema . httpstatus_i . getSolrFieldName ( ) ) = = null | |
( ( Integer ) olddoc . getFieldValue ( CollectionSchema . httpstatus_i . getSolrFieldName ( ) ) ) = = 200 ) {
SolrInputDocument errorDoc = failDoc . toSolr ( this . fulltext . getDefaultConfiguration ( ) ) ;
this . fulltext . getDefaultConnector ( ) . add ( errorDoc ) ;
}
2013-09-17 15:27:02 +02:00
} catch ( final IOException e ) {
ConcurrentLog . warn ( " SOLR " , " failed to send error " + url . toNormalform ( true ) + " to solr: " + e . getMessage ( ) ) ;
}
2013-12-13 15:56:29 +01:00
synchronized ( this . cache ) {
this . cache . put ( ASCII . String ( url . hash ( ) ) , null ) ;
}
} else {
synchronized ( this . cache ) {
this . cache . put ( ASCII . String ( url . hash ( ) ) , failDoc ) ;
}
2013-09-17 15:27:02 +02:00
}
2013-09-21 10:20:13 +02:00
checkStackSize ( ) ;
}
private void checkStackSize ( ) {
2013-12-13 15:56:29 +01:00
synchronized ( this . cache ) {
int dc = this . cache . size ( ) - maxStackSize ;
2013-09-21 10:20:13 +02:00
if ( dc > 0 ) {
Collection < String > d = new ArrayList < String > ( ) ;
2013-12-13 15:56:29 +01:00
Iterator < String > i = this . cache . keySet ( ) . iterator ( ) ;
2013-09-21 10:20:13 +02:00
while ( dc - - > 0 & & i . hasNext ( ) ) d . add ( i . next ( ) ) ;
2013-12-13 15:56:29 +01:00
for ( String s : d ) this . cache . remove ( s ) ;
2013-09-21 10:20:13 +02:00
}
}
2013-09-17 15:27:02 +02:00
}
public ArrayList < CollectionConfiguration . FailDoc > list ( int max ) {
final ArrayList < CollectionConfiguration . FailDoc > l = new ArrayList < CollectionConfiguration . FailDoc > ( ) ;
2013-12-13 15:56:29 +01:00
synchronized ( this . cache ) {
Iterator < Map . Entry < String , CollectionConfiguration . FailDoc > > hi = this . cache . entrySet ( ) . iterator ( ) ;
for ( int i = 0 ; i < this . cache . size ( ) - max ; i + + ) hi . next ( ) ;
while ( hi . hasNext ( ) ) {
try {
Map . Entry < String , CollectionConfiguration . FailDoc > entry = hi . next ( ) ;
String hash = entry . getKey ( ) ;
CollectionConfiguration . FailDoc failDoc = entry . getValue ( ) ;
if ( failDoc = = null ) {
SolrDocument doc = this . fulltext . getDefaultConnector ( ) . getDocumentById ( hash ) ;
if ( doc ! = null ) failDoc = new CollectionConfiguration . FailDoc ( doc ) ;
}
if ( failDoc ! = null ) l . add ( failDoc ) ;
} catch ( IOException e ) {
}
}
2013-09-21 10:20:13 +02:00
}
2013-09-17 15:27:02 +02:00
return l ;
}
2013-12-13 15:56:29 +01:00
2013-09-17 15:27:02 +02:00
public CollectionConfiguration . FailDoc get ( final String urlhash ) {
2013-12-13 15:56:29 +01:00
CollectionConfiguration . FailDoc failDoc = null ;
synchronized ( this . cache ) {
failDoc = this . cache . get ( urlhash ) ;
2013-09-21 10:20:13 +02:00
}
2013-12-13 15:56:29 +01:00
if ( failDoc ! = null ) return failDoc ;
2013-09-17 15:27:02 +02:00
try {
2014-04-09 12:45:04 +02:00
final SolrDocumentList docs = this . fulltext . getDefaultConnector ( ) . getDocumentListByQuery ( CollectionSchema . id + " : \" " + urlhash + " \" AND " + CollectionSchema . failtype_s . getSolrFieldName ( ) + AbstractSolrConnector . CATCHALL_DTERM , null , 0 , 1 ) ;
2014-02-25 02:16:22 +01:00
if ( docs = = null | | docs . isEmpty ( ) ) return null ;
SolrDocument doc = docs . get ( 0 ) ;
2013-09-17 15:27:02 +02:00
if ( doc = = null ) return null ;
return new CollectionConfiguration . FailDoc ( doc ) ;
} catch ( final IOException e ) {
ConcurrentLog . logException ( e ) ;
return null ;
}
}
public boolean exists ( final byte [ ] urlHash ) {
2014-02-24 22:59:58 +01:00
String urlHashString = ASCII . String ( urlHash ) ;
2013-09-17 15:27:02 +02:00
try {
2014-02-24 22:59:58 +01:00
// first try to check if the document exists at all.
long loaddate = this . fulltext . getLoadTime ( urlHashString ) ;
if ( loaddate < 0 ) return false ;
// then load the fail reason, if exists
final SolrDocument doc = this . fulltext . getDefaultConnector ( ) . getDocumentById ( urlHashString , CollectionSchema . failreason_s . getSolrFieldName ( ) ) ;
2013-12-12 03:36:30 +01:00
if ( doc = = null ) return false ;
// check if the document contains a value in the field CollectionSchema.failreason_s
Object failreason = doc . getFieldValue ( CollectionSchema . failreason_s . getSolrFieldName ( ) ) ;
2013-12-12 10:38:32 +01:00
return failreason ! = null & & failreason . toString ( ) . length ( ) > 0 ;
2013-09-17 15:27:02 +02:00
} catch ( IOException e ) {
return false ;
}
}
public void clearStack ( ) {
2013-12-13 15:56:29 +01:00
synchronized ( this . cache ) {
this . cache . clear ( ) ;
2013-09-21 10:20:13 +02:00
}
2013-09-17 15:27:02 +02:00
}
public int stackSize ( ) {
2013-12-13 15:56:29 +01:00
synchronized ( this . cache ) {
return this . cache . size ( ) ;
2013-09-21 10:20:13 +02:00
}
2013-09-17 15:27:02 +02:00
}
}
2013-12-13 15:56:29 +01:00