2012-09-28 22:45:16 +02:00
/**
 *  HostBrowser
 *  Copyright 2012 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
 *  First released 27.09.2012 at http://yacy.net
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */
2012-09-21 15:48:40 +02:00
2012-09-28 22:45:16 +02:00
import java.io.IOException ;
2012-09-30 13:23:06 +02:00
import java.net.MalformedURLException ;
2012-10-31 17:44:45 +01:00
import java.util.ArrayList ;
2014-01-23 15:56:36 +01:00
import java.util.Collection ;
2012-09-30 13:23:06 +02:00
import java.util.Date ;
2012-09-28 22:45:16 +02:00
import java.util.HashMap ;
import java.util.HashSet ;
import java.util.Iterator ;
2014-01-23 15:56:36 +01:00
import java.util.LinkedHashMap ;
2012-10-31 17:44:45 +01:00
import java.util.List ;
2012-09-28 22:45:16 +02:00
import java.util.Map ;
2014-10-29 10:50:08 +01:00
import java.util.Map.Entry ;
2012-09-28 22:45:16 +02:00
import java.util.Set ;
import java.util.TreeMap ;
import java.util.concurrent.BlockingQueue ;
2014-01-23 15:56:36 +01:00
import java.util.regex.Pattern ;
2012-09-21 15:48:40 +02:00
2012-09-28 22:45:16 +02:00
import org.apache.solr.common.SolrDocument ;
2013-09-15 00:30:23 +02:00
import net.yacy.cora.document.encoding.ASCII ;
import net.yacy.cora.document.encoding.UTF8 ;
import net.yacy.cora.document.id.DigestURL ;
import net.yacy.cora.document.id.MultiProtocolURL ;
2012-11-23 14:09:48 +01:00
import net.yacy.cora.federate.solr.FailType ;
2014-10-29 10:50:08 +01:00
import net.yacy.cora.federate.solr.SolrType ;
2012-09-28 22:45:16 +02:00
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector ;
2012-09-21 15:48:40 +02:00
import net.yacy.cora.protocol.RequestHeader ;
2012-10-16 17:13:18 +02:00
import net.yacy.cora.sorting.ClusteredScoreMap ;
2012-09-28 22:45:16 +02:00
import net.yacy.cora.sorting.ReversibleScoreMap ;
2013-06-13 13:01:28 +02:00
import net.yacy.cora.storage.HandleSet ;
2013-07-09 14:28:25 +02:00
import net.yacy.cora.util.ConcurrentLog ;
2012-12-07 00:31:10 +01:00
import net.yacy.crawler.HarvestProcess ;
2014-01-23 15:56:36 +01:00
import net.yacy.crawler.data.CrawlProfile ;
2012-11-02 13:57:43 +01:00
import net.yacy.crawler.data.NoticedURL.StackType ;
2012-09-30 13:23:06 +02:00
import net.yacy.crawler.retrieval.Request ;
2012-10-18 15:09:04 +02:00
import net.yacy.kelondro.data.meta.URIMetadataNode ;
2012-10-16 17:13:18 +02:00
import net.yacy.peers.graphics.WebStructureGraph.StructureEntry ;
2012-09-21 15:48:40 +02:00
import net.yacy.search.Switchboard ;
2014-03-18 13:42:31 +01:00
import net.yacy.search.SwitchboardConstants ;
2012-09-28 22:45:16 +02:00
import net.yacy.search.index.Fulltext ;
2013-06-13 13:01:28 +02:00
import net.yacy.search.index.Segment.ReferenceReport ;
import net.yacy.search.index.Segment.ReferenceReportCache ;
2014-01-23 15:56:36 +01:00
import net.yacy.search.query.QueryParams ;
2013-02-21 13:23:55 +01:00
import net.yacy.search.schema.CollectionSchema ;
2012-09-21 15:48:40 +02:00
import net.yacy.server.serverObjects ;
import net.yacy.server.serverSwitch ;
public class HostBrowser {
2012-11-19 17:24:34 +01:00
// Time budget in milliseconds: used as the timeout for the Solr document
// query (concurrentDocumentsByQuery) and as the deadline for assembling
// the per-host file list below.
final static long TIMEOUT = 10000L ;
2012-11-06 00:29:37 +01:00
/**
 * Classification of a URL in the host/file browser listing:
 * LINK     - URL is only known as a link (not yet indexed),
 * INDEX    - URL is stored in the fulltext index,
 * EXCLUDED - URL was excluded from indexing (FailType.excl),
 * FAILED   - loading/indexing of the URL failed (FailType.fail),
 * RELOAD   - URL is scheduled to be loaded again.
 * (Nested enums are implicitly static; the redundant modifier is omitted.)
 */
public enum StoreType {
    LINK, INDEX, EXCLUDED, FAILED, RELOAD;
}
2014-02-19 04:03:45 +01:00
@SuppressWarnings ( { " unchecked " } )
2012-09-28 22:45:16 +02:00
public static serverObjects respond ( final RequestHeader header , final serverObjects post , final serverSwitch env ) {
2012-09-21 15:48:40 +02:00
// return variable that accumulates replacements
final Switchboard sb = ( Switchboard ) env ;
2012-09-28 22:45:16 +02:00
Fulltext fulltext = sb . index . fulltext ( ) ;
2014-03-31 18:19:24 +02:00
final boolean authorized = sb . verifyAuthentication ( header ) ;
final boolean autoload = authorized & & sb . getConfigBool ( " browser.autoload " , true ) ;
2012-10-02 21:18:27 +02:00
final boolean load4everyone = sb . getConfigBool ( " browser.load4everyone " , false ) ;
2012-10-08 14:00:14 +02:00
final boolean loadRight = autoload | | load4everyone ; // add config later
2014-03-31 18:19:24 +02:00
final boolean searchAllowed = sb . getConfigBool ( SwitchboardConstants . PUBLIC_SEARCHPAGE , true ) | | authorized ;
2012-09-21 15:48:40 +02:00
final serverObjects prop = new serverObjects ( ) ;
2012-09-28 22:45:16 +02:00
2012-09-21 15:48:40 +02:00
// set default values
2012-09-28 22:45:16 +02:00
prop . put ( " path " , " " ) ;
2012-09-21 15:48:40 +02:00
prop . put ( " result " , " " ) ;
2012-09-28 22:45:16 +02:00
prop . put ( " hosts " , 0 ) ;
prop . put ( " files " , 0 ) ;
2014-10-29 10:50:08 +01:00
prop . put ( " hostanalysis " , 0 ) ;
2014-03-31 18:19:24 +02:00
prop . put ( " admin " , " false " ) ;
boolean admin = false ;
2012-09-21 15:48:40 +02:00
2014-03-31 18:19:24 +02:00
String referer = header . get ( " Referer " , " " ) ;
if ( ( post ! = null & & post . getBoolean ( " admin " ) ) | | referer . contains ( " HostBrowser.html?admin=true " ) ) {
prop . put ( " topmenu " , 2 ) ;
prop . put ( " admin " , " true " ) ;
admin = true ;
} else if ( authorized ) { // show top nav to admins
prop . put ( " topmenu " , 1 ) ;
2012-12-01 01:14:29 +01:00
} else { // for other respect setting in Search Design Configuration
prop . put ( " topmenu " , sb . getConfigBool ( " publicTopmenu " , true ) ? 1 : 0 ) ;
}
2014-03-18 13:42:31 +01:00
final String promoteSearchPageGreeting =
( env . getConfigBool ( SwitchboardConstants . GREETING_NETWORK_NAME , false ) ) ?
env . getConfig ( " network.unit.description " , " " ) :
env . getConfig ( SwitchboardConstants . GREETING , " " ) ;
prop . put ( " topmenu_promoteSearchPageGreeting " , promoteSearchPageGreeting ) ;
2012-09-28 22:45:16 +02:00
if ( ! searchAllowed ) {
prop . put ( " result " , " You are not allowed to use this page. Please ask an administrator for permission. " ) ;
2013-05-08 11:50:46 +02:00
prop . putNum ( " ucount " , 0 ) ;
2012-09-28 22:45:16 +02:00
return prop ;
}
2012-10-29 11:27:13 +01:00
String path = post = = null ? " " : post . get ( " path " , " " ) . trim ( ) ;
2014-03-31 18:19:24 +02:00
if ( authorized ) sb . index . fulltext ( ) . commit ( true ) ;
2012-09-21 15:48:40 +02:00
if ( post = = null | | env = = null ) {
2013-05-08 11:50:46 +02:00
prop . putNum ( " ucount " , fulltext . collectionSize ( ) ) ;
2012-09-28 22:45:16 +02:00
return prop ;
2012-09-21 15:48:40 +02:00
}
2012-09-28 22:45:16 +02:00
int p = path . lastIndexOf ( '/' ) ;
if ( p < 0 & & path . length ( ) > 0 ) path = path + " / " ; else if ( p > 7 ) path = path . substring ( 0 , p + 1 ) ; // the search path shall always end with "/"
if ( path . length ( ) > 0 & & (
! path . startsWith ( " http:// " ) & &
! path . startsWith ( " https:// " ) & &
! path . startsWith ( " ftp:// " ) & &
! path . startsWith ( " smb:// " ) & &
! path . startsWith ( " file:// " ) ) ) { path = " http:// " + path ; }
prop . putHTML ( " path " , path ) ;
2014-03-31 18:19:24 +02:00
prop . put ( " delete " , authorized & & path . length ( ) > 0 ? 1 : 0 ) ;
2012-10-31 17:44:45 +01:00
2013-09-15 00:30:23 +02:00
DigestURL pathURI = null ;
try { pathURI = new DigestURL ( path ) ; } catch ( final MalformedURLException e ) { }
2012-09-21 15:48:40 +02:00
2012-09-30 13:23:06 +02:00
String load = post . get ( " load " , " " ) ;
2012-10-02 21:18:27 +02:00
boolean wait = false ;
2014-08-01 11:00:10 +02:00
try {
if ( loadRight & & autoload & & path . length ( ) ! = 0 & & pathURI ! = null & & load . length ( ) = = 0 & & sb . index . getLoadTime ( ASCII . String ( pathURI . hash ( ) ) ) < 0 ) {
// in case that the url does not exist and loading is wanted turn this request into a loading request
load = path ;
wait = true ;
}
} catch ( IOException e1 ) {
2012-10-02 21:18:27 +02:00
load = path ;
wait = true ;
}
2012-09-30 13:23:06 +02:00
if ( load . length ( ) > 0 & & loadRight ) {
// stack URL
2013-09-15 00:30:23 +02:00
DigestURL url ;
2012-10-02 21:18:27 +02:00
if ( sb . crawlStacker . size ( ) > 2 ) wait = false ;
2012-09-30 13:23:06 +02:00
try {
2013-09-15 00:30:23 +02:00
url = new DigestURL ( load ) ;
2012-09-30 13:23:06 +02:00
String reasonString = sb . crawlStacker . stackCrawl ( new Request (
sb . peers . mySeed ( ) . hash . getBytes ( ) ,
url , null , load , new Date ( ) ,
2012-10-02 21:18:27 +02:00
sb . crawler . defaultProxyProfile . handle ( ) ,
2014-12-05 01:13:37 +01:00
0
2012-09-30 13:23:06 +02:00
) ) ;
2013-11-22 09:53:32 +01:00
prop . putHTML ( " result " , reasonString = = null ? ( " added url to indexer: " + load ) : ( " not indexed url ' " + load + " ': " + reasonString ) ) ;
2014-08-01 11:00:10 +02:00
if ( wait ) waitloop : for ( int i = 0 ; i < 30 ; i + + ) {
try {
if ( sb . index . getLoadTime ( ASCII . String ( url . hash ( ) ) ) > = 0 ) break ;
} catch ( IOException e1 ) {
e1 . printStackTrace ( ) ;
break waitloop ;
}
2013-07-17 18:31:30 +02:00
try { Thread . sleep ( 100 ) ; } catch ( final InterruptedException e ) { }
2012-10-02 21:18:27 +02:00
}
2013-07-17 18:31:30 +02:00
} catch ( final MalformedURLException e ) {
2013-11-22 09:53:32 +01:00
prop . putHTML ( " result " , " bad url ' " + load + " ' " ) ;
2012-09-30 13:23:06 +02:00
}
}
2013-04-14 05:33:01 +02:00
2014-03-31 18:19:24 +02:00
if ( authorized & & post . containsKey ( " deleteLoadErrors " ) ) {
2013-04-14 05:33:01 +02:00
try {
fulltext . getDefaultConnector ( ) . deleteByQuery ( " - " + CollectionSchema . httpstatus_i . getSolrFieldName ( ) + " :200 AND "
2014-02-26 14:30:48 +01:00
+ CollectionSchema . httpstatus_i . getSolrFieldName ( ) + AbstractSolrConnector . CATCHALL_DTERM ) ; // make sure field exists
2013-07-09 14:28:25 +02:00
ConcurrentLog . info ( " HostBrowser: " , " delete documents with httpstatus_i <> 200 " ) ;
2013-04-14 05:33:01 +02:00
fulltext . getDefaultConnector ( ) . deleteByQuery ( CollectionSchema . failtype_s . getSolrFieldName ( ) + " : \" " + FailType . fail . name ( ) + " \" " ) ;
2013-07-09 14:28:25 +02:00
ConcurrentLog . info ( " HostBrowser: " , " delete documents with failtype_s = fail " ) ;
2013-04-14 05:33:01 +02:00
fulltext . getDefaultConnector ( ) . deleteByQuery ( CollectionSchema . failtype_s . getSolrFieldName ( ) + " : \" " + FailType . excl . name ( ) + " \" " ) ;
2013-07-09 14:28:25 +02:00
ConcurrentLog . info ( " HostBrowser: " , " delete documents with failtype_s = excl " ) ;
2013-05-08 11:50:46 +02:00
prop . putNum ( " ucount " , fulltext . collectionSize ( ) ) ;
2013-04-14 05:33:01 +02:00
return prop ;
2013-07-17 18:31:30 +02:00
} catch ( final IOException ex ) {
2013-07-09 14:28:25 +02:00
ConcurrentLog . logException ( ex ) ;
2013-04-14 05:33:01 +02:00
}
}
2012-09-30 13:23:06 +02:00
2012-09-28 22:45:16 +02:00
if ( post . containsKey ( " hosts " ) ) {
// generate host list
try {
2012-11-09 16:24:56 +01:00
boolean onlyCrawling = " crawling " . equals ( post . get ( " hosts " , " " ) ) ;
boolean onlyErrors = " error " . equals ( post . get ( " hosts " , " " ) ) ;
2014-03-31 18:19:24 +02:00
int maxcount = authorized ? 2 * 3 * 2 * 5 * 7 * 2 * 3 : 360 ; // which makes nice matrixes for 2, 3, 4, 5, 6, 7, 8, 9 rows/colums
2012-11-02 13:57:43 +01:00
2012-11-06 00:29:37 +01:00
// collect hosts from index
2014-02-26 14:30:48 +01:00
ReversibleScoreMap < String > hostscore = fulltext . getDefaultConnector ( ) . getFacets ( AbstractSolrConnector . CATCHALL_QUERY , maxcount , CollectionSchema . host_s . getSolrFieldName ( ) ) . get ( CollectionSchema . host_s . getSolrFieldName ( ) ) ;
2012-11-06 00:29:37 +01:00
if ( hostscore = = null ) hostscore = new ClusteredScoreMap < String > ( ) ;
2012-11-02 13:57:43 +01:00
2012-11-06 00:29:37 +01:00
// collect hosts from crawler
2014-03-31 18:19:24 +02:00
final Map < String , Integer [ ] > crawler = ( authorized ) ? sb . crawlQueues . noticeURL . getDomainStackHosts ( StackType . LOCAL , sb . robots ) : new HashMap < String , Integer [ ] > ( ) ;
2012-11-02 13:57:43 +01:00
2012-11-06 00:29:37 +01:00
// collect the errorurls
2014-03-31 18:19:24 +02:00
Map < String , ReversibleScoreMap < String > > exclfacets = authorized ? fulltext . getDefaultConnector ( ) . getFacets ( CollectionSchema . failtype_s . getSolrFieldName ( ) + " : " + FailType . excl . name ( ) , maxcount , CollectionSchema . host_s . getSolrFieldName ( ) ) : null ;
2013-02-21 13:23:55 +01:00
ReversibleScoreMap < String > exclscore = exclfacets = = null ? new ClusteredScoreMap < String > ( ) : exclfacets . get ( CollectionSchema . host_s . getSolrFieldName ( ) ) ;
2014-03-31 18:19:24 +02:00
Map < String , ReversibleScoreMap < String > > failfacets = authorized ? fulltext . getDefaultConnector ( ) . getFacets ( CollectionSchema . failtype_s . getSolrFieldName ( ) + " : " + FailType . fail . name ( ) , maxcount , CollectionSchema . host_s . getSolrFieldName ( ) ) : null ;
2013-02-21 13:23:55 +01:00
ReversibleScoreMap < String > failscore = failfacets = = null ? new ClusteredScoreMap < String > ( ) : failfacets . get ( CollectionSchema . host_s . getSolrFieldName ( ) ) ;
2012-11-06 00:29:37 +01:00
2012-09-28 22:45:16 +02:00
int c = 0 ;
2012-11-06 00:29:37 +01:00
Iterator < String > i = hostscore . keys ( false ) ;
2012-09-28 22:45:16 +02:00
String host ;
while ( i . hasNext ( ) & & c < maxcount ) {
host = i . next ( ) ;
2014-03-31 18:19:24 +02:00
prop . put ( " hosts_list_ " + c + " _admin " , admin ? " true " : " false " ) ;
2013-11-22 09:53:32 +01:00
prop . putHTML ( " hosts_list_ " + c + " _host " , host ) ;
2012-11-02 13:57:43 +01:00
boolean inCrawler = crawler . containsKey ( host ) ;
2012-11-23 14:09:48 +01:00
int exclcount = exclscore . get ( host ) ;
int failcount = failscore . get ( host ) ;
int errors = exclcount + failcount ;
2012-11-09 16:24:56 +01:00
prop . put ( " hosts_list_ " + c + " _count " , hostscore . get ( host ) - errors ) ;
2012-11-02 13:57:43 +01:00
prop . put ( " hosts_list_ " + c + " _crawler " , inCrawler ? 1 : 0 ) ;
if ( inCrawler ) prop . put ( " hosts_list_ " + c + " _crawler_pending " , crawler . get ( host ) [ 0 ] ) ;
2012-11-06 00:29:37 +01:00
prop . put ( " hosts_list_ " + c + " _errors " , errors > 0 ? 1 : 0 ) ;
2012-11-23 14:09:48 +01:00
if ( errors > 0 ) {
prop . put ( " hosts_list_ " + c + " _errors_exclcount " , exclcount ) ;
prop . put ( " hosts_list_ " + c + " _errors_failcount " , failcount ) ;
}
2012-11-09 16:24:56 +01:00
prop . put ( " hosts_list_ " + c + " _type " , inCrawler ? 2 : errors > 0 ? 1 : 0 ) ;
if ( onlyCrawling ) {
if ( inCrawler ) c + + ;
} else if ( onlyErrors ) {
if ( errors > 0 ) c + + ;
} else {
c + + ;
}
2012-09-28 22:45:16 +02:00
}
prop . put ( " hosts_list " , c ) ;
prop . put ( " hosts " , 1 ) ;
2013-07-17 18:31:30 +02:00
} catch ( final IOException e ) {
2013-07-09 14:28:25 +02:00
ConcurrentLog . logException ( e ) ;
2012-09-28 22:45:16 +02:00
}
}
if ( path . length ( ) > 0 ) {
2012-09-21 15:48:40 +02:00
try {
2013-09-15 00:30:23 +02:00
DigestURL uri = new DigestURL ( path ) ;
2012-09-28 22:45:16 +02:00
String host = uri . getHost ( ) ;
2014-10-29 10:50:08 +01:00
// write host analysis if path after host is empty
if ( uri . getPath ( ) . length ( ) < = 1 & & host ! = null & & host . length ( ) > 0 & & sb . getConfigBool ( " decoration.hostanalysis " , false ) ) {
//how many documents per crawldepth_i; get crawldepth_i facet for host
ArrayList < String > ff = new ArrayList < > ( ) ;
for ( CollectionSchema csf : CollectionSchema . values ( ) ) {
2014-11-07 18:11:23 +01:00
if ( ( csf . getType ( ) ! = SolrType . num_integer & & csf . getType ( ) ! = SolrType . num_long ) | | csf . isMultiValued ( ) ) continue ;
2014-10-29 10:50:08 +01:00
String facetfield = csf . getSolrFieldName ( ) ;
if ( ! fulltext . getDefaultConfiguration ( ) . contains ( facetfield ) ) continue ;
ff . add ( csf . getSolrFieldName ( ) ) ;
}
2014-11-20 18:44:29 +01:00
// add also vocabulary counters
Map < String , ReversibleScoreMap < String > > vocabularyFacet = sb . index . fulltext ( ) . getDefaultConnector ( ) . getFacets ( CollectionSchema . vocabularies_sxt . getSolrFieldName ( ) + " :[* TO *] " , 100 , CollectionSchema . vocabularies_sxt . getSolrFieldName ( ) ) ;
if ( vocabularyFacet . size ( ) > 0 ) {
Collection < String > vocnames = vocabularyFacet . values ( ) . iterator ( ) . next ( ) . keyList ( true ) ;
for ( String vocname : vocnames ) {
ff . add ( CollectionSchema . VOCABULARY_PREFIX + vocname + CollectionSchema . VOCABULARY_LOGCOUNT_SUFFIX ) ;
ff . add ( CollectionSchema . VOCABULARY_PREFIX + vocname + CollectionSchema . VOCABULARY_LOGCOUNTS_SUFFIX ) ;
}
}
// list the facets
2014-10-29 10:50:08 +01:00
String [ ] facetfields = ff . toArray ( new String [ ff . size ( ) ] ) ;
Map < String , ReversibleScoreMap < String > > facets = fulltext . getDefaultConnector ( ) . getFacets ( CollectionSchema . host_s . getSolrFieldName ( ) + " : \" " + host + " \" " , 100 , facetfields ) ;
int fc = 0 ;
2014-11-20 18:44:29 +01:00
for ( Map . Entry < String , ReversibleScoreMap < String > > facetentry : facets . entrySet ( ) ) {
ReversibleScoreMap < String > facetfieldmap = facetentry . getValue ( ) ;
2014-10-30 15:47:44 +01:00
if ( facetfieldmap . size ( ) = = 0 ) continue ;
2014-11-07 18:11:23 +01:00
TreeMap < Long , Integer > statMap = new TreeMap < > ( ) ;
for ( String k : facetfieldmap ) statMap . put ( Long . parseLong ( k ) , facetfieldmap . get ( k ) ) ;
2014-11-20 18:44:29 +01:00
prop . put ( " hostanalysis_facets_ " + fc + " _facetname " , facetentry . getKey ( ) ) ;
2014-11-07 18:11:23 +01:00
int c = 0 ; for ( Entry < Long , Integer > entry : statMap . entrySet ( ) ) {
2014-10-29 10:50:08 +01:00
prop . put ( " hostanalysis_facets_ " + fc + " _facet_ " + c + " _key " , entry . getKey ( ) ) ;
prop . put ( " hostanalysis_facets_ " + fc + " _facet_ " + c + " _count " , entry . getValue ( ) ) ;
2014-11-20 18:44:29 +01:00
prop . put ( " hostanalysis_facets_ " + fc + " _facet_ " + c + " _a " , " http://localhost: " + sb . getConfigInt ( " port " , 8090 ) + " /solr/collection1/select?q=host_s: " + host + " AND " + facetentry . getKey ( ) + " : " + entry . getKey ( ) + " &defType=edismax&start=0&rows=1000&fl=sku,crawldepth_i " ) ;
2014-10-29 10:50:08 +01:00
c + + ;
}
prop . put ( " hostanalysis_facets_ " + fc + " _facet " , c ) ;
fc + + ;
}
prop . put ( " hostanalysis_facets " , fc ) ;
prop . put ( " hostanalysis " , 1 ) ;
}
// write file list for subpath
boolean delete = false ;
boolean reload404 = false ;
if ( authorized & & post . containsKey ( " delete " ) ) {
// delete the complete path!! That includes everything that matches with this prefix.
delete = true ;
}
if ( authorized & & post . containsKey ( " reload404 " ) ) {
// try to re-load all urls that have load errors and matches with this prefix.
reload404 = true ;
}
int facetcount = post . getInt ( " facetcount " , 0 ) ;
boolean complete = post . getBoolean ( " complete " ) ;
if ( complete ) { // we want only root paths for complete lists
p = path . indexOf ( '/' , 10 ) ;
if ( p > 0 ) path = path . substring ( 0 , p + 1 ) ;
}
prop . put ( " files_complete " , complete ? 1 : 0 ) ;
prop . put ( " files_complete_admin " , admin ? " true " : " false " ) ;
prop . putHTML ( " files_complete_path " , path ) ;
p = path . substring ( 0 , path . length ( ) - 1 ) . lastIndexOf ( '/' ) ;
if ( p < 8 ) {
prop . put ( " files_root " , 1 ) ;
} else {
prop . put ( " files_root " , 0 ) ;
prop . putHTML ( " files_root_path " , path . substring ( 0 , p + 1 ) ) ;
prop . put ( " files_root_admin " , admin ? " true " : " false " ) ;
}
// generate file list from path
2012-10-16 17:13:18 +02:00
prop . putHTML ( " outbound_host " , host ) ;
2014-03-31 18:19:24 +02:00
if ( authorized ) prop . putHTML ( " outbound_admin_host " , host ) ; //used for WebStructurePicture_p link
2012-10-16 17:13:18 +02:00
prop . putHTML ( " inbound_host " , host ) ;
2012-10-16 18:11:57 +02:00
String hosthash = ASCII . String ( uri . hash ( ) , 6 , 6 ) ;
2012-11-05 03:19:28 +01:00
String [ ] pathparts = uri . getPaths ( ) ;
2012-09-28 22:45:16 +02:00
// get all files for a specific host from the index
2012-11-05 18:57:21 +01:00
StringBuilder q = new StringBuilder ( ) ;
2014-10-13 18:33:39 +02:00
if ( host = = null ) {
if ( path . startsWith ( " file:// " ) ) {
q . append ( CollectionSchema . url_protocol_s . getSolrFieldName ( ) ) . append ( " :file " ) ;
}
} else {
q . append ( CollectionSchema . host_s . getSolrFieldName ( ) ) . append ( " : \" " ) . append ( host ) . append ( " \" " ) ;
}
2012-11-05 18:57:21 +01:00
if ( pathparts . length > 0 & & pathparts [ 0 ] . length ( ) > 0 ) {
for ( String pe : pathparts ) {
2013-02-21 13:23:55 +01:00
if ( pe . length ( ) > 0 ) q . append ( " AND " ) . append ( CollectionSchema . url_paths_sxt . getSolrFieldName ( ) ) . append ( " : \" " ) . append ( pe ) . append ( '\"' ) ;
2012-11-05 18:57:21 +01:00
}
} else {
2012-11-07 02:17:24 +01:00
if ( facetcount > 1000 | | post . containsKey ( " nepr " ) ) {
2014-02-26 14:30:48 +01:00
q . append ( " AND " ) . append ( CollectionSchema . url_paths_sxt . getSolrFieldName ( ) ) . append ( AbstractSolrConnector . CATCHALL_DTERM ) ;
2012-11-07 02:17:24 +01:00
}
2012-11-05 18:57:21 +01:00
}
2014-09-17 13:58:55 +02:00
BlockingQueue < SolrDocument > docs = fulltext . getDefaultConnector ( ) . concurrentDocumentsByQuery ( q . toString ( ) , CollectionSchema . url_chars_i . getSolrFieldName ( ) + " asc " , 0 , 100000 , TIMEOUT , 100 , 1 , false ,
2013-02-21 13:23:55 +01:00
CollectionSchema . id . getSolrFieldName ( ) ,
CollectionSchema . sku . getSolrFieldName ( ) ,
2013-05-06 16:45:54 +02:00
CollectionSchema . failreason_s . getSolrFieldName ( ) ,
2013-02-21 13:23:55 +01:00
CollectionSchema . failtype_s . getSolrFieldName ( ) ,
CollectionSchema . inboundlinks_protocol_sxt . getSolrFieldName ( ) ,
2013-09-01 14:35:36 +02:00
CollectionSchema . inboundlinks_urlstub_sxt . getSolrFieldName ( ) ,
2013-02-21 13:23:55 +01:00
CollectionSchema . outboundlinks_protocol_sxt . getSolrFieldName ( ) ,
2013-09-01 14:35:36 +02:00
CollectionSchema . outboundlinks_urlstub_sxt . getSolrFieldName ( ) ,
2014-04-02 23:37:01 +02:00
CollectionSchema . crawldepth_i . getSolrFieldName ( ) ,
2013-04-14 11:30:57 +02:00
CollectionSchema . references_i . getSolrFieldName ( ) ,
CollectionSchema . references_internal_i . getSolrFieldName ( ) ,
CollectionSchema . references_external_i . getSolrFieldName ( ) ,
2013-06-07 13:20:57 +02:00
CollectionSchema . references_exthosts_i . getSolrFieldName ( ) ,
CollectionSchema . cr_host_chance_d . getSolrFieldName ( ) ,
CollectionSchema . cr_host_norm_i . getSolrFieldName ( )
2012-11-19 17:24:34 +01:00
) ;
2012-09-28 22:45:16 +02:00
SolrDocument doc ;
Set < String > storedDocs = new HashSet < String > ( ) ;
2012-11-23 14:09:48 +01:00
Map < String , FailType > errorDocs = new HashMap < String , FailType > ( ) ;
2012-10-16 17:13:18 +02:00
Set < String > inboundLinks = new HashSet < String > ( ) ;
Map < String , ReversibleScoreMap < String > > outboundHosts = new HashMap < String , ReversibleScoreMap < String > > ( ) ;
2013-04-14 11:30:57 +02:00
Map < String , InfoCacheEntry > infoCache = new HashMap < String , InfoCacheEntry > ( ) ;
2012-09-28 22:45:16 +02:00
int hostsize = 0 ;
2013-05-08 13:26:25 +02:00
final List < String > deleteIDs = new ArrayList < String > ( ) ;
2014-01-23 15:56:36 +01:00
final Collection < String > reloadURLs = new ArrayList < String > ( ) ;
final Set < String > reloadURLCollection = new HashSet < String > ( ) ;
2013-06-13 13:01:28 +02:00
long timeoutList = System . currentTimeMillis ( ) + TIMEOUT ;
2014-01-23 15:56:36 +01:00
long timeoutReferences = System . currentTimeMillis ( ) + 6000 ;
2013-06-13 13:01:28 +02:00
ReferenceReportCache rrCache = sb . index . getReferenceReportCache ( ) ;
2012-09-28 22:45:16 +02:00
while ( ( doc = docs . take ( ) ) ! = AbstractSolrConnector . POISON_DOCUMENT ) {
2013-02-21 13:23:55 +01:00
String u = ( String ) doc . getFieldValue ( CollectionSchema . sku . getSolrFieldName ( ) ) ;
String errortype = ( String ) doc . getFieldValue ( CollectionSchema . failtype_s . getSolrFieldName ( ) ) ;
2013-04-14 11:30:57 +02:00
FailType error = errortype = = null ? null : FailType . valueOf ( errortype ) ;
String ids = ( String ) doc . getFieldValue ( CollectionSchema . id . getSolrFieldName ( ) ) ;
2013-06-13 13:01:28 +02:00
infoCache . put ( ids , new InfoCacheEntry ( sb . index . fulltext ( ) , rrCache , doc , ids , System . currentTimeMillis ( ) < timeoutReferences ) ) ;
2012-10-31 17:44:45 +01:00
if ( u . startsWith ( path ) ) {
if ( delete ) {
2013-05-08 13:26:25 +02:00
deleteIDs . add ( ids ) ;
2012-10-31 17:44:45 +01:00
} else {
2014-01-23 15:56:36 +01:00
if ( error = = null ) storedDocs . add ( u ) ; else {
if ( reload404 & & error = = FailType . fail ) {
ArrayList < String > collections = ( ArrayList < String > ) doc . getFieldValue ( CollectionSchema . collection_sxt . getSolrFieldName ( ) ) ;
if ( collections ! = null ) reloadURLCollection . addAll ( collections ) ;
reloadURLs . add ( u ) ;
}
2014-03-31 18:19:24 +02:00
if ( authorized ) errorDocs . put ( u , error ) ;
2014-01-23 15:56:36 +01:00
}
2012-10-31 17:44:45 +01:00
}
} else if ( complete ) {
2014-01-23 15:56:36 +01:00
if ( error = = null ) storedDocs . add ( u ) ; else {
2014-03-31 18:19:24 +02:00
if ( authorized ) errorDocs . put ( u , error ) ;
2014-01-23 15:56:36 +01:00
}
2012-10-31 17:44:45 +01:00
}
2012-11-19 17:24:34 +01:00
if ( ( complete | | u . startsWith ( path ) ) & & ! storedDocs . contains ( u ) ) inboundLinks . add ( u ) ; // add the current link
2012-11-06 00:29:37 +01:00
if ( error = = null ) {
hostsize + + ;
// collect inboundlinks to browse the host
Iterator < String > links = URIMetadataNode . getLinks ( doc , true ) ;
while ( links . hasNext ( ) ) {
u = links . next ( ) ;
if ( ( complete | | u . startsWith ( path ) ) & & ! storedDocs . contains ( u ) ) inboundLinks . add ( u ) ;
}
2013-06-13 13:01:28 +02:00
// collect referrer links
2012-11-06 00:29:37 +01:00
links = URIMetadataNode . getLinks ( doc , false ) ;
while ( links . hasNext ( ) ) {
u = links . next ( ) ;
try {
2013-09-15 00:30:23 +02:00
MultiProtocolURL mu = new MultiProtocolURL ( u ) ;
2012-11-06 00:29:37 +01:00
if ( mu . getHost ( ) ! = null ) {
ReversibleScoreMap < String > lks = outboundHosts . get ( mu . getHost ( ) ) ;
if ( lks = = null ) {
lks = new ClusteredScoreMap < String > ( UTF8 . insensitiveUTF8Comparator ) ;
outboundHosts . put ( mu . getHost ( ) , lks ) ;
}
lks . set ( u , u . length ( ) ) ;
2012-10-16 17:13:18 +02:00
}
2013-07-17 18:31:30 +02:00
} catch ( final MalformedURLException e ) { }
2012-11-06 00:29:37 +01:00
}
2012-09-28 22:45:16 +02:00
}
2013-06-13 13:01:28 +02:00
if ( System . currentTimeMillis ( ) > timeoutList ) break ;
2012-09-28 22:45:16 +02:00
}
2013-05-08 13:26:25 +02:00
if ( deleteIDs . size ( ) > 0 ) sb . remove ( deleteIDs ) ;
2014-01-23 15:56:36 +01:00
if ( reloadURLs . size ( ) > 0 ) {
final Map < String , Pattern > cm = new LinkedHashMap < String , Pattern > ( ) ;
for ( String collection : reloadURLCollection ) cm . put ( collection , QueryParams . catchall_pattern ) ;
sb . reload ( reloadURLs , cm . size ( ) > 0 ? cm : CrawlProfile . collectionParser ( " user " ) , false ) ;
}
2012-10-23 19:02:55 +02:00
2012-11-02 13:57:43 +01:00
// collect from crawler
2014-03-31 18:19:24 +02:00
List < Request > domainStackReferences = ( authorized ) ? sb . crawlQueues . noticeURL . getDomainStackReferences ( StackType . LOCAL , host , 1000 , 3000 ) : new ArrayList < Request > ( 0 ) ;
2012-11-02 13:57:43 +01:00
Set < String > loadingLinks = new HashSet < String > ( ) ;
for ( Request crawlEntry : domainStackReferences ) loadingLinks . add ( crawlEntry . url ( ) . toNormalform ( true ) ) ;
// now combine all lists into one
2012-11-06 00:29:37 +01:00
Map < String , StoreType > files = new HashMap < String , StoreType > ( ) ;
for ( String u : storedDocs ) files . put ( u , StoreType . INDEX ) ;
2012-11-23 14:09:48 +01:00
for ( Map . Entry < String , FailType > e : errorDocs . entrySet ( ) ) files . put ( e . getKey ( ) , e . getValue ( ) = = FailType . fail ? StoreType . FAILED : StoreType . EXCLUDED ) ;
2012-12-07 00:31:10 +01:00
for ( String u : inboundLinks ) if ( ! files . containsKey ( u ) ) files . put ( u , StoreType . LINK ) ;
for ( String u : loadingLinks ) if ( u . startsWith ( path ) & & ! files . containsKey ( u ) ) files . put ( u , StoreType . LINK ) ;
2013-07-09 14:28:25 +02:00
ConcurrentLog . info ( " HostBrowser " , " collected " + files . size ( ) + " urls for path " + path ) ;
2012-10-23 19:02:55 +02:00
2012-09-28 22:45:16 +02:00
// distinguish files and folders
2012-11-02 14:40:02 +01:00
Map < String , Object > list = new TreeMap < String , Object > ( ) ; // a directory list; if object is boolean, its a file; if its a int[], then its a folder
2012-10-23 19:02:55 +02:00
int pl = path . length ( ) ;
2012-10-25 10:23:43 +02:00
String file ;
2012-11-06 00:29:37 +01:00
for ( Map . Entry < String , StoreType > entry : files . entrySet ( ) ) {
2012-11-02 14:40:02 +01:00
if ( entry . getKey ( ) . length ( ) < pl ) continue ; // this is not inside the path
if ( ! entry . getKey ( ) . startsWith ( path ) ) continue ;
2012-10-25 10:23:43 +02:00
file = entry . getKey ( ) . substring ( pl ) ;
2012-11-06 00:29:37 +01:00
StoreType type = entry . getValue ( ) ;
2012-09-28 22:45:16 +02:00
p = file . indexOf ( '/' ) ;
if ( p < 0 ) {
2012-10-23 19:02:55 +02:00
// this is a file
2012-11-06 00:29:37 +01:00
list . put ( entry . getKey ( ) , type ) ; // StoreType value: this is a file; true -> file is in index; false -> not in index, maybe in crawler
2012-09-28 22:45:16 +02:00
} else {
2012-10-23 19:02:55 +02:00
// this is a directory path or a file in a subdirectory
String remainingPath = file . substring ( 0 , p + 1 ) ;
if ( complete & & remainingPath . indexOf ( '.' ) > 0 ) {
2012-11-06 00:29:37 +01:00
list . put ( entry . getKey ( ) , type ) ; // StoreType value: this is a file
2012-10-23 19:02:55 +02:00
} else {
String dir = path + remainingPath ;
Object c = list . get ( dir ) ;
if ( c = = null ) {
2014-01-23 15:56:36 +01:00
int [ ] linkedStoredIncrawlerError = new int [ ] { 0 , 0 , 0 , 0 , 0 } ;
2012-11-06 00:29:37 +01:00
if ( type = = StoreType . LINK ) linkedStoredIncrawlerError [ 0 ] + + ;
if ( type = = StoreType . INDEX ) linkedStoredIncrawlerError [ 1 ] + + ;
if ( loadingLinks . contains ( entry . getKey ( ) ) ) linkedStoredIncrawlerError [ 2 ] + + ;
2014-01-23 15:56:36 +01:00
if ( errorDocs . containsKey ( entry . getKey ( ) ) ) linkedStoredIncrawlerError [ errorDocs . get ( entry . getKey ( ) ) = = FailType . excl ? 3 : 4 ] + + ;
2012-11-06 00:29:37 +01:00
list . put ( dir , linkedStoredIncrawlerError ) ;
2012-10-23 19:02:55 +02:00
} else if ( c instanceof int [ ] ) {
2012-11-06 00:29:37 +01:00
if ( type = = StoreType . LINK ) ( ( int [ ] ) c ) [ 0 ] + + ;
if ( type = = StoreType . INDEX ) ( ( int [ ] ) c ) [ 1 ] + + ;
if ( loadingLinks . contains ( entry . getKey ( ) ) ) ( ( int [ ] ) c ) [ 2 ] + + ;
2014-01-23 15:56:36 +01:00
if ( errorDocs . containsKey ( entry . getKey ( ) ) ) ( ( int [ ] ) c ) [ errorDocs . get ( entry . getKey ( ) ) = = FailType . excl ? 3 : 4 ] + + ;
2012-10-23 19:02:55 +02:00
}
2012-09-28 22:45:16 +02:00
}
}
}
int maxcount = 1000 ;
int c = 0 ;
2012-11-23 14:09:48 +01:00
// first list only folders
int filecounter = 0 ;
for ( Map . Entry < String , Object > entry : list . entrySet ( ) ) {
if ( ( entry . getValue ( ) instanceof StoreType ) ) {
filecounter + + ;
} else {
// this is a folder
prop . put ( " files_list_ " + c + " _type " , 1 ) ;
2013-10-31 16:16:29 +01:00
prop . putHTML ( " files_list_ " + c + " _type_url " , entry . getKey ( ) ) ;
2014-03-31 18:19:24 +02:00
prop . putHTML ( " files_list_ " + c + " _type_admin " , admin ? " true " : " false " ) ;
2012-11-23 14:09:48 +01:00
int linked = ( ( int [ ] ) entry . getValue ( ) ) [ 0 ] ;
int stored = ( ( int [ ] ) entry . getValue ( ) ) [ 1 ] ;
int crawler = ( ( int [ ] ) entry . getValue ( ) ) [ 2 ] ;
2014-01-23 15:56:36 +01:00
int excl = ( ( int [ ] ) entry . getValue ( ) ) [ 3 ] ;
int error = ( ( int [ ] ) entry . getValue ( ) ) [ 4 ] ;
2012-11-23 14:09:48 +01:00
prop . put ( " files_list_ " + c + " _type_stored " , stored ) ;
prop . put ( " files_list_ " + c + " _type_linked " , linked ) ;
prop . put ( " files_list_ " + c + " _type_pendingVisible " , crawler > 0 ? 1 : 0 ) ;
prop . put ( " files_list_ " + c + " _type_pending " , crawler ) ;
2014-01-23 15:56:36 +01:00
prop . put ( " files_list_ " + c + " _type_excludedVisible " , excl > 0 ? 1 : 0 ) ;
prop . put ( " files_list_ " + c + " _type_excluded " , excl ) ;
2012-11-23 14:09:48 +01:00
prop . put ( " files_list_ " + c + " _type_failedVisible " , error > 0 ? 1 : 0 ) ;
prop . put ( " files_list_ " + c + " _type_failed " , error ) ;
if ( + + c > = maxcount ) break ;
}
}
// then list files
2012-09-28 22:45:16 +02:00
for ( Map . Entry < String , Object > entry : list . entrySet ( ) ) {
2012-11-06 00:29:37 +01:00
if ( entry . getValue ( ) instanceof StoreType ) {
2012-09-28 22:45:16 +02:00
// this is a file
prop . put ( " files_list_ " + c + " _type " , 0 ) ;
2013-11-22 09:53:32 +01:00
prop . putHTML ( " files_list_ " + c + " _type_url " , entry . getKey ( ) ) ;
2014-03-31 18:19:24 +02:00
prop . putHTML ( " files_list_ " + c + " _type_admin " , admin ? " true " : " false " ) ;
2012-11-06 00:29:37 +01:00
StoreType type = ( StoreType ) entry . getValue ( ) ;
2013-09-15 00:30:23 +02:00
try { uri = new DigestURL ( entry . getKey ( ) ) ; } catch ( final MalformedURLException e ) { uri = null ; }
2014-07-11 19:52:25 +02:00
HarvestProcess process = uri = = null ? null : sb . crawlQueues . exists ( uri . hash ( ) ) ; // todo: cannot identify errors
2012-12-07 00:31:10 +01:00
boolean loading = load . equals ( entry . getKey ( ) ) | | ( process ! = null & & process ! = HarvestProcess . ERRORS ) ;
boolean error = process = = HarvestProcess . ERRORS | | type = = StoreType . EXCLUDED | | type = = StoreType . FAILED ;
boolean dc = type ! = StoreType . INDEX & & ! error & & ! loading & & list . containsKey ( entry . getKey ( ) + " / " ) ;
if ( ! dc ) {
prop . put ( " files_list_ " + c + " _type_stored " , type = = StoreType . INDEX ? 1 : error ? 3 : loading ? 2 : 0 /*linked*/ ) ;
2013-01-02 20:55:43 +01:00
if ( type = = StoreType . INDEX ) {
2013-04-14 11:30:57 +02:00
String ids = ASCII . String ( uri . hash ( ) ) ;
InfoCacheEntry ice = infoCache . get ( ids ) ;
2014-03-31 18:19:24 +02:00
prop . put ( " files_list_ " + c + " _type_stored_comment " , ice = = null ? " " : ice . toString ( ) ) ; // ice.toString() contains html, therefore do not use putHTML here
2013-01-02 20:55:43 +01:00
}
2012-12-07 00:31:10 +01:00
prop . put ( " files_list_ " + c + " _type_stored_load " , loadRight ? 1 : 0 ) ;
if ( error ) {
FailType failType = errorDocs . get ( entry . getKey ( ) ) ;
if ( failType = = null ) {
// maybe this is only in the errorURL
2014-07-11 19:52:25 +02:00
//Metadata faildoc = sb.index.fulltext().getDefaultConnector().getMetadata(ASCII.String(uri.hash()));
prop . putHTML ( " files_list_ " + c + " _type_stored_error " , " unknown error " ) ;
2012-12-07 00:31:10 +01:00
} else {
2013-11-01 13:30:53 +01:00
String ids = ASCII . String ( uri . hash ( ) ) ;
InfoCacheEntry ice = infoCache . get ( ids ) ;
2014-01-23 15:56:36 +01:00
prop . put ( " files_list_ " + c + " _type_stored_error " , failType = = FailType . excl ? " excluded from indexing " : " load fail " + ( ice = = null ? " " : " ; " + ice . toString ( ) ) ) ;
2012-12-07 00:31:10 +01:00
}
}
if ( loadRight ) {
2013-11-22 09:53:32 +01:00
prop . putHTML ( " files_list_ " + c + " _type_stored_load_url " , entry . getKey ( ) ) ;
prop . putHTML ( " files_list_ " + c + " _type_stored_load_path " , path ) ;
2012-12-07 00:31:10 +01:00
}
if ( + + c > = maxcount ) break ;
2012-09-30 13:23:06 +02:00
}
2012-09-28 22:45:16 +02:00
}
2012-09-21 15:48:40 +02:00
}
2012-09-28 22:45:16 +02:00
prop . put ( " files_list " , c ) ;
prop . putHTML ( " files_path " , path ) ;
prop . put ( " files_hostsize " , hostsize ) ;
2012-11-23 14:09:48 +01:00
prop . put ( " files_subpathloadsize " , storedDocs . size ( ) ) ;
prop . put ( " files_subpathdetectedsize " , filecounter - storedDocs . size ( ) ) ;
2012-09-28 22:45:16 +02:00
prop . put ( " files " , 1 ) ;
2014-04-03 14:51:19 +02:00
uri = new DigestURL ( path ) ;
2014-10-08 17:12:35 +02:00
prop . put ( " files_linkgraph " , uri . getPath ( ) . length ( ) < = 1 & & hostsize > 0 & & sb . getConfigBool ( SwitchboardConstants . DECORATION_GRAFICS_LINKSTRUCTURE , true ) ) ;
2014-04-03 14:51:19 +02:00
prop . put ( " files_linkgraph_host " , uri . getHost ( ) ) ;
2012-10-16 17:13:18 +02:00
// generate inbound-links table
StructureEntry struct = sb . webStructure . incomingReferences ( hosthash ) ;
if ( struct ! = null & & struct . references . size ( ) > 0 ) {
maxcount = 200 ;
ReversibleScoreMap < String > score = new ClusteredScoreMap < String > ( UTF8 . insensitiveUTF8Comparator ) ;
for ( Map . Entry < String , Integer > entry : struct . references . entrySet ( ) ) score . set ( entry . getKey ( ) , entry . getValue ( ) ) ;
c = 0 ;
Iterator < String > i = score . keys ( false ) ;
while ( i . hasNext ( ) & & c < maxcount ) {
host = i . next ( ) ;
2014-03-31 18:19:24 +02:00
prop . put ( " inbound_list_ " + c + " _admin " , admin ? " true " : " false " ) ;
2013-11-22 09:53:32 +01:00
prop . putHTML ( " inbound_list_ " + c + " _host " , sb . webStructure . hostHash2hostName ( host ) ) ;
2012-10-16 17:13:18 +02:00
prop . put ( " inbound_list_ " + c + " _count " , score . get ( host ) ) ;
c + + ;
}
prop . put ( " inbound_list " , c ) ;
prop . put ( " inbound " , 1 ) ;
} else {
prop . put ( " inbound " , 0 ) ;
}
// generate outbound-links table
if ( outboundHosts . size ( ) > 0 ) {
maxcount = 200 ;
ReversibleScoreMap < String > score = new ClusteredScoreMap < String > ( UTF8 . insensitiveUTF8Comparator ) ;
for ( Map . Entry < String , ReversibleScoreMap < String > > entry : outboundHosts . entrySet ( ) ) score . set ( entry . getKey ( ) , entry . getValue ( ) . size ( ) ) ;
c = 0 ;
Iterator < String > i = score . keys ( false ) ;
while ( i . hasNext ( ) & & c < maxcount ) {
host = i . next ( ) ;
2013-11-22 09:53:32 +01:00
prop . putHTML ( " outbound_list_ " + c + " _host " , host ) ;
2012-10-16 17:13:18 +02:00
prop . put ( " outbound_list_ " + c + " _count " , score . get ( host ) ) ;
prop . put ( " outbound_list_ " + c + " _link " , outboundHosts . get ( host ) . getMinKey ( ) ) ;
2014-03-31 18:19:24 +02:00
prop . put ( " outbound_list_ " + c + " _admin " , admin ? " true " : " false " ) ;
2012-10-16 17:13:18 +02:00
c + + ;
}
prop . put ( " outbound_list " , c ) ;
prop . put ( " outbound " , 1 ) ;
} else {
prop . put ( " outbound " , 0 ) ;
}
2013-07-17 18:31:30 +02:00
} catch ( final Throwable e ) {
2013-07-09 14:28:25 +02:00
ConcurrentLog . logException ( e ) ;
2012-09-21 15:48:40 +02:00
}
}
// return rewrite properties
2013-05-08 11:50:46 +02:00
prop . putNum ( " ucount " , fulltext . collectionSize ( ) ) ;
2012-09-21 15:48:40 +02:00
return prop ;
}
2013-04-14 11:30:57 +02:00
public static final class InfoCacheEntry {
2013-06-07 13:20:57 +02:00
public Integer cr_n ;
public Double cr_c ;
2014-04-16 22:16:20 +02:00
public int crawldepth , references , references_internal , references_external , references_exthosts ;
2013-06-13 13:01:28 +02:00
public List < String > references_internal_urls , references_external_urls ;
public InfoCacheEntry ( final Fulltext fulltext , final ReferenceReportCache rrCache , final SolrDocument doc , final String urlhash , boolean fetchReferences ) {
2013-06-07 13:20:57 +02:00
this . cr_c = ( Double ) doc . getFieldValue ( CollectionSchema . cr_host_chance_d . getSolrFieldName ( ) ) ;
2014-04-02 23:37:01 +02:00
this . cr_n = ( Integer ) doc . getFieldValue ( CollectionSchema . cr_host_norm_i . getSolrFieldName ( ) ) ;
Integer cr = ( Integer ) doc . getFieldValue ( CollectionSchema . crawldepth_i . getSolrFieldName ( ) ) ;
2013-04-14 11:30:57 +02:00
Integer rc = ( Integer ) doc . getFieldValue ( CollectionSchema . references_i . getSolrFieldName ( ) ) ;
Integer rc_internal = ( Integer ) doc . getFieldValue ( CollectionSchema . references_internal_i . getSolrFieldName ( ) ) ;
Integer rc_external = ( Integer ) doc . getFieldValue ( CollectionSchema . references_external_i . getSolrFieldName ( ) ) ;
Integer rc_exthosts = ( Integer ) doc . getFieldValue ( CollectionSchema . references_exthosts_i . getSolrFieldName ( ) ) ;
2014-04-22 19:48:49 +02:00
this . crawldepth = ( cr = = null | | cr . intValue ( ) < 0 ) ? 0 : cr . intValue ( ) ; // for lazy value storage; non-existent means: stored as '0'
2013-04-14 11:30:57 +02:00
this . references = ( rc = = null | | rc . intValue ( ) < = 0 ) ? 0 : rc . intValue ( ) ;
this . references_internal = ( rc_internal = = null | | rc_internal . intValue ( ) < = 0 ) ? 0 : rc_internal . intValue ( ) ;
2013-06-07 13:20:57 +02:00
// calculate the url reference list
this . references_internal_urls = new ArrayList < String > ( ) ;
2013-06-13 13:01:28 +02:00
this . references_external_urls = new ArrayList < String > ( ) ;
if ( fetchReferences ) {
// get the references from the citation index
try {
2014-02-28 14:01:09 +01:00
ReferenceReport rr = rrCache . getReferenceReport ( urlhash , false ) ;
2013-06-13 13:01:28 +02:00
List < String > internalIDs = new ArrayList < String > ( ) ;
List < String > externalIDs = new ArrayList < String > ( ) ;
HandleSet iids = rr . getInternallIDs ( ) ;
for ( byte [ ] b : iids ) internalIDs . add ( ASCII . String ( b ) ) ;
HandleSet eids = rr . getExternalIDs ( ) ;
for ( byte [ ] b : eids ) externalIDs . add ( ASCII . String ( b ) ) ;
// get all urls from the index and store them here
for ( String id : internalIDs ) {
if ( id . equals ( urlhash ) ) continue ; // no self-references
2014-02-24 21:01:56 +01:00
DigestURL u = fulltext . getURL ( id ) ;
2013-06-13 13:01:28 +02:00
if ( u ! = null ) references_internal_urls . add ( u . toNormalform ( true ) ) ;
}
for ( String id : externalIDs ) {
if ( id . equals ( urlhash ) ) continue ; // no self-references
2014-02-24 21:01:56 +01:00
DigestURL u = fulltext . getURL ( id ) ;
2013-06-13 13:01:28 +02:00
if ( u ! = null ) references_external_urls . add ( u . toNormalform ( true ) ) ;
}
2013-07-17 18:31:30 +02:00
} catch ( final IOException e ) {
2013-06-07 13:20:57 +02:00
}
}
2013-04-14 11:30:57 +02:00
this . references_external = ( rc_external = = null | | rc_external . intValue ( ) < = 0 ) ? 0 : rc_external . intValue ( ) ;
this . references_exthosts = ( rc_exthosts = = null | | rc_exthosts . intValue ( ) < = 0 ) ? 0 : rc_exthosts . intValue ( ) ;
}
2014-03-28 13:48:37 +01:00
@Override
2013-06-07 13:20:57 +02:00
public String toString ( ) {
2013-06-13 13:01:28 +02:00
StringBuilder sbi = new StringBuilder ( ) ;
int c = 0 ;
for ( String s : references_internal_urls ) {
2013-11-06 18:05:46 +01:00
sbi . append ( " <a href=' " ) . append ( s ) . append ( " ' target='_blank'><img src='env/grafics/i16.gif' alt='info' title=' " + s + " ' width='12' height='12'/></a> " ) ;
2013-06-13 13:01:28 +02:00
c + + ;
if ( c % 80 = = 0 ) sbi . append ( " <br/> " ) ;
}
if ( sbi . length ( ) > 0 ) sbi . insert ( 0 , " <br/>internal referrer:</br> " ) ;
StringBuilder sbe = new StringBuilder ( ) ;
c = 0 ;
for ( String s : references_external_urls ) {
2013-11-06 18:05:46 +01:00
sbe . append ( " <a href=' " ) . append ( s ) . append ( " ' target='_blank'><img src='env/grafics/i16.gif' alt='info' title=' " + s + " ' width='12' height='12'/></a> " ) ;
2013-06-13 13:01:28 +02:00
c + + ;
if ( c % 80 = = 0 ) sbe . append ( " <br/> " ) ;
}
if ( sbe . length ( ) > 0 ) sbe . insert ( 0 , " <br/>external referrer:</br> " ) ;
2013-06-07 13:20:57 +02:00
return
2014-04-17 13:21:43 +02:00
( this . crawldepth = = 998 ? " unknown crawldepth " : this . crawldepth > = 0 ? " crawldepth: " + this . crawldepth : " " ) +
2013-06-07 13:20:57 +02:00
( this . cr_c ! = null ? " , cr= " + ( Math . round ( this . cr_c * 1000 . 0d ) / 1000 . 0d ) : " " ) +
( this . cr_n ! = null ? " , crn= " + this . cr_n : " " ) +
2014-04-02 23:37:01 +02:00
( this . references > = 0 ? " , refs: " + this . references_exthosts + " hosts, " + this . references_external + " ext, " + this . references_internal + " int " + sbi . toString ( ) + sbe . toString ( ) : " " ) ;
2013-06-07 13:20:57 +02:00
}
2013-04-14 11:30:57 +02:00
}
2012-09-21 15:48:40 +02:00
}