/**
 *  HostBrowser
 *  Copyright 2012 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
 *  First released 27.09.2012 at http://yacy.net
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.BlockingQueue;

import org.apache.solr.common.SolrDocument;

import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.federate.solr.YaCySchema;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.sorting.ClusteredScoreMap;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.crawler.data.NoticedURL.StackType;
import net.yacy.crawler.retrieval.Request;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.logging.Log;
import net.yacy.peers.graphics.WebStructureGraph.StructureEntry;
import net.yacy.search.Switchboard;
import net.yacy.search.index.Fulltext;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;

public class HostBrowser {

    public static enum StoreType {
        LINK, INDEX, ERROR;
    }

    public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
        // return variable that accumulates replacements
        final Switchboard sb = (Switchboard) env;
        Fulltext fulltext = sb.index.fulltext();
        final boolean admin = sb.verifyAuthentication(header);
        final boolean autoload = admin && sb.getConfigBool("browser.autoload", true);
        final boolean load4everyone = sb.getConfigBool("browser.load4everyone", false);
        final boolean loadRight = autoload || load4everyone; // add config later
        final boolean searchAllowed = sb.getConfigBool("publicSearchpage", true) || admin;
        final serverObjects prop = new serverObjects();

        // set default values
        prop.put("path", "");
        prop.put("result", "");
        prop.putNum("ucount", fulltext.size());
        prop.put("hosts", 0);
        prop.put("files", 0);
        prop.put("admin", 0);

        if (!searchAllowed) {
            prop.put("result", "You are not allowed to use this page. Please ask an administrator for permission.");
            return prop;
        }

        String path = post == null ? "" : post.get("path", "").trim();
        if (admin && path.length() == 0) sb.index.fulltext().commit();
        if (post == null || env == null) {
            return prop;
        }

        int p = path.lastIndexOf('/');
        if (p < 0 && path.length() > 0) path = path + "/"; else if (p > 7) path = path.substring(0, p + 1); // the search path shall always end with "/"
        if (path.length() > 0 && (
                !path.startsWith("http://") &&
                !path.startsWith("https://") &&
                !path.startsWith("ftp://") &&
                !path.startsWith("smb://") &&
                !path.startsWith("file://"))) { path = "http://" + path; }
        prop.putHTML("path", path);
        prop.put("delete", admin && path.length() > 0 ? 1 : 0);

        DigestURI pathURI = null;
        try { pathURI = new DigestURI(path); } catch (MalformedURLException e) { }

        String load = post.get("load", "");
        boolean wait = false;
        if (loadRight && autoload && path.length() != 0 && pathURI != null && load.length() == 0 && !sb.index.exists(pathURI.hash())) {
            // in case that the url does not exist and loading is wanted turn this request into a loading request
            load = path;
            wait = true;
        }
        if (load.length() > 0 && loadRight) {
            // stack URL
            DigestURI url;
            if (sb.crawlStacker.size() > 2) wait = false;
            try {
                url = new DigestURI(load);
                String reasonString = sb.crawlStacker.stackCrawl(new Request(
                        sb.peers.mySeed().hash.getBytes(),
                        url, null, load, new Date(),
                        sb.crawler.defaultProxyProfile.handle(),
                        0, 0, 0, 0
                        ));
                prop.put("result", reasonString == null ? ("added url to indexer: " + load) : ("not indexed url '" + load + "': " + reasonString));
                if (wait) for (int i = 0; i < 30; i++) {
                    if (sb.index.exists(url.hash())) break;
                    try { Thread.sleep(100); } catch (InterruptedException e) { }
                }
            } catch (MalformedURLException e) {
                prop.put("result", "bad url '" + load + "'");
            }
        }

        if (post.containsKey("hosts")) {
            // generate host list
            try {
                int maxcount = admin ? 2 * 3 * 2 * 5 * 7 * 2 * 3 : 360; // which makes nice matrices for 2, 3, 4, 5, 6, 7, 8, 9 rows/columns

                // collect hosts from index
                ReversibleScoreMap<String> hostscore = fulltext.getSolr().getFacets("*:*", new String[]{YaCySchema.host_s.getSolrFieldName()}, maxcount).get(YaCySchema.host_s.getSolrFieldName());
                if (hostscore == null) hostscore = new ClusteredScoreMap<String>();

                // collect hosts from crawler
                final Map<String, Integer[]> crawler = (admin) ? sb.crawlQueues.noticeURL.getDomainStackHosts(StackType.LOCAL, sb.robots) : new HashMap<String, Integer[]>();
                for (Map.Entry<String, Integer[]> host: crawler.entrySet()) {
                    hostscore.inc(host.getKey(), host.getValue()[0]);
                }

                // collect the errorurls
                ReversibleScoreMap<String> errorscore = admin ? fulltext.getSolr().getFacets(YaCySchema.failreason_t.getSolrFieldName() + ":[* TO *]", new String[]{YaCySchema.host_s.getSolrFieldName()}, maxcount).get(YaCySchema.host_s.getSolrFieldName()) : null;
                if (errorscore == null) errorscore = new ClusteredScoreMap<String>();
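
                // write the host list into the servlet template; the count shown per host is the number of indexed documents minus the error documents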
                int c = 0;
                Iterator<String> i = hostscore.keys(false);
                String host;
                while (i.hasNext() && c < maxcount) {
                    host = i.next();
                    int errors = errorscore.get(host);
                    prop.put("hosts_list_" + c + "_host", host);
                    prop.put("hosts_list_" + c + "_count", hostscore.get(host) - errors);
                    boolean inCrawler = crawler.containsKey(host);
                    prop.put("hosts_list_" + c + "_crawler", inCrawler ? 1 : 0);
                    if (inCrawler) prop.put("hosts_list_" + c + "_crawler_pending", crawler.get(host)[0]);
                    prop.put("hosts_list_" + c + "_errors", errors > 0 ? 1 : 0);
                    if (errors > 0) prop.put("hosts_list_" + c + "_errors_count", errors);
                    c++;
                }
                prop.put("hosts_list", c);
                prop.put("hosts", 1);
            } catch (IOException e) {
                Log.logException(e);
            }
        }

        if (path.length() > 0) {
            boolean delete = false;
            if (admin && post.containsKey("delete")) {
                // delete the complete path!! That includes everything that matches with this prefix.
                delete = true;
            }
            int facetcount = post.getInt("facetcount", 0);
            boolean complete = post.getBoolean("complete");
            if (complete) { // we want only root paths for complete lists
                p = path.indexOf('/', 10);
                if (p > 0) path = path.substring(0, p + 1);
            }
            prop.put("files_complete", complete ? 1 : 0);
            prop.put("files_complete_path", path);
            p = path.substring(0, path.length() - 1).lastIndexOf('/');
            if (p < 8) {
                prop.put("files_root", 1);
            } else {
                prop.put("files_root", 0);
                prop.put("files_root_path", path.substring(0, p + 1));
            }

            try {
                // generate file list from path
                DigestURI uri = new DigestURI(path);
                String host = uri.getHost();
                prop.putHTML("outbound_host", host);
                prop.putHTML("inbound_host", host);
                String hosthash = ASCII.String(uri.hash(), 6, 6);
                String[] pathparts = uri.getPaths();

                // get all files for a specific host from the index
                StringBuilder q = new StringBuilder();
                q.append(YaCySchema.host_s.getSolrFieldName()).append(':').append(host);
                if (pathparts.length > 0 && pathparts[0].length() > 0) {
                    for (String pe: pathparts) {
                        if (pe.length() > 0) q.append(" AND ").append(YaCySchema.url_paths_sxt.getSolrFieldName()).append(':').append(pe);
                    }
                } else {
                    if (facetcount > 1000 || post.containsKey("nepr")) {
                        q.append(" AND ").append(YaCySchema.url_paths_sxt.getSolrFieldName()).append(":[* TO *]");
                    }
                }
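                // hypothetical example: browsing "http://example.org/docs/" yields a query like "host_s:example.org AND url_paths_sxt:docs";
                // the concurrent query below streams matching documents into a blocking queue terminated by a poison document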
                BlockingQueue<SolrDocument> docs = fulltext.getSolr().concurrentQuery(q.toString(), 0, 100000, 10000, 100);
                SolrDocument doc;
                Set<String> storedDocs = new HashSet<String>();
                Map<String, String> errorDocs = new HashMap<String, String>();
                Set<String> inboundLinks = new HashSet<String>();
                Map<String, ReversibleScoreMap<String>> outboundHosts = new HashMap<String, ReversibleScoreMap<String>>();
                int hostsize = 0;
                final List<byte[]> deleteIDs = new ArrayList<byte[]>();
                long timeout = System.currentTimeMillis() + 10000;
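                // drain the result queue until the poison document or the timeout is reached; sort every url into the stored, error, inbound-link and outbound-link collections (or mark it for deletion)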
                while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
                    String u = (String) doc.getFieldValue(YaCySchema.sku.getSolrFieldName());
                    String error = (String) doc.getFieldValue(YaCySchema.failreason_t.getSolrFieldName());
                    if (u.startsWith(path)) {
                        if (delete) {
                            deleteIDs.add(ASCII.getBytes((String) doc.getFieldValue(YaCySchema.id.name())));
                        } else {
                            if (error == null) storedDocs.add(u); else if (admin) errorDocs.put(u, error);
                        }
                    } else if (complete) {
                        if (error == null) storedDocs.add(u); else if (admin) errorDocs.put(u, error);
                    }
                    if (error == null) {
                        hostsize++;
                        // collect inboundlinks to browse the host
                        Iterator<String> links = URIMetadataNode.getLinks(doc, true);
                        while (links.hasNext()) {
                            u = links.next();
                            if ((complete || u.startsWith(path)) && !storedDocs.contains(u)) inboundLinks.add(u);
                        }
                        // collect outboundlinks to browse to the outbound
                        links = URIMetadataNode.getLinks(doc, false);
                        while (links.hasNext()) {
                            u = links.next();
                            try {
                                MultiProtocolURI mu = new MultiProtocolURI(u);
                                if (mu.getHost() != null) {
                                    ReversibleScoreMap<String> lks = outboundHosts.get(mu.getHost());
                                    if (lks == null) {
                                        lks = new ClusteredScoreMap<String>(UTF8.insensitiveUTF8Comparator);
                                        outboundHosts.put(mu.getHost(), lks);
                                    }
                                    lks.set(u, u.length());
                                }
                            } catch (MalformedURLException e) { }
                        }
                    }
                    if (System.currentTimeMillis() > timeout) break;
                }
                if (deleteIDs.size() > 0) {
                    for (byte[] b: deleteIDs) sb.crawlQueues.urlRemove(b);
                    sb.index.fulltext().remove(deleteIDs, true);
                }

                // collect from crawler
                List<Request> domainStackReferences = (admin) ? sb.crawlQueues.noticeURL.getDomainStackReferences(StackType.LOCAL, host, 1000, 3000) : new ArrayList<Request>(0);
                Set<String> loadingLinks = new HashSet<String>();
                for (Request crawlEntry: domainStackReferences) loadingLinks.add(crawlEntry.url().toNormalform(true));

                // now combine all lists into one
                Map<String, StoreType> files = new HashMap<String, StoreType>();
                for (String u: storedDocs) files.put(u, StoreType.INDEX);
                for (String u: errorDocs.keySet()) files.put(u, StoreType.ERROR);
                for (String u: inboundLinks) if (!storedDocs.contains(u)) files.put(u, StoreType.LINK);
                for (String u: loadingLinks) if (u.startsWith(path) && !storedDocs.contains(u)) files.put(u, StoreType.LINK);
                Log.logInfo("HostBrowser", "collected " + files.size() + " urls for path " + path);

                // distinguish files and folders
                Map<String, Object> list = new TreeMap<String, Object>(); // a directory list; if the value is a StoreType, the entry is a file; if it is an int[], the entry is a folder
                int pl = path.length();
                String file;
                for (Map.Entry<String, StoreType> entry: files.entrySet()) {
                    if (entry.getKey().length() < pl) continue; // this is not inside the path
                    if (!entry.getKey().startsWith(path)) continue;
                    file = entry.getKey().substring(pl);
                    StoreType type = entry.getValue();
                    p = file.indexOf('/');
                    if (p < 0) {
                        // this is a file
                        list.put(entry.getKey(), type); // StoreType value: INDEX -> stored in the index; ERROR -> load failed; LINK -> only linked, maybe in the crawler
                    } else {
                        // this is a directory path or a file in a subdirectory
                        String remainingPath = file.substring(0, p + 1);
                        if (complete && remainingPath.indexOf('.') > 0) {
                            list.put(entry.getKey(), type); // StoreType value: this is a file
                        } else {
                            String dir = path + remainingPath;
                            Object c = list.get(dir);
                            if (c == null) {
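                                // per-folder counters: [0] linked only, [1] stored in index, [2] pending in crawler, [3] load errors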
                                int[] linkedStoredIncrawlerError = new int[]{0, 0, 0, 0};
                                if (type == StoreType.LINK) linkedStoredIncrawlerError[0]++;
                                if (type == StoreType.INDEX) linkedStoredIncrawlerError[1]++;
                                if (loadingLinks.contains(entry.getKey())) linkedStoredIncrawlerError[2]++;
                                if (errorDocs.containsKey(entry.getKey())) linkedStoredIncrawlerError[3]++;
                                list.put(dir, linkedStoredIncrawlerError);
                            } else if (c instanceof int[]) {
                                if (type == StoreType.LINK) ((int[]) c)[0]++;
                                if (type == StoreType.INDEX) ((int[]) c)[1]++;
                                if (loadingLinks.contains(entry.getKey())) ((int[]) c)[2]++;
                                if (errorDocs.containsKey(entry.getKey())) ((int[]) c)[3]++;
                            }
                        }
                    }
                }

                int maxcount = 1000;
                int c = 0;
                for (Map.Entry<String, Object> entry: list.entrySet()) {
                    if (entry.getValue() instanceof StoreType) {
                        // this is a file
                        prop.put("files_list_" + c + "_type", 0);
                        prop.put("files_list_" + c + "_type_url", entry.getKey());
                        StoreType type = (StoreType) entry.getValue();
                        try { uri = new DigestURI(entry.getKey()); } catch (MalformedURLException e) { uri = null; }
                        boolean loading = load.equals(entry.getKey()) || (uri != null && sb.crawlQueues.urlExists(uri.hash()) != null);
                        //String failr = fulltext.failReason(ASCII.String(uri.hash()));
                        prop.put("files_list_" + c + "_type_stored", type == StoreType.INDEX ? 1 : loading ? 2 : type == StoreType.ERROR ? 3 : 0 /*linked*/);
                        prop.put("files_list_" + c + "_type_stored_load", loadRight ? 1 : 0);
                        if (type == StoreType.ERROR) prop.put("files_list_" + c + "_type_stored_error", errorDocs.get(entry.getKey()));
                        if (loadRight) {
                            prop.put("files_list_" + c + "_type_stored_load_url", entry.getKey());
                            prop.put("files_list_" + c + "_type_stored_load_path", path);
                        }
                    } else {
                        // this is a folder
                        prop.put("files_list_" + c + "_type", 1);
                        prop.put("files_list_" + c + "_type_url", entry.getKey());
                        int linked = ((int[]) entry.getValue())[0];
                        int stored = ((int[]) entry.getValue())[1];
                        int crawler = ((int[]) entry.getValue())[2];
                        int error = ((int[]) entry.getValue())[3];
                        prop.put("files_list_" + c + "_type_count", stored + " stored / " + linked + " linked" + (crawler > 0 ? (" / " + crawler + " pending") : "") + (error > 0 ? (" / " + error + " errors") : ""));
                    }
                    if (++c >= maxcount) break;
                }
                prop.put("files_list", c);
                prop.putHTML("files_path", path);
                prop.put("files_hostsize", hostsize);
                prop.put("files_subpathsize", storedDocs.size());
                prop.put("files", 1);

                // generate inbound-links table
                StructureEntry struct = sb.webStructure.incomingReferences(hosthash);
                if (struct != null && struct.references.size() > 0) {
                    maxcount = 200;
                    ReversibleScoreMap<String> score = new ClusteredScoreMap<String>(UTF8.insensitiveUTF8Comparator);
                    for (Map.Entry<String, Integer> entry: struct.references.entrySet()) score.set(entry.getKey(), entry.getValue());
                    c = 0;
                    Iterator<String> i = score.keys(false);
                    while (i.hasNext() && c < maxcount) {
                        host = i.next();
                        prop.put("inbound_list_" + c + "_host", sb.webStructure.hostHash2hostName(host));
                        prop.put("inbound_list_" + c + "_count", score.get(host));
                        c++;
                    }
                    prop.put("inbound_list", c);
                    prop.put("inbound", 1);
                } else {
                    prop.put("inbound", 0);
                }

                // generate outbound-links table
                if (outboundHosts.size() > 0) {
                    maxcount = 200;
                    ReversibleScoreMap<String> score = new ClusteredScoreMap<String>(UTF8.insensitiveUTF8Comparator);
                    for (Map.Entry<String, ReversibleScoreMap<String>> entry: outboundHosts.entrySet()) score.set(entry.getKey(), entry.getValue().size());
                    c = 0;
                    Iterator<String> i = score.keys(false);
                    while (i.hasNext() && c < maxcount) {
                        host = i.next();
                        prop.put("outbound_list_" + c + "_host", host);
                        prop.put("outbound_list_" + c + "_count", score.get(host));
                        prop.put("outbound_list_" + c + "_link", outboundHosts.get(host).getMinKey());
                        c++;
                    }
                    prop.put("outbound_list", c);
                    prop.put("outbound", 1);
                } else {
                    prop.put("outbound", 0);
                }

            } catch (Throwable e) {
                Log.logException(e);
            }
        }

        // insert constants
        prop.putNum("ucount", fulltext.size());

        // return rewrite properties
        return prop;
    }
}