2005-10-05 16:11:50 +02:00
// IndexControl_p.java
2005-04-07 21:19:42 +02:00
// -----------------------
// part of the AnomicHTTPD caching proxy
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
2005-10-05 16:11:50 +02:00
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
2005-04-07 21:19:42 +02:00
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
2005-05-12 19:50:45 +02:00
// You must compile this file with
2005-10-05 16:11:50 +02:00
// javac -classpath .:../classes IndexControl_p.java
2005-04-07 21:19:42 +02:00
// if the shell's current path is HTROOT
2005-05-05 07:36:42 +02:00
import java.io.IOException ;
2005-12-15 11:31:00 +01:00
import java.net.MalformedURLException ;
2005-05-05 07:36:42 +02:00
import java.net.URL ;
import java.util.Enumeration ;
import java.util.HashSet ;
2005-08-13 00:14:24 +02:00
import java.util.HashMap ;
2005-05-05 07:36:42 +02:00
import java.util.Iterator ;
2005-10-05 16:11:50 +02:00
import java.util.TreeMap ;
2005-10-12 14:28:49 +02:00
2005-05-05 07:36:42 +02:00
import de.anomic.htmlFilter.htmlFilterContentScraper ;
import de.anomic.http.httpHeader ;
import de.anomic.plasma.plasmaCrawlLURL ;
import de.anomic.plasma.plasmaSwitchboard ;
import de.anomic.plasma.plasmaURL ;
import de.anomic.plasma.plasmaWordIndexEntity ;
import de.anomic.plasma.plasmaWordIndexEntry ;
import de.anomic.server.serverObjects ;
import de.anomic.server.serverSwitch ;
import de.anomic.yacy.yacyClient ;
import de.anomic.yacy.yacyCore ;
import de.anomic.yacy.yacySeed ;
2005-04-07 21:19:42 +02:00
public class IndexControl_p {
public static serverObjects respond ( httpHeader header , serverObjects post , serverSwitch env ) {
2005-10-05 16:11:50 +02:00
// return variable that accumulates replacements
2005-04-07 21:19:42 +02:00
plasmaSwitchboard switchboard = ( plasmaSwitchboard ) env ;
2005-10-05 16:11:50 +02:00
serverObjects prop = new serverObjects ( ) ;
2005-04-07 21:19:42 +02:00
2005-10-05 16:11:50 +02:00
if ( post = = null | | env = = null ) {
2005-04-07 21:19:42 +02:00
prop . put ( " keystring " , " " ) ;
prop . put ( " keyhash " , " " ) ;
prop . put ( " urlstring " , " " ) ;
prop . put ( " urlhash " , " " ) ;
prop . put ( " result " , " " ) ;
2005-08-03 15:43:55 +02:00
prop . put ( " wcount " , Integer . toString ( switchboard . wordIndex . size ( ) ) ) ;
prop . put ( " ucount " , Integer . toString ( switchboard . urlPool . loadedURL . size ( ) ) ) ;
2005-04-07 21:19:42 +02:00
prop . put ( " otherHosts " , " " ) ;
prop . put ( " indexDistributeChecked " , ( switchboard . getConfig ( " allowDistributeIndex " , " true " ) . equals ( " true " ) ) ? " checked " : " " ) ;
2005-10-05 16:11:50 +02:00
prop . put ( " indexDistributeWhileCrawling " , ( switchboard . getConfig ( " allowDistributeIndexWhileCrawling " , " true " ) . equals ( " true " ) ) ? " checked " : " " ) ;
2005-04-07 21:19:42 +02:00
prop . put ( " indexReceiveChecked " , ( switchboard . getConfig ( " allowReceiveIndex " , " true " ) . equals ( " true " ) ) ? " checked " : " " ) ;
2005-07-11 17:36:10 +02:00
prop . put ( " indexReceiveBlockBlacklistChecked " , ( switchboard . getConfig ( " indexReceiveBlockBlacklist " , " true " ) . equals ( " true " ) ) ? " checked " : " " ) ;
2005-04-07 21:19:42 +02:00
return prop ; // be save
}
2005-10-05 16:11:50 +02:00
2005-04-07 21:19:42 +02:00
// default values
2005-12-15 11:31:00 +01:00
String keystring = ( ( String ) post . get ( " keystring " , " " ) ) . trim ( ) ;
String keyhash = ( ( String ) post . get ( " keyhash " , " " ) ) . trim ( ) ;
String urlstring = ( ( String ) post . get ( " urlstring " , " " ) ) . trim ( ) ;
String urlhash = ( ( String ) post . get ( " urlhash " , " " ) ) . trim ( ) ;
2005-10-05 16:11:50 +02:00
if ( ! urlstring . startsWith ( " http:// " ) & &
! urlstring . startsWith ( " https:// " ) ) { urlstring = " http:// " + urlstring ; }
2005-04-07 21:19:42 +02:00
prop . put ( " keystring " , keystring ) ;
prop . put ( " keyhash " , keyhash ) ;
prop . put ( " urlstring " , urlstring ) ;
prop . put ( " urlhash " , urlhash ) ;
prop . put ( " result " , " " ) ;
2005-10-05 16:11:50 +02:00
2005-04-07 21:19:42 +02:00
// read values from checkboxes
String [ ] urlx = post . getAll ( " urlhx.* " ) ;
boolean delurl = post . containsKey ( " delurl " ) ;
boolean delurlref = post . containsKey ( " delurlref " ) ;
2005-10-05 16:11:50 +02:00
// System.out.println("DEBUG CHECK: " + ((delurl) ? "delurl" : "") + " " + ((delurlref) ? "delurlref" : ""));
// DHT control
2005-07-28 00:52:29 +02:00
if ( post . containsKey ( " setIndexTransmission " ) ) {
2005-10-05 16:11:50 +02:00
if ( post . get ( " indexDistribute " , " " ) . equals ( " on " ) ) {
switchboard . setConfig ( " allowDistributeIndex " , " true " ) ;
switchboard . indexDistribution . enable ( ) ;
} else {
switchboard . setConfig ( " allowDistributeIndex " , " false " ) ;
switchboard . indexDistribution . disable ( ) ;
}
if ( post . containsKey ( " indexDistributeWhileCrawling " ) ) {
switchboard . setConfig ( " allowDistributeIndexWhileCrawling " , " true " ) ;
switchboard . indexDistribution . enableWhileCrawling ( ) ;
} else {
switchboard . setConfig ( " allowDistributeIndexWhileCrawling " , " false " ) ;
switchboard . indexDistribution . disableWhileCrawling ( ) ;
}
if ( post . get ( " indexReceive " , " " ) . equals ( " on " ) ) {
switchboard . setConfig ( " allowReceiveIndex " , " true " ) ;
yacyCore . seedDB . mySeed . setFlagAcceptRemoteIndex ( true ) ;
} else {
switchboard . setConfig ( " allowReceiveIndex " , " false " ) ;
yacyCore . seedDB . mySeed . setFlagAcceptRemoteIndex ( false ) ;
}
if ( post . get ( " indexReceiveBlockBlacklist " , " " ) . equals ( " on " ) ) {
switchboard . setConfig ( " indexReceiveBlockBlacklist " , " true " ) ;
} else {
switchboard . setConfig ( " indexReceiveBlockBlacklist " , " false " ) ;
}
2005-04-07 21:19:42 +02:00
}
2005-10-05 16:11:50 +02:00
// delete word
2005-04-07 21:19:42 +02:00
if ( post . containsKey ( " keyhashdeleteall " ) ) {
2005-10-05 16:11:50 +02:00
if ( delurl | | delurlref ) {
2005-05-07 23:11:18 +02:00
// generate an urlx array
2005-10-05 12:45:33 +02:00
plasmaWordIndexEntity index = null ;
2005-05-07 23:11:18 +02:00
try {
2005-10-23 19:50:27 +02:00
index = switchboard . wordIndex . getEntity ( keyhash , true , - 1 ) ;
2005-10-13 15:57:15 +02:00
Iterator en = index . elements ( true ) ;
2005-05-07 23:11:18 +02:00
int i = 0 ;
urlx = new String [ index . size ( ) ] ;
2005-10-13 15:57:15 +02:00
while ( en . hasNext ( ) ) {
urlx [ i + + ] = ( ( plasmaWordIndexEntry ) en . next ( ) ) . getUrlHash ( ) ;
2005-10-05 16:11:50 +02:00
}
index . close ( ) ;
index = null ;
2005-05-07 23:11:18 +02:00
} catch ( IOException e ) {
urlx = new String [ 0 ] ;
2005-10-05 12:45:33 +02:00
} finally {
if ( index ! = null ) try { index . close ( ) ; } catch ( Exception e ) { }
2005-04-07 21:19:42 +02:00
}
2005-05-07 23:11:18 +02:00
}
2005-10-05 16:11:50 +02:00
if ( delurlref ) {
2005-12-15 11:31:00 +01:00
for ( int i = 0 ; i < urlx . length ; i + + ) switchboard . removeAllUrlReferences ( urlx [ i ] , true ) ;
2005-10-05 16:11:50 +02:00
}
if ( delurl | | delurlref ) {
for ( int i = 0 ; i < urlx . length ; i + + ) {
switchboard . urlPool . loadedURL . remove ( urlx [ i ] ) ;
}
}
2005-05-07 23:11:18 +02:00
switchboard . wordIndex . deleteIndex ( keyhash ) ;
2005-04-07 21:19:42 +02:00
post . remove ( " keyhashdeleteall " ) ;
2005-10-05 16:11:50 +02:00
if ( keystring . length ( ) > 0 & &
plasmaWordIndexEntry . word2hash ( keystring ) . equals ( keyhash ) ) {
2005-04-07 21:19:42 +02:00
post . put ( " keystringsearch " , " generated " ) ;
2005-10-05 16:11:50 +02:00
} else {
2005-04-07 21:19:42 +02:00
post . put ( " keyhashsearch " , " generated " ) ;
2005-10-05 16:11:50 +02:00
}
2005-04-07 21:19:42 +02:00
}
2005-10-05 16:11:50 +02:00
// delete selected URLs
2005-04-07 21:19:42 +02:00
if ( post . containsKey ( " keyhashdelete " ) ) {
2005-10-05 16:11:50 +02:00
if ( delurlref ) {
2005-12-15 11:31:00 +01:00
for ( int i = 0 ; i < urlx . length ; i + + ) switchboard . removeAllUrlReferences ( urlx [ i ] , true ) ;
2005-10-05 16:11:50 +02:00
}
if ( delurl | | delurlref ) {
for ( int i = 0 ; i < urlx . length ; i + + ) {
switchboard . urlPool . loadedURL . remove ( urlx [ i ] ) ;
}
}
2005-05-07 23:11:18 +02:00
switchboard . wordIndex . removeEntries ( keyhash , urlx , true ) ;
2005-04-07 21:19:42 +02:00
// this shall lead to a presentation of the list; so handle that the remaining program
// thinks that it was called for a list presentation
post . remove ( " keyhashdelete " ) ;
2005-10-05 16:11:50 +02:00
if ( keystring . length ( ) > 0 & &
plasmaWordIndexEntry . word2hash ( keystring ) . equals ( keyhash ) ) {
2005-04-07 21:19:42 +02:00
post . put ( " keystringsearch " , " generated " ) ;
2005-10-05 16:11:50 +02:00
} else {
2005-04-07 21:19:42 +02:00
post . put ( " keyhashsearch " , " generated " ) ;
2005-10-05 16:11:50 +02:00
// prop.put("result", "Delete of relation of url hashes " + result + " to key hash " + keyhash);
}
2005-04-07 21:19:42 +02:00
}
2005-10-05 16:11:50 +02:00
2005-04-07 21:19:42 +02:00
if ( post . containsKey ( " urlhashdeleteall " ) ) {
2005-12-15 11:31:00 +01:00
//try {
2005-12-11 01:25:02 +01:00
int i = switchboard . removeAllUrlReferences ( urlhash , true ) ;
prop . put ( " result " , " Deleted URL and " + i + " references from " + i + " word indexes. " ) ;
2005-12-15 11:31:00 +01:00
//} catch (IOException e) {
// prop.put("result", "Deleted nothing because the url-hash could not be resolved");
//}
2005-04-07 21:19:42 +02:00
}
2005-10-05 16:11:50 +02:00
2005-04-07 21:19:42 +02:00
if ( post . containsKey ( " urlhashdelete " ) ) {
2005-12-11 01:25:02 +01:00
try {
2006-01-20 16:14:21 +01:00
plasmaCrawlLURL . Entry entry = switchboard . urlPool . loadedURL . getEntry ( urlhash , null ) ;
2005-12-11 01:25:02 +01:00
URL url = entry . url ( ) ;
2005-04-07 21:19:42 +02:00
urlstring = htmlFilterContentScraper . urlNormalform ( url ) ;
prop . put ( " urlstring " , " " ) ;
2005-06-16 02:31:13 +02:00
switchboard . urlPool . loadedURL . remove ( urlhash ) ;
2005-04-07 21:19:42 +02:00
prop . put ( " result " , " Removed URL " + urlstring ) ;
2005-12-11 01:25:02 +01:00
} catch ( IOException e ) {
prop . put ( " result " , " No Entry for URL hash " + urlhash + " ; nothing deleted. " ) ;
2005-04-07 21:19:42 +02:00
}
}
if ( post . containsKey ( " keystringsearch " ) ) {
keyhash = plasmaWordIndexEntry . word2hash ( keystring ) ;
prop . put ( " keyhash " , keyhash ) ;
prop . put ( " urlstring " , " " ) ;
prop . put ( " urlhash " , " " ) ;
prop . put ( " result " , genUrlList ( switchboard , keyhash , keystring ) ) ;
}
2005-10-05 16:11:50 +02:00
2005-04-07 21:19:42 +02:00
if ( post . containsKey ( " keyhashsearch " ) ) {
2005-10-05 16:11:50 +02:00
if ( keystring . length ( ) = = 0 | |
! plasmaWordIndexEntry . word2hash ( keystring ) . equals ( keyhash ) ) {
2005-04-07 21:19:42 +02:00
prop . put ( " keystring " , " <not possible to compute word from hash> " ) ;
2005-10-05 16:11:50 +02:00
}
2005-04-07 21:19:42 +02:00
prop . put ( " urlstring " , " " ) ;
prop . put ( " urlhash " , " " ) ;
prop . put ( " result " , genUrlList ( switchboard , keyhash , " " ) ) ;
}
2005-10-05 16:11:50 +02:00
// transfer to other peer
2005-04-07 21:19:42 +02:00
if ( post . containsKey ( " keyhashtransfer " ) ) {
2005-10-05 16:11:50 +02:00
if ( keystring . length ( ) = = 0 | |
! plasmaWordIndexEntry . word2hash ( keystring ) . equals ( keyhash ) ) {
2005-04-07 21:19:42 +02:00
prop . put ( " keystring " , " <not possible to compute word from hash> " ) ;
2005-10-05 16:11:50 +02:00
}
2005-04-07 21:19:42 +02:00
prop . put ( " urlstring " , " " ) ;
prop . put ( " urlhash " , " " ) ;
plasmaWordIndexEntity [ ] indexes = new plasmaWordIndexEntity [ 1 ] ;
String result ;
long starttime = System . currentTimeMillis ( ) ;
2005-10-23 19:50:27 +02:00
indexes [ 0 ] = switchboard . wordIndex . getEntity ( keyhash , true , - 1 ) ;
2005-08-13 00:14:24 +02:00
// built urlCache
2005-10-13 15:57:15 +02:00
Iterator urlIter = indexes [ 0 ] . elements ( true ) ;
2005-08-13 00:14:24 +02:00
HashMap knownURLs = new HashMap ( ) ;
HashSet unknownURLEntries = new HashSet ( ) ;
plasmaWordIndexEntry indexEntry ;
plasmaCrawlLURL . Entry lurl ;
2005-10-13 15:57:15 +02:00
while ( urlIter . hasNext ( ) ) {
indexEntry = ( plasmaWordIndexEntry ) urlIter . next ( ) ;
2005-12-11 01:25:02 +01:00
try {
2006-01-20 16:14:21 +01:00
lurl = switchboard . urlPool . loadedURL . getEntry ( indexEntry . getUrlHash ( ) , null ) ;
2005-08-13 00:14:24 +02:00
if ( lurl . toString ( ) = = null ) {
switchboard . urlPool . loadedURL . remove ( indexEntry . getUrlHash ( ) ) ;
unknownURLEntries . add ( indexEntry . getUrlHash ( ) ) ;
} else {
knownURLs . put ( indexEntry . getUrlHash ( ) , lurl ) ;
}
2005-12-11 01:25:02 +01:00
} catch ( IOException e ) {
unknownURLEntries . add ( indexEntry . getUrlHash ( ) ) ;
2005-08-13 00:14:24 +02:00
}
}
// now delete all entries that have no url entry
Iterator hashIter = unknownURLEntries . iterator ( ) ;
2005-10-05 16:11:50 +02:00
while ( hashIter . hasNext ( ) ) {
try {
indexes [ 0 ] . removeEntry ( ( String ) hashIter . next ( ) , false ) ;
} catch ( IOException e ) { }
}
// use whats remaining
2005-10-05 12:45:33 +02:00
String gzipBody = switchboard . getConfig ( " indexControl.gzipBody " , " false " ) ;
int timeout = ( int ) switchboard . getConfigLong ( " indexControl.timeout " , 60000 ) ;
2005-10-05 16:11:50 +02:00
result = yacyClient . transferIndex (
yacyCore . seedDB . getConnected ( post . get ( " hostHash " , " " ) ) ,
indexes ,
knownURLs ,
" true " . equalsIgnoreCase ( gzipBody ) ,
timeout ) ;
2005-04-07 21:19:42 +02:00
prop . put ( " result " , ( result = = null ) ? ( " Successfully transferred " + indexes [ 0 ] . size ( ) + " words in " + ( ( System . currentTimeMillis ( ) - starttime ) / 1000 ) + " seconds " ) : result ) ;
2005-10-05 16:11:50 +02:00
try { indexes [ 0 ] . close ( ) ; } catch ( IOException e ) { }
2005-04-07 21:19:42 +02:00
}
2005-10-05 16:11:50 +02:00
// generate list
if ( post . containsKey ( " keyhashsimilar " ) ) {
final Iterator hashIt = switchboard . wordIndex . wordHashes ( keyhash , true , true ) ;
StringBuffer result = new StringBuffer ( " Sequential List of Word-Hashes:<br> " ) ;
String hash ;
int i = 0 ;
while ( hashIt . hasNext ( ) & & i < 256 ) {
hash = ( String ) hashIt . next ( ) ;
result . append ( " <a href= \" /IndexControl_p.html? " )
2005-12-15 11:31:00 +01:00
. append ( " keyhash= " ) . append ( hash ) . append ( " &keyhashsearch= " )
2005-10-05 16:11:50 +02:00
. append ( " \" class= \" tt \" > " ) . append ( hash ) . append ( " </a> " )
. append ( ( ( i + 1 ) % 8 = = 0 ) ? " <br> " : " " ) ;
i + + ;
}
prop . put ( " result " , result ) ;
2005-04-07 21:19:42 +02:00
}
2005-10-05 16:11:50 +02:00
if ( post . containsKey ( " urlstringsearch " ) ) {
2005-04-07 21:19:42 +02:00
try {
URL url = new URL ( urlstring ) ;
2005-12-15 11:31:00 +01:00
urlhash = plasmaURL . urlHash ( url ) ;
prop . put ( " urlhash " , urlhash ) ;
2006-01-20 16:14:21 +01:00
plasmaCrawlLURL . Entry entry = switchboard . urlPool . loadedURL . getEntry ( urlhash , null ) ;
2005-04-07 21:19:42 +02:00
prop . put ( " result " , genUrlProfile ( switchboard , entry , urlhash ) ) ;
2005-12-15 11:31:00 +01:00
} catch ( MalformedURLException e ) {
prop . put ( " urlstring " , " bad url: " + urlstring ) ;
prop . put ( " urlhash " , " " ) ;
} catch ( IOException e ) {
prop . put ( " urlstring " , " unknown url: " + urlstring ) ;
2005-04-07 21:19:42 +02:00
prop . put ( " urlhash " , " " ) ;
}
}
2005-10-05 16:11:50 +02:00
2005-04-07 21:19:42 +02:00
if ( post . containsKey ( " urlhashsearch " ) ) {
2005-12-11 01:25:02 +01:00
try {
2006-01-20 16:14:21 +01:00
plasmaCrawlLURL . Entry entry = switchboard . urlPool . loadedURL . getEntry ( urlhash , null ) ;
2005-12-11 01:25:02 +01:00
URL url = entry . url ( ) ;
2005-04-07 21:19:42 +02:00
urlstring = url . toString ( ) ;
prop . put ( " urlstring " , urlstring ) ;
prop . put ( " result " , genUrlProfile ( switchboard , entry , urlhash ) ) ;
2005-12-11 01:25:02 +01:00
} catch ( IOException e ) {
prop . put ( " result " , " No Entry for URL hash " + urlhash ) ;
2005-04-07 21:19:42 +02:00
}
}
2005-10-05 16:11:50 +02:00
// generate list
if ( post . containsKey ( " urlhashsimilar " ) ) {
2005-12-07 00:51:29 +01:00
final Iterator hashIt = switchboard . urlPool . loadedURL . urlHashes ( urlhash , true ) ;
StringBuffer result = new StringBuffer ( " Sequential List of URL-Hashes:<br> " ) ;
String hash ;
int i = 0 ;
while ( hashIt . hasNext ( ) & & i < 256 ) {
hash = ( String ) hashIt . next ( ) ;
2005-12-15 11:31:00 +01:00
result . append ( " <a href= \" /IndexControl_p.html? " )
. append ( " urlhash= " ) . append ( hash ) . append ( " &urlhashsearch= " )
. append ( " \" class= \" tt \" > " ) . append ( hash ) . append ( " </a> " )
. append ( ( ( i + 1 ) % 8 = = 0 ) ? " <br> " : " " ) ;
2005-12-07 00:51:29 +01:00
i + + ;
2005-04-07 21:19:42 +02:00
}
2005-12-07 00:51:29 +01:00
prop . put ( " result " , result . toString ( ) ) ;
2005-04-07 21:19:42 +02:00
}
2005-10-05 16:11:50 +02:00
// list known hosts
yacySeed seed ;
2005-04-07 21:19:42 +02:00
int hc = 0 ;
2005-10-05 16:11:50 +02:00
if ( yacyCore . seedDB ! = null & & yacyCore . seedDB . sizeConnected ( ) > 0 ) {
Enumeration e = yacyCore . dhtAgent . getAcceptRemoteIndexSeeds ( keyhash ) ;
while ( e . hasMoreElements ( ) ) {
seed = ( yacySeed ) e . nextElement ( ) ;
2005-09-13 07:46:55 +02:00
if ( seed ! = null ) {
prop . put ( " hosts_ " + hc + " _hosthash " , seed . hash ) ;
2005-10-17 17:46:12 +02:00
prop . put ( " hosts_ " + hc + " _hostname " , /*seed.hash + " " +*/ seed . get ( yacySeed . NAME , " nameless " ) ) ;
2005-09-13 07:46:55 +02:00
hc + + ;
2005-04-07 21:19:42 +02:00
}
2005-10-05 16:11:50 +02:00
}
2005-08-03 15:43:55 +02:00
prop . put ( " hosts " , Integer . toString ( hc ) ) ;
2005-10-05 16:11:50 +02:00
} else {
2005-04-07 21:19:42 +02:00
prop . put ( " hosts " , " 0 " ) ;
2005-10-05 16:11:50 +02:00
}
2005-04-07 21:19:42 +02:00
// insert constants
2005-08-03 15:43:55 +02:00
prop . put ( " wcount " , Integer . toString ( switchboard . wordIndex . size ( ) ) ) ;
prop . put ( " ucount " , Integer . toString ( switchboard . urlPool . loadedURL . size ( ) ) ) ;
2005-10-05 16:11:50 +02:00
prop . put ( " indexDistributeChecked " , ( switchboard . getConfig ( " allowDistributeIndex " , " true " ) . equals ( " true " ) ) ? " checked " : " " ) ;
prop . put ( " indexDistributeWhileCrawling " , ( switchboard . getConfig ( " allowDistributeIndexWhileCrawling " , " true " ) . equals ( " true " ) ) ? " checked " : " " ) ;
2005-04-07 21:19:42 +02:00
prop . put ( " indexReceiveChecked " , ( switchboard . getConfig ( " allowReceiveIndex " , " true " ) . equals ( " true " ) ) ? " checked " : " " ) ;
2005-07-11 17:36:10 +02:00
prop . put ( " indexReceiveBlockBlacklistChecked " , ( switchboard . getConfig ( " indexReceiveBlockBlacklist " , " true " ) . equals ( " true " ) ) ? " checked " : " " ) ;
2005-04-07 21:19:42 +02:00
// return rewrite properties
2005-10-05 16:11:50 +02:00
return prop ;
2005-04-07 21:19:42 +02:00
}
2005-07-12 02:07:09 +02:00
public static String genUrlProfile ( plasmaSwitchboard switchboard , plasmaCrawlLURL . Entry entry , String urlhash ) {
2005-10-05 16:11:50 +02:00
if ( entry = = null ) { return " No entry found for URL-hash " + urlhash ; }
2005-04-07 21:19:42 +02:00
URL url = entry . url ( ) ;
2005-12-11 01:25:02 +01:00
String referrer = null ;
try {
2006-01-20 16:14:21 +01:00
referrer = switchboard . urlPool . loadedURL . getEntry ( entry . referrerHash ( ) , null ) . url ( ) . toString ( ) ;
2005-12-11 01:25:02 +01:00
} catch ( IOException e ) {
referrer = " <unknown> " ;
}
2005-10-05 16:11:50 +02:00
if ( url = = null ) { return " No entry found for URL-hash " + urlhash ; }
2005-04-07 21:19:42 +02:00
String result = " <table> " +
" <tr><td class= \" small \" >URL String</td><td class= \" tt \" > " + htmlFilterContentScraper . urlNormalform ( url ) + " </td></tr> " +
" <tr><td class= \" small \" >Hash</td><td class= \" tt \" > " + urlhash + " </td></tr> " +
" <tr><td class= \" small \" >Description</td><td class= \" tt \" > " + entry . descr ( ) + " </td></tr> " +
" <tr><td class= \" small \" >Modified-Date</td><td class= \" tt \" > " + entry . moddate ( ) + " </td></tr> " +
" <tr><td class= \" small \" >Loaded-Date</td><td class= \" tt \" > " + entry . loaddate ( ) + " </td></tr> " +
2005-12-11 01:25:02 +01:00
" <tr><td class= \" small \" >Referrer</td><td class= \" tt \" > " + referrer + " </td></tr> " +
2005-04-07 21:19:42 +02:00
" <tr><td class= \" small \" >Doctype</td><td class= \" tt \" > " + entry . doctype ( ) + " </td></tr> " +
" <tr><td class= \" small \" >Copy-Count</td><td class= \" tt \" > " + entry . copyCount ( ) + " </td></tr> " +
" <tr><td class= \" small \" >Local-Flag</td><td class= \" tt \" > " + entry . local ( ) + " </td></tr> " +
" <tr><td class= \" small \" >Quality</td><td class= \" tt \" > " + entry . quality ( ) + " </td></tr> " +
" <tr><td class= \" small \" >Language</td><td class= \" tt \" > " + entry . language ( ) + " </td></tr> " +
" <tr><td class= \" small \" >Size</td><td class= \" tt \" > " + entry . size ( ) + " </td></tr> " +
" <tr><td class= \" small \" >Words</td><td class= \" tt \" > " + entry . wordCount ( ) + " </td></tr> " +
" </table><br> " ;
result + =
" <form action= \" IndexControl_p.html \" method= \" post \" enctype= \" multipart/form-data \" > " +
" <input type= \" hidden \" name= \" keystring \" value= \" \" > " +
" <input type= \" hidden \" name= \" keyhash \" value= \" \" > " +
" <input type= \" hidden \" name= \" urlstring \" value= \" \" > " +
" <input type= \" hidden \" name= \" urlhash \" value= \" " + urlhash + " \" > " +
" <input type= \" submit \" value= \" Delete URL \" name= \" urlhashdelete \" ><br> " +
" <span class= \" small \" > this may produce unresolved references at other word indexes but they do not harm</span><br><br> " +
" <input type= \" submit \" value= \" Delete URL and remove all references from words \" name= \" urlhashdeleteall \" ><br> " +
" <span class= \" small \" > delete the reference to this url at every other word where the reference exists (very extensive, but prevents unresolved references)</span><br> " +
2005-10-05 16:11:50 +02:00
" </form> " ;
2005-04-07 21:19:42 +02:00
return result ;
}
2005-10-05 16:11:50 +02:00
2005-04-07 21:19:42 +02:00
public static String genUrlList ( plasmaSwitchboard switchboard , String keyhash , String keystring ) {
// search for a word hash and generate a list of url links
2005-10-05 12:45:33 +02:00
plasmaWordIndexEntity index = null ;
2005-04-07 21:19:42 +02:00
try {
2005-10-23 19:50:27 +02:00
index = switchboard . wordIndex . getEntity ( keyhash , true , - 1 ) ;
2005-10-05 16:11:50 +02:00
final StringBuffer result = new StringBuffer ( 1024 ) ;
2005-04-07 21:19:42 +02:00
if ( index . size ( ) = = 0 ) {
2005-10-05 16:11:50 +02:00
result . append ( " No URL entries related to this word hash <span class= \" tt \" > " ) . append ( keyhash ) . append ( " </span>. " ) ;
2005-04-07 21:19:42 +02:00
} else {
2005-10-13 15:57:15 +02:00
final Iterator en = index . elements ( true ) ;
2005-10-05 18:00:16 +02:00
result . append ( " URL entries related to this word hash <span class= \" tt \" > " ) . append ( keyhash ) . append ( " </span><br><br> " ) ;
2005-10-05 16:11:50 +02:00
result . append ( " <form action= \" IndexControl_p.html \" method= \" post \" enctype= \" multipart/form-data \" > " ) ;
2006-01-19 15:13:39 +01:00
String us ;
String uh [ ] = new String [ 2 ] ;
2005-04-07 21:19:42 +02:00
int i = 0 ;
2005-10-05 16:11:50 +02:00
final TreeMap tm = new TreeMap ( ) ;
2006-01-19 15:13:39 +01:00
plasmaWordIndexEntry xi ;
2005-10-13 15:57:15 +02:00
while ( en . hasNext ( ) ) {
2006-01-19 15:13:39 +01:00
xi = ( plasmaWordIndexEntry ) en . next ( ) ;
uh = new String [ ] { xi . getUrlHash ( ) , Integer . toString ( xi . posintext ( ) ) } ;
2005-12-15 11:31:00 +01:00
try {
2006-01-20 16:14:21 +01:00
us = switchboard . urlPool . loadedURL . getEntry ( uh [ 0 ] , null ) . url ( ) . toString ( ) ;
2005-10-05 16:11:50 +02:00
tm . put ( us , uh ) ;
2005-12-15 11:31:00 +01:00
} catch ( IOException e ) {
2006-01-19 15:13:39 +01:00
tm . put ( uh [ 0 ] , uh ) ;
2005-10-05 16:11:50 +02:00
}
}
2006-01-24 14:28:50 +01:00
URL url ;
2005-10-05 16:11:50 +02:00
final Iterator iter = tm . keySet ( ) . iterator ( ) ;
result . ensureCapacity ( ( tm . size ( ) + 2 ) * 384 ) ;
while ( iter . hasNext ( ) ) {
us = iter . next ( ) . toString ( ) ;
2006-01-19 15:13:39 +01:00
uh = ( String [ ] ) tm . get ( us ) ;
if ( us . equals ( uh [ 0 ] ) ) {
2006-01-24 14:28:50 +01:00
result . append ( " <input type= \" checkbox \" name= \" urlhx " ) . append ( i + + ) . append ( " \" checked value= \" " ) . append ( uh [ 0 ] ) . append ( " \" align= \" top \" > " )
. append ( " <span class= \" tt \" > " ) . append ( uh [ 0 ] ) . append ( " <unresolved URL Hash></span><br> " ) ;
2005-12-26 20:41:55 +01:00
} else {
2006-01-24 14:28:50 +01:00
url = new URL ( us ) ;
if ( plasmaSwitchboard . urlBlacklist . isListed ( url . getHost ( ) . toLowerCase ( ) , url . getPath ( ) ) ) {
result . append ( " <input type= \" checkbox \" name= \" urlhx " ) . append ( i + + ) . append ( " \" checked value= \" " ) . append ( uh [ 0 ] ) . append ( " \" align= \" top \" > " ) ;
} else {
result . append ( " <input type= \" checkbox \" name= \" urlhx " ) . append ( i + + ) . append ( " \" value= \" " ) . append ( uh [ 0 ] ) . append ( " \" align= \" top \" > " ) ;
}
2005-10-05 16:11:50 +02:00
result . append ( " <a href= \" /IndexControl_p.html? " ) . append ( " keystring= " ) . append ( keystring )
2006-01-19 15:13:39 +01:00
. append ( " &keyhash= " ) . append ( keyhash ) . append ( " &urlhash= " ) . append ( uh [ 0 ] )
2005-10-05 16:11:50 +02:00
. append ( " &urlstringsearch= " ) . append ( " &urlstring= " ) . append ( us ) . append ( " \" class= \" tt \" > " )
2006-01-19 15:13:39 +01:00
. append ( uh [ 0 ] ) . append ( " </a><span class= \" tt \" > " ) . append ( us ) . append ( " , pos= " ) . append ( uh [ 1 ] ) . append ( " </span><br> " ) ;
2005-04-07 21:19:42 +02:00
}
}
2005-10-05 16:11:50 +02:00
result . append ( " <input type= \" hidden \" name= \" keystring \" value= \" " ) . append ( keystring ) . append ( " \" > " )
. append ( " <input type= \" hidden \" name= \" keyhash \" value= \" " ) . append ( keyhash ) . append ( " \" > " )
. append ( " <input type= \" hidden \" name= \" urlstring \" value= \" \" > " )
. append ( " <input type= \" hidden \" name= \" urlhash \" value= \" \" > " )
. append ( " <br><fieldset><legend>Reference Deletion</legend><table border= \" 0 \" cellspacing= \" 5 \" cellpadding= \" 5 \" ><tr valign= \" top \" ><td><br><br> " )
. append ( " <input type= \" submit \" value= \" Delete reference to selected URLs \" name= \" keyhashdelete \" ><br><br> " )
. append ( " <input type= \" submit \" value= \" Delete reference to ALL URLs \" name= \" keyhashdeleteall \" ><span class= \" small \" ><br> (= delete Word)</span> " )
. append ( " </td><td width= \" 150 \" > " )
. append ( " <center><input type= \" checkbox \" name= \" delurl \" value= \" \" align= \" top \" checked></center><br> " )
. append ( " <span class= \" small \" >delete also the referenced URL itself (reasonable and recommended, may produce unresolved references at other word indexes but they do not harm)</span> " )
. append ( " </td><td width= \" 150 \" > " )
. append ( " <center><input type= \" checkbox \" name= \" delurlref \" value= \" \" align= \" top \" ></center><br> " )
. append ( " <span class= \" small \" >for every resolveable and deleted URL reference, delete the same reference at every other word where the reference exists (very extensive, but prevents further unresolved references)</span> " )
. append ( " </td></tr></table></fieldset></form><br> " ) ;
2005-04-07 21:19:42 +02:00
}
2005-10-05 16:11:50 +02:00
index . close ( ) ;
index = null ;
return result . toString ( ) ;
2005-04-07 21:19:42 +02:00
} catch ( IOException e ) {
return " " ;
2005-10-05 12:45:33 +02:00
} finally {
2005-10-05 16:11:50 +02:00
if ( index ! = null ) try { index . close ( ) ; index = null ; } catch ( Exception e ) { } ;
2005-04-07 21:19:42 +02:00
}
}
2005-10-05 16:11:50 +02:00
}