2005-05-07 23:11:18 +02:00
// yacyClient.java
2005-04-07 21:19:42 +02:00
// -------------------------------------
2008-07-20 19:14:51 +02:00
// (C) by Michael Peter Christen; mc@yacy.net
2005-04-07 21:19:42 +02:00
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
2005-09-21 23:32:43 +02:00
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
2005-04-07 21:19:42 +02:00
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notice above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
package de.anomic.yacy ;
2008-02-01 00:40:47 +01:00
import java.io.File ;
2005-05-05 07:32:19 +02:00
import java.io.IOException ;
2006-01-10 17:48:59 +01:00
import java.io.UnsupportedEncodingException ;
2008-05-04 12:53:04 +02:00
import java.net.MalformedURLException ;
2010-09-14 15:35:47 +02:00
import java.util.ArrayList ;
2005-05-05 07:32:19 +02:00
import java.util.HashMap ;
2005-10-13 15:57:15 +02:00
import java.util.Iterator ;
2010-07-13 01:07:05 +02:00
import java.util.LinkedHashMap ;
2010-09-14 15:35:47 +02:00
import java.util.List ;
2006-09-30 00:27:20 +02:00
import java.util.Map ;
import java.util.TreeMap ;
2010-03-23 11:17:28 +01:00
import java.util.regex.Pattern ;
2006-01-30 01:42:38 +01:00
2010-08-23 03:08:56 +02:00
import net.yacy.cora.document.MultiProtocolURI ;
2010-05-25 14:54:57 +02:00
import net.yacy.cora.document.RSSFeed ;
2010-06-27 00:39:27 +02:00
import net.yacy.cora.document.RSSMessage ;
2010-05-25 14:54:57 +02:00
import net.yacy.cora.document.RSSReader ;
2010-07-14 00:10:24 +02:00
import net.yacy.cora.protocol.ByteArrayBody ;
2010-08-23 03:08:56 +02:00
import net.yacy.cora.protocol.http.HTTPConnector ;
2010-05-25 14:54:57 +02:00
import net.yacy.cora.services.Search ;
2009-10-11 02:12:19 +02:00
import net.yacy.kelondro.data.meta.URIMetadataRow ;
import net.yacy.kelondro.data.word.Word ;
import net.yacy.kelondro.data.word.WordReference ;
2009-12-10 00:27:26 +01:00
import net.yacy.kelondro.index.RowSpaceExceededException ;
2009-11-05 21:28:37 +01:00
import net.yacy.kelondro.logging.Log ;
2009-10-10 01:22:22 +02:00
import net.yacy.kelondro.order.Base64Order ;
import net.yacy.kelondro.order.Bitfield ;
import net.yacy.kelondro.order.Digest ;
2009-10-10 02:39:15 +02:00
import net.yacy.kelondro.rwi.Reference ;
import net.yacy.kelondro.rwi.ReferenceContainer ;
import net.yacy.kelondro.rwi.ReferenceContainerCache ;
2009-10-10 03:14:19 +02:00
import net.yacy.kelondro.util.ByteBuffer ;
import net.yacy.kelondro.util.FileUtils ;
2009-10-21 22:14:30 +02:00
import net.yacy.repository.Blacklist ;
2009-10-10 01:22:22 +02:00
2010-07-13 01:07:05 +02:00
import org.apache.http.entity.mime.content.ContentBody ;
import org.apache.http.entity.mime.content.StringBody ;
2008-04-12 10:12:51 +02:00
2008-05-06 02:32:41 +02:00
import de.anomic.crawler.ResultURLs ;
2009-10-21 22:14:30 +02:00
import de.anomic.crawler.retrieval.EventOrigin ;
2009-07-15 23:07:46 +02:00
import de.anomic.crawler.retrieval.HTTPLoader ;
2010-09-14 15:35:47 +02:00
import de.anomic.search.ContentDomain ;
import de.anomic.search.QueryParams ;
2009-06-16 23:45:40 +02:00
import de.anomic.search.RankingProfile ;
2009-07-09 00:14:57 +02:00
import de.anomic.search.RankingProcess ;
2010-06-18 11:44:21 +02:00
import de.anomic.search.SearchEvent ;
2009-10-11 02:12:19 +02:00
import de.anomic.search.Segment ;
2009-07-19 22:37:44 +02:00
import de.anomic.search.Switchboard ;
2009-08-27 16:34:41 +02:00
import de.anomic.search.TextSnippet ;
2005-05-05 07:32:19 +02:00
import de.anomic.server.serverCore ;
import de.anomic.tools.crypt ;
2005-04-07 21:19:42 +02:00
2005-10-05 12:45:33 +02:00
public final class yacyClient {
2005-09-21 23:32:43 +02:00
2010-05-25 14:54:57 +02:00
2010-07-15 02:59:53 +02:00
private static byte [ ] postToFile ( final yacySeed target , final String filename , final LinkedHashMap < String , ContentBody > parts , final int timeout ) throws IOException {
2010-08-23 03:08:56 +02:00
return HTTPConnector . getConnector ( HTTPLoader . crawlerUserAgent ) . post ( new MultiProtocolURI ( " http:// " + target . getClusterAddress ( ) + " /yacy/ " + filename ) , timeout , target . getHexHash ( ) + " .yacyh " , parts ) ;
2010-05-25 14:54:57 +02:00
}
2010-07-15 02:59:53 +02:00
private static byte [ ] postToFile ( final yacySeedDB seedDB , final String targetHash , final String filename , final LinkedHashMap < String , ContentBody > parts , final int timeout ) throws IOException {
2010-08-23 03:08:56 +02:00
return HTTPConnector . getConnector ( HTTPLoader . crawlerUserAgent ) . post ( new MultiProtocolURI ( " http:// " + targetAddress ( seedDB , targetHash ) + " /yacy/ " + filename ) , timeout , yacySeed . b64Hash2hexHash ( targetHash ) + " .yacyh " , parts ) ;
2010-05-25 14:54:57 +02:00
}
2008-08-02 15:57:00 +02:00
/ * *
* this is called to enrich the seed information by
* - own address ( if peer is behind a nat / router )
* - check peer type ( virgin / junior / senior / principal )
*
* to do this , we send a ' Hello ' to another peer
* this carries the following information :
* ' iam ' - own hash
* ' youare ' - remote hash , to verify that we are correct
* ' key ' - a session key that the remote peer may use to answer
* and the own seed string
* we expect the following information to be send back :
* - ' yourip ' the ip of the connection peer ( we )
* - ' yourtype ' the type of this peer that the other peer checked by asking for a specific word
* and the remote seed string
*
* one exceptional failure case is when we know the other ' s peers hash , the other peers responds correctly
* but they appear to be another peer by comparisment of the other peer ' s hash
* this works of course only if we know the other peer ' s hash .
*
* @return the number of new seeds
* /
2008-08-02 14:12:04 +02:00
public static int publishMySeed ( final yacySeed mySeed , final yacyPeerActions peerActions , final String address , final String otherHash ) {
2005-05-07 23:11:18 +02:00
2010-06-01 15:02:11 +02:00
Map < String , String > result = null ;
2008-04-12 10:12:51 +02:00
final String salt = crypt . randomSalt ( ) ;
2010-06-16 10:30:13 +02:00
try {
2007-07-05 01:48:52 +02:00
// generate request
2010-07-14 00:10:24 +02:00
final LinkedHashMap < String , ContentBody > parts = yacyNetwork . basicRequestParts ( Switchboard . getSwitchboard ( ) , null , salt ) ;
parts . put ( " count " , new StringBody ( " 20 " ) ) ;
2010-07-13 01:07:05 +02:00
parts . put ( " seed " , new StringBody ( mySeed . genSeedStr ( salt ) ) ) ;
2007-07-05 01:48:52 +02:00
// send request
2008-08-02 14:12:04 +02:00
final long start = System . currentTimeMillis ( ) ;
2010-08-23 03:08:56 +02:00
final byte [ ] content = HTTPConnector . getConnector ( HTTPLoader . crawlerUserAgent ) . post ( new MultiProtocolURI ( " http:// " + address + " /yacy/hello.html " ) , 30000 , yacySeed . b64Hash2hexHash ( otherHash ) + " .yacyh " , parts ) ;
2008-06-05 14:52:27 +02:00
yacyCore . log . logInfo ( " yacyClient.publishMySeed thread ' " + Thread . currentThread ( ) . getName ( ) + " ' contacted peer at " + address + " , received " + ( ( content = = null ) ? " null " : content . length ) + " bytes, time = " + ( System . currentTimeMillis ( ) - start ) + " milliseconds " ) ;
2009-10-05 22:11:41 +02:00
result = FileUtils . table ( content ) ;
2008-08-02 14:12:04 +02:00
} catch ( final Exception e ) {
2005-07-07 15:58:54 +02:00
if ( Thread . currentThread ( ) . isInterrupted ( ) ) {
2009-06-02 00:45:28 +02:00
yacyCore . log . logInfo ( " yacyClient.publishMySeed thread ' " + Thread . currentThread ( ) . getName ( ) + " ' interrupted. " ) ;
2007-09-12 02:42:53 +02:00
return - 1 ;
2005-07-07 15:58:54 +02:00
}
2010-06-16 10:30:13 +02:00
yacyCore . log . logInfo ( " yacyClient.publishMySeed thread ' " + Thread . currentThread ( ) . getName ( ) + " ', peer " + address + " ; exception: " + e . getMessage ( ) ) ;
2008-08-02 15:57:00 +02:00
// try again (go into loop)
2007-09-12 02:42:53 +02:00
result = null ;
2005-05-07 23:11:18 +02:00
}
2007-09-12 02:42:53 +02:00
2005-09-21 23:32:43 +02:00
if ( result = = null | | result . size ( ) < 3 ) {
2008-09-03 02:30:21 +02:00
if ( yacyCore . log . isFine ( ) ) yacyCore . log . logFine ( " yacyClient.publishMySeed result error: " +
2005-05-07 23:11:18 +02:00
( ( result = = null ) ? " result null " : ( " result= " + result . toString ( ) ) ) ) ;
return - 1 ;
}
2005-09-21 23:32:43 +02:00
2005-05-07 23:11:18 +02:00
// check consistency with expectation
2005-06-23 13:00:26 +02:00
yacySeed otherPeer = null ;
2007-06-13 15:21:19 +02:00
String seed ;
if ( ( otherHash ! = null ) & &
( otherHash . length ( ) > 0 ) & &
2008-06-06 18:01:27 +02:00
( ( seed = result . get ( " seed0 " ) ) ! = null ) ) {
2008-03-12 01:24:20 +01:00
if ( seed . length ( ) > yacySeed . maxsize ) {
yacyCore . log . logInfo ( " hello/client 0: rejected contacting seed; too large ( " + seed . length ( ) + " > " + yacySeed . maxsize + " ) " ) ;
2007-06-13 00:08:33 +02:00
} else {
2008-06-05 00:24:00 +02:00
otherPeer = yacySeed . genRemoteSeed ( seed , salt , false ) ;
2008-03-12 01:24:20 +01:00
if ( otherPeer = = null | | ! otherPeer . hash . equals ( otherHash ) ) {
2008-09-03 02:30:21 +02:00
if ( yacyCore . log . isFine ( ) ) yacyCore . log . logFine ( " yacyClient.publishMySeed: consistency error: other peer ' " + ( ( otherPeer = = null ) ? " unknown " : otherPeer . getName ( ) ) + " ' wrong " ) ;
2008-03-12 01:24:20 +01:00
return - 1 ; // no success
}
2005-05-07 23:11:18 +02:00
}
}
2005-09-21 23:32:43 +02:00
2005-05-07 23:11:18 +02:00
// set my own seed according to new information
2008-05-06 01:13:47 +02:00
// we overwrite our own IP number only
if ( serverCore . useStaticIP ) {
2009-10-11 02:12:19 +02:00
mySeed . setIP ( Switchboard . getSwitchboard ( ) . myPublicIP ( ) ) ;
2005-09-21 23:32:43 +02:00
} else {
2008-08-02 14:12:04 +02:00
final String myIP = result . get ( " yourip " ) ;
final String properIP = yacySeed . isProperIP ( myIP ) ;
2008-06-05 13:01:20 +02:00
if ( properIP = = null ) mySeed . setIP ( myIP ) ;
2005-06-16 09:28:07 +02:00
}
2005-09-21 23:32:43 +02:00
2008-05-06 01:13:47 +02:00
// change our seed-type
2008-06-06 18:01:27 +02:00
String mytype = result . get ( yacySeed . YOURTYPE ) ;
2008-05-06 01:13:47 +02:00
if ( mytype = = null ) { mytype = " " ; }
2008-08-02 14:12:04 +02:00
final yacyAccessible accessible = new yacyAccessible ( ) ;
2008-05-06 01:13:47 +02:00
if ( mytype . equals ( yacySeed . PEERTYPE_SENIOR ) | | mytype . equals ( yacySeed . PEERTYPE_PRINCIPAL ) ) {
accessible . IWasAccessed = true ;
if ( mySeed . isPrincipal ( ) ) {
mytype = yacySeed . PEERTYPE_PRINCIPAL ;
2005-07-04 13:09:48 +02:00
}
2008-05-06 01:13:47 +02:00
} else {
accessible . IWasAccessed = false ;
}
accessible . lastUpdated = System . currentTimeMillis ( ) ;
yacyCore . amIAccessibleDB . put ( otherHash , accessible ) ;
2005-09-21 23:32:43 +02:00
2008-05-06 01:13:47 +02:00
/ *
* If we were reported as junior we have to check if your port forwarding channel is broken
* If this is true we try to reconnect the sch channel to the remote server now .
* /
if ( mytype . equalsIgnoreCase ( yacySeed . PEERTYPE_JUNIOR ) ) {
yacyCore . log . logInfo ( " yacyClient.publishMySeed: Peer ' " + ( ( otherPeer = = null ) ? " unknown " : otherPeer . getName ( ) ) + " ' reported us as junior. " ) ;
} else if ( ( mytype . equalsIgnoreCase ( yacySeed . PEERTYPE_SENIOR ) ) | |
( mytype . equalsIgnoreCase ( yacySeed . PEERTYPE_PRINCIPAL ) ) ) {
2008-09-03 02:30:21 +02:00
if ( yacyCore . log . isFine ( ) ) yacyCore . log . logFine ( " yacyClient.publishMySeed: Peer ' " + ( ( otherPeer = = null ) ? " unknown " : otherPeer . getName ( ) ) + " ' reported us as " + mytype + " , accepted other peer. " ) ;
2008-05-06 01:13:47 +02:00
} else {
// wrong type report
2008-09-03 02:30:21 +02:00
if ( yacyCore . log . isFine ( ) ) yacyCore . log . logFine ( " yacyClient.publishMySeed: Peer ' " + ( ( otherPeer = = null ) ? " unknown " : otherPeer . getName ( ) ) + " ' reported us as " + mytype + " , rejecting other peer. " ) ;
2008-05-06 01:13:47 +02:00
return - 1 ;
2005-06-16 09:28:07 +02:00
}
2008-05-06 01:13:47 +02:00
if ( mySeed . orVirgin ( ) . equals ( yacySeed . PEERTYPE_VIRGIN ) )
mySeed . put ( yacySeed . PEERTYPE , mytype ) ;
2005-09-21 23:32:43 +02:00
2008-06-05 00:24:00 +02:00
final String error = mySeed . isProper ( true ) ;
2005-09-21 23:32:43 +02:00
if ( error ! = null ) {
2007-10-01 14:30:23 +02:00
yacyCore . log . logSevere ( " yacyClient.publishMySeed mySeed error - not proper: " + error ) ;
2005-05-07 23:11:18 +02:00
return - 1 ;
}
2005-09-21 23:32:43 +02:00
2005-10-17 17:46:12 +02:00
//final Date remoteTime = yacyCore.parseUniversalDate((String) result.get(yacySeed.MYTIME)); // read remote time
2007-10-19 17:31:38 +02:00
2005-05-07 23:11:18 +02:00
// read the seeds that the peer returned and integrate them into own database
2005-04-07 21:19:42 +02:00
int i = 0 ;
2005-05-07 23:11:18 +02:00
int count = 0 ;
2005-09-21 23:32:43 +02:00
String seedStr ;
2008-06-06 18:01:27 +02:00
while ( ( seedStr = result . get ( " seed " + i + + ) ) ! = null ) {
2005-05-07 23:11:18 +02:00
// integrate new seed into own database
// the first seed, "seed0" is the seed of the responding peer
2008-03-12 01:24:20 +01:00
if ( seedStr . length ( ) > yacySeed . maxsize ) {
yacyCore . log . logInfo ( " hello/client: rejected contacting seed; too large ( " + seedStr . length ( ) + " > " + yacySeed . maxsize + " ) " ) ;
2007-06-13 00:08:33 +02:00
} else {
2008-06-05 00:24:00 +02:00
if ( peerActions . peerArrival ( yacySeed . genRemoteSeed ( seedStr , salt , false ) , ( i = = 1 ) ) ) count + + ;
2007-06-13 00:08:33 +02:00
}
2005-05-07 23:11:18 +02:00
}
return count ;
2005-04-07 21:19:42 +02:00
}
2005-09-21 23:32:43 +02:00
2008-08-02 14:12:04 +02:00
public static yacySeed querySeed ( final yacySeed target , final String seedHash ) {
2007-07-05 01:48:52 +02:00
// prepare request
2008-04-12 10:12:51 +02:00
final String salt = crypt . randomSalt ( ) ;
2008-03-12 01:24:20 +01:00
2007-07-05 01:48:52 +02:00
// send request
try {
2010-07-15 02:59:53 +02:00
final LinkedHashMap < String , ContentBody > parts = yacyNetwork . basicRequestParts ( Switchboard . getSwitchboard ( ) , target . hash , salt ) ;
parts . put ( " object " , new StringBody ( " seed " ) ) ;
parts . put ( " env " , new StringBody ( seedHash ) ) ;
final byte [ ] content = postToFile ( target , " query.html " , parts , 10000 ) ;
2010-06-01 15:02:11 +02:00
final Map < String , String > result = FileUtils . table ( content ) ;
2008-03-12 01:24:20 +01:00
2009-12-02 01:37:59 +01:00
if ( result = = null | | result . isEmpty ( ) ) { return null ; }
2005-10-17 17:46:12 +02:00
//final Date remoteTime = yacyCore.parseUniversalDate((String) result.get(yacySeed.MYTIME)); // read remote time
2008-06-06 18:01:27 +02:00
return yacySeed . genRemoteSeed ( result . get ( " response " ) , salt , false ) ;
2008-08-02 14:12:04 +02:00
} catch ( final Exception e ) {
2005-08-30 23:32:59 +02:00
yacyCore . log . logSevere ( " yacyClient.querySeed error: " + e . getMessage ( ) ) ;
2005-04-07 21:19:42 +02:00
return null ;
}
}
2005-09-21 23:32:43 +02:00
2008-08-02 14:12:04 +02:00
public static int queryRWICount ( final yacySeed target , final String wordHash ) {
2007-07-05 01:48:52 +02:00
// prepare request
2008-04-12 10:12:51 +02:00
final String salt = crypt . randomSalt ( ) ;
2005-10-22 15:28:04 +02:00
2007-07-05 01:48:52 +02:00
// send request
try {
2010-07-15 02:59:53 +02:00
final LinkedHashMap < String , ContentBody > parts = yacyNetwork . basicRequestParts ( Switchboard . getSwitchboard ( ) , target . hash , salt ) ;
parts . put ( " object " , new StringBody ( " rwicount " ) ) ;
parts . put ( " ttl " , new StringBody ( " 0 " ) ) ;
parts . put ( " env " , new StringBody ( wordHash ) ) ;
final byte [ ] content = postToFile ( target , " query.html " , parts , 5000 ) ;
2010-06-01 15:02:11 +02:00
final Map < String , String > result = FileUtils . table ( content ) ;
2005-10-22 15:28:04 +02:00
2009-12-02 01:37:59 +01:00
if ( result = = null | | result . isEmpty ( ) ) { return - 1 ; }
2008-06-06 18:01:27 +02:00
return Integer . parseInt ( result . get ( " response " ) ) ;
2008-08-02 14:12:04 +02:00
} catch ( final Exception e ) {
2005-08-30 23:32:59 +02:00
yacyCore . log . logSevere ( " yacyClient.queryRWICount error: " + e . getMessage ( ) ) ;
2005-05-07 23:11:18 +02:00
return - 1 ;
}
2005-04-07 21:19:42 +02:00
}
2005-09-21 23:32:43 +02:00
2008-08-02 14:12:04 +02:00
public static int queryUrlCount ( final yacySeed target ) {
2005-09-21 23:32:43 +02:00
if ( target = = null ) { return - 1 ; }
2008-03-12 01:24:20 +01:00
2007-07-05 01:48:52 +02:00
// prepare request
2008-04-12 10:12:51 +02:00
final String salt = crypt . randomSalt ( ) ;
2008-03-12 01:24:20 +01:00
2007-07-05 01:48:52 +02:00
// send request
2005-05-07 23:11:18 +02:00
try {
2010-07-15 02:59:53 +02:00
final LinkedHashMap < String , ContentBody > parts = yacyNetwork . basicRequestParts ( Switchboard . getSwitchboard ( ) , target . hash , salt ) ;
parts . put ( " object " , new StringBody ( " lurlcount " ) ) ;
parts . put ( " ttl " , new StringBody ( " 0 " ) ) ;
parts . put ( " env " , new StringBody ( " " ) ) ;
final byte [ ] content = postToFile ( target , " query.html " , parts , 5000 ) ;
2010-06-01 15:02:11 +02:00
final Map < String , String > result = FileUtils . table ( content ) ;
2008-03-12 01:24:20 +01:00
2009-12-02 01:37:59 +01:00
if ( result = = null | | result . isEmpty ( ) ) return - 1 ;
2008-06-06 18:01:27 +02:00
final String resp = result . get ( " response " ) ;
2006-09-07 03:13:03 +02:00
if ( resp = = null ) {
return - 1 ;
2008-08-02 15:57:00 +02:00
}
try {
2006-09-07 03:13:03 +02:00
return Integer . parseInt ( resp ) ;
2008-08-02 14:12:04 +02:00
} catch ( final NumberFormatException e ) {
2006-09-07 03:13:03 +02:00
return - 1 ;
}
2008-08-02 14:12:04 +02:00
} catch ( final IOException e ) {
2010-03-07 22:02:08 +01:00
if ( yacyCore . log . isFine ( ) ) yacyCore . log . logFine ( " yacyClient.queryUrlCount error asking peer ' " + target . getName ( ) + " ': " + e . toString ( ) ) ;
2005-05-07 23:11:18 +02:00
return - 1 ;
}
2005-04-07 21:19:42 +02:00
}
2005-09-21 23:32:43 +02:00
2008-11-06 11:07:53 +01:00
public static RSSFeed queryRemoteCrawlURLs ( final yacySeedDB seedDB , final yacySeed target , final int maxCount , final long maxTime ) {
2007-11-29 03:07:37 +01:00
// returns a list of
if ( target = = null ) { return null ; }
// prepare request
2008-04-12 10:12:51 +02:00
final String salt = crypt . randomSalt ( ) ;
2007-11-29 03:07:37 +01:00
// send request
try {
2008-04-09 13:02:14 +02:00
/* a long time-out is needed */
2010-07-14 00:10:24 +02:00
final LinkedHashMap < String , ContentBody > parts = yacyNetwork . basicRequestParts ( Switchboard . getSwitchboard ( ) , target . hash , salt ) ;
parts . put ( " call " , new StringBody ( " remotecrawl " ) ) ;
parts . put ( " count " , new StringBody ( Integer . toString ( maxCount ) ) ) ;
parts . put ( " time " , new StringBody ( Long . toString ( maxTime ) ) ) ;
2010-08-23 03:08:56 +02:00
final byte [ ] result = HTTPConnector . getConnector ( HTTPLoader . crawlerUserAgent ) . post ( new MultiProtocolURI ( " http:// " + target . getClusterAddress ( ) + " /yacy/urls.xml " ) , ( int ) maxTime , target . getHexHash ( ) + " .yacyh " , parts ) ;
2010-08-23 13:41:12 +02:00
final RSSReader reader = RSSReader . parse ( RSSFeed . DEFAULT_MAXSIZE , result ) ;
2008-05-22 01:07:37 +02:00
if ( reader = = null ) {
yacyCore . log . logWarning ( " yacyClient.queryRemoteCrawlURLs failed asking peer ' " + target . getName ( ) + " ': probably bad response from remote peer (1), reader == null " ) ;
target . put ( yacySeed . RCOUNT , " 0 " ) ;
seedDB . update ( target . hash , target ) ; // overwrite number of remote-available number to avoid that this peer is called again (until update is done by peer ping)
2009-11-05 21:28:37 +01:00
//Log.logException(e);
2008-05-22 01:07:37 +02:00
return null ;
}
2008-08-02 14:12:04 +02:00
final RSSFeed feed = reader . getFeed ( ) ;
2008-04-24 23:31:07 +02:00
if ( feed = = null ) {
2007-11-29 03:07:37 +01:00
// case where the rss reader does not understand the content
2008-05-22 01:07:37 +02:00
yacyCore . log . logWarning ( " yacyClient.queryRemoteCrawlURLs failed asking peer ' " + target . getName ( ) + " ': probably bad response from remote peer (2) " ) ;
2009-11-20 13:11:56 +01:00
//System.out.println("***DEBUG*** rss input = " + new String(result));
2007-11-29 03:07:37 +01:00
target . put ( yacySeed . RCOUNT , " 0 " ) ;
2008-05-06 01:13:47 +02:00
seedDB . update ( target . hash , target ) ; // overwrite number of remote-available number to avoid that this peer is called again (until update is done by peer ping)
2009-11-05 21:28:37 +01:00
//Log.logException(e);
2007-11-29 03:07:37 +01:00
return null ;
}
2008-04-24 23:31:07 +02:00
return feed ;
2008-08-02 14:12:04 +02:00
} catch ( final IOException e ) {
2007-11-29 03:07:37 +01:00
yacyCore . log . logSevere ( " yacyClient.queryRemoteCrawlURLs error asking peer ' " + target . getName ( ) + " ': " + e . toString ( ) ) ;
return null ;
}
}
2010-05-11 13:14:05 +02:00
2010-05-25 14:54:57 +02:00
public static RSSFeed search ( final yacySeed targetSeed , String query , boolean verify , boolean global , long timeout , int startRecord , int maximumRecords ) throws IOException {
2010-05-11 13:14:05 +02:00
String address = ( targetSeed = = null | | targetSeed = = Switchboard . getSwitchboard ( ) . peers . mySeed ( ) ) ? " localhost: " + Switchboard . getSwitchboard ( ) . getConfig ( " port " , " 8080 " ) : targetSeed . getClusterAddress ( ) ;
String urlBase = " http:// " + address + " /yacysearch.rss " ;
2010-05-25 14:54:57 +02:00
return Search . search ( urlBase , query , verify , global , timeout , startRecord , maximumRecords ) ;
2010-05-11 13:14:05 +02:00
}
2010-09-14 15:35:47 +02:00
2009-04-15 08:34:27 +02:00
@SuppressWarnings ( " unchecked " )
2010-06-17 13:59:40 +02:00
public static int search (
2008-08-02 14:12:04 +02:00
final yacySeed mySeed ,
final String wordhashes ,
final String excludehashes ,
final String urlhashes ,
2010-03-23 11:17:28 +01:00
final Pattern prefer ,
final Pattern filter ,
2008-09-21 02:04:42 +02:00
final String language ,
2009-06-09 11:07:52 +02:00
final String sitehash ,
final String authorhash ,
2008-08-02 14:12:04 +02:00
final int count ,
final int maxDistance ,
final boolean global ,
final int partitions ,
final yacySeed target ,
2009-05-28 16:26:05 +02:00
final Segment indexSegment ,
2008-08-02 14:12:04 +02:00
final ResultURLs crawlResults ,
2009-07-09 00:14:57 +02:00
final RankingProcess containerCache ,
2010-06-18 11:44:21 +02:00
final SearchEvent . SecondarySearchSuperviser secondarySearchSuperviser ,
2009-03-02 12:04:13 +01:00
final Blacklist blacklist ,
2009-06-16 23:45:40 +02:00
final RankingProfile rankingProfile ,
2009-01-30 16:33:00 +01:00
final Bitfield constraint
2005-11-08 13:14:51 +01:00
) {
2005-05-07 23:11:18 +02:00
// send a search request to peer with remote Hash
2005-09-21 23:32:43 +02:00
2005-05-07 23:11:18 +02:00
// INPUT:
2007-02-01 14:27:23 +01:00
// iam : complete seed of the requesting peer
// youare : seed hash of the target peer, used for testing network stability
// key : transmission key for response
// search : a list of search words
// hsearch : a string of word hashes
// fwdep : forward depth. if "0" then peer may NOT ask another peer for more results
// fwden : forward deny, a list of seed hashes. They may NOT be target of forward hopping
// count : maximum number of wanted results
// global : if "true", then result may consist of answers from other peers
// partitions : number of remote peers that are asked (for evaluation of QPM)
// duetime : maximum time that a peer should spent to create a result
2005-09-21 23:32:43 +02:00
2007-07-05 01:48:52 +02:00
final long timestamp = System . currentTimeMillis ( ) ;
2010-09-14 15:35:47 +02:00
SearchResult result ;
2007-07-05 01:48:52 +02:00
try {
2010-09-15 13:38:03 +02:00
result = new SearchResult (
2010-09-14 15:35:47 +02:00
yacyNetwork . basicRequestParts ( Switchboard . getSwitchboard ( ) , target . hash , crypt . randomSalt ( ) ) ,
mySeed , wordhashes , excludehashes , urlhashes , prefer , filter , language ,
sitehash , authorhash , count , maxDistance , global , partitions , target . getHexHash ( ) + " .yacyh " , target . getClusterAddress ( ) ,
secondarySearchSuperviser , rankingProfile , constraint ) ;
2008-08-02 14:12:04 +02:00
} catch ( final IOException e ) {
replaced old DHT transmission method with new method. Many things have changed! some of them:
- after a index selection is made, the index is splitted into its vertical components
- from differrent index selctions the splitted components can be accumulated before they are placed into the transmission queue
- each splitted chunk gets its own transmission thread
- multiple transmission threads are started concurrently
- the process can be monitored with the blocking queue servlet
To implement that, a new package de.anomic.yacy.dht was created. Some old files have been removed.
The new index distribution model using a vertical DHT was implemented. An abstraction of this model
is implemented in the new dht package as interface. The freeworld network has now a configuration
of two vertial partitions; sixteen partitions are planned and will be configured if the process is bug-free.
This modification has three main targets:
- enhance the DHT transmission speed
- with a vertical DHT, a search will speed up. With two partitions, two times. With sixteen, sixteen times.
- the vertical DHT will apply a semi-dht for URLs, and peers will receive a fraction of the overall URLs they received before.
with two partitions, the fractions will be halve. With sixteen partitions, a 1/16 of the previous number of URLs.
BE CAREFULL, THIS IS A MAJOR CODE CHANGE, POSSIBLY FULL OF BUGS AND HARMFUL THINGS.
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5586 6c8d7289-2bf4-0310-a012-ef5d649a1542
2009-02-10 01:06:59 +01:00
yacyCore . log . logInfo ( " SEARCH failed, Peer: " + target . hash + " : " + target . getName ( ) + " ( " + e . getMessage ( ) + " ), score= " + target . selectscore ) ;
2008-03-12 01:56:18 +01:00
//yacyCore.peerActions.peerDeparture(target, "search request to peer created io exception: " + e.getMessage());
2010-06-17 13:59:40 +02:00
return - 1 ;
2007-07-05 01:48:52 +02:00
}
2010-09-14 15:35:47 +02:00
// computation time
final long totalrequesttime = System . currentTimeMillis ( ) - timestamp ;
// create containers
final int words = wordhashes . length ( ) / Word . commonHashLength ;
assert words > 0 : " wordhashes = " + wordhashes ;
final ReferenceContainer < WordReference > [ ] container = new ReferenceContainer [ words ] ;
for ( int i = 0 ; i < words ; i + + ) {
try {
2010-01-09 01:08:16 +01:00
container [ i ] = ReferenceContainer . emptyContainer ( Segment . wordReferenceFactory , wordhashes . substring ( i * Word . commonHashLength , ( i + 1 ) * Word . commonHashLength ) . getBytes ( ) , count ) ;
} catch ( RowSpaceExceededException e ) {
Log . logException ( e ) ;
2010-06-17 13:59:40 +02:00
return - 1 ;
2010-01-09 01:08:16 +01:00
}
2010-09-14 15:35:47 +02:00
}
2007-06-22 16:29:14 +02:00
2010-09-14 15:35:47 +02:00
// insert results to containers
for ( URIMetadataRow urlEntry : result . links ) {
// get one single search result
if ( urlEntry = = null ) continue ;
assert ( urlEntry . hash ( ) . length = = 12 ) : " urlEntry.hash() = " + new String ( urlEntry . hash ( ) ) ;
if ( urlEntry . hash ( ) . length ! = 12 ) continue ; // bad url hash
final URIMetadataRow . Components metadata = urlEntry . metadata ( ) ;
if ( metadata = = null ) continue ;
if ( blacklist . isListed ( Blacklist . BLACKLIST_SEARCH , metadata . url ( ) ) ) {
if ( yacyCore . log . isInfo ( ) ) yacyCore . log . logInfo ( " remote search: filtered blacklisted url " + metadata . url ( ) + " from peer " + target . getName ( ) ) ;
continue ; // block with backlist
}
2007-07-24 02:46:17 +02:00
2010-09-14 15:35:47 +02:00
final String urlRejectReason = Switchboard . getSwitchboard ( ) . crawlStacker . urlInAcceptedDomain ( metadata . url ( ) ) ;
2008-04-20 23:36:25 +02:00
if ( urlRejectReason ! = null ) {
2010-06-17 13:59:40 +02:00
if ( yacyCore . log . isInfo ( ) ) yacyCore . log . logInfo ( " remote search: rejected url ' " + metadata . url ( ) + " ' ( " + urlRejectReason + " ) from peer " + target . getName ( ) ) ;
2007-07-24 02:46:17 +02:00
continue ; // reject url outside of our domain
}
2006-12-05 03:47:51 +01:00
2010-09-14 15:35:47 +02:00
// save the url entry
2010-06-17 13:59:40 +02:00
Reference entry = urlEntry . word ( ) ;
2010-09-14 15:35:47 +02:00
if ( entry = = null ) {
if ( yacyCore . log . isWarning ( ) ) yacyCore . log . logWarning ( " remote search: no word attached from peer " + target . getName ( ) + " , version " + target . getVersion ( ) ) ;
continue ; // no word attached
}
2005-09-21 23:32:43 +02:00
2010-09-14 15:35:47 +02:00
// the search-result-url transports all the attributes of word indexes
if ( ! Base64Order . enhancedCoder . equal ( entry . metadataHash ( ) , urlEntry . hash ( ) ) ) {
yacyCore . log . logInfo ( " remote search: url-hash " + new String ( urlEntry . hash ( ) ) + " does not belong to word-attached-hash " + new String ( entry . metadataHash ( ) ) + " ; url = " + metadata . url ( ) + " from peer " + target . getName ( ) ) ;
continue ; // spammed
}
2005-05-07 23:11:18 +02:00
2010-09-14 15:35:47 +02:00
// passed all checks, store url
try {
indexSegment . urlMetadata ( ) . store ( urlEntry ) ;
crawlResults . stack ( urlEntry , mySeed . hash . getBytes ( ) , target . hash . getBytes ( ) , EventOrigin . QUERIES ) ;
} catch ( final IOException e ) {
yacyCore . log . logSevere ( " could not store search result " , e ) ;
continue ; // db-error
}
2007-07-05 01:48:52 +02:00
2010-09-14 15:35:47 +02:00
if ( urlEntry . snippet ( ) ! = null ) {
// we don't store the snippets along the url entry,
2007-08-15 13:36:59 +02:00
// because they are search-specific.
2010-09-14 15:35:47 +02:00
// instead, they are placed in a snipped-search cache.
// System.out.println("--- RECEIVED SNIPPET '" + link.snippet() + "'");
TextSnippet . storeToCache ( wordhashes , new String ( urlEntry . hash ( ) ) , urlEntry . snippet ( ) ) ;
}
2007-09-04 01:43:55 +02:00
2010-09-14 15:35:47 +02:00
// add the url entry to the word indexes
for ( int m = 0 ; m < words ; m + + ) {
try {
2009-12-10 00:27:26 +01:00
container [ m ] . add ( entry ) ;
} catch ( RowSpaceExceededException e ) {
Log . logException ( e ) ;
break ;
}
2010-09-14 15:35:47 +02:00
}
}
2007-07-05 01:48:52 +02:00
2007-09-04 01:43:55 +02:00
// store remote result to local result container
2010-09-10 00:42:54 +02:00
// insert one container into the search result buffer
2010-09-14 15:35:47 +02:00
// one is enough, only the references are used, not the word
containerCache . add ( container [ 0 ] , false , target . getName ( ) + " / " + target . hash , result . joincount ) ;
2010-09-10 00:42:54 +02:00
2010-06-17 13:59:40 +02:00
// insert the containers to the index
for ( ReferenceContainer < WordReference > c : container ) try {
indexSegment . termIndex ( ) . add ( c ) ;
} catch ( Exception e ) {
Log . logException ( e ) ;
}
2010-09-14 15:35:47 +02:00
boolean thisIsASecondarySearch = urlhashes . length ( ) > 0 ;
assert ! thisIsASecondarySearch | | secondarySearchSuperviser = = null ;
yacyCore . log . logInfo ( " remote search: peer " + target . getName ( ) + " sent " + container [ 0 ] . size ( ) + " / " + result . joincount + " references for " + ( thisIsASecondarySearch ? " a secondary search " : " joined word queries " ) ) ;
2010-06-17 13:59:40 +02:00
// integrate remote top-words/topics
2010-09-14 15:35:47 +02:00
if ( result . references ! = null & & result . references . length > 0 ) {
yacyCore . log . logInfo ( " remote search: peer " + target . getName ( ) + " sent " + result . references . length + " topics " ) ;
2010-06-17 13:59:40 +02:00
// add references twice, so they can be counted (must have at least 2 entries)
synchronized ( containerCache ) {
2010-09-14 15:35:47 +02:00
containerCache . addTopic ( result . references ) ;
containerCache . addTopic ( result . references ) ;
2007-09-08 13:50:19 +02:00
}
2007-09-04 01:43:55 +02:00
}
2010-09-14 15:35:47 +02:00
// read index abstract
if ( secondarySearchSuperviser ! = null ) {
String wordhash ;
String whacc = " " ;
ByteBuffer ci ;
int ac = 0 ;
for ( Map . Entry < byte [ ] , String > abstractEntry : result . indexabstract . entrySet ( ) ) {
wordhash = new String ( abstractEntry . getKey ( ) ) ;
whacc + = wordhash ;
try {
ci = new ByteBuffer ( abstractEntry . getValue ( ) . getBytes ( " UTF-8 " ) ) ;
} catch ( UnsupportedEncodingException e ) {
Log . logException ( e ) ;
return - 1 ;
}
secondarySearchSuperviser . addAbstract ( wordhash , ReferenceContainer . decompressIndex ( ci , target . hash ) ) ;
ac + + ;
}
if ( ac > 0 ) {
secondarySearchSuperviser . commitAbstract ( ) ;
yacyCore . log . logInfo ( " remote search: peer " + target . getName ( ) + " sent " + ac + " index abstracts for words " + whacc ) ;
}
}
2008-02-03 13:40:40 +01:00
// generate statistics
2010-09-14 15:35:47 +02:00
if ( yacyCore . log . isFine ( ) ) yacyCore . log . logFine ( " SEARCH "
+ result . urlcount
+ " URLS FROM "
+ target . hash
+ " : "
+ target . getName ( )
+ " , score= "
+ target . selectscore
+ " , searchtime= " + result . searchtime + " , netdelay= "
+ ( totalrequesttime - result . searchtime ) + " , references= "
+ result . references ) ;
return result . urlcount ;
}
public static class SearchResult {
public String version ; // version : application version of responder
public String uptime ; // uptime : uptime in seconds of responder
public String fwhop ; // hops (depth) of forwards that had been performed to construct this result
public String fwsrc ; // peers that helped to construct this result
public String fwrec ; // peers that would have helped to construct this result (recommendations)
public int urlcount ; // number of returned LURL's for this search
public int joincount ; //
public Map < byte [ ] , Integer > indexcount ; //
public long searchtime ; // time that the peer actually spent to create the result
public String [ ] references ; // search hints, the top-words
public List < URIMetadataRow > links ; // LURLs of search
public Map < byte [ ] , String > indexabstract ; // index abstracts, a collection of url-hashes per word
2010-09-15 13:38:03 +02:00
public SearchResult (
LinkedHashMap < String , ContentBody > parts ,
final yacySeed mySeed ,
final String wordhashes ,
final String excludehashes ,
final String urlhashes ,
final Pattern prefer ,
final Pattern filter ,
final String language ,
final String sitehash ,
final String authorhash ,
final int count ,
final int maxDistance ,
final boolean global ,
final int partitions ,
String hostname ,
String hostaddress ,
final SearchEvent . SecondarySearchSuperviser secondarySearchSuperviser ,
final RankingProfile rankingProfile ,
final Bitfield constraint ) throws IOException {
// send a search request to peer with remote Hash
// INPUT:
// iam : complete seed of the requesting peer
// youare : seed hash of the target peer, used for testing network stability
// key : transmission key for response
// search : a list of search words
// hsearch : a string of word hashes
// fwdep : forward depth. if "0" then peer may NOT ask another peer for more results
// fwden : forward deny, a list of seed hashes. They may NOT be target of forward hopping
// count : maximum number of wanted results
// global : if "true", then result may consist of answers from other peers
// partitions : number of remote peers that are asked (for evaluation of QPM)
// duetime : maximum time that a peer should spent to create a result
// send request
Map < String , String > resultMap = null ;
parts . put ( " myseed " , new StringBody ( ( mySeed = = null ) ? " " : mySeed . genSeedStr ( parts . get ( " key " ) . toString ( ) ) ) ) ;
parts . put ( " count " , new StringBody ( Integer . toString ( Math . max ( 10 , count ) ) ) ) ;
parts . put ( " resource " , new StringBody ( ( ( global ) ? " global " : " local " ) ) ) ;
parts . put ( " partitions " , new StringBody ( Integer . toString ( partitions ) ) ) ;
parts . put ( " query " , new StringBody ( wordhashes ) ) ;
parts . put ( " exclude " , new StringBody ( excludehashes ) ) ;
parts . put ( " duetime " , new StringBody ( " 1000 " ) ) ;
parts . put ( " urls " , new StringBody ( urlhashes ) ) ;
parts . put ( " prefer " , new StringBody ( prefer . toString ( ) ) ) ;
parts . put ( " filter " , new StringBody ( filter . toString ( ) ) ) ;
parts . put ( " language " , new StringBody ( language ) ) ;
parts . put ( " sitehash " , new StringBody ( sitehash ) ) ;
parts . put ( " authorhash " , new StringBody ( authorhash ) ) ;
parts . put ( " ttl " , new StringBody ( " 0 " ) ) ;
parts . put ( " maxdist " , new StringBody ( Integer . toString ( maxDistance ) ) ) ;
parts . put ( " profile " , new StringBody ( crypt . simpleEncode ( rankingProfile . toExternalString ( ) ) ) ) ;
parts . put ( " constraint " , new StringBody ( ( constraint = = null ) ? " " : constraint . exportB64 ( ) ) ) ;
if ( secondarySearchSuperviser ! = null ) parts . put ( " abstracts " , new StringBody ( " auto " ) ) ;
resultMap = FileUtils . table ( HTTPConnector . getConnector ( HTTPLoader . crawlerUserAgent ) . post ( new MultiProtocolURI ( " http:// " + hostaddress + " /yacy/search.html " ) , 60000 , hostname , parts ) ) ;
//resultMap = FileUtils.table(HTTPConnector.getConnector(HTTPLoader.crawlerUserAgent).post(new MultiProtocolURI("http://" + target.getClusterAddress() + "/yacy/search.html"), 60000, target.getHexHash() + ".yacyh", parts));
// evaluate request result
if ( resultMap = = null | | resultMap . isEmpty ( ) ) throw new IOException ( " resultMap is NULL " ) ;
2010-09-14 15:35:47 +02:00
try {
this . searchtime = Integer . parseInt ( resultMap . get ( " searchtime " ) ) ;
} catch ( final NumberFormatException e ) {
2010-09-15 13:38:03 +02:00
throw new IOException ( " wrong output format for searchtime: " + e . getMessage ( ) + " , map = " + resultMap . toString ( ) ) ;
2010-09-14 15:35:47 +02:00
}
try {
this . joincount = Integer . parseInt ( resultMap . get ( " joincount " ) ) ; // the complete number of hits at remote site
} catch ( final NumberFormatException e ) {
throw new IOException ( " wrong output format for joincount: " + e . getMessage ( ) ) ;
}
try {
this . urlcount = Integer . parseInt ( resultMap . get ( " count " ) ) ; // the number of hits that are returned in the result list
} catch ( final NumberFormatException e ) {
throw new IOException ( " wrong output format for count: " + e . getMessage ( ) ) ;
}
this . fwhop = resultMap . get ( " fwhop " ) ;
this . fwsrc = resultMap . get ( " fwsrc " ) ;
this . fwrec = resultMap . get ( " fwrec " ) ;
// scan the result map for entries with special prefix
indexcount = new TreeMap < byte [ ] , Integer > ( Base64Order . enhancedCoder ) ;
indexabstract = new TreeMap < byte [ ] , String > ( Base64Order . enhancedCoder ) ;
for ( Map . Entry < String , String > entry : resultMap . entrySet ( ) ) {
if ( entry . getKey ( ) . startsWith ( " indexcount. " ) ) {
indexcount . put ( entry . getKey ( ) . substring ( 11 ) . getBytes ( ) , Integer . parseInt ( entry . getValue ( ) ) ) ;
}
if ( entry . getKey ( ) . startsWith ( " indexabstract. " ) ) {
indexabstract . put ( entry . getKey ( ) . substring ( 14 ) . getBytes ( ) , entry . getValue ( ) ) ;
}
}
references = resultMap . get ( " references " ) . split ( " , " ) ;
this . links = new ArrayList < URIMetadataRow > ( this . urlcount ) ;
for ( int n = 0 ; n < this . urlcount ; n + + ) {
// get one single search result
String resultLine = resultMap . get ( " resource " + n ) ;
if ( resultLine = = null ) continue ;
URIMetadataRow urlEntry = URIMetadataRow . importEntry ( resultLine ) ;
if ( urlEntry = = null ) continue ;
this . links . add ( urlEntry ) ;
}
}
}
2010-06-01 15:02:11 +02:00
public static Map < String , String > permissionMessage ( final yacySeedDB seedDB , final String targetHash ) {
2005-05-07 23:11:18 +02:00
// ask for allowed message size and attachement size
// if this replies null, the peer does not answer
2008-05-06 01:13:47 +02:00
2007-07-05 01:48:52 +02:00
// prepare request
2008-04-12 10:12:51 +02:00
final String salt = crypt . randomSalt ( ) ;
2005-10-22 15:28:04 +02:00
2007-07-05 01:48:52 +02:00
// send request
2005-05-07 23:11:18 +02:00
try {
2010-07-15 02:59:53 +02:00
final LinkedHashMap < String , ContentBody > parts = yacyNetwork . basicRequestParts ( Switchboard . getSwitchboard ( ) , targetHash , salt ) ;
parts . put ( " process " , new StringBody ( " permission " ) ) ;
final byte [ ] content = postToFile ( seedDB , targetHash , " message.html " , parts , 5000 ) ;
2010-06-01 15:02:11 +02:00
final Map < String , String > result = FileUtils . table ( content ) ;
2007-07-05 01:48:52 +02:00
return result ;
2008-08-02 14:12:04 +02:00
} catch ( final Exception e ) {
2005-05-07 23:11:18 +02:00
// most probably a network time-out exception
2005-08-30 23:32:59 +02:00
yacyCore . log . logSevere ( " yacyClient.permissionMessage error: " + e . getMessage ( ) ) ;
2005-05-07 23:11:18 +02:00
return null ;
}
2005-04-07 21:19:42 +02:00
}
2005-09-21 23:32:43 +02:00
2010-06-01 15:02:11 +02:00
public static Map < String , String > postMessage ( final yacySeedDB seedDB , final String targetHash , final String subject , final byte [ ] message ) {
2005-05-07 23:11:18 +02:00
// this post a message to the remote message board
2007-07-05 01:48:52 +02:00
// prepare request
2008-04-12 10:12:51 +02:00
final String salt = crypt . randomSalt ( ) ;
2007-08-08 20:23:45 +02:00
2007-07-05 01:48:52 +02:00
// send request
2005-11-11 00:48:20 +01:00
try {
2010-07-15 02:59:53 +02:00
final LinkedHashMap < String , ContentBody > parts = yacyNetwork . basicRequestParts ( Switchboard . getSwitchboard ( ) , targetHash , salt ) ;
parts . put ( " process " , new StringBody ( " post " ) ) ;
parts . put ( " myseed " , new StringBody ( seedDB . mySeed ( ) . genSeedStr ( salt ) ) ) ;
parts . put ( " subject " , new StringBody ( subject ) ) ;
try {
parts . put ( " message " , new StringBody ( new String ( message , " UTF-8 " ) ) ) ;
} catch ( final UnsupportedEncodingException e ) {
parts . put ( " message " , new StringBody ( new String ( message ) ) ) ;
}
final byte [ ] content = postToFile ( seedDB , targetHash , " message.html " , parts , 20000 ) ;
2010-06-01 15:02:11 +02:00
final Map < String , String > result = FileUtils . table ( content ) ;
2007-07-05 01:48:52 +02:00
return result ;
2008-08-02 14:12:04 +02:00
} catch ( final Exception e ) {
2005-11-11 00:48:20 +01:00
yacyCore . log . logSevere ( " yacyClient.postMessage error: " + e . getMessage ( ) ) ;
return null ;
}
}
2008-08-02 14:12:04 +02:00
public static String targetAddress ( final yacySeedDB seedDB , final String targetHash ) {
2005-11-11 00:48:20 +01:00
// find target address
2005-05-07 23:11:18 +02:00
String address ;
2008-05-06 01:13:47 +02:00
if ( targetHash . equals ( seedDB . mySeed ( ) . hash ) ) {
address = seedDB . mySeed ( ) . getClusterAddress ( ) ;
2005-09-21 23:32:43 +02:00
} else {
2008-05-06 01:13:47 +02:00
final yacySeed targetSeed = seedDB . getConnected ( targetHash ) ;
2005-11-11 00:48:20 +01:00
if ( targetSeed = = null ) { return null ; }
2007-04-30 00:05:34 +02:00
address = targetSeed . getClusterAddress ( ) ;
2005-09-21 23:32:43 +02:00
}
2005-11-11 00:48:20 +01:00
if ( address = = null ) address = " localhost:8080 " ;
return address ;
}
2010-06-01 15:02:11 +02:00
public static Map < String , String > transferPermission ( final String targetAddress , final long filesize , final String filename ) {
2007-07-05 01:48:52 +02:00
// prepare request
2008-04-12 10:12:51 +02:00
final String salt = crypt . randomSalt ( ) ;
2005-11-11 00:48:20 +01:00
// send request
try {
2010-07-14 00:10:24 +02:00
final LinkedHashMap < String , ContentBody > parts = yacyNetwork . basicRequestParts ( Switchboard . getSwitchboard ( ) , null , salt ) ;
parts . put ( " process " , new StringBody ( " permission " ) ) ;
parts . put ( " purpose " , new StringBody ( " crcon " ) ) ;
parts . put ( " filename " , new StringBody ( filename ) ) ;
parts . put ( " filesize " , new StringBody ( Long . toString ( filesize ) ) ) ;
parts . put ( " can-send-protocol " , new StringBody ( " http " ) ) ;
2010-08-23 03:08:56 +02:00
final byte [ ] content = HTTPConnector . getConnector ( HTTPLoader . crawlerUserAgent ) . post ( new MultiProtocolURI ( " http:// " + targetAddress + " /yacy/transfer.html " ) , 10000 , targetAddress , parts ) ;
2010-06-01 15:02:11 +02:00
final Map < String , String > result = FileUtils . table ( content ) ;
2007-07-05 01:48:52 +02:00
return result ;
2008-08-02 14:12:04 +02:00
} catch ( final Exception e ) {
2005-11-11 00:48:20 +01:00
// most probably a network time-out exception
yacyCore . log . logSevere ( " yacyClient.permissionTransfer error: " + e . getMessage ( ) ) ;
return null ;
}
}
2010-06-01 15:02:11 +02:00
public static Map < String , String > transferStore ( final String targetAddress , final String access , final String filename , final byte [ ] file ) {
2005-11-11 00:48:20 +01:00
2007-07-05 01:48:52 +02:00
// prepare request
2008-04-12 10:12:51 +02:00
final String salt = crypt . randomSalt ( ) ;
2005-10-22 15:28:04 +02:00
2007-07-05 01:48:52 +02:00
// send request
2005-04-07 21:19:42 +02:00
try {
2010-07-14 00:10:24 +02:00
final LinkedHashMap < String , ContentBody > parts = yacyNetwork . basicRequestParts ( Switchboard . getSwitchboard ( ) , null , salt ) ;
parts . put ( " process " , new StringBody ( " store " ) ) ;
parts . put ( " purpose " , new StringBody ( " crcon " ) ) ;
parts . put ( " filesize " , new StringBody ( Long . toString ( file . length ) ) ) ;
parts . put ( " md5 " , new StringBody ( Digest . encodeMD5Hex ( file ) ) ) ;
parts . put ( " access " , new StringBody ( access ) ) ;
parts . put ( " filename " , new ByteArrayBody ( file , filename ) ) ;
2010-08-23 03:08:56 +02:00
final byte [ ] content = HTTPConnector . getConnector ( HTTPLoader . crawlerUserAgent ) . post ( new MultiProtocolURI ( " http:// " + targetAddress + " /yacy/transfer.html " ) , 20000 , targetAddress , parts ) ;
2010-06-01 15:02:11 +02:00
final Map < String , String > result = FileUtils . table ( content ) ;
2007-07-05 01:48:52 +02:00
return result ;
2008-08-02 14:12:04 +02:00
} catch ( final Exception e ) {
2005-08-30 23:32:59 +02:00
yacyCore . log . logSevere ( " yacyClient.postMessage error: " + e . getMessage ( ) ) ;
2005-04-07 21:19:42 +02:00
return null ;
}
}
2005-11-11 00:48:20 +01:00
2008-08-02 14:12:04 +02:00
public static String transfer ( final String targetAddress , final String filename , final byte [ ] file ) {
2010-06-01 15:02:11 +02:00
final Map < String , String > phase1 = transferPermission ( targetAddress , file . length , filename ) ;
2005-11-11 00:48:20 +01:00
if ( phase1 = = null ) return " no connection to remote address " + targetAddress + " ; phase 1 " ;
2008-08-02 14:12:04 +02:00
final String access = phase1 . get ( " access " ) ;
final String nextaddress = phase1 . get ( " address " ) ;
final String protocol = phase1 . get ( " protocol " ) ;
2005-12-05 15:24:13 +01:00
//String path = (String) phase1.get("path");
//String maxsize = (String) phase1.get("maxsize");
2008-06-06 18:01:27 +02:00
String response = phase1 . get ( " response " ) ;
2005-11-11 00:48:20 +01:00
if ( ( response = = null ) | | ( protocol = = null ) | | ( access = = null ) ) return " wrong return values from other peer; phase 1 " ;
if ( ! ( response . equals ( " ok " ) ) ) return " remote peer rejected transfer: " + response ;
2009-01-30 16:33:00 +01:00
final String accesscode = Digest . encodeMD5Hex ( Base64Order . standardCoder . encodeString ( access ) ) ;
2005-11-11 00:48:20 +01:00
if ( protocol . equals ( " http " ) ) {
2010-06-01 15:02:11 +02:00
final Map < String , String > phase2 = transferStore ( nextaddress , accesscode , filename , file ) ;
2005-11-11 00:48:20 +01:00
if ( phase2 = = null ) return " no connection to remote address " + targetAddress + " ; phase 2 " ;
2008-06-06 18:01:27 +02:00
response = phase2 . get ( " response " ) ;
2005-11-11 00:48:20 +01:00
if ( response = = null ) return " wrong return values from other peer; phase 2 " ;
if ( ! ( response . equals ( " ok " ) ) ) {
return " remote peer failed with transfer: " + response ;
}
return null ;
}
return " wrong protocol: " + protocol ;
}
2005-09-21 23:32:43 +02:00
2010-06-01 15:02:11 +02:00
public static Map < String , String > crawlReceipt ( final yacySeed mySeed , final yacySeed target , final String process , final String result , final String reason , final URIMetadataRow entry , final String wordhashes ) {
2007-07-05 01:48:52 +02:00
assert ( target ! = null ) ;
2008-05-06 01:13:47 +02:00
assert ( mySeed ! = null ) ;
assert ( mySeed ! = target ) ;
2005-09-21 23:32:43 +02:00
2005-04-07 21:19:42 +02:00
/ *
the result can have one of the following values :
negative cases , no retry
unavailable - the resource is not avaiable ( a broken link ) ; not found or interrupted
robot - a robot - file has denied to crawl that resource
2005-05-07 23:11:18 +02:00
2005-04-07 21:19:42 +02:00
negative cases , retry possible
rejected - the peer has rejected to load the resource
dequeue - peer too busy - rejected to crawl
positive cases with crawling
fill - the resource was loaded and processed
update - the resource was already in database but re - loaded and processed
2005-05-07 23:11:18 +02:00
positive cases without crawling
2005-04-07 21:19:42 +02:00
known - the resource is already in database , believed to be fresh and not reloaded
stale - the resource was reloaded but not processed because source had no changes
2005-05-07 23:11:18 +02:00
* /
2005-10-22 15:28:04 +02:00
2007-07-05 01:48:52 +02:00
// prepare request
2008-04-12 10:12:51 +02:00
final String salt = crypt . randomSalt ( ) ;
2007-07-05 01:48:52 +02:00
2005-10-22 15:28:04 +02:00
// determining target address
2007-07-05 01:48:52 +02:00
final String address = target . getClusterAddress ( ) ;
2005-09-21 23:32:43 +02:00
if ( address = = null ) { return null ; }
2007-07-05 01:48:52 +02:00
// send request
2005-04-07 21:19:42 +02:00
try {
2010-07-14 00:10:24 +02:00
// prepare request
final LinkedHashMap < String , ContentBody > parts = yacyNetwork . basicRequestParts ( Switchboard . getSwitchboard ( ) , target . hash , salt ) ;
parts . put ( " process " , new StringBody ( process ) ) ;
parts . put ( " urlhash " , new StringBody ( ( ( entry = = null ) ? " " : new String ( entry . hash ( ) ) ) ) ) ;
parts . put ( " result " , new StringBody ( result ) ) ;
parts . put ( " reason " , new StringBody ( reason ) ) ;
parts . put ( " wordh " , new StringBody ( wordhashes ) ) ;
parts . put ( " lurlEntry " , new StringBody ( ( ( entry = = null ) ? " " : crypt . simpleEncode ( entry . toString ( ) , salt ) ) ) ) ;
// send request
2010-08-23 03:08:56 +02:00
final byte [ ] content = HTTPConnector . getConnector ( HTTPLoader . crawlerUserAgent ) . post ( new MultiProtocolURI ( " http:// " + address + " /yacy/crawlReceipt.html " ) , 10000 , target . getHexHash ( ) + " .yacyh " , parts ) ;
2009-10-05 22:11:41 +02:00
return FileUtils . table ( content ) ;
2008-08-02 14:12:04 +02:00
} catch ( final Exception e ) {
2005-05-07 23:11:18 +02:00
// most probably a network time-out exception
2005-08-30 23:32:59 +02:00
yacyCore . log . logSevere ( " yacyClient.crawlReceipt error: " + e . getMessage ( ) ) ;
2005-05-07 23:11:18 +02:00
return null ;
}
2005-04-07 21:19:42 +02:00
}
2005-09-21 23:32:43 +02:00
2009-02-16 22:28:48 +01:00
/ * *
* transfer the index . If the transmission fails , return a string describing the cause .
* If everything is ok , return null .
* @param targetSeed
* @param indexes
* @param urlCache
* @param gzipBody
* @param timeout
* @return
* /
public static String transferIndex (
replaced old DHT transmission method with new method. Many things have changed! some of them:
- after a index selection is made, the index is splitted into its vertical components
- from differrent index selctions the splitted components can be accumulated before they are placed into the transmission queue
- each splitted chunk gets its own transmission thread
- multiple transmission threads are started concurrently
- the process can be monitored with the blocking queue servlet
To implement that, a new package de.anomic.yacy.dht was created. Some old files have been removed.
The new index distribution model using a vertical DHT was implemented. An abstraction of this model
is implemented in the new dht package as interface. The freeworld network has now a configuration
of two vertial partitions; sixteen partitions are planned and will be configured if the process is bug-free.
This modification has three main targets:
- enhance the DHT transmission speed
- with a vertical DHT, a search will speed up. With two partitions, two times. With sixteen, sixteen times.
- the vertical DHT will apply a semi-dht for URLs, and peers will receive a fraction of the overall URLs they received before.
with two partitions, the fractions will be halve. With sixteen partitions, a 1/16 of the previous number of URLs.
BE CAREFULL, THIS IS A MAJOR CODE CHANGE, POSSIBLY FULL OF BUGS AND HARMFUL THINGS.
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5586 6c8d7289-2bf4-0310-a012-ef5d649a1542
2009-02-10 01:06:59 +01:00
final yacySeed targetSeed ,
2009-04-15 08:34:27 +02:00
final ReferenceContainerCache < WordReference > indexes ,
2010-04-15 15:22:59 +02:00
final TreeMap < byte [ ] , URIMetadataRow > urlCache ,
replaced old DHT transmission method with new method. Many things have changed! some of them:
- after a index selection is made, the index is splitted into its vertical components
- from differrent index selctions the splitted components can be accumulated before they are placed into the transmission queue
- each splitted chunk gets its own transmission thread
- multiple transmission threads are started concurrently
- the process can be monitored with the blocking queue servlet
To implement that, a new package de.anomic.yacy.dht was created. Some old files have been removed.
The new index distribution model using a vertical DHT was implemented. An abstraction of this model
is implemented in the new dht package as interface. The freeworld network has now a configuration
of two vertial partitions; sixteen partitions are planned and will be configured if the process is bug-free.
This modification has three main targets:
- enhance the DHT transmission speed
- with a vertical DHT, a search will speed up. With two partitions, two times. With sixteen, sixteen times.
- the vertical DHT will apply a semi-dht for URLs, and peers will receive a fraction of the overall URLs they received before.
with two partitions, the fractions will be halve. With sixteen partitions, a 1/16 of the previous number of URLs.
BE CAREFULL, THIS IS A MAJOR CODE CHANGE, POSSIBLY FULL OF BUGS AND HARMFUL THINGS.
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5586 6c8d7289-2bf4-0310-a012-ef5d649a1542
2009-02-10 01:06:59 +01:00
final boolean gzipBody ,
final int timeout ) {
2005-10-22 15:28:04 +02:00
2010-06-01 15:02:11 +02:00
final Map < String , Object > resultObj = new HashMap < String , Object > ( ) ;
2006-06-14 11:40:42 +02:00
int payloadSize = 0 ;
try {
// check if we got all necessary urls in the urlCache (only for debugging)
2009-04-15 08:34:27 +02:00
Iterator < WordReference > eenum ;
2009-03-02 00:58:14 +01:00
Reference entry ;
2009-04-15 08:34:27 +02:00
for ( ReferenceContainer < WordReference > ic : indexes ) {
replaced old DHT transmission method with new method. Many things have changed! some of them:
- after a index selection is made, the index is splitted into its vertical components
- from differrent index selctions the splitted components can be accumulated before they are placed into the transmission queue
- each splitted chunk gets its own transmission thread
- multiple transmission threads are started concurrently
- the process can be monitored with the blocking queue servlet
To implement that, a new package de.anomic.yacy.dht was created. Some old files have been removed.
The new index distribution model using a vertical DHT was implemented. An abstraction of this model
is implemented in the new dht package as interface. The freeworld network has now a configuration
of two vertial partitions; sixteen partitions are planned and will be configured if the process is bug-free.
This modification has three main targets:
- enhance the DHT transmission speed
- with a vertical DHT, a search will speed up. With two partitions, two times. With sixteen, sixteen times.
- the vertical DHT will apply a semi-dht for URLs, and peers will receive a fraction of the overall URLs they received before.
with two partitions, the fractions will be halve. With sixteen partitions, a 1/16 of the previous number of URLs.
BE CAREFULL, THIS IS A MAJOR CODE CHANGE, POSSIBLY FULL OF BUGS AND HARMFUL THINGS.
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5586 6c8d7289-2bf4-0310-a012-ef5d649a1542
2009-02-10 01:06:59 +01:00
eenum = ic . entries ( ) ;
2006-06-14 11:40:42 +02:00
while ( eenum . hasNext ( ) ) {
2008-06-06 18:01:27 +02:00
entry = eenum . next ( ) ;
2009-04-07 11:34:41 +02:00
if ( urlCache . get ( entry . metadataHash ( ) ) = = null ) {
2010-04-15 15:22:59 +02:00
if ( yacyCore . log . isFine ( ) ) yacyCore . log . logFine ( " DEBUG transferIndex: to-send url hash ' " + new String ( entry . metadataHash ( ) ) + " ' is not contained in urlCache " ) ;
2006-06-14 11:40:42 +02:00
}
2006-01-30 13:42:06 +01:00
}
2006-06-14 11:40:42 +02:00
}
// transfer the RWI without the URLs
2010-06-01 15:02:11 +02:00
Map < String , String > in = transferRWI ( targetSeed , indexes , gzipBody , timeout ) ;
2006-06-14 11:40:42 +02:00
resultObj . put ( " resultTransferRWI " , in ) ;
2008-03-12 01:24:20 +01:00
2006-06-14 11:40:42 +02:00
if ( in = = null ) {
2009-02-16 22:28:48 +01:00
return " no connection from transferRWI " ;
}
2008-01-17 19:43:01 +01:00
if ( in . containsKey ( " indexPayloadSize " ) ) payloadSize + = Integer . parseInt ( in . get ( " indexPayloadSize " ) ) ;
2008-03-12 01:24:20 +01:00
2008-06-06 18:01:27 +02:00
String result = in . get ( " result " ) ;
2009-02-16 22:28:48 +01:00
if ( result = = null ) {
return " no result from transferRWI " ;
2006-01-30 13:42:06 +01:00
}
2009-02-16 22:28:48 +01:00
2006-06-14 11:40:42 +02:00
if ( ! ( result . equals ( " ok " ) ) ) {
2009-02-16 22:28:48 +01:00
return result ;
2006-01-30 13:42:06 +01:00
}
2008-03-12 01:24:20 +01:00
2006-06-14 11:40:42 +02:00
// in now contains a list of unknown hashes
2009-02-16 22:28:48 +01:00
String uhss = in . get ( " unknownURL " ) ;
2006-06-14 11:40:42 +02:00
if ( uhss = = null ) {
2009-02-16 22:28:48 +01:00
return " no unknownURL tag in response " ;
2006-06-14 11:40:42 +02:00
}
2010-06-29 21:20:45 +02:00
yacyChannel . channels ( yacyChannel . DHTSEND ) . addMessage ( new RSSMessage ( " Sent " + indexes . size ( ) + " RWIs to " + targetSeed . getName ( ) , " " , targetSeed . hash ) ) ;
2010-06-27 00:39:27 +02:00
2009-02-16 22:28:48 +01:00
uhss = uhss . trim ( ) ;
if ( uhss . length ( ) = = 0 | | uhss . equals ( " , " ) ) { return null ; } // all url's known, we are ready here
2008-03-12 01:24:20 +01:00
2006-06-14 11:40:42 +02:00
final String [ ] uhs = uhss . split ( " , " ) ;
2009-02-16 22:28:48 +01:00
if ( uhs . length = = 0 ) { return null ; } // all url's known
2008-03-12 01:24:20 +01:00
2006-06-14 11:40:42 +02:00
// extract the urlCache from the result
2009-10-11 02:12:19 +02:00
final URIMetadataRow [ ] urls = new URIMetadataRow [ uhs . length ] ;
2006-06-14 11:40:42 +02:00
for ( int i = 0 ; i < uhs . length ; i + + ) {
2010-04-15 15:22:59 +02:00
urls [ i ] = urlCache . get ( uhs [ i ] . getBytes ( ) ) ;
2006-06-14 11:40:42 +02:00
if ( urls [ i ] = = null ) {
2008-09-03 02:30:21 +02:00
if ( yacyCore . log . isFine ( ) ) yacyCore . log . logFine ( " DEBUG transferIndex: requested url hash ' " + uhs [ i ] + " ', unknownURL=' " + uhss + " ' " ) ;
2006-06-14 11:40:42 +02:00
}
}
replaced old DHT transmission method with new method. Many things have changed! some of them:
- after a index selection is made, the index is splitted into its vertical components
- from differrent index selctions the splitted components can be accumulated before they are placed into the transmission queue
- each splitted chunk gets its own transmission thread
- multiple transmission threads are started concurrently
- the process can be monitored with the blocking queue servlet
To implement that, a new package de.anomic.yacy.dht was created. Some old files have been removed.
The new index distribution model using a vertical DHT was implemented. An abstraction of this model
is implemented in the new dht package as interface. The freeworld network has now a configuration
of two vertial partitions; sixteen partitions are planned and will be configured if the process is bug-free.
This modification has three main targets:
- enhance the DHT transmission speed
- with a vertical DHT, a search will speed up. With two partitions, two times. With sixteen, sixteen times.
- the vertical DHT will apply a semi-dht for URLs, and peers will receive a fraction of the overall URLs they received before.
with two partitions, the fractions will be halve. With sixteen partitions, a 1/16 of the previous number of URLs.
BE CAREFULL, THIS IS A MAJOR CODE CHANGE, POSSIBLY FULL OF BUGS AND HARMFUL THINGS.
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5586 6c8d7289-2bf4-0310-a012-ef5d649a1542
2009-02-10 01:06:59 +01:00
in = transferURL ( targetSeed , urls , gzipBody , timeout ) ;
2006-06-14 11:40:42 +02:00
resultObj . put ( " resultTransferURL " , in ) ;
if ( in = = null ) {
2009-02-16 22:28:48 +01:00
return " no connection from transferURL " ;
2006-06-14 11:40:42 +02:00
}
2009-02-16 22:28:48 +01:00
2008-01-17 19:43:01 +01:00
if ( in . containsKey ( " urlPayloadSize " ) ) payloadSize + = Integer . parseInt ( in . get ( " urlPayloadSize " ) ) ;
2006-06-14 11:40:42 +02:00
2008-06-06 18:01:27 +02:00
result = in . get ( " result " ) ;
2006-06-14 11:40:42 +02:00
if ( result = = null ) {
2009-02-16 22:28:48 +01:00
return " no result from transferURL " ;
2006-06-14 11:40:42 +02:00
}
2009-02-16 22:28:48 +01:00
2010-06-27 00:39:27 +02:00
if ( ! result . equals ( " ok " ) ) {
2009-02-16 22:28:48 +01:00
return result ;
2010-06-27 00:39:27 +02:00
}
2010-06-29 21:20:45 +02:00
yacyChannel . channels ( yacyChannel . DHTSEND ) . addMessage ( new RSSMessage ( " Sent " + uhs . length + " URLs to peer " + targetSeed . getName ( ) , " " , targetSeed . hash ) ) ;
2006-06-14 11:40:42 +02:00
2009-02-16 22:28:48 +01:00
return null ;
2010-07-15 02:59:53 +02:00
} catch ( UnsupportedEncodingException e ) {
yacyCore . log . logSevere ( " yacyClient.transferIndex error: " + e . getMessage ( ) ) ;
return null ;
} finally {
2008-08-06 21:43:12 +02:00
resultObj . put ( " payloadSize " , Integer . valueOf ( payloadSize ) ) ;
2005-04-07 21:19:42 +02:00
}
}
2005-09-21 23:32:43 +02:00
2010-06-01 15:02:11 +02:00
/**
 * Posts the word references held in {@code indexes} to the
 * /yacy/transferRWI.html resource of the target peer.
 *
 * Every reference of every container is serialized as one line consisting of
 * the container's term hash, the reference's property form and a CRLF
 * terminator. The request carries, in addition to the basic request parts,
 * the parts wordc (value of indexes.size()), entryc (number of serialized
 * references) and indexes (the serialized lines).
 *
 * @param targetSeed peer that receives the data; supplies the public address,
 *                   the hash and the hex hash used to build the request
 * @param indexes    containers whose references are transferred
 * @param gzipBody   requested body compression; cleared when the target's
 *                   version is below YACY_SUPPORTS_GZIP_POST_REQUESTS_CHUNKED.
 *                   NOTE(review): the flag is not read again in this method
 *                   after that adjustment - confirm whether compression is
 *                   still meant to be applied here.
 * @param timeout    handed unchanged to the HTTP post call (unit not visible
 *                   here; presumably milliseconds - TODO confirm)
 * @return the response parsed by FileUtils.table, extended by an
 *         indexPayloadSize entry; a fresh map with result=ok and an empty
 *         unknownURL value when there is nothing to send; {@code null} when
 *         the target has no public address or any exception occurs while
 *         building or sending the request
 */
private static Map < String , String > transferRWI (
replaced old DHT transmission method with new method. Many things have changed! some of them:
- after a index selection is made, the index is splitted into its vertical components
- from differrent index selctions the splitted components can be accumulated before they are placed into the transmission queue
- each splitted chunk gets its own transmission thread
- multiple transmission threads are started concurrently
- the process can be monitored with the blocking queue servlet
To implement that, a new package de.anomic.yacy.dht was created. Some old files have been removed.
The new index distribution model using a vertical DHT was implemented. An abstraction of this model
is implemented in the new dht package as interface. The freeworld network has now a configuration
of two vertial partitions; sixteen partitions are planned and will be configured if the process is bug-free.
This modification has three main targets:
- enhance the DHT transmission speed
- with a vertical DHT, a search will speed up. With two partitions, two times. With sixteen, sixteen times.
- the vertical DHT will apply a semi-dht for URLs, and peers will receive a fraction of the overall URLs they received before.
with two partitions, the fractions will be halve. With sixteen partitions, a 1/16 of the previous number of URLs.
BE CAREFULL, THIS IS A MAJOR CODE CHANGE, POSSIBLY FULL OF BUGS AND HARMFUL THINGS.
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5586 6c8d7289-2bf4-0310-a012-ef5d649a1542
2009-02-10 01:06:59 +01:00
final yacySeed targetSeed ,
2009-04-15 08:34:27 +02:00
final ReferenceContainerCache < WordReference > indexes ,
replaced old DHT transmission method with new method. Many things have changed! some of them:
- after a index selection is made, the index is splitted into its vertical components
- from differrent index selctions the splitted components can be accumulated before they are placed into the transmission queue
- each splitted chunk gets its own transmission thread
- multiple transmission threads are started concurrently
- the process can be monitored with the blocking queue servlet
To implement that, a new package de.anomic.yacy.dht was created. Some old files have been removed.
The new index distribution model using a vertical DHT was implemented. An abstraction of this model
is implemented in the new dht package as interface. The freeworld network has now a configuration
of two vertial partitions; sixteen partitions are planned and will be configured if the process is bug-free.
This modification has three main targets:
- enhance the DHT transmission speed
- with a vertical DHT, a search will speed up. With two partitions, two times. With sixteen, sixteen times.
- the vertical DHT will apply a semi-dht for URLs, and peers will receive a fraction of the overall URLs they received before.
with two partitions, the fractions will be halve. With sixteen partitions, a 1/16 of the previous number of URLs.
BE CAREFULL, THIS IS A MAJOR CODE CHANGE, POSSIBLY FULL OF BUGS AND HARMFUL THINGS.
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5586 6c8d7289-2bf4-0310-a012-ef5d649a1542
2009-02-10 01:06:59 +01:00
boolean gzipBody ,
final int timeout ) {
2007-04-30 00:05:34 +02:00
final String address = targetSeed . getPublicAddress ( ) ;
2008-08-02 15:57:00 +02:00
// without a public address nothing can be sent; the caller sees null
if ( address = = null ) { yacyCore . log . logWarning ( " no address for transferRWI " ) ; return null ; }
2007-07-05 01:48:52 +02:00
2005-04-07 21:19:42 +02:00
// prepare post values
2008-04-12 10:12:51 +02:00
final String salt = crypt . randomSalt ( ) ;
2005-09-22 12:30:55 +02:00
// enabling gzip compression for post request body
// NOTE(review): gzipBody is only downgraded here and is not used by any later statement of this method
2008-05-04 12:53:04 +02:00
if ( gzipBody & & ( targetSeed . getVersion ( ) < yacyVersion . YACY_SUPPORTS_GZIP_POST_REQUESTS_CHUNKED ) ) {
gzipBody = false ;
}
2005-10-22 15:28:04 +02:00
2005-04-07 21:19:42 +02:00
// counts the serialized references over all containers
int indexcount = 0 ;
replaced old DHT transmission method with new method. Many things have changed! some of them:
- after a index selection is made, the index is splitted into its vertical components
- from differrent index selctions the splitted components can be accumulated before they are placed into the transmission queue
- each splitted chunk gets its own transmission thread
- multiple transmission threads are started concurrently
- the process can be monitored with the blocking queue servlet
To implement that, a new package de.anomic.yacy.dht was created. Some old files have been removed.
The new index distribution model using a vertical DHT was implemented. An abstraction of this model
is implemented in the new dht package as interface. The freeworld network has now a configuration
of two vertial partitions; sixteen partitions are planned and will be configured if the process is bug-free.
This modification has three main targets:
- enhance the DHT transmission speed
- with a vertical DHT, a search will speed up. With two partitions, two times. With sixteen, sixteen times.
- the vertical DHT will apply a semi-dht for URLs, and peers will receive a fraction of the overall URLs they received before.
with two partitions, the fractions will be halve. With sixteen partitions, a 1/16 of the previous number of URLs.
BE CAREFULL, THIS IS A MAJOR CODE CHANGE, POSSIBLY FULL OF BUGS AND HARMFUL THINGS.
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5586 6c8d7289-2bf4-0310-a012-ef5d649a1542
2009-02-10 01:06:59 +01:00
// buffer is presized with 73 characters per element reported by indexes.size()
final StringBuilder entrypost = new StringBuilder ( indexes . size ( ) * 73 ) ;
2009-04-15 08:34:27 +02:00
Iterator < WordReference > eenum ;
2009-03-02 00:58:14 +01:00
Reference entry ;
2009-04-15 08:34:27 +02:00
for ( ReferenceContainer < WordReference > ic : indexes ) {
replaced old DHT transmission method with new method. Many things have changed! some of them:
- after a index selection is made, the index is splitted into its vertical components
- from differrent index selctions the splitted components can be accumulated before they are placed into the transmission queue
- each splitted chunk gets its own transmission thread
- multiple transmission threads are started concurrently
- the process can be monitored with the blocking queue servlet
To implement that, a new package de.anomic.yacy.dht was created. Some old files have been removed.
The new index distribution model using a vertical DHT was implemented. An abstraction of this model
is implemented in the new dht package as interface. The freeworld network has now a configuration
of two vertial partitions; sixteen partitions are planned and will be configured if the process is bug-free.
This modification has three main targets:
- enhance the DHT transmission speed
- with a vertical DHT, a search will speed up. With two partitions, two times. With sixteen, sixteen times.
- the vertical DHT will apply a semi-dht for URLs, and peers will receive a fraction of the overall URLs they received before.
with two partitions, the fractions will be halve. With sixteen partitions, a 1/16 of the previous number of URLs.
BE CAREFULL, THIS IS A MAJOR CODE CHANGE, POSSIBLY FULL OF BUGS AND HARMFUL THINGS.
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5586 6c8d7289-2bf4-0310-a012-ef5d649a1542
2009-02-10 01:06:59 +01:00
eenum = ic . entries ( ) ;
2005-10-13 15:57:15 +02:00
while ( eenum . hasNext ( ) ) {
2008-06-06 18:01:27 +02:00
entry = eenum . next ( ) ;
2010-04-15 15:22:59 +02:00
// one line per reference: term hash + property form + CRLF
// NOTE(review): if getTermHash() returns a byte[], new String(...) decodes with the platform default charset - confirm the hash is ASCII-only
entrypost . append ( new String ( ic . getTermHash ( ) ) )
2006-11-17 15:17:20 +01:00
. append ( entry . toPropertyForm ( ) )
2007-12-14 20:17:54 +01:00
. append ( serverCore . CRLF_STRING ) ;
2005-08-13 00:14:24 +02:00
indexcount + + ;
2005-04-07 21:19:42 +02:00
}
}
2005-09-21 23:32:43 +02:00
2005-08-13 00:14:24 +02:00
if ( indexcount = = 0 ) {
// nothing to do but everything ok
// a locally built success answer is returned; no request is sent in this case
2008-01-22 12:51:43 +01:00
final HashMap < String , String > result = new HashMap < String , String > ( 2 ) ;
2005-08-13 00:14:24 +02:00
result . put ( " result " , " ok " ) ;
result . put ( " unknownURL " , " " ) ;
return result ;
2005-04-07 21:19:42 +02:00
}
2005-05-07 23:11:18 +02:00
try {
2010-07-15 02:59:53 +02:00
final LinkedHashMap < String , ContentBody > parts = yacyNetwork . basicRequestParts ( Switchboard . getSwitchboard ( ) , targetSeed . hash , salt ) ;
parts . put ( " wordc " , new StringBody ( Integer . toString ( indexes . size ( ) ) ) ) ;
parts . put ( " entryc " , new StringBody ( Integer . toString ( indexcount ) ) ) ;
parts . put ( " indexes " , new StringBody ( entrypost . toString ( ) ) ) ;
2010-08-23 03:08:56 +02:00
final byte [ ] content = HTTPConnector . getConnector ( HTTPLoader . crawlerUserAgent ) . post ( new MultiProtocolURI ( " http:// " + address + " /yacy/transferRWI.html " ) , timeout , targetSeed . getHexHash ( ) + " .yacyh " , parts ) ;
2009-10-05 22:11:41 +02:00
final Iterator < String > v = FileUtils . strings ( content ) ;
2008-05-04 12:53:04 +02:00
// this should return a list of urlhashes that are unknown
2005-04-07 21:19:42 +02:00
2010-06-01 15:02:11 +02:00
final Map < String , String > result = FileUtils . table ( v ) ;
2006-06-14 11:40:42 +02:00
// report the size of the transferred index data (for debugging only);
// entrypost.length() counts characters, which is not necessarily the byte count on the wire
2008-01-17 19:43:01 +01:00
result . put ( " indexPayloadSize " , Integer . toString ( entrypost . length ( ) ) ) ;
2005-04-07 21:19:42 +02:00
return result ;
2008-08-02 14:12:04 +02:00
} catch ( final Exception e ) {
2010-07-15 02:59:53 +02:00
// every failure (request building, transport, response parsing) is logged at info level and reported as null
yacyCore . log . logInfo ( " yacyClient.transferRWI to " + address + " error: " + e . getMessage ( ) ) ;
2005-04-07 21:19:42 +02:00
return null ;
}
}
2005-09-21 23:32:43 +02:00
2010-07-15 02:59:53 +02:00
/**
 * Posts the string form of the given URL metadata rows to the
 * /yacy/transferURL.html resource of the target peer.
 *
 * Rows that are {@code null}, whose toString() result is {@code null}, or
 * whose string form contains a NUL character are skipped. Each accepted row
 * becomes a part named url followed by a running index; the part urlc
 * carries the number of accepted rows.
 *
 * @param targetSeed peer that receives the data; supplies the public address,
 *                   the hash and the hex hash used to build the request
 * @param urls       rows to send; individual elements may be {@code null}
 * @param gzipBody   requested body compression; cleared when the target's
 *                   version is below YACY_SUPPORTS_GZIP_POST_REQUESTS_CHUNKED.
 *                   NOTE(review): the flag is not read again in this method
 *                   after that adjustment.
 * @param timeout    handed unchanged to the HTTP post call (unit not visible
 *                   here; presumably milliseconds - TODO confirm)
 * @return the response parsed by FileUtils.table, extended by a
 *         urlPayloadSize entry; {@code null} when the target has no public
 *         address or an exception occurs inside the try block
 * @throws UnsupportedEncodingException declared by the signature; the request
 *         setup and the StringBody construction inside the loop run outside
 *         the try block, so such an exception raised there is not caught here
 */
private static Map < String , String > transferURL ( final yacySeed targetSeed , final URIMetadataRow [ ] urls , boolean gzipBody , final int timeout ) throws UnsupportedEncodingException {
2005-05-07 23:11:18 +02:00
// sends URL metadata to the target peer (no message-board posting happens here)
2007-04-30 00:05:34 +02:00
final String address = targetSeed . getPublicAddress ( ) ;
2005-09-21 23:32:43 +02:00
// unlike transferRWI, a missing address is not logged here
if ( address = = null ) { return null ; }
2007-07-05 01:48:52 +02:00
2005-04-07 21:19:42 +02:00
// prepare post values
2008-04-12 10:12:51 +02:00
final String salt = crypt . randomSalt ( ) ;
2010-07-15 02:59:53 +02:00
final LinkedHashMap < String , ContentBody > parts = yacyNetwork . basicRequestParts ( Switchboard . getSwitchboard ( ) , targetSeed . hash , salt ) ;
2005-09-22 12:30:55 +02:00
// enabling gzip compression for post request body
// NOTE(review): gzipBody is only downgraded here and is not used by any later statement of this method
2008-05-04 12:53:04 +02:00
if ( gzipBody & & ( targetSeed . getVersion ( ) < yacyVersion . YACY_SUPPORTS_GZIP_POST_REQUESTS_CHUNKED ) ) {
gzipBody = false ;
}
2005-09-22 12:30:55 +02:00
2010-04-08 02:11:32 +02:00
String resource ;
2005-04-07 21:19:42 +02:00
// urlc numbers the accepted rows consecutively, so skipped rows leave no gap in the part names
int urlc = 0 ;
2006-06-14 11:40:42 +02:00
int urlPayloadSize = 0 ;
2005-04-07 21:19:42 +02:00
for ( int i = 0 ; i < urls . length ; i + + ) {
if ( urls [ i ] ! = null ) {
2010-04-08 02:11:32 +02:00
resource = urls [ i ] . toString ( ) ;
//System.out.println("*** DEBUG resource = " + resource);
2010-04-10 02:21:07 +02:00
// rows whose string form contains a NUL character (code 0) are left out
if ( resource ! = null & & resource . indexOf ( 0 ) = = - 1 ) {
2010-07-15 02:59:53 +02:00
parts . put ( " url " + urlc , new StringBody ( resource ) ) ;
2006-06-14 11:40:42 +02:00
urlPayloadSize + = resource . length ( ) ;
2005-04-07 21:19:42 +02:00
urlc + + ;
}
}
}
2005-05-07 23:11:18 +02:00
try {
2010-07-15 02:59:53 +02:00
parts . put ( " urlc " , new StringBody ( Integer . toString ( urlc ) ) ) ;
2010-08-23 03:08:56 +02:00
final byte [ ] content = HTTPConnector . getConnector ( HTTPLoader . crawlerUserAgent ) . post ( new MultiProtocolURI ( " http:// " + address + " /yacy/transferURL.html " ) , timeout , targetSeed . getHexHash ( ) + " .yacyh " , parts ) ;
2009-10-05 22:11:41 +02:00
final Iterator < String > v = FileUtils . strings ( content ) ;
2005-10-22 15:28:04 +02:00
2010-06-01 15:02:11 +02:00
final Map < String , String > result = FileUtils . table ( v ) ;
2006-06-14 11:40:42 +02:00
// report the size of the transferred url data (for debugging only);
// the value is the sum of String.length() results, i.e. characters rather than encoded bytes
2008-01-17 19:43:01 +01:00
result . put ( " urlPayloadSize " , Integer . toString ( urlPayloadSize ) ) ;
2006-06-14 11:40:42 +02:00
return result ;
2008-08-02 14:12:04 +02:00
} catch ( final Exception e ) {
2010-07-15 02:59:53 +02:00
// failures inside the try block are logged as severe and reported as null
yacyCore . log . logSevere ( " yacyClient.transferURL to " + address + " error: " + e . getMessage ( ) ) ;
2005-04-07 21:19:42 +02:00
return null ;
}
}
2005-09-21 23:32:43 +02:00
2010-06-01 15:02:11 +02:00
/**
 * Requests the /yacy/profile.html resource of the given peer and returns the
 * response as a key/value table.
 *
 * The peer's cluster address is used; when it is {@code null} the request is
 * directed to a localhost:8080 fallback address instead. The post call is
 * given a fixed timeout value of 5000.
 *
 * @param targetSeed peer whose profile is requested; supplies the cluster
 *                   address, the hash and the hex hash used for the request
 * @return the response parsed by FileUtils.table, or {@code null} when any
 *         exception occurs while building or sending the request
 */
public static Map < String , String > getProfile ( final yacySeed targetSeed ) {
2007-07-05 01:48:52 +02:00
2005-05-07 23:11:18 +02:00
// fetches the profile of the remote peer (no message-board posting happens here)
2008-04-12 10:12:51 +02:00
final String salt = crypt . randomSalt ( ) ;
2007-07-05 01:48:52 +02:00
2007-04-30 00:05:34 +02:00
String address = targetSeed . getClusterAddress ( ) ;
2005-09-21 23:32:43 +02:00
// a peer without cluster address is replaced by the local default address
if ( address = = null ) { address = " localhost:8080 " ; }
2005-05-07 23:11:18 +02:00
try {
2010-07-14 00:10:24 +02:00
final LinkedHashMap < String , ContentBody > parts = yacyNetwork . basicRequestParts ( Switchboard . getSwitchboard ( ) , targetSeed . hash , salt ) ;
2010-08-23 03:08:56 +02:00
final byte [ ] content = HTTPConnector . getConnector ( HTTPLoader . crawlerUserAgent ) . post ( new MultiProtocolURI ( " http:// " + address + " /yacy/profile.html " ) , 5000 , targetSeed . getHexHash ( ) + " .yacyh " , parts ) ;
2009-10-05 22:11:41 +02:00
return FileUtils . table ( content ) ;
2008-08-02 14:12:04 +02:00
} catch ( final Exception e ) {
2005-08-30 23:32:59 +02:00
// the log line carries only the exception message, not the target address
yacyCore . log . logSevere ( " yacyClient.getProfile error: " + e . getMessage ( ) ) ;
2005-04-07 21:19:42 +02:00
return null ;
}
}
2005-09-21 23:32:43 +02:00
2008-08-02 14:12:04 +02:00
/**
 * Command-line test driver with two modes selected by the argument count.
 *
 * More than two arguments ("yacyClient Test"): args[2] names either an
 * existing file whose lines are used as search lines or, if no such file
 * exists, is itself the single search line. For each line a SearchResult is
 * built against the peer address given in args[1]; the normal form of every
 * result URL and its snippet are printed, followed by the elapsed time.
 * This mode ends with System.exit(0).
 *
 * Exactly one argument ("wput Test"): args[0] is parsed as URL and a post
 * with the parts process=permission and purpose=crcon is sent to it; the
 * response is printed. A malformed URL ends the program with exit status 1.
 *
 * Any other argument count skips both modes. Whenever the end of the method
 * is reached, the HTTP connection manager is closed.
 *
 * @param args command-line arguments as described above
 */
public static void main ( final String [ ] args ) {
2010-09-14 15:35:47 +02:00
if ( args . length > 2 ) {
// search a remote peer. arguments:
// first arg: path to application home
// second arg: address of target peer
// third arg: search word or file name with list of search words
// NOTE(review): args[0] is not read anywhere in this branch
System . out . println ( " yacyClient Test " ) ;
File searchwordfile = new File ( args [ 2 ] ) ;
List < String > searchlines = new ArrayList < String > ( ) ;
if ( searchwordfile . exists ( ) ) {
Iterator < String > i ;
try {
i = FileUtils . strings ( FileUtils . read ( searchwordfile ) ) ;
while ( i . hasNext ( ) ) searchlines . add ( i . next ( ) ) ;
} catch ( IOException e ) {
// an unreadable word file aborts the whole run with exit status -1
e . printStackTrace ( ) ;
System . exit ( - 1 ) ;
}
} else {
// no such file: the argument itself is the search line
searchlines . add ( args [ 2 ] ) ;
}
for ( String line : searchlines ) {
// NOTE(review): getBytes() here and new String(wordhashe) below both rely on the platform default charset
final byte [ ] wordhashe = QueryParams . hashSet2hashString ( Word . words2hashesHandles ( QueryParams . cleanQuery ( line ) [ 0 ] ) ) . getBytes ( ) ;
long time = System . currentTimeMillis ( ) ;
SearchResult result ;
try {
2010-09-15 13:38:03 +02:00
result = new SearchResult (
2010-09-14 15:35:47 +02:00
yacyNetwork . basicRequestParts ( ( String ) null , ( String ) null , " freeworld " ) ,
null , // sb.peers.mySeed(),
new String ( wordhashe ) ,
" " , // excludehashes,
" " , // urlhashes,
Pattern . compile ( " " ) , // prefer,
Pattern . compile ( " .* " ) , // filter,
" " , // language,
" " , // sitehash,
" " , // authorhash,
10 , // count,
1000 , // maxDistance,
true , //global,
16 , // partitions,
" " , args [ 1 ] ,
null , //secondarySearchSuperviser,
new RankingProfile ( ContentDomain . TEXT ) , // rankingProfile,
null // constraint);
) ;
2010-09-15 13:38:03 +02:00
for ( URIMetadataRow link : result . links ) {
2010-09-14 15:35:47 +02:00
System . out . println ( link . metadata ( ) . url ( ) . toNormalform ( true , false ) ) ;
System . out . println ( link . snippet ( ) ) ;
}
} catch ( IOException e ) {
2010-09-14 23:03:50 +02:00
// a failed search is only reported; the loop continues with the next search line
2010-09-14 15:35:47 +02:00
e . printStackTrace ( ) ;
}
System . out . println ( " Search Time: " + ( System . currentTimeMillis ( ) - time ) ) ;
}
// exits before the connection manager shutdown at the end of this method is reached
System . exit ( 0 ) ;
2008-05-04 12:53:04 +02:00
} else if ( args . length = = 1 ) {
System . out . println ( " wput Test " ) ;
// connection params
2010-08-23 03:08:56 +02:00
MultiProtocolURI url = null ;
2008-05-04 12:53:04 +02:00
try {
2010-08-23 03:08:56 +02:00
url = new MultiProtocolURI ( args [ 0 ] ) ;
2008-08-02 14:12:04 +02:00
} catch ( final MalformedURLException e ) {
2009-11-05 21:28:37 +01:00
Log . logException ( e ) ;
2008-05-04 12:53:04 +02:00
}
2010-09-14 15:35:47 +02:00
// url stays null when parsing failed above
if ( url = = null ) {
2008-05-04 12:53:04 +02:00
System . exit ( 1 ) ;
return ;
}
final String vhost = url . getHost ( ) ;
final int timeout = 10000 ;
2010-07-13 01:07:05 +02:00
// new data
final LinkedHashMap < String , ContentBody > newpost = new LinkedHashMap < String , ContentBody > ( ) ;
try {
newpost . put ( " process " , new StringBody ( " permission " ) ) ;
newpost . put ( " purpose " , new StringBody ( " crcon " ) ) ;
} catch ( UnsupportedEncodingException e ) {
// only logged; the post below is still attempted with whatever parts were added
Log . logException ( e ) ;
}
byte [ ] res ;
try {
2010-08-23 03:08:56 +02:00
res = HTTPConnector . getConnector ( HTTPLoader . crawlerUserAgent ) . post ( url , timeout , vhost , newpost ) ;
2010-07-13 01:07:05 +02:00
// NOTE(review): the response bytes are decoded with the platform default charset
System . out . println ( new String ( res ) ) ;
} catch ( IOException e1 ) {
Log . logException ( e1 ) ;
}
2008-05-04 12:53:04 +02:00
}
2010-07-23 01:08:37 +02:00
try {
2010-08-23 00:32:39 +02:00
net . yacy . cora . protocol . http . HTTPClient . closeConnectionManager ( ) ;
2010-07-23 01:08:37 +02:00
} catch ( InterruptedException e ) {
// NOTE(review): the interruption is only logged; the thread's interrupt status is not restored
Log . logException ( e ) ;
}
2005-04-07 21:19:42 +02:00
}
2005-09-21 23:32:43 +02:00
2005-04-07 21:19:42 +02:00
}