2010-02-17 00:06:04 +01:00
// ResultURLs.java
2008-03-26 15:13:05 +01:00
// -----------------------
// part of YaCy
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://yacy.net
// Frankfurt, Germany, 2004
//
2010-04-05 14:37:33 +02:00
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
2008-03-26 15:13:05 +01:00
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
2008-05-06 02:32:41 +02:00
package de.anomic.crawler ;
2008-03-26 15:13:05 +01:00
2008-06-17 20:56:04 +02:00
import java.net.MalformedURLException ;
import java.util.Date ;
2008-09-04 11:59:17 +02:00
import java.util.Iterator ;
2010-02-17 00:06:04 +01:00
import java.util.LinkedHashMap ;
2009-10-21 22:14:30 +02:00
import java.util.Map ;
2010-06-01 15:02:11 +02:00
import java.util.concurrent.ConcurrentHashMap ;
2008-03-26 15:13:05 +01:00
2010-10-14 13:40:02 +02:00
import net.yacy.cora.storage.ScoreCluster ;
import net.yacy.cora.storage.DynamicScore ;
2009-10-11 02:12:19 +02:00
import net.yacy.kelondro.data.meta.DigestURI ;
import net.yacy.kelondro.data.meta.URIMetadataRow ;
2009-10-10 01:13:30 +02:00
import net.yacy.kelondro.logging.Log ;
2009-10-10 01:22:22 +02:00
import net.yacy.kelondro.order.Bitfield ;
2010-02-17 00:06:04 +01:00
import net.yacy.kelondro.util.ReverseMapIterator ;
2009-10-10 01:13:30 +02:00
2008-05-06 02:32:41 +02:00
public final class ResultURLs {
2008-03-26 15:13:05 +01:00
2011-02-12 01:01:40 +01:00
/**
 * Origin of a resource-load event; every origin owns one result stack.
 * Each constant carries a numeric code, and {@link #getEvent(int)} maps a
 * code back to its constant.
 */
public enum EventOrigin {

    // we must distinguish the following cases: resource-load was initiated by
    // 1) global crawling: the index is extern, not here (not possible here)
    // 2) result of search queries, some indexes are here (not possible here)
    // 3) result of index transfer, some of them are here (not possible here)
    // 4) proxy-load (initiator is "------------")
    // 5) local prefetch/crawling (initiator is own seedHash)
    // 6) local fetching for global crawling (other known or unknown initiator)
    UNKNOWN(0),
    REMOTE_RECEIPTS(1),
    QUERIES(2),
    DHT_TRANSFER(3),
    PROXY_LOAD(4),
    LOCAL_CRAWLING(5),
    GLOBAL_CRAWLING(6),
    SURROGATES(7);

    /** numeric code of this origin; fixed at construction because enum constants are shared singletons */
    protected final int code;

    /** lookup table for getEvent: the constant with code c is stored at index c */
    private static final EventOrigin[] list = {
        UNKNOWN, REMOTE_RECEIPTS, QUERIES, DHT_TRANSFER, PROXY_LOAD, LOCAL_CRAWLING, GLOBAL_CRAWLING, SURROGATES
    };

    private EventOrigin(int code) {
        this.code = code;
    }

    /**
     * @return the numeric code of this origin
     */
    public int getCode() {
        return this.code;
    }

    /**
     * Resolve a numeric code to its origin.
     *
     * @param key a code as returned by {@link #getCode()}, in the range 0..7
     * @return the origin stored for that code
     * @throws ArrayIndexOutOfBoundsException if key is outside the lookup table
     */
    public static final EventOrigin getEvent(int key) {
        return list[key];
    }
}
2011-02-12 01:01:40 +01:00
/** initial capacity of each per-origin result stack */
private final static int initialStackCapacity = 500;

/** for every event origin: an insertion-ordered mapping from urlHash to the initiator/executor entry */
private final static Map<EventOrigin, Map<String, InitExecEntry>> resultStacks =
    new ConcurrentHashMap<EventOrigin, Map<String, InitExecEntry>>(EventOrigin.values().length);

/** for every event origin: a score counter keyed by host name (incremented in stack()) */
private final static Map<EventOrigin, DynamicScore<String>> resultDomains =
    new ConcurrentHashMap<EventOrigin, DynamicScore<String>>(EventOrigin.values().length);

static {
    // The outer maps hold exactly one entry per origin, so they are sized by the
    // number of origins; the stack capacity is applied to the stacks themselves.
    for (final EventOrigin origin : EventOrigin.values()) {
        resultStacks.put(origin, new LinkedHashMap<String, InitExecEntry>(initialStackCapacity));
        resultDomains.put(origin, new ScoreCluster<String>());
    }
}
2011-02-12 01:01:40 +01:00
/**
 * Value stored on a result stack: the hash of the peer that initiated a load
 * and the hash of the peer that executed it. The arrays are stored by
 * reference, not copied.
 */
public static class InitExecEntry {

    /** hash of the initiator; stack() documents that null means the proxy */
    public byte[] initiatorHash;

    /** hash of the executor */
    public byte[] executorHash;

    /**
     * @param initiatorHash hash of the initiator
     * @param executorHash hash of the executor
     */
    public InitExecEntry(final byte[] initiatorHash, final byte[] executorHash) {
        this.executorHash = executorHash;
        this.initiatorHash = initiatorHash;
    }
}
2008-03-26 15:13:05 +01:00
2011-02-12 01:01:40 +01:00
public static void stack (
2010-04-08 02:11:32 +02:00
final URIMetadataRow e ,
final byte [ ] initiatorHash ,
final byte [ ] executorHash ,
final EventOrigin stackType ) {
2010-04-10 17:12:39 +02:00
// assert initiatorHash != null; // null == proxy !
2008-05-06 01:13:47 +02:00
assert executorHash ! = null ;
2008-03-26 15:13:05 +01:00
if ( e = = null ) { return ; }
try {
2010-06-01 15:02:11 +02:00
final Map < String , InitExecEntry > resultStack = getStack ( stackType ) ;
2008-09-04 11:59:17 +02:00
if ( resultStack ! = null ) {
2010-03-26 19:33:20 +01:00
resultStack . put ( new String ( e . hash ( ) ) , new InitExecEntry ( initiatorHash , executorHash ) ) ;
2008-03-26 15:13:05 +01:00
}
2008-08-02 14:12:04 +02:00
} catch ( final Exception ex ) {
2008-03-26 15:13:05 +01:00
System . out . println ( " INTERNAL ERROR in newEntry/2: " + ex . toString ( ) ) ;
return ;
}
2008-09-04 11:59:17 +02:00
try {
2010-10-14 13:40:02 +02:00
final DynamicScore < String > domains = getDomains ( stackType ) ;
2008-09-04 11:59:17 +02:00
if ( domains ! = null ) {
2010-10-16 00:01:39 +02:00
domains . inc ( e . metadata ( ) . url ( ) . getHost ( ) ) ;
2008-09-04 11:59:17 +02:00
}
} catch ( final Exception ex ) {
System . out . println ( " INTERNAL ERROR in newEntry/3: " + ex . toString ( ) ) ;
return ;
}
2008-03-26 15:13:05 +01:00
}
2011-02-12 01:01:40 +01:00
public static int getStackSize ( final EventOrigin stack ) {
2010-06-01 15:02:11 +02:00
final Map < String , InitExecEntry > resultStack = getStack ( stack ) ;
2008-09-04 21:41:57 +02:00
if ( resultStack = = null ) return 0 ;
2008-08-02 15:57:00 +02:00
return resultStack . size ( ) ;
2008-03-26 15:13:05 +01:00
}
2008-09-04 21:41:57 +02:00
2011-02-12 01:01:40 +01:00
public static int getDomainListSize ( final EventOrigin stack ) {
2010-10-14 13:40:02 +02:00
final DynamicScore < String > domains = getDomains ( stack ) ;
2008-09-04 21:41:57 +02:00
if ( domains = = null ) return 0 ;
return domains . size ( ) ;
}
2008-06-17 20:56:04 +02:00
2011-02-12 01:01:40 +01:00
public static Iterator < Map . Entry < String , InitExecEntry > > results ( final EventOrigin stack ) {
2010-06-01 15:02:11 +02:00
final Map < String , InitExecEntry > resultStack = getStack ( stack ) ;
2010-02-17 00:06:04 +01:00
if ( resultStack = = null ) return new LinkedHashMap < String , InitExecEntry > ( ) . entrySet ( ) . iterator ( ) ;
return new ReverseMapIterator < String , InitExecEntry > ( resultStack ) ;
2008-03-26 15:13:05 +01:00
}
2008-09-04 11:59:17 +02:00
/ * *
* iterate all domains in the result domain statistic
* @return iterator of domains in reverse order ( downwards )
* /
2011-02-12 01:01:40 +01:00
public static Iterator < String > domains ( final EventOrigin stack ) {
2008-09-04 21:41:57 +02:00
assert getDomains ( stack ) ! = null : " getDomains( " + stack + " ) = null " ;
2010-10-16 00:01:39 +02:00
return getDomains ( stack ) . keys ( false ) ;
2008-09-04 11:59:17 +02:00
}
2011-02-12 01:01:40 +01:00
public static int deleteDomain ( final EventOrigin stack , String host , String hosthash ) {
2010-02-17 00:06:04 +01:00
assert host ! = null : " host = null " ;
2008-09-04 23:28:00 +02:00
assert hosthash . length ( ) = = 6 ;
2010-02-17 00:06:04 +01:00
final Iterator < Map . Entry < String , InitExecEntry > > i = results ( stack ) ;
Map . Entry < String , InitExecEntry > w ;
2010-01-19 12:29:22 +01:00
String urlhash ;
2010-02-17 00:06:04 +01:00
while ( i . hasNext ( ) ) {
w = i . next ( ) ;
urlhash = w . getKey ( ) ;
if ( urlhash = = null | | urlhash . substring ( 6 ) . equals ( hosthash ) ) i . remove ( ) ;
2008-09-04 21:41:57 +02:00
}
assert getDomains ( stack ) ! = null : " getDomains( " + stack + " ) = null " ;
2010-10-16 00:01:39 +02:00
return getDomains ( stack ) . delete ( host ) ;
2008-09-04 21:41:57 +02:00
}
2008-09-04 11:59:17 +02:00
/ * *
* return the count of the domain
* @param stack type
* @param domain name
* @return the number of occurrences of the domain in the stack statistics
* /
2011-02-12 01:01:40 +01:00
public static int domainCount ( final EventOrigin stack , String domain ) {
2008-09-04 21:41:57 +02:00
assert domain ! = null : " domain = null " ;
assert getDomains ( stack ) ! = null : " getDomains( " + stack + " ) = null " ;
2010-10-16 00:01:39 +02:00
return getDomains ( stack ) . get ( domain ) ;
2008-09-04 11:59:17 +02:00
}
2008-06-17 20:56:04 +02:00
/ * *
2009-10-21 22:14:30 +02:00
* returns the stack identified by the id < em > stack < / em >
2008-06-17 20:56:04 +02:00
*
* @param stack id of resultStack
* @return null if stack does not exist ( id is unknown or stack is null ( which should not occur and an error is logged ) )
* /
2011-02-12 01:01:40 +01:00
private static Map < String , InitExecEntry > getStack ( final EventOrigin stack ) {
2009-10-21 22:14:30 +02:00
return resultStacks . get ( stack ) ;
2008-06-17 20:56:04 +02:00
}
2011-02-12 01:01:40 +01:00
private static DynamicScore < String > getDomains ( final EventOrigin stack ) {
2009-10-21 22:14:30 +02:00
return resultDomains . get ( stack ) ;
2008-09-04 11:59:17 +02:00
}
2008-03-26 15:13:05 +01:00
2011-02-12 01:01:40 +01:00
public static void clearStacks ( ) {
2010-10-01 01:57:58 +02:00
for ( EventOrigin origin : EventOrigin . values ( ) ) clearStack ( origin ) ;
}
2011-02-12 01:01:40 +01:00
public static void clearStack ( final EventOrigin stack ) {
2010-06-01 15:02:11 +02:00
final Map < String , InitExecEntry > resultStack = getStack ( stack ) ;
2008-09-04 11:59:17 +02:00
if ( resultStack ! = null ) resultStack . clear ( ) ;
2010-10-14 13:40:02 +02:00
final DynamicScore < String > resultDomains = getDomains ( stack ) ;
2008-09-04 21:41:57 +02:00
if ( resultDomains ! = null ) {
// we do not clear this completely, just remove most of the less important entries
resultDomains . shrinkToMaxSize ( 100 ) ;
resultDomains . shrinkToMinScore ( 2 ) ;
}
2008-03-26 15:13:05 +01:00
}
2011-02-12 01:01:40 +01:00
public static boolean remove ( final String urlHash ) {
2008-03-26 15:13:05 +01:00
if ( urlHash = = null ) return false ;
2010-06-01 15:02:11 +02:00
Map < String , InitExecEntry > resultStack ;
2009-10-21 22:14:30 +02:00
for ( EventOrigin origin : EventOrigin . values ( ) ) {
2010-02-17 00:06:04 +01:00
resultStack = getStack ( origin ) ;
if ( resultStack ! = null ) resultStack . remove ( urlHash ) ;
2008-03-26 15:13:05 +01:00
}
return true ;
}
2008-06-17 20:56:04 +02:00
/ * *
* test and benchmark
* @param args
* /
2008-08-02 14:12:04 +02:00
public static void main ( final String [ ] args ) {
2008-06-17 20:56:04 +02:00
try {
2009-10-11 02:12:19 +02:00
final DigestURI url = new DigestURI ( " http " , " www.yacy.net " , 80 , " / " ) ;
final URIMetadataRow urlRef = new URIMetadataRow ( url , " YaCy Homepage " , " " , " " , " " , new Date ( ) , new Date ( ) , new Date ( ) , " " , new byte [ ] { } , 123 , 42 , '?' , new Bitfield ( ) , " de " , 0 , 0 , 0 , 0 , 0 , 0 ) ;
2009-10-21 22:14:30 +02:00
EventOrigin stackNo = EventOrigin . LOCAL_CRAWLING ;
2008-06-17 20:56:04 +02:00
System . out . println ( " valid test: \ n======= " ) ;
// add
2011-02-12 01:01:40 +01:00
stack ( urlRef , urlRef . hash ( ) , url . hash ( ) , stackNo ) ;
2008-06-17 20:56:04 +02:00
// size
2011-02-12 01:01:40 +01:00
System . out . println ( " size of stack: \ t " + getStackSize ( stackNo ) ) ;
2008-08-02 14:12:04 +02:00
} catch ( final MalformedURLException e ) {
2009-11-05 21:28:37 +01:00
Log . logException ( e ) ;
2008-06-17 20:56:04 +02:00
}
}
2008-03-26 15:13:05 +01:00
}