yacy_search_server/source/net/yacy/crawler/CrawlSwitchboard.java

499 lines
21 KiB
Java
Raw Normal View History

// CrawlSwitchboard.java
// (C) 2005, 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 2005 on http://www.anomic.de
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
2012-09-21 15:48:16 +02:00
package net.yacy.crawler;
import java.io.File;
import java.io.IOException;
import java.util.Collections;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import net.yacy.cora.document.UTF8;
2012-09-25 21:20:03 +02:00
import net.yacy.cora.federate.yacy.CacheStrategy;
2012-09-21 16:46:57 +02:00
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.order.NaturalOrder;
2012-07-27 12:13:53 +02:00
import net.yacy.cora.util.SpaceExceededException;
2012-09-21 15:48:16 +02:00
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.kelondro.blob.MapHeap;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.kelondroException;
public final class CrawlSwitchboard {
public static final String CRAWL_PROFILE_PROXY = "proxy";
public static final String CRAWL_PROFILE_REMOTE = "remote";
public static final String CRAWL_PROFILE_SNIPPET_LOCAL_TEXT = "snippetLocalText";
public static final String CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT = "snippetGlobalText";
public static final String CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA = "snippetLocalMedia";
public static final String CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA = "snippetGlobalMedia";
public static final String CRAWL_PROFILE_SURROGATE = "surrogates";
public static final String DBFILE_ACTIVE_CRAWL_PROFILES = "crawlProfilesActive.heap";
public static final String DBFILE_PASSIVE_CRAWL_PROFILES = "crawlProfilesPassive.heap";
public static final long CRAWL_PROFILE_PROXY_RECRAWL_CYCLE = 60L * 24L;
public static final long CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L;
public static final long CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L;
public static final long CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE = 60L * 24L * 30L;
public static final long CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE = 60L * 24L * 30L;
public static final long CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE = 60L * 24L * 30L;
private final Log log;
private MapHeap profilesActiveCrawls;
private final MapHeap profilesPassiveCrawls;
private final Map<byte[], CrawlProfile> profilesActiveCrawlsCache; //TreeMap<byte[], DigestURI>(Base64Order.enhancedCoder);
public CrawlProfile defaultProxyProfile;
public CrawlProfile defaultRemoteProfile;
public CrawlProfile defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile;
public CrawlProfile defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile;
public CrawlProfile defaultSurrogateProfile;
private final File queuesRoot;
public CrawlSwitchboard(final String networkName, final Log log, final File queuesRoot) {
log.logInfo("Initializing Word Index for the network '" + networkName + "'.");
if ( networkName == null || networkName.isEmpty() ) {
log.logSevere("no network name given - shutting down");
System.exit(0);
}
this.log = log;
this.profilesActiveCrawlsCache =
Collections.synchronizedMap(new TreeMap<byte[], CrawlProfile>(Base64Order.enhancedCoder));
// make crawl profiles database and default profiles
this.queuesRoot = queuesRoot;
this.queuesRoot.mkdirs();
this.log.logConfig("Initializing Crawl Profiles");
final File profilesActiveFile = new File(queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES);
this.profilesActiveCrawls = loadFromDB(profilesActiveFile);
for ( final byte[] handle : this.profilesActiveCrawls.keySet() ) {
CrawlProfile p;
try {
p = new CrawlProfile(this.profilesActiveCrawls.get(handle));
} catch ( final IOException e ) {
p = null;
2012-07-27 12:13:53 +02:00
} catch ( final SpaceExceededException e ) {
p = null;
}
if ( p == null ) {
continue;
}
}
initActiveCrawlProfiles();
log.logInfo("Loaded active crawl profiles from file "
+ profilesActiveFile.getName()
+ ", "
+ this.profilesActiveCrawls.size()
+ " entries");
final File profilesPassiveFile = new File(queuesRoot, DBFILE_PASSIVE_CRAWL_PROFILES);
this.profilesPassiveCrawls = loadFromDB(profilesPassiveFile);
for ( final byte[] handle : this.profilesPassiveCrawls.keySet() ) {
CrawlProfile p;
try {
p = new CrawlProfile(this.profilesPassiveCrawls.get(handle));
Log.logInfo("CrawlProfiles", "loaded Profile " + p.handle() + ": " + p.collectionName());
} catch ( final IOException e ) {
continue;
2012-07-27 12:13:53 +02:00
} catch ( final SpaceExceededException e ) {
continue;
}
}
log.logInfo("Loaded passive crawl profiles from file "
+ profilesPassiveFile.getName()
+ ", "
+ this.profilesPassiveCrawls.size()
+ " entries"
+ ", "
+ profilesPassiveFile.length()
/ 1024);
}
public CrawlProfile getActive(final byte[] profileKey) {
if ( profileKey == null ) {
return null;
}
// get from cache
CrawlProfile p = this.profilesActiveCrawlsCache.get(profileKey);
if ( p != null ) {
return p;
}
// get from db
Map<String, String> m;
try {
m = this.profilesActiveCrawls.get(profileKey);
} catch ( final IOException e ) {
m = null;
2012-07-27 12:13:53 +02:00
} catch ( final SpaceExceededException e ) {
m = null;
}
if ( m == null ) {
return null;
}
p = new CrawlProfile(m);
this.profilesActiveCrawlsCache.put(profileKey, p);
return p;
}
public CrawlProfile getPassive(final byte[] profileKey) {
if ( profileKey == null ) {
return null;
}
Map<String, String> m;
try {
m = this.profilesPassiveCrawls.get(profileKey);
} catch ( final IOException e ) {
m = null;
2012-07-27 12:13:53 +02:00
} catch ( final SpaceExceededException e ) {
m = null;
}
if ( m == null ) {
return null;
}
return new CrawlProfile(m);
}
public int getActiveSize() {
return this.profilesActiveCrawls.size();
}
public int getPassiveSize() {
return this.profilesPassiveCrawls.size();
}
public Set<byte[]> getActive() {
return this.profilesActiveCrawls.keySet();
}
public Set<byte[]> getPassive() {
return this.profilesPassiveCrawls.keySet();
}
public void removeActive(final byte[] profileKey) {
if ( profileKey == null ) {
return;
}
this.profilesActiveCrawlsCache.remove(profileKey);
this.profilesActiveCrawls.remove(profileKey);
}
public void removePassive(final byte[] profileKey) {
if ( profileKey == null ) {
return;
}
this.profilesPassiveCrawls.remove(profileKey);
}
public void putActive(final byte[] profileKey, final CrawlProfile profile) {
this.profilesActiveCrawls.put(profileKey, profile);
this.profilesActiveCrawlsCache.put(profileKey, profile);
}
public void putPassive(final byte[] profileKey, final CrawlProfile profile) {
this.profilesPassiveCrawls.put(profileKey, profile);
}
private void initActiveCrawlProfiles() {
2012-10-08 10:50:40 +02:00
// generate new default entry for proxy crawling
this.defaultProxyProfile =
new CrawlProfile(
CRAWL_PROFILE_PROXY,
CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
"", //crawlerCountryMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/,
true,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE),
-1,
false,
true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/,
true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/,
true,
false /*getConfigBool(PROXY_INDEXING_REMOTE, false)*/,
CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_PROXY);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultProxyProfile.handle()),
this.defaultProxyProfile);
// generate new default entry for remote crawling
this.defaultRemoteProfile =
new CrawlProfile(
CRAWL_PROFILE_REMOTE,
CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
"", //crawlerCountryMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
0,
false,
-1,
-1,
true,
true,
true,
false,
false,
CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_REMOTE);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultRemoteProfile.handle()),
this.defaultRemoteProfile);
// generate new default entry for snippet fetch and optional crawling
this.defaultTextSnippetLocalProfile =
new CrawlProfile(
CRAWL_PROFILE_SNIPPET_LOCAL_TEXT,
CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
"", //crawlerCountryMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE),
-1,
true,
false,
false,
true,
false,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultTextSnippetLocalProfile.handle()),
this.defaultTextSnippetLocalProfile);
// generate new default entry for snippet fetch and optional crawling
this.defaultTextSnippetGlobalProfile =
new CrawlProfile(
CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT,
CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
"", //crawlerCountryMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE),
-1,
true,
true,
true,
true,
false,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()),
this.defaultTextSnippetGlobalProfile);
this.defaultTextSnippetGlobalProfile.setCacheStrategy(CacheStrategy.IFEXIST);
2012-10-08 10:50:40 +02:00
// generate new default entry for snippet fetch and optional crawling
this.defaultMediaSnippetLocalProfile =
new CrawlProfile(
CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA,
CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
"", //crawlerCountryMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE),
-1,
true,
false,
false,
true,
false,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultMediaSnippetLocalProfile.handle()),
this.defaultMediaSnippetLocalProfile);
// generate new default entry for snippet fetch and optional crawling
this.defaultMediaSnippetGlobalProfile =
new CrawlProfile(
CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
"", //crawlerCountryMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE),
-1,
true,
false,
true,
true,
false,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultMediaSnippetGlobalProfile.handle()),
this.defaultMediaSnippetGlobalProfile);
// generate new default entry for surrogate parsing
this.defaultSurrogateProfile =
new CrawlProfile(
CRAWL_PROFILE_SURROGATE,
CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
"", //crawlerCountryMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE),
-1,
true,
true,
false,
false,
false,
CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_SURROGATE);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultSurrogateProfile.handle()),
this.defaultSurrogateProfile);
}
private void resetProfiles() {
this.profilesActiveCrawlsCache.clear();
final File pdb = new File(this.queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES);
if ( pdb.exists() ) {
FileUtils.deletedelete(pdb);
}
try {
this.profilesActiveCrawls =
new MapHeap(pdb, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, ' ');
} catch ( final IOException e1 ) {
Log.logException(e1);
this.profilesActiveCrawls = null;
}
initActiveCrawlProfiles();
}
public boolean clear() throws InterruptedException {
this.profilesActiveCrawlsCache.clear();
CrawlProfile entry;
boolean hasDoneSomething = false;
try {
for ( final byte[] handle : this.profilesActiveCrawls.keySet() ) {
// check for interruption
if ( Thread.currentThread().isInterrupted() ) {
throw new InterruptedException("Shutdown in progress");
}
// getting next profile
try {
entry = new CrawlProfile(this.profilesActiveCrawls.get(handle));
} catch ( final IOException e ) {
continue;
2012-07-27 12:13:53 +02:00
} catch ( final SpaceExceededException e ) {
continue;
}
if ( !((entry.name().equals(CRAWL_PROFILE_PROXY))
|| (entry.name().equals(CRAWL_PROFILE_REMOTE))
|| (entry.name().equals(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT))
|| (entry.name().equals(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT))
|| (entry.name().equals(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA))
|| (entry.name().equals(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)) || (entry.name()
.equals(CRAWL_PROFILE_SURROGATE))) ) {
final CrawlProfile p = new CrawlProfile(entry);
this.profilesPassiveCrawls.put(UTF8.getBytes(p.handle()), p);
this.profilesActiveCrawls.remove(handle);
hasDoneSomething = true;
}
}
} catch ( final kelondroException e ) {
resetProfiles();
hasDoneSomething = true;
}
return hasDoneSomething;
}
public synchronized void close() {
this.profilesActiveCrawlsCache.clear();
this.profilesActiveCrawls.close();
this.profilesPassiveCrawls.close();
}
/**
* Loads crawl profiles from a DB file.
*
* @param file DB file
* @return crawl profile data
*/
private static MapHeap loadFromDB(final File file) {
MapHeap ret;
try {
ret = new MapHeap(file, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, ' ');
} catch ( final IOException e ) {
Log.logException(e);
Log.logException(e);
FileUtils.deletedelete(file);
try {
ret =
new MapHeap(file, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, ' ');
} catch ( final IOException e1 ) {
Log.logException(e1);
ret = null;
}
}
return ret;
}
}