Mirror of https://github.com/yacy/yacy_search_server.git (synced 2024-09-19 00:01:41 +02:00)
fixed problem with switching of networks

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6247 6c8d7289-2bf4-0310-a012-ef5d649a1542

parent 0575f12838
commit 92edd24e70
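The fix below replaces the close-and-recreate handling of the crawl queues during a network switch with an in-place relocate: the existing CrawlQueues object closes its file-backed stores and reopens them under the new network's queue directory, so busy threads that still hold a reference keep working against a valid object. A minimal sketch of that pattern, assuming simplified stand-in names (QueueHolder, openQueueFile, the example paths), not the YaCy API itself:

// Sketch only: one object owns file-backed queue state and can be re-pointed
// at a new directory instead of being thrown away and re-created.
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;

public class QueueHolder {
    private RandomAccessFile queueFile; // stands in for the noticeURL/errorURL stores

    public QueueHolder(final File queuePath) throws IOException {
        openQueueFile(queuePath);
    }

    // re-point this holder at a new queue directory without replacing the object
    public synchronized void relocate(final File newQueuePath) throws IOException {
        close();                     // release the old files
        openQueueFile(newQueuePath); // reopen under the new network's path
    }

    public synchronized void close() throws IOException {
        if (queueFile != null) queueFile.close();
    }

    private void openQueueFile(final File queuePath) throws IOException {
        queuePath.mkdirs();
        this.queueFile = new RandomAccessFile(new File(queuePath, "queue.db"), "rw");
    }

    public static void main(String[] args) throws IOException {
        QueueHolder q = new QueueHolder(new File("DATA/NETWORK_A/QUEUES")); // example path
        q.relocate(new File("DATA/NETWORK_B/QUEUES")); // e.g. on network switch
        q.close();
    }
}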
@@ -63,7 +63,7 @@ public class CrawlQueues {
     public NoticedURL noticeURL;
     public ZURL errorURL, delegatedURL;
 
-    public CrawlQueues(final Switchboard sb, final File plasmaPath) {
+    public CrawlQueues(final Switchboard sb, final File queuePath) {
         this.sb = sb;
         this.log = new Log("CRAWLER");
         this.workers = new ConcurrentHashMap<Integer, crawlWorker>();
@@ -71,16 +71,69 @@ public class CrawlQueues {
 
         // start crawling management
         log.logConfig("Starting Crawling Management");
-        noticeURL = new NoticedURL(plasmaPath);
+        noticeURL = new NoticedURL(queuePath);
         //errorURL = new plasmaCrawlZURL(); // fresh error DB each startup; can be hold in RAM and reduces IO;
-        final File errorDBFile = new File(plasmaPath, "urlError2.db");
+        final File errorDBFile = new File(queuePath, "urlError2.db");
         if (errorDBFile.exists()) {
             // delete the error db to get a fresh each time on startup
             // this is useful because there is currently no re-use of the data in this table.
-            if (errorDBFile.isDirectory()) SplitTable.delete(plasmaPath, "urlError2.db"); else FileUtils.deletedelete(errorDBFile);
+            if (errorDBFile.isDirectory()) SplitTable.delete(queuePath, "urlError2.db"); else FileUtils.deletedelete(errorDBFile);
         }
+        errorURL = new ZURL(queuePath, "urlError3.db", false);
+        delegatedURL = new ZURL(queuePath, "urlDelegated3.db", true);
+    }
+
+    public void relocate(final File newQueuePath) {
+        this.close();
+
+        this.workers = new ConcurrentHashMap<Integer, crawlWorker>();
+        this.remoteCrawlProviderHashes.clear();
+
+        noticeURL = new NoticedURL(newQueuePath);
+        final File errorDBFile = new File(newQueuePath, "urlError2.db");
+        if (errorDBFile.exists()) {
+            if (errorDBFile.isDirectory()) SplitTable.delete(newQueuePath, "urlError2.db"); else FileUtils.deletedelete(errorDBFile);
+        }
+        errorURL = new ZURL(newQueuePath, "urlError3.db", false);
+        delegatedURL = new ZURL(newQueuePath, "urlDelegated3.db", true);
+    }
+
+    public void close() {
+        // wait for all workers to finish
+        for (final crawlWorker w: workers.values()) {
+            w.interrupt();
+        }
+        for (final crawlWorker w: workers.values()) {
+            try {
+                w.join();
+            } catch (InterruptedException e) {
+                e.printStackTrace();
+            }
+        }
+        noticeURL.close();
+        errorURL.close();
+        delegatedURL.close();
+    }
+
+    public void clear() {
+        // wait for all workers to finish
+        for (final crawlWorker w: workers.values()) {
+            w.interrupt();
+        }
+        // TODO: wait some more time until all threads are finished
+        workers.clear();
+        remoteCrawlProviderHashes.clear();
+        noticeURL.clear();
+        try {
+            errorURL.clear();
+        } catch (final IOException e) {
+            e.printStackTrace();
+        }
+        try {
+            delegatedURL.clear();
+        } catch (final IOException e) {
+            e.printStackTrace();
+        }
-        errorURL = new ZURL(plasmaPath, "urlError3.db", false);
-        delegatedURL = new ZURL(plasmaPath, "urlDelegated3.db", true);
     }
 
     /**
@@ -127,44 +180,6 @@ public class CrawlQueues {
         }
     }
-
-    public void clear() {
-        // wait for all workers to finish
-        for (final crawlWorker w: workers.values()) {
-            w.interrupt();
-        }
-        // TODO: wait some more time until all threads are finished
-        workers.clear();
-        remoteCrawlProviderHashes.clear();
-        noticeURL.clear();
-        try {
-            errorURL.clear();
-        } catch (final IOException e) {
-            e.printStackTrace();
-        }
-        try {
-            delegatedURL.clear();
-        } catch (final IOException e) {
-            e.printStackTrace();
-        }
-    }
-
-    public void close() {
-        // wait for all workers to finish
-        for (final crawlWorker w: workers.values()) {
-            w.interrupt();
-        }
-        for (final crawlWorker w: workers.values()) {
-            try {
-                w.join();
-            } catch (InterruptedException e) {
-                e.printStackTrace();
-            }
-        }
-        noticeURL.close();
-        errorURL.close();
-        delegatedURL.close();
-    }
 
     public Request[] activeWorkerEntries() {
         synchronized (workers) {
             final Request[] e = new Request[workers.size()];
@@ -195,9 +210,16 @@ public class CrawlQueues {
                 ", robinsonMode=" + ((sb.isRobinsonMode()) ? "on" : "off"));
         }
 
-        if(!crawlIsPossible(NoticedURL.STACK_TYPE_CORE, "Core")) return false;
+        String queueCheck = crawlIsPossible(NoticedURL.STACK_TYPE_CORE, "Core");
+        if (queueCheck != null) {
+            if (log.isFine()) log.logFine("omitting de-queue/local: " + queueCheck);
+            return false;
+        }
 
-        if(isPaused(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL)) return false;
+        if (isPaused(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL)) {
+            if (log.isFinest()) log.logFinest("omitting de-queue/local: paused");
+            return false;
+        }
 
         // do a local crawl
         Request urlEntry = null;
@@ -291,34 +313,29 @@ public class CrawlQueues {
      * @param type
      * @return
      */
-    private boolean crawlIsPossible(int stackType, final String type) {
-        int value;
+    private String crawlIsPossible(int stackType, final String type) {
         //System.out.println("stacksize = " + noticeURL.stackSize(stackType));
         if (noticeURL.stackSize(stackType) == 0) {
             //log.logDebug("GlobalCrawl: queue is empty");
-            return false;
+            return "stack is empty";
         }
-        value = (int) sb.getConfigLong(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 10);
-        if (this.size() >= value) {
-            // try a cleanup
+
+        // check the worker threads
+        int maxWorkers = (int) sb.getConfigLong(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 10);
+        if (this.workers.size() >= maxWorkers) {
+            // too many worker threads, try a cleanup
             this.cleanup();
         }
         // check again
-        if (this.size() >= value) {
-            if (this.log.isFine()) {
-                log.logFine(type + "Crawl: too many processes in loader queue, dismissed (" + "cacheLoader=" + this.size() + "), httpClients = " + Client.connectionCount());
-            }
-            return false;
+        if (this.workers.size() >= maxWorkers) {
+            return "too many workers active: " + this.workers.size();
         }
 
         String cautionCause = sb.onlineCaution();
         if (cautionCause != null) {
-            if (this.log.isFine()) {
-                log.logFine(type + "Crawl: online caution for " + cautionCause + ", omitting processing");
-            }
-            return false;
+            return "online caution: " + cautionCause;
         }
-        return true;
+        return null;
     }
 
     public boolean remoteCrawlLoaderJob() {
@@ -467,9 +484,16 @@ public class CrawlQueues {
 
         // do nothing if either there are private processes to be done
         // or there is no global crawl on the stack
-        if (!crawlIsPossible(NoticedURL.STACK_TYPE_REMOTE, "Global")) return false;
+        String queueCheck = crawlIsPossible(NoticedURL.STACK_TYPE_REMOTE, "Global");
+        if (queueCheck != null) {
+            if (log.isFine()) log.logFine("omitting de-queue/remote: " + queueCheck);
+            return false;
+        }
 
-        if (isPaused(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL)) return false;
+        if (isPaused(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL)) {
+            if (log.isFinest()) log.logFinest("omitting de-queue/remote: paused");
+            return false;
+        }
 
         // we don't want to crawl a global URL globally, since WE are the global part. (from this point of view)
         final String stats = "REMOTETRIGGEREDCRAWL[" + noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_OVERHANG) + ", "
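Aside on the crawlIsPossible() change above: the method now returns a String instead of a boolean, where null means crawling may proceed and any non-null value is a human-readable reason that the caller logs before skipping the de-queue. A compact sketch of the pattern, with invented names (QueueGate and its fields are illustrative stand-ins, not the YaCy classes):

// Sketch only: null return = go ahead; otherwise a reason the caller can log.
public class QueueGate {
    private int stackSize;      // stand-in for the queue size
    private int activeWorkers;  // stand-in for the number of running crawl workers
    private int maxWorkers = 10;

    // returns null if crawling is possible, otherwise the reason why not
    private String crawlIsPossible() {
        if (stackSize == 0) return "stack is empty";
        if (activeWorkers >= maxWorkers) return "too many workers active: " + activeWorkers;
        return null;
    }

    public boolean coreCrawlJob() {
        final String queueCheck = crawlIsPossible();
        if (queueCheck != null) {
            System.out.println("omitting de-queue/local: " + queueCheck); // caller logs the reason
            return false;
        }
        // ... de-queue an entry and start a crawl here ...
        return true;
    }

    public static void main(String[] args) {
        new QueueGate().coreCrawlJob(); // logs "omitting de-queue/local: stack is empty"
    }
}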
@@ -809,11 +809,10 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
             this.crawlStacker.close();
             this.webStructure.close();
             this.robots.close();
-            this.crawlQueues.close();
 
             log.logInfo("SWITCH NETWORK: START UP OF NEW INDEX DATABASE...");
 
             // start up
             // new properties
             setConfig("network.unit.definition", networkDefinition);
             overwriteNetworkDefinition();
             final File indexPrimaryPath = getConfigPath(SwitchboardConstants.INDEX_PRIMARY_PATH, SwitchboardConstants.INDEX_PATH_DEFAULT);
@@ -826,6 +825,9 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
             this.queuesRoot = new File(new File(indexPrimaryPath, networkName), "QUEUES");
             this.networkRoot.mkdirs();
             this.queuesRoot.mkdirs();
+
+            // relocate
+            this.crawlQueues.relocate(this.queuesRoot); // cannot be closed because the busy threads are working with that object
             final File mySeedFile = new File(this.networkRoot, yacySeedDB.DBFILE_OWN_SEED);
             peers = new yacySeedDB(
                     this.networkRoot,
@@ -844,13 +846,14 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
             } catch (IOException e) {
                 e.printStackTrace();
             }
 
             // startup
             crawler = new CrawlSwitchboard(
                     peers,
                     networkName,
                     log,
                     this.queuesRoot);
 
 
             // create new web structure
             this.webStructure = new WebStructureGraph(log, rankingPath, "LOCAL/010_cr/", getConfig("CRDist0Path", CRDistribution.CR_OWN), new File(queuesRoot, "webStructure.map"));