- redesigned the crawl start menu and integrated the monitoring pages

- removed the web structure picture from the indexing menu and grouped it with the htcache monitor
- added a database for terminated crawls; when a crawl is finished, its profile is automatically moved to the new database
- extended the crawl profile editor servlet; it now also shows terminated crawls
- the option that was used to delete profiles is redesigned into a function that moves the current crawl to the terminated crawls and removes all of its urls from the current queues (see the sketch after these notes)
- fixed several problems with the indexing queues
- enhanced indexing speed by changing cache flush sizes
- changed the behaviour of the crawl result servlet: the list of crawled urls is shown if there is one, otherwise the overview window is shown

attention: the new profile databases are not compatible with the old one; current crawls will be lost! the web index is not touched.
next steps: the database of terminated crawls can be used to start a new crawl from them. This is useful if one wants to re-crawl specific pages and re-use an old crawl profile.
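
For illustration, here is a minimal sketch of the terminate flow as it appears in the CrawlProfileEditor_p servlet in the diff below (servlet boilerplate omitted; "handle" is the crawl profile handle posted by the form):

    if (post.containsKey("terminate")) {
        // copy the profile entry from the active crawls database into the passive (terminated) one
        sb.profilesPassiveCrawls.newEntry(sb.profilesActiveCrawls.getEntry(handle).map());
        // remove it from the active crawls database
        sb.profilesActiveCrawls.removeEntry(handle);
        // drop every queued url that still references this profile from the crawl queues
        sb.noticeURL.removeByProfileHandle(handle);
    }

Re-starting a crawl from an entry in the terminated crawls database is not part of this commit; see "next steps" above.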


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4113 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 2007-09-28 01:21:31 +00:00
parent 341f7cb327
commit 842308ea97
40 changed files with 303 additions and 233 deletions

View File

@ -3,7 +3,7 @@ javacSource=1.4
javacTarget=1.4
# Release Configuration
releaseVersion=0.545
releaseVersion=0.546
releaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
proReleaseFile=yacy_pro_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
releaseFileParentDir=yacy

View File

@ -6,6 +6,7 @@
</head>
<body id="CacheAdmin">
#%env/templates/header.template%#
#%env/templates/submenuWebStructure.template%#
<h2>Web Cache</h2>
<p>The current cache size is #[cachesize]# KB. The maximum cache size is #[cachemax]# KB.</p>
<p><a

View File

@ -29,6 +29,7 @@
</colgroup>
<tr class="TableHeader">
<td><strong>Crawl Thread</strong></td>
<td><strong>Status</strong></td>
<td><strong>Start URL</strong></td>
<td><strong>Depth</strong></td>
<td><strong>Filter</strong></td>
@ -46,6 +47,7 @@
#{crawlProfiles}#
<tr class="TableCell#(dark)#Light::Dark#(/dark)#">
<td>#[name]#</td>
<td>#(status)#terminated::active#(/status)#</td>
<td><a href="#[startURL]#">#[startURL]#</a></td>
<td>#[depth]#</td>
<td>#[filter]#</td>
@ -58,12 +60,18 @@
<td>#(indexText)#no::yes#(/indexText)#</td>
<td>#(indexMedia)#no::yes#(/indexMedia)#</td>
<td>#(remoteIndexing)#no::yes#(/remoteIndexing)#</td>
<td>#(deleteButton)#::
<td>#(terminateButton)#::
<form action="CrawlProfileEditor_p.html" method="get" enctype="multipart/form-data">
<pre><input type="hidden" name="handle" value="#[handle]#" /></pre>
<pre><input type="submit" name="deleteprofile" value="Delete" /></pre>
#(/deleteButton)#
<input type="hidden" name="handle" value="#[handle]#" />
<input type="submit" name="terminate" value="Terminate" />
</form>
#(/terminateButton)#
#(deleteButton)#::
<form action="CrawlProfileEditor_p.html" method="get" enctype="multipart/form-data">
<input type="hidden" name="handle" value="#[handle]#" />
<input type="submit" name="delete" value="Delete" />
</form>
#(/deleteButton)#
</td>
</tr>
#{/crawlProfiles}#

View File

@ -1,10 +1,14 @@
// CrawlProfileEditor_p.java
// -------------------------------
// part of the AnomicHTTPD caching proxy
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004, 2005
// last major change: 04.07.2005
// (C) 2005, by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 04.07.2005 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
@ -19,29 +23,6 @@
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
// You must compile this file with
// javac -classpath .:../classes CrawlProfileEditor_p.java
// if the shell's current path is HTROOT
import java.io.IOException;
import java.util.ArrayList;
@ -103,14 +84,23 @@ public class CrawlProfileEditor_p {
// read post for handle
String handle = (post == null) ? "" : post.get("handle", "");
if ((post != null) && (post.containsKey("deleteprofile"))) {
// deletion of a crawl
sb.profiles.removeEntry(handle);
if (post != null) {
if (post.containsKey("terminate")) {
// termination of a crawl: shift the crawl from active to passive
sb.profilesPassiveCrawls.newEntry(sb.profilesActiveCrawls.getEntry(handle).map());
sb.profilesActiveCrawls.removeEntry(handle);
// delete all entries from the crawl queue that are deleted here
sb.noticeURL.removeByProfileHandle(handle);
}
if (post.containsKey("delete")) {
// deletion of a terminated crawl profile
sb.profilesPassiveCrawls.removeEntry(handle);
}
}
// generate handle list
int count = 0;
Iterator it = sb.profiles.profiles(true);
Iterator it = sb.profilesActiveCrawls.profiles(true);
entry selentry;
while (it.hasNext()) {
selentry = (entry)it.next();
@ -126,7 +116,7 @@ public class CrawlProfileEditor_p {
count++;
}
prop.put("profiles", count);
selentry = sb.profiles.getEntry(handle);
selentry = sb.profilesActiveCrawls.getEntry(handle);
// read post for change submit
if ((post != null) && (selentry != null)) {
@ -138,7 +128,7 @@ public class CrawlProfileEditor_p {
tee = (eentry) it.next();
String cval = (String) selentry.map().get(tee.name);
String val = (tee.type == eentry.BOOLEAN) ? Boolean.toString(post.containsKey(tee.name)) : post.get(tee.name, cval);
if (!cval.equals(val)) sb.profiles.changeEntry(selentry, tee.name, val);
if (!cval.equals(val)) sb.profilesActiveCrawls.changeEntry(selentry, tee.name, val);
}
} catch (IOException ex) {
prop.put("error", 1);
@ -149,47 +139,22 @@ public class CrawlProfileEditor_p {
// generate crawl profile table
count = 0;
int domlistlength = (post == null) ? 160 : post.getInt("domlistlength", 160);
it = sb.profiles.profiles(true);
plasmaCrawlProfile.entry profile;
boolean dark = true;
int domlistlength = (post == null) ? 160 : post.getInt("domlistlength", 160);
plasmaCrawlProfile.entry profile;
// put active crawls into list
it = sb.profilesActiveCrawls.profiles(true);
while (it.hasNext()) {
profile = (plasmaCrawlProfile.entry) it.next();
prop.put("crawlProfiles_"+count+"_dark", ((dark) ? 1 : 0));
prop.put("crawlProfiles_"+count+"_name", profile.name());
prop.put("crawlProfiles_"+count+"_startURL", profile.startURL());
prop.put("crawlProfiles_"+count+"_handle", profile.handle());
prop.put("crawlProfiles_"+count+"_depth", profile.generalDepth());
prop.put("crawlProfiles_"+count+"_filter", profile.generalFilter());
prop.put("crawlProfiles_"+count+"_crawlingIfOlder", (profile.recrawlIfOlder() == Long.MAX_VALUE) ? "no re-crawl" : ""+profile.recrawlIfOlder());
prop.put("crawlProfiles_"+count+"_crawlingDomFilterDepth", (profile.domFilterDepth() == Integer.MAX_VALUE) ? "inactive" : Integer.toString(profile.domFilterDepth()));
//start contrib [MN]
int i = 0;
String item;
while((i <= domlistlength) && !((item = profile.domName(true, i)).equals(""))){
if(i == domlistlength){
item = item + " ...";
putProfileEntry(prop, profile, true, dark, count, domlistlength);
dark = !dark;
count++;
}
prop.put("crawlProfiles_"+count+"_crawlingDomFilterContent_"+i+"_item", item);
i++;
}
prop.put("crawlProfiles_"+count+"_crawlingDomFilterContent", i);
//end contrib [MN]
prop.put("crawlProfiles_"+count+"_crawlingDomMaxPages", (profile.domMaxPages() == Integer.MAX_VALUE) ? "unlimited" : ""+profile.domMaxPages());
prop.put("crawlProfiles_"+count+"_withQuery", ((profile.crawlingQ()) ? 1 : 0));
prop.put("crawlProfiles_"+count+"_storeCache", ((profile.storeHTCache()) ? 1 : 0));
prop.put("crawlProfiles_"+count+"_indexText", ((profile.indexText()) ? 1 : 0));
prop.put("crawlProfiles_"+count+"_indexMedia", ((profile.indexMedia()) ? 1 : 0));
prop.put("crawlProfiles_"+count+"_remoteIndexing", ((profile.remoteIndexing()) ? 1 : 0));
prop.put("crawlProfiles_"+count+"_deleteButton", (((profile.name().equals("remote")) ||
(profile.name().equals("proxy")) ||
(profile.name().equals("snippetText")) ||
(profile.name().equals("snippetMedia")) ? 0 : 1)));
prop.put("crawlProfiles_"+count+"_deleteButton_handle", profile.handle());
// put passive crawls into list
it = sb.profilesPassiveCrawls.profiles(true);
while (it.hasNext()) {
profile = (plasmaCrawlProfile.entry) it.next();
putProfileEntry(prop, profile, false, dark, count, domlistlength);
dark = !dark;
count++;
}
@ -223,4 +188,44 @@ public class CrawlProfileEditor_p {
return prop;
}
private static void putProfileEntry(servletProperties prop, plasmaCrawlProfile.entry profile, boolean active, boolean dark, int count, int domlistlength) {
prop.put("crawlProfiles_" + count + "_dark", ((dark) ? 1 : 0));
prop.put("crawlProfiles_" + count + "_status", ((active) ? 1 : 0));
prop.put("crawlProfiles_" + count + "_name", profile.name());
prop.put("crawlProfiles_" + count + "_startURL", profile.startURL());
prop.put("crawlProfiles_" + count + "_handle", profile.handle());
prop.put("crawlProfiles_" + count + "_depth", profile.generalDepth());
prop.put("crawlProfiles_" + count + "_filter", profile.generalFilter());
prop.put("crawlProfiles_" + count + "_crawlingIfOlder", (profile.recrawlIfOlder() == Long.MAX_VALUE) ? "no re-crawl" : ""+profile.recrawlIfOlder());
prop.put("crawlProfiles_" + count + "_crawlingDomFilterDepth", (profile.domFilterDepth() == Integer.MAX_VALUE) ? "inactive" : Integer.toString(profile.domFilterDepth()));
// start contrib [MN]
int i = 0;
String item;
while ((i <= domlistlength) && !((item = profile.domName(true, i)).equals(""))){
if(i == domlistlength){
item = item + " ...";
}
prop.put("crawlProfiles_"+count+"_crawlingDomFilterContent_"+i+"_item", item);
i++;
}
prop.put("crawlProfiles_"+count+"_crawlingDomFilterContent", i);
// end contrib [MN]
prop.put("crawlProfiles_" + count + "_crawlingDomMaxPages", (profile.domMaxPages() == Integer.MAX_VALUE) ? "unlimited" : ""+profile.domMaxPages());
prop.put("crawlProfiles_" + count + "_withQuery", (profile.crawlingQ()) ? 1 : 0);
prop.put("crawlProfiles_" + count + "_storeCache", (profile.storeHTCache()) ? 1 : 0);
prop.put("crawlProfiles_" + count + "_indexText", (profile.indexText()) ? 1 : 0);
prop.put("crawlProfiles_" + count + "_indexMedia", (profile.indexMedia()) ? 1 : 0);
prop.put("crawlProfiles_" + count + "_remoteIndexing", (profile.remoteIndexing()) ? 1 : 0);
prop.put("crawlProfiles_" + count + "_terminateButton", ((!active) || (profile.name().equals("remote")) ||
(profile.name().equals("proxy")) ||
(profile.name().equals("snippetText")) ||
(profile.name().equals("snippetMedia"))) ? 0 : 1);
prop.put("crawlProfiles_" + count + "_terminateButton_handle", profile.handle());
prop.put("crawlProfiles_" + count + "_deleteButton", (active) ? 0 : 1);
prop.put("crawlProfiles_" + count + "_deleteButton_handle", profile.handle());
}
}

View File

@ -69,6 +69,11 @@ public class CrawlResults {
tabletype = 0;
}
if ((post != null) && (post.containsKey("autoforward")) && (tabletype == 5) && (sb.wordIndex.loadedURL.getStackSize(5) == 0)) {
// the main menu does a request to the local crawler page, but in case this table is empty, the overview page is shown
tabletype = 0;
}
// check if authorization is needed and/or given
if (((tabletype > 0) && (tabletype < 6)) ||
(post.containsKey("clearlist")) ||

View File

@ -83,9 +83,9 @@ public class CrawlURLFetch_p {
public static plasmaCrawlProfile.entry getCrawlProfile(serverSwitch env) {
if (profile == null) {
profile = ((plasmaSwitchboard)env).profiles.newEntry(
profile = ((plasmaSwitchboard)env).profilesActiveCrawls.newEntry(
"URLFetcher", // Name
"", // URL
null, // URL
".*", ".*", // General / specific filter
0, 0, // General / specific depth
-1, -1, -1, // Recrawl / Dom-filter depth / Dom-max-pages

View File

@ -6,7 +6,7 @@
</head>
<body id="IndexCreateIndexingQueue">
#%env/templates/header.template%#
#%env/templates/submenuCrawler.template%#
#%env/templates/submenuIndexCreate.template%#
<h2>Indexing Queue</h2>

View File

@ -92,6 +92,7 @@ public class IndexCreateIndexingQueue_p {
plasmaHTCache.deleteFile(entry.url());
}
}
switchboard.sbQueue.clear(); // reset file to clean up content completely
}
} catch (Exception e) {}
} else if (post.containsKey("deleteEntry")) {

View File

@ -6,7 +6,7 @@
</head>
<body id="IndexCreateLoaderQueue">
#%env/templates/header.template%#
#%env/templates/submenuCrawler.template%#
#%env/templates/submenuIndexCreate.template%#
<h2>Loader Queue</h2>
<p>

View File

@ -6,7 +6,7 @@
</head>
<body id="IndexCreateWWWGlobalQueue">
#%env/templates/header.template%#
#%env/templates/submenuCrawler.template%#
#%env/templates/submenuIndexCreate.template%#
<h2>Global Crawl Queue</h2>
<p>
This queue stores the urls that shall be sent to other peers to perform a remote crawl.

View File

@ -94,7 +94,7 @@ public class IndexCreateWWWGlobalQueue_p {
prop.put("info_numEntries", c);
} else if (post.containsKey("deleteEntry")) {
String urlHash = (String) post.get("deleteEntry");
switchboard.noticeURL.remove(urlHash);
switchboard.noticeURL.removeByURLHash(urlHash);
prop.put("LOCATION","");
return prop;
}
@ -118,7 +118,7 @@ public class IndexCreateWWWGlobalQueue_p {
if ((urle != null)&&(urle.url()!=null)) {
initiator = yacyCore.seedDB.getConnected(urle.initiator());
profileHandle = urle.profileHandle();
profileEntry = (profileHandle == null) ? null : switchboard.profiles.getEntry(profileHandle);
profileEntry = (profileHandle == null) ? null : switchboard.profilesActiveCrawls.getEntry(profileHandle);
prop.put("crawler-queue_list_"+showNum+"_dark", ((dark) ? 1 : 0) );
prop.put("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : htmlTools.encodeUnicode2html(initiator.getName(), true)) );
prop.put("crawler-queue_list_"+showNum+"_profile", ((profileEntry == null) ? "unknown" : htmlTools.encodeUnicode2html(profileEntry.name(), true)));

View File

@ -4,9 +4,9 @@
<title>YaCy '#[clientname]#': Local Crawl Queue</title>
#%env/templates/metas.template%#
</head>
<body id="IndexCreateWWwLocalQueue">
<body id="IndexCreateWWWLocalQueue">
#%env/templates/header.template%#
#%env/templates/submenuCrawler.template%#
#%env/templates/submenuIndexCreate.template%#
<h2>Local Crawl Queue</h2>
<p>
This queue stores the urls that shall be crawled localy by this peer.

View File

@ -109,7 +109,7 @@ public class IndexCreateWWWLocalQueue_p {
if (option == PROFILE) {
// search and delete the crawl profile (_much_ faster, independant of queue size)
// XXX: what to do about the annoying LOST PROFILE messages in the log?
Iterator it = switchboard.profiles.profiles(true);
Iterator it = switchboard.profilesActiveCrawls.profiles(true);
plasmaCrawlProfile.entry entry;
while (it.hasNext()) {
entry = (plasmaCrawlProfile.entry)it.next();
@ -119,8 +119,9 @@ public class IndexCreateWWWLocalQueue_p {
name.equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_TEXT) ||
name.equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_MEDIA))
continue;
if (compiledPattern.matcher(name).find())
switchboard.profiles.removeEntry(entry.handle());
if (compiledPattern.matcher(name).find()) {
switchboard.profilesActiveCrawls.removeEntry(entry.handle());
}
}
} else {
// iterating through the list of URLs
@ -144,7 +145,7 @@ public class IndexCreateWWWLocalQueue_p {
if (value != null) {
Matcher matcher = compiledPattern.matcher(value);
if (matcher.find()) {
switchboard.noticeURL.remove(entry.url().hash());
switchboard.noticeURL.removeByURLHash(entry.url().hash());
}
}
}
@ -158,7 +159,7 @@ public class IndexCreateWWWLocalQueue_p {
prop.put("info_numEntries", c);
} else if (post.containsKey("deleteEntry")) {
String urlHash = (String) post.get("deleteEntry");
switchboard.noticeURL.remove(urlHash);
switchboard.noticeURL.removeByURLHash(urlHash);
prop.put("LOCATION","");
return prop;
}
@ -182,7 +183,7 @@ public class IndexCreateWWWLocalQueue_p {
if ((urle != null)&&(urle.url()!=null)) {
initiator = yacyCore.seedDB.getConnected(urle.initiator());
profileHandle = urle.profileHandle();
profileEntry = (profileHandle == null) ? null : switchboard.profiles.getEntry(profileHandle);
profileEntry = (profileHandle == null) ? null : switchboard.profilesActiveCrawls.getEntry(profileHandle);
prop.put("crawler-queue_list_"+showNum+"_dark", ((dark) ? 1 : 0) );
prop.put("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : htmlTools.encodeUnicode2html(initiator.getName(), true)) );
prop.put("crawler-queue_list_"+showNum+"_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));

View File

@ -6,7 +6,7 @@
</head>
<body id="IndexCreateWWWGlobalQueue">
#%env/templates/header.template%#
#%env/templates/submenuCrawler.template%#
#%env/templates/submenuIndexCreate.template%#
<h2>Remote Crawl Queue</h2>
<p>
This queue stores the urls that other peers sent to you in order to perform a remote crawl for them.

View File

@ -93,7 +93,7 @@ public class IndexCreateWWWRemoteQueue_p {
prop.put("info_numEntries", c);
} else if (post.containsKey("deleteEntry")) {
String urlHash = (String) post.get("deleteEntry");
sb.noticeURL.remove(urlHash);
sb.noticeURL.removeByURLHash(urlHash);
prop.put("LOCATION","");
return prop;
}
@ -117,7 +117,7 @@ public class IndexCreateWWWRemoteQueue_p {
if (urle != null && urle.url() != null) {
initiator = yacyCore.seedDB.getConnected(urle.initiator());
profileHandle = urle.profileHandle();
profileEntry = (profileHandle == null) ? null : sb.profiles.getEntry(profileHandle);
profileEntry = (profileHandle == null) ? null : sb.profilesActiveCrawls.getEntry(profileHandle);
prop.put("crawler-queue_list_" + showNum + "_dark", ((dark) ? 1 : 0) );
prop.put("crawler-queue_list_" + showNum + "_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
prop.put("crawler-queue_list_" + showNum + "_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));

View File

@ -117,11 +117,11 @@ public class ProxyIndexingMonitor_p {
prop.put("info", 1); //delete DATA/PLASMADB/crawlProfiles0.db
} else {
try {
sb.profiles.changeEntry(sb.defaultProxyProfile, "generalDepth", Integer.toString(newProxyPrefetchDepth));
sb.profiles.changeEntry(sb.defaultProxyProfile, "storeHTCache", (proxyStoreHTCache) ? "true": "false");
sb.profiles.changeEntry(sb.defaultProxyProfile, "remoteIndexing",proxyIndexingRemote ? "true":"false");
sb.profiles.changeEntry(sb.defaultProxyProfile, "indexText",proxyIndexingLocalText ? "true":"false");
sb.profiles.changeEntry(sb.defaultProxyProfile, "indexMedia",proxyIndexingLocalMedia ? "true":"false");
sb.profilesActiveCrawls.changeEntry(sb.defaultProxyProfile, "generalDepth", Integer.toString(newProxyPrefetchDepth));
sb.profilesActiveCrawls.changeEntry(sb.defaultProxyProfile, "storeHTCache", (proxyStoreHTCache) ? "true": "false");
sb.profilesActiveCrawls.changeEntry(sb.defaultProxyProfile, "remoteIndexing",proxyIndexingRemote ? "true":"false");
sb.profilesActiveCrawls.changeEntry(sb.defaultProxyProfile, "indexText",proxyIndexingLocalText ? "true":"false");
sb.profilesActiveCrawls.changeEntry(sb.defaultProxyProfile, "indexMedia",proxyIndexingLocalMedia ? "true":"false");
prop.put("info", 2);//new proxyPrefetchdepth
prop.put("info_message", newProxyPrefetchDepth);

View File

@ -141,15 +141,15 @@ public class QuickCrawlLink_p {
String urlhash = crawlingStartURL.hash();
switchboard.wordIndex.loadedURL.remove(urlhash);
switchboard.noticeURL.remove(urlhash);
switchboard.noticeURL.removeByURLHash(urlhash);
switchboard.errorURL.remove(urlhash);
// create crawling profile
plasmaCrawlProfile.entry pe = null;
try {
pe = switchboard.profiles.newEntry(
pe = switchboard.profilesActiveCrawls.newEntry(
crawlingStartURL.getHost(),
crawlingStart,
crawlingStartURL,
crawlingFilter,
crawlingFilter,
CrawlingDepth,

View File

@ -1,5 +1,6 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
#(forwardToCrawlStart)#::<meta http-equiv="REFRESH" content="0;/CrawlStartSimple_p.html">#(/forwardToCrawlStart)#
<head>
<title>YaCy '#[clientname]#': Crawler Queues</title>
#%env/templates/metas.template%#
@ -9,7 +10,7 @@
<script type="text/javascript" src="/js/WatchCrawler.js"></script></head>
<body id="watchCrawler">
#%env/templates/header.template%#
#%env/templates/submenuCrawler.template%#
#%env/templates/submenuIndexCreate.template%#
<h2>Crawler Queues</h2>
<p> Next update in <input type="text" id="nextUpdate" onfocus="changeInterval()" onblur="newInterval()" size="2" /> seconds. <img src="/env/grafics/empty.gif" name="ajax" alt="empty"/>
</p>

View File

@ -63,6 +63,7 @@ public class WatchCrawler_p {
// return variable that accumulates replacements
plasmaSwitchboard switchboard = (plasmaSwitchboard) env;
serverObjects prop = new serverObjects();
prop.put("forwardToCrawlStart", 0);
if (post == null) {
// not a crawl start, only monitoring
@ -70,6 +71,10 @@ public class WatchCrawler_p {
} else {
prop.put("info", 0);
if ((post.containsKey("autoforward")) && (switchboard.coreCrawlJobSize() == 0)) {
prop.put("forwardToCrawlStart", 1);
}
if (post.containsKey("continue")) {
// continue queue
String queue = post.get("continue", "");
@ -158,18 +163,12 @@ public class WatchCrawler_p {
if (pos == -1) crawlingStart = "http://" + crawlingStart;
// normalizing URL
try {crawlingStart = new yacyURL(crawlingStart, null).toNormalform(true, true);} catch (MalformedURLException e1) {}
// check if url is proper
yacyURL crawlingStartURL = null;
try {
crawlingStartURL = new yacyURL(crawlingStart, null);
} catch (MalformedURLException e) {
crawlingStartURL = null;
}
try {crawlingStartURL = new yacyURL(crawlingStart, null);} catch (MalformedURLException e1) {}
crawlingStart = (crawlingStartURL == null) ? null : crawlingStartURL.toNormalform(true, true);
// check if pattern matches
if ((crawlingStartURL == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) {
if ((crawlingStart == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) {
// print error message
prop.put("info", 4); //crawlfilter does not match url
prop.put("info_newcrawlingfilter", newcrawlingfilter);
@ -183,12 +182,13 @@ public class WatchCrawler_p {
// first delete old entry, if exists
String urlhash = (new yacyURL(crawlingStart, null)).hash();
switchboard.wordIndex.loadedURL.remove(urlhash);
switchboard.noticeURL.remove(urlhash);
switchboard.noticeURL.removeByURLHash(urlhash);
switchboard.errorURL.remove(urlhash);
// stack url
plasmaCrawlProfile.entry pe = switchboard.profiles.newEntry(
crawlingStartURL.getHost(), crawlingStart, newcrawlingfilter, newcrawlingfilter,
switchboard.profilesPassiveCrawls.removeEntry(crawlingStartURL.hash()); // if there is an old entry, delete it
plasmaCrawlProfile.entry pe = switchboard.profilesActiveCrawls.newEntry(
crawlingStartURL.getHost(), crawlingStartURL, newcrawlingfilter, newcrawlingfilter,
newcrawlingdepth, newcrawlingdepth,
crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages,
crawlingQ,
@ -268,7 +268,8 @@ public class WatchCrawler_p {
HashMap hyperlinks = (HashMap) scraper.getAnchors();
// creating a crawler profile
plasmaCrawlProfile.entry profile = switchboard.profiles.newEntry(fileName, "file://" + file.toString(), newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages, crawlingQ, indexText, indexMedia, storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw);
yacyURL crawlURL = new yacyURL("file://" + file.toString(), null);
plasmaCrawlProfile.entry profile = switchboard.profilesActiveCrawls.newEntry(fileName, crawlURL, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages, crawlingQ, indexText, indexMedia, storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw);
// loop through the contained links
Iterator interator = hyperlinks.entrySet().iterator();
@ -325,10 +326,11 @@ public class WatchCrawler_p {
try {
// getting the sitemap URL
sitemapURLStr = post.get("sitemapURL","");
yacyURL sitemapURL = new yacyURL(sitemapURLStr, null);
// create a new profile
plasmaCrawlProfile.entry pe = switchboard.profiles.newEntry(
sitemapURLStr, sitemapURLStr, newcrawlingfilter, newcrawlingfilter,
plasmaCrawlProfile.entry pe = switchboard.profilesActiveCrawls.newEntry(
sitemapURLStr, sitemapURL, newcrawlingfilter, newcrawlingfilter,
newcrawlingdepth, newcrawlingdepth,
crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages,
crawlingQ,

View File

@ -20,7 +20,7 @@
</head>
<body id="WebStructure" style="margin:0px;">
#%env/templates/header.template%#
#%env/templates/submenuCrawler.template%#
#%env/templates/submenuWebStructure.template%#
<h2>Web Structure</h2>
<div id="left">

View File

@ -31,7 +31,7 @@ public class WatchWebStructure_p {
if (host.equals("auto")) {
// try to find the host from the crawl profiles
Iterator it = sb.profiles.profiles(true);
Iterator it = sb.profilesActiveCrawls.profiles(true);
entry e;
while (it.hasNext()) {
e = (entry)it.next();

View File

@ -16,7 +16,8 @@
<li class="menugroup" id="menugroupCrawlerControl">
<h3>Web&nbsp;Indexing</h3>
<ul class="menu">
<li><a href="/CrawlStartSimple_p.html" class="MenuItemLink lock">Start a Web Crawl</a></li>
<li><a href="/WatchCrawler_p.html?autoforward=" class="MenuItemLink lock">Crawl Start &amp; Monitoring</a></li>
<li><a href="/CrawlResults.html?process=5&autoforward=" class="MenuItemLink">Crawl Results</a></li>
<li><a href="/IndexControl_p.html" class="MenuItemLink lock">Index Administration</a></li>
<li><a href="/Blacklist_p.html" class="MenuItemLink lock">Filter &amp; Blacklists</a></li>
<li><a href="/ProxyIndexingMonitor_p.html" class="MenuItemLink lock">Indexing with Proxy</a></li>
@ -26,11 +27,9 @@
<h3>Monitoring</h3>
<ul class="menu">
<li><a href="/Network.html" accesskey="w" class="MenuItemLink">YaCy Network</a></li>
<li><a href="/WatchWebStructure_p.html?host=auto&amp;depth=2&amp;time=1000" class="MenuItemLink lock">Crawl Activity</a></li>
<li><a href="/CrawlResults.html" class="MenuItemLink">Crawl Results</a></li>
<li><a href="/WatchWebStructure_p.html?host=auto&amp;depth=2&amp;time=1000" class="MenuItemLink lock">Web Visualization</a></li>
<li><a href="/AccessTracker_p.html" class="MenuItemLink lock">Access Tracker</a></li>
<li><a href="/ViewLog_p.html" class="MenuItemLink lock">Server Log</a></li>
<li><a href="/CacheAdmin_p.html" class="MenuItemLink lock">Web Cache</a></li>
<li><a href="/News.html" accesskey="n" class="MenuItemLink">News Monitor</a></li>
<li><a href="/CookieMonitorIncoming_p.html" class="MenuItemLink lock">Cookies in Proxy</a></li>
</ul>

View File

@ -1,36 +0,0 @@
<div class="SubMenu">
<h3>Crawler Activity</h3>
<div class="SubMenugroup">
<h3>Activity Visualization</h3>
<ul class="SubMenu">
<li><a href="/WatchWebStructure_p.html" class="MenuItemLink lock">Web Structure</a></li>
<li><a href="/WatchCrawler_p.html" class="MenuItemLink lock">Crawler Queues</a></li>
</ul>
</div>
<div class="SubMenugroup">
<h3>Control Queues</h3>
<ul class="SubMenu">
<li><a href="/IndexCreateIndexingQueue_p.html" class="MenuItemLink lock">Indexing</a></li>
<li><a href="/IndexCreateLoaderQueue_p.html" class="MenuItemLink lock">Loader</a></li>
</ul>
</div>
<div class="SubMenugroup">
<h3>WWW Crawl Queues</h3>
<ul class="SubMenu">
<li><a href="/IndexCreateWWWLocalQueue_p.html" class="MenuItemLink lock">Local</a></li>
<li><a href="/IndexCreateWWWGlobalQueue_p.html" class="MenuItemLink lock">Global</a></li>
<li><a href="/IndexCreateWWWRemoteQueue_p.html" class="MenuItemLink lock">Remote</a></li>
<li><!--<a href="/IndexCreateWWWOverhangQueue_p.html" class="MenuItemLink">--><em class="lock">Overhang</em><!--</a>--></li>
</ul>
</div>
<div class="SubMenugroup">
<h3>Media Crawl Queues</h3>
<ul class="SubMenu">
<li><!--<a href="/IndexCreateImageQueue_p.html" class="MenuItemLink">--><em class="lock">Images</em><!--</a>--></li>
<li><!--<a href="/IndexCreateMovieQueue_p.html" class="MenuItemLink">--><em class="lock">Movies</em><!--</a>--></li>
<li><!--<a href="/IndexCreateMusicQueue_p.html" class="MenuItemLink">--><em class="lock">Music</em><!--</a>--></li>
</ul>
</div>
</div>

View File

@ -4,5 +4,35 @@
<li><a href="/CrawlStartSimple_p.html" class="MenuItemLink lock">Crawl Start (easy)</a></li>
<li><a href="/CrawlStartExpert_p.html" class="MenuItemLink lock">Crawl Start (expert)</a></li>
<li><a href="/CrawlProfileEditor_p.html" class="MenuItemLink lock">Crawl Profile Editor</a></li>
<li><a href="/WatchCrawler_p.html" class="MenuItemLink lock">Crawler Queues</a></li>
</ul>
<ul class="SubMenu">
<li>&nbsp;</li>
</ul>
</div>
<div class="SubMenu">
<div class="SubMenugroup">
<h3>Control Queues</h3>
<ul class="SubMenu">
<li><a href="/IndexCreateIndexingQueue_p.html" class="MenuItemLink lock">Indexing</a></li>
<li><a href="/IndexCreateLoaderQueue_p.html" class="MenuItemLink lock">Loader</a></li>
</ul>
</div>
<div class="SubMenugroup">
<h3>WWW Crawl Queues</h3>
<ul class="SubMenu">
<li><a href="/IndexCreateWWWLocalQueue_p.html" class="MenuItemLink lock">Local</a></li>
<li><a href="/IndexCreateWWWGlobalQueue_p.html" class="MenuItemLink lock">Global</a></li>
<li><a href="/IndexCreateWWWRemoteQueue_p.html" class="MenuItemLink lock">Remote</a></li>
<li><!--<a href="/IndexCreateWWWOverhangQueue_p.html" class="MenuItemLink">--><em class="lock">Overhang</em><!--</a>--></li>
</ul>
</div>
<div class="SubMenugroup">
<h3>Media Crawl Queues</h3>
<ul class="SubMenu">
<li><!--<a href="/IndexCreateImageQueue_p.html" class="MenuItemLink">--><em class="lock">Images</em><!--</a>--></li>
<li><!--<a href="/IndexCreateMovieQueue_p.html" class="MenuItemLink">--><em class="lock">Movies</em><!--</a>--></li>
<li><!--<a href="/IndexCreateMusicQueue_p.html" class="MenuItemLink">--><em class="lock">Music</em><!--</a>--></li>
</ul>
</div>
</div>

View File

@ -0,0 +1,7 @@
<div class="SubMenu">
<h3>Web Visualization</h3>
<ul class="SubMenu">
<li><a href="/WatchWebStructure_p.html" class="MenuItemLink lock">Web Structure</a></li>
<li><a href="/CacheAdmin_p.html" class="MenuItemLink lock">Web Cache</a></li>
</ul>
</div>

View File

@ -160,7 +160,7 @@ public class SitemapParser extends DefaultHandler {
if (theCrawlingProfile == null) {
// create a new profile
this.crawlingProfile = createProfile(this.siteMapURL.getHost(),this.siteMapURL.toString());
this.crawlingProfile = createProfile(this.siteMapURL.getHost(), this.siteMapURL);
} else {
// use an existing profile
this.crawlingProfile = theCrawlingProfile;
@ -348,8 +348,8 @@ public class SitemapParser extends DefaultHandler {
}
}
private plasmaCrawlProfile.entry createProfile(String domainName, String sitemapURL) {
return this.switchboard.profiles.newEntry(
private plasmaCrawlProfile.entry createProfile(String domainName, yacyURL sitemapURL) {
return this.switchboard.profilesActiveCrawls.newEntry(
domainName,
sitemapURL,
// crawlingFilter

View File

@ -466,7 +466,7 @@ public final class httpTemplate {
}
} catch (IOException e) {
//file not found?
serverLog.logSevere("FILEHANDLER","Include Error with file: " + new String(filename, "UTF-8"));
serverLog.logSevere("FILEHANDLER","Include Error with file " + new String(filename, "UTF-8") + ": " + e.getMessage());
} finally {
if (br != null) try { br.close(); br=null; } catch (Exception e) {}
}

View File

@ -332,7 +332,7 @@ public final class CrawlWorker extends AbstractCrawlWorker {
String urlhash = redirectionUrl.hash();
// removing url from loader queue
plasmaCrawlLoader.switchboard.noticeURL.remove(urlhash);
plasmaCrawlLoader.switchboard.noticeURL.removeByURLHash(urlhash);
// retry crawling with new url
this.url = redirectionUrl;

View File

@ -113,7 +113,7 @@ public class SitemapImporter extends AbstractImporter implements dbImporter {
this.sitemapURL = new yacyURL((String)initParams.get("sitemapURL"), null);
// getting the crawling profile to use
plasmaCrawlProfile.entry profileEntry = this.sb.profiles.getEntry((String)initParams.get("crawlingProfile"));
plasmaCrawlProfile.entry profileEntry = this.sb.profilesActiveCrawls.getEntry((String)initParams.get("crawlingProfile"));
// creating the sitemap parser
this.parser = new SitemapParser(this.sb,this.sitemapURL,profileEntry);

View File

@ -64,7 +64,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
this.preloadTime = Long.valueOf((String)initParams.get("preloadTime")).longValue();
File noticeUrlDbFile = new File(plasmaPath,"urlNotice1.db");
File profileDbFile = new File(plasmaPath, "crawlProfiles0.db");
File profileDbFile = new File(plasmaPath, plasmaSwitchboard.DBFILE_ACTIVE_CRAWL_PROFILES);
String errorMsg = null;
if (!plasmaPath.exists())
@ -169,7 +169,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
if (!this.importProfileHandleCache.contains(profileHandle)) {
// testing if the profile is already known
plasmaCrawlProfile.entry profileEntry = this.sb.profiles.getEntry(profileHandle);
plasmaCrawlProfile.entry profileEntry = this.sb.profilesActiveCrawls.getEntry(profileHandle);
// if not we need to import it
if (profileEntry == null) {
@ -178,7 +178,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
if (sourceEntry != null) {
this.profileCount++;
this.importProfileHandleCache.add(profileHandle);
this.sb.profiles.newEntry((TreeMap)((TreeMap)sourceEntry.map()).clone());
this.sb.profilesActiveCrawls.newEntry((TreeMap)((TreeMap)sourceEntry.map()).clone());
} else {
this.log.logWarning("Profile '" + profileHandle + "' of url entry '" + nextHash + "' unknown.");
continue;
@ -193,7 +193,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
// removing hash from the import db
} finally {
this.importNurlDB.remove(nextHash);
this.importNurlDB.removeByURLHash(nextHash);
}
if (this.urlCount % 100 == 0) {

View File

@ -63,8 +63,8 @@ import de.anomic.yacy.yacySeedDB;
public class plasmaCrawlBalancer {
private static final String stackSuffix = "7.stack";
private static final String indexSuffix = "7.db";
private static final String stackSuffix = "8.stack";
private static final String indexSuffix = "8.db";
// a shared domainAccess map for all balancers
private static final Map domainAccess = Collections.synchronizedMap(new HashMap());
@ -85,10 +85,10 @@ public class plasmaCrawlBalancer {
this.cacheStacksPath = cachePath;
this.stackname = stackname;
File stackFile = new File(cachePath, stackname + stackSuffix);
urlFileStack = kelondroStack.open(stackFile, stackrow);
domainStacks = new HashMap();
urlRAMStack = new ArrayList();
top = true;
this.urlFileStack = kelondroStack.open(stackFile, stackrow);
this.domainStacks = new HashMap();
this.urlRAMStack = new ArrayList();
this.top = true;
// create a stack for newly entered entries
if (!(cachePath.exists())) cachePath.mkdir(); // make the path
@ -142,6 +142,30 @@ public class plasmaCrawlBalancer {
return new plasmaCrawlEntry(entry);
}
public synchronized int removeAllByProfileHandle(String profileHandle) throws IOException {
// removes all entries with a specific profile hash.
// this may last some time
// returns number of deletions
// first find a list of url hashes that shall be deleted
Iterator i = urlFileIndex.rows(true, null);
ArrayList urlHashes = new ArrayList();
kelondroRow.Entry rowEntry;
plasmaCrawlEntry crawlEntry;
while (i.hasNext()) {
rowEntry = (kelondroRow.Entry) i.next();
crawlEntry = new plasmaCrawlEntry(rowEntry);
if (crawlEntry.profileHandle().equals(profileHandle)) {
urlHashes.add(crawlEntry.url().hash());
}
}
// then delete all these urls from the queues and the file index
i = urlHashes.iterator();
while (i.hasNext()) this.remove((String) i.next());
return urlHashes.size();
}
public synchronized plasmaCrawlEntry remove(String urlhash) throws IOException {
// this method is only here, because so many import/export methods need it
// and it was implemented in the previous architecture

View File

@ -48,7 +48,7 @@ public class plasmaCrawlEntry {
"String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash
"String urlname-80, " + // the name of the url, from anchor tag <a>name</a>
"Cardinal appdate-8 {b256}, " + // the time when the url was first time appeared
"String profile-4, " + // the name of the prefetch profile handle
"String profile-" + yacySeedDB.commonHashLength + ", " + // the name of the prefetch profile handle
"Cardinal depth-2 {b256}, " + // the prefetch depth so far, starts at 0
"Cardinal parentbr-3 {b256}, " + // number of anchors of the parent
"Cardinal forkfactor-4 {b256}, " + // sum of anchors of all ancestors

View File

@ -139,7 +139,7 @@ public class plasmaCrawlNURL {
return null;
}
public plasmaCrawlEntry remove(String urlhash) {
public plasmaCrawlEntry removeByURLHash(String urlhash) {
plasmaCrawlEntry entry = null;
try {if ((entry = coreStack.remove(urlhash)) != null) return entry;} catch (IOException e) {}
try {if ((entry = limitStack.remove(urlhash)) != null) return entry;} catch (IOException e) {}
@ -147,6 +147,14 @@ public class plasmaCrawlNURL {
return null;
}
public int removeByProfileHandle(String handle) {
int removed = 0;
try {removed += coreStack.removeAllByProfileHandle(handle);} catch (IOException e) {}
try {removed += limitStack.removeAllByProfileHandle(handle);} catch (IOException e) {}
try {removed += remoteStack.removeAllByProfileHandle(handle);} catch (IOException e) {}
return removed;
}
public plasmaCrawlEntry[] top(int stackType, int count) {
switch (stackType) {
case STACK_TYPE_CORE: return top(coreStack, count);

View File

@ -55,6 +55,8 @@ import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroMapObjects;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.server.serverCodings;
import de.anomic.yacy.yacySeedDB;
import de.anomic.yacy.yacyURL;
public class plasmaCrawlProfile {
@ -64,13 +66,11 @@ public class plasmaCrawlProfile {
private File profileTableFile;
private long preloadTime;
public static final int crawlProfileHandleLength = 4; // name of the prefetch profile
public plasmaCrawlProfile(File file, long preloadTime) {
this.profileTableFile = file;
this.preloadTime = preloadTime;
profileTableFile.getParentFile().mkdirs();
kelondroDyn dyn = new kelondroDyn(profileTableFile, true, true, preloadTime, crawlProfileHandleLength, 2000, '#', kelondroNaturalOrder.naturalOrder, true, false, true);
kelondroDyn dyn = new kelondroDyn(profileTableFile, true, true, preloadTime, yacySeedDB.commonHashLength, 2000, '#', kelondroNaturalOrder.naturalOrder, true, false, true);
profileTable = new kelondroMapObjects(dyn, 500);
}
@ -79,7 +79,7 @@ public class plasmaCrawlProfile {
if (profileTable != null) profileTable.close();
if (!(profileTableFile.delete())) throw new RuntimeException("cannot delete crawl profile database");
profileTableFile.getParentFile().mkdirs();
kelondroDyn dyn = new kelondroDyn(profileTableFile, true, true, preloadTime, crawlProfileHandleLength, 2000, '#', kelondroNaturalOrder.naturalOrder, true, false, true);
kelondroDyn dyn = new kelondroDyn(profileTableFile, true, true, preloadTime, yacySeedDB.commonHashLength, 2000, '#', kelondroNaturalOrder.naturalOrder, true, false, true);
profileTable = new kelondroMapObjects(dyn, 500);
}
@ -164,7 +164,7 @@ public class plasmaCrawlProfile {
return ne;
}
public entry newEntry(String name, String startURL, String generalFilter, String specificFilter,
public entry newEntry(String name, yacyURL startURL, String generalFilter, String specificFilter,
int generalDepth, int specificDepth,
int recrawlIfOlder /*minutes*/, int domFilterDepth, int domMaxPages,
boolean crawlingQ,
@ -257,7 +257,7 @@ public class plasmaCrawlProfile {
private Map mem;
private Map doms;
public entry(String name, String startURL, String generalFilter, String specificFilter,
public entry(String name, yacyURL startURL, String generalFilter, String specificFilter,
int generalDepth, int specificDepth,
int recrawlIfOlder /*minutes*/, int domFilterDepth, int domMaxPages,
boolean crawlingQ,
@ -266,11 +266,11 @@ public class plasmaCrawlProfile {
boolean remoteIndexing,
boolean xsstopw, boolean xdstopw, boolean xpstopw) {
if (name == null || name.length() == 0) throw new NullPointerException("name must not be null");
String handle = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(Long.toString(System.currentTimeMillis()))).substring(0, crawlProfileHandleLength);
String handle = (startURL == null) ? kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(Long.toString(System.currentTimeMillis()))).substring(0, yacySeedDB.commonHashLength) : startURL.hash();
mem = new HashMap();
mem.put(HANDLE, handle);
mem.put(NAME, name);
mem.put(START_URL, (startURL == null) ? "" : startURL);
mem.put(START_URL, (startURL == null) ? "" : startURL.toNormalform(true, false));
mem.put(GENERAL_FILTER, (generalFilter == null) ? ".*" : generalFilter);
mem.put(SPECIFIC_FILTER, (specificFilter == null) ? ".*" : specificFilter);
mem.put(GENERAL_DEPTH, Integer.toString(generalDepth));

View File

@ -214,7 +214,7 @@ public final class plasmaCrawlStacker {
public String dequeue(plasmaCrawlEntry theMsg) throws InterruptedException {
plasmaCrawlProfile.entry profile = this.sb.profiles.getEntry(theMsg.profileHandle());
plasmaCrawlProfile.entry profile = this.sb.profilesActiveCrawls.getEntry(theMsg.profileHandle());
if (profile == null) {
String errorMsg = "LOST PROFILE HANDLE '" + theMsg.profileHandle() + "' for URL " + theMsg.url();
this.log.logSevere(errorMsg);

View File

@ -575,7 +575,7 @@ public final class plasmaParser {
// testing if the resource is not empty
if (sourceArray == null || sourceArray.length == 0) {
String errorMsg = "No resource content available.";
String errorMsg = "No resource content available (1).";
this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT);
}
@ -609,7 +609,7 @@ public final class plasmaParser {
// testing if the resource is not empty
if (!(sourceFile.exists() && sourceFile.canRead() && sourceFile.length() > 0)) {
String errorMsg = sourceFile.exists() ? "Empty resource file." : "No resource content available.";
String errorMsg = sourceFile.exists() ? "Empty resource file." : "No resource content available (2).";
this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT);
}

View File

@ -224,7 +224,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
public blogBoard blogDB;
public blogBoardComments blogCommentDB;
public static plasmaCrawlRobotsTxt robots;
public plasmaCrawlProfile profiles;
public plasmaCrawlProfile profilesActiveCrawls, profilesPassiveCrawls;
public plasmaCrawlProfile.entry defaultProxyProfile;
public plasmaCrawlProfile.entry defaultRemoteProfile;
public plasmaCrawlProfile.entry defaultTextSnippetProfile;
@ -866,7 +866,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
*
* @see plasmaSwitchboard#DBPATH for the folder this file lies in
*/
public static final String DBFILE_CRAWL_PROFILES = "crawlProfiles0.db";
public static final String DBFILE_ACTIVE_CRAWL_PROFILES = "crawlProfilesActive.db";
public static final String DBFILE_PASSIVE_CRAWL_PROFILES = "crawlProfilesPassive.db";
/**
* <p><code>public static final String <strong>DBFILE_CRAWL_ROBOTS</strong> = "crawlRobotsTxt.db"</code></p>
* <p>Name of the file containing the database holding all <code>robots.txt</code>-entries of the lately crawled domains</p>
@ -1066,12 +1067,17 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// make crawl profiles database and default profiles
this.log.logConfig("Initializing Crawl Profiles");
File profilesFile = new File(this.plasmaPath, DBFILE_CRAWL_PROFILES);
this.profiles = new plasmaCrawlProfile(profilesFile, ramProfiles_time);
initProfiles();
log.logConfig("Loaded profiles from file " + profilesFile.getName() +
", " + this.profiles.size() + " entries" +
", " + ppRamString(profilesFile.length()/1024));
File profilesActiveFile = new File(this.plasmaPath, DBFILE_ACTIVE_CRAWL_PROFILES);
this.profilesActiveCrawls = new plasmaCrawlProfile(profilesActiveFile, ramProfiles_time);
initActiveCrawlProfiles();
log.logConfig("Loaded active crawl profiles from file " + profilesActiveFile.getName() +
", " + this.profilesActiveCrawls.size() + " entries" +
", " + ppRamString(profilesActiveFile.length()/1024));
File profilesPassiveFile = new File(this.plasmaPath, DBFILE_PASSIVE_CRAWL_PROFILES);
this.profilesPassiveCrawls = new plasmaCrawlProfile(profilesPassiveFile, ramProfiles_time);
log.logConfig("Loaded passive crawl profiles from file " + profilesPassiveFile.getName() +
", " + this.profilesPassiveCrawls.size() + " entries" +
", " + ppRamString(profilesPassiveFile.length()/1024));
// loading the robots.txt db
this.log.logConfig("Initializing robots.txt DB");
@ -1135,8 +1141,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
log.logConfig("Starting Indexing Management");
noticeURL = new plasmaCrawlNURL(plasmaPath);
//errorURL = new plasmaCrawlZURL(); // fresh error DB each startup; can be hold in RAM and reduces IO;
errorURL = new plasmaCrawlZURL(plasmaPath, "urlError.db", true);
delegatedURL = new plasmaCrawlZURL(plasmaPath, "urlDelegated.db", false);
errorURL = new plasmaCrawlZURL(plasmaPath, "urlError1.db", true);
delegatedURL = new plasmaCrawlZURL(plasmaPath, "urlDelegated1.db", false);
wordIndex = new plasmaWordIndex(indexPrimaryPath, indexSecondaryPath, ramRWI_time, log);
// set a high maximum cache size to current size; this is adopted later automatically
@ -1161,7 +1167,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
* initialize switchboard queue
* ====================================================================== */
// create queue
this.sbQueue = new plasmaSwitchboardQueue(this.wordIndex.loadedURL, new File(this.plasmaPath, "switchboardQueue1.stack"), this.profiles);
this.sbQueue = new plasmaSwitchboardQueue(this.wordIndex.loadedURL, new File(this.plasmaPath, "switchboardQueue2.stack"), this.profilesActiveCrawls);
// setting the indexing queue slots
indexingSlots = (int) getConfigLong(INDEXER_SLOTS, 30);
@ -1504,7 +1510,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
public void urlRemove(String hash) {
wordIndex.loadedURL.remove(hash);
noticeURL.remove(hash);
noticeURL.removeByURLHash(hash);
delegatedURL.remove(hash);
errorURL.remove(hash);
}
@ -1547,12 +1553,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
return (bytes / 1024) + "TByte";
}
private void initProfiles() {
private void initActiveCrawlProfiles() {
this.defaultProxyProfile = null;
this.defaultRemoteProfile = null;
this.defaultTextSnippetProfile = null;
this.defaultMediaSnippetProfile = null;
Iterator i = this.profiles.profiles(true);
Iterator i = this.profilesActiveCrawls.profiles(true);
plasmaCrawlProfile.entry profile;
String name;
while (i.hasNext()) {
@ -1565,7 +1571,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
if (this.defaultProxyProfile == null) {
// generate new default entry for proxy crawling
this.defaultProxyProfile = this.profiles.newEntry("proxy", "", ".*", ".*",
this.defaultProxyProfile = this.profilesActiveCrawls.newEntry("proxy", null, ".*", ".*",
Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0")),
Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0")),
60 * 24, -1, -1, false,
@ -1576,27 +1582,27 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
if (this.defaultRemoteProfile == null) {
// generate new default entry for remote crawling
defaultRemoteProfile = this.profiles.newEntry(CRAWL_PROFILE_REMOTE, "", ".*", ".*", 0, 0,
defaultRemoteProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_REMOTE, null, ".*", ".*", 0, 0,
-1, -1, -1, true, true, true, false, true, false, true, true, false);
}
if (this.defaultTextSnippetProfile == null) {
// generate new default entry for snippet fetch and optional crawling
defaultTextSnippetProfile = this.profiles.newEntry(CRAWL_PROFILE_SNIPPET_TEXT, "", ".*", ".*", 0, 0,
defaultTextSnippetProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_TEXT, null, ".*", ".*", 0, 0,
60 * 24 * 30, -1, -1, true, true, true, true, true, false, true, true, false);
}
if (this.defaultMediaSnippetProfile == null) {
// generate new default entry for snippet fetch and optional crawling
defaultMediaSnippetProfile = this.profiles.newEntry(CRAWL_PROFILE_SNIPPET_MEDIA, "", ".*", ".*", 0, 0,
defaultMediaSnippetProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_MEDIA, null, ".*", ".*", 0, 0,
60 * 24 * 30, -1, -1, true, false, true, true, true, false, true, true, false);
}
}
private void resetProfiles() {
final File pdb = new File(plasmaPath, DBFILE_CRAWL_PROFILES);
final File pdb = new File(plasmaPath, DBFILE_ACTIVE_CRAWL_PROFILES);
if (pdb.exists()) pdb.delete();
long ramProfiles_time = getConfigLong(RAM_CACHE_PROFILES_TIME, 1000);
profiles = new plasmaCrawlProfile(pdb, ramProfiles_time);
initProfiles();
profilesActiveCrawls = new plasmaCrawlProfile(pdb, ramProfiles_time);
initActiveCrawlProfiles();
}
/**
@ -1623,7 +1629,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
*/
public boolean cleanProfiles() throws InterruptedException {
if ((sbQueue.size() > 0) || (cacheLoader.size() > 0) || (noticeURL.notEmpty())) return false;
final Iterator iter = profiles.profiles(true);
final Iterator iter = profilesActiveCrawls.profiles(true);
plasmaCrawlProfile.entry entry;
boolean hasDoneSomething = false;
try {
@ -1637,6 +1643,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
(entry.name().equals(CRAWL_PROFILE_REMOTE)) ||
(entry.name().equals(CRAWL_PROFILE_SNIPPET_TEXT)) ||
(entry.name().equals(CRAWL_PROFILE_SNIPPET_MEDIA)))) {
profilesPassiveCrawls.newEntry(entry.map());
iter.remove();
hasDoneSomething = true;
}
@ -1780,7 +1787,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
messageDB.close();
if (facilityDB != null) facilityDB.close();
sbStackCrawlThread.close();
profiles.close();
profilesActiveCrawls.close();
robots.close();
parser.close();
plasmaHTCache.close();
@ -1799,10 +1806,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
public int queueSize() {
return sbQueue.size();
//return processStack.size() + cacheLoader.size() + noticeURL.stackSize();
}
public void enQueue(Object job) {
assert job != null;
if (!(job instanceof plasmaSwitchboardQueue.Entry)) {
System.out.println("Internal error at plasmaSwitchboard.enQueue: wrong job type");
System.exit(0);
@ -1900,9 +1907,16 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
", overhangStackSize=" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) +
", remoteStackSize=" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE));
try {
int sizeBefore = sbQueue.size();
nextentry = sbQueue.pop();
if (nextentry == null) {
log.logFine("deQueue: null entry on queue stack");
log.logWarning("deQueue: null entry on queue stack.");
if (sbQueue.size() == sizeBefore) {
// this is a severe problem: because this time a null is returned, it means that this status will last forever
// to re-enable use of the sbQueue, it must be emptied completely
log.logSevere("deQueue: does not shrink after pop() == null. Emergency reset.");
sbQueue.clear();
}
return false;
}
} catch (IOException e) {
@ -2179,7 +2193,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
log.logSevere(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
return true;
}
plasmaCrawlProfile.entry profile = profiles.getEntry(profileHandle);
plasmaCrawlProfile.entry profile = profilesActiveCrawls.getEntry(profileHandle);
if (profile == null) {
log.logWarning(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
return true;
@ -2244,7 +2258,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String profileHandle = urlEntry.profileHandle();
// System.out.println("DEBUG plasmaSwitchboard.processCrawling:
// profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
plasmaCrawlProfile.entry profile = profiles.getEntry(profileHandle);
plasmaCrawlProfile.entry profile = profilesActiveCrawls.getEntry(profileHandle);
if (profile == null) {
log.logWarning(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
return true;
@ -2332,7 +2346,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// System.out.println("DEBUG plasmaSwitchboard.processCrawling:
// profileHandle = " + profileHandle + ", urlEntry.url = " +
// urlEntry.url());
plasmaCrawlProfile.entry profile = profiles.getEntry(profileHandle);
plasmaCrawlProfile.entry profile = profilesActiveCrawls.getEntry(profileHandle);
if (profile == null) {
log.logWarning(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());

View File

@ -82,7 +82,7 @@ public class plasmaSwitchboardQueue {
"byte[] flags-1, " + // flags
"String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator
"Cardinal depth-2 {b64e}, " + // the prefetch depth so far, starts at 0
"String profile-" + plasmaCrawlProfile.crawlProfileHandleLength + ", " + // the name of the prefetch profile handle
"String profile-" + yacySeedDB.commonHashLength + ", " + // the name of the prefetch profile handle
"String urldescr-80",
kelondroNaturalOrder.naturalOrder,
0);

View File

@ -61,7 +61,7 @@ public final class plasmaWordIndex implements indexRI {
// environment constants
public static final long wCacheMaxAge = 1000 * 60 * 30; // milliseconds; 30 minutes
public static final int wCacheMaxChunk = 1000; // number of references for each urlhash
public static final int lowcachedivisor = 200;
public static final int lowcachedivisor = 320;
public static final int maxCollectionPartition = 7; // should be 7
private final kelondroOrder indexOrder = kelondroBase64Order.enhancedCoder;

View File

@ -34,11 +34,11 @@ public class urlRedirectord implements serverHandler {
}
if (profile == null) {
profile = switchboard.profiles.newEntry(
profile = switchboard.profilesActiveCrawls.newEntry(
// name
"URL Redirector",
// start URL
"",
null,
// crawling filter
".*",
".*",
@ -151,7 +151,7 @@ public class urlRedirectord implements serverHandler {
if (pos != -1) {
String newDepth = line.substring(pos).trim();
this.theLogger.logFine("Changing crawling depth to '" + newDepth + "'.");
switchboard.profiles.changeEntry(profile, "generalDepth",newDepth);
switchboard.profilesActiveCrawls.changeEntry(profile, "generalDepth",newDepth);
}
outputWriter.print("\r\n");
outputWriter.flush();
@ -160,7 +160,7 @@ public class urlRedirectord implements serverHandler {
if (pos != -1) {
String newValue = line.substring(pos).trim();
this.theLogger.logFine("Changing crawl dynamic setting to '" + newValue + "'");
switchboard.profiles.changeEntry(profile, "crawlingQ",newValue);
switchboard.profilesActiveCrawls.changeEntry(profile, "crawlingQ",newValue);
}
outputWriter.print("\r\n");
outputWriter.flush();
@ -192,7 +192,7 @@ public class urlRedirectord implements serverHandler {
// first delete old entry, if exists
String urlhash = reqURL.hash();
switchboard.wordIndex.loadedURL.remove(urlhash);
switchboard.noticeURL.remove(urlhash);
switchboard.noticeURL.removeByURLHash(urlhash);
switchboard.errorURL.remove(urlhash);
// enqueuing URL for crawling