From c500178fd7e3e4b320010af74659f2428b05d36a Mon Sep 17 00:00:00 2001
From: orbiter
Date: Mon, 18 Dec 2006 02:56:32 +0000
Subject: [PATCH] redesign of the index creation interface

- the input form remains under the Index Creation menu item
- after pressing the submit button, the Indexing Monitor is called
- the code for creating new indexing starts was moved to the Indexing Monitor
- existing crawl profiles can be monitored in the Indexing Monitor
- the code for creating crawl profile data was moved from the indexing start page to the Indexing Monitor
- existing crawl profiles can be deleted on the crawl monitor page

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3095 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 build.properties           |   2 +-
 htroot/IndexCreate_p.html  |  84 +----------
 htroot/IndexCreate_p.java  | 297 +------------------------------------
 htroot/WatchCrawler_p.html |  79 +++++++++-
 4 files changed, 84 insertions(+), 378 deletions(-)

diff --git a/build.properties b/build.properties
index 58511f531..dc8f26d86 100644
--- a/build.properties
+++ b/build.properties
@@ -3,7 +3,7 @@
 javacSource=1.4
 javacTarget=1.4
 # Release Configuration
-releaseVersion=0.493
+releaseVersion=0.494
 releaseFile=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
 #releaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
 releaseDir=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}
diff --git a/htroot/IndexCreate_p.html b/htroot/IndexCreate_p.html
index 1db6f862c..d9b32a7c2 100644
--- a/htroot/IndexCreate_p.html
+++ b/htroot/IndexCreate_p.html
@@ -16,7 +16,7 @@
 You can define URLs as start points for Web page crawling and start crawling here. "Crawling" means that YaCy will download the given website, extract all links in it and then download the content behind these links. This is repeated as long as specified under "Crawling Depth".

-
+ @@ -242,51 +242,15 @@
Attribut
- -

- #(error)# - :: - Error with profile management. Please stop YaCy, delete the file DATA/PLASMADB/crawlProfiles0.db and restart. - :: - Error: #[errmsg]# - :: - Application not yet initialized. Sorry. Please wait some seconds and repeat the request. - :: - ERROR: Crawl filter "#[newcrawlingfilter]#" does not match with crawl root "#[crawlingStart]#". Please try again with different filter. - :: - Crawling of "#[crawlingURL]#" failed. Reason: #[reasonString]#
- :: - Error with URL input "#[crawlingStart]#": #[error]# - :: - Error with file input "#[crawlingStart]#": #[error]# - #(/error)# -

#(info)# :: - Set new prefetch depth to "#[newproxyPrefetchDepth]#" - :: - Crawling of "#[crawlingURL]#" started. - You can monitor the crawling progress either by watching the URL queues - (local queue, - global queue, - loader queue, - indexing queue) - or see the fill/process count of all queues on the - performance page. - Please wait some seconds, because the request is enqueued and delayed until the proxy/HTTP-server is idle for a certain time. - The indexing results are presented on the - Index Monitor-page. - It will take at least 30 seconds until the first result appears there. Please be patient, the crawling will pause each time you use the proxy or web server to ensure maximum availability. - If you crawl any un-wanted pages, you can delete them here.
- :: - Removed #[numEntries]# entries from crawl queue. This queue may fill again if the loading and indexing queue is not empty. - :: Crawling paused successfully. :: Continue crawling. #(/info)#

+ #(refreshbutton)# ::
@@ -305,50 +269,6 @@
-

Crawl Profile List:

- - - - - - - - - - - - - - - - - - - - - - - - - - #{crawlProfiles}# - - - - - - - - - - - - - - - - #{/crawlProfiles}# -
Crawl ThreadStart URLDepthFilterMaxAgeAuto Filter DepthAuto Filter ContentMax Page Per DomainAccept '?' URLsFill Proxy CacheLocal IndexingRemote Indexing
#[name]##[startURL]##[depth]##[filter]##[crawlingIfOlder]##[crawlingDomFilterDepth]##[crawlingDomFilterContent]##[crawlingDomMaxPages]##(withQuery)#no::yes#(/withQuery)##(storeCache)#no::yes#(/storeCache)##(localIndexing)#no::yes#(/localIndexing)##(remoteIndexing)#no::yes#(/remoteIndexing)##(deleteButton)#::
#(/deleteButton)#

Recently started remote crawls in progress:

diff --git a/htroot/IndexCreate_p.java b/htroot/IndexCreate_p.java index ee5bbda4f..02c62a26c 100644 --- a/htroot/IndexCreate_p.java +++ b/htroot/IndexCreate_p.java @@ -43,29 +43,13 @@ // javac -classpath .:../classes IndexCreate_p.java // if the shell's current path is HTROOT -import java.io.File; import java.io.IOException; -import java.io.Writer; -import java.net.MalformedURLException; -import java.util.Date; import java.util.Enumeration; -import java.util.HashMap; -import java.util.Iterator; -import java.util.Map; -import java.util.regex.Pattern; -import java.util.regex.PatternSyntaxException; import de.anomic.data.wikiCode; -import de.anomic.htmlFilter.htmlFilterContentScraper; -import de.anomic.htmlFilter.htmlFilterWriter; import de.anomic.http.httpHeader; -import de.anomic.kelondro.kelondroBitfield; import de.anomic.plasma.plasmaURL; -import de.anomic.net.URL; -import de.anomic.plasma.plasmaCrawlEURL; -import de.anomic.plasma.plasmaCrawlProfile; import de.anomic.plasma.plasmaSwitchboard; -import de.anomic.server.serverFileUtils; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.server.serverThread; @@ -81,230 +65,10 @@ public class IndexCreate_p { plasmaSwitchboard switchboard = (plasmaSwitchboard) env; serverObjects prop = new serverObjects(); - prop.put("error", 0); prop.put("info", 0); prop.put("refreshbutton", 0); if (post != null) { - if (post.containsKey("crawlingstart")) { - // init crawl - if (yacyCore.seedDB == null) { - prop.put("error", 3); - } else { - // set new properties - String newcrawlingfilter = post.get("crawlingFilter", ".*"); - env.setConfig("crawlingFilter", newcrawlingfilter); - - int newcrawlingdepth = Integer.parseInt(post.get("crawlingDepth", "0")); - env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth)); - - boolean crawlingIfOlderCheck = post.get("crawlingIfOlderCheck", "off").equals("on"); - int crawlingIfOlderNumber = Integer.parseInt(post.get("crawlingIfOlderNumber", "-1")); - String crawlingIfOlderUnit = post.get("crawlingIfOlderUnit","year"); - int crawlingIfOlder = recrawlIfOlderC(crawlingIfOlderCheck, crawlingIfOlderNumber, crawlingIfOlderUnit); - env.setConfig("crawlingIfOlder", crawlingIfOlder); - - boolean crawlingDomFilterCheck = post.get("crawlingDomFilterCheck", "off").equals("on"); - int crawlingDomFilterDepth = (crawlingDomFilterCheck) ? Integer.parseInt(post.get("crawlingDomFilterDepth", "-1")) : -1; - env.setConfig("crawlingDomFilterDepth", Integer.toString(crawlingDomFilterDepth)); - - boolean crawlingDomMaxCheck = post.get("crawlingDomMaxCheck", "off").equals("on"); - int crawlingDomMaxPages = (crawlingDomMaxCheck) ? Integer.parseInt(post.get("crawlingDomMaxPages", "-1")) : -1; - env.setConfig("crawlingDomMaxPages", Integer.toString(crawlingDomMaxPages)); - - boolean crawlingQ = post.get("crawlingQ", "off").equals("on"); - env.setConfig("crawlingQ", (crawlingQ) ? "true" : "false"); - - boolean storeHTCache = post.get("storeHTCache", "off").equals("on"); - env.setConfig("storeHTCache", (storeHTCache) ? "true" : "false"); - - boolean localIndexing = post.get("localIndexing", "off").equals("on"); - env.setConfig("localIndexing", (localIndexing) ? "true" : "false"); - - boolean crawlOrder = post.get("crawlOrder", "off").equals("on"); - env.setConfig("crawlOrder", (crawlOrder) ? "true" : "false"); - - boolean xsstopw = post.get("xsstopw", "off").equals("on"); - env.setConfig("xsstopw", (xsstopw) ? 
"true" : "false"); - - boolean xdstopw = post.get("xdstopw", "off").equals("on"); - env.setConfig("xdstopw", (xdstopw) ? "true" : "false"); - - boolean xpstopw = post.get("xpstopw", "off").equals("on"); - env.setConfig("xpstopw", (xpstopw) ? "true" : "false"); - - String crawlingMode = post.get("crawlingMode","url"); - if (crawlingMode.equals("url")) { - // getting the crawljob start url - String crawlingStart = post.get("crawlingURL",""); - crawlingStart = crawlingStart.trim(); - - // adding the prefix http:// if necessary - int pos = crawlingStart.indexOf("://"); - if (pos == -1) crawlingStart = "http://" + crawlingStart; - - // normalizing URL - try {crawlingStart = new URL(crawlingStart).toNormalform();} catch (MalformedURLException e1) {} - - // check if url is proper - URL crawlingStartURL = null; - try { - crawlingStartURL = new URL(crawlingStart); - } catch (MalformedURLException e) { - crawlingStartURL = null; - } - - // check if pattern matches - if ((crawlingStartURL == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) { - // print error message - prop.put("error", 4); //crawlfilter does not match url - prop.put("error_newcrawlingfilter", newcrawlingfilter); - prop.put("error_crawlingStart", crawlingStart); - } else try { - - // check if the crawl filter works correctly - Pattern.compile(newcrawlingfilter); - - // stack request - // first delete old entry, if exists - String urlhash = plasmaURL.urlHash(crawlingStart); - switchboard.wordIndex.loadedURL.remove(urlhash); - switchboard.noticeURL.remove(urlhash); - switchboard.errorURL.remove(urlhash); - - // stack url - plasmaCrawlProfile.entry pe = switchboard.profiles.newEntry(crawlingStartURL.getHost(), crawlingStart, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw); - String reasonString = switchboard.sbStackCrawlThread.stackCrawl(crawlingStart, null, yacyCore.seedDB.mySeed.hash, "CRAWLING-ROOT", new Date(), 0, pe); - - if (reasonString == null) { - // liftoff! 
- prop.put("info", 2);//start msg - prop.put("info_crawlingURL", ((String) post.get("crawlingURL"))); - - // generate a YaCyNews if the global flag was set - if (crawlOrder) { - Map m = new HashMap(pe.map()); // must be cloned - m.remove("specificDepth"); - m.remove("localIndexing"); - m.remove("remoteIndexing"); - m.remove("xsstopw"); - m.remove("xpstopw"); - m.remove("xdstopw"); - m.remove("storeTXCache"); - m.remove("storeHTCache"); - m.remove("generalFilter"); - m.remove("specificFilter"); - m.put("intention", post.get("intention", "").replace(',', '/')); - yacyCore.newsPool.publishMyNews(new yacyNewsRecord("crwlstrt", m)); - } - - } else { - prop.put("error", 5); //Crawling failed - prop.put("error_crawlingURL", wikiCode.replaceHTML(((String) post.get("crawlingURL")))); - prop.put("error_reasonString", reasonString); - - plasmaCrawlEURL.Entry ee = switchboard.errorURL.newEntry(crawlingStartURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash, - crawlingStartURL.getHost(), reasonString, new kelondroBitfield()); - ee.store(); - switchboard.errorURL.stackPushEntry(ee); - } - } catch (PatternSyntaxException e) { - prop.put("error", 8); //crawlfilter does not match url - prop.put("error_newcrawlingfilter", newcrawlingfilter); - prop.put("error_error", e.getMessage()); - } catch (Exception e) { - // mist - prop.put("error", 6);//Error with url - prop.put("error_crawlingStart", crawlingStart); - prop.put("error_error", e.getMessage()); - e.printStackTrace(); - } - - } else if (crawlingMode.equals("file")) { - if (post.containsKey("crawlingFile")) { - // getting the name of the uploaded file - String fileName = (String) post.get("crawlingFile"); - try { - // check if the crawl filter works correctly - Pattern.compile(newcrawlingfilter); - - // loading the file content - File file = new File(fileName); - - // getting the content of the bookmark file - byte[] fileContent = (byte[]) post.get("crawlingFile$file"); - - // TODO: determine the real charset here .... 
- String fileString = new String(fileContent,"UTF-8"); - - // parsing the bookmark file and fetching the headline and contained links - htmlFilterContentScraper scraper = new htmlFilterContentScraper(new URL(file)); - //OutputStream os = new htmlFilterOutputStream(null, scraper, null, false); - Writer writer = new htmlFilterWriter(null,null,scraper,null,false); - serverFileUtils.write(fileString,writer); - writer.close(); - - //String headline = scraper.getHeadline(); - HashMap hyperlinks = (HashMap) scraper.getAnchors(); - - // creating a crawler profile - plasmaCrawlProfile.entry profile = switchboard.profiles.newEntry(fileName, file.toURL().toString(), newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw); - - // loop through the contained links - Iterator interator = hyperlinks.entrySet().iterator(); - int c = 0; - while (interator.hasNext()) { - Map.Entry e = (Map.Entry) interator.next(); - String nexturlstring = (String) e.getKey(); - - if (nexturlstring == null) continue; - - nexturlstring = nexturlstring.trim(); - - // normalizing URL - nexturlstring = new URL(nexturlstring).toNormalform(); - - // generating an url object - URL nexturlURL = null; - try { - nexturlURL = new URL(nexturlstring); - } catch (MalformedURLException ex) { - nexturlURL = null; - c++; - continue; - } - - // enqueuing the url for crawling - String rejectReason = switchboard.sbStackCrawlThread.stackCrawl(nexturlstring, null, yacyCore.seedDB.mySeed.hash, (String)e.getValue(), new Date(), 1, profile); - - // if something failed add the url into the errorURL list - if (rejectReason == null) { - c++; - } else { - plasmaCrawlEURL.Entry ee = switchboard.errorURL.newEntry(nexturlURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash, - (String) e.getValue(), rejectReason, new kelondroBitfield()); - ee.store(); - switchboard.errorURL.stackPushEntry(ee); - } - } - - } catch (PatternSyntaxException e) { - // print error message - prop.put("error", 8); //crawlfilter does not match url - prop.put("error_newcrawlingfilter", newcrawlingfilter); - prop.put("error_error", e.getMessage()); - } catch (Exception e) { - // mist - prop.put("error", 7);//Error with file - prop.put("error_crawlingStart", fileName); - prop.put("error_error", e.getMessage()); - e.printStackTrace(); - } - } - } - } - } - if (post.containsKey("distributedcrawling")) { long newBusySleep = Integer.parseInt(env.getConfig("62_remotetriggeredcrawl_busysleep", "100")); if (post.get("dcr", "").equals("acceptCrawlMax")) { @@ -328,18 +92,14 @@ public class IndexCreate_p { if (post.containsKey("pausecrawlqueue")) { switchboard.pauseCrawlJob(plasmaSwitchboard.CRAWLJOB_LOCAL_CRAWL); - prop.put("info", 4);//crawling paused + prop.put("info", 1);//crawling paused } if (post.containsKey("continuecrawlqueue")) { switchboard.continueCrawlJob(plasmaSwitchboard.CRAWLJOB_LOCAL_CRAWL); - prop.put("info", 5);//crawling continued + prop.put("info", 2);//crawling continued } - if (post.containsKey("deleteprofile")) { - String handle = (String) post.get("handle"); - if (handle != null) switchboard.profiles.removeEntry(handle); - } } // define visible variables @@ -420,43 +180,7 @@ public class IndexCreate_p { } // create prefetch table - boolean dark; - - // sed crawl profiles - int count = 0; - int domlistlength = (post == null) ? 
160 : post.getInt("domlistlength", 160); - //try{ - Iterator it = switchboard.profiles.profiles(true); - plasmaCrawlProfile.entry profile; - dark = true; - while (it.hasNext()) { - profile = (plasmaCrawlProfile.entry) it.next(); - //table += profile.map().toString() + "
"; - prop.put("crawlProfiles_"+count+"_dark", ((dark) ? 1 : 0)); - prop.put("crawlProfiles_"+count+"_name", wikiCode.replaceHTML(profile.name())); - prop.put("crawlProfiles_"+count+"_startURL", wikiCode.replaceHTML(profile.startURL())); - prop.put("crawlProfiles_"+count+"_handle", wikiCode.replaceHTML(profile.handle())); - prop.put("crawlProfiles_"+count+"_depth", profile.generalDepth()); - prop.put("crawlProfiles_"+count+"_filter", profile.generalFilter()); - prop.put("crawlProfiles_"+count+"_crawlingIfOlder", (profile.recrawlIfOlder() == Long.MAX_VALUE) ? "no re-crawl" : ""+profile.recrawlIfOlder()); - prop.put("crawlProfiles_"+count+"_crawlingDomFilterDepth", (profile.domFilterDepth() == Integer.MAX_VALUE) ? "inactive" : ""+profile.domFilterDepth()); - prop.put("crawlProfiles_"+count+"_crawlingDomFilterContent", profile.domNames(true, domlistlength)); - prop.put("crawlProfiles_"+count+"_crawlingDomMaxPages", (profile.domMaxPages() == Integer.MAX_VALUE) ? "unlimited" : ""+profile.domMaxPages()); - prop.put("crawlProfiles_"+count+"_withQuery", ((profile.crawlingQ()) ? 1 : 0)); - prop.put("crawlProfiles_"+count+"_storeCache", ((profile.storeHTCache()) ? 1 : 0)); - prop.put("crawlProfiles_"+count+"_localIndexing", ((profile.localIndexing()) ? 1 : 0)); - prop.put("crawlProfiles_"+count+"_remoteIndexing", ((profile.remoteIndexing()) ? 1 : 0)); - prop.put("crawlProfiles_"+count+"_deleteButton", (((profile.name().equals("remote")) || - (profile.name().equals("proxy")) || - (profile.name().equals("snippet"))) ? 0 : 1)); - prop.put("crawlProfiles_"+count+"_deleteButton_handle", profile.handle()); - - dark = !dark; - count++; - } - //}catch(IOException e){}; - prop.put("crawlProfiles", count); - + boolean dark = true; // create other peer crawl table using YaCyNews int availableNews = yacyCore.newsPool.size(yacyNewsPool.INCOMING_DB); @@ -513,9 +237,8 @@ public class IndexCreate_p { // remote crawl peers - if (yacyCore.seedDB == null) { - //table += "Sorry, cannot show any crawl output now because the system is not completely initialised. Please re-try."; - prop.put("error", 3); + if (yacyCore.seedDB != null) { + prop.put("remoteCrawlPeers", 0); } else { Enumeration crawlavail = yacyCore.dhtAgent.getAcceptRemoteCrawlSeeds(plasmaURL.dummyHash, true); Enumeration crawlpendi = yacyCore.dhtAgent.getAcceptRemoteCrawlSeeds(plasmaURL.dummyHash, false); @@ -546,22 +269,12 @@ public class IndexCreate_p { } - prop.put("crawler-paused",(switchboard.crawlJobIsPaused(plasmaSwitchboard.CRAWLJOB_LOCAL_CRAWL))?0:1); // return rewrite properties return prop; } - private static int recrawlIfOlderC(boolean recrawlIfOlderCheck, int recrawlIfOlderNumber, String crawlingIfOlderUnit) { - if (!recrawlIfOlderCheck) return -1; - if (crawlingIfOlderUnit.equals("year")) return recrawlIfOlderNumber * 60 * 24 * 356; - if (crawlingIfOlderUnit.equals("month")) return recrawlIfOlderNumber * 60 * 24 * 30; - if (crawlingIfOlderUnit.equals("day")) return recrawlIfOlderNumber * 60 * 24; - if (crawlingIfOlderUnit.equals("hour")) return recrawlIfOlderNumber * 60; - if (crawlingIfOlderUnit.equals("minute")) return recrawlIfOlderNumber; - return -1; - } } diff --git a/htroot/WatchCrawler_p.html b/htroot/WatchCrawler_p.html index 3152412e0..661e685a8 100644 --- a/htroot/WatchCrawler_p.html +++ b/htroot/WatchCrawler_p.html @@ -49,7 +49,7 @@ -
  + +
@@ -66,11 +66,84 @@ -
Indicator 

+ +

+ #(info)# + :: + Error with profile management. Please stop YaCy, delete the file DATA/PLASMADB/crawlProfiles0.db and restart. + :: + Error: #[errmsg]# + :: + Application not yet initialized. Sorry. Please wait a few seconds and repeat the request. + :: + ERROR: Crawl filter "#[newcrawlingfilter]#" does not match the crawl root "#[crawlingStart]#". Please try again with a different filter. + :: + Crawling of "#[crawlingURL]#" failed. Reason: #[reasonString]#
+ :: + Error with URL input "#[crawlingStart]#": #[error]# + :: + Error with file input "#[crawlingStart]#": #[error]# + :: + Crawling of "#[crawlingURL]#" started. + Please wait a few seconds; it may take a short while until the first results appear there. + If you crawl any unwanted pages, you can delete them here.
+ #(/info)# +

+ +

-

+ +

Crawl Profiles:
+

+ + + + + + + + + + + + + + + + + + + + + + + + + #{crawlProfiles}# + + + + + + + + + + + + + + + + #{/crawlProfiles}# +
Crawl ThreadStart URLDepthFilterMaxAgeAuto Filter DepthAuto Filter ContentMax Page Per DomainAccept '?' URLsFill Proxy CacheLocal IndexingRemote Indexing
#[name]##[startURL]##[depth]##[filter]##[crawlingIfOlder]##[crawlingDomFilterDepth]##[crawlingDomFilterContent]##[crawlingDomMaxPages]##(withQuery)#no::yes#(/withQuery)##(storeCache)#no::yes#(/storeCache)##(localIndexing)#no::yes#(/localIndexing)##(remoteIndexing)#no::yes#(/remoteIndexing)##(deleteButton)#::
#(/deleteButton)#

+ + +
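[Editor's sketch] The delete button in the profile table above posts the profile handle back to this page, but the matching server-side change to WatchCrawler_p.java is not part of this excerpt. A minimal sketch of that handler, assuming the block removed from IndexCreate_p.java above is carried over essentially unchanged (all names taken from the removed code):

    // inside the servlet's respond() method, once the post parameters are available
    // (sketch only; the actual WatchCrawler_p.java change is not shown in this patch)
    if (post != null && post.containsKey("deleteprofile")) {
        String handle = (String) post.get("handle"); // handle passed along by the delete button
        if (handle != null) switchboard.profiles.removeEntry(handle);
    }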

Crawl Queue:
+
Queue
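[Editor's sketch] For reference, the loop that fills the crawlProfiles_* template fields of the profile table above was removed from IndexCreate_p.java in this patch and is expected to reappear in the monitor servlet. A condensed sketch of that logic, assuming it moves over essentially unchanged (field list shortened; all names taken from the removed code):

    // enumerate all crawl profiles and hand them to the HTML template
    Iterator it = switchboard.profiles.profiles(true);
    plasmaCrawlProfile.entry profile;
    int count = 0;
    boolean dark = true;
    while (it.hasNext()) {
        profile = (plasmaCrawlProfile.entry) it.next();
        prop.put("crawlProfiles_" + count + "_dark", (dark) ? 1 : 0);
        prop.put("crawlProfiles_" + count + "_name", wikiCode.replaceHTML(profile.name()));
        prop.put("crawlProfiles_" + count + "_startURL", wikiCode.replaceHTML(profile.startURL()));
        prop.put("crawlProfiles_" + count + "_depth", profile.generalDepth());
        // the built-in profiles ("remote", "proxy", "snippet") get no delete button
        prop.put("crawlProfiles_" + count + "_deleteButton",
                 (profile.name().equals("remote") || profile.name().equals("proxy")
                  || profile.name().equals("snippet")) ? 0 : 1);
        prop.put("crawlProfiles_" + count + "_deleteButton_handle", profile.handle());
        dark = !dark;
        count++;
    }
    prop.put("crawlProfiles", count);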