diff --git a/htroot/CrawlURLFetch_p.html b/htroot/CrawlURLFetch_p.html index cc238d6ca..8548f2880 100644 --- a/htroot/CrawlURLFetch_p.html +++ b/htroot/CrawlURLFetch_p.html @@ -7,49 +7,60 @@ #%env/templates/header.template%#

URL-Fetcher

-

-

- All newly added URLs will be crawled using the Proxy Crawl Profile. -

Fetch new URLs to crawl +

+ The newly added URLs will be crawled without any filter restrictions except for the static stop-words. + The Re-Crawl option isn't used and the sites won't be stored in the Proxy Cache. Text and media types will be indexed. + Since these URLs are explicitly requested from another peer, they won't be distributed for remote indexing. +

:
- #(hostError)#::Malformed URL#(/hostError)# + #(saved)#::
+ Or select previously entered URL: #(/saved)# + #(hostError)#:: Malformed URL#(/hostError)#
+ #(peersKnown)#::
:
- + #{peers}# #{/peers}# - #(peerError)#:: + #(peerError)#:: +  Error fetching URL-list from #[hash]#:#[name]#:: +  Peer with hash #[hash]# doesn't seem to be online anymore#(/peerError)#
#(/peersKnown)# -
:
+ +
Frequency:
- -
-
:
-
- , +
+
+ : - - + + #(freqError)#:: Invalid period, fetching only once#(/freqError)#
+ + #(threadError)#:: + Error on stopping thread, it isn't alive anymore:: + Error on restarting thread, it isn't alive anymore#(/threadError)# + #(runs)#::
Thread to fetch URLs is #(status)#running::stopped::paused#(/status)#
@@ -59,7 +70,18 @@
Total fetched URLs:
#[totalFetchedURLs]#
Total failed URLs:
#[totalFailedURLs]#
Last fetched URLs:
#[lastFetchedURLs]#
-
+
Failed URLs:
+
+ #[error]# +
    #{error}# +
  • #[reason]#: #[url]#
  • #{/error}# +
+
+
#(status)# + :: + :: + #(/status)# +
#(/runs)# diff --git a/htroot/CrawlURLFetch_p.java b/htroot/CrawlURLFetch_p.java index 43d8f02b3..939d9a7fc 100644 --- a/htroot/CrawlURLFetch_p.java +++ b/htroot/CrawlURLFetch_p.java @@ -2,33 +2,193 @@ import java.io.IOException; import java.net.MalformedURLException; +import java.util.ArrayList; import java.util.Date; import java.util.Enumeration; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Random; import java.util.TreeMap; +import de.anomic.kelondro.kelondroBitfield; import de.anomic.net.URL; +import de.anomic.plasma.plasmaCrawlEURL; import de.anomic.plasma.plasmaCrawlProfile; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverSwitch; +import de.anomic.data.wikiCode; import de.anomic.http.httpHeader; import de.anomic.http.httpc; import de.anomic.server.serverObjects; +import de.anomic.server.logging.serverLog; import de.anomic.yacy.yacyCore; import de.anomic.yacy.yacySeed; public class CrawlURLFetch_p { + private static final long ERR_DATE = 1; + private static final long ERR_HOST_MALFORMED_URL = 1; + private static final long ERR_PEER_GENERAL_CONN = 1; + private static final long ERR_PEER_OFFLINE = 2; + private static final long ERR_THREAD_STOP = 1; + private static final long ERR_THREAD_RESUME = 2; + + private static final long STAT_THREAD_ALIVE = 0; + private static final long STAT_THREAD_STOPPED = 1; + private static final long STAT_THREAD_PAUSED = 2; + private static URLFetcher fetcher = null; + private static plasmaCrawlProfile.entry profile = null; + private static ArrayList savedURLs = new ArrayList(); public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) { serverObjects prop = new serverObjects(); - prop.put("host", ""); + listURLs(prop); // List previously saved URLs for easy selection + listPeers(prop); // List known hosts - // List known hosts for message sending + if (profile == null) { + profile = ((plasmaSwitchboard)env).profiles.newEntry( + "URLFetcher", // 
Name + null, // URL + ".*", ".*", // General / specific filter + 0, 0, // General / specific depth + -1, -1, -1, // Recrawl / Dom-filter depth / Dom-max-pages + true, // Crawl query + true, true, // Index text / media + false, true, // Store in HT- / TX-Cache + false, // Remote indexing + true, false, false); // Exclude static / dynamic / parent stopwords + } + + if (post != null) { + if (post.containsKey("start")) { + long frequency = URLFetcher.DELAY_ONCE; + if (post.containsKey("reg")) { + if (post.get("reg", "").equals("self_det")) { + frequency = URLFetcher.DELAY_SELF_DET; + } else if (post.get("reg", "").equals("delay")) { + frequency = getDate(post.get("frequency", ""), post.get("freq_type", "")); + if (frequency == -1) + prop.put("freqError", ERR_DATE); + } + } + + fetcher = null; + if (post.get("source", "").equals("peer") && + post.get("peerhash", "").equals("random")) { + fetcher = new URLFetcher( + env, + profile, + frequency); + } else { + URL url = null; + if (post.get("source", "").equals("url")) { + try { + url = new URL(post.get("host", null)); + if (!savedURLs.contains(url.toNormalform())) + savedURLs.add(url.toNormalform()); + prop.put("host", post.get("host", url.toString())); + } catch (MalformedURLException e) { + prop.put("host", post.get("host", "")); + prop.put("hostError", ERR_HOST_MALFORMED_URL); + } + } else if (post.get("source", "").equals("peer")) { + yacySeed ys = null; + try { + ys = yacyCore.seedDB.getConnected(post.get("peerhash", "")); + if (ys != null) { + url = new URL("http://" + ys.getAddress() + "/yacy/urllist.html"); + } else { + prop.put("peerError", ERR_PEER_OFFLINE); + prop.put("peerError_hash", post.get("peerhash", "")); + } + } catch (MalformedURLException e) { + prop.put("peerError", ERR_PEER_GENERAL_CONN); + prop.put("peerError_hash", post.get("peerhash", "")); + prop.put("peerError_name", ys.getName()); + } + } + + if (url != null) { + fetcher = new URLFetcher( + env, + profile, + url, + frequency); + } + } + if 
(fetcher != null) + fetcher.start(); + } + else if (post.containsKey("stop")) { + if (fetcher != null) { + fetcher.interrupt(); + } else { + prop.put("threadError", ERR_THREAD_STOP); + } + } + else if (post.containsKey("restart") || post.containsKey("resume")) { + if (fetcher != null) { + if (fetcher.url == null) { + fetcher = new URLFetcher( + env, + profile, + fetcher.delay); + } else { + fetcher = new URLFetcher( + env, + profile, + fetcher.url, + fetcher.delay); + } + fetcher.start(); + } else { + prop.put("threadError", ERR_THREAD_RESUME); + } + } + } + + if (fetcher != null) { + prop.put("runs", 1); + prop.put("runs_status", (fetcher.isAlive()) ? STAT_THREAD_ALIVE : + (fetcher.paused) ? STAT_THREAD_PAUSED : STAT_THREAD_STOPPED); + prop.put("runs_totalRuns", URLFetcher.totalRuns); + prop.put("runs_totalFetchedURLs", URLFetcher.totalFetchedURLs); + prop.put("runs_totalFailedURLs", URLFetcher.totalFailed); + prop.put("runs_lastRun", fetcher.lastRun); + prop.put("runs_lastFetchedURLs", fetcher.lastFetchedURLs); + prop.put("runs_lastServerResponse", (fetcher.lastServerResponse == null) + ? 
"" : fetcher.lastServerResponse); + + Iterator it = fetcher.failed.keySet().iterator(); + int i = 0; + Object key; + while (it.hasNext()) { + key = it.next(); + prop.put("runs_error_" + i + "_reason", fetcher.failed.get(key)); + prop.put("runs_error_" + i + "_url", (String)key); + i++; + } + prop.put("runs_error", i); + } + + return prop; + } + + private static int listURLs(serverObjects prop) { + if (savedURLs.size() == 0) return 0; + prop.put("saved", 1); + for (int i=0; i 0) { prop.put("peersKnown", 1); - int peerCount = 0; try { TreeMap hostList = new TreeMap(); final Enumeration e = yacyCore.seedDB.seedsConnected(true, false, null, (float) 0.0); @@ -50,65 +210,7 @@ public class CrawlURLFetch_p { } else { prop.put("peersKnown", 0); } - - if (post != null) { - if (post.containsKey("start")) { - try { - - long frequency = -1; - if (post.containsKey("regularly")) - frequency = getDate(post.get("frequency", ""), post.get("freq_type", "")); - - String t = post.get("type", "text"); - int type = -1; - if (t.equals("text")) { - type = URLFetcher.TYPE_TEXT; - } else if (t.equals("xml")) { - type = URLFetcher.TYPE_XML; - } - - URL url = new URL(post.get("host", null)); - prop.put("host", post.get("host", "")); - - if (type > -1) { - if (frequency > -1) { - fetcher = new URLFetcher( - env, - ((plasmaSwitchboard)env).defaultProxyProfile, - url, - frequency, - type); - } else { // only fetch once - fetcher = new URLFetcher( - env, - ((plasmaSwitchboard)env).defaultProxyProfile, - url, - type); - } - fetcher.start(); - } - } catch (MalformedURLException e) { - prop.put("host", post.get("host", "")); - prop.put("hostError", 1); - } - } else if (post.containsKey("stop")) { - fetcher.interrupt(); - } - } - - if (fetcher != null) { - prop.put("runs", 1); - prop.put("runs_status", (fetcher.isRunning()) ? 0 : (fetcher.isPaused()) ? 
2 : 1); - prop.put("runs_totalRuns", URLFetcher.totalRuns); - prop.put("runs_totalFetchedURLs", URLFetcher.totalFetchedURLs); - prop.put("runs_totalFailedURLs", URLFetcher.totalFailed); - prop.put("runs_lastRun", URLFetcher.lastRun); - prop.put("runs_lastFetchedURLs", URLFetcher.lastFetchedURLs); - prop.put("runs_lastServerResponse", (URLFetcher.lastServerResponse == null) - ? "" : URLFetcher.lastServerResponse); - } - - return prop; + return peerCount; } private static long getDate(String count, String type) { @@ -116,7 +218,7 @@ public class CrawlURLFetch_p { if (count != null && count.matches("\\d+")) r = Long.parseLong(count); if (r < 1) return -1; - r *= 3600 * 24; + r *= 3600; if (type.equals("weeks")) return r * 24 * 7; else if (type.equals("days")) return r * 24; else if (type.equals("hours")) return r; @@ -125,120 +227,161 @@ public class CrawlURLFetch_p { public static class URLFetcher extends Thread { - public static final int TYPE_TEXT = 0; - public static final int TYPE_XML = 1; + public static final long DELAY_ONCE = -1; + public static final long DELAY_SELF_DET = 0; - public static int lastFetchedURLs = 0; - public static long lastRun = 0; - public static String lastServerResponse = null; - public static int lastFailed = 0; public static int totalRuns = 0; public static int totalFetchedURLs = 0; public static int totalFailed = 0; - private final URL url; - private final long delay; - private final int type; - private final plasmaSwitchboard sb; - private final plasmaCrawlProfile.entry profile; + public final HashMap failed = new HashMap(); - private boolean running = false; - private boolean paused = false; + public int lastFetchedURLs = 0; + public long lastRun = 0; + public String lastServerResponse = null; + public int lastFailed = 0; + + public final URL url; + public final long delay; + public final plasmaSwitchboard sb; + public final plasmaCrawlProfile.entry profile; + + public boolean paused = false; public URLFetcher( serverSwitch env, 
plasmaCrawlProfile.entry profile, URL url, - int type) { - this.sb = (plasmaSwitchboard)env; - this.profile = profile; - this.url = url; - this.type = type; - this.delay = 0; - this.setName("URL-Fetcher"); - } - - public URLFetcher( - serverSwitch env, - plasmaCrawlProfile.entry profile, - URL url, - long delayMs, - int type) { + long delayMs) { + if (env == null || profile == null || url == null) + throw new NullPointerException("env, profile or url must not be null"); this.sb = (plasmaSwitchboard)env; this.profile = profile; this.url = url; this.delay = delayMs; - this.type = type; - this.setName("URL-Fetcher"); + this.setName("URLFetcher"); } - public boolean isRunning() { return this.running; } - public boolean isPaused() { return this.paused; } + public URLFetcher( + serverSwitch env, + plasmaCrawlProfile.entry profile, + long delayMs) { + if (env == null || profile == null) + throw new NullPointerException("env or profile must not be null"); + this.sb = (plasmaSwitchboard)env; + this.profile = profile; + this.url = null; + this.delay = delayMs; + this.setName("URLFetcher"); + } public void run() { - this.running = true; this.paused = false; long start; - do { + URL url; + while (!isInterrupted()) { try { start = System.currentTimeMillis(); - totalFetchedURLs += addURLs(); + url = getDLURL(); + if (url == null) { + serverLog.logSevere(this.getName(), "canceled because no valid URL for the URL-list could be determinded"); + break; + } + totalFetchedURLs += stackURLs(getURLs(url)); lastRun = System.currentTimeMillis() - start; totalRuns++; - this.paused = true; - this.wait(this.delay); + if (this.delay < 0) { + break; + } else if (this.delay == 0) { + this.paused = true; + while (this.paused) this.wait(); + } else { + this.paused = true; + this.wait(this.delay); + } this.paused = false; } catch (InterruptedException e) { break; } - } while (!isInterrupted() && this.delay > 0); - this.running = false; + } } - private int addURLs() throws InterruptedException { - 
String[] urls = getURLs(); - lastFailed = 0; + private URL getDLURL() { + if (this.url != null) return this.url; + + // choose random seed + yacySeed ys = null; + Enumeration e = yacyCore.seedDB.seedsConnected(true, false, null, 0F); + int num = new Random().nextInt(yacyCore.seedDB.sizeConnected()) + 1; + Object o; + for (int i=0; i