- enabled fetching URLs for new crawls via /yacy/list.html?list=queueUrls (for testing purposes)

- URLs sent to a requesting peer are taken off the limit-stack of the global crawl trigger (this may be moved elsewhere in future versions)
- added an option to set the requested chunk size
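A rough sketch of how the pieces fit together (hypothetical peer address and class name; the real client is the URLFetcher thread in CrawlURLFetch_p, which hands each URL to sbStackCrawlThread.stackCrawl): the requesting peer issues a GET against the new servlet and reads back one URL per line.

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;

public class QueueUrlsFetchSketch {
    public static void main(String[] args) throws Exception {
        // hypothetical peer address; YaCy resolves this via yacySeed.getAddress()
        final String peer = "192.0.2.1:8080";
        // requested chunk size ("amount" in the form, capped at 999 by the servlet page)
        final int count = 50;
        // the new list servlet returns one URL per line, popped off the remote limit-stack
        final URL list = new URL("http://" + peer + "/yacy/list.html?list=queueUrls&count=" + count);
        final BufferedReader in = new BufferedReader(new InputStreamReader(list.openStream()));
        int stacked = 0;
        String line;
        while ((line = in.readLine()) != null) {
            if (line.trim().length() == 0) continue; // stackURLs() skips empty lines, too
            stacked++; // a real peer would stack the URL for crawling here
        }
        in.close();
        System.out.println("fetched " + stacked + " URLs for crawling");
    }
}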

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3367 6c8d7289-2bf4-0310-a012-ef5d649a1542
karlchenofhell 2007-02-14 14:50:55 +00:00
parent 67d96249b4
commit e6ddf135bb
3 changed files with 65 additions and 17 deletions

View File: htroot/CrawlURLFetch_p.html

@@ -33,11 +33,13 @@
#(peersKnown)#::
<dt><label for="peer">Fetch from Peer</label>:</dt>
<dd>
-<input type="radio" name="source" value="peer" id="peer" disabled="disabled" />
-<select name="peerhash" disabled="disabled">
+<input type="radio" name="source" value="peer" id="peer" />
+<select name="peerhash">
<option value="random" selected="selected">Choose a random peer</option>#{peers}#
<option value="#[hash]#">#[name]#</option>#{/peers}#
</select>
+&nbsp;<label for="amount">Amount of URLs to request</label>:
+<input type="text" name="amount" id="amount" value="50" maxlength="3" size="3" />
#(peerError)#::
&nbsp;<span class="error">Error fetching URL-list from <span class="tt">#[hash]#:#[name]#</span></span>::
&nbsp;<span class="error">Peer with hash <span class="tt">#[hash]#</span> doesn't seem to be online anymore</span>#(/peerError)#
@@ -49,7 +51,7 @@
<input type="radio" name="reg" value="self_det" id="self_det" disabled="disabled"/> <label for="self_det">Fetch when queue is empty</label><br />
<input type="radio" name="reg" value="delay" id="delay" /> <label for="delay">Fetch in a specified delay</label>:
<label for="frequency">every</label>
-&nbsp;<input type="text" name="frequency" id="frequency" text-align="left" size="2" style="text-align: right;" maxlength="2"/>
+&nbsp;<input type="text" name="frequency" id="frequency" size="2" style="text-align: right;" maxlength="2"/>
<select name="freq_type">
<option value="weeks">Weeks</option>
<option value="days" selected="selected">Days</option>
@@ -80,7 +82,7 @@
<dd>
#[error]#
<ul>#{error}#
-<li><span class="error">#[reason]#</span>: <a href="#[url]#">#[url]#</a></li>#{/error}#
+<li><span class="error">#[reason]#</span>: <a href="#[url]#">#[url]#</a></li>#{/error}#
</ul>
</dd>
<dt>#(status)#

View File: htroot/CrawlURLFetch_p.java

@@ -37,6 +37,8 @@ public class CrawlURLFetch_p {
private static final long STAT_THREAD_STOPPED = 1;
private static final long STAT_THREAD_PAUSED = 2;
+public static final float MIN_PEER_VERSION_LIST_SERVLET = 0.504033F;
private static URLFetcher fetcher = null;
private static plasmaCrawlProfile.entry profile = null;
private static ArrayList savedURLs = new ArrayList();
@@ -74,6 +76,12 @@
}
}
+int count = 50;
+if (post.get("amount", "").matches("\\d+")) {
+count = Integer.parseInt(post.get("amount", ""));
+if (count > 999) count = 999;
+}
if (fetcher != null) fetcher.interrupt();
fetcher = null;
if (post.get("source", "").equals("peer") &&
@@ -81,6 +89,7 @@
fetcher = new URLFetcher(
env,
profile,
+count,
frequency);
} else {
URL url = null;
@@ -103,9 +112,9 @@
} else if (post.get("source", "").equals("peer")) {
yacySeed ys = null;
try {
-ys = yacyCore.seedDB.getConnected(post.get("peerhash", ""));
+ys = yacyCore.seedDB.get(post.get("peerhash", ""));
if (ys != null) {
-url = new URL("http://" + ys.getAddress() + "/yacy/urllist.html");
+url = new URL("http://" + ys.getAddress() + URLFetcher.LIST_SERVLET);
} else {
prop.put("peerError", ERR_PEER_OFFLINE);
prop.put("peerError_hash", post.get("peerhash", ""));
@@ -122,6 +131,7 @@
env,
profile,
url,
+count,
frequency);
}
}
@@ -142,12 +152,14 @@
fetcher = new URLFetcher(
env,
profile,
+fetcher.count,
fetcher.delay);
} else {
fetcher = new URLFetcher(
env,
profile,
fetcher.url,
+fetcher.count,
fetcher.delay);
}
fetcher.start();
@@ -200,7 +212,7 @@
prop.put("peersKnown", 1);
try {
TreeMap hostList = new TreeMap();
-final Enumeration e = yacyCore.seedDB.seedsConnected(true, false, null, (float) 0.0);
+final Enumeration e = yacyCore.seedDB.seedsConnected(true, false, null, MIN_PEER_VERSION_LIST_SERVLET);
while (e.hasMoreElements()) {
yacySeed seed = (yacySeed) e.nextElement();
if (seed != null) hostList.put(seed.get(yacySeed.NAME, "nameless"),seed.hash);
@@ -209,6 +221,7 @@
String peername;
while ((peername = (String) hostList.firstKey()) != null) {
final String Hash = (String) hostList.get(peername);
+if (Hash.equals(yacyCore.seedDB.mySeed.hash)) continue;
prop.put("peersKnown_peers_" + peerCount + "_hash", Hash);
prop.put("peersKnown_peers_" + peerCount + "_name", peername);
hostList.remove(peername);
@@ -239,6 +252,8 @@
public static final long DELAY_ONCE = -1;
public static final long DELAY_SELF_DET = 0;
+private static final String LIST_SERVLET = "/yacy/list.html?list=queueUrls";
public static int totalRuns = 0;
public static int totalFetchedURLs = 0;
public static int totalFailed = 0;
@@ -251,6 +266,7 @@
public int lastFailed = 0;
public final URL url;
+public final int count;
public final long delay;
public final plasmaSwitchboard sb;
public final plasmaCrawlProfile.entry profile;
@@ -261,12 +277,14 @@
serverSwitch env,
plasmaCrawlProfile.entry profile,
URL url,
+int count,
long delayMs) {
if (env == null || profile == null || url == null)
throw new NullPointerException("env, profile or url must not be null");
this.sb = (plasmaSwitchboard)env;
this.profile = profile;
this.url = url;
+this.count = count;
this.delay = delayMs;
this.setName("URLFetcher");
}
@@ -274,12 +292,14 @@
public URLFetcher(
serverSwitch env,
plasmaCrawlProfile.entry profile,
+int count,
long delayMs) {
if (env == null || profile == null)
throw new NullPointerException("env or profile must not be null");
this.sb = (plasmaSwitchboard)env;
this.profile = profile;
this.url = null;
+this.count = count;
this.delay = delayMs;
this.setName("URLFetcher");
}
@@ -297,7 +317,7 @@
return;
}
totalFetchedURLs += stackURLs(getURLs(url));
-lastRun = System.currentTimeMillis() - start;
+this.lastRun = System.currentTimeMillis() - start;
totalRuns++;
if (this.delay < 0 || isInterrupted()) {
return;
@@ -320,7 +340,7 @@
// choose random seed
yacySeed ys = null;
-Enumeration e = yacyCore.seedDB.seedsConnected(true, false, null, 0F);
+Enumeration e = yacyCore.seedDB.seedsConnected(true, false, null, MIN_PEER_VERSION_LIST_SERVLET);
int num = new Random().nextInt(yacyCore.seedDB.sizeConnected()) + 1;
Object o;
for (int i=0; i<num && e.hasMoreElements(); i++) {
@@ -330,16 +350,18 @@
if (ys == null) return null;
try {
-return new URL("http://" + ys.getAddress() + "/yacy/urllist.html");
+return new URL("http://" + ys.getAddress() + LIST_SERVLET + "&count=" + this.count);
} catch (MalformedURLException ee) { return null; }
}
private int stackURLs(String[] urls) throws InterruptedException {
this.lastFailed = 0;
+this.lastFetchedURLs = 0;
if (urls == null) return 0;
String reason;
for (int i=0; i<urls.length && !isInterrupted(); i++) {
-serverLog.logFinest(this.getName(), "stacking " + urls[i]);
+if (urls[i].trim().length() == 0) continue;
+serverLog.logFine(this.getName(), "stacking " + urls[i]);
reason = this.sb.sbStackCrawlThread.stackCrawl(
urls[i],
null,
@@ -348,7 +370,9 @@
new Date(),
this.profile.generalDepth(),
this.profile);
-if (reason != null) {
+if (reason == null) {
+this.lastFetchedURLs++;
+} else {
this.lastFailed++;
totalFailed++;
this.failed.put(urls[i], reason);
@@ -366,7 +390,7 @@
} catch (MalformedURLException e) { }
}
}
-return urls.length - this.lastFailed;
+return this.lastFetchedURLs;
}
private String[] getURLs(URL url) {
@@ -384,7 +408,7 @@
header.put(httpHeader.ACCEPT_ENCODING, "US-ASCII");
header.put(httpHeader.HOST, url.getHost());
-httpc.response res = con.GET(url.getPath(), header);
+httpc.response res = con.GET(url.getPath() + "?" + url.getQuery(), header);
serverLog.logFine(this.getName(), "downloaded URL-list from " + url + " (" + res.statusCode + ")");
this.lastServerResponse = res.statusCode + " (" + res.statusText + ")";
if (res.status.startsWith("2")) {

View File: htroot/yacy/list.java

@@ -49,18 +49,24 @@
// if the shell's current path is HTROOT
import java.io.File;
+import java.io.IOException;
import de.anomic.data.listManager;
+import de.anomic.data.wikiCode;
import de.anomic.http.httpHeader;
+import de.anomic.plasma.plasmaCrawlNURL;
+import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverCore;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
+import de.anomic.server.logging.serverLog;
public final class list {
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch ss) {
-if (post == null || ss == null ) { return null; }
+if (post == null || ss == null)
+throw new NullPointerException("post: " + post + ", sb: " + ss);
// return variable that accumulates replacements
final serverObjects prop = new serverObjects();
@@ -83,11 +89,27 @@ public final class list {
} // if filenamesarray.length > 0
prop.put("list",out);
+} else if (col.length() == 0 && post.get("list", "").equals("queueUrls")) {
+// list urls from remote crawler queue for other peers
+int count = 50;
+if (post.get("count", "").length() > 0 && post.get("count", "").matches("\\d+"))
+count = Integer.parseInt(post.get("count", ""));
+final StringBuffer sb = new StringBuffer();
+plasmaCrawlNURL.Entry entry;
+for (int i=0; i<count && count - i<((plasmaSwitchboard)ss).noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT); i++) {
+try {
+entry = ((plasmaSwitchboard)ss).noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT);
+sb.append(wikiCode.deReplaceHTMLEntities(entry.url().toNormalform())).append("\n");
+} catch (IOException e) {
+serverLog.logSevere("/yacy/list.html", "CANNOT FETCH ENTRY " + i + "/" + count + ": " + e.getMessage());
+}
+}
+prop.put("list", sb);
} else {
prop.putASIS("list","");
}
return prop;
}
}
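For reference, a hypothetical exchange with the new queueUrls branch (example host and URLs, not captured output): a request for three URLs pops up to three entries off the limit-stack and returns them as a plain newline-separated list, which URLFetcher then reads line by line.

GET /yacy/list.html?list=queueUrls&count=3 HTTP/1.1
Host: peer.example.net:8080

http://example.com/one
http://example.net/two
http://example.org/three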