Mirror of https://github.com/yacy/yacy_search_server.git
- enabled fetching new crawls via /yacy/list.html?list=queueUrls for testing purposes
- sent URLs are taken off the limit-stack (of the global crawl trigger) (may be moved somewhere else in future versions)
- added option to set the requested chunk-size

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3367 6c8d7289-2bf4-0310-a012-ef5d649a1542
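The wire format of the new list servlet is plain text: a GET on /yacy/list.html?list=queueUrls returns one URL per line, and the count parameter caps the chunk size. Below is a minimal client sketch under those assumptions, using plain java.net instead of YaCy's internal httpc class; the class name QueueUrlsClient and the peerAddress parameter are illustrative placeholders, not part of this commit.

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

public class QueueUrlsClient {

    // Fetches up to count URLs from a peer's remote-crawl queue; the servlet
    // path and the count parameter match the ones added in this commit.
    public static List<String> fetchQueueUrls(String peerAddress, int count) throws Exception {
        URL url = new URL("http://" + peerAddress + "/yacy/list.html?list=queueUrls&count=" + count);
        List<String> urls = new ArrayList<String>();
        BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream(), "US-ASCII"));
        try {
            String line;
            while ((line = in.readLine()) != null) {
                // skip blank lines, as URLFetcher.stackURLs does
                if (line.trim().length() > 0) urls.add(line);
            }
        } finally {
            in.close();
        }
        return urls;
    }
}

In the actual patch below, URLFetcher builds the same request URL from its LIST_SERVLET constant and stacks each returned URL onto the local crawler via stackCrawl.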
parent 67d96249b4
commit e6ddf135bb
@@ -33,11 +33,13 @@
 #(peersKnown)#::
 <dt><label for="peer">Fetch from Peer</label>:</dt>
 <dd>
-<input type="radio" name="source" value="peer" id="peer" disabled="disabled" />
-<select name="peerhash" disabled="disabled">
+<input type="radio" name="source" value="peer" id="peer" />
+<select name="peerhash">
 <option value="random" selected="selected">Choose a random peer</option>#{peers}#
 <option value="#[hash]#">#[name]#</option>#{/peers}#
 </select>
+<label for="amount">Amount of URLs to request</label>:
+<input type="text" name="amount" id="amount" value="50" maxlength="3" size="3" />
 #(peerError)#::
 <span class="error">Error fetching URL-list from <span class="tt">#[hash]#:#[name]#</span></span>::
 <span class="error">Peer with hash <span class="tt">#[hash]#</span> doesn't seem to be online anymore</span>#(/peerError)#
@@ -49,7 +51,7 @@
 <input type="radio" name="reg" value="self_det" id="self_det" disabled="disabled"/> <label for="self_det">Fetch when queue is empty</label><br />
 <input type="radio" name="reg" value="delay" id="delay" /> <label for="delay">Fetch in a specified delay</label>:
 <label for="frequency">every</label>
-<input type="text" name="frequency" id="frequency" text-align="left" size="2" style="text-align: right;" maxlength="2"/>
+<input type="text" name="frequency" id="frequency" size="2" style="text-align: right;" maxlength="2"/>
 <select name="freq_type">
 <option value="weeks">Weeks</option>
 <option value="days" selected="selected">Days</option>
@@ -80,7 +82,7 @@
 <dd>
 #[error]#
 <ul>#{error}#
-<li><span class="error">#[reason]#</span>: <a href="#[url]#">#[url]#</a></li>#{/error}#
+<li><span class="error">#[reason]#</span>: <a href="#[url]#">#[url]#</a></li>#{/error}#
 </ul>
 </dd>
 <dt>#(status)#
@@ -37,6 +37,8 @@ public class CrawlURLFetch_p {
     private static final long STAT_THREAD_STOPPED = 1;
     private static final long STAT_THREAD_PAUSED = 2;
 
+    public static final float MIN_PEER_VERSION_LIST_SERVLET = 0.504033F;
+
     private static URLFetcher fetcher = null;
     private static plasmaCrawlProfile.entry profile = null;
     private static ArrayList savedURLs = new ArrayList();
@@ -74,6 +76,12 @@ public class CrawlURLFetch_p {
             }
         }
 
+        int count = 50;
+        if (post.get("amount", "").matches("\\d+")) {
+            count = Integer.parseInt(post.get("amount", ""));
+            if (count > 999) count = 999;
+        }
+
         if (fetcher != null) fetcher.interrupt();
         fetcher = null;
         if (post.get("source", "").equals("peer") &&
@@ -81,6 +89,7 @@ public class CrawlURLFetch_p {
                 fetcher = new URLFetcher(
                         env,
                         profile,
+                        count,
                         frequency);
             } else {
                 URL url = null;
@@ -103,9 +112,9 @@ public class CrawlURLFetch_p {
             } else if (post.get("source", "").equals("peer")) {
                 yacySeed ys = null;
                 try {
-                    ys = yacyCore.seedDB.getConnected(post.get("peerhash", ""));
+                    ys = yacyCore.seedDB.get(post.get("peerhash", ""));
                     if (ys != null) {
-                        url = new URL("http://" + ys.getAddress() + "/yacy/urllist.html");
+                        url = new URL("http://" + ys.getAddress() + URLFetcher.LIST_SERVLET);
                     } else {
                         prop.put("peerError", ERR_PEER_OFFLINE);
                         prop.put("peerError_hash", post.get("peerhash", ""));
@@ -122,6 +131,7 @@ public class CrawlURLFetch_p {
                             env,
                             profile,
                             url,
+                            count,
                             frequency);
                 }
             }
@@ -142,12 +152,14 @@ public class CrawlURLFetch_p {
                 fetcher = new URLFetcher(
                         env,
                         profile,
+                        fetcher.count,
                         fetcher.delay);
             } else {
                 fetcher = new URLFetcher(
                         env,
                         profile,
                         fetcher.url,
+                        fetcher.count,
                         fetcher.delay);
             }
             fetcher.start();
@@ -200,7 +212,7 @@ public class CrawlURLFetch_p {
         prop.put("peersKnown", 1);
         try {
             TreeMap hostList = new TreeMap();
-            final Enumeration e = yacyCore.seedDB.seedsConnected(true, false, null, (float) 0.0);
+            final Enumeration e = yacyCore.seedDB.seedsConnected(true, false, null, MIN_PEER_VERSION_LIST_SERVLET);
             while (e.hasMoreElements()) {
                 yacySeed seed = (yacySeed) e.nextElement();
                 if (seed != null) hostList.put(seed.get(yacySeed.NAME, "nameless"),seed.hash);
@@ -209,6 +221,7 @@ public class CrawlURLFetch_p {
             String peername;
             while ((peername = (String) hostList.firstKey()) != null) {
                 final String Hash = (String) hostList.get(peername);
+                if (Hash.equals(yacyCore.seedDB.mySeed.hash)) continue;
                 prop.put("peersKnown_peers_" + peerCount + "_hash", Hash);
                 prop.put("peersKnown_peers_" + peerCount + "_name", peername);
                 hostList.remove(peername);
@@ -239,6 +252,8 @@ public class CrawlURLFetch_p {
         public static final long DELAY_ONCE = -1;
         public static final long DELAY_SELF_DET = 0;
 
+        private static final String LIST_SERVLET = "/yacy/list.html?list=queueUrls";
+
         public static int totalRuns = 0;
         public static int totalFetchedURLs = 0;
         public static int totalFailed = 0;
@@ -251,6 +266,7 @@ public class CrawlURLFetch_p {
         public int lastFailed = 0;
 
         public final URL url;
+        public final int count;
         public final long delay;
         public final plasmaSwitchboard sb;
         public final plasmaCrawlProfile.entry profile;
@@ -261,12 +277,14 @@ public class CrawlURLFetch_p {
                 serverSwitch env,
                 plasmaCrawlProfile.entry profile,
                 URL url,
+                int count,
                 long delayMs) {
             if (env == null || profile == null || url == null)
                 throw new NullPointerException("env, profile or url must not be null");
             this.sb = (plasmaSwitchboard)env;
             this.profile = profile;
             this.url = url;
+            this.count = count;
             this.delay = delayMs;
             this.setName("URLFetcher");
         }
@@ -274,12 +292,14 @@ public class CrawlURLFetch_p {
         public URLFetcher(
                 serverSwitch env,
                 plasmaCrawlProfile.entry profile,
+                int count,
                 long delayMs) {
             if (env == null || profile == null)
                 throw new NullPointerException("env or profile must not be null");
             this.sb = (plasmaSwitchboard)env;
             this.profile = profile;
             this.url = null;
+            this.count = count;
             this.delay = delayMs;
             this.setName("URLFetcher");
         }
@@ -297,7 +317,7 @@ public class CrawlURLFetch_p {
                     return;
                 }
                 totalFetchedURLs += stackURLs(getURLs(url));
-                lastRun = System.currentTimeMillis() - start;
+                this.lastRun = System.currentTimeMillis() - start;
                 totalRuns++;
                 if (this.delay < 0 || isInterrupted()) {
                     return;
@@ -320,7 +340,7 @@ public class CrawlURLFetch_p {
 
             // choose random seed
             yacySeed ys = null;
-            Enumeration e = yacyCore.seedDB.seedsConnected(true, false, null, 0F);
+            Enumeration e = yacyCore.seedDB.seedsConnected(true, false, null, MIN_PEER_VERSION_LIST_SERVLET);
             int num = new Random().nextInt(yacyCore.seedDB.sizeConnected()) + 1;
             Object o;
             for (int i=0; i<num && e.hasMoreElements(); i++) {
@@ -330,16 +350,18 @@ public class CrawlURLFetch_p {
             if (ys == null) return null;
 
             try {
-                return new URL("http://" + ys.getAddress() + "/yacy/urllist.html");
+                return new URL("http://" + ys.getAddress() + LIST_SERVLET + "&count=" + this.count);
             } catch (MalformedURLException ee) { return null; }
         }
 
         private int stackURLs(String[] urls) throws InterruptedException {
             this.lastFailed = 0;
+            this.lastFetchedURLs = 0;
             if (urls == null) return 0;
             String reason;
             for (int i=0; i<urls.length && !isInterrupted(); i++) {
-                serverLog.logFinest(this.getName(), "stacking " + urls[i]);
+                if (urls[i].trim().length() == 0) continue;
+                serverLog.logFine(this.getName(), "stacking " + urls[i]);
                 reason = this.sb.sbStackCrawlThread.stackCrawl(
                         urls[i],
                         null,
@@ -348,7 +370,9 @@ public class CrawlURLFetch_p {
                         new Date(),
                         this.profile.generalDepth(),
                         this.profile);
-                if (reason != null) {
+                if (reason == null) {
+                    this.lastFetchedURLs++;
+                } else {
                     this.lastFailed++;
                     totalFailed++;
                     this.failed.put(urls[i], reason);
@@ -366,7 +390,7 @@ public class CrawlURLFetch_p {
                     } catch (MalformedURLException e) { }
                 }
             }
-            return urls.length - this.lastFailed;
+            return this.lastFetchedURLs;
         }
 
         private String[] getURLs(URL url) {
@@ -384,7 +408,7 @@ public class CrawlURLFetch_p {
             header.put(httpHeader.ACCEPT_ENCODING, "US-ASCII");
             header.put(httpHeader.HOST, url.getHost());
 
-            httpc.response res = con.GET(url.getPath(), header);
+            httpc.response res = con.GET(url.getPath() + "?" + url.getQuery(), header);
             serverLog.logFine(this.getName(), "downloaded URL-list from " + url + " (" + res.statusCode + ")");
             this.lastServerResponse = res.statusCode + " (" + res.statusText + ")";
             if (res.status.startsWith("2")) {
@@ -49,18 +49,24 @@
 // if the shell's current path is HTROOT
 
 import java.io.File;
+import java.io.IOException;
 
 import de.anomic.data.listManager;
+import de.anomic.data.wikiCode;
 import de.anomic.http.httpHeader;
+import de.anomic.plasma.plasmaCrawlNURL;
+import de.anomic.plasma.plasmaSwitchboard;
 import de.anomic.server.serverCore;
 import de.anomic.server.serverObjects;
 import de.anomic.server.serverSwitch;
+import de.anomic.server.logging.serverLog;
 
 public final class list {
 
     public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch ss) {
-        if (post == null || ss == null ) { return null; }
+        if (post == null || ss == null)
+            throw new NullPointerException("post: " + post + ", sb: " + ss);
 
         // return variable that accumulates replacements
         final serverObjects prop = new serverObjects();
 
@@ -83,11 +89,27 @@ public final class list {
             } // if filenamesarray.length > 0
 
             prop.put("list",out);
+        } else if (col.length() == 0 && post.get("list", "").equals("queueUrls")) {
+            // list urls from remote crawler queue for other peers
+            int count = 50;
+            if (post.get("count", "").length() > 0 && post.get("count", "").matches("\\d+"))
+                count = Integer.parseInt(post.get("count", ""));
+            final StringBuffer sb = new StringBuffer();
+            plasmaCrawlNURL.Entry entry;
+            for (int i=0; i<count && count - i<((plasmaSwitchboard)ss).noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT); i++) {
+                try {
+                    entry = ((plasmaSwitchboard)ss).noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT);
+                    sb.append(wikiCode.deReplaceHTMLEntities(entry.url().toNormalform())).append("\n");
+                } catch (IOException e) {
+                    serverLog.logSevere("/yacy/list.html", "CANNOT FETCH ENTRY " + i + "/" + count + ": " + e.getMessage());
+                }
+            }
+            prop.put("list", sb);
         } else {
             prop.putASIS("list","");
         }
 
         return prop;
     }
 
 }
|
Loading…
Reference in New Issue
Block a user