- prepared URL fetch from other peers

- more feedback for the user

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3365 6c8d7289-2bf4-0310-a012-ef5d649a1542
karlchenofhell 2007-02-13 20:18:12 +00:00
parent 661a7bb702
commit c5a2ba3a23
2 changed files with 308 additions and 143 deletions

htroot/CrawlURLFetch_p.html

@@ -7,49 +7,60 @@
<body id="CrawlURLFetch_p">
#%env/templates/header.template%#
<h2>URL-Fetcher</h2>
<p><!-- desc --></p>
<form method="post" action="CrawlURLFetch_p.html" enctype="multipart/form-data">
<p>
All newly added URLs will be crawled using the <span class="tt">Proxy</span> Crawl Profile.
</p>
<fieldset><legend>Fetch new URLs to crawl</legend>
<p>
The newly added URLs will be crawled without any filter restrictions except for the <em>static</em> stop-words.
The Re-Crawl option isn't used and the sites won't be stored in the Proxy Cache. Text and media types will be indexed.
Since these URLs are explicitly requested from another peer, they won't be distributed for remote indexing.
</p>
<dl>
<dt><label for="url">Fetch from URL</label>:</dt>
<dd>
<input type="radio" name="source" value="url" id="url" checked="checked" />
<input type="text" id="host" name="host" size="60" value="#[host]#" />
#(hostError)#::<span class="error">Malformed URL</span>#(/hostError)#
#(saved)#::<br />
Or select a previously entered URL: <select name="savedURL">#{urls}#
<option>#[url]#</option>
</select>#(/saved)#
#(hostError)#::&nbsp;<span class="error">Malformed URL</span>#(/hostError)#
</dd>
#(peersKnown)#::
<dt><label for="peer">Fetch from Peer</label>:</dt>
<dd>
<input type="radio" name="source" value="peer" id="peer" disabled="disabled" />
<select name="peerhash" disabled="disabled">#{peers}#
<select name="peerhash" disabled="disabled">
<option value="random" selected="selected">Choose a random peer</option>#{peers}#
<option value="#[hash]#">#[name]#</option>#{/peers}#
</select>
#(peerError)#::<span class="error">
#(peerError)#::
&nbsp;<span class="error">Error fetching URL-list from <span class="tt">#[hash]#:#[name]#</span></span>::
&nbsp;<span class="error">Peer with hash <span class="tt">#[hash]#</span> doesn't seem to be online anymore</span>#(/peerError)#
</dd>#(/peersKnown)#
<dt><label for="type">List-type</label>:</dt>
<dt>Frequency:</dt>
<dd>
<select name="type" id="type">
<option value="text">Text</option>
<option value="xml" disabled="disabled">XML</option>
</select>
</dd>
<dt><label for="regularly">Run regularly</label>:</dt>
<dd>
<input type="checkbox" name="regularly" id="regularly" disabled="disabled" />,
<input type="radio" name="reg" value="once" id="once" checked="checked" /> <label for="once">Fetch only once</label><br />
<input type="radio" name="reg" value="self_det" id="self_det" disabled="disabled"/> <label for="self_det">Fetch when queue is empty</label><br />
<input type="radio" name="reg" value="delay" id="delay" /> <label for="delay">Fetch in a specified delay</label>:
<label for="frequency">every</label>
<input type="text" name="frequency" id="frequency" text-align="left" size="5" disabled="disabled" />
<select name="freq_type" disabled="disabled">
&nbsp;<input type="text" name="frequency" id="frequency" size="2" style="text-align: right;" maxlength="2"/>
<select name="freq_type">
<option value="weeks">Weeks</option>
<option value="days" selected="selected">Days</option>
<option value="hours">Hours</option>
</select>
#(freqError)#::&nbsp;<span class="error">Invalid period, fetching only once</span>#(/freqError)#
</dd>
<dt><input type="submit" name="start" value="Fetch URLs" /></dt>
</dl>
</fieldset>
#(threadError)#::
<span class="error">Error on stopping thread, it isn't alive anymore</span>::
<span class="error">Error on restarting thread, it isn't alive anymore</span>#(/threadError)#
#(runs)#::
<fieldset><legend>Thread to fetch URLs is #(status)#running::stopped::paused#(/status)#</legend>
<dl>
@@ -59,7 +70,18 @@
<dt>Total fetched URLs:</dt><dd>#[totalFetchedURLs]#</dd>
<dt>Total failed URLs:</dt><dd>#[totalFailedURLs]#</dd>
<dt>Last fetched URLs:</dt><dd>#[lastFetchedURLs]#</dd>
<dt><input type="submit" name="stop" value="Stop Thread" /></dt>
<dt>Failed URLs:</dt>
<dd>
#[error]#
<ul>#{error}#
<li><span class="error">#[reason]#</span>: #[url]#</li>#{/error}#
</ul>
</dd>
<dt>#(status)#
<input type="submit" name="stop" value="Stop Thread" />::
<input type="submit" name="restart" value="Restart Thread" />::
<input type="submit" name="resume" value="Resume Thread" />#(/status)#
</dt>
</dl>
</fieldset>
#(/runs)#
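A note on how the servlet below fills this template: YaCy's template engine substitutes #[key]# with a value, repeats a #{key}#...#{/key}# block as often as the numeric value stored under "key" says, and picks one of the ::-separated alternatives inside #(key)#...#(/key)# by that number. A minimal sketch with keys taken from this diff — the literal values are illustrative only:

    serverObjects prop = new serverObjects();
    prop.put("hostError", 1);                   // selects the "Malformed URL" alternative
    prop.put("runs_error_0_reason", "denied");  // fills one #{error}# loop entry ...
    prop.put("runs_error_0_url", "http://example.net/");
    prop.put("runs_error", 1);                  // ... and sets the loop count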

htroot/CrawlURLFetch_p.java

@@ -2,33 +2,193 @@
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Random;
import java.util.TreeMap;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverSwitch;
import de.anomic.data.wikiCode;
import de.anomic.http.httpHeader;
import de.anomic.http.httpc;
import de.anomic.server.serverObjects;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed;
public class CrawlURLFetch_p {
private static final long ERR_DATE = 1;
private static final long ERR_HOST_MALFORMED_URL = 1;
private static final long ERR_PEER_GENERAL_CONN = 1;
private static final long ERR_PEER_OFFLINE = 2;
private static final long ERR_THREAD_STOP = 1;
private static final long ERR_THREAD_RESUME = 2;
private static final long STAT_THREAD_ALIVE = 0;
private static final long STAT_THREAD_STOPPED = 1;
private static final long STAT_THREAD_PAUSED = 2;
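// These constants double as the alternative indices of the template's
// #(...)# blocks: e.g. ERR_PEER_OFFLINE = 2 selects the third
// ::-alternative of #(peerError)#, while 0 (the default) leaves the
// block empty.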
private static URLFetcher fetcher = null;
private static plasmaCrawlProfile.entry profile = null;
private static ArrayList savedURLs = new ArrayList();
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
serverObjects prop = new serverObjects();
prop.put("host", "");
listURLs(prop); // List previously saved URLs for easy selection
listPeers(prop); // List known hosts
// List known hosts for message sending
if (profile == null) {
profile = ((plasmaSwitchboard)env).profiles.newEntry(
"URLFetcher", // Name
null, // URL
".*", ".*", // General / specific filter
0, 0, // General / specific depth
-1, -1, -1, // Recrawl / Dom-filter depth / Dom-max-pages
true, // Crawl query
true, true, // Index text / media
false, true, // Store in HT- / TX-Cache
false, // Remote indexing
true, false, false); // Exclude static / dynamic / parent stopwords
}
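// The flags above are what the form's intro text promises: catch-all
// ".*" filters, no re-crawl (-1), text and media indexed, no storage
// in the HT-(proxy-)cache, remote indexing off, and only the static
// stop-words excluded.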
if (post != null) {
if (post.containsKey("start")) {
long frequency = URLFetcher.DELAY_ONCE;
if (post.containsKey("reg")) {
if (post.get("reg", "").equals("self_det")) {
frequency = URLFetcher.DELAY_SELF_DET;
} else if (post.get("reg", "").equals("delay")) {
frequency = getDate(post.get("frequency", ""), post.get("freq_type", ""));
if (frequency == -1)
prop.put("freqError", ERR_DATE);
}
}
fetcher = null;
if (post.get("source", "").equals("peer") &&
post.get("peerhash", "").equals("random")) {
fetcher = new URLFetcher(
env,
profile,
frequency);
} else {
URL url = null;
if (post.get("source", "").equals("url")) {
try {
url = new URL(post.get("host", null));
if (!savedURLs.contains(url.toNormalform()))
savedURLs.add(url.toNormalform());
prop.put("host", post.get("host", url.toString()));
} catch (MalformedURLException e) {
prop.put("host", post.get("host", ""));
prop.put("hostError", ERR_HOST_MALFORMED_URL);
}
} else if (post.get("source", "").equals("peer")) {
yacySeed ys = null;
try {
ys = yacyCore.seedDB.getConnected(post.get("peerhash", ""));
if (ys != null) {
url = new URL("http://" + ys.getAddress() + "/yacy/urllist.html");
} else {
prop.put("peerError", ERR_PEER_OFFLINE);
prop.put("peerError_hash", post.get("peerhash", ""));
}
} catch (MalformedURLException e) {
prop.put("peerError", ERR_PEER_GENERAL_CONN);
prop.put("peerError_hash", post.get("peerhash", ""));
prop.put("peerError_name", ys.getName());
}
}
if (url != null) {
fetcher = new URLFetcher(
env,
profile,
url,
frequency);
}
}
if (fetcher != null)
fetcher.start();
}
else if (post.containsKey("stop")) {
if (fetcher != null) {
fetcher.interrupt();
} else {
prop.put("threadError", ERR_THREAD_STOP);
}
}
else if (post.containsKey("restart") || post.containsKey("resume")) {
if (fetcher != null) {
if (fetcher.url == null) {
fetcher = new URLFetcher(
env,
profile,
fetcher.delay);
} else {
fetcher = new URLFetcher(
env,
profile,
fetcher.url,
fetcher.delay);
}
fetcher.start();
} else {
prop.put("threadError", ERR_THREAD_RESUME);
}
}
}
if (fetcher != null) {
prop.put("runs", 1);
prop.put("runs_status", (fetcher.isAlive()) ? STAT_THREAD_ALIVE :
(fetcher.paused) ? STAT_THREAD_PAUSED : STAT_THREAD_STOPPED);
prop.put("runs_totalRuns", URLFetcher.totalRuns);
prop.put("runs_totalFetchedURLs", URLFetcher.totalFetchedURLs);
prop.put("runs_totalFailedURLs", URLFetcher.totalFailed);
prop.put("runs_lastRun", fetcher.lastRun);
prop.put("runs_lastFetchedURLs", fetcher.lastFetchedURLs);
prop.put("runs_lastServerResponse", (fetcher.lastServerResponse == null)
? "" : fetcher.lastServerResponse);
Iterator it = fetcher.failed.keySet().iterator();
int i = 0;
Object key;
while (it.hasNext()) {
key = it.next();
prop.put("runs_error_" + i + "_reason", fetcher.failed.get(key));
prop.put("runs_error_" + i + "_url", (String)key);
i++;
}
prop.put("runs_error", i);
}
return prop;
}
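// A fetch-once run triggered from the form thus reduces to (sketch,
// URL illustrative):
//   fetcher = new URLFetcher(env, profile,
//       new URL("http://example.net/urls.txt"), URLFetcher.DELAY_ONCE);
//   fetcher.start();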
private static int listURLs(serverObjects prop) {
if (savedURLs.size() == 0) return 0;
prop.put("saved", 1);
for (int i=0; i<savedURLs.size(); i++)
prop.put("saved_urls_" + i + "url", savedURLs.get(i));
prop.put("saved_urls", savedURLs.size());
return savedURLs.size();
}
private static int listPeers(serverObjects prop) {
int peerCount = 0;
if (yacyCore.seedDB != null && yacyCore.seedDB.sizeConnected() > 0) {
prop.put("peersKnown", 1);
int peerCount = 0;
try {
TreeMap hostList = new TreeMap();
final Enumeration e = yacyCore.seedDB.seedsConnected(true, false, null, (float) 0.0);
@@ -50,65 +210,7 @@ public class CrawlURLFetch_p {
} else {
prop.put("peersKnown", 0);
}
if (post != null) {
if (post.containsKey("start")) {
try {
long frequency = -1;
if (post.containsKey("regularly"))
frequency = getDate(post.get("frequency", ""), post.get("freq_type", ""));
String t = post.get("type", "text");
int type = -1;
if (t.equals("text")) {
type = URLFetcher.TYPE_TEXT;
} else if (t.equals("xml")) {
type = URLFetcher.TYPE_XML;
}
URL url = new URL(post.get("host", null));
prop.put("host", post.get("host", ""));
if (type > -1) {
if (frequency > -1) {
fetcher = new URLFetcher(
env,
((plasmaSwitchboard)env).defaultProxyProfile,
url,
frequency,
type);
} else { // only fetch once
fetcher = new URLFetcher(
env,
((plasmaSwitchboard)env).defaultProxyProfile,
url,
type);
}
fetcher.start();
}
} catch (MalformedURLException e) {
prop.put("host", post.get("host", ""));
prop.put("hostError", 1);
}
} else if (post.containsKey("stop")) {
fetcher.interrupt();
}
}
if (fetcher != null) {
prop.put("runs", 1);
prop.put("runs_status", (fetcher.isRunning()) ? 0 : (fetcher.isPaused()) ? 2 : 1);
prop.put("runs_totalRuns", URLFetcher.totalRuns);
prop.put("runs_totalFetchedURLs", URLFetcher.totalFetchedURLs);
prop.put("runs_totalFailedURLs", URLFetcher.totalFailed);
prop.put("runs_lastRun", URLFetcher.lastRun);
prop.put("runs_lastFetchedURLs", URLFetcher.lastFetchedURLs);
prop.put("runs_lastServerResponse", (URLFetcher.lastServerResponse == null)
? "" : URLFetcher.lastServerResponse);
}
return prop;
return peerCount;
}
private static long getDate(String count, String type) {
@@ -116,7 +218,7 @@ public class CrawlURLFetch_p {
if (count != null && count.matches("\\d+")) r = Long.parseLong(count);
if (r < 1) return -1;
r *= 3600 * 24;
r *= 3600;
if (type.equals("weeks")) return r * 24 * 7;
else if (type.equals("days")) return r * 24;
else if (type.equals("hours")) return r;
@@ -125,120 +227,161 @@ public class CrawlURLFetch_p {
public static class URLFetcher extends Thread {
public static final int TYPE_TEXT = 0;
public static final int TYPE_XML = 1;
public static final long DELAY_ONCE = -1;
public static final long DELAY_SELF_DET = 0;
public static int lastFetchedURLs = 0;
public static long lastRun = 0;
public static String lastServerResponse = null;
public static int lastFailed = 0;
public static int totalRuns = 0;
public static int totalFetchedURLs = 0;
public static int totalFailed = 0;
private final URL url;
private final long delay;
private final int type;
private final plasmaSwitchboard sb;
private final plasmaCrawlProfile.entry profile;
public final HashMap failed = new HashMap();
private boolean running = false;
private boolean paused = false;
public int lastFetchedURLs = 0;
public long lastRun = 0;
public String lastServerResponse = null;
public int lastFailed = 0;
public final URL url;
public final long delay;
public final plasmaSwitchboard sb;
public final plasmaCrawlProfile.entry profile;
public boolean paused = false;
public URLFetcher(
serverSwitch env,
plasmaCrawlProfile.entry profile,
URL url,
int type) {
this.sb = (plasmaSwitchboard)env;
this.profile = profile;
this.url = url;
this.type = type;
this.delay = 0;
this.setName("URL-Fetcher");
}
public URLFetcher(
serverSwitch env,
plasmaCrawlProfile.entry profile,
URL url,
long delayMs,
int type) {
long delayMs) {
if (env == null || profile == null || url == null)
throw new NullPointerException("env, profile or url must not be null");
this.sb = (plasmaSwitchboard)env;
this.profile = profile;
this.url = url;
this.delay = delayMs;
this.type = type;
this.setName("URL-Fetcher");
this.setName("URLFetcher");
}
public boolean isRunning() { return this.running; }
public boolean isPaused() { return this.paused; }
public URLFetcher(
serverSwitch env,
plasmaCrawlProfile.entry profile,
long delayMs) {
if (env == null || profile == null)
throw new NullPointerException("env or profile must not be null");
this.sb = (plasmaSwitchboard)env;
this.profile = profile;
this.url = null;
this.delay = delayMs;
this.setName("URLFetcher");
}
public void run() {
this.running = true;
this.paused = false;
long start;
do {
URL url;
while (!isInterrupted()) {
try {
start = System.currentTimeMillis();
totalFetchedURLs += addURLs();
url = getDLURL();
if (url == null) {
serverLog.logSevere(this.getName(), "canceled because no valid URL for the URL-list could be determined");
break;
}
totalFetchedURLs += stackURLs(getURLs(url));
lastRun = System.currentTimeMillis() - start;
totalRuns++;
this.paused = true;
this.wait(this.delay);
if (this.delay < 0) {
break;
} else if (this.delay == 0) {
this.paused = true;
while (this.paused) this.wait();
} else {
this.paused = true;
this.wait(this.delay);
}
this.paused = false;
} catch (InterruptedException e) { break; }
} while (!isInterrupted() && this.delay > 0);
this.running = false;
}
}
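// Delay semantics in the loop above: DELAY_ONCE (-1) breaks after a
// single pass, DELAY_SELF_DET (0) parks the thread in "paused" until
// it is notified from outside, and any positive delay waits that many
// milliseconds between runs.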
private int addURLs() throws InterruptedException {
String[] urls = getURLs();
lastFailed = 0;
private URL getDLURL() {
if (this.url != null) return this.url;
// choose random seed
yacySeed ys = null;
Enumeration e = yacyCore.seedDB.seedsConnected(true, false, null, 0F);
int num = new Random().nextInt(yacyCore.seedDB.sizeConnected()) + 1;
Object o;
for (int i=0; i<num && e.hasMoreElements(); i++) {
o = e.nextElement();
if (o != null) ys = (yacySeed)o;
}
if (ys == null) return null;
try {
return new URL("http://" + ys.getAddress() + "/yacy/urllist.html");
} catch (MalformedURLException ee) { return null; }
}
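// getDLURL() either returns the fixed URL set at construction time or
// steps a random number of entries into the connected-seed enumeration
// and asks the seed it lands on for /yacy/urllist.html; null means no
// usable peer was found.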
private int stackURLs(String[] urls) throws InterruptedException {
this.lastFailed = 0;
if (urls == null) return 0;
String reason;
for (int i=0; i<urls.length; i++) {
serverLog.logFinest(this.getName(), "stacking " + urls[i]);
reason = this.sb.sbStackCrawlThread.stackCrawl(
urls[i],
null,
yacyCore.seedDB.mySeed.hash,
"PROXY",
null,
new Date(),
this.profile.generalDepth(),
this.profile);
if (reason != null) lastFailed++;
}
return urls.length;
if (reason != null) {
this.lastFailed++;
this.failed.put(urls[i], reason);
try {
plasmaCrawlEURL.Entry ee = this.sb.errorURL.newEntry(
new URL(urls[i]),
null,
yacyCore.seedDB.mySeed.hash,
yacyCore.seedDB.mySeed.hash,
null,
reason,
new kelondroBitfield());
ee.store();
this.sb.errorURL.stackPushEntry(ee);
} catch (MalformedURLException e) { /* malformed URL: skip error-DB entry */ }
}
}
return urls.length - this.lastFailed;
}
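// stackURLs() hands each URL to the crawl stacker; every rejection is
// counted, kept in "failed" for the template's error list, and also
// recorded in the error-URL stack. The return value is the number of
// successfully stacked URLs.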
private String[] getURLs() {
private String[] getURLs(URL url) {
if (url == null) return null;
String[] r = null;
try {
httpc con = httpc.getInstance(
this.url.getHost(),
this.url.getHost(),
this.url.getPort(),
url.getHost(),
url.getHost(),
url.getPort(),
15000,
this.url.getProtocol().equals("https"));
url.getProtocol().equals("https"));
httpHeader header = new httpHeader();
header.put(httpHeader.ACCEPT_ENCODING, "utf-8");
header.put(httpHeader.HOST, this.url.getHost());
header.put(httpHeader.ACCEPT_ENCODING, "US-ASCII");
header.put(httpHeader.HOST, url.getHost());
httpc.response res = con.GET(this.url.getPath(), header);
lastServerResponse = res.statusCode + " (" + res.statusText + ")";
httpc.response res = con.GET(url.getPath(), header);
serverLog.logFine(this.getName(), "downloaded URL-list from " + url + " (" + res.statusCode + ")");
this.lastServerResponse = res.statusCode + " (" + res.statusText + ")";
if (res.status.startsWith("2")) {
byte[] cbs = res.writeContent();
String encoding = res.responseHeader.getCharacterEncoding();
if (encoding == null) encoding = "ASCII";
switch (this.type) {
case TYPE_TEXT: r = parseText(new String(cbs, encoding)); break;
// case TYPE_XML: r = parseXML(new String(cbs, encoding));
}
if (encoding == null) encoding = "US-ASCII";
r = parseText(wikiCode.deReplaceHTMLEntities(new String(cbs, encoding)));
}
con.close();
httpc.returnInstance(con);
} catch (IOException e) { }
return r;