mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-21 00:00:13 +02:00
- prepared URL fetch from other peers
- more feedback for user git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3365 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
661a7bb702
commit
c5a2ba3a23
|
@ -7,49 +7,60 @@
|
|||
<body id="CrawlURLFetch_p">
|
||||
#%env/templates/header.template%#
|
||||
<h2>URL-Fetcher</h2>
|
||||
<p><!-- desc --></p>
|
||||
<form method="post" action="CrawlURLFetch_p.html" enctype="multipart/form-data">
|
||||
<p>
|
||||
All newly added URLs will be crawled using the <span class="tt">Proxy</span> Crawl Profile.
|
||||
</p>
|
||||
<fieldset><legend>Fetch new URLs to crawl</legend>
|
||||
<p>
|
||||
The newly added URLs will be crawled without any filter restrictions except for the <em>static</em> stop-words.
|
||||
The Re-Crawl option isn't used and the sites won't be stored in the Proxy Cache. Text and media types will be indexed.
|
||||
Since these URLs are explicitly requested from another peer, they won't be distributed for remote indexing.
|
||||
</p>
|
||||
<dl>
|
||||
<dt><label for="url">Fetch from URL</label>:</dt>
|
||||
<dd>
|
||||
<input type="radio" name="source" value="url" id="url" checked="checked" />
|
||||
<input type="text" id="host" name="host" size="60" value="#[host]#" />
|
||||
#(hostError)#::<span class="error">Malformed URL</span>#(/hostError)#
|
||||
#(saved)#::<br />
|
||||
Or select previously entered URL: <select name="savedURL">#{urls}#
|
||||
<option>#[url]#</option>
|
||||
</select>#(/saved)#
|
||||
#(hostError)#:: <span class="error">Malformed URL</span>#(/hostError)#
|
||||
</dd>
|
||||
|
||||
#(peersKnown)#::
|
||||
<dt><label for="peer">Fetch from Peer</label>:</dt>
|
||||
<dd>
|
||||
<input type="radio" name="source" value="peer" id="peer" disabled="disabled" />
|
||||
<select name="peerhash" disabled="disabled">#{peers}#
|
||||
<select name="peerhash" disabled="disabled">
|
||||
<option value="random" selected="selected">Choose a random peer</option>#{peers}#
|
||||
<option value="#[hash]#">#[name]#</option>#{/peers}#
|
||||
</select>
|
||||
#(peerError)#::<span class="error">
|
||||
#(peerError)#::
|
||||
<span class="error">Error fetching URL-list from <span class="tt">#[hash]#:#[name]#</span></span>::
|
||||
<span class="error">Peer with hash <span class="tt">#[hash]#</span> doesn't seem to be online anymore</span>#(/peerError)#
|
||||
</dd>#(/peersKnown)#
|
||||
<dt><label for="type">List-type</label>:</dt>
|
||||
|
||||
<dt>Frequency:</dt>
|
||||
<dd>
|
||||
<select name="type" id="type">
|
||||
<option value="text">Text</option>
|
||||
<option value="xml" disabled="disabled">XML</option>
|
||||
</select>
|
||||
</dd>
|
||||
<dt><label for="regularly">Run regularly</label>:</dt>
|
||||
<dd>
|
||||
<input type="checkbox" name="regularly" id="regularly" disabled="disabled" />,
|
||||
<input type="radio" name="reg" value="once" id="once" checked="checked" /> <label for="once">Fetch only once</label><br />
|
||||
<input type="radio" name="reg" value="self_det" id="self_det" disabled="disabled"/> <label for="self_det">Fetch when queue is empty</label><br />
|
||||
<input type="radio" name="reg" value="delay" id="delay" /> <label for="delay">Fetch in a specified delay</label>:
|
||||
<label for="frequency">every</label>
|
||||
<input type="text" name="frequency" id="frequency" text-align="left" size="5" disabled="disabled" />
|
||||
<select name="freq_type" disabled="disabled">
|
||||
<input type="text" name="frequency" id="frequency" text-align="left" size="2" style="text-align: right;" maxlength="2"/>
|
||||
<select name="freq_type">
|
||||
<option value="weeks">Weeks</option>
|
||||
<option value="days" selected="selected">Days</option>
|
||||
<option value="hours">Hours</option>
|
||||
</select>
|
||||
#(freqError)#:: <span class="error">Invalid period, fetching only once</span>#(/freqError)#
|
||||
</dd>
|
||||
<dt><input type="submit" name="start" value="Fetch URLs" /></dt>
|
||||
</dl>
|
||||
</fieldset>
|
||||
|
||||
#(threadError)#::
|
||||
<span class="error">Error on stopping thread, it isn't alive anymore</span>::
|
||||
<span class="error">Error on restarting thread, it isn't alive anymore</span>#(/threadError)#
|
||||
|
||||
#(runs)#::
|
||||
<fieldset><legend>Thread to fetch URLs is #(status)#running::stopped::paused#(/status)#</legend>
|
||||
<dl>
|
||||
|
@ -59,7 +70,18 @@
|
|||
<dt>Total fetched URLs:</dt><dd>#[totalFetchedURLs]#</dd>
|
||||
<dt>Total failed URLs:</dt><dd>#[totalFailedURLs]#</dd>
|
||||
<dt>Last fetched URLs:</dt><dd>#[lastFetchedURLs]#</dd>
|
||||
<dt><input type="submit" name="stop" value="Stop Thread" /></dt>
|
||||
<dt>Failed URLs:</dt>
|
||||
<dd>
|
||||
#[error]#
|
||||
<ul>#{error}#
|
||||
<li><span class="error">#[reason]#</span>: #[url]#</li>#{/error}#
|
||||
</ul>
|
||||
</dd>
|
||||
<dt>#(status)#
|
||||
<input type="submit" name="stop" value="Stop Thread" />::
|
||||
<input type="submit" name="restart" value="Restart Thread" />::
|
||||
<input type="submit" name="resume" value="Resume Thread" />#(/status)#
|
||||
</dt>
|
||||
</dl>
|
||||
</fieldset>
|
||||
#(/runs)#
|
||||
|
|
|
@ -2,33 +2,193 @@
|
|||
|
||||
import java.io.IOException;
|
||||
import java.net.MalformedURLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Date;
|
||||
import java.util.Enumeration;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.Random;
|
||||
import java.util.TreeMap;
|
||||
|
||||
import de.anomic.kelondro.kelondroBitfield;
|
||||
import de.anomic.net.URL;
|
||||
import de.anomic.plasma.plasmaCrawlEURL;
|
||||
import de.anomic.plasma.plasmaCrawlProfile;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
import de.anomic.server.serverSwitch;
|
||||
import de.anomic.data.wikiCode;
|
||||
import de.anomic.http.httpHeader;
|
||||
import de.anomic.http.httpc;
|
||||
import de.anomic.server.serverObjects;
|
||||
import de.anomic.server.logging.serverLog;
|
||||
import de.anomic.yacy.yacyCore;
|
||||
import de.anomic.yacy.yacySeed;
|
||||
|
||||
public class CrawlURLFetch_p {
|
||||
|
||||
private static final long ERR_DATE = 1;
|
||||
private static final long ERR_HOST_MALFORMED_URL = 1;
|
||||
private static final long ERR_PEER_GENERAL_CONN = 1;
|
||||
private static final long ERR_PEER_OFFLINE = 2;
|
||||
private static final long ERR_THREAD_STOP = 1;
|
||||
private static final long ERR_THREAD_RESUME = 2;
|
||||
|
||||
private static final long STAT_THREAD_ALIVE = 0;
|
||||
private static final long STAT_THREAD_STOPPED = 1;
|
||||
private static final long STAT_THREAD_PAUSED = 2;
|
||||
|
||||
private static URLFetcher fetcher = null;
|
||||
private static plasmaCrawlProfile.entry profile = null;
|
||||
private static ArrayList savedURLs = new ArrayList();
|
||||
|
||||
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
|
||||
serverObjects prop = new serverObjects();
|
||||
|
||||
prop.put("host", "");
|
||||
listURLs(prop); // List previously saved URLs for easy selection
|
||||
listPeers(prop); // List known hosts
|
||||
|
||||
// List known hosts for message sending
|
||||
if (profile == null) {
|
||||
profile = ((plasmaSwitchboard)env).profiles.newEntry(
|
||||
"URLFetcher", // Name
|
||||
null, // URL
|
||||
".*", ".*", // General / specific filter
|
||||
0, 0, // General / specific depth
|
||||
-1, -1, -1, // Recrawl / Dom-filter depth / Dom-max-pages
|
||||
true, // Crawl query
|
||||
true, true, // Index text / media
|
||||
false, true, // Store in HT- / TX-Cache
|
||||
false, // Remote indexing
|
||||
true, false, false); // Exclude static / dynamic / parent stopwords
|
||||
}
|
||||
|
||||
if (post != null) {
|
||||
if (post.containsKey("start")) {
|
||||
long frequency = URLFetcher.DELAY_ONCE;
|
||||
if (post.containsKey("reg")) {
|
||||
if (post.get("reg", "").equals("self_det")) {
|
||||
frequency = URLFetcher.DELAY_SELF_DET;
|
||||
} else if (post.get("reg", "").equals("delay")) {
|
||||
frequency = getDate(post.get("frequency", ""), post.get("freq_type", ""));
|
||||
if (frequency == -1)
|
||||
prop.put("freqError", ERR_DATE);
|
||||
}
|
||||
}
|
||||
|
||||
fetcher = null;
|
||||
if (post.get("source", "").equals("peer") &&
|
||||
post.get("peerhash", "").equals("random")) {
|
||||
fetcher = new URLFetcher(
|
||||
env,
|
||||
profile,
|
||||
frequency);
|
||||
} else {
|
||||
URL url = null;
|
||||
if (post.get("source", "").equals("url")) {
|
||||
try {
|
||||
url = new URL(post.get("host", null));
|
||||
if (!savedURLs.contains(url.toNormalform()))
|
||||
savedURLs.add(url.toNormalform());
|
||||
prop.put("host", post.get("host", url.toString()));
|
||||
} catch (MalformedURLException e) {
|
||||
prop.put("host", post.get("host", ""));
|
||||
prop.put("hostError", ERR_HOST_MALFORMED_URL);
|
||||
}
|
||||
} else if (post.get("source", "").equals("peer")) {
|
||||
yacySeed ys = null;
|
||||
try {
|
||||
ys = yacyCore.seedDB.getConnected(post.get("peerhash", ""));
|
||||
if (ys != null) {
|
||||
url = new URL("http://" + ys.getAddress() + "/yacy/urllist.html");
|
||||
} else {
|
||||
prop.put("peerError", ERR_PEER_OFFLINE);
|
||||
prop.put("peerError_hash", post.get("peerhash", ""));
|
||||
}
|
||||
} catch (MalformedURLException e) {
|
||||
prop.put("peerError", ERR_PEER_GENERAL_CONN);
|
||||
prop.put("peerError_hash", post.get("peerhash", ""));
|
||||
prop.put("peerError_name", ys.getName());
|
||||
}
|
||||
}
|
||||
|
||||
if (url != null) {
|
||||
fetcher = new URLFetcher(
|
||||
env,
|
||||
profile,
|
||||
url,
|
||||
frequency);
|
||||
}
|
||||
}
|
||||
if (fetcher != null)
|
||||
fetcher.start();
|
||||
}
|
||||
else if (post.containsKey("stop")) {
|
||||
if (fetcher != null) {
|
||||
fetcher.interrupt();
|
||||
} else {
|
||||
prop.put("threadError", ERR_THREAD_STOP);
|
||||
}
|
||||
}
|
||||
else if (post.containsKey("restart") || post.containsKey("resume")) {
|
||||
if (fetcher != null) {
|
||||
if (fetcher.url == null) {
|
||||
fetcher = new URLFetcher(
|
||||
env,
|
||||
profile,
|
||||
fetcher.delay);
|
||||
} else {
|
||||
fetcher = new URLFetcher(
|
||||
env,
|
||||
profile,
|
||||
fetcher.url,
|
||||
fetcher.delay);
|
||||
}
|
||||
fetcher.start();
|
||||
} else {
|
||||
prop.put("threadError", ERR_THREAD_RESUME);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (fetcher != null) {
|
||||
prop.put("runs", 1);
|
||||
prop.put("runs_status", (fetcher.isAlive()) ? STAT_THREAD_ALIVE :
|
||||
(fetcher.paused) ? STAT_THREAD_PAUSED : STAT_THREAD_STOPPED);
|
||||
prop.put("runs_totalRuns", URLFetcher.totalRuns);
|
||||
prop.put("runs_totalFetchedURLs", URLFetcher.totalFetchedURLs);
|
||||
prop.put("runs_totalFailedURLs", URLFetcher.totalFailed);
|
||||
prop.put("runs_lastRun", fetcher.lastRun);
|
||||
prop.put("runs_lastFetchedURLs", fetcher.lastFetchedURLs);
|
||||
prop.put("runs_lastServerResponse", (fetcher.lastServerResponse == null)
|
||||
? "" : fetcher.lastServerResponse);
|
||||
|
||||
Iterator it = fetcher.failed.keySet().iterator();
|
||||
int i = 0;
|
||||
Object key;
|
||||
while (it.hasNext()) {
|
||||
key = it.next();
|
||||
prop.put("runs_error_" + i + "_reason", fetcher.failed.get(key));
|
||||
prop.put("runs_error_" + i + "_url", (String)key);
|
||||
i++;
|
||||
}
|
||||
prop.put("runs_error", i);
|
||||
}
|
||||
|
||||
return prop;
|
||||
}
|
||||
|
||||
private static int listURLs(serverObjects prop) {
|
||||
if (savedURLs.size() == 0) return 0;
|
||||
prop.put("saved", 1);
|
||||
for (int i=0; i<savedURLs.size(); i++)
|
||||
prop.put("saved_urls_" + i + "url", savedURLs.get(i));
|
||||
prop.put("saved_urls", savedURLs.size());
|
||||
return savedURLs.size();
|
||||
}
|
||||
|
||||
private static int listPeers(serverObjects prop) {
|
||||
int peerCount = 0;
|
||||
if (yacyCore.seedDB != null && yacyCore.seedDB.sizeConnected() > 0) {
|
||||
prop.put("peersKnown", 1);
|
||||
int peerCount = 0;
|
||||
try {
|
||||
TreeMap hostList = new TreeMap();
|
||||
final Enumeration e = yacyCore.seedDB.seedsConnected(true, false, null, (float) 0.0);
|
||||
|
@ -50,65 +210,7 @@ public class CrawlURLFetch_p {
|
|||
} else {
|
||||
prop.put("peersKnown", 0);
|
||||
}
|
||||
|
||||
if (post != null) {
|
||||
if (post.containsKey("start")) {
|
||||
try {
|
||||
|
||||
long frequency = -1;
|
||||
if (post.containsKey("regularly"))
|
||||
frequency = getDate(post.get("frequency", ""), post.get("freq_type", ""));
|
||||
|
||||
String t = post.get("type", "text");
|
||||
int type = -1;
|
||||
if (t.equals("text")) {
|
||||
type = URLFetcher.TYPE_TEXT;
|
||||
} else if (t.equals("xml")) {
|
||||
type = URLFetcher.TYPE_XML;
|
||||
}
|
||||
|
||||
URL url = new URL(post.get("host", null));
|
||||
prop.put("host", post.get("host", ""));
|
||||
|
||||
if (type > -1) {
|
||||
if (frequency > -1) {
|
||||
fetcher = new URLFetcher(
|
||||
env,
|
||||
((plasmaSwitchboard)env).defaultProxyProfile,
|
||||
url,
|
||||
frequency,
|
||||
type);
|
||||
} else { // only fetch once
|
||||
fetcher = new URLFetcher(
|
||||
env,
|
||||
((plasmaSwitchboard)env).defaultProxyProfile,
|
||||
url,
|
||||
type);
|
||||
}
|
||||
fetcher.start();
|
||||
}
|
||||
} catch (MalformedURLException e) {
|
||||
prop.put("host", post.get("host", ""));
|
||||
prop.put("hostError", 1);
|
||||
}
|
||||
} else if (post.containsKey("stop")) {
|
||||
fetcher.interrupt();
|
||||
}
|
||||
}
|
||||
|
||||
if (fetcher != null) {
|
||||
prop.put("runs", 1);
|
||||
prop.put("runs_status", (fetcher.isRunning()) ? 0 : (fetcher.isPaused()) ? 2 : 1);
|
||||
prop.put("runs_totalRuns", URLFetcher.totalRuns);
|
||||
prop.put("runs_totalFetchedURLs", URLFetcher.totalFetchedURLs);
|
||||
prop.put("runs_totalFailedURLs", URLFetcher.totalFailed);
|
||||
prop.put("runs_lastRun", URLFetcher.lastRun);
|
||||
prop.put("runs_lastFetchedURLs", URLFetcher.lastFetchedURLs);
|
||||
prop.put("runs_lastServerResponse", (URLFetcher.lastServerResponse == null)
|
||||
? "" : URLFetcher.lastServerResponse);
|
||||
}
|
||||
|
||||
return prop;
|
||||
return peerCount;
|
||||
}
|
||||
|
||||
private static long getDate(String count, String type) {
|
||||
|
@ -116,7 +218,7 @@ public class CrawlURLFetch_p {
|
|||
if (count != null && count.matches("\\d+")) r = Long.parseLong(count);
|
||||
if (r < 1) return -1;
|
||||
|
||||
r *= 3600 * 24;
|
||||
r *= 3600;
|
||||
if (type.equals("weeks")) return r * 24 * 7;
|
||||
else if (type.equals("days")) return r * 24;
|
||||
else if (type.equals("hours")) return r;
|
||||
|
@ -125,120 +227,161 @@ public class CrawlURLFetch_p {
|
|||
|
||||
public static class URLFetcher extends Thread {
|
||||
|
||||
public static final int TYPE_TEXT = 0;
|
||||
public static final int TYPE_XML = 1;
|
||||
public static final long DELAY_ONCE = -1;
|
||||
public static final long DELAY_SELF_DET = 0;
|
||||
|
||||
public static int lastFetchedURLs = 0;
|
||||
public static long lastRun = 0;
|
||||
public static String lastServerResponse = null;
|
||||
public static int lastFailed = 0;
|
||||
public static int totalRuns = 0;
|
||||
public static int totalFetchedURLs = 0;
|
||||
public static int totalFailed = 0;
|
||||
|
||||
private final URL url;
|
||||
private final long delay;
|
||||
private final int type;
|
||||
private final plasmaSwitchboard sb;
|
||||
private final plasmaCrawlProfile.entry profile;
|
||||
public final HashMap failed = new HashMap();
|
||||
|
||||
private boolean running = false;
|
||||
private boolean paused = false;
|
||||
public int lastFetchedURLs = 0;
|
||||
public long lastRun = 0;
|
||||
public String lastServerResponse = null;
|
||||
public int lastFailed = 0;
|
||||
|
||||
public final URL url;
|
||||
public final long delay;
|
||||
public final plasmaSwitchboard sb;
|
||||
public final plasmaCrawlProfile.entry profile;
|
||||
|
||||
public boolean paused = false;
|
||||
|
||||
public URLFetcher(
|
||||
serverSwitch env,
|
||||
plasmaCrawlProfile.entry profile,
|
||||
URL url,
|
||||
int type) {
|
||||
this.sb = (plasmaSwitchboard)env;
|
||||
this.profile = profile;
|
||||
this.url = url;
|
||||
this.type = type;
|
||||
this.delay = 0;
|
||||
this.setName("URL-Fetcher");
|
||||
}
|
||||
|
||||
public URLFetcher(
|
||||
serverSwitch env,
|
||||
plasmaCrawlProfile.entry profile,
|
||||
URL url,
|
||||
long delayMs,
|
||||
int type) {
|
||||
long delayMs) {
|
||||
if (env == null || profile == null || url == null)
|
||||
throw new NullPointerException("env, profile or url must not be null");
|
||||
this.sb = (plasmaSwitchboard)env;
|
||||
this.profile = profile;
|
||||
this.url = url;
|
||||
this.delay = delayMs;
|
||||
this.type = type;
|
||||
this.setName("URL-Fetcher");
|
||||
this.setName("URLFetcher");
|
||||
}
|
||||
|
||||
public boolean isRunning() { return this.running; }
|
||||
public boolean isPaused() { return this.paused; }
|
||||
public URLFetcher(
|
||||
serverSwitch env,
|
||||
plasmaCrawlProfile.entry profile,
|
||||
long delayMs) {
|
||||
if (env == null || profile == null)
|
||||
throw new NullPointerException("env or profile must not be null");
|
||||
this.sb = (plasmaSwitchboard)env;
|
||||
this.profile = profile;
|
||||
this.url = null;
|
||||
this.delay = delayMs;
|
||||
this.setName("URLFetcher");
|
||||
}
|
||||
|
||||
public void run() {
|
||||
this.running = true;
|
||||
this.paused = false;
|
||||
long start;
|
||||
do {
|
||||
URL url;
|
||||
while (!isInterrupted()) {
|
||||
try {
|
||||
start = System.currentTimeMillis();
|
||||
totalFetchedURLs += addURLs();
|
||||
url = getDLURL();
|
||||
if (url == null) {
|
||||
serverLog.logSevere(this.getName(), "canceled because no valid URL for the URL-list could be determinded");
|
||||
break;
|
||||
}
|
||||
totalFetchedURLs += stackURLs(getURLs(url));
|
||||
lastRun = System.currentTimeMillis() - start;
|
||||
totalRuns++;
|
||||
if (this.delay < 0) {
|
||||
break;
|
||||
} else if (this.delay == 0) {
|
||||
this.paused = true;
|
||||
while (this.paused) this.wait();
|
||||
} else {
|
||||
this.paused = true;
|
||||
this.wait(this.delay);
|
||||
}
|
||||
this.paused = false;
|
||||
} catch (InterruptedException e) { break; }
|
||||
} while (!isInterrupted() && this.delay > 0);
|
||||
this.running = false;
|
||||
}
|
||||
}
|
||||
|
||||
private int addURLs() throws InterruptedException {
|
||||
String[] urls = getURLs();
|
||||
lastFailed = 0;
|
||||
private URL getDLURL() {
|
||||
if (this.url != null) return this.url;
|
||||
|
||||
// choose random seed
|
||||
yacySeed ys = null;
|
||||
Enumeration e = yacyCore.seedDB.seedsConnected(true, false, null, 0F);
|
||||
int num = new Random().nextInt(yacyCore.seedDB.sizeConnected()) + 1;
|
||||
Object o;
|
||||
for (int i=0; i<num && e.hasMoreElements(); i++) {
|
||||
o = e.nextElement();
|
||||
if (o != null) ys = (yacySeed)o;
|
||||
}
|
||||
if (ys == null) return null;
|
||||
|
||||
try {
|
||||
return new URL("http://" + ys.getAddress() + "/yacy/urllist.html");
|
||||
} catch (MalformedURLException ee) { return null; }
|
||||
}
|
||||
|
||||
private int stackURLs(String[] urls) throws InterruptedException {
|
||||
this.lastFailed = 0;
|
||||
if (urls == null) return 0;
|
||||
String reason;
|
||||
for (int i=0; i<urls.length; i++) {
|
||||
serverLog.logFinest(this.getName(), "stacking " + urls[i]);
|
||||
reason = this.sb.sbStackCrawlThread.stackCrawl(
|
||||
urls[i],
|
||||
null,
|
||||
yacyCore.seedDB.mySeed.hash,
|
||||
"PROXY",
|
||||
null,
|
||||
new Date(),
|
||||
this.profile.generalDepth(),
|
||||
this.profile);
|
||||
if (reason != null) lastFailed++;
|
||||
if (reason != null) {
|
||||
this.lastFailed++;
|
||||
this.failed.put(urls[i], reason);
|
||||
try {
|
||||
plasmaCrawlEURL.Entry ee = this.sb.errorURL.newEntry(
|
||||
new URL(urls[i]),
|
||||
null,
|
||||
yacyCore.seedDB.mySeed.hash,
|
||||
yacyCore.seedDB.mySeed.hash,
|
||||
null,
|
||||
reason,
|
||||
new kelondroBitfield());
|
||||
ee.store();
|
||||
this.sb.errorURL.stackPushEntry(ee);
|
||||
} catch (MalformedURLException e) { }
|
||||
}
|
||||
return urls.length;
|
||||
}
|
||||
return urls.length - this.lastFailed;
|
||||
}
|
||||
|
||||
private String[] getURLs() {
|
||||
private String[] getURLs(URL url) {
|
||||
if (url == null) return null;
|
||||
String[] r = null;
|
||||
try {
|
||||
httpc con = httpc.getInstance(
|
||||
this.url.getHost(),
|
||||
this.url.getHost(),
|
||||
this.url.getPort(),
|
||||
url.getHost(),
|
||||
url.getHost(),
|
||||
url.getPort(),
|
||||
15000,
|
||||
this.url.getProtocol().equals("https"));
|
||||
url.getProtocol().equals("https"));
|
||||
|
||||
httpHeader header = new httpHeader();
|
||||
header.put(httpHeader.ACCEPT_ENCODING, "utf-8");
|
||||
header.put(httpHeader.HOST, this.url.getHost());
|
||||
header.put(httpHeader.ACCEPT_ENCODING, "US-ASCII");
|
||||
header.put(httpHeader.HOST, url.getHost());
|
||||
|
||||
httpc.response res = con.GET(this.url.getPath(), header);
|
||||
lastServerResponse = res.statusCode + " (" + res.statusText + ")";
|
||||
httpc.response res = con.GET(url.getPath(), header);
|
||||
serverLog.logFine(this.getName(), "downloaded URL-list from " + url + " (" + res.statusCode + ")");
|
||||
this.lastServerResponse = res.statusCode + " (" + res.statusText + ")";
|
||||
if (res.status.startsWith("2")) {
|
||||
byte[] cbs = res.writeContent();
|
||||
String encoding = res.responseHeader.getCharacterEncoding();
|
||||
|
||||
if (encoding == null) encoding = "ASCII";
|
||||
switch (this.type) {
|
||||
case TYPE_TEXT: r = parseText(new String(cbs, encoding)); break;
|
||||
// case TYPE_XML: r = parseXML(new String(cbs, encoding));
|
||||
if (encoding == null) encoding = "US-ASCII";
|
||||
r = parseText(wikiCode.deReplaceHTMLEntities(new String(cbs, encoding)));
|
||||
}
|
||||
}
|
||||
con.close();
|
||||
httpc.returnInstance(con);
|
||||
} catch (IOException e) { }
|
||||
return r;
|
||||
|
|
Loading…
Reference in New Issue
Block a user