mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
- better logging when rejecting a URL because it is not in the declared domain
- more XSS attack protection

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4720 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
6d1be66822
commit
5e3ce46339
|
@ -146,7 +146,7 @@ public class AccessTracker_p {
|
|||
if (page == 2) {
|
||||
// local search
|
||||
prop.putNum("page_list_" + entCount + "_offset", searchProfile.offset);
|
||||
prop.put("page_list_" + entCount + "_querystring", searchProfile.queryString);
|
||||
prop.putHTML("page_list_" + entCount + "_querystring", searchProfile.queryString);
|
||||
} else {
|
||||
// remote search
|
||||
prop.putHTML("page_list_" + entCount + "_peername", (searchProfile.remotepeer == null) ? "<unknown>" : searchProfile.remotepeer.getName());
|
||||
|
|
|
@ -83,7 +83,7 @@ public class Config_p {
|
|||
while(keys.hasNext()){
|
||||
key = (String) keys.next();
|
||||
prop.put("options_"+count+"_key", key);
|
||||
prop.put("options_"+count+"_value", env.getConfig(key, "ERROR"));
|
||||
prop.putHTML("options_"+count+"_value", env.getConfig(key, "ERROR"));
|
||||
count++;
|
||||
}
|
||||
|
||||
|
|
|
@ -208,7 +208,7 @@ public final class Connections_p {
|
|||
prop.put("list_" + idx + "_ms", "1");
|
||||
prop.putNum("list_" + idx + "_ms_duration", sessionTime);
|
||||
}
|
||||
prop.put("list_" + idx + "_source",(seed!=null)?seed.getName()+".yacy":userAddress.getHostAddress()+":"+userPort);
|
||||
prop.putHTML("list_" + idx + "_source",(seed!=null)?seed.getName()+".yacy":userAddress.getHostAddress()+":"+userPort);
|
||||
prop.put("list_" + idx + "_dest",(dest==null)?"-":dest);
|
||||
if (blockingRequest) {
|
||||
prop.put("list_" + idx + "_running", "0");
|
||||
|
|
|
@ -138,11 +138,11 @@ public class IndexCreateIndexingQueue_p {
|
|||
totalSize += entrySize;
|
||||
initiator = yacyCore.seedDB.getConnected(pcentry.initiator());
|
||||
prop.put("indexing-queue_list_"+entryCount+"_dark", inProcess ? "2" : (dark ? "1" : "0"));
|
||||
prop.put("indexing-queue_list_"+entryCount+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
|
||||
prop.putHTML("indexing-queue_list_"+entryCount+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
|
||||
prop.put("indexing-queue_list_"+entryCount+"_depth", pcentry.depth());
|
||||
prop.put("indexing-queue_list_"+entryCount+"_modified", pcentry.getModificationDate().toString());
|
||||
prop.putHTML("indexing-queue_list_"+entryCount+"_anchor", (pcentry.anchorName()==null)?"":pcentry.anchorName());
|
||||
prop.put("indexing-queue_list_"+entryCount+"_url", pcentry.url().toNormalform(false, true));
|
||||
prop.putHTML("indexing-queue_list_"+entryCount+"_url", pcentry.url().toNormalform(false, true));
|
||||
prop.put("indexing-queue_list_"+entryCount+"_size", serverMemory.bytesToString(entrySize));
|
||||
prop.put("indexing-queue_list_"+entryCount+"_inProcess", inProcess ? "1" :"0");
|
||||
prop.put("indexing-queue_list_"+entryCount+"_inProcess_hash", pcentry.urlHash());
|
||||
|
@ -185,9 +185,9 @@ public class IndexCreateIndexingQueue_p {
|
|||
executorHash = entry.executor();
|
||||
initiatorSeed = yacyCore.seedDB.getConnected(initiatorHash);
|
||||
executorSeed = yacyCore.seedDB.getConnected(executorHash);
|
||||
prop.put("rejected_list_"+j+"_initiator", ((initiatorSeed == null) ? "proxy" : initiatorSeed.getName()));
|
||||
prop.put("rejected_list_"+j+"_executor", ((executorSeed == null) ? "proxy" : executorSeed.getName()));
|
||||
prop.put("rejected_list_"+j+"_url", url.toNormalform(false, true));
|
||||
prop.putHTML("rejected_list_"+j+"_initiator", ((initiatorSeed == null) ? "proxy" : initiatorSeed.getName()));
|
||||
prop.putHTML("rejected_list_"+j+"_executor", ((executorSeed == null) ? "proxy" : executorSeed.getName()));
|
||||
prop.putHTML("rejected_list_"+j+"_url", url.toNormalform(false, true));
|
||||
prop.putHTML("rejected_list_"+j+"_failreason", entry.anycause());
|
||||
prop.put("rejected_list_"+j+"_dark", dark ? "1" : "0");
|
||||
dark = !dark;
|
||||
|
|
|
@ -72,10 +72,10 @@ public class IndexCreateLoaderQueue_p {
|
|||
|
||||
initiator = yacyCore.seedDB.getConnected(w[i].initiator());
|
||||
prop.put("loader-set_list_"+count+"_dark", dark ? "1" : "0");
|
||||
prop.put("loader-set_list_"+count+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
|
||||
prop.putHTML("loader-set_list_"+count+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
|
||||
prop.put("loader-set_list_"+count+"_depth", w[i].depth());
|
||||
prop.put("loader-set_list_"+count+"_status", w[i].getStatus());
|
||||
prop.put("loader-set_list_"+count+"_url", w[i].url().toNormalform(true, false));
|
||||
prop.putHTML("loader-set_list_"+count+"_url", w[i].url().toNormalform(true, false));
|
||||
dark = !dark;
|
||||
count++;
|
||||
}
|
||||
|
|
|
@ -119,12 +119,12 @@ public class IndexCreateWWWGlobalQueue_p {
|
|||
profileHandle = urle.profileHandle();
|
||||
profileEntry = (profileHandle == null) ? null : switchboard.profilesActiveCrawls.getEntry(profileHandle);
|
||||
prop.put("crawler-queue_list_"+showNum+"_dark", dark ? "1" : "0");
|
||||
prop.put("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) );
|
||||
prop.putHTML("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) );
|
||||
prop.put("crawler-queue_list_"+showNum+"_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));
|
||||
prop.put("crawler-queue_list_"+showNum+"_depth", urle.depth());
|
||||
prop.put("crawler-queue_list_"+showNum+"_modified", daydate(urle.loaddate()) );
|
||||
prop.putHTML("crawler-queue_list_"+showNum+"_anchor", urle.name());
|
||||
prop.put("crawler-queue_list_"+showNum+"_url", urle.url().toNormalform(false, true));
|
||||
prop.putHTML("crawler-queue_list_"+showNum+"_url", urle.url().toNormalform(false, true));
|
||||
prop.put("crawler-queue_list_"+showNum+"_hash", urle.url().hash());
|
||||
dark = !dark;
|
||||
showNum++;
|
||||
|
|
|
@ -184,12 +184,12 @@ public class IndexCreateWWWLocalQueue_p {
|
|||
profileHandle = urle.profileHandle();
|
||||
profileEntry = (profileHandle == null) ? null : sb.profilesActiveCrawls.getEntry(profileHandle);
|
||||
prop.put("crawler-queue_list_"+showNum+"_dark", dark ? "1" : "0");
|
||||
prop.put("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) );
|
||||
prop.putHTML("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) );
|
||||
prop.put("crawler-queue_list_"+showNum+"_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));
|
||||
prop.put("crawler-queue_list_"+showNum+"_depth", urle.depth());
|
||||
prop.put("crawler-queue_list_"+showNum+"_modified", daydate(urle.loaddate()) );
|
||||
prop.putHTML("crawler-queue_list_"+showNum+"_anchor", urle.name());
|
||||
prop.put("crawler-queue_list_"+showNum+"_url", urle.url().toNormalform(false, true));
|
||||
prop.putHTML("crawler-queue_list_"+showNum+"_url", urle.url().toNormalform(false, true));
|
||||
prop.put("crawler-queue_list_"+showNum+"_hash", urle.url().hash());
|
||||
dark = !dark;
|
||||
showNum++;
|
||||
|
|
|
@ -119,12 +119,12 @@ public class IndexCreateWWWRemoteQueue_p {
|
|||
profileHandle = urle.profileHandle();
|
||||
profileEntry = (profileHandle == null) ? null : sb.profilesActiveCrawls.getEntry(profileHandle);
|
||||
prop.put("crawler-queue_list_" + showNum + "_dark", dark ? "1" : "0");
|
||||
prop.put("crawler-queue_list_" + showNum + "_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
|
||||
prop.putHTML("crawler-queue_list_" + showNum + "_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
|
||||
prop.put("crawler-queue_list_" + showNum + "_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));
|
||||
prop.put("crawler-queue_list_" + showNum + "_depth", urle.depth());
|
||||
prop.put("crawler-queue_list_" + showNum + "_modified", daydate(urle.loaddate()) );
|
||||
prop.putHTML("crawler-queue_list_" + showNum + "_anchor", urle.name());
|
||||
prop.put("crawler-queue_list_" + showNum + "_url", urle.url().toString());
|
||||
prop.putHTML("crawler-queue_list_" + showNum + "_url", urle.url().toString());
|
||||
prop.put("crawler-queue_list_" + showNum + "_hash", urle.url().hash());
|
||||
dark = !dark;
|
||||
showNum++;
|
||||
|
|
|
@ -73,7 +73,8 @@ public class rct_p {
|
|||
loaddate = new Date();
|
||||
}
|
||||
yacyURL referrer = null; // referrer needed!
|
||||
if (sb.acceptURL(url)) {
|
||||
String urlRejectReason = sb.acceptURL(url);
|
||||
if (urlRejectReason == null) {
|
||||
// stack url
|
||||
sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
|
||||
String reasonString = sb.crawlStacker.stackCrawl(url, referrer, peerhash, "REMOTE-CRAWLING", loaddate, 0, sb.defaultRemoteProfile);
|
||||
|
@ -88,7 +89,7 @@ public class rct_p {
|
|||
env.getLog().logInfo("crawlOrder: ignored [" + reasonString + "] remote crawl url: " + url.toNormalform(true, false));
|
||||
}
|
||||
} else {
|
||||
env.getLog().logWarning("crawlOrder: Received URL outside of our domain: " + url.toNormalform(true, false));
|
||||
env.getLog().logWarning("crawlOrder: Rejected URL '" + url.toNormalform(true, false) + "': " + urlRejectReason);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -147,8 +147,9 @@ public final class crawlReceipt {
|
|||
}
|
||||
|
||||
// check if the entry is in our network domain
|
||||
if (!switchboard.acceptURL(comp.url())) {
|
||||
log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (url outside of our domain) for hash " + entry.hash() + " from peer " + iam + "\n\tURL properties: "+ propStr);
|
||||
String urlRejectReason = switchboard.acceptURL(comp.url());
|
||||
if (urlRejectReason != null) {
|
||||
log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (" + urlRejectReason + ") for hash " + entry.hash() + " from peer " + iam + "\n\tURL properties: "+ propStr);
|
||||
prop.put("delay", "9999");
|
||||
return prop;
|
||||
}
|
||||
|
|
|
@ -145,8 +145,9 @@ public final class transferURL {
|
|||
}
|
||||
|
||||
// check if the entry is in our network domain
|
||||
if (!sb.acceptURL(comp.url())) {
|
||||
yacyCore.log.logFine("transferURL: blocked URL outside of our domain '" + comp.url().toNormalform(false, true) + "' from peer " + otherPeerName);
|
||||
String urlRejectReason = sb.acceptURL(comp.url());
|
||||
if (urlRejectReason != null) {
|
||||
yacyCore.log.logFine("transferURL: blocked URL '" + comp.url() + "' (" + urlRejectReason + ") from peer " + otherPeerName);
|
||||
lEntry = null;
|
||||
blocked++;
|
||||
continue;
|
||||
|
|
|
@ -321,7 +321,8 @@ public class plasmaCrawlQueues {
|
|||
} catch (ParseException e) {
|
||||
loaddate = new Date();
|
||||
}
|
||||
if (sb.acceptURL(url)) {
|
||||
String urlRejectReason = sb.acceptURL(url);
|
||||
if (urlRejectReason == null) {
|
||||
// stack url
|
||||
sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
|
||||
String reasonString = sb.crawlStacker.stackCrawl(url, referrer, hash, item.getDescription(), loaddate, 0, sb.defaultRemoteProfile);
|
||||
|
@ -336,7 +337,7 @@ public class plasmaCrawlQueues {
|
|||
log.logInfo("crawlOrder: ignored [" + reasonString + "] remote crawl url: " + url.toNormalform(true, false));
|
||||
}
|
||||
} else {
|
||||
log.logWarning("crawlOrder: Received URL outside of our domain: " + url.toNormalform(true, false));
|
||||
log.logWarning("crawlOrder: Rejected URL '" + url.toNormalform(true, false) + "': " + urlRejectReason);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
|
|
|
@ -36,7 +36,6 @@ public class plasmaCrawlEURL {
|
|||
public static final String DENIED_URL_NULL = "denied_(url_null)";
|
||||
public static final String DENIED_MALFORMED_URL = "denied_(malformed_url)";
|
||||
public static final String DENIED_UNSUPPORTED_PROTOCOL = "denied_(unsupported_protocol)";
|
||||
public static final String DENIED_IP_ADDRESS_NOT_IN_DECLARED_DOMAIN = "denied_(address_not_in_declared_domain)";
|
||||
public static final String DENIED_LOOPBACK_IP_ADDRESS = "denied_(loopback_ip_address)";
|
||||
public static final String DENIED_CACHEFILE_PATH_TOO_LONG = "denied_(cachefile_path_too_long)";
|
||||
public static final String DENIED_INVALID_CACHEFILE_PATH = "denied_(invalid_cachefile_path)";
|
||||
|
|
|
@ -385,10 +385,10 @@ public final class plasmaCrawlStacker extends Thread {
|
|||
}
|
||||
|
||||
// check if ip is local ip address
|
||||
if (!sb.acceptURL(entry.url())) {
|
||||
reason = plasmaCrawlEURL.DENIED_IP_ADDRESS_NOT_IN_DECLARED_DOMAIN + "[" + sb.getConfig("network.unit.domain", "unknown") + "]";
|
||||
if (this.log.isFine()) this.log.logFine("Host in URL '" + entry.url().toString() + "' has IP address outside of declared range (" + sb.getConfig("network.unit.domain", "unknown") + "). " +
|
||||
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
|
||||
String urlRejectReason = sb.acceptURL(entry.url());
|
||||
if (urlRejectReason != null) {
|
||||
reason = "denied_(" + urlRejectReason + ")_domain=" + sb.getConfig("network.unit.domain", "unknown");
|
||||
if (this.log.isFine()) this.log.logFine(reason + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
|
||||
return reason;
|
||||
}
|
||||
|
||||
|
|
|
@ -1427,21 +1427,32 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
|
|||
}
|
||||
}
|
||||
|
||||
public boolean acceptURL(yacyURL url) {
|
||||
/**
|
||||
* Test a url if it can be used for crawling/indexing
|
||||
* This mainly checks if the url is in the declared domain (local/global)
|
||||
* @param url
|
||||
* @return null if the url can be accepted, a string containing a rejection reason if the url cannot be accepted
|
||||
*/
|
||||
public String acceptURL(yacyURL url) {
|
||||
// returns null if the url can be accepted according to network.unit.domain, otherwise a string with the rejection reason
|
||||
if (url == null) return false;
|
||||
if (url == null) return "url is null";
|
||||
String host = url.getHost();
|
||||
if (host == null) return false;
|
||||
if (this.acceptGlobalURLs && this.acceptLocalURLs) return true; // fast shortcut to avoid dnsResolve
|
||||
if (host == null) return "url.host is null";
|
||||
if (this.acceptGlobalURLs && this.acceptLocalURLs) return null; // fast shortcut to avoid dnsResolve
|
||||
InetAddress hostAddress = serverDomains.dnsResolve(host);
|
||||
// if we don't know the host, we cannot load that resource anyway.
|
||||
// But in case we use a proxy, it is possible that we dont have a DNS service.
|
||||
final httpRemoteProxyConfig remoteProxyConfig = httpdProxyHandler.getRemoteProxyConfig();
|
||||
if (hostAddress == null) return ((remoteProxyConfig != null) && (remoteProxyConfig.useProxy()));
|
||||
if (hostAddress == null) {
|
||||
if ((remoteProxyConfig != null) && (remoteProxyConfig.useProxy())) return null; else return "the dns of the host '" + host + "' cannot be resolved";
|
||||
}
|
||||
// check if this is a local address and we are allowed to index local pages:
|
||||
boolean local = hostAddress.isSiteLocalAddress() || hostAddress.isLoopbackAddress();
|
||||
//assert local == yacyURL.isLocalDomain(url.hash()); // TODO: remove the dnsResolve above!
|
||||
return (this.acceptGlobalURLs && !local) || (this.acceptLocalURLs && local);
|
||||
if ((this.acceptGlobalURLs && !local) || (this.acceptLocalURLs && local)) return null;
|
||||
return (local) ?
|
||||
("the host '" + host + "' is local, but local addresses are not accepted") :
|
||||
("the host '" + host + "' is global, but global addresses are not accepted");
|
||||
}
|
||||
|
||||
public String urlExists(String hash) {
|
||||
|
@ -1631,8 +1642,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
|
|||
*
|
||||
* check if ip is local ip address // TODO: remove this protocol-specific code here
|
||||
* ========================================================================= */
|
||||
if (!acceptURL(entry.url())) {
|
||||
if (this.log.isFine()) this.log.logFine("Host in URL '" + entry.url() + "' is not in defined indexing domain.");
|
||||
String urlRejectReason = acceptURL(entry.url());
|
||||
if (urlRejectReason != null) {
|
||||
if (this.log.isFine()) this.log.logFine("Rejected URL '" + entry.url() + "': " + urlRejectReason);
|
||||
doIndexing = false;
|
||||
}
|
||||
|
||||
|
|
|
@ -536,8 +536,9 @@ public final class yacyClient {
|
|||
continue; // block with backlist
|
||||
}
|
||||
|
||||
if (!plasmaSwitchboard.getSwitchboard().acceptURL(comp.url())) {
|
||||
yacyCore.log.logInfo("remote search (client): rejected url outside of our domain " + comp.url() + " from peer " + target.getName());
|
||||
String urlRejectReason = plasmaSwitchboard.getSwitchboard().acceptURL(comp.url());
|
||||
if (urlRejectReason != null) {
|
||||
yacyCore.log.logInfo("remote search (client): rejected url '" + comp.url() + "' (" + urlRejectReason + ") from peer " + target.getName());
|
||||
continue; // reject url outside of our domain
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user