From d2fd93135ca2acfe23d8b61682e37463bd5a4845 Mon Sep 17 00:00:00 2001 From: orbiter Date: Mon, 27 Sep 2010 14:54:32 +0000 Subject: [PATCH] - moved yacybot user agent string definition to MultiProtocolURI since there are basic access mechanisms where the bot string is needed - migrated the 'yacy' user agent to 'yacybot' in many client methods since the 'yacy' user agent is only used for the proxy git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7199 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/ConfigAppearance_p.java | 4 +- htroot/ConfigLanguage_p.java | 4 +- htroot/Network.java | 9 ++-- htroot/sharedBlacklist_p.java | 6 +-- source/de/anomic/crawler/RobotsTxt.java | 4 +- source/de/anomic/crawler/SitemapImporter.java | 4 +- .../anomic/crawler/retrieval/HTTPLoader.java | 6 +-- .../anomic/http/server/HTTPDProxyHandler.java | 7 ++- source/de/anomic/search/Switchboard.java | 5 +- source/de/anomic/tools/loaderThreads.java | 5 +- source/de/anomic/yacy/yacyClient.java | 27 ++++++----- source/de/anomic/yacy/yacyRelease.java | 3 +- source/de/anomic/yacy/yacySeedDB.java | 4 +- .../yacy/cora/document/MultiProtocolURI.java | 46 +++++++++++++++++-- source/net/yacy/cora/protocol/Domains.java | 22 +++++++++ .../net/yacy/cora/protocol/ftp/FTPClient.java | 6 ++- .../yacy/cora/protocol/http/HTTPClient.java | 44 ++---------------- source/net/yacy/cora/services/Search.java | 4 +- .../net/yacy/document/parser/htmlParser.java | 4 +- .../net/yacy/repository/LoaderDispatcher.java | 2 +- source/net/yacy/yacy.java | 3 +- 21 files changed, 119 insertions(+), 100 deletions(-) diff --git a/htroot/ConfigAppearance_p.java b/htroot/ConfigAppearance_p.java index 5e7c2b88e..6793f9f3f 100644 --- a/htroot/ConfigAppearance_p.java +++ b/htroot/ConfigAppearance_p.java @@ -37,11 +37,11 @@ import java.io.PrintWriter; import java.util.Iterator; import java.util.List; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.protocol.RequestHeader; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.FileUtils; -import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -98,7 +98,7 @@ public class ConfigAppearance_p { Iterator it; try { final DigestURI u = new DigestURI(url, null); - it = FileUtils.strings(u.get(HTTPLoader.yacyUserAgent, 10000)); + it = FileUtils.strings(u.get(MultiProtocolURI.yacybotUserAgent, 10000)); } catch (final IOException e) { prop.put("status", "1");// unable to get URL prop.put("status_url", url); diff --git a/htroot/ConfigLanguage_p.java b/htroot/ConfigLanguage_p.java index c26d9eed4..a975a444c 100644 --- a/htroot/ConfigLanguage_p.java +++ b/htroot/ConfigLanguage_p.java @@ -37,11 +37,11 @@ import java.io.PrintWriter; import java.util.Iterator; import java.util.List; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.protocol.RequestHeader; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.FileUtils; -import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.data.WorkTables; import de.anomic.data.translator; import de.anomic.search.Switchboard; @@ -103,7 +103,7 @@ public class ConfigLanguage_p { Iterator it; try{ final DigestURI u = new DigestURI(url, null); - it = FileUtils.strings(u.get(HTTPLoader.yacyUserAgent, 10000)); + it = FileUtils.strings(u.get(MultiProtocolURI.yacybotUserAgent, 10000)); }catch(final IOException e){ prop.put("status", "1");//unable to get url prop.put("status_url", url); diff --git a/htroot/Network.java b/htroot/Network.java index 2b5f1a8b6..b85df2b93 100644 --- a/htroot/Network.java +++ b/htroot/Network.java @@ -36,13 +36,12 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; -import net.yacy.cora.protocol.http.HTTPClient; import net.yacy.kelondro.util.DateFormatter; import net.yacy.kelondro.util.MapTools; -import de.anomic.crawler.retrieval.HTTPLoader; //import de.anomic.http.client.Client; import de.anomic.search.Switchboard; import de.anomic.search.SwitchboardConstants; @@ -149,7 +148,7 @@ public class Network { prop.put("table_my-url", seed.get(yacySeed.SEEDLIST, "")); // generating the location string - prop.putHTML("table_my-location", HTTPClient.generateLocation()); + prop.putHTML("table_my-location", MultiProtocolURI.generateLocation()); } // overall results: Network statistics @@ -363,8 +362,8 @@ public class Network { prop.putHTML(STR_TABLE_LIST + conCount + "_fullname", seed.get(yacySeed.NAME, "deadlink")); userAgent = null; if (seed.hash != null && seed.hash.equals(sb.peers.mySeed().hash)) { - userAgent = HTTPLoader.yacyUserAgent; - location = HTTPClient.generateLocation(); + userAgent = MultiProtocolURI.yacybotUserAgent; + location = MultiProtocolURI.generateLocation(); } else { userAgent = sb.peers.peerActions.getUserAgent(seed.getIP()); location = parseLocationInUserAgent(userAgent); diff --git a/htroot/sharedBlacklist_p.java b/htroot/sharedBlacklist_p.java index 009615073..6cc83866d 100644 --- a/htroot/sharedBlacklist_p.java +++ b/htroot/sharedBlacklist_p.java @@ -39,7 +39,6 @@ import java.util.HashSet; import java.util.Iterator; import java.util.List; -import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.data.listManager; import de.anomic.data.list.ListAccumulator; import de.anomic.data.list.XMLBlacklistImporter; @@ -48,6 +47,7 @@ import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.yacy.yacySeed; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.protocol.RequestHeader; import net.yacy.document.parser.html.CharacterCoding; import net.yacy.kelondro.data.meta.DigestURI; @@ -143,7 +143,7 @@ public class sharedBlacklist_p { // get List DigestURI u = new DigestURI(downloadURLOld, null); - otherBlacklist = FileUtils.strings(u.get(HTTPLoader.yacyUserAgent, 10000)); + otherBlacklist = FileUtils.strings(u.get(MultiProtocolURI.yacybotUserAgent, 10000)); } catch (final Exception e) { prop.put("status", STATUS_PEER_UNKNOWN); prop.putHTML("status_name", Hash); @@ -160,7 +160,7 @@ public class sharedBlacklist_p { try { final DigestURI u = new DigestURI(downloadURL, null); - otherBlacklist = FileUtils.strings(u.get(HTTPLoader.yacyUserAgent, 10000)); + otherBlacklist = FileUtils.strings(u.get(MultiProtocolURI.yacybotUserAgent, 10000)); } catch (final Exception e) { prop.put("status", STATUS_URL_PROBLEM); prop.putHTML("status_address",downloadURL); diff --git a/source/de/anomic/crawler/RobotsTxt.java b/source/de/anomic/crawler/RobotsTxt.java index 9c16a9eee..70a226720 100644 --- a/source/de/anomic/crawler/RobotsTxt.java +++ b/source/de/anomic/crawler/RobotsTxt.java @@ -45,8 +45,6 @@ import net.yacy.kelondro.blob.BEncodedHeap; import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.io.ByteCount; -import de.anomic.crawler.retrieval.HTTPLoader; - public class RobotsTxt { private static Logger log = Logger.getLogger(RobotsTxt.class); @@ -325,7 +323,7 @@ public class RobotsTxt { RequestHeader reqHeaders = new RequestHeader(); // add yacybot user agent - reqHeaders.put(HeaderFramework.USER_AGENT, HTTPLoader.crawlerUserAgent); + reqHeaders.put(HeaderFramework.USER_AGENT, MultiProtocolURI.yacybotUserAgent); // adding referer reqHeaders.put(RequestHeader.REFERER, (MultiProtocolURI.newURL(robotsURL,"/")).toNormalform(true, true)); diff --git a/source/de/anomic/crawler/SitemapImporter.java b/source/de/anomic/crawler/SitemapImporter.java index 6937c78c0..f91db73f8 100644 --- a/source/de/anomic/crawler/SitemapImporter.java +++ b/source/de/anomic/crawler/SitemapImporter.java @@ -30,6 +30,7 @@ import java.net.MalformedURLException; import java.util.Date; import java.util.zip.GZIPInputStream; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.ResponseHeader; @@ -39,7 +40,6 @@ import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.io.ByteCountInputStream; import net.yacy.kelondro.logging.Log; -import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.crawler.retrieval.Request; import de.anomic.search.Segments; import de.anomic.search.Switchboard; @@ -62,7 +62,7 @@ public class SitemapImporter extends Thread { public void run() { // download document final RequestHeader requestHeader = new RequestHeader(); - requestHeader.put(HeaderFramework.USER_AGENT, HTTPLoader.crawlerUserAgent); + requestHeader.put(HeaderFramework.USER_AGENT, MultiProtocolURI.yacybotUserAgent); final HTTPClient client = new HTTPClient(); client.setTimout(5000); client.setHeader(requestHeader.entrySet()); diff --git a/source/de/anomic/crawler/retrieval/HTTPLoader.java b/source/de/anomic/crawler/retrieval/HTTPLoader.java index a42c0aedd..9d26b262b 100644 --- a/source/de/anomic/crawler/retrieval/HTTPLoader.java +++ b/source/de/anomic/crawler/retrieval/HTTPLoader.java @@ -50,8 +50,6 @@ public final class HTTPLoader { private static final String DEFAULT_CHARSET = "ISO-8859-1,utf-8;q=0.7,*;q=0.7"; public static final long DEFAULT_MAXFILESIZE = 1024 * 1024 * 10; public static final int DEFAULT_CRAWLING_RETRY_COUNT = 5; - public static final String crawlerUserAgent = "yacybot (" + HTTPClient.getSystemOST() +") http://yacy.net/bot.html"; - public static final String yacyUserAgent = "yacy (" + HTTPClient.getSystemOST() +") yacy.net"; /** * The socket timeout that should be used @@ -101,7 +99,7 @@ public final class HTTPLoader { // create a request header final RequestHeader requestHeader = new RequestHeader(); - requestHeader.put(HeaderFramework.USER_AGENT, crawlerUserAgent); + requestHeader.put(HeaderFramework.USER_AGENT, MultiProtocolURI.yacybotUserAgent); DigestURI refererURL = null; if (request.referrerhash() != null) refererURL = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash()); if (refererURL != null) requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true, true)); @@ -216,7 +214,7 @@ public final class HTTPLoader { // create a request header final RequestHeader requestHeader = new RequestHeader(); - requestHeader.put(HeaderFramework.USER_AGENT, crawlerUserAgent); + requestHeader.put(HeaderFramework.USER_AGENT, MultiProtocolURI.yacybotUserAgent); requestHeader.put(HeaderFramework.ACCEPT_LANGUAGE, DEFAULT_LANGUAGE); requestHeader.put(HeaderFramework.ACCEPT_CHARSET, DEFAULT_CHARSET); requestHeader.put(HeaderFramework.ACCEPT_ENCODING, DEFAULT_ENCODING); diff --git a/source/de/anomic/http/server/HTTPDProxyHandler.java b/source/de/anomic/http/server/HTTPDProxyHandler.java index f5a0a2461..19bbf89aa 100644 --- a/source/de/anomic/http/server/HTTPDProxyHandler.java +++ b/source/de/anomic/http/server/HTTPDProxyHandler.java @@ -71,6 +71,7 @@ import java.util.logging.LogManager; import java.util.logging.Logger; import java.util.zip.GZIPOutputStream; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; @@ -86,7 +87,6 @@ import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.FileUtils; import net.yacy.repository.Blacklist; -import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.crawler.retrieval.Request; import de.anomic.crawler.retrieval.Response; //import de.anomic.http.client.Client; @@ -98,6 +98,9 @@ import de.anomic.server.serverObjects; public final class HTTPDProxyHandler { + + public static final String yacyUserAgent = "yacy (" + MultiProtocolURI.systemOST +") yacy.net"; + // static variables // can only be instantiated upon first instantiation of this class object private static Switchboard sb = null; @@ -1539,7 +1542,7 @@ public final class HTTPDProxyHandler { private static synchronized String generateUserAgent(final HeaderFramework requestHeaders) { userAgentStr.setLength(0); - final String browserUserAgent = requestHeaders.get(HeaderFramework.USER_AGENT, HTTPLoader.yacyUserAgent); + final String browserUserAgent = requestHeaders.get(HeaderFramework.USER_AGENT, yacyUserAgent); final int pos = browserUserAgent.lastIndexOf(')'); if (pos >= 0) { userAgentStr diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index 1125b4585..2e2468e67 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -124,7 +124,6 @@ import de.anomic.crawler.ResultURLs; import de.anomic.crawler.RobotsTxt; import de.anomic.crawler.CrawlProfile.CacheStrategy; import de.anomic.crawler.retrieval.EventOrigin; -import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.crawler.retrieval.Request; import de.anomic.crawler.retrieval.Response; import de.anomic.data.LibraryProvider; @@ -2406,7 +2405,7 @@ public final class Switchboard extends serverSwitch { final RequestHeader reqHeader = new RequestHeader(); reqHeader.put(HeaderFramework.PRAGMA, "no-cache"); reqHeader.put(HeaderFramework.CACHE_CONTROL, "no-cache"); - reqHeader.put(HeaderFramework.USER_AGENT, HTTPLoader.yacyUserAgent); + reqHeader.put(HeaderFramework.USER_AGENT, MultiProtocolURI.yacybotUserAgent); final HTTPClient client = new HTTPClient(); client.setHeader(reqHeader.entrySet()); client.setTimout((int) getConfigLong("bootstrapLoadTimeout", 20000)); @@ -2557,7 +2556,7 @@ public final class Switchboard extends serverSwitch { */ public static Map loadFileAsMap(final DigestURI url) { final RequestHeader reqHeader = new RequestHeader(); - reqHeader.put(HeaderFramework.USER_AGENT, HTTPLoader.yacyUserAgent); + reqHeader.put(HeaderFramework.USER_AGENT, MultiProtocolURI.yacybotUserAgent); final HTTPClient client = new HTTPClient(); client.setHeader(reqHeader.entrySet()); try { diff --git a/source/de/anomic/tools/loaderThreads.java b/source/de/anomic/tools/loaderThreads.java index c2233caaf..b816ac4ac 100644 --- a/source/de/anomic/tools/loaderThreads.java +++ b/source/de/anomic/tools/loaderThreads.java @@ -23,11 +23,10 @@ package de.anomic.tools; import java.util.Hashtable; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.protocol.http.ProxySettings; import net.yacy.kelondro.data.meta.DigestURI; -import de.anomic.crawler.retrieval.HTTPLoader; - public class loaderThreads { // global values for loader threads @@ -116,7 +115,7 @@ public class loaderThreads { public void run() { try { - page = url.get(HTTPLoader.crawlerUserAgent, timeout); + page = url.get(MultiProtocolURI.yacybotUserAgent, timeout); loaded = true; process.feed(page); if (process.status() == loaderCore.STATUS_FAILED) { diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index 0b117c651..2b5095962 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -83,7 +83,6 @@ import org.apache.http.entity.mime.content.StringBody; import de.anomic.crawler.ResultURLs; import de.anomic.crawler.retrieval.EventOrigin; -import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.search.ContentDomain; import de.anomic.search.QueryParams; import de.anomic.search.RankingProfile; @@ -99,10 +98,10 @@ public final class yacyClient { private static byte[] postToFile(final yacySeed target, final String filename, final LinkedHashMap parts, final int timeout) throws IOException { - return HTTPConnector.getConnector(HTTPLoader.crawlerUserAgent).post(new MultiProtocolURI("http://" + target.getClusterAddress() + "/yacy/" + filename), timeout, target.getHexHash() + ".yacyh", parts); + return HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI("http://" + target.getClusterAddress() + "/yacy/" + filename), timeout, target.getHexHash() + ".yacyh", parts); } private static byte[] postToFile(final yacySeedDB seedDB, final String targetHash, final String filename, final LinkedHashMap parts, final int timeout) throws IOException { - return HTTPConnector.getConnector(HTTPLoader.crawlerUserAgent).post(new MultiProtocolURI("http://" + targetAddress(seedDB, targetHash) + "/yacy/" + filename), timeout, yacySeed.b64Hash2hexHash(targetHash)+ ".yacyh", parts); + return HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI("http://" + targetAddress(seedDB, targetHash) + "/yacy/" + filename), timeout, yacySeed.b64Hash2hexHash(targetHash)+ ".yacyh", parts); } /** @@ -138,7 +137,7 @@ public final class yacyClient { parts.put("seed", new StringBody(mySeed.genSeedStr(salt))); // send request final long start = System.currentTimeMillis(); - final byte[] content = HTTPConnector.getConnector(HTTPLoader.crawlerUserAgent).post(new MultiProtocolURI("http://" + address + "/yacy/hello.html"), 30000, yacySeed.b64Hash2hexHash(otherHash) + ".yacyh", parts); + final byte[] content = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI("http://" + address + "/yacy/hello.html"), 30000, yacySeed.b64Hash2hexHash(otherHash) + ".yacyh", parts); yacyCore.log.logInfo("yacyClient.publishMySeed thread '" + Thread.currentThread().getName() + "' contacted peer at " + address + ", received " + ((content == null) ? "null" : content.length) + " bytes, time = " + (System.currentTimeMillis() - start) + " milliseconds"); result = FileUtils.table(content); } catch (final Exception e) { @@ -331,7 +330,7 @@ public final class yacyClient { parts.put("call", new StringBody("remotecrawl")); parts.put("count", new StringBody(Integer.toString(maxCount))); parts.put("time", new StringBody(Long.toString(maxTime))); - final byte[] result = HTTPConnector.getConnector(HTTPLoader.crawlerUserAgent).post(new MultiProtocolURI("http://" + target.getClusterAddress() + "/yacy/urls.xml"), (int) maxTime, target.getHexHash() + ".yacyh", parts); + final byte[] result = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI("http://" + target.getClusterAddress() + "/yacy/urls.xml"), (int) maxTime, target.getHexHash() + ".yacyh", parts); final RSSReader reader = RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, result); if (reader == null) { yacyCore.log.logWarning("yacyClient.queryRemoteCrawlURLs failed asking peer '" + target.getName() + "': probably bad response from remote peer (1), reader == null"); @@ -629,8 +628,8 @@ public final class yacyClient { parts.put("profile", new StringBody(crypt.simpleEncode(rankingProfile.toExternalString()))); parts.put("constraint", new StringBody((constraint == null) ? "" : constraint.exportB64())); if (secondarySearchSuperviser != null) parts.put("abstracts", new StringBody("auto")); - resultMap = FileUtils.table(HTTPConnector.getConnector(HTTPLoader.crawlerUserAgent).post(new MultiProtocolURI("http://" + hostaddress + "/yacy/search.html"), 60000, hostname, parts)); - //resultMap = FileUtils.table(HTTPConnector.getConnector(HTTPLoader.crawlerUserAgent).post(new MultiProtocolURI("http://" + target.getClusterAddress() + "/yacy/search.html"), 60000, target.getHexHash() + ".yacyh", parts)); + resultMap = FileUtils.table(HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI("http://" + hostaddress + "/yacy/search.html"), 60000, hostname, parts)); + //resultMap = FileUtils.table(HTTPConnector.getConnector(MultiProtocolURI.crawlerUserAgent).post(new MultiProtocolURI("http://" + target.getClusterAddress() + "/yacy/search.html"), 60000, target.getHexHash() + ".yacyh", parts)); // evaluate request result if (resultMap == null || resultMap.isEmpty()) throw new IOException("resultMap is NULL"); @@ -750,7 +749,7 @@ public final class yacyClient { parts.put("filename", new StringBody(filename)); parts.put("filesize", new StringBody(Long.toString(filesize))); parts.put("can-send-protocol", new StringBody("http")); - final byte[] content = HTTPConnector.getConnector(HTTPLoader.crawlerUserAgent).post(new MultiProtocolURI("http://" + targetAddress + "/yacy/transfer.html"), 10000, targetAddress, parts); + final byte[] content = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI("http://" + targetAddress + "/yacy/transfer.html"), 10000, targetAddress, parts); final Map result = FileUtils.table(content); return result; } catch (final Exception e) { @@ -774,7 +773,7 @@ public final class yacyClient { parts.put("md5", new StringBody(Digest.encodeMD5Hex(file))); parts.put("access", new StringBody(access)); parts.put("filename", new ByteArrayBody(file, filename)); - final byte[] content = HTTPConnector.getConnector(HTTPLoader.crawlerUserAgent).post(new MultiProtocolURI("http://" + targetAddress + "/yacy/transfer.html"), 20000, targetAddress, parts); + final byte[] content = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI("http://" + targetAddress + "/yacy/transfer.html"), 20000, targetAddress, parts); final Map result = FileUtils.table(content); return result; } catch (final Exception e) { @@ -851,7 +850,7 @@ public final class yacyClient { parts.put("wordh", new StringBody(wordhashes)); parts.put("lurlEntry", new StringBody(((entry == null) ? "" : crypt.simpleEncode(entry.toString(), salt)))); // send request - final byte[] content = HTTPConnector.getConnector(HTTPLoader.crawlerUserAgent).post(new MultiProtocolURI("http://" + address + "/yacy/crawlReceipt.html"), 10000, target.getHexHash() + ".yacyh", parts); + final byte[] content = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI("http://" + address + "/yacy/crawlReceipt.html"), 10000, target.getHexHash() + ".yacyh", parts); return FileUtils.table(content); } catch (final Exception e) { // most probably a network time-out exception @@ -1006,7 +1005,7 @@ public final class yacyClient { parts.put("wordc", new StringBody(Integer.toString(indexes.size()))); parts.put("entryc", new StringBody(Integer.toString(indexcount))); parts.put("indexes", new StringBody(entrypost.toString())); - final byte[] content = HTTPConnector.getConnector(HTTPLoader.crawlerUserAgent).post(new MultiProtocolURI("http://" + address + "/yacy/transferRWI.html"), timeout, targetSeed.getHexHash() + ".yacyh", parts); + final byte[] content = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI("http://" + address + "/yacy/transferRWI.html"), timeout, targetSeed.getHexHash() + ".yacyh", parts); final Iterator v = FileUtils.strings(content); // this should return a list of urlhashes that are unknown @@ -1050,7 +1049,7 @@ public final class yacyClient { } try { parts.put("urlc", new StringBody(Integer.toString(urlc))); - final byte[] content = HTTPConnector.getConnector(HTTPLoader.crawlerUserAgent).post(new MultiProtocolURI("http://" + address + "/yacy/transferURL.html"), timeout, targetSeed.getHexHash() + ".yacyh", parts); + final byte[] content = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI("http://" + address + "/yacy/transferURL.html"), timeout, targetSeed.getHexHash() + ".yacyh", parts); final Iterator v = FileUtils.strings(content); final Map result = FileUtils.table(v); @@ -1072,7 +1071,7 @@ public final class yacyClient { if (address == null) { address = "localhost:8080"; } try { final LinkedHashMap parts = yacyNetwork.basicRequestParts(Switchboard.getSwitchboard(), targetSeed.hash, salt); - final byte[] content = HTTPConnector.getConnector(HTTPLoader.crawlerUserAgent).post(new MultiProtocolURI("http://" + address + "/yacy/profile.html"), 5000, targetSeed.getHexHash() + ".yacyh", parts); + final byte[] content = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI("http://" + address + "/yacy/profile.html"), 5000, targetSeed.getHexHash() + ".yacyh", parts); return FileUtils.table(content); } catch (final Exception e) { yacyCore.log.logSevere("yacyClient.getProfile error:" + e.getMessage()); @@ -1162,7 +1161,7 @@ public final class yacyClient { } byte[] res; try { - res = HTTPConnector.getConnector(HTTPLoader.crawlerUserAgent).post(url, timeout, vhost, newpost); + res = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(url, timeout, vhost, newpost); System.out.println(new String(res)); } catch (IOException e1) { Log.logException(e1); diff --git a/source/de/anomic/yacy/yacyRelease.java b/source/de/anomic/yacy/yacyRelease.java index 4a8028371..d8e4c5cd5 100644 --- a/source/de/anomic/yacy/yacyRelease.java +++ b/source/de/anomic/yacy/yacyRelease.java @@ -57,7 +57,6 @@ import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.OS; import de.anomic.crawler.CrawlProfile; -import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.search.Switchboard; import de.anomic.server.serverCore; import de.anomic.tools.CryptoLib; @@ -283,7 +282,7 @@ public final class yacyRelease extends yacyVersion { File download = null; // setup httpClient final RequestHeader reqHeader = new RequestHeader(); - reqHeader.put(HeaderFramework.USER_AGENT, HTTPLoader.yacyUserAgent); + reqHeader.put(HeaderFramework.USER_AGENT, MultiProtocolURI.yacybotUserAgent); final String name = this.getUrl().getFileName(); byte[] signatureBytes = null; diff --git a/source/de/anomic/yacy/yacySeedDB.java b/source/de/anomic/yacy/yacySeedDB.java index c121bf927..264ccda61 100644 --- a/source/de/anomic/yacy/yacySeedDB.java +++ b/source/de/anomic/yacy/yacySeedDB.java @@ -38,6 +38,7 @@ import java.util.Map; import java.util.TreeMap; import java.util.concurrent.ConcurrentHashMap; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; @@ -52,7 +53,6 @@ import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.kelondroException; -import de.anomic.crawler.retrieval.HTTPLoader; //import de.anomic.http.client.Client; import de.anomic.http.server.HTTPDemon; import de.anomic.http.server.AlternativeDomainNames; @@ -846,7 +846,7 @@ public final class yacySeedDB implements AlternativeDomainNames { final RequestHeader reqHeader = new RequestHeader(); reqHeader.put(HeaderFramework.PRAGMA, "no-cache"); reqHeader.put(HeaderFramework.CACHE_CONTROL, "no-cache"); // httpc uses HTTP/1.0 is this necessary? - reqHeader.put(HeaderFramework.USER_AGENT, HTTPLoader.yacyUserAgent); + reqHeader.put(HeaderFramework.USER_AGENT, MultiProtocolURI.yacybotUserAgent); // init http-client // final Client client = new Client(10000, reqHeader); diff --git a/source/net/yacy/cora/document/MultiProtocolURI.java b/source/net/yacy/cora/document/MultiProtocolURI.java index c58856a03..685097b01 100644 --- a/source/net/yacy/cora/document/MultiProtocolURI.java +++ b/source/net/yacy/cora/document/MultiProtocolURI.java @@ -49,7 +49,8 @@ import net.yacy.cora.protocol.http.HTTPClient; * MultiProtocolURI provides a URL object for multiple protocols like http, https, ftp, smb and file * */ -public class MultiProtocolURI implements Serializable { +public class MultiProtocolURI implements Serializable, Comparable { + private static final long serialVersionUID = -1173233022912141884L; public static final int TLD_any_zone_filter = 255; // from TLD zones can be filtered during search; this is the catch-all filter @@ -78,10 +79,47 @@ public class MultiProtocolURI implements Serializable { } } + /** + * provide system information for client identification + */ + public static final String systemOST = System.getProperty("os.arch", "no-os-arch") + " " + + System.getProperty("os.name", "no-os-name") + " " + System.getProperty("os.version", "no-os-version") + + "; " + "java " + System.getProperty("java.version", "no-java-version") + "; " + generateLocation(); + + public static final String yacybotUserAgent = "yacybot (" + systemOST +") http://yacy.net/bot.html"; + + /** + * generating the location string + * + * @return + */ + public static String generateLocation() { + String loc = System.getProperty("user.timezone", "nowhere"); + final int p = loc.indexOf('/'); + if (p > 0) { + loc = loc.substring(0, p); + } + loc = loc + "/" + System.getProperty("user.language", "dumb"); + return loc; + } + // class variables protected String protocol, host, userInfo, path, quest, ref; protected int port; + /** + * initialization of a MultiProtocolURI to produce poison pills for concurrent blocking queues + */ + public MultiProtocolURI() { + this.protocol = null; + this.host = null; + this.userInfo = null; + this.path = null; + this.quest = null; + this.ref = null; + this.port = -1; + } + public MultiProtocolURI(final File file) throws MalformedURLException { this("file", "", -1, file.getAbsolutePath()); } @@ -762,9 +800,8 @@ public class MultiProtocolURI implements Serializable { return this.toString().equals(other.toString()); } - public int compareTo(final Object h) { - assert (h instanceof MultiProtocolURI); - return this.toString().compareTo(((MultiProtocolURI) h).toString()); + public int compareTo(MultiProtocolURI h) { + return this.toString().compareTo(h.toString()); } public boolean isPOST() { @@ -1112,4 +1149,5 @@ public class MultiProtocolURI implements Serializable { } } } + } diff --git a/source/net/yacy/cora/protocol/Domains.java b/source/net/yacy/cora/protocol/Domains.java index 3a7cd0841..c42f76371 100644 --- a/source/net/yacy/cora/protocol/Domains.java +++ b/source/net/yacy/cora/protocol/Domains.java @@ -609,6 +609,28 @@ public class Domains { return localHostAddresses[0]; } + /** + * generate a list of intranet InetAddresses without the loopback address 127.0.0.1 + * @return list of all intranet addresses + */ + public static List myIntranetIPs() { + // list all local addresses + if (localHostAddresses.length < 2) try {Thread.sleep(1000);} catch (InterruptedException e) {} + ArrayList list = new ArrayList(localHostAddresses.length); + if (localHostAddresses.length == 0) { + if (localHostAddress != null && isLocal(localHostAddress.getHostAddress())) { + list.add(localHostAddress); + } + return list; + } + for (int i = 0; i < localHostAddresses.length; i++) { + if ((0Xff & localHostAddresses[i].getAddress()[0]) == 127) continue; + if (!matchesList(localHostAddresses[i].getHostAddress(), localhostPatterns)) continue; + list.add(localHostAddresses[i]); + } + return list; + } + public static int getDomainID(final String host) { if (host == null || host.length() == 0) return TLD_Local_ID; if (isLocal(host)) return TLD_Local_ID; diff --git a/source/net/yacy/cora/protocol/ftp/FTPClient.java b/source/net/yacy/cora/protocol/ftp/FTPClient.java index 684a39155..b7e5cf209 100644 --- a/source/net/yacy/cora/protocol/ftp/FTPClient.java +++ b/source/net/yacy/cora/protocol/ftp/FTPClient.java @@ -44,6 +44,7 @@ import java.io.RandomAccessFile; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.net.InetAddress; +import java.net.InetSocketAddress; import java.net.ServerSocket; import java.net.Socket; import java.net.SocketException; @@ -85,7 +86,7 @@ public class FTPClient { private Socket ControlSocket = null; // socket timeout - private static final int ControlSocketTimeout = 10000; + private static final int ControlSocketTimeout = 1000; // data socket timeout private int DataSocketTimeout = 0; // in seconds (default infinite) @@ -1515,13 +1516,14 @@ public class FTPClient { } try { - ControlSocket = new Socket(host, port); + ControlSocket = new Socket(); ControlSocket.setSoTimeout(getTimeout()); ControlSocket.setKeepAlive(true); ControlSocket.setTcpNoDelay(true); // no accumulation until buffer is full ControlSocket.setSoLinger(false, getTimeout()); // !wait for all data being written on close() ControlSocket.setSendBufferSize(1440); // read http://www.cisco.com/warp/public/105/38.shtml ControlSocket.setReceiveBufferSize(1440); // read http://www.cisco.com/warp/public/105/38.shtml + ControlSocket.connect(new InetSocketAddress(host, port), 1000); clientInput = new BufferedReader(new InputStreamReader(ControlSocket.getInputStream())); clientOutput = new DataOutputStream(new BufferedOutputStream(ControlSocket.getOutputStream())); diff --git a/source/net/yacy/cora/protocol/http/HTTPClient.java b/source/net/yacy/cora/protocol/http/HTTPClient.java index 3c37c5e50..09fdfcd16 100644 --- a/source/net/yacy/cora/protocol/http/HTTPClient.java +++ b/source/net/yacy/cora/protocol/http/HTTPClient.java @@ -42,6 +42,7 @@ import javax.net.ssl.SSLContext; import javax.net.ssl.TrustManager; import javax.net.ssl.X509TrustManager; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.protocol.ConnectionInfo; import org.apache.http.Header; @@ -93,7 +94,7 @@ public class HTTPClient { private final static int maxcon = 20; private static IdledConnectionEvictor idledConnectionEvictor = null; - private static HttpClient httpClient = null; + private static HttpClient httpClient = initConnectionManager(); private Header[] headers = null; private HttpResponse httpResponse = null; private HttpUriRequest currentRequest = null; @@ -106,19 +107,13 @@ public class HTTPClient { public HTTPClient() { super(); - if (httpClient == null) { - initConnectionManager(); - } } public static void setDefaultUserAgent(final String defaultAgent) { - if (httpClient == null) { - initConnectionManager(); - } HttpProtocolParams.setUserAgent(httpClient.getParams(), defaultAgent); } - private static void initConnectionManager() { + private static HttpClient initConnectionManager() { // Create and initialize HTTP parameters final HttpParams httpParams = new BasicHttpParams(); /** @@ -141,7 +136,7 @@ public class HTTPClient { */ HttpProtocolParams.setVersion(httpParams, HttpVersion.HTTP_1_1); // UserAgent - HttpProtocolParams.setUserAgent(httpParams, "yacy (" + systemOST +") yacy.net"); + HttpProtocolParams.setUserAgent(httpParams, MultiProtocolURI.yacybotUserAgent); HttpProtocolParams.setUseExpectContinue(httpParams, false); // IMPORTANT - if not set to 'false' then servers do not process the request until a time-out of 2 seconds /** * HTTP connection settings @@ -175,7 +170,7 @@ public class HTTPClient { idledConnectionEvictor = new IdledConnectionEvictor(clientConnectionManager); idledConnectionEvictor.start(); - + return httpClient; } /** @@ -532,35 +527,6 @@ public class HTTPClient { upbytes)); } - /** - * provide system information for client identification - */ - private static final String systemOST = System.getProperty("os.arch", "no-os-arch") + " " + - System.getProperty("os.name", "no-os-name") + " " + System.getProperty("os.version", "no-os-version") + - "; " + "java " + System.getProperty("java.version", "no-java-version") + "; " + generateLocation(); - - /** - * generating the location string - * - * @return - */ - public static String generateLocation() { - String loc = System.getProperty("user.timezone", "nowhere"); - final int p = loc.indexOf('/'); - if (p > 0) { - loc = loc.substring(0, p); - } - loc = loc + "/" + System.getProperty("user.language", "dumb"); - return loc; - } - - /** - * @return the systemOST - */ - public static String getSystemOST() { - return systemOST; - } - private static SSLSocketFactory getSSLSocketFactory() { final TrustManager trustManager = new X509TrustManager() { public void checkClientTrusted(X509Certificate[] chain, String authType) diff --git a/source/net/yacy/cora/services/Search.java b/source/net/yacy/cora/services/Search.java index eb759ddea..354fe3840 100644 --- a/source/net/yacy/cora/services/Search.java +++ b/source/net/yacy/cora/services/Search.java @@ -41,8 +41,6 @@ import net.yacy.cora.protocol.http.HTTPConnector; import org.apache.http.entity.mime.content.ContentBody; import org.apache.http.entity.mime.content.StringBody; -import de.anomic.crawler.retrieval.HTTPLoader; - public class Search { public static BlockingQueue search(String rssSearchServiceURL, String query, boolean verify, boolean global, long timeout, int maximumRecords) { @@ -128,7 +126,7 @@ public class Search { parts.put("maximumRecords", new StringBody(Long.toString(maximumRecords))); parts.put("verify", new StringBody(verify ? "true" : "false")); parts.put("resource", new StringBody(global ? "global" : "local")); - final byte[] result = HTTPConnector.getConnector(HTTPLoader.yacyUserAgent).post(new MultiProtocolURI(rssSearchServiceURL), (int) timeout, uri.getHost(), parts); + final byte[] result = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI(rssSearchServiceURL), (int) timeout, uri.getHost(), parts); //String debug = new String(result); System.out.println("*** DEBUG: " + debug); final RSSReader reader = RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, result); if (reader == null) { diff --git a/source/net/yacy/document/parser/htmlParser.java b/source/net/yacy/document/parser/htmlParser.java index d0946769c..c04768798 100644 --- a/source/net/yacy/document/parser/htmlParser.java +++ b/source/net/yacy/document/parser/htmlParser.java @@ -32,8 +32,6 @@ import java.util.regex.Pattern; import com.ibm.icu.text.CharsetDetector; -import de.anomic.crawler.retrieval.HTTPLoader; - import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.AbstractParser; import net.yacy.document.Document; @@ -268,7 +266,7 @@ public class htmlParser extends AbstractParser implements Parser { MultiProtocolURI url; try { url = new MultiProtocolURI(args[0]); - byte[] content = url.get(HTTPLoader.crawlerUserAgent, 3000); + byte[] content = url.get(MultiProtocolURI.yacybotUserAgent, 3000); Document[] document = new htmlParser().parse(url, "text/html", null, new ByteArrayInputStream(content)); String title = document[0].dc_title(); System.out.println(title); diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index 076caa33e..8fd8e751f 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -201,7 +201,7 @@ public final class LoaderDispatcher { // create request header values and a response object because we need that // in case that we want to return the cached content in the next step final RequestHeader requestHeader = new RequestHeader(); - requestHeader.put(HeaderFramework.USER_AGENT, HTTPLoader.crawlerUserAgent); + requestHeader.put(HeaderFramework.USER_AGENT, MultiProtocolURI.yacybotUserAgent); DigestURI refererURL = null; if (request.referrerhash() != null) refererURL = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash()); if (refererURL != null) requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true, true)); diff --git a/source/net/yacy/yacy.java b/source/net/yacy/yacy.java index de8f629ed..40b88df66 100644 --- a/source/net/yacy/yacy.java +++ b/source/net/yacy/yacy.java @@ -46,6 +46,7 @@ import java.util.concurrent.Semaphore; import java.util.zip.ZipEntry; import java.util.zip.ZipOutputStream; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.http.HTTPClient; import net.yacy.gui.YaCyApp; @@ -289,7 +290,7 @@ public final class yacy { // set user-agent final String userAgent = "yacy/" + Double.toString(version) + " (www.yacy.net; " - + HTTPClient.getSystemOST() + ")"; + + MultiProtocolURI.systemOST + ")"; HTTPClient.setDefaultUserAgent(userAgent); // start main threads