yacy_search_server/htroot/yacy/hello.java

274 lines
14 KiB
Java
Raw Normal View History

// hello.java
// -----------------------
// part of the AnomicHTTPD caching proxy
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// You must compile this file with
// javac -classpath .:../../classes hello.java
// if the shell's current path is HTROOT
import java.io.IOException;
import java.net.InetAddress;
import java.net.MalformedURLException;
import java.util.Iterator;
import java.util.Set;
import java.util.concurrent.ConcurrentMap;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.peers.Network;
2012-09-21 16:46:57 +02:00
import net.yacy.peers.DHTSelection;
import net.yacy.peers.Protocol;
import net.yacy.peers.Seed;
import net.yacy.peers.graphics.ProfilingGraph;
2012-05-04 17:28:27 +02:00
import net.yacy.search.EventTracker;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
2012-09-21 15:48:16 +02:00
import net.yacy.server.serverCore;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
2005-10-05 12:45:33 +02:00
public final class hello {
// example:
// http://localhost:8090/yacy/hello.html?count=1&seed=p|{Hash=sCJ6Tq8T0N9x,Port=8090,PeerType=junior}
// http://localhost:8090/yacy/hello.html?count=10&seed=z|H4sIAAAAAAAAADWQW2vDMAyF_81eJork3GyGX-YxGigly2WFvZTQijbQJsHx1pWx_z7nMj1J4ug7B_2s6-GsP5q3G-G6vBz2e0iz8t6zfuBr7-5PUNanQfulhqyzTkuUCFXvmitrBJtq4ed3tkPTtRpXhIiRDAmq0uhHFIiQMduJ-NXYU9NCbrrP1vnjIdUqgk09uIK51V6rMBRIilAo2NajwzfhGcx8QUKsEIp5iCJo-eaTVUXPfPQ4k5dm4pp8NzaESsLzS-14QVNIMlA-ka2m1JuZJJWIBRwPo0GIIiYp4zCSkC5GQSLiJIah0p6X_rvlS-MTbWdhkCSBIni9jA_rfP3-Ae1Oye9dAQAA
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
final Switchboard sb = (Switchboard) env;
final serverObjects prop = new serverObjects();
final long start = System.currentTimeMillis();
prop.put("message", "none");
String clientip = header.getRemoteAddr();
2014-12-21 17:53:06 +01:00
//ConcurrentLog.info("**hello-DEBUG**", "client request from = " + clientip);
final InetAddress ias = Domains.dnsResolve(clientip);
long time = System.currentTimeMillis();
final long time_dnsResolve = System.currentTimeMillis() - time;
if (ias == null) {
if (clientip == null) clientip = "<unknown>";
Network.log.info("hello/server: failed contacting seed; clientip not resolvable (clientip=" + clientip + ", time_dnsResolve=" + time_dnsResolve + ")");
prop.put("message", "cannot resolve your IP from your reported location " + clientip);
return prop;
}
prop.put("yourip", ias.getHostAddress());
prop.put(Seed.YOURTYPE, Seed.PEERTYPE_VIRGIN); // a default value
prop.put("seedlist", "");
if ((post == null) || (env == null)) {
prop.put("message", "no post or no enviroment");
return prop;
}
if (!Protocol.authentifyRequest(post, env)) {
prop.put("message", "not in my network");
return prop;
}
// final String iam = (String) post.get("iam", ""); // complete seed of the requesting peer
// final String mytime = (String) post.get(MYTIME, ""); //
final String key = post.get("key", ""); // transmission key for response
final String seed = post.get("seed", "");
int count = post.getInt("count", 0);
// final long magic = post.getLong("magic", 0);
// final Date remoteTime = yacyCore.parseUniversalDate(post.get(MYTIME)); // read remote time
if (seed.length() > Seed.maxsize) {
Network.log.info("hello/server: rejected contacting seed; too large (" + seed.length() + " > " + Seed.maxsize + ", time_dnsResolve=" + time_dnsResolve + ")");
prop.put("message", "your seed is too long (" + seed.length() + ")");
return prop;
}
Seed remoteSeed;
try {
2012-07-05 10:23:07 +02:00
remoteSeed = Seed.genRemoteSeed(seed, true, ias.getHostAddress());
} catch (final IOException e) {
Network.log.info("hello/server: bad seed: " + e.getMessage() + ", time_dnsResolve=" + time_dnsResolve);
prop.put("message", "bad seed: " + e.getMessage());
return prop;
}
if (remoteSeed == null || remoteSeed.hash == null) {
Network.log.info("hello/server: bad seed: null, time_dnsResolve=" + time_dnsResolve);
prop.put("message", "cannot parse your seed");
return prop;
}
// we easily know the caller's IP:
final String userAgent = header.get(HeaderFramework.USER_AGENT, "<unknown>");
2012-08-16 16:28:57 +02:00
sb.peers.peerActions.setUserAgent(clientip, userAgent);
final Set<String> reportedips = remoteSeed.getIPs();
final String reportedPeerType = remoteSeed.get(Seed.PEERTYPE, Seed.PEERTYPE_JUNIOR);
//final double clientversion = remoteSeed.getVersion();
if (remoteSeed.getPort() == sb.peers.mySeed().getPort()) {
2014-10-08 12:38:56 +02:00
if (sb.peers.mySeed().clash(reportedips)) {
// reject a self-ping
prop.put("message", "I am I");
return prop;
}
}
if (remoteSeed.hash.equals(sb.peers.mySeed().hash)) {
// reject a ping with my own hash
prop.put("message", "You are using my peer hash");
return prop;
}
/*
if (remoteSeed.getName().equals(sb.peers.mySeed().getName())) {
// reject a ping with my name
prop.put("message", "You are using my name");
return prop;
}
*/
if (sb.isRobinsonMode() && !sb.isPublicRobinson()) {
// if we are a robinson cluster, answer only if this client is known by our network definition
prop.put("message", "I am robinson, I do not answer");
return prop;
}
long[] callback = new long[]{-1, -1};
// if the remote client has reported its own IP address and the client supports
// the port forwarding feature (if client version >= 0.383) then we try to
// connect to the reported IP address first
long time_backping = 0;
String backping_method = "none";
boolean success = false;
// TODO: make this a concurrent process
if (!serverCore.useStaticIP || !ias.isSiteLocalAddress()) {
reportedips.add(ias.getHostAddress());
}
final int connectedBefore = sb.peers.sizeConnected();
2014-12-21 17:53:06 +01:00
//ConcurrentLog.info("**hello-DEBUG**", "peer " + remoteSeed.getName() + " challenged us with IPs " + reportedips);
final boolean preferHttps = sb.getConfigBool(SwitchboardConstants.NETWORK_PROTOCOL_HTTPS_PREFERRED,
SwitchboardConstants.NETWORK_PROTOCOL_HTTPS_PREFERRED_DEFAULT);
final int totalTimeout = preferHttps ? 13000 : 6500;
int callbackRemain = Math.min(5, reportedips.size());
final long callbackStart = System.currentTimeMillis();
if (callbackRemain > 0 && reportedips.size() > 0) {
for (String reportedip: reportedips) {
int partialtimeout = ((int) (callbackStart + totalTimeout - System.currentTimeMillis())) / callbackRemain; // bad hack until a concurrent version is implemented
if (partialtimeout <= 0) break;
2014-12-21 17:53:06 +01:00
//ConcurrentLog.info("**hello-DEBUG**", "reportedip = " + reportedip + " is handled");
if (Seed.isProperIP(reportedip)) {
2014-12-21 17:53:06 +01:00
//ConcurrentLog.info("**hello-DEBUG**", "starting callback to reportedip = " + reportedip + ", timeout = " + partialtimeout);
prop.put("yourip", reportedip);
remoteSeed.setIP(reportedip);
time = System.currentTimeMillis();
try {
MultiProtocolURL remoteBaseURL = remoteSeed.getPublicMultiprotocolURL(reportedip, preferHttps);
callback = Protocol.queryRWICount(remoteBaseURL, remoteSeed, partialtimeout);
if (callback[0] < 0 && remoteBaseURL.isHTTPS()) {
/* Failed using https : retry using http */
remoteBaseURL = remoteSeed.getPublicMultiprotocolURL(reportedip, false);
callback = Protocol.queryRWICount(remoteBaseURL, remoteSeed, partialtimeout);
}
} catch(final MalformedURLException e) {
callback = new long[] {-1, -1};
}
2014-12-21 17:53:06 +01:00
//ConcurrentLog.info("**hello-DEBUG**", "reportedip = " + reportedip + " returns callback " + (callback == null ? "NULL" : callback[0]));
time_backping = System.currentTimeMillis() - time;
backping_method = "reportedip=" + reportedip;
if (callback[0] >= 0) {
success = true;
break;
}
if (--callbackRemain <= 0) break; // no more tries left / restrict to a limited number of ips
}
}
}
if (success) {
2014-12-21 17:53:06 +01:00
//ConcurrentLog.info("**hello-DEBUG**", "success for IP(s) " + remoteSeed.getIPs() + ", port " + remoteSeed.getPort());
if (remoteSeed.get(Seed.PEERTYPE, Seed.PEERTYPE_SENIOR) == null) {
prop.put(Seed.YOURTYPE, Seed.PEERTYPE_SENIOR);
remoteSeed.put(Seed.PEERTYPE, Seed.PEERTYPE_SENIOR);
} else if (remoteSeed.get(Seed.PEERTYPE, Seed.PEERTYPE_PRINCIPAL).equals(Seed.PEERTYPE_PRINCIPAL)) {
prop.put(Seed.YOURTYPE, Seed.PEERTYPE_PRINCIPAL);
} else {
prop.put(Seed.YOURTYPE, Seed.PEERTYPE_SENIOR);
remoteSeed.put(Seed.PEERTYPE, Seed.PEERTYPE_SENIOR);
}
// connect the seed
Network.log.info("hello/server: responded remote " + reportedPeerType + " peer '" + remoteSeed.getName() + "' from " + reportedips + ", time_dnsResolve=" + time_dnsResolve + ", time_backping=" + time_backping + ", method=" + backping_method + ", urls=" + callback[0]);
sb.peers.peerActions.peerArrival(remoteSeed, true);
} else {
2014-12-21 17:53:06 +01:00
//ConcurrentLog.info("**hello-DEBUG**", "fail for IP(s) " + remoteSeed.getIPs() + ", port " + remoteSeed.getPort());
prop.put("yourip", ias.getHostAddress());
remoteSeed.setIP(ias.getHostAddress());
prop.put(Seed.YOURTYPE, Seed.PEERTYPE_JUNIOR);
remoteSeed.put(Seed.PEERTYPE, Seed.PEERTYPE_JUNIOR);
Network.log.info("hello/server: responded remote " + reportedPeerType + " peer '" + remoteSeed.getName() + "' from " + reportedips + ", time_dnsResolve=" + time_dnsResolve + ", time_backping=" + time_backping + ", method=" + backping_method + ", urls=" + callback[0]);
// no connection here, instead store junior in connection cache
if ((remoteSeed.hash != null) && (remoteSeed.isProper(false) == null)) {
sb.peers.peerActions.peerPing(remoteSeed);
}
}
remoteSeed.setLastSeenUTC();
final int connectedAfter = sb.peers.sizeConnected();
// update event tracker
EventTracker.update(EventTracker.EClass.PEERPING, new ProfilingGraph.EventPing(remoteSeed.getName(), sb.peers.myName(), false, connectedAfter - connectedBefore), false);
if (!(prop.get(Seed.YOURTYPE)).equals(reportedPeerType)) {
Network.log.info("hello/server: changing remote peer '" + remoteSeed.getName() + "' " + reportedips + " peerType from '" + reportedPeerType + "' to '" + prop.get(Seed.YOURTYPE) + "'.");
}
final StringBuilder seeds = new StringBuilder(768);
// attach some more seeds, as requested
if (sb.peers.sizeConnected() > 0) {
if (count > sb.peers.sizeConnected()) { count = sb.peers.sizeConnected(); }
if (count > 100) { count = 100; }
// latest seeds
final ConcurrentMap<String, Seed> ySeeds = DHTSelection.seedsByAge(sb.peers, true, count); // peerhash/yacySeed relation
// attach also my own seed
seeds.append("seed0=").append(sb.peers.mySeed().genSeedStr(key)).append(serverCore.CRLF_STRING);
count = 1;
// attach other seeds
if (ySeeds != null) {
seeds.ensureCapacity((ySeeds.size() + 1) * 768);
final Iterator<Seed> si = ySeeds.values().iterator();
Seed s;
String seedString;
while (si.hasNext()) {
s = si.next();
if ((s != null) && (s.isProper(false) == null)) {
seedString = s.genSeedStr(key);
if (seedString != null) {
seeds.append("seed").append(count).append('=').append(seedString).append(serverCore.CRLF_STRING);
count++;
}
}
}
}
} else {
// attach also my own seed
seeds.append("seed0=").append(sb.peers.mySeed().genSeedStr(key)).append(serverCore.CRLF_STRING);
}
* Complete number localization and provide a more reasonable interface to serverObjects: - put(key, value) methods are now used if a value added to the map should be kept as it is. Numbers are transformed (but not formatted) to an equivalent String representation. - putASIS(...) have been removed, now done with simple put(...) (see above). - puNum(...) can be used for number values which should be stored in a formatted way, either depending on the current locale setting for yacy (default) or in a "none" locale (see javadocs and setLocalize()). - putHTML(...) escapes special characters into corresponding HTML enities ('<' => '&lt;') which was done with put(...) before and so was called too often, becauses it is necessary only for very few cases. Additionally there is a "forXML" mode which only replaces < > & ". In short: Use put(...) for almost everything, use putXY(...) if you need some special transformation of the value. A few bugs have been fixed as well, and there should be a small performance improvement for complex pages with a lot of values. * added additional Sum/Avg rows to access tracker pages, see http://forum.yacy-websuche.de/viewtopic.php?f=5&t=456 * removed duplicate code (mostly related to the big changes above). TODO: - make sure, number formats work as expected _everywhere_, report overseen stuff http://forum.yacy-websuche.de/viewtopic.php?f=5&t=437 - probably a good idea to add special putDate() methods as they are used in many pages and create duplicated formatting code + maybe some centralized handling for memory value formatting. - further improve the speed of page creation for the WatchCrawler. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4178 6c8d7289-2bf4-0310-a012-ef5d649a1542
2007-10-24 23:38:19 +02:00
prop.put("seedlist", seeds.toString());
// return rewrite properties
prop.put("message", "ok " + seed.length());
Network.log.info("hello/server: responded remote peer '" + remoteSeed.getName() + "' " + reportedips + " in " + (System.currentTimeMillis() - start) + " milliseconds");
return prop;
}
}