yacy_search_server/htroot/api/webstructure.java
orbiter 67aaffc0a2 - added Latency control to the crawler:
because of the strongly enhanced indexing speed when using the new IndexCell RWI data structures (> 2000PPM on my notebook), it is now necessary to control the crawling speed depending on the response time of the target server (which is also YaCy in case of some intranet indexing use cases).
The latency factor in crawl delay times is derived from the time that a target host takes to answer HTTP requests. For internet domains, the crawl delay is a minimum of twice the response time; in intranet cases the delay time is now half of the response time.

- added API to monitor the latency times of the crawler:
a new API at /api/latency_p.xml returns the current response times of domains, the time the crawler last accessed each domain, and many more attributes.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5733 6c8d7289-2bf4-0310-a012-ef5d649a1542
2009-03-20 10:21:23 +00:00

76 lines
3.1 KiB
Java

// webstructure.java
// ------------
// (C) 2009 by Michael Peter Christen; mc@yacy.net
// first published 01.05.2008 on http://yacy.net
//
// $LastChangedDate: 2009-03-16 19:08:43 +0100 (Mo, 16 Mrz 2009) $
// $LastChangedRevision: 5723 $
// $LastChangedBy: borg-0300 $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.util.Iterator;
import java.util.Map;
import de.anomic.http.httpRequestHeader;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWebStructure;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
public class webstructure {

    /**
     * Fills the template properties with one record per web structure entry
     * and, nested below each record, one record per resolvable reference.
     *
     * <p>Property keys written:
     * <ul>
     *   <li>{@code domains_<c>_hash}, {@code domains_<c>_domain}, {@code domains_<c>_date}
     *       for every entry returned by the structure iterator</li>
     *   <li>{@code domains_<c>_citations_<d>_refhash}, {@code ..._refdom}, {@code ..._refcount}
     *       for every reference whose hash resolves to a domain string</li>
     *   <li>{@code domains_<c>_citations} with the number of references emitted for entry c</li>
     *   <li>{@code domains} with the number of entries and {@code maxref} with
     *       {@code plasmaWebStructure.maxref}</li>
     * </ul>
     *
     * @param header the request header; not read by this method
     * @param post   the request arguments, may be {@code null}; the presence of the key
     *               {@code "latest"} is passed on to the structure iterator
     * @param env    the server environment; cast to {@code plasmaSwitchboard}
     * @return the filled property set
     */
    public static serverObjects respond(final httpRequestHeader header, final serverObjects post, final serverSwitch<?> env) {
        final serverObjects prop = new serverObjects();
        final plasmaSwitchboard sb = (plasmaSwitchboard) env;
        final boolean latest = post != null && post.containsKey("latest");

        final Iterator<plasmaWebStructure.structureEntry> entries = sb.webStructure.structureEntryIterator(latest);
        int domainCount = 0;
        while (entries.hasNext()) {
            final plasmaWebStructure.structureEntry entry = entries.next();
            final String domainKey = "domains_" + domainCount;
            prop.put(domainKey + "_hash", entry.domhash);
            prop.put(domainKey + "_domain", entry.domain);
            prop.put(domainKey + "_date", entry.date);

            int citationCount = 0;
            for (final Map.Entry<String, Integer> reference : entry.references.entrySet()) {
                final String refhash = reference.getKey();
                final String refdom = sb.webStructure.resolveDomHash2DomString(refhash);
                if (refdom == null) {
                    // references whose hash cannot be resolved are left out and not counted
                    continue;
                }
                final String citationKey = domainKey + "_citations_" + citationCount;
                prop.put(citationKey + "_refhash", refhash);
                prop.put(citationKey + "_refdom", refdom);
                prop.put(citationKey + "_refcount", reference.getValue().intValue());
                citationCount++;
            }
            prop.put(domainKey + "_citations", citationCount);
            domainCount++;
        }

        prop.put("domains", domainCount);
        prop.put("maxref", plasmaWebStructure.maxref);

        // only after a "latest" listing has been produced is joinOldNew() invoked
        if (latest) {
            sb.webStructure.joinOldNew();
        }

        // return rewrite properties
        return prop;
    }
}