mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-21 00:00:13 +02:00
d2ba1fd2ab
This change is inspired by the need to see a network connected to the index it creates in an indexing team. It is not possible to divide the network and the index. Therefore all control files for the network were moved to the network within the INDEX/<network-name> subfolder. The remaining YACYDB is superfluous and can be deleted. The yacyDB and yacyNews data structures are now part of plasmaWordIndex. Therefore all methods using static access to yacySeedDB had to be rewritten. A special problem was the port forwarding methods, which had been tightly mixed with seed construction. It was not possible to move the port forwarding functions to the place, meaning and usage of plasmaWordIndex. Therefore the port forwarding has been deleted (I guess nobody used it and it can be simulated by methods outside of YaCy). The mySeed.txt is automatically moved to the current network position. A new effect is that every network will create a different local seed file, which is ok, since the seed identifies the peer only against the network (it is the purpose of the seed hash to give a peer a location within the DHT). No other functional change has been made. The next steps to enable network switching are: - shift of crawler tables from PLASMADB into the network (crawls are also network-specific) - possibly shift of plasmaWordIndex code into yacy package (index management is network-specific) - servlet to switch networks git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4765 6c8d7289-2bf4-0310-a012-ef5d649a1542
398 lines
20 KiB
Java
398 lines
20 KiB
Java
// yacyPeerActions.java
|
|
// -------------------------------------
|
|
// (C) by Michael Peter Christen; mc@anomic.de
|
|
// first published on http://www.anomic.de
|
|
// Frankfurt, Germany, 2005
|
|
//
|
|
// $LastChangedDate$
|
|
// $LastChangedRevision$
|
|
// $LastChangedBy$
|
|
//
|
|
// This program is free software; you can redistribute it and/or modify
|
|
// it under the terms of the GNU General Public License as published by
|
|
// the Free Software Foundation; either version 2 of the License, or
|
|
// (at your option) any later version.
|
|
//
|
|
// This program is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU General Public License for more details.
|
|
//
|
|
// You should have received a copy of the GNU General Public License
|
|
// along with this program; if not, write to the Free Software
|
|
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
//
|
|
// Using this software in any meaning (reading, learning, copying, compiling,
|
|
// running) means that you agree that the Author(s) is (are) not responsible
|
|
// for cost, loss of data or any harm that may be caused directly or indirectly
|
|
// by usage of this softare or this documentation. The usage of this software
|
|
// is on your own risk. The installation and usage (starting/running) of this
|
|
// software may allow other people or application to access your computer and
|
|
// any attached devices and is highly dependent on the configuration of the
|
|
// software which must be done by the user of the software; the author(s) is
|
|
// (are) also not responsible for proper configuration and usage of the
|
|
// software, even if provoked by documentation provided together with
|
|
// the software.
|
|
//
|
|
// Any changes to this file according to the GPL as documented in the file
|
|
// gpl.txt aside this file in the shipment you received can be done to the
|
|
// lines that follows this copyright notice here, but changes must not be
|
|
// done inside the copyright notive above. A re-distribution must contain
|
|
// the intact and unchanged copyright notice.
|
|
// Contributions and changes to the program code must be marked as such.
|
|
|
|
package de.anomic.yacy;
|
|
|
|
import java.io.IOException;
|
|
import java.util.ArrayList;
|
|
import java.util.HashMap;
|
|
import java.util.HashSet;
|
|
import java.util.Iterator;
|
|
|
|
import de.anomic.http.HttpClient;
|
|
import de.anomic.http.httpHeader;
|
|
import de.anomic.plasma.plasmaCrawlNURL;
|
|
import de.anomic.plasma.plasmaSwitchboard;
|
|
import de.anomic.server.serverCore;
|
|
import de.anomic.server.serverDate;
|
|
import de.anomic.tools.nxTools;
|
|
|
|
public class yacyPeerActions {
|
|
|
|
// seed database handed to the constructor; all peer bookkeeping in this class goes through it
private yacySeedDB seedDB;
|
|
// switchboard handed to the constructor; used here for configuration values and counters
private plasmaSwitchboard sb;
|
|
// listeners registered through deploy(); notified on peer arrival, departure and ping
private HashSet<yacyPeerAction> actions;
|
|
// user-agent strings keyed by the IP string passed to setUserAgent()
private HashMap<String, String> userAgents;
|
|
// set to 0 in the constructor; no method of this class increments it
public long juniorConnects;
|
|
// incremented in connectPeer() when a previously unknown senior peer is stored
public long seniorConnects;
|
|
// incremented in connectPeer() when a previously unknown principal peer is stored
public long principalConnects;
|
|
// incremented in disconnectPeer() when a peer was not yet in the disconnected table
public long disconnects;
|
|
// config value "bootstrapLoadTimeout" (default 6000); compared against a millisecond
// load time and passed as timeout to HttpClient.wget() in loadSeedLists()
private int bootstrapLoadTimeout;
|
|
|
|
public yacyPeerActions(yacySeedDB seedDB, plasmaSwitchboard switchboard) {
|
|
this.seedDB = seedDB;
|
|
this.sb = switchboard;
|
|
this.actions = new HashSet<yacyPeerAction>();
|
|
this.userAgents = new HashMap<String, String>();
|
|
this.juniorConnects = 0;
|
|
this.seniorConnects = 0;
|
|
this.principalConnects = 0;
|
|
this.disconnects = 0;
|
|
this.bootstrapLoadTimeout = (int) switchboard.getConfigLong("bootstrapLoadTimeout", 6000);
|
|
}
|
|
|
|
public void deploy(yacyPeerAction action) {
|
|
actions.add(action);
|
|
}
|
|
|
|
public void updateMySeed() {
|
|
if (sb.getConfig("peerName", "anomic").equals("anomic")) {
|
|
// generate new peer name
|
|
sb.setConfig("peerName", yacySeed.makeDefaultPeerName());
|
|
}
|
|
seedDB.mySeed().put(yacySeed.NAME, sb.getConfig("peerName", "nameless"));
|
|
seedDB.mySeed().put(yacySeed.PORT, Integer.toString(serverCore.getPortNr(sb.getConfig("port", "8080"))));
|
|
|
|
long uptime = (System.currentTimeMillis() - serverCore.startupTime) / 1000;
|
|
long uptimediff = uptime - sb.lastseedcheckuptime;
|
|
long indexedcdiff = sb.indexedPages - sb.lastindexedPages;
|
|
//double requestcdiff = sb.requestedQueries - sb.lastrequestedQueries;
|
|
if (uptimediff > 300 || uptimediff <= 0 || sb.lastseedcheckuptime == -1 ) {
|
|
sb.lastseedcheckuptime = uptime;
|
|
sb.lastindexedPages = sb.indexedPages;
|
|
sb.lastrequestedQueries = sb.requestedQueries;
|
|
}
|
|
|
|
//the speed of indexing (pages/minute) of the peer
|
|
sb.totalPPM = (int) (sb.indexedPages * 60 / Math.max(uptime, 1));
|
|
seedDB.mySeed().put(yacySeed.ISPEED, Long.toString(Math.round(Math.max((float) indexedcdiff, 0f) * 60f / Math.max((float) uptimediff, 1f))));
|
|
sb.totalQPM = sb.requestedQueries * 60d / Math.max((double) uptime, 1d);
|
|
seedDB.mySeed().put(yacySeed.RSPEED, Double.toString(sb.totalQPM /*Math.max((float) requestcdiff, 0f) * 60f / Math.max((float) uptimediff, 1f)*/ ));
|
|
|
|
seedDB.mySeed().put(yacySeed.UPTIME, Long.toString(uptime/60)); // the number of minutes that the peer is up in minutes/day (moving average MA30)
|
|
seedDB.mySeed().put(yacySeed.LCOUNT, Integer.toString(sb.wordIndex.countURL())); // the number of links that the peer has stored (LURL's)
|
|
seedDB.mySeed().put(yacySeed.NCOUNT, Integer.toString(sb.crawlQueues.noticeURL.size())); // the number of links that the peer has noticed, but not loaded (NURL's)
|
|
seedDB.mySeed().put(yacySeed.RCOUNT, Integer.toString(sb.crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT))); // the number of links that the peer provides for remote crawling (ZURL's)
|
|
seedDB.mySeed().put(yacySeed.ICOUNT, Integer.toString(sb.wordIndex.size())); // the minimum number of words that the peer has indexed (as it says)
|
|
seedDB.mySeed().put(yacySeed.SCOUNT, Integer.toString(seedDB.sizeConnected())); // the number of seeds that the peer has stored
|
|
seedDB.mySeed().put(yacySeed.CCOUNT, Double.toString(((int) ((seedDB.sizeConnected() + seedDB.sizeDisconnected() + seedDB.sizePotential()) * 60.0 / (uptime + 1.01)) * 100) / 100.0)); // the number of clients that the peer connects (as connects/hour)
|
|
seedDB.mySeed().put(yacySeed.VERSION, sb.getConfig("version", ""));
|
|
if (seedDB.mySeed().get(yacySeed.PEERTYPE,"").equals(yacySeed.PEERTYPE_PRINCIPAL)) {
|
|
// attach information about seed location
|
|
seedDB.mySeed().put("seedURL", sb.getConfig("seedURL", ""));
|
|
}
|
|
seedDB.mySeed().setFlagDirectConnect(true);
|
|
seedDB.mySeed().setLastSeenUTC();
|
|
seedDB.mySeed().put(yacySeed.UTC, serverDate.UTCDiffString());
|
|
seedDB.mySeed().setFlagAcceptRemoteCrawl(sb.getConfig("crawlResponse", "").equals("true"));
|
|
seedDB.mySeed().setFlagAcceptRemoteIndex(sb.getConfig("allowReceiveIndex", "").equals("true"));
|
|
//mySeed.setFlagAcceptRemoteIndex(true);
|
|
}
|
|
|
|
public void saveMySeed() {
|
|
try {
|
|
seedDB.mySeed().save(sb.getOwnSeedFile());
|
|
} catch (IOException e) {}
|
|
}
|
|
|
|
public void loadSeedLists() {
|
|
// uses the superseed to initialize the database with known seeds
|
|
|
|
yacySeed ys;
|
|
String seedListFileURL;
|
|
yacyURL url;
|
|
ArrayList<String> seedList;
|
|
Iterator<String> enu;
|
|
int lc;
|
|
int sc = seedDB.sizeConnected();
|
|
httpHeader header;
|
|
|
|
yacyCore.log.logInfo("BOOTSTRAP: " + sc + " seeds known from previous run");
|
|
|
|
// - use the superseed to further fill up the seedDB
|
|
int ssc = 0, c = 0;
|
|
while (true) {
|
|
if (Thread.currentThread().isInterrupted()) break;
|
|
seedListFileURL = sb.getConfig("network.unit.bootstrap.seedlist" + c, "");
|
|
if (seedListFileURL.length() == 0) break;
|
|
c++;
|
|
if (
|
|
seedListFileURL.startsWith("http://") ||
|
|
seedListFileURL.startsWith("https://")
|
|
) {
|
|
// load the seed list
|
|
try {
|
|
httpHeader reqHeader = new httpHeader();
|
|
reqHeader.put(httpHeader.PRAGMA,"no-cache");
|
|
reqHeader.put(httpHeader.CACHE_CONTROL,"no-cache");
|
|
|
|
url = new yacyURL(seedListFileURL, null);
|
|
long start = System.currentTimeMillis();
|
|
header = HttpClient.whead(url.toString(), reqHeader);
|
|
long loadtime = System.currentTimeMillis() - start;
|
|
if (header == null) {
|
|
if (loadtime > this.bootstrapLoadTimeout) {
|
|
yacyCore.log.logWarning("BOOTSTRAP: seed-list URL " + seedListFileURL + " not available, time-out after " + loadtime + " milliseconds");
|
|
} else {
|
|
yacyCore.log.logWarning("BOOTSTRAP: seed-list URL " + seedListFileURL + " not available, no content");
|
|
}
|
|
} else if (header.lastModified() == null) {
|
|
yacyCore.log.logWarning("BOOTSTRAP: seed-list URL " + seedListFileURL + " not usable, last-modified is missing");
|
|
} else if ((header.age() > 86400000) && (ssc > 0)) {
|
|
yacyCore.log.logInfo("BOOTSTRAP: seed-list URL " + seedListFileURL + " too old (" + (header.age() / 86400000) + " days)");
|
|
} else {
|
|
ssc++;
|
|
final byte[] content = HttpClient.wget(url.toString(), reqHeader, null, bootstrapLoadTimeout);
|
|
seedList = nxTools.strings(content, "UTF-8");
|
|
enu = seedList.iterator();
|
|
lc = 0;
|
|
while (enu.hasNext()) {
|
|
ys = yacySeed.genRemoteSeed((String) enu.next(), null, true);
|
|
if ((ys != null) && (ys.isProper() == null) &&
|
|
((!seedDB.mySeedIsDefined()) || (seedDB.mySeed().hash != ys.hash))) {
|
|
if (connectPeer(ys, false)) lc++;
|
|
//seedDB.writeMap(ys.hash, ys.getMap(), "init");
|
|
//System.out.println("BOOTSTRAP: received peer " + ys.get(yacySeed.NAME, "anonymous") + "/" + ys.getAddress());
|
|
//lc++;
|
|
}
|
|
}
|
|
yacyCore.log.logInfo("BOOTSTRAP: " + lc + " seeds from seed-list URL " + seedListFileURL + ", AGE=" + (header.age() / 3600000) + "h");
|
|
}
|
|
|
|
} catch (IOException e) {
|
|
// this is when wget fails, commonly because of timeout
|
|
yacyCore.log.logWarning("BOOTSTRAP: failed (1) to load seeds from seed-list URL " + seedListFileURL + ": " + e.getMessage());
|
|
} catch (Exception e) {
|
|
// this is when wget fails; may be because of missing internet connection
|
|
yacyCore.log.logSevere("BOOTSTRAP: failed (2) to load seeds from seed-list URL " + seedListFileURL + ": " + e.getMessage(), e);
|
|
}
|
|
}
|
|
}
|
|
yacyCore.log.logInfo("BOOTSTRAP: " + (seedDB.sizeConnected() - sc) + " new seeds while bootstraping.");
|
|
}
|
|
|
|
private synchronized boolean connectPeer(yacySeed seed, boolean direct) {
|
|
// store a remote peer's seed
|
|
// returns true if the peer is new and previously unknown
|
|
if (seed == null) {
|
|
yacyCore.log.logSevere("connect: WRONG seed (NULL)");
|
|
return false;
|
|
}
|
|
final String error = seed.isProper();
|
|
if (error != null) {
|
|
yacyCore.log.logSevere("connect: WRONG seed (" + seed.getName() + "/" + seed.hash + "): " + error);
|
|
return false;
|
|
}
|
|
if ((this.seedDB.mySeedIsDefined()) && (seed.hash.equals(this.seedDB.mySeed().hash))) {
|
|
yacyCore.log.logInfo("connect: SELF reference " + seed.getPublicAddress());
|
|
return false;
|
|
}
|
|
final String peerType = seed.get(yacySeed.PEERTYPE, yacySeed.PEERTYPE_VIRGIN);
|
|
|
|
if ((peerType.equals(yacySeed.PEERTYPE_VIRGIN)) || (peerType.equals(yacySeed.PEERTYPE_JUNIOR))) {
|
|
// reject unqualified seeds
|
|
yacyCore.log.logFine("connect: rejecting NOT QUALIFIED " + peerType + " seed " + seed.getName());
|
|
return false;
|
|
}
|
|
|
|
final yacySeed doubleSeed = this.seedDB.lookupByIP(seed.getInetAddress(), true, false, false);
|
|
if ((doubleSeed != null) && (doubleSeed.getPort() == seed.getPort()) && (!(doubleSeed.hash.equals(seed.hash)))) {
|
|
// a user frauds with his peer different peer hashes
|
|
yacyCore.log.logFine("connect: rejecting FRAUD (double hashes " + doubleSeed.hash + "/" + seed.hash + " on same port " + seed.getPort() + ") peer " + seed.getName());
|
|
return false;
|
|
}
|
|
|
|
if (seed.get(yacySeed.LASTSEEN, "").length() != 14) {
|
|
// hack for peers that do not have a LastSeen date
|
|
seed.setLastSeenUTC();
|
|
yacyCore.log.logFine("connect: reset wrong date (" + seed.getName() + "/" + seed.hash + ")");
|
|
}
|
|
|
|
// connection time
|
|
final long nowUTC0Time = System.currentTimeMillis(); // is better to have this value in a variable for debugging
|
|
long ctimeUTC0 = seed.getLastSeenUTC();
|
|
|
|
if (ctimeUTC0 > nowUTC0Time) {
|
|
// the peer is future-dated, correct it
|
|
seed.setLastSeenUTC();
|
|
ctimeUTC0 = nowUTC0Time;
|
|
assert (seed.getLastSeenUTC() - ctimeUTC0 < 100);
|
|
}
|
|
if (Math.abs(nowUTC0Time - ctimeUTC0) > 60 * 60 * 24 * 1000) {
|
|
// the new connection is out-of-age, we reject the connection
|
|
yacyCore.log.logFine("connect: rejecting out-dated peer '" + seed.getName() + "' from " + seed.getPublicAddress() + "; nowUTC0=" + nowUTC0Time + ", seedUTC0=" + ctimeUTC0 + ", TimeDiff=" + serverDate.formatInterval(Math.abs(nowUTC0Time - ctimeUTC0)));
|
|
return false;
|
|
}
|
|
|
|
// disconnection time
|
|
long dtimeUTC0;
|
|
final yacySeed disconnectedSeed = seedDB.getDisconnected(seed.hash);
|
|
if (disconnectedSeed == null) {
|
|
dtimeUTC0 = 0; // never disconnected: virtually disconnected maximum time ago
|
|
} else {
|
|
dtimeUTC0 = disconnectedSeed.getLong("dct", 0);
|
|
}
|
|
|
|
if (direct) {
|
|
// remember the moment
|
|
// Date applies the local UTC offset, which is wrong
|
|
// we correct that by subtracting the local offset and adding
|
|
// the remote offset.
|
|
seed.setLastSeenUTC();
|
|
seed.setFlagDirectConnect(true);
|
|
} else {
|
|
// set connection flag
|
|
if (Math.abs(nowUTC0Time - ctimeUTC0) > 120000) seed.setFlagDirectConnect(false); // 2 minutes
|
|
}
|
|
|
|
// update latest version number
|
|
if (seed.getVersion() > yacyVersion.latestRelease) yacyVersion.latestRelease = seed.getVersion();
|
|
|
|
// prepare to update
|
|
if (disconnectedSeed != null) {
|
|
// if the indirect connect aims to announce a peer that we know
|
|
// has been disconnected then we compare the dates:
|
|
// if the new peer has a LastSeen date, and that date is before
|
|
// the disconnection date, then we ignore the new peer
|
|
if (!direct) {
|
|
if (ctimeUTC0 < dtimeUTC0) {
|
|
// the disconnection was later, we reject the connection
|
|
yacyCore.log.logFine("connect: rejecting disconnected peer '" + seed.getName() + "' from " + seed.getPublicAddress());
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// this is a return of a lost peer
|
|
yacyCore.log.logFine("connect: returned KNOWN " + peerType + " peer '" + seed.getName() + "' from " + seed.getPublicAddress());
|
|
this.seedDB.addConnected(seed);
|
|
return true;
|
|
} else {
|
|
final yacySeed connectedSeed = this.seedDB.getConnected(seed.hash);
|
|
if (connectedSeed != null) {
|
|
// the seed is known: this is an update
|
|
try {
|
|
// if the old LastSeen date is later then the other
|
|
// info, then we reject the info
|
|
if ((ctimeUTC0 < (connectedSeed.getLastSeenUTC())) && (!direct)) {
|
|
yacyCore.log.logFine("connect: rejecting old info about peer '" + seed.getName() + "'");
|
|
return false;
|
|
}
|
|
|
|
/*if (connectedSeed.getName() != seed.getName()) {
|
|
// TODO: update seed name lookup cache
|
|
}*/
|
|
} catch (NumberFormatException e) {
|
|
yacyCore.log.logFine("connect: rejecting wrong peer '" + seed.getName() + "' from " + seed.getPublicAddress() + ". Cause: " + e.getMessage());
|
|
return false;
|
|
}
|
|
yacyCore.log.logFine("connect: updated KNOWN " + ((direct) ? "direct " : "") + peerType + " peer '" + seed.getName() + "' from " + seed.getPublicAddress());
|
|
seedDB.addConnected(seed);
|
|
return true;
|
|
} else {
|
|
// the seed is new
|
|
if ((seedDB.mySeedIsDefined()) && (seed.get(yacySeed.IP, "127.0.0.1").equals(this.seedDB.mySeed().get(yacySeed.IP, "127.0.0.1")))) {
|
|
// seed from the same IP as the calling client: can be
|
|
// the case if there runs another one over a NAT
|
|
yacyCore.log.logFine("connect: saved NEW seed (myself IP) " + seed.getPublicAddress());
|
|
} else {
|
|
// completely new seed
|
|
yacyCore.log.logFine("connect: saved NEW " + peerType + " peer '" + seed.getName() + "' from " + seed.getPublicAddress());
|
|
}
|
|
if (peerType.equals(yacySeed.PEERTYPE_SENIOR))
|
|
this.seniorConnects++; // update statistics
|
|
if (peerType.equals(yacySeed.PEERTYPE_PRINCIPAL))
|
|
this.principalConnects++; // update statistics
|
|
this.seedDB.addConnected(seed);
|
|
return true;
|
|
}
|
|
}
|
|
}
|
|
|
|
private final void disconnectPeer(yacySeed seed, String cause) {
|
|
// we do this if we did not get contact with the other peer
|
|
yacyCore.log.logFine("connect: no contact to a " + seed.get(yacySeed.PEERTYPE, yacySeed.PEERTYPE_VIRGIN) + " peer '" + seed.getName() + "' at " + seed.getPublicAddress() + ". Cause: " + cause);
|
|
synchronized (seedDB) {
|
|
if (!seedDB.hasDisconnected(seed.hash)) { disconnects++; }
|
|
seed.put("dct", Long.toString(System.currentTimeMillis()));
|
|
seedDB.addDisconnected(seed); // update info
|
|
}
|
|
}
|
|
|
|
public boolean peerArrival(yacySeed peer, boolean direct) {
|
|
if (peer == null) return false;
|
|
boolean res = connectPeer(peer, direct);
|
|
// perform all actions if peer is effective new
|
|
if (res) {
|
|
Iterator<yacyPeerAction> i = actions.iterator();
|
|
while (i.hasNext()) i.next().processPeerArrival(peer, direct);
|
|
}
|
|
return res;
|
|
}
|
|
|
|
public void peerDeparture(yacySeed peer, String cause) {
|
|
if (peer == null) return;
|
|
disconnectPeer(peer, cause);
|
|
// perform all actions
|
|
Iterator<yacyPeerAction> i = actions.iterator();
|
|
while (i.hasNext()) i.next().processPeerDeparture(peer);
|
|
}
|
|
|
|
public void peerPing(yacySeed peer) {
|
|
if (peer == null) return;
|
|
// this is called only if the peer has junior status
|
|
seedDB.addPotential(peer);
|
|
// perform all actions
|
|
Iterator<yacyPeerAction> i = actions.iterator();
|
|
while (i.hasNext()) i.next().processPeerPing(peer);
|
|
}
|
|
|
|
public void setUserAgent(String IP, String userAgent) {
|
|
userAgents.put(IP, userAgent);
|
|
}
|
|
|
|
public String getUserAgent(String IP) {
|
|
String userAgent = (String) userAgents.get(IP);
|
|
return (userAgent == null) ? "" : userAgent;
|
|
}
|
|
}
|