mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
fixed missing remove operation in balancer
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4990 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
606b323a2d
commit
05c26d58d9
|
@ -3,7 +3,7 @@ javacSource=1.5
|
|||
javacTarget=1.5
|
||||
|
||||
# Release Configuration
|
||||
releaseVersion=0.592
|
||||
releaseVersion=0.593
|
||||
stdReleaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
|
||||
embReleaseFile=yacy_emb_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
|
||||
proReleaseFile=yacy_pro_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
|
||||
|
|
|
@ -241,6 +241,20 @@ public class Balancer {
|
|||
if (urlHashes.contains(h)) j.remove();
|
||||
}
|
||||
|
||||
// iterate through the domain stacks
|
||||
Iterator<Map.Entry<String, LinkedList<String>>> k = domainStacks.entrySet().iterator();
|
||||
Map.Entry<String, LinkedList<String>> se;
|
||||
LinkedList<String> stack;
|
||||
while (k.hasNext()) {
|
||||
se = k.next();
|
||||
stack = se.getValue();
|
||||
i = stack.iterator();
|
||||
while (i.hasNext()) {
|
||||
if (urlHashes.contains(i.next())) i.remove();
|
||||
}
|
||||
if (stack.size() == 0) k.remove();
|
||||
}
|
||||
|
||||
return removedCounter;
|
||||
}
|
||||
|
||||
|
@ -256,13 +270,15 @@ public class Balancer {
|
|||
|
||||
public int size() {
|
||||
int componentsize = urlFileIndex.size();
|
||||
/*
|
||||
assert componentsize == urlFileStack.size() + urlRAMStack.size() + sizeDomainStacks() :
|
||||
"size wrong in " + stackname +
|
||||
" - urlFileIndex = " + urlFileIndex.size() +
|
||||
", componentsize = " + componentsize +
|
||||
", componentsize = " + urlFileStack.size() + urlRAMStack.size() + sizeDomainStacks() +
|
||||
" = (urlFileStack = " + urlFileStack.size() +
|
||||
", urlRAMStack = " + urlRAMStack.size() +
|
||||
", sizeDomainStacks = " + sizeDomainStacks() + ")";
|
||||
*/
|
||||
return componentsize;
|
||||
}
|
||||
|
||||
|
|
|
@ -56,6 +56,7 @@ import java.util.Date;
|
|||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
import de.anomic.http.HttpClient;
|
||||
import de.anomic.http.JakartaCommonsHttpClient;
|
||||
|
@ -78,6 +79,7 @@ public class RobotsTxt {
|
|||
|
||||
kelondroMap robotsTable;
|
||||
private final File robotsTableFile;
|
||||
private ConcurrentHashMap<String, Long> syncObjects;
|
||||
//private static final HashSet<String> loadedRobots = new HashSet<String>(); // only for debugging
|
||||
|
||||
public RobotsTxt(File robotsTableFile) {
|
||||
|
@ -94,6 +96,7 @@ public class RobotsTxt {
|
|||
blob = new kelondroBLOBTree(robotsTableFile, true, true, 256, 512, '_', kelondroNaturalOrder.naturalOrder, false, false, true);
|
||||
}
|
||||
robotsTable = new kelondroMap(blob, 100);
|
||||
syncObjects = new ConcurrentHashMap<String, Long>();
|
||||
}
|
||||
|
||||
private void resetDatabase() {
|
||||
|
@ -133,86 +136,98 @@ public class RobotsTxt {
|
|||
robotsTxt4Host == null ||
|
||||
robotsTxt4Host.getLoadedDate() == null ||
|
||||
System.currentTimeMillis() - robotsTxt4Host.getLoadedDate().getTime() > 7*24*60*60*1000
|
||||
)) synchronized(this) {
|
||||
// if we have not found any data or the data is older than 7 days, we need to load it from the remote server
|
||||
)) {
|
||||
|
||||
// check the robots table again for all threads that come here because they waited for another one
|
||||
// to complete a download
|
||||
try {
|
||||
HashMap<String, String> record = this.robotsTable.get(urlHostPort);
|
||||
if (record != null) robotsTxt4Host = new Entry(urlHostPort, record);
|
||||
} catch (kelondroException e) {
|
||||
resetDatabase();
|
||||
} catch (IOException e) {
|
||||
resetDatabase();
|
||||
}
|
||||
if (robotsTxt4Host != null &&
|
||||
robotsTxt4Host.getLoadedDate() != null &&
|
||||
System.currentTimeMillis() - robotsTxt4Host.getLoadedDate().getTime() <= 7*24*60*60*1000) {
|
||||
return robotsTxt4Host;
|
||||
// make or get a synchronization object
|
||||
Long syncObj = this.syncObjects.get(urlHostPort);
|
||||
if (syncObj == null) {
|
||||
syncObj = new Long(System.currentTimeMillis());
|
||||
this.syncObjects.put(urlHostPort, syncObj);
|
||||
}
|
||||
|
||||
// generating the proper url to download the robots txt
|
||||
yacyURL robotsURL = null;
|
||||
try {
|
||||
robotsURL = new yacyURL("http://" + urlHostPort + "/robots.txt", null);
|
||||
} catch (MalformedURLException e) {
|
||||
serverLog.logSevere("ROBOTS","Unable to generate robots.txt URL for host:port '" + urlHostPort + "'.");
|
||||
robotsURL = null;
|
||||
}
|
||||
|
||||
Object[] result = null;
|
||||
if (robotsURL != null) {
|
||||
serverLog.logFine("ROBOTS","Trying to download the robots.txt file from URL '" + robotsURL + "'.");
|
||||
// we can now synchronize for each host separately
|
||||
synchronized (syncObj) {
|
||||
|
||||
// if we have not found any data or the data is older than 7 days, we need to load it from the remote server
|
||||
|
||||
// check the robots table again for all threads that come here because they waited for another one
|
||||
// to complete a download
|
||||
try {
|
||||
result = downloadRobotsTxt(robotsURL, 5, robotsTxt4Host);
|
||||
} catch (Exception e) {
|
||||
result = null;
|
||||
HashMap<String, String> record = this.robotsTable.get(urlHostPort);
|
||||
if (record != null) robotsTxt4Host = new Entry(urlHostPort, record);
|
||||
} catch (kelondroException e) {
|
||||
resetDatabase();
|
||||
} catch (IOException e) {
|
||||
resetDatabase();
|
||||
}
|
||||
}
|
||||
/*
|
||||
assert !loadedRobots.contains(robotsURL.toNormalform(false, false)) :
|
||||
"robots-url=" + robotsURL.toString() +
|
||||
", robots=" + ((result == null || result[DOWNLOAD_ROBOTS_TXT] == null) ? "NULL" : new String((byte[]) result[DOWNLOAD_ROBOTS_TXT])) +
|
||||
", robotsTxt4Host=" + ((robotsTxt4Host == null) ? "NULL" : robotsTxt4Host.getLoadedDate().toString());
|
||||
loadedRobots.add(robotsURL.toNormalform(false, false));
|
||||
*/
|
||||
|
||||
if (result == null) {
|
||||
// no robots.txt available, make an entry to prevent that the robots loading is done twice
|
||||
if (robotsTxt4Host == null) {
|
||||
// generate artificial entry
|
||||
robotsTxt4Host = new Entry(
|
||||
urlHostPort,
|
||||
new ArrayList<String>(),
|
||||
new Date(),
|
||||
new Date(),
|
||||
null,
|
||||
null,
|
||||
new Integer(0));
|
||||
if (robotsTxt4Host != null &&
|
||||
robotsTxt4Host.getLoadedDate() != null &&
|
||||
System.currentTimeMillis() - robotsTxt4Host.getLoadedDate().getTime() <= 7*24*60*60*1000) {
|
||||
return robotsTxt4Host;
|
||||
}
|
||||
|
||||
// generating the proper url to download the robots txt
|
||||
yacyURL robotsURL = null;
|
||||
try {
|
||||
robotsURL = new yacyURL("http://" + urlHostPort + "/robots.txt", null);
|
||||
} catch (MalformedURLException e) {
|
||||
serverLog.logSevere("ROBOTS","Unable to generate robots.txt URL for host:port '" + urlHostPort + "'.");
|
||||
robotsURL = null;
|
||||
}
|
||||
|
||||
Object[] result = null;
|
||||
if (robotsURL != null) {
|
||||
serverLog.logFine("ROBOTS","Trying to download the robots.txt file from URL '" + robotsURL + "'.");
|
||||
try {
|
||||
result = downloadRobotsTxt(robotsURL, 5, robotsTxt4Host);
|
||||
} catch (Exception e) {
|
||||
result = null;
|
||||
}
|
||||
}
|
||||
/*
|
||||
assert !loadedRobots.contains(robotsURL.toNormalform(false, false)) :
|
||||
"robots-url=" + robotsURL.toString() +
|
||||
", robots=" + ((result == null || result[DOWNLOAD_ROBOTS_TXT] == null) ? "NULL" : new String((byte[]) result[DOWNLOAD_ROBOTS_TXT])) +
|
||||
", robotsTxt4Host=" + ((robotsTxt4Host == null) ? "NULL" : robotsTxt4Host.getLoadedDate().toString());
|
||||
loadedRobots.add(robotsURL.toNormalform(false, false));
|
||||
*/
|
||||
|
||||
if (result == null) {
|
||||
// no robots.txt available, make an entry to prevent that the robots loading is done twice
|
||||
if (robotsTxt4Host == null) {
|
||||
// generate artificial entry
|
||||
robotsTxt4Host = new Entry(
|
||||
urlHostPort,
|
||||
new ArrayList<String>(),
|
||||
new Date(),
|
||||
new Date(),
|
||||
null,
|
||||
null,
|
||||
new Integer(0));
|
||||
} else {
|
||||
robotsTxt4Host.setLoadedDate(new Date());
|
||||
}
|
||||
|
||||
// store the data into the robots DB
|
||||
addEntry(robotsTxt4Host);
|
||||
} else {
|
||||
robotsTxt4Host.setLoadedDate(new Date());
|
||||
Object[] parserResult = robotsParser.parse((byte[]) result[DOWNLOAD_ROBOTS_TXT]);
|
||||
ArrayList<String> denyPath = (ArrayList<String>) parserResult[0];
|
||||
if (((Boolean) result[DOWNLOAD_ACCESS_RESTRICTED]).booleanValue()) {
|
||||
denyPath = new ArrayList<String>();
|
||||
denyPath.add("/");
|
||||
}
|
||||
|
||||
// store the data into the robots DB
|
||||
robotsTxt4Host = addEntry(
|
||||
urlHostPort,
|
||||
denyPath,
|
||||
new Date(),
|
||||
(Date) result[DOWNLOAD_MODDATE],
|
||||
(String) result[DOWNLOAD_ETAG],
|
||||
(String) parserResult[1],
|
||||
(Integer) parserResult[2]);
|
||||
}
|
||||
|
||||
// store the data into the robots DB
|
||||
addEntry(robotsTxt4Host);
|
||||
} else {
|
||||
Object[] parserResult = robotsParser.parse((byte[]) result[DOWNLOAD_ROBOTS_TXT]);
|
||||
ArrayList<String> denyPath = (ArrayList<String>) parserResult[0];
|
||||
if (((Boolean) result[DOWNLOAD_ACCESS_RESTRICTED]).booleanValue()) {
|
||||
denyPath = new ArrayList<String>();
|
||||
denyPath.add("/");
|
||||
}
|
||||
|
||||
// store the data into the robots DB
|
||||
robotsTxt4Host = addEntry(
|
||||
urlHostPort,
|
||||
denyPath,
|
||||
new Date(),
|
||||
(Date) result[DOWNLOAD_MODDATE],
|
||||
(String) result[DOWNLOAD_ETAG],
|
||||
(String) parserResult[1],
|
||||
(Integer) parserResult[2]);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user