Ensure proper synchronous robots entry retrieval on first check.

Previously, when the robots.txt policy was checked for the first time on an
unknown host (not yet cached in the robots table), the result was always empty
in the /getpageinfo_p.xml API and in the /CrawlCheck_p.html page. Subsequent
calls, however, returned the correct information.
Author: luccioman
Date:   2017-08-16 09:30:33 +02:00
Commit: 3f0446f14b
Parent: 9da75ac76d

2 changed files with 11 additions and 5 deletions
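The underlying issue sits in the synchronous fetch path of RobotsTxt.getEntry(): the freshly parsed robots.txt entry was written to the robots table but never assigned to the local variable that is returned to the caller, so the very first lookup for an uncached host came back empty. Below is a minimal sketch of the corrected flow, simplified and using hypothetical helper names (robotsURLOf, lookupRobotsTable, loadRobotsTxt, isOutdated); it is not the actual YaCy implementation.

// Simplified sketch, not the actual YaCy code: synchronous robots.txt lookup.
public RobotsTxtEntry getEntry(final DigestURL theURL, final ClientIdentification.Agent agent) {
    final DigestURL robotsURL = robotsURLOf(theURL);               // hypothetical helper: http(s)://host/robots.txt
    RobotsTxtEntry robotsTxt4Host = lookupRobotsTable(robotsURL);  // hypothetical cache lookup, may return null
    if (robotsTxt4Host == null || robotsTxt4Host.isOutdated()) {   // unknown host or stale entry (assumed check)
        final Response response = loadRobotsTxt(robotsURL, agent); // hypothetical blocking fetch of robots.txt
        if (response == null) {
            // no robots.txt available: keep/refresh the old (possibly empty) entry
            processOldEntry(robotsTxt4Host, robotsURL, robotsTable); // robotsTable: the underlying store (assumed field)
        } else {
            // The fix: keep the freshly parsed entry as the return value
            // instead of only storing it in the robots table.
            robotsTxt4Host = processNewEntry(response, agent.robotIDs);
        }
    }
    return robotsTxt4Host;
}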

getpageinfo_p.java

@@ -221,7 +221,6 @@ public class getpageinfo_p {
         // determine if crawling of the current URL is allowed
         ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
-        sb.robots.ensureExist(theURL, agent, true);
         RobotsTxtEntry robotsEntry = sb.robots.getEntry(theURL, agent);
         prop.put("robots-allowed", robotsEntry == null ? 1 : robotsEntry.isDisallowed(theURL) ? 0 : 1);
         prop.putHTML("robotsInfo", robotsEntry == null ? "" : robotsEntry.getInfo());

RobotsTxt.java

@@ -198,7 +198,7 @@ public class RobotsTxt {
                 if (response == null) {
                     processOldEntry(robotsTxt4Host, robotsURL, robotsTable);
                 } else {
-                    processNewEntry(robotsURL, response, agent.robotIDs);
+                    robotsTxt4Host = processNewEntry(response, agent.robotIDs);
                 }
             }
         }
@@ -266,7 +266,7 @@ public class RobotsTxt {
                 if (response == null) {
                     processOldEntry(null, robotsURL, robotsTable);
                 } else {
-                    processNewEntry(robotsURL, response, agent.robotIDs);
+                    processNewEntry(response, agent.robotIDs);
                 }
             }
         }
@@ -314,7 +314,13 @@ public class RobotsTxt {
         }
     }

-    private void processNewEntry(DigestURL robotsURL, Response response, final String[] thisAgents) {
+    /**
+     * Process a response to a robots.txt request, create a new robots entry, add it to the robots table, then return it.
+     * @param response the response to the requested robots.txt URL. Must not be null.
+     * @param thisAgents the agent identifier(s) used to request the robots.txt URL
+     * @return the new robots entry
+     */
+    private RobotsTxtEntry processNewEntry(final Response response, final String[] thisAgents) {
         final byte[] robotsTxt = response.getContent();
         //Log.logInfo("RobotsTxt", "robots of " + robotsURL.toNormalform(true, true) + ":\n" + ((robotsTxt == null) ? "null" : UTF8.String(robotsTxt))); // debug TODO remove
         RobotsTxtParser parserResult;
@@ -334,7 +340,7 @@ public class RobotsTxt {
         boolean isBrowserAgent = thisAgents.length == 1 && thisAgents[0].equals("Mozilla");
         if (isBrowserAgent) denyPath.clear();
         final RobotsTxtEntry robotsTxt4Host = new RobotsTxtEntry(
-                robotsURL,
+                response.getRequest().url(),
                 parserResult.allowList(),
                 denyPath,
                 new Date(),
@@ -344,6 +350,7 @@ public class RobotsTxt {
                 parserResult.crawlDelayMillis(),
                 parserResult.agentName());
         addEntry(robotsTxt4Host);
+        return robotsTxt4Host;
     }

     private String addEntry(final RobotsTxtEntry entry) {