From 3f0446f14b89e6a502363dc516f82f7dbab330df Mon Sep 17 00:00:00 2001
From: luccioman
Date: Wed, 16 Aug 2017 09:30:33 +0200
Subject: [PATCH] Ensure proper synchronous robots entry retrieval on first check.

Previously, when the robots.txt policy of an unknown host (one not yet cached
in the robots table) was checked for the first time, the result was always
empty in the /getpageinfo_p.xml API and on the /CrawlCheck_p.html page.
Subsequent calls, however, returned the correct information.
---
 htroot/api/getpageinfo_p.java                 |  1 -
 source/net/yacy/crawler/robots/RobotsTxt.java | 15 +++++++++++----
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/htroot/api/getpageinfo_p.java b/htroot/api/getpageinfo_p.java
index 309421a63..8da9cef8a 100644
--- a/htroot/api/getpageinfo_p.java
+++ b/htroot/api/getpageinfo_p.java
@@ -221,7 +221,6 @@ public class getpageinfo_p {
 
         // determine if crawling of the current URL is allowed
         ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
-        sb.robots.ensureExist(theURL, agent, true);
         RobotsTxtEntry robotsEntry = sb.robots.getEntry(theURL, agent);
         prop.put("robots-allowed", robotsEntry == null ? 1 : robotsEntry.isDisallowed(theURL) ? 0 : 1);
         prop.putHTML("robotsInfo", robotsEntry == null ? "" : robotsEntry.getInfo());
diff --git a/source/net/yacy/crawler/robots/RobotsTxt.java b/source/net/yacy/crawler/robots/RobotsTxt.java
index dba0a55f0..b97402114 100644
--- a/source/net/yacy/crawler/robots/RobotsTxt.java
+++ b/source/net/yacy/crawler/robots/RobotsTxt.java
@@ -198,7 +198,7 @@ public class RobotsTxt {
                     if (response == null) {
                         processOldEntry(robotsTxt4Host, robotsURL, robotsTable);
                     } else {
-                        processNewEntry(robotsURL, response, agent.robotIDs);
+                        robotsTxt4Host = processNewEntry(response, agent.robotIDs);
                     }
                 }
             }
@@ -266,7 +266,7 @@ public class RobotsTxt {
                     if (response == null) {
                         processOldEntry(null, robotsURL, robotsTable);
                     } else {
-                        processNewEntry(robotsURL, response, agent.robotIDs);
+                        processNewEntry(response, agent.robotIDs);
                     }
                 }
             }
@@ -314,7 +314,13 @@ public class RobotsTxt {
         }
     }
 
-    private void processNewEntry(DigestURL robotsURL, Response response, final String[] thisAgents) {
+    /**
+     * Process a response to a robots.txt request, create a new robots entry, add it to the robots table, then return it.
+     * @param response the response to the requested robots.txt URL. Must not be null.
+     * @param thisAgents the agent identifier(s) used to request the robots.txt URL
+     * @return the new robots entry
+     */
+    private RobotsTxtEntry processNewEntry(final Response response, final String[] thisAgents) {
         final byte[] robotsTxt = response.getContent();
         //Log.logInfo("RobotsTxt", "robots of " + robotsURL.toNormalform(true, true) + ":\n" + ((robotsTxt == null) ? "null" : UTF8.String(robotsTxt))); // debug TODO remove
         RobotsTxtParser parserResult;
@@ -334,7 +340,7 @@ public class RobotsTxt {
         boolean isBrowserAgent = thisAgents.length == 1 && thisAgents[0].equals("Mozilla");
         if (isBrowserAgent) denyPath.clear();
         final RobotsTxtEntry robotsTxt4Host = new RobotsTxtEntry(
-                robotsURL,
+                response.getRequest().url(),
                 parserResult.allowList(),
                 denyPath,
                 new Date(),
@@ -344,6 +350,7 @@ public class RobotsTxt {
                 parserResult.crawlDelayMillis(),
                 parserResult.agentName());
         addEntry(robotsTxt4Host);
+        return robotsTxt4Host;
     }
 
     private String addEntry(final RobotsTxtEntry entry) {
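
For context, a minimal self-contained sketch of the pattern this patch applies: the helper that stores a freshly parsed robots entry also returns it, so the very first synchronous lookup no longer comes back empty. This is not the YaCy code itself; all class, method, and field names below (RobotsCacheSketch, processNewEntry, getEntry, table) are hypothetical stand-ins for RobotsTxt, RobotsTxtEntry, and the robots table.

// Hedged sketch only; assumes getEntry() follows roughly this cache-miss flow.
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

class RobotsCacheSketch {

    // stands in for the persistent robots table keyed by host
    private final Map<String, String> table = new ConcurrentHashMap<>();

    // Before the fix: the freshly parsed entry was stored but not returned,
    // so the caller's local reference stayed null on the very first check.
    // After the fix: the helper stores the entry and hands it back.
    private String processNewEntry(final String host, final String fetchedRobotsTxt) {
        final String entry = "parsed:" + fetchedRobotsTxt; // placeholder for real parsing
        this.table.put(host, entry);
        return entry;
    }

    // Synchronous lookup: on a cache miss the entry is created, stored and
    // returned immediately instead of only becoming visible on the next call.
    public String getEntry(final String host, final String fetchedRobotsTxt) {
        String entry = this.table.get(host);
        if (entry == null) {
            entry = processNewEntry(host, fetchedRobotsTxt);
        }
        return entry;
    }

    public static void main(String[] args) {
        RobotsCacheSketch robots = new RobotsCacheSketch();
        // the first check on an unknown host now yields a non-null entry
        System.out.println(robots.getEntry("example.org", "User-agent: *\nDisallow: /private/"));
    }
}

Returning the entry from the helper is what lets the caller rely on a single getEntry() call, which is why the extra sb.robots.ensureExist(theURL, agent, true) round trip could be dropped from getpageinfo_p.java.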