Ensure proper synchronous robots entry retrieval on first check.

Previously, when the robots.txt policy was checked for the first time on an
unknown host (not yet cached in the robots table), the result was always empty
in the /getpageinfo_p.xml API and in the /CrawlCheck_p.html page. Subsequent
calls, however, returned the correct information.
Author: luccioman
Date:   2017-08-16 09:30:33 +02:00
Commit: 3f0446f14b
Parent: 9da75ac76d

2 changed files with 11 additions and 5 deletions
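The underlying issue sits in the synchronous fetch path of RobotsTxt.getEntry(): the freshly parsed robots.txt entry was written to the robots table but never assigned to the local variable that is returned to the caller, so the very first lookup for an uncached host came back empty. Below is a minimal sketch of the corrected flow, simplified and using hypothetical helper names (robotsURLOf, lookupRobotsTable, loadRobotsTxt, isOutdated); it is not the actual YaCy implementation.

// Simplified sketch, not the actual YaCy code: synchronous robots.txt lookup.
public RobotsTxtEntry getEntry(final DigestURL theURL, final ClientIdentification.Agent agent) {
    final DigestURL robotsURL = robotsURLOf(theURL);               // hypothetical helper: http(s)://host/robots.txt
    RobotsTxtEntry robotsTxt4Host = lookupRobotsTable(robotsURL);  // hypothetical cache lookup, may return null
    if (robotsTxt4Host == null || robotsTxt4Host.isOutdated()) {   // unknown host or stale entry (assumed check)
        final Response response = loadRobotsTxt(robotsURL, agent); // hypothetical blocking fetch of robots.txt
        if (response == null) {
            // no robots.txt available: keep/refresh the old (possibly empty) entry
            processOldEntry(robotsTxt4Host, robotsURL, robotsTable); // robotsTable: the underlying store (assumed field)
        } else {
            // The fix: keep the freshly parsed entry as the return value
            // instead of only storing it in the robots table.
            robotsTxt4Host = processNewEntry(response, agent.robotIDs);
        }
    }
    return robotsTxt4Host;
}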

getpageinfo_p.java

@@ -221,7 +221,6 @@ public class getpageinfo_p {
         // determine if crawling of the current URL is allowed
         ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
-        sb.robots.ensureExist(theURL, agent, true);
         RobotsTxtEntry robotsEntry = sb.robots.getEntry(theURL, agent);
         prop.put("robots-allowed", robotsEntry == null ? 1 : robotsEntry.isDisallowed(theURL) ? 0 : 1);
         prop.putHTML("robotsInfo", robotsEntry == null ? "" : robotsEntry.getInfo());

RobotsTxt.java

@@ -198,7 +198,7 @@ public class RobotsTxt {
                 if (response == null) {
                     processOldEntry(robotsTxt4Host, robotsURL, robotsTable);
                 } else {
-                    processNewEntry(robotsURL, response, agent.robotIDs);
+                    robotsTxt4Host = processNewEntry(response, agent.robotIDs);
                 }
             }
         }
@@ -266,7 +266,7 @@ public class RobotsTxt {
                 if (response == null) {
                     processOldEntry(null, robotsURL, robotsTable);
                 } else {
-                    processNewEntry(robotsURL, response, agent.robotIDs);
+                    processNewEntry(response, agent.robotIDs);
                 }
             }
         }
@@ -314,7 +314,13 @@ public class RobotsTxt {
         }
     }

-    private void processNewEntry(DigestURL robotsURL, Response response, final String[] thisAgents) {
+    /**
+     * Process a response to a robots.txt request, create a new robots entry, add it to the robots table, then return it.
+     * @param response the response to the requested robots.txt URL. Must not be null.
+     * @param thisAgents the agent identifier(s) used to request the robots.txt URL
+     * @return the new robots entry
+     */
+    private RobotsTxtEntry processNewEntry(final Response response, final String[] thisAgents) {
         final byte[] robotsTxt = response.getContent();
         //Log.logInfo("RobotsTxt", "robots of " + robotsURL.toNormalform(true, true) + ":\n" + ((robotsTxt == null) ? "null" : UTF8.String(robotsTxt))); // debug TODO remove
         RobotsTxtParser parserResult;
@@ -334,7 +340,7 @@ public class RobotsTxt {
         boolean isBrowserAgent = thisAgents.length == 1 && thisAgents[0].equals("Mozilla");
         if (isBrowserAgent) denyPath.clear();
         final RobotsTxtEntry robotsTxt4Host = new RobotsTxtEntry(
-                robotsURL,
+                response.getRequest().url(),
                 parserResult.allowList(),
                 denyPath,
                 new Date(),
@@ -344,6 +350,7 @@ public class RobotsTxt {
                 parserResult.crawlDelayMillis(),
                 parserResult.agentName());
         addEntry(robotsTxt4Host);
+        return robotsTxt4Host;
     }

     private String addEntry(final RobotsTxtEntry entry) {