From 3f0446f14b89e6a502363dc516f82f7dbab330df Mon Sep 17 00:00:00 2001
From: luccioman
Date: Wed, 16 Aug 2017 09:30:33 +0200
Subject: [PATCH] Ensure proper synchronous robots entry retrieval on first check.

Previously, when the robots.txt policy of an unknown host (one not yet cached
in the robots table) was checked for the first time, the result was always
empty in the /getpageinfo_p.xml API and on the /CrawlCheck_p.html page.
Subsequent calls, however, returned the correct information.
---
 htroot/api/getpageinfo_p.java                 |  1 -
 source/net/yacy/crawler/robots/RobotsTxt.java | 15 +++++++++++----
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/htroot/api/getpageinfo_p.java b/htroot/api/getpageinfo_p.java
index 309421a63..8da9cef8a 100644
--- a/htroot/api/getpageinfo_p.java
+++ b/htroot/api/getpageinfo_p.java
@@ -221,7 +221,6 @@ public class getpageinfo_p {
 
         // determine if crawling of the current URL is allowed
         ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
-        sb.robots.ensureExist(theURL, agent, true);
         RobotsTxtEntry robotsEntry = sb.robots.getEntry(theURL, agent);
         prop.put("robots-allowed", robotsEntry == null ? 1 : robotsEntry.isDisallowed(theURL) ? 0 : 1);
         prop.putHTML("robotsInfo", robotsEntry == null ? "" : robotsEntry.getInfo());
diff --git a/source/net/yacy/crawler/robots/RobotsTxt.java b/source/net/yacy/crawler/robots/RobotsTxt.java
index dba0a55f0..b97402114 100644
--- a/source/net/yacy/crawler/robots/RobotsTxt.java
+++ b/source/net/yacy/crawler/robots/RobotsTxt.java
@@ -198,7 +198,7 @@ public class RobotsTxt {
                     if (response == null) {
                         processOldEntry(robotsTxt4Host, robotsURL, robotsTable);
                     } else {
-                        processNewEntry(robotsURL, response, agent.robotIDs);
+                        robotsTxt4Host = processNewEntry(response, agent.robotIDs);
                     }
                 }
             }
@@ -266,7 +266,7 @@ public class RobotsTxt {
                     if (response == null) {
                         processOldEntry(null, robotsURL, robotsTable);
                     } else {
-                        processNewEntry(robotsURL, response, agent.robotIDs);
+                        processNewEntry(response, agent.robotIDs);
                     }
                 }
             }
@@ -314,7 +314,13 @@ public class RobotsTxt {
         }
     }
 
-    private void processNewEntry(DigestURL robotsURL, Response response, final String[] thisAgents) {
+    /**
+     * Process a response to a robots.txt request, create a new robots entry, add it to the robots table, then return it.
+     * @param response the response to the requested robots.txt URL. Must not be null.
+     * @param thisAgents the agent identifier(s) used to request the robots.txt URL
+     * @return the new robots entry
+     */
+    private RobotsTxtEntry processNewEntry(final Response response, final String[] thisAgents) {
         final byte[] robotsTxt = response.getContent();
         //Log.logInfo("RobotsTxt", "robots of " + robotsURL.toNormalform(true, true) + ":\n" + ((robotsTxt == null) ? "null" : UTF8.String(robotsTxt))); // debug TODO remove
         RobotsTxtParser parserResult;
@@ -334,7 +340,7 @@ public class RobotsTxt {
         boolean isBrowserAgent = thisAgents.length == 1 && thisAgents[0].equals("Mozilla");
         if (isBrowserAgent) denyPath.clear();
         final RobotsTxtEntry robotsTxt4Host = new RobotsTxtEntry(
-                robotsURL,
+                response.getRequest().url(),
                 parserResult.allowList(),
                 denyPath,
                 new Date(),
@@ -344,6 +350,7 @@ public class RobotsTxt {
                 parserResult.crawlDelayMillis(),
                 parserResult.agentName());
         addEntry(robotsTxt4Host);
+        return robotsTxt4Host;
     }
 
     private String addEntry(final RobotsTxtEntry entry) {
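
For context, a minimal self-contained sketch of the pattern this patch applies: the helper that stores a freshly parsed robots entry also returns it, so the very first synchronous lookup no longer comes back empty. This is not the YaCy code itself; all class, method, and field names below (RobotsCacheSketch, processNewEntry, getEntry, table) are hypothetical stand-ins for RobotsTxt, RobotsTxtEntry, and the robots table.

// Hedged sketch only; assumes getEntry() follows roughly this cache-miss flow.
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

class RobotsCacheSketch {

    // stands in for the persistent robots table keyed by host
    private final Map<String, String> table = new ConcurrentHashMap<>();

    // Before the fix: the freshly parsed entry was stored but not returned,
    // so the caller's local reference stayed null on the very first check.
    // After the fix: the helper stores the entry and hands it back.
    private String processNewEntry(final String host, final String fetchedRobotsTxt) {
        final String entry = "parsed:" + fetchedRobotsTxt; // placeholder for real parsing
        this.table.put(host, entry);
        return entry;
    }

    // Synchronous lookup: on a cache miss the entry is created, stored and
    // returned immediately instead of only becoming visible on the next call.
    public String getEntry(final String host, final String fetchedRobotsTxt) {
        String entry = this.table.get(host);
        if (entry == null) {
            entry = processNewEntry(host, fetchedRobotsTxt);
        }
        return entry;
    }

    public static void main(String[] args) {
        RobotsCacheSketch robots = new RobotsCacheSketch();
        // the first check on an unknown host now yields a non-null entry
        System.out.println(robots.getEntry("example.org", "User-agent: *\nDisallow: /private/"));
    }
}

Returning the entry from the helper is what lets the caller rely on a single getEntry() call, which is why the extra sb.robots.ensureExist(theURL, agent, true) round trip could be dropped from getpageinfo_p.java.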