Mirror of https://github.com/yacy/yacy_search_server.git (synced 2024-09-19 00:01:41 +02:00)
Ensure proper synchronous robots entry retrieval on first check.
Previously, when checking the robots.txt policy for the first time on an unknown host (one not yet cached in the robots table), the result was always empty in the /getpageinfo_p.xml API and on the /CrawlCheck_p.html page. Subsequent calls, however, returned the correct information.
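
The corrected call pattern can be illustrated with a short sketch (hypothetical caller code mirroring the getpageinfo_p hunk below; the assumption that the removed ensureExist(theURL, agent, true) call only scheduled a concurrent robots.txt load, rather than waiting for it, is inferred from the commit title):

    // Hypothetical caller, following the /getpageinfo_p.xml code path.
    ClientIdentification.Agent agent = ClientIdentification.getAgent(
            ClientIdentification.yacyInternetCrawlerAgentName);
    // Before: sb.robots.ensureExist(theURL, agent, true) only queued the fetch,
    // so this lookup found an empty robots table on the first check.
    // After: getEntry() fetches and parses robots.txt synchronously on a
    // cache miss, so even the first call returns the actual policy.
    RobotsTxtEntry robotsEntry = sb.robots.getEntry(theURL, agent);
    boolean crawlingAllowed = robotsEntry == null || !robotsEntry.isDisallowed(theURL);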
This commit is contained in:
parent 9da75ac76d
commit 3f0446f14b
@@ -221,7 +221,6 @@ public class getpageinfo_p {
         // determine if crawling of the current URL is allowed
         ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
-        sb.robots.ensureExist(theURL, agent, true);
         RobotsTxtEntry robotsEntry = sb.robots.getEntry(theURL, agent);
         prop.put("robots-allowed", robotsEntry == null ? 1 : robotsEntry.isDisallowed(theURL) ? 0 : 1);
         prop.putHTML("robotsInfo", robotsEntry == null ? "" : robotsEntry.getInfo());
@@ -198,7 +198,7 @@ public class RobotsTxt {
             if (response == null) {
                 processOldEntry(robotsTxt4Host, robotsURL, robotsTable);
             } else {
-                processNewEntry(robotsURL, response, agent.robotIDs);
+                robotsTxt4Host = processNewEntry(response, agent.robotIDs);
             }
         }
     }
@@ -266,7 +266,7 @@ public class RobotsTxt {
             if (response == null) {
                 processOldEntry(null, robotsURL, robotsTable);
             } else {
-                processNewEntry(robotsURL, response, agent.robotIDs);
+                processNewEntry(response, agent.robotIDs);
             }
         }
     }
@@ -314,7 +314,13 @@ public class RobotsTxt {
         }
     }

-    private void processNewEntry(DigestURL robotsURL, Response response, final String[] thisAgents) {
+    /**
+     * Process a response to a robots.txt request, create a new robots entry, add it to the robots table then return it.
+     * @param response the response to the requested robots.txt URL. Must not be null.
+     * @param thisAgents the agent identifier(s) used to request the robots.txt URL
+     * @return the new robots entry
+     */
+    private RobotsTxtEntry processNewEntry(final Response response, final String[] thisAgents) {
         final byte[] robotsTxt = response.getContent();
         //Log.logInfo("RobotsTxt", "robots of " + robotsURL.toNormalform(true, true) + ":\n" + ((robotsTxt == null) ? "null" : UTF8.String(robotsTxt))); // debug TODO remove
         RobotsTxtParser parserResult;
@@ -334,7 +340,7 @@ public class RobotsTxt {
         boolean isBrowserAgent = thisAgents.length == 1 && thisAgents[0].equals("Mozilla");
         if (isBrowserAgent) denyPath.clear();
         final RobotsTxtEntry robotsTxt4Host = new RobotsTxtEntry(
-                robotsURL,
+                response.getRequest().url(),
                 parserResult.allowList(),
                 denyPath,
                 new Date(),
@@ -344,6 +350,7 @@ public class RobotsTxt {
                 parserResult.crawlDelayMillis(),
                 parserResult.agentName());
         addEntry(robotsTxt4Host);
+        return robotsTxt4Host;
     }

     private String addEntry(final RobotsTxtEntry entry) {
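
Read together, the RobotsTxt.java hunks make the fetch path hand the freshly parsed entry straight back: processNewEntry now derives the robots.txt URL from the response, stores the new RobotsTxtEntry, and returns it. A condensed sketch of the resulting cache-miss path (simplified; it assumes, as the assignment in the -198,7 hunk implies, that the enclosing getEntry method ultimately returns robotsTxt4Host, and it omits that method's locking and surrounding control flow):

    // Simplified cache-miss path in RobotsTxt.getEntry after this commit.
    if (response == null) {
        // robots.txt could not be fetched: fall back to the old table entry
        processOldEntry(robotsTxt4Host, robotsURL, robotsTable);
    } else {
        // parse the fetched robots.txt, store it, and use it immediately
        robotsTxt4Host = processNewEntry(response, agent.robotIDs);
    }
    return robotsTxt4Host; // first-time callers now receive a populated entry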