mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
Use unredirected robots.txt URL when adding an entry to the table.
This commit is contained in:
parent
3f0446f14b
commit
6cec2cdcb5
|
@ -198,7 +198,7 @@ public class RobotsTxt {
|
||||||
if (response == null) {
|
if (response == null) {
|
||||||
processOldEntry(robotsTxt4Host, robotsURL, robotsTable);
|
processOldEntry(robotsTxt4Host, robotsURL, robotsTable);
|
||||||
} else {
|
} else {
|
||||||
robotsTxt4Host = processNewEntry(response, agent.robotIDs);
|
robotsTxt4Host = processNewEntry(robotsURL, response, agent.robotIDs);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -266,7 +266,7 @@ public class RobotsTxt {
|
||||||
if (response == null) {
|
if (response == null) {
|
||||||
processOldEntry(null, robotsURL, robotsTable);
|
processOldEntry(null, robotsURL, robotsTable);
|
||||||
} else {
|
} else {
|
||||||
processNewEntry(response, agent.robotIDs);
|
processNewEntry(robotsURL, response, agent.robotIDs);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -316,11 +316,12 @@ public class RobotsTxt {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Process a response to a robots.txt request, create a new robots entry, add it to the robots table then return it.
|
* Process a response to a robots.txt request, create a new robots entry, add it to the robots table then return it.
|
||||||
|
* @param robotsURL the initial robots.txt URL (before any possible redirection). Must not be null.
|
||||||
* @param response the response to the requested robots.txt URL. Must not be null.
|
* @param response the response to the requested robots.txt URL. Must not be null.
|
||||||
* @param thisAgents the agent identifier(s) used to request the robots.txt URL
|
* @param thisAgents the agent identifier(s) used to request the robots.txt URL
|
||||||
* @return the new robots entry
|
* @return the new robots entry
|
||||||
*/
|
*/
|
||||||
private RobotsTxtEntry processNewEntry(final Response response, final String[] thisAgents) {
|
private RobotsTxtEntry processNewEntry(final DigestURL robotsURL, final Response response, final String[] thisAgents) {
|
||||||
final byte[] robotsTxt = response.getContent();
|
final byte[] robotsTxt = response.getContent();
|
||||||
//Log.logInfo("RobotsTxt", "robots of " + robotsURL.toNormalform(true, true) + ":\n" + ((robotsTxt == null) ? "null" : UTF8.String(robotsTxt))); // debug TODO remove
|
//Log.logInfo("RobotsTxt", "robots of " + robotsURL.toNormalform(true, true) + ":\n" + ((robotsTxt == null) ? "null" : UTF8.String(robotsTxt))); // debug TODO remove
|
||||||
RobotsTxtParser parserResult;
|
RobotsTxtParser parserResult;
|
||||||
|
@ -338,9 +339,14 @@ public class RobotsTxt {
|
||||||
// store the data into the robots DB
|
// store the data into the robots DB
|
||||||
String etag = response.getResponseHeader().containsKey(HeaderFramework.ETAG) ? (response.getResponseHeader().get(HeaderFramework.ETAG)).trim() : null;
|
String etag = response.getResponseHeader().containsKey(HeaderFramework.ETAG) ? (response.getResponseHeader().get(HeaderFramework.ETAG)).trim() : null;
|
||||||
boolean isBrowserAgent = thisAgents.length == 1 && thisAgents[0].equals("Mozilla");
|
boolean isBrowserAgent = thisAgents.length == 1 && thisAgents[0].equals("Mozilla");
|
||||||
if (isBrowserAgent) denyPath.clear();
|
if (isBrowserAgent) {
|
||||||
|
denyPath.clear();
|
||||||
|
}
|
||||||
|
/* The robotsURL may be redirected (from http to https is common),
|
||||||
|
* but we store here the URL before any redirection. If we did not proceed this way, the unredirected URL would
|
||||||
|
* never be found later in the robots table, thus requiring an HTTP load each time. */
|
||||||
final RobotsTxtEntry robotsTxt4Host = new RobotsTxtEntry(
|
final RobotsTxtEntry robotsTxt4Host = new RobotsTxtEntry(
|
||||||
response.getRequest().url(),
|
robotsURL,
|
||||||
parserResult.allowList(),
|
parserResult.allowList(),
|
||||||
denyPath,
|
denyPath,
|
||||||
new Date(),
|
new Date(),
|
||||||
|
|
Loading…
Reference in New Issue
Block a user