mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-21 00:00:13 +02:00
038f956821
appeared after the declaration of robots allow/deny for the crawler because the sitemap parser terminated after the allow/deny rules had been found. Now the parser reads the robots.txt until the end to discover also sitemap rules at the end of the file.
274 lines
11 KiB
Java
274 lines
11 KiB
Java
/*
|
|
robotsParser.java
|
|
-------------------------------------
|
|
part of YACY
|
|
|
|
(C) 2005, 2006 by Alexander Schier
|
|
Martin Thelian
|
|
|
|
last change: $LastChangedDate$LastChangedBy: orbiter $
|
|
Revision: $LastChangedRevision$
|
|
|
|
This program is free software; you can redistribute it and/or modify
|
|
it under the terms of the GNU General public License as published by
|
|
the Free Software Foundation; either version 2 of the License, or
|
|
(at your option) any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General public License for more details.
|
|
|
|
You should have received a copy of the GNU General private License
|
|
along with this program; if not, write to the Free Software
|
|
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
|
|
extended to return structured objects instead of a Object[] and
|
|
extended to return a Allow-List by Michael Christen, 21.07.2008
|
|
extended to allow multiple user agents given by definition and
|
|
returning the used user agent my Michael Christen 3.4.2011
|
|
*/
|
|
|
|
package net.yacy.crawler.robots;
|
|
|
|
import java.io.BufferedReader;
|
|
import java.io.ByteArrayInputStream;
|
|
import java.io.IOException;
|
|
import java.io.InputStreamReader;
|
|
import java.util.ArrayList;
|
|
import java.util.Set;
|
|
import java.util.regex.Pattern;
|
|
|
|
import net.yacy.cora.document.UTF8;
|
|
|
|
/*
|
|
* A class for Parsing robots.txt files.
|
|
* It only parses the Deny Part, yet.
|
|
*
|
|
* Robots RFC
|
|
* http://www.robotstxt.org/wc/norobots-rfc.html
|
|
*
|
|
* TODO:
|
|
* - On the request attempt resulted in temporary failure a robot
|
|
* should defer visits to the site until such time as the resource
|
|
* can be retrieved.
|
|
*
|
|
* - Extended Standard for Robot Exclusion
|
|
* See: http://www.conman.org/people/spc/robots2.html
|
|
*
|
|
* - Robot Exclusion Standard Revisited
|
|
* See: http://www.kollar.com/robots.html
|
|
*/
|
|
|
|
public final class RobotsTxtParser {
|
|
|
|
private static final Pattern patternTab = Pattern.compile("\t");
|
|
|
|
private static final String ROBOTS_USER_AGENT = "User-agent:".toUpperCase();
|
|
private static final String ROBOTS_DISALLOW = "Disallow:".toUpperCase();
|
|
private static final String ROBOTS_ALLOW = "Allow:".toUpperCase();
|
|
private static final String ROBOTS_COMMENT = "#";
|
|
private static final String ROBOTS_SITEMAP = "Sitemap:".toUpperCase();
|
|
private static final String ROBOTS_CRAWL_DELAY = "Crawl-delay:".toUpperCase();
|
|
|
|
private final ArrayList<String> allowList;
|
|
private final ArrayList<String> denyList;
|
|
private String sitemap;
|
|
private long crawlDelayMillis;
|
|
private final Set<String> myNames; // a list of own name lists
|
|
private String agentName; // the name of the agent that was used to return the result
|
|
|
|
protected RobotsTxtParser(final Set<String> myNames) {
|
|
this.allowList = new ArrayList<String>(0);
|
|
this.denyList = new ArrayList<String>(0);
|
|
this.sitemap = "";
|
|
this.crawlDelayMillis = 0;
|
|
this.myNames = myNames;
|
|
this.agentName = null;
|
|
}
|
|
|
|
protected RobotsTxtParser(final Set<String> myNames, final byte[] robotsTxt) {
|
|
this(myNames);
|
|
if (robotsTxt != null && robotsTxt.length != 0) {
|
|
final ByteArrayInputStream bin = new ByteArrayInputStream(robotsTxt);
|
|
final BufferedReader reader = new BufferedReader(new InputStreamReader(bin));
|
|
parse(reader);
|
|
}
|
|
}
|
|
|
|
private void parse(final BufferedReader reader) {
|
|
final ArrayList<String> deny4AllAgents = new ArrayList<String>();
|
|
final ArrayList<String> deny4ThisAgents = new ArrayList<String>();
|
|
final ArrayList<String> allow4AllAgents = new ArrayList<String>();
|
|
final ArrayList<String> allow4ThisAgents = new ArrayList<String>();
|
|
|
|
int pos;
|
|
String line = null, lineUpper = null;
|
|
boolean isRule4AllAgents = false,
|
|
isRule4ThisAgents = false,
|
|
rule4ThisAgentsFound = false,
|
|
inBlock = false;
|
|
|
|
try {
|
|
lineparser: while ((line = reader.readLine()) != null) {
|
|
// replacing all tabs with spaces
|
|
line = patternTab.matcher(line).replaceAll(" ").trim();
|
|
lineUpper = line.toUpperCase();
|
|
|
|
// parse empty line
|
|
if (line.isEmpty()) {
|
|
// we have reached the end of the rule block
|
|
continue lineparser;
|
|
}
|
|
|
|
// parse comment
|
|
if (line.startsWith(ROBOTS_COMMENT)) {
|
|
// we can ignore this. Just a comment line
|
|
continue lineparser;
|
|
}
|
|
|
|
// parse sitemap; if there are several sitemaps then take the first url
|
|
// TODO: support for multiple sitemaps
|
|
if (lineUpper.startsWith(ROBOTS_SITEMAP) && (this.sitemap == null || this.sitemap.isEmpty())) {
|
|
pos = line.indexOf(' ');
|
|
if (pos != -1) {
|
|
this.sitemap = line.substring(pos).trim();
|
|
}
|
|
continue lineparser;
|
|
}
|
|
|
|
// parse user agent
|
|
if (lineUpper.startsWith(ROBOTS_USER_AGENT)) {
|
|
|
|
if (inBlock) {
|
|
// we have detected the start of a new block
|
|
inBlock = false;
|
|
isRule4AllAgents = false;
|
|
isRule4ThisAgents = false;
|
|
this.crawlDelayMillis = 0; // each block has a separate delay
|
|
}
|
|
|
|
// cutting off comments at the line end
|
|
pos = line.indexOf(ROBOTS_COMMENT);
|
|
if (pos != -1) line = line.substring(0,pos).trim();
|
|
|
|
// getting out the robots name
|
|
pos = line.indexOf(' ');
|
|
if (pos != -1) {
|
|
final String userAgent = line.substring(pos).trim();
|
|
isRule4AllAgents |= userAgent.equals("*");
|
|
for (final String agent: this.myNames) {
|
|
if (userAgent.toLowerCase().equals(agent)) {
|
|
this.agentName = agent;
|
|
isRule4ThisAgents = true;
|
|
break;
|
|
}
|
|
}
|
|
if (isRule4ThisAgents) rule4ThisAgentsFound = true;
|
|
}
|
|
continue lineparser;
|
|
}
|
|
|
|
// parse crawl delay
|
|
if (lineUpper.startsWith(ROBOTS_CRAWL_DELAY)) {
|
|
inBlock = true;
|
|
if (isRule4ThisAgents || isRule4AllAgents) {
|
|
pos = line.indexOf(' ');
|
|
if (pos != -1) {
|
|
try {
|
|
// the crawl delay can be a float number and means number of seconds
|
|
this.crawlDelayMillis = (long) (1000.0 * Float.parseFloat(line.substring(pos).trim()));
|
|
} catch (final NumberFormatException e) {
|
|
// invalid crawling delay
|
|
}
|
|
}
|
|
}
|
|
continue lineparser;
|
|
}
|
|
|
|
// parse disallow
|
|
if (lineUpper.startsWith(ROBOTS_DISALLOW) || lineUpper.startsWith(ROBOTS_ALLOW)) {
|
|
inBlock = true;
|
|
final boolean isDisallowRule = lineUpper.startsWith(ROBOTS_DISALLOW);
|
|
|
|
if (isRule4ThisAgents || isRule4AllAgents) {
|
|
// cutting off comments at the line end
|
|
pos = line.indexOf(ROBOTS_COMMENT);
|
|
if (pos != -1) line = line.substring(0,pos).trim();
|
|
|
|
// cut off tailing *
|
|
if (line.endsWith("*")) line = line.substring(0,line.length()-1);
|
|
|
|
// parse the path
|
|
pos = line.indexOf(' ');
|
|
if (pos >= 0) {
|
|
// getting the path
|
|
String path = line.substring(pos).trim();
|
|
|
|
// unencoding all special charsx
|
|
try {
|
|
path = UTF8.decodeURL(path);
|
|
} catch (final Exception e) {
|
|
/*
|
|
* url decoding failed. E.g. because of
|
|
* "Incomplete trailing escape (%) pattern"
|
|
*/
|
|
}
|
|
|
|
// escaping all occurences of ; because this char is used as special char in the Robots DB
|
|
path = RobotsTxt.ROBOTS_DB_PATH_SEPARATOR_MATCHER.matcher(path).replaceAll("%3B");
|
|
|
|
// adding it to the pathlist
|
|
if (isDisallowRule) {
|
|
if (isRule4AllAgents) deny4AllAgents.add(path);
|
|
if (isRule4ThisAgents) deny4ThisAgents.add(path);
|
|
} else {
|
|
if (isRule4AllAgents) allow4AllAgents.add(path);
|
|
if (isRule4ThisAgents) allow4ThisAgents.add(path);
|
|
}
|
|
}
|
|
}
|
|
continue lineparser;
|
|
}
|
|
}
|
|
} catch (final IOException e) {}
|
|
|
|
this.allowList.addAll(rule4ThisAgentsFound ? allow4ThisAgents : allow4AllAgents);
|
|
this.denyList.addAll(rule4ThisAgentsFound ? deny4ThisAgents : deny4AllAgents);
|
|
}
|
|
|
|
/**
|
|
* a crawl delay can be assigned to every agent or for all agents
|
|
* a special case is where the user agent of this yacy peer is given explicitely
|
|
* using the peer name and then if the crawl delay is given as '0' the crawler
|
|
* does not make any no-DOS-forced crawl pause.
|
|
* @return the crawl delay between two crawl access times in milliseconds
|
|
*/
|
|
protected long crawlDelayMillis() {
|
|
return this.crawlDelayMillis;
|
|
}
|
|
|
|
/**
|
|
* the user agent that was applied to get the crawl properties is recorded
|
|
* because it is possible that this robots.txt parser applies to several user agents
|
|
* which may be i.e. 'yacy', 'yacybot', <peer-name>'.yacy' or <peer-hash>'.yacyh'
|
|
* Effects: see also comment to crawlDelayMillis()
|
|
* @return the name of the user agent that was used for the result properties or null if no user agent name was used to identify the agent
|
|
*/
|
|
protected String agentName() {
|
|
return this.agentName;
|
|
}
|
|
|
|
protected String sitemap() {
|
|
return this.sitemap;
|
|
}
|
|
|
|
protected ArrayList<String> allowList() {
|
|
return this.allowList;
|
|
}
|
|
|
|
protected ArrayList<String> denyList() {
|
|
return this.denyList;
|
|
}
|
|
}
|