*) robots.txt: adding support for crawl-delay

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3737 6c8d7289-2bf4-0310-a012-ef5d649a1542
theli 2007-05-18 13:00:42 +00:00
parent 11ac7688d5
commit 9a4375b115
2 changed files with 67 additions and 13 deletions
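
This commit teaches the robots.txt parser to read the de-facto standard Crawl-delay directive. For illustration, a rule block that the new code evaluates might look like this (sample data, not part of the commit):

    User-agent: yacybot
    Disallow: /private/
    Crawl-delay: 5

The directive is matched case-insensitively (constant and input line are both upper-cased for the comparison), and the parsed value is reset at the start of every rule block, so each User-agent section carries its own delay.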


@ -90,6 +90,7 @@ public final class robotsParser{
public static final String ROBOTS_ALLOW = "Allow:".toUpperCase();
public static final String ROBOTS_COMMENT = "#";
public static final String ROBOTS_SITEMAP = "Sitemap:".toUpperCase();
public static final String ROBOTS_CRAWL_DELAY = "Crawl-Delay:".toUpperCase();
/*public robotsParser(URL robotsUrl){
}*/
@ -121,6 +122,7 @@ public final class robotsParser{
int pos;
String line = null, lineUpper = null, sitemap = null;
Integer crawlDelay = null;
boolean isRuleBlock4AllAgents = false,
isRuleBlock4YaCyAgent = false,
rule4YaCyFound = false,
@ -149,6 +151,7 @@ public final class robotsParser{
inBlock = false;
isRuleBlock4AllAgents = false;
isRuleBlock4YaCyAgent = false;
crawlDelay = null; // each block has a separate delay
}
// cutting off comments at the line end
@ -166,6 +169,15 @@ public final class robotsParser{
isRuleBlock4YaCyAgent |= userAgent.toLowerCase().indexOf("yacy") >=0;
if (isRuleBlock4YaCyAgent) rule4YaCyFound = true;
}
} else if (lineUpper.startsWith(ROBOTS_CRAWL_DELAY)) {
pos = line.indexOf(" ");
if (pos != -1) {
try {
crawlDelay = Integer.valueOf(line.substring(pos).trim());
} catch (NumberFormatException e) {
// invalid crawl-delay value; crawlDelay stays null
}
}
} else if (lineUpper.startsWith(ROBOTS_DISALLOW) ||
lineUpper.startsWith(ROBOTS_ALLOW)) {
inBlock = true;
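
Traced on a concrete input line, the new branch behaves as follows (an illustrative walk-through, not part of the diff):

    // line      = "Crawl-delay: 10"
    // lineUpper = "CRAWL-DELAY: 10"        -> matches ROBOTS_CRAWL_DELAY
    // pos       = line.indexOf(" ") = 12   -> the first blank, right after the colon
    // line.substring(pos).trim() = "10"    -> Integer.valueOf(...) yields 10
    // a non-numeric value lands in the NumberFormatException catch
    // and leaves crawlDelay at null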
@ -211,7 +223,7 @@ public final class robotsParser{
}
ArrayList denyList = (rule4YaCyFound)?deny4YaCyAgent:deny4AllAgents;
return new Object[]{denyList,sitemap};
return new Object[]{denyList,sitemap,crawlDelay};
}
private static final int getPort(URL theURL) {
@ -258,6 +270,27 @@ public final class robotsParser{
return sitemapURL;
}
public static Integer getCrawlDelay(URL theURL) {
if (theURL == null) throw new IllegalArgumentException();
Integer crawlDelay = null;
// generating the hostname:port string needed to do a DB lookup
String urlHostPort = getHostPort(theURL);
plasmaCrawlRobotsTxt.Entry robotsTxt4Host = null;
synchronized(urlHostPort) {
// doing a DB lookup to determine if the robots data is already available
robotsTxt4Host = plasmaSwitchboard.robots.getEntry(urlHostPort);
}
if (robotsTxt4Host == null) return null;
try {
crawlDelay = robotsTxt4Host.getCrawlDelay();
} catch (NumberFormatException e) {/* ignore this */}
return crawlDelay;
}
public static boolean isDisallowed(URL nexturl) {
if (nexturl == null) throw new IllegalArgumentException();
@ -309,6 +342,7 @@ public final class robotsParser{
if ((robotsTxt4Host==null)||((robotsTxt4Host!=null)&&(result!=null))) {
ArrayList denyPath = null;
String sitemap = null;
Integer crawlDelay = null;
if (accessCompletelyRestricted) {
denyPath = new ArrayList();
denyPath.add("/");
@ -318,13 +352,14 @@ public final class robotsParser{
Object[] parserResult = robotsParser.parse(robotsTxt);
denyPath = (ArrayList) parserResult[0];
sitemap = (String) parserResult[1];
crawlDelay = (Integer) parserResult[2];
} catch (IOException e) {
serverLog.logSevere("ROBOTS","Unable to parse the robots.txt file from URL '" + robotsURL + "'.");
}
}
// storing the data into the robots DB
robotsTxt4Host = plasmaSwitchboard.robots.addEntry(urlHostPort,denyPath,new Date(),modDate,eTag,sitemap);
robotsTxt4Host = plasmaSwitchboard.robots.addEntry(urlHostPort,denyPath,new Date(),modDate,eTag,sitemap,crawlDelay);
}
}
}
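
On the fetch side, the new lookup slots in next to the existing isDisallowed() check. A minimal sketch of such a caller, assuming YaCy's URL class offers a plain String constructor and that the delay is meant in seconds (the fetch logic itself is hypothetical, not part of this commit):

    URL nextURL = new URL("http://www.example.org/page.html"); // assumption: String constructor
    if (!robotsParser.isDisallowed(nextURL)) {
        Integer crawlDelay = robotsParser.getCrawlDelay(nextURL);
        if (crawlDelay != null) {
            // Crawl-delay is conventionally given in seconds; wait before fetching
            try {
                Thread.sleep(crawlDelay.intValue() * 1000L);
            } catch (InterruptedException e) { /* interrupted while waiting */ }
        }
        // ... fetch nextURL ...
    }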


@ -84,16 +84,16 @@ public class plasmaCrawlRobotsTxt {
}
public void close() {
robotsTable.close();
this.robotsTable.close();
}
public int size() {
return robotsTable.size();
return this.robotsTable.size();
}
public void removeEntry(String hostName) {
try {
robotsTable.remove(hostName.toLowerCase());
this.robotsTable.remove(hostName.toLowerCase());
} catch (IOException e) {
} catch (kelondroException e) {
@ -103,7 +103,7 @@ public class plasmaCrawlRobotsTxt {
public Entry getEntry(String hostName) {
try {
Map record = robotsTable.getMap(hostName);
Map record = this.robotsTable.getMap(hostName);
if (record == null) return null;
return new Entry(hostName, record);
} catch (kelondroException e) {
@ -112,8 +112,16 @@ public class plasmaCrawlRobotsTxt {
}
}
public Entry addEntry(String hostName, ArrayList disallowPathList, Date loadedDate, Date modDate, String eTag, String sitemap) {
Entry entry = new Entry(hostName,disallowPathList,loadedDate,modDate,eTag,sitemap);
public Entry addEntry(
String hostName,
ArrayList disallowPathList,
Date loadedDate,
Date modDate,
String eTag,
String sitemap,
Integer crawlDelay
) {
Entry entry = new Entry(hostName,disallowPathList,loadedDate,modDate,eTag,sitemap,crawlDelay);
addEntry(entry);
return entry;
}
@ -121,7 +129,7 @@ public class plasmaCrawlRobotsTxt {
public String addEntry(Entry entry) {
// writes a new page and returns key
try {
robotsTable.set(entry.hostName,entry.mem);
this.robotsTable.set(entry.hostName,entry.mem);
return entry.hostName;
} catch (IOException e) {
return null;
@ -134,11 +142,12 @@ public class plasmaCrawlRobotsTxt {
public static final String MOD_DATE = "modDate";
public static final String ETAG = "etag";
public static final String SITEMAP = "sitemap";
public static final String CRAWL_DELAY = "crawlDelay";
// this is a simple record structure that holds all properties of a single robots.txt entry
private Map mem;
Map mem;
private LinkedList disallowPathList;
private String hostName;
String hostName;
public Entry(String hostName, Map mem) {
this.hostName = hostName.toLowerCase();
@ -164,8 +173,10 @@ public class plasmaCrawlRobotsTxt {
Date loadedDate,
Date modDate,
String eTag,
String sitemap) {
if ((hostName == null) || (hostName.length() == 0)) throw new IllegalArgumentException();
String sitemap,
Integer crawlDelay
) {
if ((hostName == null) || (hostName.length() == 0)) throw new IllegalArgumentException("The hostname is missing");
this.hostName = hostName.trim().toLowerCase();
this.disallowPathList = new LinkedList();
@ -175,6 +186,7 @@ public class plasmaCrawlRobotsTxt {
if (modDate != null) this.mem.put(MOD_DATE,Long.toString(modDate.getTime()));
if (eTag != null) this.mem.put(ETAG,eTag);
if (sitemap != null) this.mem.put(SITEMAP,sitemap);
if (crawlDelay != null) this.mem.put(CRAWL_DELAY,crawlDelay.toString());
if ((disallowPathList != null)&&(disallowPathList.size()>0)) {
this.disallowPathList.addAll(disallowPathList);
@ -231,6 +243,13 @@ public class plasmaCrawlRobotsTxt {
return null;
}
public Integer getCrawlDelay() {
if (this.mem.containsKey(CRAWL_DELAY)) {
return Integer.valueOf((String)this.mem.get(CRAWL_DELAY));
}
return null;
}
public boolean isDisallowed(String path) {
if ((this.mem == null) || (this.disallowPathList.size() == 0)) return false;
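
The delay survives the round trip through the robots DB as a string: the extended Entry constructor stores crawlDelay.toString() under the CRAWL_DELAY key, and getCrawlDelay() parses it back with Integer.valueOf. A minimal sketch of that round trip, assuming Entry is instantiable from the caller's context (all values are made up):

    ArrayList disallow = new ArrayList();
    disallow.add("/private/");

    plasmaCrawlRobotsTxt.Entry entry = new plasmaCrawlRobotsTxt.Entry(
            "www.example.org:80", // hostName; the constructor trims and lower-cases it
            disallow,             // disallowPathList
            new Date(),           // loadedDate
            null,                 // modDate (optional)
            null,                 // eTag (optional)
            null,                 // sitemap (optional)
            new Integer(5));      // crawlDelay; stored in mem as the string "5"

    Integer delay = entry.getCrawlDelay(); // Integer.valueOf("5") -> 5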