From 9a4375b1157af31b626570a8f89bc8b2264986af Mon Sep 17 00:00:00 2001
From: theli
Date: Fri, 18 May 2007 13:00:42 +0000
Subject: [PATCH] *) robots.txt: adding support for crawl-delay

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3737 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 source/de/anomic/data/robotsParser.java        | 39 +++++++++++++++++-
 .../anomic/plasma/plasmaCrawlRobotsTxt.java    | 41 ++++++++++++++-----
 2 files changed, 67 insertions(+), 13 deletions(-)

diff --git a/source/de/anomic/data/robotsParser.java b/source/de/anomic/data/robotsParser.java
index 185e96254..dc46d510a 100644
--- a/source/de/anomic/data/robotsParser.java
+++ b/source/de/anomic/data/robotsParser.java
@@ -90,6 +90,7 @@ public final class robotsParser{
     public static final String ROBOTS_ALLOW = "Allow:".toUpperCase();
     public static final String ROBOTS_COMMENT = "#";
     public static final String ROBOTS_SITEMAP = "Sitemap:".toUpperCase();
+    public static final String ROBOTS_CRAWL_DELAY = "Crawl-Delay:".toUpperCase();
 
     /*public robotsParser(URL robotsUrl){
     }*/
@@ -121,6 +122,7 @@ public final class robotsParser{
 
         int pos;
         String line = null, lineUpper = null, sitemap = null;
+        Integer crawlDelay = null;
         boolean isRuleBlock4AllAgents = false,
                 isRuleBlock4YaCyAgent = false,
                 rule4YaCyFound = false,
@@ -149,6 +151,7 @@ public final class robotsParser{
                 inBlock = false;
                 isRuleBlock4AllAgents = false;
                 isRuleBlock4YaCyAgent = false;
+                crawlDelay = null; // each block has a separate delay
             }
 
             // cutting off comments at the line end
@@ -166,6 +169,15 @@ public final class robotsParser{
                     isRuleBlock4YaCyAgent |= userAgent.toLowerCase().indexOf("yacy") >=0;
                     if (isRuleBlock4YaCyAgent) rule4YaCyFound = true;
                 }
+            } else if (lineUpper.startsWith(ROBOTS_CRAWL_DELAY)) {
+                pos = line.indexOf(" ");
+                if (pos != -1) {
+                    try {
+                        crawlDelay = Integer.valueOf(line.substring(pos).trim());
+                    } catch (NumberFormatException e) {
+                        // invalid crawling delay
+                    }
+                }
             } else if (lineUpper.startsWith(ROBOTS_DISALLOW) ||
                        lineUpper.startsWith(ROBOTS_ALLOW)) {
                 inBlock = true;
@@ -211,7 +223,7 @@ public final class robotsParser{
         }
 
         ArrayList denyList = (rule4YaCyFound)?deny4YaCyAgent:deny4AllAgents;
-        return new Object[]{denyList,sitemap};
+        return new Object[]{denyList,sitemap,crawlDelay};
     }
 
     private static final int getPort(URL theURL) {
@@ -258,6 +270,27 @@ public final class robotsParser{
         return sitemapURL;
     }
 
+    public static Integer getCrawlDelay(URL theURL) {
+        if (theURL == null) throw new IllegalArgumentException();
+        Integer crawlDelay = null;
+
+        // generating the hostname:port string needed to do a DB lookup
+        String urlHostPort = getHostPort(theURL);
+
+        plasmaCrawlRobotsTxt.Entry robotsTxt4Host = null;
+        synchronized(urlHostPort) {
+            // doing a DB lookup to determine if the robots data is already available
+            robotsTxt4Host = plasmaSwitchboard.robots.getEntry(urlHostPort);
+        }
+        if (robotsTxt4Host == null) return null;
+
+        try {
+            crawlDelay = robotsTxt4Host.getCrawlDelay();
+        } catch (NumberFormatException e) {/* ignore this */}
+
+        return crawlDelay;
+    }
+
     public static boolean isDisallowed(URL nexturl) {
         if (nexturl == null) throw new IllegalArgumentException();
 
@@ -309,6 +342,7 @@ public final class robotsParser{
         if ((robotsTxt4Host==null)||((robotsTxt4Host!=null)&&(result!=null))) {
             ArrayList denyPath = null;
             String sitemap = null;
+            Integer crawlDelay = null;
             if (accessCompletelyRestricted) {
                 denyPath = new ArrayList();
                 denyPath.add("/");
@@ -318,13 +352,14 @@ public final class robotsParser{
                     Object[] parserResult = robotsParser.parse(robotsTxt);
                     denyPath = (ArrayList) parserResult[0];
                     sitemap = (String) parserResult[1];
+                    crawlDelay = (Integer) parserResult[2];
                 } catch (IOException e) {
                     serverLog.logSevere("ROBOTS","Unable to parse the robots.txt file from URL '" + robotsURL + "'.");
                 }
             }
 
             // storing the data into the robots DB
-            robotsTxt4Host = plasmaSwitchboard.robots.addEntry(urlHostPort,denyPath,new Date(),modDate,eTag,sitemap);
+            robotsTxt4Host = plasmaSwitchboard.robots.addEntry(urlHostPort,denyPath,new Date(),modDate,eTag,sitemap,crawlDelay);
         }
     }
 }
diff --git a/source/de/anomic/plasma/plasmaCrawlRobotsTxt.java b/source/de/anomic/plasma/plasmaCrawlRobotsTxt.java
index d2ca9d538..14a873dba 100644
--- a/source/de/anomic/plasma/plasmaCrawlRobotsTxt.java
+++ b/source/de/anomic/plasma/plasmaCrawlRobotsTxt.java
@@ -84,16 +84,16 @@ public class plasmaCrawlRobotsTxt {
     }
 
     public void close() {
-        robotsTable.close();
+        this.robotsTable.close();
     }
 
     public int size() {
-        return robotsTable.size();
+        return this.robotsTable.size();
    }
 
     public void removeEntry(String hostName) {
         try {
-            robotsTable.remove(hostName.toLowerCase());
+            this.robotsTable.remove(hostName.toLowerCase());
         } catch (IOException e) {
         } catch (kelondroException e) {
@@ -103,7 +103,7 @@ public class plasmaCrawlRobotsTxt {
 
     public Entry getEntry(String hostName) {
         try {
-            Map record = robotsTable.getMap(hostName);
+            Map record = this.robotsTable.getMap(hostName);
             if (record == null) return null;
             return new Entry(hostName, record);
         } catch (kelondroException e) {
@@ -112,8 +112,16 @@ public class plasmaCrawlRobotsTxt {
         }
     }
 
-    public Entry addEntry(String hostName, ArrayList disallowPathList, Date loadedDate, Date modDate, String eTag, String sitemap) {
-        Entry entry = new Entry(hostName,disallowPathList,loadedDate,modDate,eTag,sitemap);
+    public Entry addEntry(
+            String hostName,
+            ArrayList disallowPathList,
+            Date loadedDate,
+            Date modDate,
+            String eTag,
+            String sitemap,
+            Integer crawlDelay
+    ) {
+        Entry entry = new Entry(hostName,disallowPathList,loadedDate,modDate,eTag,sitemap,crawlDelay);
         addEntry(entry);
         return entry;
     }
@@ -121,7 +129,7 @@ public class plasmaCrawlRobotsTxt {
     public String addEntry(Entry entry) {
         // writes a new page and returns key
         try {
-            robotsTable.set(entry.hostName,entry.mem);
+            this.robotsTable.set(entry.hostName,entry.mem);
             return entry.hostName;
         } catch (IOException e) {
             return null;
@@ -134,11 +142,12 @@ public class plasmaCrawlRobotsTxt {
         public static final String MOD_DATE = "modDate";
         public static final String ETAG = "etag";
         public static final String SITEMAP = "sitemap";
+        public static final String CRAWL_DELAY = "crawlDelay";
 
         // this is a simple record structure that hold all properties of a single crawl start
-        private Map mem;
+        Map mem;
         private LinkedList disallowPathList;
-        private String hostName;
+        String hostName;
 
         public Entry(String hostName, Map mem) {
             this.hostName = hostName.toLowerCase();
@@ -164,8 +173,10 @@ public class plasmaCrawlRobotsTxt {
                 Date loadedDate,
                 Date modDate,
                 String eTag,
-                String sitemap) {
-            if ((hostName == null) || (hostName.length() == 0)) throw new IllegalArgumentException();
+                String sitemap,
+                Integer crawlDelay
+        ) {
+            if ((hostName == null) || (hostName.length() == 0)) throw new IllegalArgumentException("The hostname is missing");
             this.hostName = hostName.trim().toLowerCase();
             this.disallowPathList = new LinkedList();
 
@@ -175,6 +186,7 @@ public class plasmaCrawlRobotsTxt {
             if (modDate != null) this.mem.put(MOD_DATE,Long.toString(modDate.getTime()));
             if (eTag != null) this.mem.put(ETAG,eTag);
             if (sitemap != null) this.mem.put(SITEMAP,sitemap);
+            if (crawlDelay != null) this.mem.put(CRAWL_DELAY,crawlDelay.toString());
 
             if ((disallowPathList != null)&&(disallowPathList.size()>0)) {
                 this.disallowPathList.addAll(disallowPathList);
@@ -231,6 +243,13 @@ public class plasmaCrawlRobotsTxt {
             return null;
         }
 
+        public Integer getCrawlDelay() {
+            if (this.mem.containsKey(CRAWL_DELAY)) {
+                return Integer.valueOf((String)this.mem.get(CRAWL_DELAY));
+            }
+            return null;
+        }
+
         public boolean isDisallowed(String path) {
             if ((this.mem == null) || (this.disallowPathList.size() == 0)) return false;
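
Editor's note (not part of the patch): the commit parses and stores Crawl-Delay per host but leaves enforcement to the caller. Below is a minimal sketch of how a fetch loop might honor the delay through the new public robotsParser.getCrawlDelay(URL). The CrawlDelayThrottle class, its lastAccess map, and the simple host:port key are illustrative assumptions, not YaCy code; the URL type is assumed to be the same class robotsParser uses, with the usual getHost()/getPort() accessors, and YaCy's own getHostPort() additionally normalizes default ports. Raw collections and explicit boxing match the patch's pre-generics style.

import java.util.HashMap;
import java.util.Map;

// Hypothetical helper: blocks just long enough to honor a host's Crawl-Delay.
public class CrawlDelayThrottle {
    // remembers when each host:port was last contacted
    private final Map lastAccess = new HashMap();

    public synchronized void delayFor(URL url) throws InterruptedException {
        // null means robots.txt defines no Crawl-Delay for our user-agent block
        Integer crawlDelay = robotsParser.getCrawlDelay(url);
        if (crawlDelay == null) return;

        String hostPort = url.getHost() + ":" + url.getPort(); // simplified key
        Long last = (Long) this.lastAccess.get(hostPort);
        if (last != null) {
            // Crawl-Delay is given in seconds; wait out whatever is left of it
            long waitMs = crawlDelay.intValue() * 1000L - (System.currentTimeMillis() - last.longValue());
            if (waitMs > 0) Thread.sleep(waitMs);
        }
        this.lastAccess.put(hostPort, new Long(System.currentTimeMillis()));
    }
}

A crawler thread would call delayFor(nextUrl) right before issuing the HTTP request; since getCrawlDelay() returns null when no robots entry or no delay exists, hosts without a Crawl-Delay line are fetched without any extra wait.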