mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
*) robots.txt: adding support for crawl-delay
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3737 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
11ac7688d5
commit
9a4375b115
|
@ -90,6 +90,7 @@ public final class robotsParser{
|
|||
public static final String ROBOTS_ALLOW = "Allow:".toUpperCase();
|
||||
public static final String ROBOTS_COMMENT = "#";
|
||||
public static final String ROBOTS_SITEMAP = "Sitemap:".toUpperCase();
|
||||
public static final String ROBOTS_CRAWL_DELAY = "Crawl-Delay:".toUpperCase();
|
||||
|
||||
/*public robotsParser(URL robotsUrl){
|
||||
}*/
|
||||
|
@ -121,6 +122,7 @@ public final class robotsParser{
|
|||
|
||||
int pos;
|
||||
String line = null, lineUpper = null, sitemap = null;
|
||||
Integer crawlDelay = null;
|
||||
boolean isRuleBlock4AllAgents = false,
|
||||
isRuleBlock4YaCyAgent = false,
|
||||
rule4YaCyFound = false,
|
||||
|
@ -149,6 +151,7 @@ public final class robotsParser{
|
|||
inBlock = false;
|
||||
isRuleBlock4AllAgents = false;
|
||||
isRuleBlock4YaCyAgent = false;
|
||||
crawlDelay = null; // each block has a separate delay
|
||||
}
|
||||
|
||||
// cutting off comments at the line end
|
||||
|
@ -166,6 +169,15 @@ public final class robotsParser{
|
|||
isRuleBlock4YaCyAgent |= userAgent.toLowerCase().indexOf("yacy") >=0;
|
||||
if (isRuleBlock4YaCyAgent) rule4YaCyFound = true;
|
||||
}
|
||||
} else if (lineUpper.startsWith(ROBOTS_CRAWL_DELAY)) {
|
||||
pos = line.indexOf(" ");
|
||||
if (pos != -1) {
|
||||
try {
|
||||
crawlDelay = Integer.valueOf(line.substring(pos).trim());
|
||||
} catch (NumberFormatException e) {
|
||||
// invalid crawling delay
|
||||
}
|
||||
}
|
||||
} else if (lineUpper.startsWith(ROBOTS_DISALLOW) ||
|
||||
lineUpper.startsWith(ROBOTS_ALLOW)) {
|
||||
inBlock = true;
|
||||
|
@ -211,7 +223,7 @@ public final class robotsParser{
|
|||
}
|
||||
|
||||
ArrayList denyList = (rule4YaCyFound)?deny4YaCyAgent:deny4AllAgents;
|
||||
return new Object[]{denyList,sitemap};
|
||||
return new Object[]{denyList,sitemap,crawlDelay};
|
||||
}
|
||||
|
||||
private static final int getPort(URL theURL) {
|
||||
|
@ -258,6 +270,27 @@ public final class robotsParser{
|
|||
return sitemapURL;
|
||||
}
|
||||
|
||||
public static Integer getCrawlDelay(URL theURL) {
|
||||
if (theURL == null) throw new IllegalArgumentException();
|
||||
Integer crawlDelay = null;
|
||||
|
||||
// generating the hostname:poart string needed to do a DB lookup
|
||||
String urlHostPort = getHostPort(theURL);
|
||||
|
||||
plasmaCrawlRobotsTxt.Entry robotsTxt4Host = null;
|
||||
synchronized(urlHostPort) {
|
||||
// doing a DB lookup to determine if the robots data is already available
|
||||
robotsTxt4Host = plasmaSwitchboard.robots.getEntry(urlHostPort);
|
||||
}
|
||||
if (robotsTxt4Host == null) return null;
|
||||
|
||||
try {
|
||||
crawlDelay = robotsTxt4Host.getCrawlDelay();
|
||||
} catch (NumberFormatException e) {/* ignore this */}
|
||||
|
||||
return crawlDelay;
|
||||
}
|
||||
|
||||
public static boolean isDisallowed(URL nexturl) {
|
||||
if (nexturl == null) throw new IllegalArgumentException();
|
||||
|
||||
|
@ -309,6 +342,7 @@ public final class robotsParser{
|
|||
if ((robotsTxt4Host==null)||((robotsTxt4Host!=null)&&(result!=null))) {
|
||||
ArrayList denyPath = null;
|
||||
String sitemap = null;
|
||||
Integer crawlDelay = null;
|
||||
if (accessCompletelyRestricted) {
|
||||
denyPath = new ArrayList();
|
||||
denyPath.add("/");
|
||||
|
@ -318,13 +352,14 @@ public final class robotsParser{
|
|||
Object[] parserResult = robotsParser.parse(robotsTxt);
|
||||
denyPath = (ArrayList) parserResult[0];
|
||||
sitemap = (String) parserResult[1];
|
||||
crawlDelay = (Integer) parserResult[2];
|
||||
} catch (IOException e) {
|
||||
serverLog.logSevere("ROBOTS","Unable to parse the robots.txt file from URL '" + robotsURL + "'.");
|
||||
}
|
||||
}
|
||||
|
||||
// storing the data into the robots DB
|
||||
robotsTxt4Host = plasmaSwitchboard.robots.addEntry(urlHostPort,denyPath,new Date(),modDate,eTag,sitemap);
|
||||
robotsTxt4Host = plasmaSwitchboard.robots.addEntry(urlHostPort,denyPath,new Date(),modDate,eTag,sitemap,crawlDelay);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -84,16 +84,16 @@ public class plasmaCrawlRobotsTxt {
|
|||
}
|
||||
|
||||
public void close() {
|
||||
robotsTable.close();
|
||||
this.robotsTable.close();
|
||||
}
|
||||
|
||||
public int size() {
|
||||
return robotsTable.size();
|
||||
return this.robotsTable.size();
|
||||
}
|
||||
|
||||
public void removeEntry(String hostName) {
|
||||
try {
|
||||
robotsTable.remove(hostName.toLowerCase());
|
||||
this.robotsTable.remove(hostName.toLowerCase());
|
||||
} catch (IOException e) {
|
||||
|
||||
} catch (kelondroException e) {
|
||||
|
@ -103,7 +103,7 @@ public class plasmaCrawlRobotsTxt {
|
|||
|
||||
public Entry getEntry(String hostName) {
|
||||
try {
|
||||
Map record = robotsTable.getMap(hostName);
|
||||
Map record = this.robotsTable.getMap(hostName);
|
||||
if (record == null) return null;
|
||||
return new Entry(hostName, record);
|
||||
} catch (kelondroException e) {
|
||||
|
@ -112,8 +112,16 @@ public class plasmaCrawlRobotsTxt {
|
|||
}
|
||||
}
|
||||
|
||||
public Entry addEntry(String hostName, ArrayList disallowPathList, Date loadedDate, Date modDate, String eTag, String sitemap) {
|
||||
Entry entry = new Entry(hostName,disallowPathList,loadedDate,modDate,eTag,sitemap);
|
||||
public Entry addEntry(
|
||||
String hostName,
|
||||
ArrayList disallowPathList,
|
||||
Date loadedDate,
|
||||
Date modDate,
|
||||
String eTag,
|
||||
String sitemap,
|
||||
Integer crawlDelay
|
||||
) {
|
||||
Entry entry = new Entry(hostName,disallowPathList,loadedDate,modDate,eTag,sitemap,crawlDelay);
|
||||
addEntry(entry);
|
||||
return entry;
|
||||
}
|
||||
|
@ -121,7 +129,7 @@ public class plasmaCrawlRobotsTxt {
|
|||
public String addEntry(Entry entry) {
|
||||
// writes a new page and returns key
|
||||
try {
|
||||
robotsTable.set(entry.hostName,entry.mem);
|
||||
this.robotsTable.set(entry.hostName,entry.mem);
|
||||
return entry.hostName;
|
||||
} catch (IOException e) {
|
||||
return null;
|
||||
|
@ -134,11 +142,12 @@ public class plasmaCrawlRobotsTxt {
|
|||
public static final String MOD_DATE = "modDate";
|
||||
public static final String ETAG = "etag";
|
||||
public static final String SITEMAP = "sitemap";
|
||||
public static final String CRAWL_DELAY = "crawlDelay";
|
||||
|
||||
// this is a simple record structure that holds all robots.txt properties of a single host
|
||||
private Map mem;
|
||||
Map mem;
|
||||
private LinkedList disallowPathList;
|
||||
private String hostName;
|
||||
String hostName;
|
||||
|
||||
public Entry(String hostName, Map mem) {
|
||||
this.hostName = hostName.toLowerCase();
|
||||
|
@ -164,8 +173,10 @@ public class plasmaCrawlRobotsTxt {
|
|||
Date loadedDate,
|
||||
Date modDate,
|
||||
String eTag,
|
||||
String sitemap) {
|
||||
if ((hostName == null) || (hostName.length() == 0)) throw new IllegalArgumentException();
|
||||
String sitemap,
|
||||
Integer crawlDelay
|
||||
) {
|
||||
if ((hostName == null) || (hostName.length() == 0)) throw new IllegalArgumentException("The hostname is missing");
|
||||
|
||||
this.hostName = hostName.trim().toLowerCase();
|
||||
this.disallowPathList = new LinkedList();
|
||||
|
@ -175,6 +186,7 @@ public class plasmaCrawlRobotsTxt {
|
|||
if (modDate != null) this.mem.put(MOD_DATE,Long.toString(modDate.getTime()));
|
||||
if (eTag != null) this.mem.put(ETAG,eTag);
|
||||
if (sitemap != null) this.mem.put(SITEMAP,sitemap);
|
||||
if (crawlDelay != null) this.mem.put(CRAWL_DELAY,crawlDelay.toString());
|
||||
|
||||
if ((disallowPathList != null)&&(disallowPathList.size()>0)) {
|
||||
this.disallowPathList.addAll(disallowPathList);
|
||||
|
@ -231,6 +243,13 @@ public class plasmaCrawlRobotsTxt {
|
|||
return null;
|
||||
}
|
||||
|
||||
public Integer getCrawlDelay() {
|
||||
if (this.mem.containsKey(CRAWL_DELAY)) {
|
||||
return Integer.valueOf((String)this.mem.get(CRAWL_DELAY));
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public boolean isDisallowed(String path) {
|
||||
if ((this.mem == null) || (this.disallowPathList.size() == 0)) return false;
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user