Fix for robots.txt handling: delete the old entry before starting a new crawl.
This commit is contained in:
Michael Peter Christen 2014-04-09 18:33:48 +02:00
parent 8068e68474
commit 1c21b3256d
3 changed files with 24 additions and 2 deletions

View File

@ -217,6 +217,9 @@ public class Crawler_p {
if (crawlName.endsWith(",")) crawlName = crawlName.substring(0, crawlName.length() - 1);
if (crawlName.length() == 0 && sitemapURLStr.length() > 0) crawlName = "sitemap loader for " + sitemapURLStr;
// delete old robots entries
for (DigestURL ru: rootURLs) sb.robots.delete(ru);
// set the crawl filter
String ipMustMatch = post.get("ipMustmatch", CrawlProfile.MATCH_ALL_STRING);
final String ipMustNotMatch = post.get("ipMustnotmatch", CrawlProfile.MATCH_NEVER_STRING);

View File

@ -185,7 +185,24 @@ public class RobotsTxt {
return robotsTxt4Host;
}
/**
 * Removes the entry for the given URL from the robots table.
 * <p>
 * The entry is looked up under the key derived from {@code getHostPort(theURL)}.
 * The call is best-effort: it returns silently when no host/port key can be
 * derived or when the robots table is {@code null}, and it logs (rather than
 * propagates) any {@link IOException} raised while opening the table or
 * deleting the entry.
 *
 * @param theURL the URL whose host/port entry shall be removed
 */
public void delete(final MultiProtocolURL theURL) {
    final String urlHostPort = getHostPort(theURL);
    if (urlHostPort == null) return;
    final BEncodedHeap robotsTable;
    try {
        robotsTable = this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME);
    } catch (final IOException e1) {
        log.severe("tables not available", e1);
        return;
    }
    if (robotsTable == null) return;
    try {
        robotsTable.delete(robotsTable.encodedKey(urlHostPort));
    } catch (final IOException e) {
        // Do not swallow silently: a failed delete leaves the old entry in the
        // table, so record it. Still best-effort, so the caller is not interrupted.
        log.severe("cannot delete robots entry for " + urlHostPort, e);
    }
}
public void ensureExist(final MultiProtocolURL theURL, final ClientIdentification.Agent agent, boolean concurrent) {
if (theURL.isLocal()) return;
final String urlHostPort = getHostPort(theURL);

View File

@ -1492,7 +1492,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
Collection<Object> c = doc.getFieldValues(CollectionSchema.collection_sxt.getSolrFieldName());
if (c != null) for (Object cn: c) if (cn != null) this.collections.put((String) cn, QueryParams.catchall_pattern);
this.failReason = (String) doc.getFieldValue(CollectionSchema.failreason_s.getSolrFieldName());
this.failType = FailType.valueOf((String) doc.getFieldValue(CollectionSchema.failtype_s.getSolrFieldName()));
String fts = (String) doc.getFieldValue(CollectionSchema.failtype_s.getSolrFieldName());
if (fts == null) ConcurrentLog.warn("CollectionConfiguration", "no fail type given for URL " + this.digestURL.toNormalform(true));
this.failType = fts == null ? FailType.fail : FailType.valueOf(fts);
this.httpstatus = (Integer) doc.getFieldValue(CollectionSchema.httpstatus_i.getSolrFieldName());
this.failtime = (Date) doc.getFieldValue(CollectionSchema.load_date_dt.getSolrFieldName());
}