mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
fix for robots.txt handling: delete old entry before starting a new
crawl.
This commit is contained in:
parent
8068e68474
commit
1c21b3256d
|
@ -217,6 +217,9 @@ public class Crawler_p {
|
|||
if (crawlName.endsWith(",")) crawlName = crawlName.substring(0, crawlName.length() - 1);
|
||||
if (crawlName.length() == 0 && sitemapURLStr.length() > 0) crawlName = "sitemap loader for " + sitemapURLStr;
|
||||
|
||||
// delete old robots entries
|
||||
for (DigestURL ru: rootURLs) sb.robots.delete(ru);
|
||||
|
||||
// set the crawl filter
|
||||
String ipMustMatch = post.get("ipMustmatch", CrawlProfile.MATCH_ALL_STRING);
|
||||
final String ipMustNotMatch = post.get("ipMustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
|
||||
|
|
|
@ -186,6 +186,23 @@ public class RobotsTxt {
|
|||
return robotsTxt4Host;
|
||||
}
|
||||
|
||||
public void delete(final MultiProtocolURL theURL) {
|
||||
final String urlHostPort = getHostPort(theURL);
|
||||
if (urlHostPort == null) return;
|
||||
final BEncodedHeap robotsTable;
|
||||
try {
|
||||
robotsTable = this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME);
|
||||
} catch (final IOException e1) {
|
||||
log.severe("tables not available", e1);
|
||||
return;
|
||||
}
|
||||
if (robotsTable == null) return;
|
||||
try {
|
||||
robotsTable.delete(robotsTable.encodedKey(urlHostPort));
|
||||
} catch (IOException e) {
|
||||
}
|
||||
}
|
||||
|
||||
public void ensureExist(final MultiProtocolURL theURL, final ClientIdentification.Agent agent, boolean concurrent) {
|
||||
if (theURL.isLocal()) return;
|
||||
final String urlHostPort = getHostPort(theURL);
|
||||
|
|
|
@ -1492,7 +1492,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
Collection<Object> c = doc.getFieldValues(CollectionSchema.collection_sxt.getSolrFieldName());
|
||||
if (c != null) for (Object cn: c) if (cn != null) this.collections.put((String) cn, QueryParams.catchall_pattern);
|
||||
this.failReason = (String) doc.getFieldValue(CollectionSchema.failreason_s.getSolrFieldName());
|
||||
this.failType = FailType.valueOf((String) doc.getFieldValue(CollectionSchema.failtype_s.getSolrFieldName()));
|
||||
String fts = (String) doc.getFieldValue(CollectionSchema.failtype_s.getSolrFieldName());
|
||||
if (fts == null) ConcurrentLog.warn("CollectionConfiguration", "no fail type given for URL " + this.digestURL.toNormalform(true));
|
||||
this.failType = fts == null ? FailType.fail : FailType.valueOf(fts);
|
||||
this.httpstatus = (Integer) doc.getFieldValue(CollectionSchema.httpstatus_i.getSolrFieldName());
|
||||
this.failtime = (Date) doc.getFieldValue(CollectionSchema.load_date_dt.getSolrFieldName());
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user