fixed sitemap crawl start

This commit is contained in:
Michael Peter Christen 2013-10-21 12:49:32 +02:00
parent b743e6d79f
commit 1a09771be8

View File

@@ -171,6 +171,7 @@ public class Crawler_p {
}
final boolean deleteold = (deleteage && deleteageDate != null) || (restrictedcrawl && post.getBoolean("deleteold"));
final String sitemapURLStr = post.get("sitemapURL","");
String crawlingStart0 = post.get("crawlingURL","").trim(); // the crawljob start url
String[] rootURLs0 = crawlingStart0.indexOf('\n') > 0 || crawlingStart0.indexOf('\r') > 0 ? crawlingStart0.split("[\\r\\n]+") : crawlingStart0.split(Pattern.quote("|"));
Set<DigestURL> rootURLs = new HashSet<DigestURL>();
@@ -199,7 +200,7 @@ public class Crawler_p {
if (p >= 8) crawlName = crawlName.substring(0, p);
}
if (crawlName.endsWith(",")) crawlName = crawlName.substring(0, crawlName.length() - 1);
if (crawlName.length() == 0 && sitemapURLStr.length() > 0) crawlName = "sitemap loader for " + sitemapURLStr;
// set the crawl filter
String ipMustMatch = post.get("ipMustmatch", CrawlProfile.MATCH_ALL_STRING);
@@ -442,7 +443,6 @@ public class Crawler_p {
if (successurls.size() > 0) sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
}
} else if ("sitemap".equals(crawlingMode)) {
final String sitemapURLStr = post.get("sitemapURL","");
try {
final DigestURL sitemapURL = new DigestURL(sitemapURLStr);
sb.crawler.putActive(handle, profile);