*) Added a Set to which filter elements are written before the mustmatch-filter is created, to avoid huge lists of duplicate elements in the mustmatch-filter when starting a crawl from a "Link-List of URL" on CrawlStartSite_p.html

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7456 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
low012 2011-01-28 16:24:33 +00:00
parent 9a1e0158fa
commit ae10ed5613

View File

@ -30,6 +30,7 @@ import java.io.Writer;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
@ -479,8 +480,12 @@ public class Crawler_p {
// get links and generate filter
final StringBuilder filter = new StringBuilder();
final Map<MultiProtocolURI, String> hyperlinks = scraper.getAnchors();
for (MultiProtocolURI uri: hyperlinks.keySet()) {
filter.append('|').append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*");
final Set<String> filterSet = new HashSet<String>();
for (final MultiProtocolURI uri: hyperlinks.keySet()) {
filterSet.add(new StringBuilder().append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*").toString());
}
for (final String element : filterSet) {
filter.append('|').append(element);
}
newcrawlingMustMatch = filter.length() > 0 ? filter.substring(1) : "";