mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
*) Added a Set that collects the filter elements before the must-match filter is built, so that the must-match filter no longer contains huge lists of duplicate elements when a crawl is started from a "Link-List of URL" on CrawlStartSite_p.html
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7456 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
9a1e0158fa
commit
ae10ed5613
|
@ -30,6 +30,7 @@ import java.io.Writer;
|
|||
import java.net.MalformedURLException;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
@ -479,8 +480,12 @@ public class Crawler_p {
|
|||
// get links and generate filter
|
||||
final StringBuilder filter = new StringBuilder();
|
||||
final Map<MultiProtocolURI, String> hyperlinks = scraper.getAnchors();
|
||||
for (MultiProtocolURI uri: hyperlinks.keySet()) {
|
||||
filter.append('|').append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*");
|
||||
final Set<String> filterSet = new HashSet<String>();
|
||||
for (final MultiProtocolURI uri: hyperlinks.keySet()) {
|
||||
filterSet.add(new StringBuilder().append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*").toString());
|
||||
}
|
||||
for (final String element : filterSet) {
|
||||
filter.append('|').append(element);
|
||||
}
|
||||
newcrawlingMustMatch = filter.length() > 0 ? filter.substring(1) : "";
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user