Blacklist import and update performance improvements.

Measurement sample: import from a local blacklist file containing about
15000 entries
 - before refactoring: several minutes
 - after refactoring: a few seconds!
This commit is contained in:
luccioman 2017-01-06 12:24:31 +01:00
parent e3892b0957
commit 339f005ced
5 changed files with 190 additions and 98 deletions

View File

@ -47,6 +47,7 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.data.ListManager; import net.yacy.data.ListManager;
import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.FileUtils;
import net.yacy.repository.Blacklist; import net.yacy.repository.Blacklist;
import net.yacy.repository.BlacklistHostAndPath;
import net.yacy.repository.Blacklist.BlacklistError; import net.yacy.repository.Blacklist.BlacklistError;
import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard; import net.yacy.search.Switchboard;
@ -389,6 +390,8 @@ public class BlacklistCleaner_p {
final String[] oldEntry, final String[] newEntry) { final String[] oldEntry, final String[] newEntry) {
removeEntries(blacklistToUse, supportedBlacklistTypes, oldEntry); removeEntries(blacklistToUse, supportedBlacklistTypes, oldEntry);
String host, path; String host, path;
/* Prepare the list of new blacklist items so that they can then be added in one operation for better performance */
final Collection<BlacklistHostAndPath> newEntries = new ArrayList<>();
for (final String n : newEntry) { for (final String n : newEntry) {
final int pos = n.indexOf('/', 0); final int pos = n.indexOf('/', 0);
if (pos < 0) { if (pos < 0) {
@ -398,21 +401,21 @@ public class BlacklistCleaner_p {
host = n.substring(0, pos); host = n.substring(0, pos);
path = n.substring(pos + 1); path = n.substring(pos + 1);
} }
for (final BlacklistType s : supportedBlacklistTypes) { newEntries.add(new BlacklistHostAndPath(host, path));
if (ListManager.listSetContains(s + ".BlackLists", }
blacklistToUse)) { for (final BlacklistType s : supportedBlacklistTypes) {
try { if (ListManager.listSetContains(s + ".BlackLists",
Switchboard.urlBlacklist.add(s, blacklistToUse, host, blacklistToUse)) {
path); try {
} catch (PunycodeException e) { Switchboard.urlBlacklist.add(s, blacklistToUse, newEntries);
ConcurrentLog.warn(APP_NAME, } catch (PunycodeException e) {
"Unable to add blacklist entry to blacklist " ConcurrentLog.warn(APP_NAME,
+ s, e); "Unable to add blacklist entry to blacklist "
} + s, e);
} }
} }
SearchEventCache.cleanupEvents(true);
} }
SearchEventCache.cleanupEvents(true);
return newEntry.length; return newEntry.length;
} }
} }

View File

@ -26,6 +26,8 @@
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date; import java.util.Date;
import java.util.Iterator; import java.util.Iterator;
import java.util.List; import java.util.List;
@ -59,6 +61,7 @@ import net.yacy.peers.DHTSelection;
import net.yacy.peers.Protocol; import net.yacy.peers.Protocol;
import net.yacy.peers.Seed; import net.yacy.peers.Seed;
import net.yacy.repository.Blacklist; import net.yacy.repository.Blacklist;
import net.yacy.repository.BlacklistHostAndPath;
import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard; import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants; import net.yacy.search.SwitchboardConstants;
@ -374,8 +377,9 @@ public class IndexControlRWIs_p {
Word.commonHashOrder, Word.commonHashOrder,
urlb.size()); urlb.size());
if ( post.containsKey("blacklisturls") ) { if ( post.containsKey("blacklisturls") ) {
final String[] supportedBlacklistTypes = env.getConfigArray("BlackLists.types", "");
DigestURL url; DigestURL url;
/* Prepare the list of new blacklist items so that they can then be added in one operation for better performance */
final Collection<BlacklistHostAndPath> items = new ArrayList<>();
for ( final byte[] b : urlb ) { for ( final byte[] b : urlb ) {
try { try {
urlHashes.put(b); urlHashes.put(b);
@ -386,29 +390,27 @@ public class IndexControlRWIs_p {
url = segment.fulltext().getURL(ASCII.String(b)); url = segment.fulltext().getURL(ASCII.String(b));
segment.fulltext().remove(b); segment.fulltext().remove(b);
if ( url != null ) { if ( url != null ) {
for ( final String supportedBlacklistType : supportedBlacklistTypes ) { items.add(new BlacklistHostAndPath(url.getHost(), url.getFile()));
if ( ListManager.listSetContains(
supportedBlacklistType + ".BlackLists",
blacklist) ) {
try {
Switchboard.urlBlacklist.add(
BlacklistType.valueOf(supportedBlacklistType),
blacklist,
url.getHost(),
url.getFile());
} catch (PunycodeException e) {
ConcurrentLog.warn(APP_NAME,
"Unable to add blacklist entry to blacklist "
+ supportedBlacklistType, e);
}
}
}
SearchEventCache.cleanupEvents(true);
} }
} catch (IOException e1) { } catch (IOException e1) {
ConcurrentLog.logException(e1); ConcurrentLog.logException(e1);
} }
} }
for (final BlacklistType supportedBlacklistType : BlacklistType.values()) {
if ( ListManager.listSetContains(
supportedBlacklistType + ".BlackLists",
blacklist) ) {
try {
Switchboard.urlBlacklist.add(supportedBlacklistType,
blacklist, items);
} catch (PunycodeException e) {
ConcurrentLog.warn(APP_NAME,
"Unable to add blacklist entries to blacklist "
+ supportedBlacklistType, e);
}
}
}
SearchEventCache.cleanupEvents(true);
} }
if ( post.containsKey("blacklistdomains") ) { if ( post.containsKey("blacklistdomains") ) {

View File

@ -32,12 +32,16 @@
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.StringReader; import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet; import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import java.util.List; import java.util.List;
import java.util.Set; import java.util.Set;
import org.xml.sax.SAXException;
import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.ClientIdentification;
@ -49,14 +53,13 @@ import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.FileUtils;
import net.yacy.peers.Seed; import net.yacy.peers.Seed;
import net.yacy.repository.Blacklist; import net.yacy.repository.Blacklist;
import net.yacy.repository.BlacklistHostAndPath;
import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard; import net.yacy.search.Switchboard;
import net.yacy.search.query.SearchEventCache; import net.yacy.search.query.SearchEventCache;
import net.yacy.server.serverObjects; import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch; import net.yacy.server.serverSwitch;
import org.xml.sax.SAXException;
/** /**
* Handle blacklist import operations. Either : * Handle blacklist import operations. Either :
@ -226,9 +229,11 @@ public class sharedBlacklist_p {
try { try {
// loop through the received entry list // loop through the received entry list
final int num = post.getInt("num", 0); final int num = post.getInt("num", 0);
for(int i = 0; i < num; i++){ final Collection<BlacklistHostAndPath> newItems = new ArrayList<>();
if( post.containsKey("item" + i) ){ /* Prepare the new blacklist items list to add then them in one operation for better performance */
String newItem = post.get("item" + i); for(int i = 0; i < num; i++) {
String newItem = post.get("item" + i);
if(newItem != null){
//This should not be needed... //This should not be needed...
if ( newItem.startsWith("http://") ){ if ( newItem.startsWith("http://") ){
@ -242,16 +247,16 @@ public class sharedBlacklist_p {
pos = newItem.length(); pos = newItem.length();
newItem = newItem + "/.*"; newItem = newItem + "/.*";
} }
newItems.add(new BlacklistHostAndPath(newItem.substring(0, pos), newItem.substring(pos + 1)));
if (Switchboard.urlBlacklist != null) { }
for (final BlacklistType supportedBlacklistType : BlacklistType.values()) { }
if (ListManager.listSetContains(supportedBlacklistType + ".BlackLists",selectedBlacklistName)) { if (Switchboard.urlBlacklist != null) {
Switchboard.urlBlacklist.add(supportedBlacklistType,selectedBlacklistName,newItem.substring(0, pos), newItem.substring(pos + 1)); for (final BlacklistType supportedBlacklistType : BlacklistType.values()) {
} if (ListManager.listSetContains(supportedBlacklistType + ".BlackLists",selectedBlacklistName)) {
} Switchboard.urlBlacklist.add(supportedBlacklistType, selectedBlacklistName, newItems);
SearchEventCache.cleanupEvents(true);
} }
} }
SearchEventCache.cleanupEvents(true);
} }
} catch (final Exception e) { } catch (final Exception e) {
prop.put("status", "1"); prop.put("status", "1");

View File

@ -33,6 +33,8 @@ import java.io.IOException;
import java.io.ObjectInputStream; import java.io.ObjectInputStream;
import java.io.ObjectOutputStream; import java.io.ObjectOutputStream;
import java.io.PrintWriter; import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet; import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import java.util.List; import java.util.List;
@ -317,6 +319,89 @@ public class Blacklist {
} }
} }
/**
 * Adds entries to a given blacklist internal data and updates the source
 * file.
 * <p>
 * The content of the blacklist file is read once and a single appending
 * writer is opened for the whole batch, so the cost of file access does not
 * grow with the number of items. Items having a null host or a null path
 * are logged and skipped, as are items already matched by
 * {@code contains(blacklistType, host, path)}.
 *
 * @param blacklistType
 *            the type of blacklist whose in-memory map is updated
 * @param blacklistToUse
 *            source file, resolved against the blacklist root path
 * @param items
 *            blacklist host/path items to add; nothing is done when null
 * @throws PunycodeException
 *             presumably when a non basic host name can not be converted
 *             to Punycode (TODO confirm); the exception is not caught
 *             here, so the items following the failing one are not added
 */
public final void add(final BlacklistType blacklistType, final String blacklistToUse,
        final Collection<BlacklistHostAndPath> items) throws PunycodeException {
    if (items == null) {
        return;
    }

    final File blacklistFile = new File(this.blacklistRootPath, blacklistToUse);
    PrintWriter pw = null;
    try {
        /* Get the content of the blacklist file in memory */
        final Set<String> blacklist = new HashSet<String>(FileUtils.getListArray(blacklistFile));

        /* Open a writer on the file, in append mode */
        pw = new PrintWriter(new FileWriter(blacklistFile, true));

        for (final BlacklistHostAndPath itemToAdd : items) {
            final String host = itemToAdd.getHost();
            final String path = itemToAdd.getPath();

            /*
             * Validate before any use : the values are handed to
             * Punycode.isBasic() and contains() just below.
             * NOTE(review): presumably these helpers do not accept null - confirm.
             */
            if (host == null) {
                log.warn("host must not be null");
                /* Continue to the next item */
                continue;
            }
            if (path == null) {
                log.warn("path must not be null");
                /* Continue to the next item */
                continue;
            }

            final String safeHost = Punycode.isBasic(host) ? host : MultiProtocolURL.toPunycode(host);
            if (safeHost == null) {
                log.warn("host must not be null");
                /* Continue to the next item */
                continue;
            }

            if (contains(blacklistType, safeHost, path)) {
                /* Already blacklisted : continue to the next item */
                continue;
            }

            /* The stored path never starts with a slash */
            String p = (!path.isEmpty() && path.charAt(0) == '/') ? path.substring(1) : path;

            final Map<String, Set<Pattern>> blacklistMap = getBlacklistMap(blacklistType, isMatchable(host));

            // avoid PatternSyntaxException : a leading '*' is prefixed with '.'
            final String h = ((!isMatchable(safeHost) && !safeHost.isEmpty() && safeHost.charAt(0) == '*')
                    ? "." + safeHost : safeHost).toLowerCase();
            if (!p.isEmpty() && p.charAt(0) == '*') {
                p = "." + p;
            }

            /* Get or create the set of path patterns registered for this host */
            Set<Pattern> hostList = blacklistMap.get(h);
            if (hostList == null) {
                hostList = new HashSet<Pattern>();
                blacklistMap.put(h, hostList);
            }

            final Pattern pattern = Pattern.compile(p, Pattern.CASE_INSENSITIVE);
            hostList.add(pattern);

            // Append the line to the file, unless it is already there.
            final String newEntry = h + "/" + pattern;
            if (!blacklist.contains(newEntry)) {
                pw.println(newEntry);
                /* Remember the line so that a duplicate in the same batch is written only once */
                blacklist.add(newEntry);
            }
        }
    } catch (final IOException e) {
        ConcurrentLog.logException(e);
    } finally {
        if (pw != null) {
            pw.close();
            /* PrintWriter does not throw on write errors : its error flag has to be checked */
            if (pw.checkError()) {
                log.warn("could not close stream to " + blacklistToUse + "! ");
            }
        }
    }
}
/** /**
* Adds entry to a given blacklist internal data and updates the source file * Adds entry to a given blacklist internal data and updates the source file
* @param blacklistType * @param blacklistType
@ -326,60 +411,9 @@ public class Blacklist {
* @throws PunycodeException * @throws PunycodeException
*/ */
public final void add(final BlacklistType blacklistType, final String blacklistToUse, final String host, final String path) throws PunycodeException { public final void add(final BlacklistType blacklistType, final String blacklistToUse, final String host, final String path) throws PunycodeException {
final Collection<BlacklistHostAndPath> oneItemList = new ArrayList<>();
final String safeHost = Punycode.isBasic(host) ? host : MultiProtocolURL.toPunycode(host); oneItemList.add(new BlacklistHostAndPath(host, path));
this.add(blacklistType, blacklistToUse, oneItemList);
if (contains(blacklistType, safeHost, path)) {
return;
}
if (safeHost == null) {
throw new IllegalArgumentException("host may not be null");
}
if (path == null) {
throw new IllegalArgumentException("path may not be null");
}
String p = (!path.isEmpty() && path.charAt(0) == '/') ? path.substring(1) : path;
final Map<String, Set<Pattern>> blacklistMap = getBlacklistMap(blacklistType, isMatchable(host));
// avoid PatternSyntaxException e
final String h = ((!isMatchable(safeHost) && !safeHost.isEmpty() && safeHost.charAt(0) == '*') ? "." + safeHost : safeHost).toLowerCase();
if (!p.isEmpty() && p.charAt(0) == '*') {
p = "." + p;
}
Set<Pattern> hostList;
if (!(blacklistMap.containsKey(h) && ((hostList = blacklistMap.get(h)) != null))) {
blacklistMap.put(h, (hostList = new HashSet<Pattern>()));
}
Pattern pattern = Pattern.compile(p, Pattern.CASE_INSENSITIVE);
hostList.add(pattern);
// Append the line to the file.
PrintWriter pw = null;
try {
final String newEntry = h + "/" + pattern;
if (!blacklistFileContains(blacklistRootPath,
blacklistToUse, newEntry)) {
pw = new PrintWriter(new FileWriter(new File(blacklistRootPath,
blacklistToUse), true));
pw.println(newEntry);
pw.close();
}
} catch (final IOException e) {
ConcurrentLog.logException(e);
} finally {
if (pw != null) {
try {
pw.close();
} catch (final Exception e) {
log.warn("could not close stream to " +
blacklistToUse + "! " + e.getMessage());
}
}
}
} }
/** /**

View File

@ -0,0 +1,48 @@
// BlacklistHostAndPath.java
// Copyright 2017 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.repository;
/**
 * Pair made of the host part and the path part of one blacklist entry.
 * Both values are assigned once by the constructor and only exposed through
 * getters.
 */
public class BlacklistHostAndPath {

    /** Host part of the blacklist entry */
    private final String host;

    /** Path part of the blacklist entry */
    private final String path;

    /**
     * Creates a new host and path pair. The values are stored as they are,
     * without any check.
     *
     * @param host
     *            the blacklisted host
     * @param path
     *            the blacklisted path
     */
    public BlacklistHostAndPath(final String host, final String path) {
        this.host = host;
        this.path = path;
    }

    /**
     * @return the blacklisted host, as given to the constructor
     */
    public String getHost() {
        return this.host;
    }

    /**
     * @return the blacklisted path, as given to the constructor
     */
    public String getPath() {
        return this.path;
    }
}