Blacklist import and update performance improvements.

Measurement sample: import from a local blacklist file containing about
15000 entries
 - before refactoring: several minutes
 - after refactoring: a few seconds!
This commit is contained in:
luccioman 2017-01-06 12:24:31 +01:00
parent e3892b0957
commit 339f005ced
5 changed files with 190 additions and 98 deletions

View File

@ -47,6 +47,7 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.data.ListManager; import net.yacy.data.ListManager;
import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.FileUtils;
import net.yacy.repository.Blacklist; import net.yacy.repository.Blacklist;
import net.yacy.repository.BlacklistHostAndPath;
import net.yacy.repository.Blacklist.BlacklistError; import net.yacy.repository.Blacklist.BlacklistError;
import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard; import net.yacy.search.Switchboard;
@ -389,6 +390,8 @@ public class BlacklistCleaner_p {
final String[] oldEntry, final String[] newEntry) { final String[] oldEntry, final String[] newEntry) {
removeEntries(blacklistToUse, supportedBlacklistTypes, oldEntry); removeEntries(blacklistToUse, supportedBlacklistTypes, oldEntry);
String host, path; String host, path;
/* Prepare the list of new blacklist items, then add them in one operation for better performance */
final Collection<BlacklistHostAndPath> newEntries = new ArrayList<>();
for (final String n : newEntry) { for (final String n : newEntry) {
final int pos = n.indexOf('/', 0); final int pos = n.indexOf('/', 0);
if (pos < 0) { if (pos < 0) {
@ -398,12 +401,13 @@ public class BlacklistCleaner_p {
host = n.substring(0, pos); host = n.substring(0, pos);
path = n.substring(pos + 1); path = n.substring(pos + 1);
} }
newEntries.add(new BlacklistHostAndPath(host, path));
}
for (final BlacklistType s : supportedBlacklistTypes) { for (final BlacklistType s : supportedBlacklistTypes) {
if (ListManager.listSetContains(s + ".BlackLists", if (ListManager.listSetContains(s + ".BlackLists",
blacklistToUse)) { blacklistToUse)) {
try { try {
Switchboard.urlBlacklist.add(s, blacklistToUse, host, Switchboard.urlBlacklist.add(s, blacklistToUse, newEntries);
path);
} catch (PunycodeException e) { } catch (PunycodeException e) {
ConcurrentLog.warn(APP_NAME, ConcurrentLog.warn(APP_NAME,
"Unable to add blacklist entry to blacklist " "Unable to add blacklist entry to blacklist "
@ -412,7 +416,6 @@ public class BlacklistCleaner_p {
} }
} }
SearchEventCache.cleanupEvents(true); SearchEventCache.cleanupEvents(true);
}
return newEntry.length; return newEntry.length;
} }
} }

View File

@ -26,6 +26,8 @@
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date; import java.util.Date;
import java.util.Iterator; import java.util.Iterator;
import java.util.List; import java.util.List;
@ -59,6 +61,7 @@ import net.yacy.peers.DHTSelection;
import net.yacy.peers.Protocol; import net.yacy.peers.Protocol;
import net.yacy.peers.Seed; import net.yacy.peers.Seed;
import net.yacy.repository.Blacklist; import net.yacy.repository.Blacklist;
import net.yacy.repository.BlacklistHostAndPath;
import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard; import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants; import net.yacy.search.SwitchboardConstants;
@ -374,8 +377,9 @@ public class IndexControlRWIs_p {
Word.commonHashOrder, Word.commonHashOrder,
urlb.size()); urlb.size());
if ( post.containsKey("blacklisturls") ) { if ( post.containsKey("blacklisturls") ) {
final String[] supportedBlacklistTypes = env.getConfigArray("BlackLists.types", "");
DigestURL url; DigestURL url;
/* Prepare the list of new blacklist items, then add them in one operation for better performance */
final Collection<BlacklistHostAndPath> items = new ArrayList<>();
for ( final byte[] b : urlb ) { for ( final byte[] b : urlb ) {
try { try {
urlHashes.put(b); urlHashes.put(b);
@ -386,29 +390,27 @@ public class IndexControlRWIs_p {
url = segment.fulltext().getURL(ASCII.String(b)); url = segment.fulltext().getURL(ASCII.String(b));
segment.fulltext().remove(b); segment.fulltext().remove(b);
if ( url != null ) { if ( url != null ) {
for ( final String supportedBlacklistType : supportedBlacklistTypes ) { items.add(new BlacklistHostAndPath(url.getHost(), url.getFile()));
if ( ListManager.listSetContains(
supportedBlacklistType + ".BlackLists",
blacklist) ) {
try {
Switchboard.urlBlacklist.add(
BlacklistType.valueOf(supportedBlacklistType),
blacklist,
url.getHost(),
url.getFile());
} catch (PunycodeException e) {
ConcurrentLog.warn(APP_NAME,
"Unable to add blacklist entry to blacklist "
+ supportedBlacklistType, e);
}
}
}
SearchEventCache.cleanupEvents(true);
} }
} catch (IOException e1) { } catch (IOException e1) {
ConcurrentLog.logException(e1); ConcurrentLog.logException(e1);
} }
} }
for (final BlacklistType supportedBlacklistType : BlacklistType.values()) {
if ( ListManager.listSetContains(
supportedBlacklistType + ".BlackLists",
blacklist) ) {
try {
Switchboard.urlBlacklist.add(supportedBlacklistType,
blacklist, items);
} catch (PunycodeException e) {
ConcurrentLog.warn(APP_NAME,
"Unable to add blacklist entries to blacklist "
+ supportedBlacklistType, e);
}
}
}
SearchEventCache.cleanupEvents(true);
} }
if ( post.containsKey("blacklistdomains") ) { if ( post.containsKey("blacklistdomains") ) {

View File

@ -32,12 +32,16 @@
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.StringReader; import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet; import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import java.util.List; import java.util.List;
import java.util.Set; import java.util.Set;
import org.xml.sax.SAXException;
import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.ClientIdentification;
@ -49,14 +53,13 @@ import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.FileUtils;
import net.yacy.peers.Seed; import net.yacy.peers.Seed;
import net.yacy.repository.Blacklist; import net.yacy.repository.Blacklist;
import net.yacy.repository.BlacklistHostAndPath;
import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard; import net.yacy.search.Switchboard;
import net.yacy.search.query.SearchEventCache; import net.yacy.search.query.SearchEventCache;
import net.yacy.server.serverObjects; import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch; import net.yacy.server.serverSwitch;
import org.xml.sax.SAXException;
/** /**
* Handle blacklist import operations. Either : * Handle blacklist import operations. Either :
@ -226,9 +229,11 @@ public class sharedBlacklist_p {
try { try {
// loop through the received entry list // loop through the received entry list
final int num = post.getInt("num", 0); final int num = post.getInt("num", 0);
final Collection<BlacklistHostAndPath> newItems = new ArrayList<>();
/* Prepare the list of new blacklist items, then add them in one operation for better performance */
for(int i = 0; i < num; i++) { for(int i = 0; i < num; i++) {
if( post.containsKey("item" + i) ){
String newItem = post.get("item" + i); String newItem = post.get("item" + i);
if(newItem != null){
//This should not be needed... //This should not be needed...
if ( newItem.startsWith("http://") ){ if ( newItem.startsWith("http://") ){
@ -242,17 +247,17 @@ public class sharedBlacklist_p {
pos = newItem.length(); pos = newItem.length();
newItem = newItem + "/.*"; newItem = newItem + "/.*";
} }
newItems.add(new BlacklistHostAndPath(newItem.substring(0, pos), newItem.substring(pos + 1)));
}
}
if (Switchboard.urlBlacklist != null) { if (Switchboard.urlBlacklist != null) {
for (final BlacklistType supportedBlacklistType : BlacklistType.values()) { for (final BlacklistType supportedBlacklistType : BlacklistType.values()) {
if (ListManager.listSetContains(supportedBlacklistType + ".BlackLists",selectedBlacklistName)) { if (ListManager.listSetContains(supportedBlacklistType + ".BlackLists",selectedBlacklistName)) {
Switchboard.urlBlacklist.add(supportedBlacklistType,selectedBlacklistName,newItem.substring(0, pos), newItem.substring(pos + 1)); Switchboard.urlBlacklist.add(supportedBlacklistType, selectedBlacklistName, newItems);
} }
} }
SearchEventCache.cleanupEvents(true); SearchEventCache.cleanupEvents(true);
} }
}
}
} catch (final Exception e) { } catch (final Exception e) {
prop.put("status", "1"); prop.put("status", "1");
prop.putHTML("status_error", e.getLocalizedMessage()); prop.putHTML("status_error", e.getLocalizedMessage());

View File

@ -33,6 +33,8 @@ import java.io.IOException;
import java.io.ObjectInputStream; import java.io.ObjectInputStream;
import java.io.ObjectOutputStream; import java.io.ObjectOutputStream;
import java.io.PrintWriter; import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet; import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import java.util.List; import java.util.List;
@ -318,32 +320,54 @@ public class Blacklist {
} }
/** /**
* Adds entry to a given blacklist internal data and updates the source file * Adds entries to a given blacklist internal data and updates the source
* file
*
* @param blacklistType * @param blacklistType
* @param blacklistToUse source file * @param blacklistToUse
* @param host * source file
* @param path * @param items
* blacklist host/path items to add
* @throws PunycodeException * @throws PunycodeException
*/ */
public final void add(final BlacklistType blacklistType, final String blacklistToUse, final String host, final String path) throws PunycodeException { public final void add(final BlacklistType blacklistType, final String blacklistToUse,
final Collection<BlacklistHostAndPath> items) throws PunycodeException {
if (items != null) {
PrintWriter pw = null;
try {
/* Get the content of the blacklist file in memory */
final Set<String> blacklist = new HashSet<String>(
FileUtils.getListArray(new File(this.blacklistRootPath, blacklistToUse)));
/* Open a writer on the file */
pw = new PrintWriter(new FileWriter(new File(this.blacklistRootPath, blacklistToUse), true));
for (BlacklistHostAndPath itemToAdd : items) {
final String host = itemToAdd.getHost();
final String path = itemToAdd.getPath();
final String safeHost = Punycode.isBasic(host) ? host : MultiProtocolURL.toPunycode(host); final String safeHost = Punycode.isBasic(host) ? host : MultiProtocolURL.toPunycode(host);
if (contains(blacklistType, safeHost, path)) { if (contains(blacklistType, safeHost, path)) {
return; /* Continue to the next item */
continue;
} }
if (safeHost == null) { if (safeHost == null) {
throw new IllegalArgumentException("host may not be null"); log.warn("host must not be null");
/* Continue to the next item */
continue;
} }
if (path == null) { if (path == null) {
throw new IllegalArgumentException("path may not be null"); log.warn("path must not be null");
/* Continue to the next item */
continue;
} }
String p = (!path.isEmpty() && path.charAt(0) == '/') ? path.substring(1) : path; String p = (!path.isEmpty() && path.charAt(0) == '/') ? path.substring(1) : path;
final Map<String, Set<Pattern>> blacklistMap = getBlacklistMap(blacklistType, isMatchable(host)); final Map<String, Set<Pattern>> blacklistMap = getBlacklistMap(blacklistType, isMatchable(host));
// avoid PatternSyntaxException e // avoid PatternSyntaxException e
final String h = ((!isMatchable(safeHost) && !safeHost.isEmpty() && safeHost.charAt(0) == '*') ? "." + safeHost : safeHost).toLowerCase(); final String h = ((!isMatchable(safeHost) && !safeHost.isEmpty() && safeHost.charAt(0) == '*')
? "." + safeHost : safeHost).toLowerCase();
if (!p.isEmpty() && p.charAt(0) == '*') { if (!p.isEmpty() && p.charAt(0) == '*') {
p = "." + p; p = "." + p;
} }
@ -358,29 +382,39 @@ public class Blacklist {
hostList.add(pattern); hostList.add(pattern);
// Append the line to the file. // Append the line to the file.
PrintWriter pw = null;
try {
final String newEntry = h + "/" + pattern; final String newEntry = h + "/" + pattern;
if (!blacklistFileContains(blacklistRootPath, if (!blacklist.contains(newEntry)) {
blacklistToUse, newEntry)) {
pw = new PrintWriter(new FileWriter(new File(blacklistRootPath,
blacklistToUse), true));
pw.println(newEntry); pw.println(newEntry);
pw.close(); blacklist.add(newEntry);
}
} }
} catch (final IOException e) { } catch (final IOException e) {
ConcurrentLog.logException(e); ConcurrentLog.logException(e);
} finally { } finally {
if (pw != null) { if (pw != null) {
try {
pw.close(); pw.close();
} catch (final Exception e) { if (pw.checkError()) {
log.warn("could not close stream to " + log.warn("could not close stream to " + blacklistToUse + "! ");
blacklistToUse + "! " + e.getMessage());
} }
} }
} }
} }
}
/**
 * Adds a single host/path entry to the given blacklist.
 * <p>
 * Convenience overload: the pair is wrapped into a one-element collection
 * and handed to the collection-based {@code add}, which does the actual work.
 *
 * @param blacklistType the type of blacklist to update
 * @param blacklistToUse name of the blacklist source file
 * @param host host part of the entry
 * @param path path part of the entry
 * @throws PunycodeException as declared by the collection-based overload this method delegates to
 */
public final void add(final BlacklistType blacklistType, final String blacklistToUse, final String host, final String path) throws PunycodeException {
    final BlacklistHostAndPath entry = new BlacklistHostAndPath(host, path);
    final Collection<BlacklistHostAndPath> singleEntry = new ArrayList<>(1);
    singleEntry.add(entry);
    add(blacklistType, blacklistToUse, singleEntry);
}
/** /**
* appends aN entry to the backlist source file and updates internal blacklist maps. * appends aN entry to the backlist source file and updates internal blacklist maps.

View File

@ -0,0 +1,48 @@
// BlacklistHostAndPath.java
// Copyright 2017 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.repository;
/**
* Blacklist host and path pair.
*/
public class BlacklistHostAndPath {
/** Blacklisted host */
private final String host;
/** Blacklisted path */
private final String path;
public BlacklistHostAndPath(final String host, final String path) {
this.host = host;
this.path = path;
}
public String getHost() {
return host;
}
public String getPath() {
return path;
}
}