mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
Blacklist import and update performance improvements.
Measurement sample : import from blacklist local file containing about 15000 entries - before refactoring : several minutes - after refactoring : a few seconds!
This commit is contained in:
parent
e3892b0957
commit
339f005ced
|
@ -47,6 +47,7 @@ import net.yacy.cora.util.ConcurrentLog;
|
|||
import net.yacy.data.ListManager;
|
||||
import net.yacy.kelondro.util.FileUtils;
|
||||
import net.yacy.repository.Blacklist;
|
||||
import net.yacy.repository.BlacklistHostAndPath;
|
||||
import net.yacy.repository.Blacklist.BlacklistError;
|
||||
import net.yacy.repository.Blacklist.BlacklistType;
|
||||
import net.yacy.search.Switchboard;
|
||||
|
@ -389,6 +390,8 @@ public class BlacklistCleaner_p {
|
|||
final String[] oldEntry, final String[] newEntry) {
|
||||
removeEntries(blacklistToUse, supportedBlacklistTypes, oldEntry);
|
||||
String host, path;
|
||||
/* Prepare the list of new blacklist items, to then add them in one operation for better performance */
|
||||
final Collection<BlacklistHostAndPath> newEntries = new ArrayList<>();
|
||||
for (final String n : newEntry) {
|
||||
final int pos = n.indexOf('/', 0);
|
||||
if (pos < 0) {
|
||||
|
@ -398,21 +401,21 @@ public class BlacklistCleaner_p {
|
|||
host = n.substring(0, pos);
|
||||
path = n.substring(pos + 1);
|
||||
}
|
||||
for (final BlacklistType s : supportedBlacklistTypes) {
|
||||
if (ListManager.listSetContains(s + ".BlackLists",
|
||||
blacklistToUse)) {
|
||||
try {
|
||||
Switchboard.urlBlacklist.add(s, blacklistToUse, host,
|
||||
path);
|
||||
} catch (PunycodeException e) {
|
||||
ConcurrentLog.warn(APP_NAME,
|
||||
"Unable to add blacklist entry to blacklist "
|
||||
+ s, e);
|
||||
}
|
||||
newEntries.add(new BlacklistHostAndPath(host, path));
|
||||
}
|
||||
for (final BlacklistType s : supportedBlacklistTypes) {
|
||||
if (ListManager.listSetContains(s + ".BlackLists",
|
||||
blacklistToUse)) {
|
||||
try {
|
||||
Switchboard.urlBlacklist.add(s, blacklistToUse, newEntries);
|
||||
} catch (PunycodeException e) {
|
||||
ConcurrentLog.warn(APP_NAME,
|
||||
"Unable to add blacklist entry to blacklist "
|
||||
+ s, e);
|
||||
}
|
||||
}
|
||||
SearchEventCache.cleanupEvents(true);
|
||||
}
|
||||
SearchEventCache.cleanupEvents(true);
|
||||
return newEntry.length;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -26,6 +26,8 @@
|
|||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Date;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
@ -59,6 +61,7 @@ import net.yacy.peers.DHTSelection;
|
|||
import net.yacy.peers.Protocol;
|
||||
import net.yacy.peers.Seed;
|
||||
import net.yacy.repository.Blacklist;
|
||||
import net.yacy.repository.BlacklistHostAndPath;
|
||||
import net.yacy.repository.Blacklist.BlacklistType;
|
||||
import net.yacy.search.Switchboard;
|
||||
import net.yacy.search.SwitchboardConstants;
|
||||
|
@ -374,8 +377,9 @@ public class IndexControlRWIs_p {
|
|||
Word.commonHashOrder,
|
||||
urlb.size());
|
||||
if ( post.containsKey("blacklisturls") ) {
|
||||
final String[] supportedBlacklistTypes = env.getConfigArray("BlackLists.types", "");
|
||||
DigestURL url;
|
||||
/* Prepare the list of new blacklist items, to then add them in one operation for better performance */
|
||||
final Collection<BlacklistHostAndPath> items = new ArrayList<>();
|
||||
for ( final byte[] b : urlb ) {
|
||||
try {
|
||||
urlHashes.put(b);
|
||||
|
@ -386,29 +390,27 @@ public class IndexControlRWIs_p {
|
|||
url = segment.fulltext().getURL(ASCII.String(b));
|
||||
segment.fulltext().remove(b);
|
||||
if ( url != null ) {
|
||||
for ( final String supportedBlacklistType : supportedBlacklistTypes ) {
|
||||
if ( ListManager.listSetContains(
|
||||
supportedBlacklistType + ".BlackLists",
|
||||
blacklist) ) {
|
||||
try {
|
||||
Switchboard.urlBlacklist.add(
|
||||
BlacklistType.valueOf(supportedBlacklistType),
|
||||
blacklist,
|
||||
url.getHost(),
|
||||
url.getFile());
|
||||
} catch (PunycodeException e) {
|
||||
ConcurrentLog.warn(APP_NAME,
|
||||
"Unable to add blacklist entry to blacklist "
|
||||
+ supportedBlacklistType, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
SearchEventCache.cleanupEvents(true);
|
||||
items.add(new BlacklistHostAndPath(url.getHost(), url.getFile()));
|
||||
}
|
||||
} catch (IOException e1) {
|
||||
ConcurrentLog.logException(e1);
|
||||
}
|
||||
}
|
||||
for (final BlacklistType supportedBlacklistType : BlacklistType.values()) {
|
||||
if ( ListManager.listSetContains(
|
||||
supportedBlacklistType + ".BlackLists",
|
||||
blacklist) ) {
|
||||
try {
|
||||
Switchboard.urlBlacklist.add(supportedBlacklistType,
|
||||
blacklist, items);
|
||||
} catch (PunycodeException e) {
|
||||
ConcurrentLog.warn(APP_NAME,
|
||||
"Unable to add blacklist entries to blacklist "
|
||||
+ supportedBlacklistType, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
SearchEventCache.cleanupEvents(true);
|
||||
}
|
||||
|
||||
if ( post.containsKey("blacklistdomains") ) {
|
||||
|
|
|
@ -32,12 +32,16 @@
|
|||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
import net.yacy.cora.document.encoding.UTF8;
|
||||
import net.yacy.cora.document.id.DigestURL;
|
||||
import net.yacy.cora.protocol.ClientIdentification;
|
||||
|
@ -49,14 +53,13 @@ import net.yacy.document.parser.html.CharacterCoding;
|
|||
import net.yacy.kelondro.util.FileUtils;
|
||||
import net.yacy.peers.Seed;
|
||||
import net.yacy.repository.Blacklist;
|
||||
import net.yacy.repository.BlacklistHostAndPath;
|
||||
import net.yacy.repository.Blacklist.BlacklistType;
|
||||
import net.yacy.search.Switchboard;
|
||||
import net.yacy.search.query.SearchEventCache;
|
||||
import net.yacy.server.serverObjects;
|
||||
import net.yacy.server.serverSwitch;
|
||||
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
|
||||
/**
|
||||
* Handle blacklist import operations. Either :
|
||||
|
@ -226,9 +229,11 @@ public class sharedBlacklist_p {
|
|||
try {
|
||||
// loop through the received entry list
|
||||
final int num = post.getInt("num", 0);
|
||||
for(int i = 0; i < num; i++){
|
||||
if( post.containsKey("item" + i) ){
|
||||
String newItem = post.get("item" + i);
|
||||
final Collection<BlacklistHostAndPath> newItems = new ArrayList<>();
|
||||
/* Prepare the list of new blacklist items, to then add them in one operation for better performance */
|
||||
for(int i = 0; i < num; i++) {
|
||||
String newItem = post.get("item" + i);
|
||||
if(newItem != null){
|
||||
|
||||
//This should not be needed...
|
||||
if ( newItem.startsWith("http://") ){
|
||||
|
@ -242,16 +247,16 @@ public class sharedBlacklist_p {
|
|||
pos = newItem.length();
|
||||
newItem = newItem + "/.*";
|
||||
}
|
||||
|
||||
if (Switchboard.urlBlacklist != null) {
|
||||
for (final BlacklistType supportedBlacklistType : BlacklistType.values()) {
|
||||
if (ListManager.listSetContains(supportedBlacklistType + ".BlackLists",selectedBlacklistName)) {
|
||||
Switchboard.urlBlacklist.add(supportedBlacklistType,selectedBlacklistName,newItem.substring(0, pos), newItem.substring(pos + 1));
|
||||
}
|
||||
}
|
||||
SearchEventCache.cleanupEvents(true);
|
||||
newItems.add(new BlacklistHostAndPath(newItem.substring(0, pos), newItem.substring(pos + 1)));
|
||||
}
|
||||
}
|
||||
if (Switchboard.urlBlacklist != null) {
|
||||
for (final BlacklistType supportedBlacklistType : BlacklistType.values()) {
|
||||
if (ListManager.listSetContains(supportedBlacklistType + ".BlackLists",selectedBlacklistName)) {
|
||||
Switchboard.urlBlacklist.add(supportedBlacklistType, selectedBlacklistName, newItems);
|
||||
}
|
||||
}
|
||||
SearchEventCache.cleanupEvents(true);
|
||||
}
|
||||
} catch (final Exception e) {
|
||||
prop.put("status", "1");
|
||||
|
|
|
@ -33,6 +33,8 @@ import java.io.IOException;
|
|||
import java.io.ObjectInputStream;
|
||||
import java.io.ObjectOutputStream;
|
||||
import java.io.PrintWriter;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
@ -317,6 +319,89 @@ public class Blacklist {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds entries to a given blacklist internal data and updates the source
|
||||
* file
|
||||
*
|
||||
* @param blacklistType
|
||||
* @param blacklistToUse
|
||||
* source file
|
||||
* @param items
|
||||
* blacklist host/path items to add
|
||||
* @throws PunycodeException
|
||||
*/
|
||||
public final void add(final BlacklistType blacklistType, final String blacklistToUse,
|
||||
final Collection<BlacklistHostAndPath> items) throws PunycodeException {
|
||||
|
||||
if (items != null) {
|
||||
PrintWriter pw = null;
|
||||
try {
|
||||
/* Get the content of the blacklist file in memory */
|
||||
final Set<String> blacklist = new HashSet<String>(
|
||||
FileUtils.getListArray(new File(this.blacklistRootPath, blacklistToUse)));
|
||||
/* Open a writer on the file */
|
||||
pw = new PrintWriter(new FileWriter(new File(this.blacklistRootPath, blacklistToUse), true));
|
||||
|
||||
for (BlacklistHostAndPath itemToAdd : items) {
|
||||
final String host = itemToAdd.getHost();
|
||||
final String path = itemToAdd.getPath();
|
||||
final String safeHost = Punycode.isBasic(host) ? host : MultiProtocolURL.toPunycode(host);
|
||||
|
||||
if (contains(blacklistType, safeHost, path)) {
|
||||
/* Continue to the next item */
|
||||
continue;
|
||||
}
|
||||
if (safeHost == null) {
|
||||
log.warn("host must not be null");
|
||||
/* Continue to the next item */
|
||||
continue;
|
||||
}
|
||||
if (path == null) {
|
||||
log.warn("path must not be null");
|
||||
/* Continue to the next item */
|
||||
continue;
|
||||
}
|
||||
|
||||
String p = (!path.isEmpty() && path.charAt(0) == '/') ? path.substring(1) : path;
|
||||
final Map<String, Set<Pattern>> blacklistMap = getBlacklistMap(blacklistType, isMatchable(host));
|
||||
|
||||
// avoid PatternSyntaxException e
|
||||
final String h = ((!isMatchable(safeHost) && !safeHost.isEmpty() && safeHost.charAt(0) == '*')
|
||||
? "." + safeHost : safeHost).toLowerCase();
|
||||
if (!p.isEmpty() && p.charAt(0) == '*') {
|
||||
p = "." + p;
|
||||
}
|
||||
|
||||
Set<Pattern> hostList;
|
||||
if (!(blacklistMap.containsKey(h) && ((hostList = blacklistMap.get(h)) != null))) {
|
||||
blacklistMap.put(h, (hostList = new HashSet<Pattern>()));
|
||||
}
|
||||
|
||||
Pattern pattern = Pattern.compile(p, Pattern.CASE_INSENSITIVE);
|
||||
|
||||
hostList.add(pattern);
|
||||
|
||||
// Append the line to the file.
|
||||
final String newEntry = h + "/" + pattern;
|
||||
if (!blacklist.contains(newEntry)) {
|
||||
pw.println(newEntry);
|
||||
blacklist.add(newEntry);
|
||||
}
|
||||
|
||||
}
|
||||
} catch (final IOException e) {
|
||||
ConcurrentLog.logException(e);
|
||||
} finally {
|
||||
if (pw != null) {
|
||||
pw.close();
|
||||
if (pw.checkError()) {
|
||||
log.warn("could not close stream to " + blacklistToUse + "! ");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds entry to a given blacklist internal data and updates the source file
|
||||
* @param blacklistType
|
||||
|
@ -326,60 +411,9 @@ public class Blacklist {
|
|||
* @throws PunycodeException
|
||||
*/
|
||||
public final void add(final BlacklistType blacklistType, final String blacklistToUse, final String host, final String path) throws PunycodeException {
|
||||
|
||||
final String safeHost = Punycode.isBasic(host) ? host : MultiProtocolURL.toPunycode(host);
|
||||
|
||||
if (contains(blacklistType, safeHost, path)) {
|
||||
return;
|
||||
}
|
||||
if (safeHost == null) {
|
||||
throw new IllegalArgumentException("host may not be null");
|
||||
}
|
||||
if (path == null) {
|
||||
throw new IllegalArgumentException("path may not be null");
|
||||
}
|
||||
|
||||
String p = (!path.isEmpty() && path.charAt(0) == '/') ? path.substring(1) : path;
|
||||
final Map<String, Set<Pattern>> blacklistMap = getBlacklistMap(blacklistType, isMatchable(host));
|
||||
|
||||
// avoid PatternSyntaxException e
|
||||
final String h = ((!isMatchable(safeHost) && !safeHost.isEmpty() && safeHost.charAt(0) == '*') ? "." + safeHost : safeHost).toLowerCase();
|
||||
if (!p.isEmpty() && p.charAt(0) == '*') {
|
||||
p = "." + p;
|
||||
}
|
||||
|
||||
Set<Pattern> hostList;
|
||||
if (!(blacklistMap.containsKey(h) && ((hostList = blacklistMap.get(h)) != null))) {
|
||||
blacklistMap.put(h, (hostList = new HashSet<Pattern>()));
|
||||
}
|
||||
|
||||
Pattern pattern = Pattern.compile(p, Pattern.CASE_INSENSITIVE);
|
||||
|
||||
hostList.add(pattern);
|
||||
|
||||
// Append the line to the file.
|
||||
PrintWriter pw = null;
|
||||
try {
|
||||
final String newEntry = h + "/" + pattern;
|
||||
if (!blacklistFileContains(blacklistRootPath,
|
||||
blacklistToUse, newEntry)) {
|
||||
pw = new PrintWriter(new FileWriter(new File(blacklistRootPath,
|
||||
blacklistToUse), true));
|
||||
pw.println(newEntry);
|
||||
pw.close();
|
||||
}
|
||||
} catch (final IOException e) {
|
||||
ConcurrentLog.logException(e);
|
||||
} finally {
|
||||
if (pw != null) {
|
||||
try {
|
||||
pw.close();
|
||||
} catch (final Exception e) {
|
||||
log.warn("could not close stream to " +
|
||||
blacklistToUse + "! " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
final Collection<BlacklistHostAndPath> oneItemList = new ArrayList<>();
|
||||
oneItemList.add(new BlacklistHostAndPath(host, path));
|
||||
this.add(blacklistType, blacklistToUse, oneItemList);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
48
source/net/yacy/repository/BlacklistHostAndPath.java
Normal file
48
source/net/yacy/repository/BlacklistHostAndPath.java
Normal file
|
@ -0,0 +1,48 @@
|
|||
// BlacklistHostAndPath.java
|
||||
// Copyright 2017 by luccioman; https://github.com/luccioman
|
||||
//
|
||||
// This is a part of YaCy, a peer-to-peer based web search engine
|
||||
//
|
||||
// LICENSE
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package net.yacy.repository;
|
||||
|
||||
/**
 * Simple holder for one blacklist entry, made of a host and a path.
 * Both values are kept exactly as given to the constructor : no validation
 * or normalization is done here, and the fields are never modified afterwards.
 */
public class BlacklistHostAndPath {

    /** Path part of the blacklist entry */
    private final String path;

    /** Host part of the blacklist entry */
    private final String host;

    /**
     * @param host the blacklisted host
     * @param path the blacklisted path
     */
    public BlacklistHostAndPath(final String host, final String path) {
        this.path = path;
        this.host = host;
    }

    /**
     * @return the host given at construction time
     */
    public String getHost() {
        return this.host;
    }

    /**
     * @return the path given at construction time
     */
    public String getPath() {
        return this.path;
    }

}
|
Loading…
Reference in New Issue
Block a user