mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
Blacklist import and update performance improvements.
Measurement sample: importing a local blacklist file containing about 15000 entries took several minutes before the refactoring and takes only a few seconds after it.
This commit is contained in:
parent
e3892b0957
commit
339f005ced
|
@ -47,6 +47,7 @@ import net.yacy.cora.util.ConcurrentLog;
|
||||||
import net.yacy.data.ListManager;
|
import net.yacy.data.ListManager;
|
||||||
import net.yacy.kelondro.util.FileUtils;
|
import net.yacy.kelondro.util.FileUtils;
|
||||||
import net.yacy.repository.Blacklist;
|
import net.yacy.repository.Blacklist;
|
||||||
|
import net.yacy.repository.BlacklistHostAndPath;
|
||||||
import net.yacy.repository.Blacklist.BlacklistError;
|
import net.yacy.repository.Blacklist.BlacklistError;
|
||||||
import net.yacy.repository.Blacklist.BlacklistType;
|
import net.yacy.repository.Blacklist.BlacklistType;
|
||||||
import net.yacy.search.Switchboard;
|
import net.yacy.search.Switchboard;
|
||||||
|
@ -389,6 +390,8 @@ public class BlacklistCleaner_p {
|
||||||
final String[] oldEntry, final String[] newEntry) {
|
final String[] oldEntry, final String[] newEntry) {
|
||||||
removeEntries(blacklistToUse, supportedBlacklistTypes, oldEntry);
|
removeEntries(blacklistToUse, supportedBlacklistTypes, oldEntry);
|
||||||
String host, path;
|
String host, path;
|
||||||
|
/* Prepare the list of new blacklist items, then add them in one operation for better performance */
|
||||||
|
final Collection<BlacklistHostAndPath> newEntries = new ArrayList<>();
|
||||||
for (final String n : newEntry) {
|
for (final String n : newEntry) {
|
||||||
final int pos = n.indexOf('/', 0);
|
final int pos = n.indexOf('/', 0);
|
||||||
if (pos < 0) {
|
if (pos < 0) {
|
||||||
|
@ -398,12 +401,13 @@ public class BlacklistCleaner_p {
|
||||||
host = n.substring(0, pos);
|
host = n.substring(0, pos);
|
||||||
path = n.substring(pos + 1);
|
path = n.substring(pos + 1);
|
||||||
}
|
}
|
||||||
|
newEntries.add(new BlacklistHostAndPath(host, path));
|
||||||
|
}
|
||||||
for (final BlacklistType s : supportedBlacklistTypes) {
|
for (final BlacklistType s : supportedBlacklistTypes) {
|
||||||
if (ListManager.listSetContains(s + ".BlackLists",
|
if (ListManager.listSetContains(s + ".BlackLists",
|
||||||
blacklistToUse)) {
|
blacklistToUse)) {
|
||||||
try {
|
try {
|
||||||
Switchboard.urlBlacklist.add(s, blacklistToUse, host,
|
Switchboard.urlBlacklist.add(s, blacklistToUse, newEntries);
|
||||||
path);
|
|
||||||
} catch (PunycodeException e) {
|
} catch (PunycodeException e) {
|
||||||
ConcurrentLog.warn(APP_NAME,
|
ConcurrentLog.warn(APP_NAME,
|
||||||
"Unable to add blacklist entry to blacklist "
|
"Unable to add blacklist entry to blacklist "
|
||||||
|
@ -412,7 +416,6 @@ public class BlacklistCleaner_p {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
SearchEventCache.cleanupEvents(true);
|
SearchEventCache.cleanupEvents(true);
|
||||||
}
|
|
||||||
return newEntry.length;
|
return newEntry.length;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -26,6 +26,8 @@
|
||||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collection;
|
||||||
import java.util.Date;
|
import java.util.Date;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
@ -59,6 +61,7 @@ import net.yacy.peers.DHTSelection;
|
||||||
import net.yacy.peers.Protocol;
|
import net.yacy.peers.Protocol;
|
||||||
import net.yacy.peers.Seed;
|
import net.yacy.peers.Seed;
|
||||||
import net.yacy.repository.Blacklist;
|
import net.yacy.repository.Blacklist;
|
||||||
|
import net.yacy.repository.BlacklistHostAndPath;
|
||||||
import net.yacy.repository.Blacklist.BlacklistType;
|
import net.yacy.repository.Blacklist.BlacklistType;
|
||||||
import net.yacy.search.Switchboard;
|
import net.yacy.search.Switchboard;
|
||||||
import net.yacy.search.SwitchboardConstants;
|
import net.yacy.search.SwitchboardConstants;
|
||||||
|
@ -374,8 +377,9 @@ public class IndexControlRWIs_p {
|
||||||
Word.commonHashOrder,
|
Word.commonHashOrder,
|
||||||
urlb.size());
|
urlb.size());
|
||||||
if ( post.containsKey("blacklisturls") ) {
|
if ( post.containsKey("blacklisturls") ) {
|
||||||
final String[] supportedBlacklistTypes = env.getConfigArray("BlackLists.types", "");
|
|
||||||
DigestURL url;
|
DigestURL url;
|
||||||
|
/* Prepare the list of new blacklist items, then add them in one operation for better performance */
|
||||||
|
final Collection<BlacklistHostAndPath> items = new ArrayList<>();
|
||||||
for ( final byte[] b : urlb ) {
|
for ( final byte[] b : urlb ) {
|
||||||
try {
|
try {
|
||||||
urlHashes.put(b);
|
urlHashes.put(b);
|
||||||
|
@ -386,29 +390,27 @@ public class IndexControlRWIs_p {
|
||||||
url = segment.fulltext().getURL(ASCII.String(b));
|
url = segment.fulltext().getURL(ASCII.String(b));
|
||||||
segment.fulltext().remove(b);
|
segment.fulltext().remove(b);
|
||||||
if ( url != null ) {
|
if ( url != null ) {
|
||||||
for ( final String supportedBlacklistType : supportedBlacklistTypes ) {
|
items.add(new BlacklistHostAndPath(url.getHost(), url.getFile()));
|
||||||
if ( ListManager.listSetContains(
|
|
||||||
supportedBlacklistType + ".BlackLists",
|
|
||||||
blacklist) ) {
|
|
||||||
try {
|
|
||||||
Switchboard.urlBlacklist.add(
|
|
||||||
BlacklistType.valueOf(supportedBlacklistType),
|
|
||||||
blacklist,
|
|
||||||
url.getHost(),
|
|
||||||
url.getFile());
|
|
||||||
} catch (PunycodeException e) {
|
|
||||||
ConcurrentLog.warn(APP_NAME,
|
|
||||||
"Unable to add blacklist entry to blacklist "
|
|
||||||
+ supportedBlacklistType, e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
SearchEventCache.cleanupEvents(true);
|
|
||||||
}
|
}
|
||||||
} catch (IOException e1) {
|
} catch (IOException e1) {
|
||||||
ConcurrentLog.logException(e1);
|
ConcurrentLog.logException(e1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
for (final BlacklistType supportedBlacklistType : BlacklistType.values()) {
|
||||||
|
if ( ListManager.listSetContains(
|
||||||
|
supportedBlacklistType + ".BlackLists",
|
||||||
|
blacklist) ) {
|
||||||
|
try {
|
||||||
|
Switchboard.urlBlacklist.add(supportedBlacklistType,
|
||||||
|
blacklist, items);
|
||||||
|
} catch (PunycodeException e) {
|
||||||
|
ConcurrentLog.warn(APP_NAME,
|
||||||
|
"Unable to add blacklist entries to blacklist "
|
||||||
|
+ supportedBlacklistType, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
SearchEventCache.cleanupEvents(true);
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( post.containsKey("blacklistdomains") ) {
|
if ( post.containsKey("blacklistdomains") ) {
|
||||||
|
|
|
@ -32,12 +32,16 @@
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
import java.util.Collection;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.xml.sax.SAXException;
|
||||||
|
|
||||||
import net.yacy.cora.document.encoding.UTF8;
|
import net.yacy.cora.document.encoding.UTF8;
|
||||||
import net.yacy.cora.document.id.DigestURL;
|
import net.yacy.cora.document.id.DigestURL;
|
||||||
import net.yacy.cora.protocol.ClientIdentification;
|
import net.yacy.cora.protocol.ClientIdentification;
|
||||||
|
@ -49,14 +53,13 @@ import net.yacy.document.parser.html.CharacterCoding;
|
||||||
import net.yacy.kelondro.util.FileUtils;
|
import net.yacy.kelondro.util.FileUtils;
|
||||||
import net.yacy.peers.Seed;
|
import net.yacy.peers.Seed;
|
||||||
import net.yacy.repository.Blacklist;
|
import net.yacy.repository.Blacklist;
|
||||||
|
import net.yacy.repository.BlacklistHostAndPath;
|
||||||
import net.yacy.repository.Blacklist.BlacklistType;
|
import net.yacy.repository.Blacklist.BlacklistType;
|
||||||
import net.yacy.search.Switchboard;
|
import net.yacy.search.Switchboard;
|
||||||
import net.yacy.search.query.SearchEventCache;
|
import net.yacy.search.query.SearchEventCache;
|
||||||
import net.yacy.server.serverObjects;
|
import net.yacy.server.serverObjects;
|
||||||
import net.yacy.server.serverSwitch;
|
import net.yacy.server.serverSwitch;
|
||||||
|
|
||||||
import org.xml.sax.SAXException;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Handle blacklist import operations. Either :
|
* Handle blacklist import operations. Either :
|
||||||
|
@ -226,9 +229,11 @@ public class sharedBlacklist_p {
|
||||||
try {
|
try {
|
||||||
// loop through the received entry list
|
// loop through the received entry list
|
||||||
final int num = post.getInt("num", 0);
|
final int num = post.getInt("num", 0);
|
||||||
for(int i = 0; i < num; i++){
|
final Collection<BlacklistHostAndPath> newItems = new ArrayList<>();
|
||||||
if( post.containsKey("item" + i) ){
|
/* Prepare the list of new blacklist items, then add them in one operation for better performance */
|
||||||
|
for(int i = 0; i < num; i++) {
|
||||||
String newItem = post.get("item" + i);
|
String newItem = post.get("item" + i);
|
||||||
|
if(newItem != null){
|
||||||
|
|
||||||
//This should not be needed...
|
//This should not be needed...
|
||||||
if ( newItem.startsWith("http://") ){
|
if ( newItem.startsWith("http://") ){
|
||||||
|
@ -242,17 +247,17 @@ public class sharedBlacklist_p {
|
||||||
pos = newItem.length();
|
pos = newItem.length();
|
||||||
newItem = newItem + "/.*";
|
newItem = newItem + "/.*";
|
||||||
}
|
}
|
||||||
|
newItems.add(new BlacklistHostAndPath(newItem.substring(0, pos), newItem.substring(pos + 1)));
|
||||||
|
}
|
||||||
|
}
|
||||||
if (Switchboard.urlBlacklist != null) {
|
if (Switchboard.urlBlacklist != null) {
|
||||||
for (final BlacklistType supportedBlacklistType : BlacklistType.values()) {
|
for (final BlacklistType supportedBlacklistType : BlacklistType.values()) {
|
||||||
if (ListManager.listSetContains(supportedBlacklistType + ".BlackLists",selectedBlacklistName)) {
|
if (ListManager.listSetContains(supportedBlacklistType + ".BlackLists",selectedBlacklistName)) {
|
||||||
Switchboard.urlBlacklist.add(supportedBlacklistType,selectedBlacklistName,newItem.substring(0, pos), newItem.substring(pos + 1));
|
Switchboard.urlBlacklist.add(supportedBlacklistType, selectedBlacklistName, newItems);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
SearchEventCache.cleanupEvents(true);
|
SearchEventCache.cleanupEvents(true);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (final Exception e) {
|
} catch (final Exception e) {
|
||||||
prop.put("status", "1");
|
prop.put("status", "1");
|
||||||
prop.putHTML("status_error", e.getLocalizedMessage());
|
prop.putHTML("status_error", e.getLocalizedMessage());
|
||||||
|
|
|
@ -33,6 +33,8 @@ import java.io.IOException;
|
||||||
import java.io.ObjectInputStream;
|
import java.io.ObjectInputStream;
|
||||||
import java.io.ObjectOutputStream;
|
import java.io.ObjectOutputStream;
|
||||||
import java.io.PrintWriter;
|
import java.io.PrintWriter;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collection;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
@ -318,32 +320,54 @@ public class Blacklist {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Adds entry to a given blacklist internal data and updates the source file
|
* Adds entries to a given blacklist internal data and updates the source
|
||||||
|
* file
|
||||||
|
*
|
||||||
* @param blacklistType
|
* @param blacklistType
|
||||||
* @param blacklistToUse source file
|
* @param blacklistToUse
|
||||||
* @param host
|
* source file
|
||||||
* @param path
|
* @param items
|
||||||
|
* blacklist host/path items to add
|
||||||
* @throws PunycodeException
|
* @throws PunycodeException
|
||||||
*/
|
*/
|
||||||
public final void add(final BlacklistType blacklistType, final String blacklistToUse, final String host, final String path) throws PunycodeException {
|
public final void add(final BlacklistType blacklistType, final String blacklistToUse,
|
||||||
|
final Collection<BlacklistHostAndPath> items) throws PunycodeException {
|
||||||
|
|
||||||
|
if (items != null) {
|
||||||
|
PrintWriter pw = null;
|
||||||
|
try {
|
||||||
|
/* Get the content of the blacklist file in memory */
|
||||||
|
final Set<String> blacklist = new HashSet<String>(
|
||||||
|
FileUtils.getListArray(new File(this.blacklistRootPath, blacklistToUse)));
|
||||||
|
/* Open a writer on the file */
|
||||||
|
pw = new PrintWriter(new FileWriter(new File(this.blacklistRootPath, blacklistToUse), true));
|
||||||
|
|
||||||
|
for (BlacklistHostAndPath itemToAdd : items) {
|
||||||
|
final String host = itemToAdd.getHost();
|
||||||
|
final String path = itemToAdd.getPath();
|
||||||
final String safeHost = Punycode.isBasic(host) ? host : MultiProtocolURL.toPunycode(host);
|
final String safeHost = Punycode.isBasic(host) ? host : MultiProtocolURL.toPunycode(host);
|
||||||
|
|
||||||
if (contains(blacklistType, safeHost, path)) {
|
if (contains(blacklistType, safeHost, path)) {
|
||||||
return;
|
/* Continue to the next item */
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
if (safeHost == null) {
|
if (safeHost == null) {
|
||||||
throw new IllegalArgumentException("host may not be null");
|
log.warn("host must not be null");
|
||||||
|
/* Continue to the next item */
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
if (path == null) {
|
if (path == null) {
|
||||||
throw new IllegalArgumentException("path may not be null");
|
log.warn("path must not be null");
|
||||||
|
/* Continue to the next item */
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
String p = (!path.isEmpty() && path.charAt(0) == '/') ? path.substring(1) : path;
|
String p = (!path.isEmpty() && path.charAt(0) == '/') ? path.substring(1) : path;
|
||||||
final Map<String, Set<Pattern>> blacklistMap = getBlacklistMap(blacklistType, isMatchable(host));
|
final Map<String, Set<Pattern>> blacklistMap = getBlacklistMap(blacklistType, isMatchable(host));
|
||||||
|
|
||||||
// avoid PatternSyntaxException e
|
// avoid PatternSyntaxException e
|
||||||
final String h = ((!isMatchable(safeHost) && !safeHost.isEmpty() && safeHost.charAt(0) == '*') ? "." + safeHost : safeHost).toLowerCase();
|
final String h = ((!isMatchable(safeHost) && !safeHost.isEmpty() && safeHost.charAt(0) == '*')
|
||||||
|
? "." + safeHost : safeHost).toLowerCase();
|
||||||
if (!p.isEmpty() && p.charAt(0) == '*') {
|
if (!p.isEmpty() && p.charAt(0) == '*') {
|
||||||
p = "." + p;
|
p = "." + p;
|
||||||
}
|
}
|
||||||
|
@ -358,29 +382,39 @@ public class Blacklist {
|
||||||
hostList.add(pattern);
|
hostList.add(pattern);
|
||||||
|
|
||||||
// Append the line to the file.
|
// Append the line to the file.
|
||||||
PrintWriter pw = null;
|
|
||||||
try {
|
|
||||||
final String newEntry = h + "/" + pattern;
|
final String newEntry = h + "/" + pattern;
|
||||||
if (!blacklistFileContains(blacklistRootPath,
|
if (!blacklist.contains(newEntry)) {
|
||||||
blacklistToUse, newEntry)) {
|
|
||||||
pw = new PrintWriter(new FileWriter(new File(blacklistRootPath,
|
|
||||||
blacklistToUse), true));
|
|
||||||
pw.println(newEntry);
|
pw.println(newEntry);
|
||||||
pw.close();
|
blacklist.add(newEntry);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
} catch (final IOException e) {
|
} catch (final IOException e) {
|
||||||
ConcurrentLog.logException(e);
|
ConcurrentLog.logException(e);
|
||||||
} finally {
|
} finally {
|
||||||
if (pw != null) {
|
if (pw != null) {
|
||||||
try {
|
|
||||||
pw.close();
|
pw.close();
|
||||||
} catch (final Exception e) {
|
if (pw.checkError()) {
|
||||||
log.warn("could not close stream to " +
|
log.warn("could not close stream to " + blacklistToUse + "! ");
|
||||||
blacklistToUse + "! " + e.getMessage());
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Adds entry to a given blacklist internal data and updates the source file
|
||||||
|
* @param blacklistType
|
||||||
|
* @param blacklistToUse source file
|
||||||
|
* @param host
|
||||||
|
* @param path
|
||||||
|
* @throws PunycodeException
|
||||||
|
*/
|
||||||
|
public final void add(final BlacklistType blacklistType, final String blacklistToUse, final String host, final String path) throws PunycodeException {
|
||||||
|
final Collection<BlacklistHostAndPath> oneItemList = new ArrayList<>();
|
||||||
|
oneItemList.add(new BlacklistHostAndPath(host, path));
|
||||||
|
this.add(blacklistType, blacklistToUse, oneItemList);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Appends an entry to the blacklist source file and updates internal blacklist maps.
|
* Appends an entry to the blacklist source file and updates internal blacklist maps.
|
||||||
|
|
48
source/net/yacy/repository/BlacklistHostAndPath.java
Normal file
48
source/net/yacy/repository/BlacklistHostAndPath.java
Normal file
|
@ -0,0 +1,48 @@
|
||||||
|
// BlacklistHostAndPath.java
|
||||||
|
// Copyright 2017 by luccioman; https://github.com/luccioman
|
||||||
|
//
|
||||||
|
// This is a part of YaCy, a peer-to-peer based web search engine
|
||||||
|
//
|
||||||
|
// LICENSE
|
||||||
|
//
|
||||||
|
// This program is free software; you can redistribute it and/or modify
|
||||||
|
// it under the terms of the GNU General Public License as published by
|
||||||
|
// the Free Software Foundation; either version 2 of the License, or
|
||||||
|
// (at your option) any later version.
|
||||||
|
//
|
||||||
|
// This program is distributed in the hope that it will be useful,
|
||||||
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
// GNU General Public License for more details.
|
||||||
|
//
|
||||||
|
// You should have received a copy of the GNU General Public License
|
||||||
|
// along with this program; if not, write to the Free Software
|
||||||
|
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||||
|
|
||||||
|
package net.yacy.repository;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Blacklist host and path pair.
|
||||||
|
*/
|
||||||
|
public class BlacklistHostAndPath {
|
||||||
|
|
||||||
|
/** Blacklisted host */
|
||||||
|
private final String host;
|
||||||
|
|
||||||
|
/** Blacklisted path */
|
||||||
|
private final String path;
|
||||||
|
|
||||||
|
public BlacklistHostAndPath(final String host, final String path) {
|
||||||
|
this.host = host;
|
||||||
|
this.path = path;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getHost() {
|
||||||
|
return host;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getPath() {
|
||||||
|
return path;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user