Mirror of https://github.com/yacy/yacy_search_server.git (synced 2024-09-19 00:01:41 +02:00)
Merge branch 'master' of ssh://git@gitorious.org/yacy/rc1.git
Commit 7bec253bb0
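The changes below add a deleteold option to the crawl start: when a new crawl is restricted to a domain or a sub-path, all documents previously indexed under that domain or sub-path can be deleted before the crawl begins. To support this, Fulltext.deleteDomain() becomes a void method with a concurrent flag and no checked exception, CrawlProfile's filter helpers are reorganized (subpathFilter added, mustMatchSubpath made public), and the crawl start pages and callers are wired up accordingly.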
@@ -127,13 +127,8 @@ public class CrawlResults {
             final String domain = post.get("domain", null);
             final String hashpart = domain == null ? null : DigestURI.hosthash6(domain);
             if (hashpart != null) {
                 // delete all urls for this domain from database
-                try {
-                    sb.index.fulltext().deleteDomain(hashpart);
-                    ResultURLs.deleteDomain(tabletype, domain, hashpart);
-                } catch (final IOException e) {
-                    Log.logException(e);
-                }
+                sb.index.fulltext().deleteDomain(hashpart, false);
+                ResultURLs.deleteDomain(tabletype, domain, hashpart);
             }
         }
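Note: the try/catch around the old call is gone because the new deleteDomain(String, boolean) no longer throws IOException; errors are handled inside the method (see the Fulltext hunk below). With false the deletion runs synchronously, so the following ResultURLs cleanup executes only after the index entries are removed.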
@@ -153,10 +153,11 @@
         <tr valign="top" class="TableCellLight">
           <td><label for="mustmatch">Must-Match Filter for URLs for crawling</label>:</td>
           <td>
-            <input type="radio" name="range" id="rangeWide" value="wide" checked="checked" />Use filter
+            <input type="radio" name="range" id="rangeWide" value="wide" checked="checked" onclick="document.getElementById('deleteold').checked=false;document.getElementById('deleteold').disabled=true;"/>Use filter
             <input name="mustmatch" id="mustmatch" type="text" size="60" maxlength="100" value="#[mustmatch]#" /><br />
-            <input type="radio" name="range" id="rangeDomain" value="domain" />Restrict to start domain<br />
-            <input type="radio" name="range" id="rangeSubpath" value="subpath" />Restrict to sub-path
+            <input type="radio" name="range" id="rangeDomain" value="domain" onclick="document.getElementById('deleteold').disabled=false;document.getElementById('deleteold').checked=true;"/>Restrict to start domain<br />
+            <input type="radio" name="range" id="rangeSubpath" value="subpath" onclick="document.getElementById('deleteold').disabled=false;document.getElementById('deleteold').checked=true;" />Restrict to sub-path<br />
+            <input type="checkbox" name="deleteold" id="deleteold" disabled/>Delete all old documents in domain/subpath
           </td>
           <td>
             The filter is a <b><a href="http://download.oracle.com/javase/6/docs/api/java/util/regex/Pattern.html">regular expression</a></b>
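Note: on this expert crawl start form the new deleteold checkbox only makes sense for the domain and sub-path ranges, so the onclick handlers uncheck and disable it when "Use filter" is selected, and enable and pre-check it when a domain or sub-path restriction is chosen.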
@@ -81,6 +81,7 @@
       <dd>
         <input type="radio" name="range" id="rangeDomain" value="domain" checked="checked"/>load all files in domain<br />
         <input type="radio" name="range" id="rangeSubpath" value="subpath" />load only files in a sub-path of given url
+        <input type="hidden" name="deleteold" id="deleteold" value="on" />
         <input type="hidden" name="mustnotmatch" id="mustnotmatch" value="" />
         <input type="hidden" name="crawlingDomFilterCheck" id="crawlingDomFilterCheck" value="off" />
         <input type="hidden" name="crawlingDomFilterDepth" id="crawlingDomFilterDepth" value="#[crawlingDomFilterDepth]#" />
@@ -150,7 +150,8 @@ public class Crawler_p {
         if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL_STRING; // avoid that all urls are filtered out if bad value was submitted
         final boolean fullDomain = "domain".equals(post.get("range", "wide")); // special property in simple crawl start
         final boolean subPath = "subpath".equals(post.get("range", "wide")); // special property in simple crawl start
+        final boolean deleteold = (fullDomain || subPath) && post.getBoolean("deleteold");
 
         String crawlingStart0 = post.get("crawlingURL","").trim(); // the crawljob start url
         String[] rootURLs0 = crawlingStart0.indexOf('\n') > 0 || crawlingStart0.indexOf('\r') > 0 ? crawlingStart0.split("[\\r\\n]+") : crawlingStart0.split(Pattern.quote("|"));
         Set<DigestURI> rootURLs = new HashSet<DigestURI>();
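Note: post.getBoolean("deleteold") picks up either the checkbox from the expert form or the hidden deleteold=on field from the site-crawl form. A minimal stand-alone sketch of the flag logic, assuming the standard HTML convention that a checked checkbox submits the value "on" (the class and the plain Map are hypothetical stand-ins for YaCy's serverObjects):

import java.util.HashMap;
import java.util.Map;

public class DeleteOldFlagSketch {
    public static void main(String[] args) {
        Map<String, String> post = new HashMap<String, String>();
        post.put("range", "domain");     // from the range radio buttons
        post.put("deleteold", "on");     // from the checkbox / hidden field

        final boolean fullDomain = "domain".equals(post.get("range"));
        final boolean subPath = "subpath".equals(post.get("range"));
        final boolean deleteold = (fullDomain || subPath) && "on".equals(post.get("deleteold"));
        System.out.println(deleteold);   // true
    }
}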
@@ -301,8 +302,18 @@ public class Crawler_p {
             String siteFilter = ".*";
             if (fullDomain) {
                 siteFilter = CrawlProfile.siteFilter(rootURLs);
+                if (deleteold) {
+                    for (DigestURI u: rootURLs) sb.index.fulltext().deleteDomain(u.hosthash(), true);
+                }
             } else if (subPath) {
                 siteFilter = CrawlProfile.subpathFilter(rootURLs);
+                if (deleteold) {
+                    for (DigestURI u: rootURLs) {
+                        String subpath = CrawlProfile.mustMatchSubpath(u);
+                        if (subpath.endsWith(".*")) subpath = subpath.substring(0, subpath.length() - 2);
+                        sb.index.fulltext().remove(subpath, true);
+                    }
+                }
             }
             if (CrawlProfile.MATCH_ALL_STRING.equals(newcrawlingMustMatch)) {
                 newcrawlingMustMatch = siteFilter;
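The sub-path branch reuses the must-match pattern and trims the trailing ".*" so that the quoted prefix can be handed to fulltext().remove(...). A self-contained sketch of that transformation, with the mustMatchSubpath logic inlined from the CrawlProfile hunk below and an arbitrary example URL:

import java.util.regex.Pattern;

public class SubpathDeleteSketch {
    // same logic as CrawlProfile.mustMatchSubpath, operating on a plain String
    static String mustMatchSubpath(String u) {
        // cut the URL back to its parent directory if it does not end with "/"
        if (!u.endsWith("/")) { int p = u.lastIndexOf('/'); if (p > 0) u = u.substring(0, p + 1); }
        return Pattern.quote(u) + ".*";
    }

    public static void main(String[] args) {
        String subpath = mustMatchSubpath("http://example.com/docs/index.html");
        System.out.println(subpath);  // \Qhttp://example.com/docs/\E.*
        if (subpath.endsWith(".*")) subpath = subpath.substring(0, subpath.length() - 2);
        System.out.println(subpath);  // \Qhttp://example.com/docs/\E  -> passed to remove(subpath, true)
    }
}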
@@ -297,12 +297,7 @@ public class IndexControlURLs_p {
 
         if (post.containsKey("deletedomain")) {
             final String hp = post.get("hashpart");
-            try {
-                segment.fulltext().deleteDomain(hp);
-            } catch (final IOException e) {
-                // TODO Auto-generated catch block
-                Log.logException(e);
-            }
+            segment.fulltext().deleteDomain(hp, false);
             // trigger the loading of the table
             post.put("statistics", "");
         }
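Note: the same migration as in CrawlResults above. Passing false keeps the deletion synchronous, presumably so that the statistics table reloaded immediately afterwards (post.put("statistics", "")) no longer lists the deleted domain.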
@@ -482,6 +482,12 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
         return System.currentTimeMillis() - (60000L * oldTimeMinutes);
     }
 
+    public static String siteFilter(final Set<? extends MultiProtocolURI> uris) {
+        final StringBuilder filter = new StringBuilder();
+        for (final MultiProtocolURI uri: uris) filter.append('|').append(mustMatchFilterFullDomain(uri));
+        return filter.length() > 0 ? filter.substring(1) : CrawlProfile.MATCH_ALL_STRING;
+    }
+
     public static String mustMatchFilterFullDomain(final MultiProtocolURI uri) {
         String host = uri.getHost();
         if (host.startsWith("www.")) host = host.substring(4);
@@ -490,24 +496,18 @@
         return new StringBuilder(host.length() + 20).append(protocol).append("://(www.)?").append(Pattern.quote(host)).append(".*").toString();
     }
 
-    private static String mustMatchSubpath(final MultiProtocolURI uri) {
-        String u = uri.toNormalform(true);
-        if (!u.endsWith("/")) {int p = u.lastIndexOf("/"); if (p > 0) u = u.substring(0, p + 1);}
-        return new StringBuilder(u.length() + 5).append(Pattern.quote(u)).append(".*").toString();
-    }
-
-    public static String siteFilter(final Set<? extends MultiProtocolURI> uris) {
-        final StringBuilder filter = new StringBuilder();
-        for (final MultiProtocolURI uri: uris) filter.append('|').append(mustMatchFilterFullDomain(uri));
-        return filter.length() > 0 ? filter.substring(1) : CrawlProfile.MATCH_ALL_STRING;
-    }
-
     public static String subpathFilter(final Set<? extends MultiProtocolURI> uris) {
         final StringBuilder filter = new StringBuilder();
         for (final MultiProtocolURI uri: uris) filter.append('|').append(mustMatchSubpath(uri));
         return filter.length() > 0 ? filter.substring(1) : CrawlProfile.MATCH_ALL_STRING;
     }
 
+    public static String mustMatchSubpath(final MultiProtocolURI uri) {
+        String u = uri.toNormalform(true);
+        if (!u.endsWith("/")) {int p = u.lastIndexOf("/"); if (p > 0) u = u.substring(0, p + 1);}
+        return new StringBuilder(u.length() + 5).append(Pattern.quote(u)).append(".*").toString();
+    }
+
     public static final Set<String> ignoreNames = new HashSet<String>();
     static {
         ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_PROXY);
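siteFilter and subpathFilter OR together one pattern per start URL, falling back to the match-all pattern for an empty set. A self-contained sketch of the patterns this produces, with MultiProtocolURI replaced by plain strings, the protocol assumed to be plain "http", and MATCH_ALL_STRING assumed to be ".*" (the default also used in Crawler_p above):

import java.util.LinkedHashSet;
import java.util.Set;
import java.util.regex.Pattern;

public class CrawlFilterSketch {
    // same construction as mustMatchFilterFullDomain, protocol passed in explicitly
    static String mustMatchFilterFullDomain(String protocol, String host) {
        if (host.startsWith("www.")) host = host.substring(4);
        return protocol + "://(www.)?" + Pattern.quote(host) + ".*";
    }

    static String siteFilter(Set<String> hosts) {
        final StringBuilder filter = new StringBuilder();
        for (String host : hosts) filter.append('|').append(mustMatchFilterFullDomain("http", host));
        return filter.length() > 0 ? filter.substring(1) : ".*"; // ".*" assumed for MATCH_ALL_STRING
    }

    public static void main(String[] args) {
        Set<String> hosts = new LinkedHashSet<String>();
        hosts.add("www.example.org");
        hosts.add("yacy.net");
        System.out.println(siteFilter(hosts));
        // http://(www.)?\Qexample.org\E.*|http://(www.)?\Qyacy.net\E.*
    }
}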
@@ -795,44 +795,53 @@ public final class Fulltext implements Iterable<byte[]> {
      * @return number of deleted domains
      * @throws IOException
      */
-    public int deleteDomain(final String hosthash) throws IOException {
+    public void deleteDomain(final String hosthash, boolean concurrent) {
         // first collect all url hashes that belong to the domain
         assert hosthash.length() == 6;
-        // delete in solr
-        synchronized (this.solr) {
-            this.solr.deleteByQuery(YaCySchema.host_id_s.name() + ":\"" + hosthash + "\"");
-        }
-
-        // delete in old metadata structure
-        final ArrayList<String> l = new ArrayList<String>();
-        synchronized (this) {
-            final CloneableIterator<byte[]> i = this.urlIndexFile.keys(true, null);
-            String hash;
-            while (i != null && i.hasNext()) {
-                hash = ASCII.String(i.next());
-                if (hosthash.equals(hash.substring(6))) l.add(hash);
-            }
-        }
-
-        // then delete the urls using this list
-        int cnt = 0;
-        for (final String h: l) {
-            if (this.urlIndexFile.delete(ASCII.getBytes(h))) cnt++;
-        }
-
-        // finally remove the line with statistics
-        if (this.statsDump != null) {
-            final Iterator<HostStat> hsi = this.statsDump.iterator();
-            HostStat hs;
-            while (hsi.hasNext()) {
-                hs = hsi.next();
-                if (hs.hosthash.equals(hosthash)) {
-                    hsi.remove();
-                    break;
-                }
-            }
-        }
-
-        return cnt;
+        Thread t = new Thread() {
+            public void run() {
+                // delete in solr
+                synchronized (Fulltext.this.solr) {
+                    try {
+                        Fulltext.this.solr.deleteByQuery(YaCySchema.host_id_s.name() + ":\"" + hosthash + "\"");
+                        Fulltext.this.solr.commit();
+                    } catch (IOException e) {}
+                }
+
+                // delete in old metadata structure
+                if (Fulltext.this.urlIndexFile != null) {
+                    final ArrayList<String> l = new ArrayList<String>();
+                    synchronized (this) {
+                        CloneableIterator<byte[]> i;
+                        try {
+                            i = Fulltext.this.urlIndexFile.keys(true, null);
+                            String hash;
+                            while (i != null && i.hasNext()) {
+                                hash = ASCII.String(i.next());
+                                if (hosthash.equals(hash.substring(6))) l.add(hash);
+                            }
+
+                            // then delete the urls using this list
+                            for (final String h: l) Fulltext.this.urlIndexFile.delete(ASCII.getBytes(h));
+                        } catch (IOException e) {}
+                    }
+                }
+
+                // finally remove the line with statistics
+                if (Fulltext.this.statsDump != null) {
+                    final Iterator<HostStat> hsi = Fulltext.this.statsDump.iterator();
+                    HostStat hs;
+                    while (hsi.hasNext()) {
+                        hs = hsi.next();
+                        if (hs.hosthash.equals(hosthash)) {
+                            hsi.remove();
+                            break;
+                        }
+                    }
+                }
+            }
+        };
+        if (concurrent) t.start(); else t.run();
     }
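The rewrite wraps the entire deletion in an anonymous Thread and then either starts or runs it, which is what the boolean flag in the callers above selects. A minimal sketch of this dispatch idiom, with a print statement standing in for the Solr, metadata and statistics cleanup:

public class ConcurrentDispatchSketch {
    static void deleteDomain(final String hosthash, boolean concurrent) {
        Thread t = new Thread() {
            @Override
            public void run() {
                // stand-in for the solr/urlIndexFile/statsDump cleanup above
                System.out.println("deleting documents of host " + hosthash
                        + " on thread " + Thread.currentThread().getName());
            }
        };
        // t.run() executes the body in the caller's thread; t.start() forks it
        if (concurrent) t.start(); else t.run();
    }

    public static void main(String[] args) throws InterruptedException {
        deleteDomain("abcdef", false); // synchronous (CrawlResults, IndexControlURLs_p)
        deleteDomain("abcdef", true);  // background (crawl start with deleteold)
        Thread.sleep(100);             // let the background thread finish printing
    }
}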