*) Adding possibility to delete crawler queue entries using regular expressions

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1160 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
theli 2005-12-05 09:11:28 +00:00
parent 1d6a6d1f85
commit 64478b1f02
3 changed files with 89 additions and 5 deletions

View File

@ -18,9 +18,20 @@ It may also contain urls that are computed by the proxy-prefetch.
The local crawler queue is empty<br><br>
::
<form action="IndexCreateWWWLocalQueue_p.html" method="post" enctype="multipart/form-data">
<input type="submit" name="clearcrawlqueue" value="clear local crawl queue">
Delete Entries:
<input type="text" name="pattern" value=".*" size="20" maxlength="200"/>
<select name="option" size="1">
<option value="Initiator">Initiator</option>
<option value="Profile">Profile</option>
<option value="Depth">Depth</option>
<option value="ModifiedDate">Modified Date</option>
<option value="AnchorName">Anchor Name</option>
<option value="URL" selected="selected">URL</option>
</select>
<input type="submit" name="deleteEntries" value="Delete"><i>This may take quite a long time</i>
</form>
<br>
<hr>
There are #[num]# entries in the local crawler queue. Showing #[show-num]# most recent entries:
<table border="0" cellpadding="2" cellspacing="1">
<tr class="TableHeader">

View File

@ -45,13 +45,18 @@
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Iterator;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import de.anomic.data.wikiCode;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaCrawlNURL.Entry;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyCore;
@ -70,10 +75,65 @@ public class IndexCreateWWWLocalQueue_p {
serverObjects prop = new serverObjects();
if (post != null) {
if (post.containsKey("clearcrawlqueue")) {
int c = switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE);
switchboard.urlPool.noticeURL.clear(plasmaCrawlNURL.STACK_TYPE_CORE);
switchboard.cleanProfiles();
if (post.containsKey("deleteEntries")) {
int c = 0;
String pattern = post.get("pattern", ".*").trim();
String option = post.get("option", ".*").trim();
if (pattern.equals(".*")) {
c = switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE);
switchboard.urlPool.noticeURL.clear(plasmaCrawlNURL.STACK_TYPE_CORE);
switchboard.cleanProfiles();
} else{
Pattern compiledPattern = null;
try {
// compiling the regular expression
compiledPattern = Pattern.compile(pattern);
// iterating through the list of URLs
Iterator iter = switchboard.urlPool.noticeURL.iterator(plasmaCrawlNURL.STACK_TYPE_CORE);
while (iter.hasNext()) {
String value = null;
String nextHash = new String((byte[]) iter.next());
Entry entry = switchboard.urlPool.noticeURL.getEntry(nextHash);
if (entry == null) continue;
if ((option.equals("URL")&&(entry.url() != null))) {
value = entry.url().toString();
} else if ((option.equals("AnchorName"))) {
value = entry.name();
} else if ((option.equals("Profile"))) {
String profileHandle = entry.profileHandle();
if (profileHandle == null) {
value = "unknown";
} else {
plasmaCrawlProfile.entry profile = switchboard.profiles.getEntry(profileHandle);
if (profile == null) {
value = "unknown";
} else {
value = profile.name();
}
}
} else if ((option.equals("Depth"))) {
value = Integer.toString(entry.depth());
} else if ((option.equals("Initiator"))) {
value = (entry.initiator()==null)?"proxy":wikiCode.replaceHTML(entry.initiator());
} else if ((option.equals("ModifiedDate"))) {
value = daydate(entry.loaddate());
}
if (value != null) {
Matcher matcher = compiledPattern.matcher(value);
if (matcher.find()) {
switchboard.urlPool.noticeURL.remove(nextHash);
}
}
}
} catch (PatternSyntaxException e) {
e.printStackTrace();
}
}
prop.put("info", 3);//crawling queue cleared
prop.put("info_numEntries", c);

View File

@ -251,6 +251,19 @@ public class plasmaCrawlNURL extends plasmaURL {
default: return null;
}
}
/**
 * Returns an iterator over the URL stack selected by <code>stackType</code>.
 *
 * NOTE(review): the element type is not visible in this method; presumably
 * the stacks yield URL hashes -- confirm against the stack implementation.
 *
 * @param stackType one of the STACK_TYPE_* constants identifying the stack
 * @return the result of <code>iterator()</code> on the matching stack, or
 *         <code>null</code> if <code>stackType</code> matches none of the
 *         known stack types
 */
public Iterator iterator(int stackType) {
    // Guard clauses: the first matching stack type wins.
    if (stackType == STACK_TYPE_CORE) {
        return coreStack.iterator();
    }
    if (stackType == STACK_TYPE_LIMIT) {
        return limitStack.iterator();
    }
    if (stackType == STACK_TYPE_OVERHANG) {
        return overhangStack.iterator();
    }
    if (stackType == STACK_TYPE_REMOTE) {
        return remoteStack.iterator();
    }
    if (stackType == STACK_TYPE_IMAGE) {
        return imageStack.iterator();
    }
    if (stackType == STACK_TYPE_MOVIE) {
        return movieStack.iterator();
    }
    if (stackType == STACK_TYPE_MUSIC) {
        return musicStack.iterator();
    }
    // Unknown stack type: callers must be prepared for a null result.
    return null;
}
public Entry pop(int stackType) {
switch (stackType) {