mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
*) Adding possibility to delete crawler queue entries using regular expressions
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1160 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
1d6a6d1f85
commit
64478b1f02
|
@ -18,9 +18,20 @@ It may also contain urls that are computed by the proxy-prefetch.
|
|||
The local crawler queue is empty<br><br>
|
||||
::
|
||||
<form action="IndexCreateWWWLocalQueue_p.html" method="post" enctype="multipart/form-data">
|
||||
<input type="submit" name="clearcrawlqueue" value="clear local crawl queue">
|
||||
Delete Entries:
|
||||
<input type="text" name="pattern" value=".*" size="20" maxlength="200"/>
|
||||
<select name="option" size="1">
|
||||
<option value="Initiator">Initiator</option>
|
||||
<option value="Profile">Profile</option>
|
||||
<option value="Depth">Depth</option>
|
||||
<option value="ModifiedDate">Modified Date</option>
|
||||
<option value="AnchorName">Anchor Name</option>
|
||||
<option value="URL" selected="selected">URL</option>
|
||||
</select>
|
||||
<input type="submit" name="deleteEntries" value="Delete"><i>This may take quite a long time</i>
|
||||
</form>
|
||||
<br>
|
||||
<hr>
|
||||
There are #[num]# entries in the local crawler queue. Showing #[show-num]# most recent entries:
|
||||
<table border="0" cellpadding="2" cellspacing="1">
|
||||
<tr class="TableHeader">
|
||||
|
|
|
@ -45,13 +45,18 @@
|
|||
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.Date;
|
||||
import java.util.Iterator;
|
||||
import java.util.Locale;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.regex.PatternSyntaxException;
|
||||
|
||||
import de.anomic.data.wikiCode;
|
||||
import de.anomic.http.httpHeader;
|
||||
import de.anomic.plasma.plasmaCrawlNURL;
|
||||
import de.anomic.plasma.plasmaCrawlProfile;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
import de.anomic.plasma.plasmaCrawlNURL.Entry;
|
||||
import de.anomic.server.serverObjects;
|
||||
import de.anomic.server.serverSwitch;
|
||||
import de.anomic.yacy.yacyCore;
|
||||
|
@ -70,10 +75,65 @@ public class IndexCreateWWWLocalQueue_p {
|
|||
serverObjects prop = new serverObjects();
|
||||
|
||||
if (post != null) {
|
||||
if (post.containsKey("clearcrawlqueue")) {
|
||||
int c = switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE);
|
||||
switchboard.urlPool.noticeURL.clear(plasmaCrawlNURL.STACK_TYPE_CORE);
|
||||
switchboard.cleanProfiles();
|
||||
if (post.containsKey("deleteEntries")) {
|
||||
int c = 0;
|
||||
|
||||
String pattern = post.get("pattern", ".*").trim();
|
||||
String option = post.get("option", ".*").trim();
|
||||
if (pattern.equals(".*")) {
|
||||
c = switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE);
|
||||
switchboard.urlPool.noticeURL.clear(plasmaCrawlNURL.STACK_TYPE_CORE);
|
||||
switchboard.cleanProfiles();
|
||||
} else{
|
||||
Pattern compiledPattern = null;
|
||||
try {
|
||||
// compiling the regular expression
|
||||
compiledPattern = Pattern.compile(pattern);
|
||||
|
||||
// iterating through the list of URLs
|
||||
Iterator iter = switchboard.urlPool.noticeURL.iterator(plasmaCrawlNURL.STACK_TYPE_CORE);
|
||||
while (iter.hasNext()) {
|
||||
String value = null;
|
||||
String nextHash = new String((byte[]) iter.next());
|
||||
Entry entry = switchboard.urlPool.noticeURL.getEntry(nextHash);
|
||||
if (entry == null) continue;
|
||||
|
||||
if ((option.equals("URL")&&(entry.url() != null))) {
|
||||
value = entry.url().toString();
|
||||
} else if ((option.equals("AnchorName"))) {
|
||||
value = entry.name();
|
||||
} else if ((option.equals("Profile"))) {
|
||||
String profileHandle = entry.profileHandle();
|
||||
if (profileHandle == null) {
|
||||
value = "unknown";
|
||||
} else {
|
||||
plasmaCrawlProfile.entry profile = switchboard.profiles.getEntry(profileHandle);
|
||||
if (profile == null) {
|
||||
value = "unknown";
|
||||
} else {
|
||||
value = profile.name();
|
||||
}
|
||||
}
|
||||
} else if ((option.equals("Depth"))) {
|
||||
value = Integer.toString(entry.depth());
|
||||
} else if ((option.equals("Initiator"))) {
|
||||
value = (entry.initiator()==null)?"proxy":wikiCode.replaceHTML(entry.initiator());
|
||||
} else if ((option.equals("ModifiedDate"))) {
|
||||
value = daydate(entry.loaddate());
|
||||
}
|
||||
|
||||
if (value != null) {
|
||||
Matcher matcher = compiledPattern.matcher(value);
|
||||
if (matcher.find()) {
|
||||
switchboard.urlPool.noticeURL.remove(nextHash);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
} catch (PatternSyntaxException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
prop.put("info", 3);//crawling queue cleared
|
||||
prop.put("info_numEntries", c);
|
||||
|
|
|
@ -251,6 +251,19 @@ public class plasmaCrawlNURL extends plasmaURL {
|
|||
default: return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Returns an iterator over the crawl stack selected by {@code stackType}.
 *
 * NOTE(review): the element type is not declared here; the local queue
 * servlet casts each element of the core stack to {@code byte[]} and uses
 * it as a URL hash -- confirm the other stacks yield the same type.
 *
 * @param stackType one of the STACK_TYPE_* constants handled by the switch below
 * @return the result of {@code iterator()} on the matching stack, or
 *         {@code null} if {@code stackType} matches none of the cases
 */
public Iterator iterator(int stackType) {
|
||||
switch (stackType) {
|
||||
case STACK_TYPE_CORE: return coreStack.iterator();
|
||||
case STACK_TYPE_LIMIT: return limitStack.iterator();
|
||||
case STACK_TYPE_OVERHANG: return overhangStack.iterator();
|
||||
case STACK_TYPE_REMOTE: return remoteStack.iterator();
|
||||
case STACK_TYPE_IMAGE: return imageStack.iterator();
|
||||
case STACK_TYPE_MOVIE: return movieStack.iterator();
|
||||
case STACK_TYPE_MUSIC: return musicStack.iterator();
|
||||
default: return null; // unrecognized stack type: callers must check for null
|
||||
}
|
||||
}
|
||||
|
||||
public Entry pop(int stackType) {
|
||||
switch (stackType) {
|
||||
|
|
Loading…
Reference in New Issue
Block a user