*) Bugfix for "-UNRESOLVED_PATTERN-" Bug on IndexCreateWWWLocalQueue_p.html and "urlEntry.url() == null" Bug

- Logging message for "urlEntry.url() == null" is now displayed as info
   - IndexCreateWWWLocalQueue_p.html now detects null entries while looping through the list and removes them automatically
   See: 
   - http://www.yacy-forum.de/viewtopic.php?t=532#8781
   - http://www.yacy-forum.de/viewtopic.php?t=639
   - http://www.yacy-forum.de/viewtopic.php?t=1071
   - http://www.yacy-forum.de/viewtopic.php?t=338
   - http://www.yacy-forum.de/viewtopic.php?t=980

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@640 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
theli 2005-09-02 09:33:05 +00:00
parent 33aaffbfc6
commit 732a107160
4 changed files with 41 additions and 33 deletions

View File

@ -113,13 +113,13 @@ public class IndexCreateIndexingQueue_p {
pcentry = (plasmaSwitchboardQueue.Entry) entryList.get(i);
if ((pcentry != null)&&(pcentry.url() != null)) {
initiator = yacyCore.seedDB.getConnected(pcentry.initiator());
prop.put("indexing-queue_list_"+i+"_dark", ((dark) ? 1 : 0));
prop.put("indexing-queue_list_"+i+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
prop.put("indexing-queue_list_"+i+"_depth", pcentry.depth());
prop.put("indexing-queue_list_"+i+"_modified", (pcentry.responseHeader() == null) ? "" : daydate(pcentry.responseHeader().lastModified()));
prop.put("indexing-queue_list_"+i+"_anchor", (pcentry.anchorName()==null)?"":pcentry.anchorName());
prop.put("indexing-queue_list_"+i+"_url", pcentry.normalizedURLString());
prop.put("indexing-queue_list_"+i+"_size", Status.bytesToString(pcentry.size()));
prop.put("indexing-queue_list_"+entryCount+"_dark", ((dark) ? 1 : 0));
prop.put("indexing-queue_list_"+entryCount+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
prop.put("indexing-queue_list_"+entryCount+"_depth", pcentry.depth());
prop.put("indexing-queue_list_"+entryCount+"_modified", (pcentry.responseHeader() == null) ? "" : daydate(pcentry.responseHeader().lastModified()));
prop.put("indexing-queue_list_"+entryCount+"_anchor", (pcentry.anchorName()==null)?"":pcentry.anchorName());
prop.put("indexing-queue_list_"+entryCount+"_url", pcentry.normalizedURLString());
prop.put("indexing-queue_list_"+entryCount+"_size", Status.bytesToString(pcentry.size()));
dark = !dark;
entryCount++;
}

View File

@ -84,33 +84,38 @@ public class IndexCreateWWWLocalQueue_p {
}
}
int stackSize = switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE);
int showNum = 0, stackSize = switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE);
if (stackSize == 0) {
prop.put("crawler-queue", 0);
} else {
prop.put("crawler-queue", 1);
plasmaCrawlNURL.Entry[] crawlerList = switchboard.urlPool.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_CORE, 100);
prop.put("crawler-queue_num", stackSize);//num Entries
prop.put("crawler-queue_show-num", crawlerList.length); //showin sjow-num most recent
plasmaCrawlNURL.Entry[] crawlerList = switchboard.urlPool.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_CORE, 120);
plasmaCrawlNURL.Entry urle;
boolean dark = true;
yacySeed initiator;
int i;
for (i = 0; i < crawlerList.length; i++) {
for (i = 0; (i < crawlerList.length) && (showNum < 100); i++) {
urle = crawlerList[i];
if (urle != null) {
if ((urle != null)&&(urle.url()!=null)) {
initiator = yacyCore.seedDB.getConnected(urle.initiator());
prop.put("crawler-queue_list_"+i+"_dark", ((dark) ? 1 : 0) );
prop.put("crawler-queue_list_"+i+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) );
prop.put("crawler-queue_list_"+i+"_depth", urle.depth());
prop.put("crawler-queue_list_"+i+"_modified", daydate(urle.loaddate()) );
prop.put("crawler-queue_list_"+i+"_anchor", urle.name());
prop.put("crawler-queue_list_"+i+"_url", urle.url());
prop.put("crawler-queue_list_"+i+"_hash", urle.hash());
prop.put("crawler-queue_list_"+showNum+"_dark", ((dark) ? 1 : 0) );
prop.put("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) );
prop.put("crawler-queue_list_"+showNum+"_depth", urle.depth());
prop.put("crawler-queue_list_"+showNum+"_modified", daydate(urle.loaddate()) );
prop.put("crawler-queue_list_"+showNum+"_anchor", urle.name());
prop.put("crawler-queue_list_"+showNum+"_url", urle.url());
prop.put("crawler-queue_list_"+showNum+"_hash", urle.hash());
dark = !dark;
showNum++;
} else {
stackSize--;
}
}
prop.put("crawler-queue_list", i);
prop.put("crawler-queue_list", showNum);
prop.put("crawler-queue_num", stackSize);//num Entries
prop.put("crawler-queue_show-num", showNum); //showin sjow-num most recent
}
// return rewrite properties

View File

@ -46,6 +46,7 @@ package de.anomic.plasma;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
@ -281,17 +282,19 @@ public class plasmaCrawlNURL extends plasmaURL {
}
private Entry[] top(kelondroStack stack, int count) {
// this is a filo - top
// this is a filo - top
if (count > stack.size()) count = stack.size();
Entry[] list = new Entry[count];
try {
ArrayList list = new ArrayList(count);
try {
for (int i = 0; i < count; i++) {
list[i] = new Entry(new String(stack.top(i)[0]));
}
return list;
byte[] hash = stack.top(i)[0];
if (hash == null) continue;
list.add(new Entry(new String(hash)));
}
return (Entry[])list.toArray(new Entry[list.size()]);
} catch (IOException e) {
return null;
}
return null;
}
}
public synchronized Entry getEntry(String hash) {
@ -349,7 +352,7 @@ public class plasmaCrawlNURL extends plasmaURL {
public String toString() {
StringBuffer str = new StringBuffer();
str.append("hash: ").append(url==null ? "null" : urlHash(url)).append(" | ")
str.append("hash: ").append(hash==null ? "null" : hash).append(" | ")
.append("initiator: ").append(initiator==null?"null":initiator).append(" | ")
.append("url: ").append(url==null?"null":url.toString()).append(" | ")
.append("referrer: ").append((referrer == null) ? dummyHash : referrer).append(" | ")

View File

@ -733,8 +733,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
plasmaCrawlNURL.Entry urlEntry = urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_CORE);
String stats = "LOCALCRAWL[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]";
if ((urlEntry.url() == null) || (urlEntry.url().toString().length() < 10)) {
log.logSevere(stats + ": urlEntry.url() == null. URL-Hash: " + ((urlEntry.hash()==null)?"Unknown":urlEntry.hash()));
return true;
log.logInfo(stats + ": URL with hash " + ((urlEntry.hash()==null)?"Unknown":urlEntry.hash()) + " already removed from queue.");
return true;
}
String profileHandle = urlEntry.profileHandle();
//System.out.println("DEBUG plasmaSwitchboard.processCrawling: profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
@ -747,7 +747,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
log.logSevere(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url());
return true;
}
log.logFine("LOCALCRAWL: url=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() +
log.logFine("LOCALCRAWL: URL=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() +
", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false") + ", depth=" + urlEntry.depth() + ", crawlDepth=" + profile.generalDepth() + ", filter=" + profile.generalFilter() +
", permission=" + ((yacyCore.seedDB == null) ? "undefined" : (((yacyCore.seedDB.mySeed.isSenior()) || (yacyCore.seedDB.mySeed.isPrincipal())) ? "true" : "false")));