mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
*) Bugfix for "-UNRESOLVED_PATTERN-" Bug on IndexCreateWWWLocalQueue_p.html and "urlEntry.url() == null" Bug
- Logging message for "urlEntry.url() == null" is now displayed as info - IndexCreateWWWLocalQueue_p.html now detects null entries while looping through the list and removes them automatically. See: - http://www.yacy-forum.de/viewtopic.php?t=532#8781 - http://www.yacy-forum.de/viewtopic.php?t=639 - http://www.yacy-forum.de/viewtopic.php?t=1071 - http://www.yacy-forum.de/viewtopic.php?t=338 - http://www.yacy-forum.de/viewtopic.php?t=980 git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@640 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
33aaffbfc6
commit
732a107160
|
@ -113,13 +113,13 @@ public class IndexCreateIndexingQueue_p {
|
|||
pcentry = (plasmaSwitchboardQueue.Entry) entryList.get(i);
|
||||
if ((pcentry != null)&&(pcentry.url() != null)) {
|
||||
initiator = yacyCore.seedDB.getConnected(pcentry.initiator());
|
||||
prop.put("indexing-queue_list_"+i+"_dark", ((dark) ? 1 : 0));
|
||||
prop.put("indexing-queue_list_"+i+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
|
||||
prop.put("indexing-queue_list_"+i+"_depth", pcentry.depth());
|
||||
prop.put("indexing-queue_list_"+i+"_modified", (pcentry.responseHeader() == null) ? "" : daydate(pcentry.responseHeader().lastModified()));
|
||||
prop.put("indexing-queue_list_"+i+"_anchor", (pcentry.anchorName()==null)?"":pcentry.anchorName());
|
||||
prop.put("indexing-queue_list_"+i+"_url", pcentry.normalizedURLString());
|
||||
prop.put("indexing-queue_list_"+i+"_size", Status.bytesToString(pcentry.size()));
|
||||
prop.put("indexing-queue_list_"+entryCount+"_dark", ((dark) ? 1 : 0));
|
||||
prop.put("indexing-queue_list_"+entryCount+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
|
||||
prop.put("indexing-queue_list_"+entryCount+"_depth", pcentry.depth());
|
||||
prop.put("indexing-queue_list_"+entryCount+"_modified", (pcentry.responseHeader() == null) ? "" : daydate(pcentry.responseHeader().lastModified()));
|
||||
prop.put("indexing-queue_list_"+entryCount+"_anchor", (pcentry.anchorName()==null)?"":pcentry.anchorName());
|
||||
prop.put("indexing-queue_list_"+entryCount+"_url", pcentry.normalizedURLString());
|
||||
prop.put("indexing-queue_list_"+entryCount+"_size", Status.bytesToString(pcentry.size()));
|
||||
dark = !dark;
|
||||
entryCount++;
|
||||
}
|
||||
|
|
|
@ -84,33 +84,38 @@ public class IndexCreateWWWLocalQueue_p {
|
|||
}
|
||||
}
|
||||
|
||||
int stackSize = switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE);
|
||||
int showNum = 0, stackSize = switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE);
|
||||
if (stackSize == 0) {
|
||||
prop.put("crawler-queue", 0);
|
||||
} else {
|
||||
prop.put("crawler-queue", 1);
|
||||
plasmaCrawlNURL.Entry[] crawlerList = switchboard.urlPool.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_CORE, 100);
|
||||
prop.put("crawler-queue_num", stackSize);//num Entries
|
||||
prop.put("crawler-queue_show-num", crawlerList.length); //showin sjow-num most recent
|
||||
plasmaCrawlNURL.Entry[] crawlerList = switchboard.urlPool.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_CORE, 120);
|
||||
|
||||
plasmaCrawlNURL.Entry urle;
|
||||
boolean dark = true;
|
||||
yacySeed initiator;
|
||||
int i;
|
||||
for (i = 0; i < crawlerList.length; i++) {
|
||||
for (i = 0; (i < crawlerList.length) && (showNum < 100); i++) {
|
||||
urle = crawlerList[i];
|
||||
if (urle != null) {
|
||||
if ((urle != null)&&(urle.url()!=null)) {
|
||||
initiator = yacyCore.seedDB.getConnected(urle.initiator());
|
||||
prop.put("crawler-queue_list_"+i+"_dark", ((dark) ? 1 : 0) );
|
||||
prop.put("crawler-queue_list_"+i+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) );
|
||||
prop.put("crawler-queue_list_"+i+"_depth", urle.depth());
|
||||
prop.put("crawler-queue_list_"+i+"_modified", daydate(urle.loaddate()) );
|
||||
prop.put("crawler-queue_list_"+i+"_anchor", urle.name());
|
||||
prop.put("crawler-queue_list_"+i+"_url", urle.url());
|
||||
prop.put("crawler-queue_list_"+i+"_hash", urle.hash());
|
||||
prop.put("crawler-queue_list_"+showNum+"_dark", ((dark) ? 1 : 0) );
|
||||
prop.put("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) );
|
||||
prop.put("crawler-queue_list_"+showNum+"_depth", urle.depth());
|
||||
prop.put("crawler-queue_list_"+showNum+"_modified", daydate(urle.loaddate()) );
|
||||
prop.put("crawler-queue_list_"+showNum+"_anchor", urle.name());
|
||||
prop.put("crawler-queue_list_"+showNum+"_url", urle.url());
|
||||
prop.put("crawler-queue_list_"+showNum+"_hash", urle.hash());
|
||||
dark = !dark;
|
||||
showNum++;
|
||||
} else {
|
||||
stackSize--;
|
||||
}
|
||||
}
|
||||
prop.put("crawler-queue_list", i);
|
||||
prop.put("crawler-queue_list", showNum);
|
||||
prop.put("crawler-queue_num", stackSize);//num Entries
|
||||
prop.put("crawler-queue_show-num", showNum); //showin sjow-num most recent
|
||||
|
||||
}
|
||||
|
||||
// return rewrite properties
|
||||
|
|
|
@ -46,6 +46,7 @@ package de.anomic.plasma;
|
|||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.net.URL;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Date;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
|
@ -281,17 +282,19 @@ public class plasmaCrawlNURL extends plasmaURL {
|
|||
}
|
||||
|
||||
private Entry[] top(kelondroStack stack, int count) {
|
||||
// this is a filo - top
|
||||
// this is a filo - top
|
||||
if (count > stack.size()) count = stack.size();
|
||||
Entry[] list = new Entry[count];
|
||||
try {
|
||||
ArrayList list = new ArrayList(count);
|
||||
try {
|
||||
for (int i = 0; i < count; i++) {
|
||||
list[i] = new Entry(new String(stack.top(i)[0]));
|
||||
}
|
||||
return list;
|
||||
byte[] hash = stack.top(i)[0];
|
||||
if (hash == null) continue;
|
||||
list.add(new Entry(new String(hash)));
|
||||
}
|
||||
return (Entry[])list.toArray(new Entry[list.size()]);
|
||||
} catch (IOException e) {
|
||||
return null;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
public synchronized Entry getEntry(String hash) {
|
||||
|
@ -349,7 +352,7 @@ public class plasmaCrawlNURL extends plasmaURL {
|
|||
public String toString() {
|
||||
StringBuffer str = new StringBuffer();
|
||||
|
||||
str.append("hash: ").append(url==null ? "null" : urlHash(url)).append(" | ")
|
||||
str.append("hash: ").append(hash==null ? "null" : hash).append(" | ")
|
||||
.append("initiator: ").append(initiator==null?"null":initiator).append(" | ")
|
||||
.append("url: ").append(url==null?"null":url.toString()).append(" | ")
|
||||
.append("referrer: ").append((referrer == null) ? dummyHash : referrer).append(" | ")
|
||||
|
|
|
@ -733,8 +733,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
|
|||
plasmaCrawlNURL.Entry urlEntry = urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_CORE);
|
||||
String stats = "LOCALCRAWL[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]";
|
||||
if ((urlEntry.url() == null) || (urlEntry.url().toString().length() < 10)) {
|
||||
log.logSevere(stats + ": urlEntry.url() == null. URL-Hash: " + ((urlEntry.hash()==null)?"Unknown":urlEntry.hash()));
|
||||
return true;
|
||||
log.logInfo(stats + ": URL with hash " + ((urlEntry.hash()==null)?"Unknown":urlEntry.hash()) + " already removed from queue.");
|
||||
return true;
|
||||
}
|
||||
String profileHandle = urlEntry.profileHandle();
|
||||
//System.out.println("DEBUG plasmaSwitchboard.processCrawling: profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
|
||||
|
@ -747,7 +747,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
|
|||
log.logSevere(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url());
|
||||
return true;
|
||||
}
|
||||
log.logFine("LOCALCRAWL: url=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() +
|
||||
log.logFine("LOCALCRAWL: URL=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() +
|
||||
", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false") + ", depth=" + urlEntry.depth() + ", crawlDepth=" + profile.generalDepth() + ", filter=" + profile.generalFilter() +
|
||||
", permission=" + ((yacyCore.seedDB == null) ? "undefined" : (((yacyCore.seedDB.mySeed.isSenior()) || (yacyCore.seedDB.mySeed.isPrincipal())) ? "true" : "false")));
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user