From 7fd90ca7c874194d45e675529bca31806d49344f Mon Sep 17 00:00:00 2001 From: orbiter Date: Mon, 24 Jul 2006 16:04:14 +0000 Subject: [PATCH] * strict handling of NURL entry element generation, storage and stacking * more space for EURL reason strings (you must delete the EURL db to use this) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2324 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/index/indexURL.java | 2 +- .../dbImport/plasmaCrawlNURLImporter.java | 4 +++- source/de/anomic/plasma/plasmaCrawlEURL.java | 3 ++- source/de/anomic/plasma/plasmaCrawlNURL.java | 17 +++++++---------- source/de/anomic/plasma/plasmaCrawlStacker.java | 13 ++++++++----- 5 files changed, 21 insertions(+), 18 deletions(-) diff --git a/source/de/anomic/index/indexURL.java b/source/de/anomic/index/indexURL.java index 604d854b8..b401da9b7 100644 --- a/source/de/anomic/index/indexURL.java +++ b/source/de/anomic/index/indexURL.java @@ -49,7 +49,7 @@ public class indexURL { public static final int urlStringLength = 256;// not too short for links without parameters public static final int urlDescrLength = 80; // The headline of a web page (meta-tag or

) public static final int urlNameLength = 40; // the tag content between and - public static final int urlErrorLength = 20; // a reason description for unavailable urls + public static final int urlErrorLength = 80; // a reason description for unavailable urls public static final int urlDateLength = 4; // any date, shortened public static final int urlCopyCountLength = 2; // counter for numbers of copies of this index public static final int urlFlagLength = 2; // any stuff diff --git a/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java b/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java index 0a87d5f9a..3f80cc108 100644 --- a/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java +++ b/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java @@ -176,7 +176,9 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor // if the url does not alredy exists in the destination stack we insert it now if (!this.sb.urlPool.noticeURL.existsInStack(nextHash)) { - this.sb.urlPool.noticeURL.newEntry(urlEntry,(stackTypes[i] != -1)?stackTypes[i]:plasmaCrawlNURL.STACK_TYPE_CORE); + plasmaCrawlNURL.Entry ne = this.sb.urlPool.noticeURL.newEntry(urlEntry); + ne.store(); + this.sb.urlPool.noticeURL.push((stackTypes[i] != -1) ? stackTypes[i] : plasmaCrawlNURL.STACK_TYPE_CORE, ne.url().getHost(), ne.hash()); } // removing hash from the import db diff --git a/source/de/anomic/plasma/plasmaCrawlEURL.java b/source/de/anomic/plasma/plasmaCrawlEURL.java index 19f220cf3..6b54cf233 100644 --- a/source/de/anomic/plasma/plasmaCrawlEURL.java +++ b/source/de/anomic/plasma/plasmaCrawlEURL.java @@ -188,7 +188,8 @@ public class plasmaCrawlEURL extends indexURL { this.initiator = entry.getColString(2, "UTF-8"); this.executor = entry.getColString(3, "UTF-8"); this.url = new URL(entry.getColString(4, "UTF-8").trim()); - this.name = entry.getColString(5, "UTF-8").trim(); + String n = entry.getColString(5, "UTF-8"); + this.name = (n == null) ? "" : n.trim(); this.initdate = new Date(86400000 * entry.getColLongB64E(6)); this.trydate = new Date(86400000 * entry.getColLongB64E(7)); this.trycount = (int) entry.getColLongB64E(8); diff --git a/source/de/anomic/plasma/plasmaCrawlNURL.java b/source/de/anomic/plasma/plasmaCrawlNURL.java index 987048cb9..57c94a6d8 100644 --- a/source/de/anomic/plasma/plasmaCrawlNURL.java +++ b/source/de/anomic/plasma/plasmaCrawlNURL.java @@ -288,30 +288,27 @@ public class plasmaCrawlNURL extends indexURL { public synchronized Entry newEntry(String initiator, URL url, Date loaddate, String referrer, String name, String profile, - int depth, int anchors, int forkfactor, int stackMode) { - Entry e = new Entry(initiator, url, referrer, name, loaddate, + int depth, int anchors, int forkfactor) { + return new Entry(initiator, url, referrer, name, loaddate, profile, depth, anchors, forkfactor); - push(stackMode, url.getHost(), e.hash); - return e; } - public synchronized Entry newEntry(Entry oldEntry, int stackMode) { + public synchronized Entry newEntry(Entry oldEntry) { if (oldEntry == null) return null; - return newEntry( + return new Entry( oldEntry.initiator(), oldEntry.url(), - oldEntry.loaddate(), oldEntry.referrerHash(), oldEntry.name(), + oldEntry.loaddate(), oldEntry.profileHandle(), oldEntry.depth(), oldEntry.anchors, - oldEntry.forkfactor, - stackMode + oldEntry.forkfactor ); } - private void push(int stackType, String domain, String hash) { + public void push(int stackType, String domain, String hash) { try { switch (stackType) { case STACK_TYPE_CORE: coreStack.add(domain, hash.getBytes()); break; diff --git a/source/de/anomic/plasma/plasmaCrawlStacker.java b/source/de/anomic/plasma/plasmaCrawlStacker.java index 4cb0ec56b..eee603520 100644 --- a/source/de/anomic/plasma/plasmaCrawlStacker.java +++ b/source/de/anomic/plasma/plasmaCrawlStacker.java @@ -394,7 +394,7 @@ public final class plasmaCrawlStacker { this.log.logSevere("URL '" + nexturlString + "' can neither be crawled local nor global."); } - plasmaCrawlNURL.Entry ee = this.sb.urlPool.noticeURL.newEntry(initiatorHash, /* initiator, needed for p2p-feedback */ + plasmaCrawlNURL.Entry ne = this.sb.urlPool.noticeURL.newEntry(initiatorHash, /* initiator, needed for p2p-feedback */ nexturl, /* url clear text string */ loadDate, /* load date */ referrerHash, /* last url in crawling queue */ @@ -402,11 +402,14 @@ public final class plasmaCrawlStacker { (profile == null) ? null : profile.handle(), // profile must not be null! currentdepth, /*depth so far*/ 0, /*anchors, default value */ - 0, /*forkfactor, default value */ - ((global) ? plasmaCrawlNURL.STACK_TYPE_LIMIT : - ((local) ? plasmaCrawlNURL.STACK_TYPE_CORE : plasmaCrawlNURL.STACK_TYPE_REMOTE)) /*local/remote stack*/ + 0 /*forkfactor, default value */ ); - ee.store(); + ne.store(); + this.sb.urlPool.noticeURL.push( + ((global) ? plasmaCrawlNURL.STACK_TYPE_LIMIT : + ((local) ? plasmaCrawlNURL.STACK_TYPE_CORE : plasmaCrawlNURL.STACK_TYPE_REMOTE)) /*local/remote stack*/, + nexturl.getHost(), + ne.hash()); return null; }