mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-21 00:00:13 +02:00
* strict handling of NURL entry element generation, storage and stacking
* more space for EURL reason strings (you must delete the EURL db to use this)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2324 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
5f72be2a95
commit
7fd90ca7c8
|
@ -49,7 +49,7 @@ public class indexURL {
|
|||
public static final int urlStringLength = 256;// not too short for links without parameters
|
||||
public static final int urlDescrLength = 80; // The headline of a web page (meta-tag or <h1>)
|
||||
public static final int urlNameLength = 40; // the tag content between <a> and </a>
|
||||
public static final int urlErrorLength = 20; // a reason description for unavailable urls
|
||||
public static final int urlErrorLength = 80; // a reason description for unavailable urls
|
||||
public static final int urlDateLength = 4; // any date, shortened
|
||||
public static final int urlCopyCountLength = 2; // counter for numbers of copies of this index
|
||||
public static final int urlFlagLength = 2; // any stuff
|
||||
|
|
|
@ -176,7 +176,9 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
|
|||
|
||||
// if the url does not already exist in the destination stack we insert it now
|
||||
if (!this.sb.urlPool.noticeURL.existsInStack(nextHash)) {
|
||||
this.sb.urlPool.noticeURL.newEntry(urlEntry,(stackTypes[i] != -1)?stackTypes[i]:plasmaCrawlNURL.STACK_TYPE_CORE);
|
||||
plasmaCrawlNURL.Entry ne = this.sb.urlPool.noticeURL.newEntry(urlEntry);
|
||||
ne.store();
|
||||
this.sb.urlPool.noticeURL.push((stackTypes[i] != -1) ? stackTypes[i] : plasmaCrawlNURL.STACK_TYPE_CORE, ne.url().getHost(), ne.hash());
|
||||
}
|
||||
|
||||
// removing hash from the import db
|
||||
|
|
|
@ -188,7 +188,8 @@ public class plasmaCrawlEURL extends indexURL {
|
|||
this.initiator = entry.getColString(2, "UTF-8");
|
||||
this.executor = entry.getColString(3, "UTF-8");
|
||||
this.url = new URL(entry.getColString(4, "UTF-8").trim());
|
||||
this.name = entry.getColString(5, "UTF-8").trim();
|
||||
String n = entry.getColString(5, "UTF-8");
|
||||
this.name = (n == null) ? "" : n.trim();
|
||||
this.initdate = new Date(86400000 * entry.getColLongB64E(6));
|
||||
this.trydate = new Date(86400000 * entry.getColLongB64E(7));
|
||||
this.trycount = (int) entry.getColLongB64E(8);
|
||||
|
|
|
@ -288,30 +288,27 @@ public class plasmaCrawlNURL extends indexURL {
|
|||
|
||||
public synchronized Entry newEntry(String initiator, URL url, Date loaddate,
|
||||
String referrer, String name, String profile,
|
||||
int depth, int anchors, int forkfactor, int stackMode) {
|
||||
Entry e = new Entry(initiator, url, referrer, name, loaddate,
|
||||
int depth, int anchors, int forkfactor) {
|
||||
return new Entry(initiator, url, referrer, name, loaddate,
|
||||
profile, depth, anchors, forkfactor);
|
||||
push(stackMode, url.getHost(), e.hash);
|
||||
return e;
|
||||
}
|
||||
|
||||
public synchronized Entry newEntry(Entry oldEntry, int stackMode) {
|
||||
public synchronized Entry newEntry(Entry oldEntry) {
|
||||
if (oldEntry == null) return null;
|
||||
return newEntry(
|
||||
return new Entry(
|
||||
oldEntry.initiator(),
|
||||
oldEntry.url(),
|
||||
oldEntry.loaddate(),
|
||||
oldEntry.referrerHash(),
|
||||
oldEntry.name(),
|
||||
oldEntry.loaddate(),
|
||||
oldEntry.profileHandle(),
|
||||
oldEntry.depth(),
|
||||
oldEntry.anchors,
|
||||
oldEntry.forkfactor,
|
||||
stackMode
|
||||
oldEntry.forkfactor
|
||||
);
|
||||
}
|
||||
|
||||
private void push(int stackType, String domain, String hash) {
|
||||
public void push(int stackType, String domain, String hash) {
|
||||
try {
|
||||
switch (stackType) {
|
||||
case STACK_TYPE_CORE: coreStack.add(domain, hash.getBytes()); break;
|
||||
|
|
|
@ -394,7 +394,7 @@ public final class plasmaCrawlStacker {
|
|||
this.log.logSevere("URL '" + nexturlString + "' can neither be crawled local nor global.");
|
||||
}
|
||||
|
||||
plasmaCrawlNURL.Entry ee = this.sb.urlPool.noticeURL.newEntry(initiatorHash, /* initiator, needed for p2p-feedback */
|
||||
plasmaCrawlNURL.Entry ne = this.sb.urlPool.noticeURL.newEntry(initiatorHash, /* initiator, needed for p2p-feedback */
|
||||
nexturl, /* url clear text string */
|
||||
loadDate, /* load date */
|
||||
referrerHash, /* last url in crawling queue */
|
||||
|
@ -402,11 +402,14 @@ public final class plasmaCrawlStacker {
|
|||
(profile == null) ? null : profile.handle(), // profile must not be null!
|
||||
currentdepth, /*depth so far*/
|
||||
0, /*anchors, default value */
|
||||
0, /*forkfactor, default value */
|
||||
((global) ? plasmaCrawlNURL.STACK_TYPE_LIMIT :
|
||||
((local) ? plasmaCrawlNURL.STACK_TYPE_CORE : plasmaCrawlNURL.STACK_TYPE_REMOTE)) /*local/remote stack*/
|
||||
0 /*forkfactor, default value */
|
||||
);
|
||||
ee.store();
|
||||
ne.store();
|
||||
this.sb.urlPool.noticeURL.push(
|
||||
((global) ? plasmaCrawlNURL.STACK_TYPE_LIMIT :
|
||||
((local) ? plasmaCrawlNURL.STACK_TYPE_CORE : plasmaCrawlNURL.STACK_TYPE_REMOTE)) /*local/remote stack*/,
|
||||
nexturl.getHost(),
|
||||
ne.hash());
|
||||
return null;
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user