mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
some redesign of EURL storage
* store() is now called explicitly * more URLs are written to the EURL table * the EURL stack does not store the complete entry any more, now only the URL hash git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2323 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
1ed3e2daef
commit
5f72be2a95
|
@ -196,8 +196,9 @@ public class IndexCreateIndexingQueue_p {
|
||||||
plasmaCrawlEURL.Entry entry;
|
plasmaCrawlEURL.Entry entry;
|
||||||
yacySeed initiatorSeed, executorSeed;
|
yacySeed initiatorSeed, executorSeed;
|
||||||
int j=0;
|
int j=0;
|
||||||
for ( int i = switchboard.urlPool.errorURL.stackSize() - 1; i >= (switchboard.urlPool.errorURL.stackSize() - showRejectedCount); i--) {
|
for (int i = switchboard.urlPool.errorURL.stackSize() - 1; i >= (switchboard.urlPool.errorURL.stackSize() - showRejectedCount); i--) {
|
||||||
entry = switchboard.urlPool.errorURL.getStack(i);
|
try {
|
||||||
|
entry = switchboard.urlPool.errorURL.stackPopEntry(i);
|
||||||
initiatorHash = entry.initiator();
|
initiatorHash = entry.initiator();
|
||||||
executorHash = entry.executor();
|
executorHash = entry.executor();
|
||||||
url = entry.url().toString();
|
url = entry.url().toString();
|
||||||
|
@ -210,6 +211,9 @@ public class IndexCreateIndexingQueue_p {
|
||||||
prop.put("rejected_list_"+j+"_dark", ((dark) ? 1 : 0));
|
prop.put("rejected_list_"+j+"_dark", ((dark) ? 1 : 0));
|
||||||
dark = !dark;
|
dark = !dark;
|
||||||
j++;
|
j++;
|
||||||
|
} catch (IOException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
prop.put("rejected_list", j);
|
prop.put("rejected_list", j);
|
||||||
}
|
}
|
||||||
|
|
|
@ -59,6 +59,7 @@ import de.anomic.htmlFilter.htmlFilterContentScraper;
|
||||||
import de.anomic.htmlFilter.htmlFilterOutputStream;
|
import de.anomic.htmlFilter.htmlFilterOutputStream;
|
||||||
import de.anomic.http.httpHeader;
|
import de.anomic.http.httpHeader;
|
||||||
import de.anomic.index.indexURL;
|
import de.anomic.index.indexURL;
|
||||||
|
import de.anomic.plasma.plasmaCrawlEURL;
|
||||||
import de.anomic.plasma.plasmaCrawlProfile;
|
import de.anomic.plasma.plasmaCrawlProfile;
|
||||||
import de.anomic.plasma.plasmaSwitchboard;
|
import de.anomic.plasma.plasmaSwitchboard;
|
||||||
import de.anomic.server.serverFileUtils;
|
import de.anomic.server.serverFileUtils;
|
||||||
|
@ -195,8 +196,10 @@ public class IndexCreate_p {
|
||||||
prop.put("error_crawlingURL", wikiCode.replaceHTML(((String) post.get("crawlingURL"))));
|
prop.put("error_crawlingURL", wikiCode.replaceHTML(((String) post.get("crawlingURL"))));
|
||||||
prop.put("error_reasonString", reasonString);
|
prop.put("error_reasonString", reasonString);
|
||||||
|
|
||||||
switchboard.urlPool.errorURL.newEntry(crawlingStartURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
|
plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(crawlingStartURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
|
||||||
crawlingStartURL.getHost(), reasonString, new bitfield(indexURL.urlFlagLength), false);
|
crawlingStartURL.getHost(), reasonString, new bitfield(indexURL.urlFlagLength));
|
||||||
|
ee.store();
|
||||||
|
switchboard.urlPool.errorURL.stackPushEntry(ee);
|
||||||
}
|
}
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
// mist
|
// mist
|
||||||
|
@ -259,8 +262,10 @@ public class IndexCreate_p {
|
||||||
if (rejectReason == null) {
|
if (rejectReason == null) {
|
||||||
c++;
|
c++;
|
||||||
} else {
|
} else {
|
||||||
switchboard.urlPool.errorURL.newEntry(nexturlURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
|
plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(nexturlURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
|
||||||
(String) e.getValue(), rejectReason, new bitfield(indexURL.urlFlagLength), false);
|
(String) e.getValue(), rejectReason, new bitfield(indexURL.urlFlagLength));
|
||||||
|
ee.store();
|
||||||
|
switchboard.urlPool.errorURL.stackPushEntry(ee);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -52,6 +52,7 @@ import de.anomic.http.httpHeader;
|
||||||
import de.anomic.index.indexURL;
|
import de.anomic.index.indexURL;
|
||||||
import de.anomic.plasma.plasmaCrawlNURL;
|
import de.anomic.plasma.plasmaCrawlNURL;
|
||||||
import de.anomic.plasma.plasmaCrawlLURL;
|
import de.anomic.plasma.plasmaCrawlLURL;
|
||||||
|
import de.anomic.plasma.plasmaCrawlEURL;
|
||||||
import de.anomic.plasma.plasmaSwitchboard;
|
import de.anomic.plasma.plasmaSwitchboard;
|
||||||
import de.anomic.server.serverObjects;
|
import de.anomic.server.serverObjects;
|
||||||
import de.anomic.server.serverSwitch;
|
import de.anomic.server.serverSwitch;
|
||||||
|
@ -148,7 +149,9 @@ public final class crawlReceipt {
|
||||||
} else {
|
} else {
|
||||||
try {
|
try {
|
||||||
plasmaCrawlNURL.Entry en = switchboard.urlPool.noticeURL.getEntry(receivedUrlhash);
|
plasmaCrawlNURL.Entry en = switchboard.urlPool.noticeURL.getEntry(receivedUrlhash);
|
||||||
switchboard.urlPool.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new bitfield(indexURL.urlFlagLength), false);
|
plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new bitfield(indexURL.urlFlagLength));
|
||||||
|
ee.store();
|
||||||
|
switchboard.urlPool.errorURL.stackPushEntry(ee);
|
||||||
switchboard.urlPool.noticeURL.remove(receivedUrlhash);
|
switchboard.urlPool.noticeURL.remove(receivedUrlhash);
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
|
|
||||||
|
|
|
@ -48,7 +48,6 @@ import java.io.IOException;
|
||||||
import de.anomic.net.URL;
|
import de.anomic.net.URL;
|
||||||
|
|
||||||
import java.util.Date;
|
import java.util.Date;
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
|
|
||||||
|
@ -91,27 +90,22 @@ public class plasmaCrawlEURL extends indexURL {
|
||||||
}
|
}
|
||||||
|
|
||||||
public synchronized Entry newEntry(URL url, String referrer, String initiator, String executor,
|
public synchronized Entry newEntry(URL url, String referrer, String initiator, String executor,
|
||||||
String name, String failreason, bitfield flags, boolean retry) {
|
String name, String failreason, bitfield flags) {
|
||||||
if ((referrer == null) || (referrer.length() < urlHashLength)) referrer = dummyHash;
|
if ((referrer == null) || (referrer.length() < urlHashLength)) referrer = dummyHash;
|
||||||
if ((initiator == null) || (initiator.length() < urlHashLength)) initiator = dummyHash;
|
if ((initiator == null) || (initiator.length() < urlHashLength)) initiator = dummyHash;
|
||||||
if ((executor == null) || (executor.length() < urlHashLength)) executor = dummyHash;
|
if ((executor == null) || (executor.length() < urlHashLength)) executor = dummyHash;
|
||||||
if (failreason == null) failreason = "unknown";
|
if (failreason == null) failreason = "unknown";
|
||||||
|
return new Entry(url, referrer, initiator, executor, name, failreason, flags);
|
||||||
|
}
|
||||||
|
|
||||||
// create a stack entry
|
public synchronized void stackPushEntry(Entry e) {
|
||||||
HashMap map = new HashMap();
|
rejectedStack.add(e.hash);
|
||||||
map.put("url", url);
|
}
|
||||||
map.put("referrer", referrer);
|
|
||||||
map.put("initiator", initiator);
|
|
||||||
map.put("executor", executor);
|
|
||||||
map.put("name", name);
|
|
||||||
map.put("failreason", failreason);
|
|
||||||
map.put("flags", flags);
|
|
||||||
rejectedStack.add(map);
|
|
||||||
Entry e = new Entry(url, referrer, initiator, executor, name, failreason, flags);
|
|
||||||
|
|
||||||
// put in table
|
public Entry stackPopEntry(int pos) throws IOException {
|
||||||
if (retry) e.store();
|
String urlhash = (String) rejectedStack.get(pos);
|
||||||
return e;
|
if (urlhash == null) return null;
|
||||||
|
return new Entry(urlhash);
|
||||||
}
|
}
|
||||||
|
|
||||||
public synchronized Entry getEntry(String hash) throws IOException {
|
public synchronized Entry getEntry(String hash) throws IOException {
|
||||||
|
@ -134,12 +128,6 @@ public class plasmaCrawlEURL extends indexURL {
|
||||||
return rejectedStack.size();
|
return rejectedStack.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
public Entry getStack(int pos) {
|
|
||||||
HashMap m = (HashMap) rejectedStack.get(pos);
|
|
||||||
return new Entry((URL) m.get("url"), (String) m.get("referrer"), (String) m.get("initiator"), (String) m.get("executor"),
|
|
||||||
(String) m.get("name"), (String) m.get("failreason"), (bitfield) m.get("flags"));
|
|
||||||
}
|
|
||||||
|
|
||||||
public class Entry {
|
public class Entry {
|
||||||
|
|
||||||
private String hash; // the url's hash
|
private String hash; // the url's hash
|
||||||
|
@ -153,10 +141,11 @@ public class plasmaCrawlEURL extends indexURL {
|
||||||
private int trycount; // number of tryings
|
private int trycount; // number of tryings
|
||||||
private String failreason; // string describing reason for load fail
|
private String failreason; // string describing reason for load fail
|
||||||
private bitfield flags; // extra space
|
private bitfield flags; // extra space
|
||||||
|
private boolean stored;
|
||||||
|
|
||||||
public Entry(URL url, String referrer, String initiator,
|
public Entry(URL url, String referrer, String initiator,
|
||||||
String executor, String name, String failreason, bitfield flags) {
|
String executor, String name, String failreason, bitfield flags) {
|
||||||
// create new entry and store it into database
|
// create new entry
|
||||||
this.hash = urlHash(url);
|
this.hash = urlHash(url);
|
||||||
this.referrer = (referrer == null) ? dummyHash : referrer;
|
this.referrer = (referrer == null) ? dummyHash : referrer;
|
||||||
this.initiator = initiator;
|
this.initiator = initiator;
|
||||||
|
@ -168,6 +157,7 @@ public class plasmaCrawlEURL extends indexURL {
|
||||||
this.trycount = 0;
|
this.trycount = 0;
|
||||||
this.failreason = failreason;
|
this.failreason = failreason;
|
||||||
this.flags = flags;
|
this.flags = flags;
|
||||||
|
this.stored = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Entry(String hash) throws IOException {
|
public Entry(String hash) throws IOException {
|
||||||
|
@ -183,10 +173,12 @@ public class plasmaCrawlEURL extends indexURL {
|
||||||
if (entry != null) {
|
if (entry != null) {
|
||||||
insertEntry(entry);
|
insertEntry(entry);
|
||||||
}
|
}
|
||||||
|
this.stored = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Entry(kelondroRow.Entry entry) throws IOException {
|
public Entry(kelondroRow.Entry entry) throws IOException {
|
||||||
insertEntry(entry);
|
insertEntry(entry);
|
||||||
|
this.stored = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void insertEntry(kelondroRow.Entry entry) throws IOException {
|
private void insertEntry(kelondroRow.Entry entry) throws IOException {
|
||||||
|
@ -205,8 +197,9 @@ public class plasmaCrawlEURL extends indexURL {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void store() {
|
public void store() {
|
||||||
// stores the values from the object variables into the database
|
// stores the values from the object variables into the database
|
||||||
|
if (this.stored) return;
|
||||||
String initdatestr = kelondroBase64Order.enhancedCoder.encodeLong(initdate.getTime() / 86400000, urlDateLength);
|
String initdatestr = kelondroBase64Order.enhancedCoder.encodeLong(initdate.getTime() / 86400000, urlDateLength);
|
||||||
String trydatestr = kelondroBase64Order.enhancedCoder.encodeLong(trydate.getTime() / 86400000, urlDateLength);
|
String trydatestr = kelondroBase64Order.enhancedCoder.encodeLong(trydate.getTime() / 86400000, urlDateLength);
|
||||||
|
|
||||||
|
@ -227,6 +220,7 @@ public class plasmaCrawlEURL extends indexURL {
|
||||||
this.flags.getBytes()
|
this.flags.getBytes()
|
||||||
};
|
};
|
||||||
urlHashCache.put(urlHashCache.row().newEntry(entry));
|
urlHashCache.put(urlHashCache.row().newEntry(entry));
|
||||||
|
this.stored = true;
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
System.out.println("INTERNAL ERROR AT plasmaEURL:url2hash:" + e.toString());
|
System.out.println("INTERNAL ERROR AT plasmaEURL:url2hash:" + e.toString());
|
||||||
}
|
}
|
||||||
|
|
|
@ -463,11 +463,13 @@ public final class plasmaCrawlLURL extends indexURL {
|
||||||
kelondroRow.Entry entry = plasmaCrawlLURL.this.urlHashCache.get(urlHash.getBytes());
|
kelondroRow.Entry entry = plasmaCrawlLURL.this.urlHashCache.get(urlHash.getBytes());
|
||||||
if (entry == null) throw new IOException("url hash " + urlHash + " not found in LURL");
|
if (entry == null) throw new IOException("url hash " + urlHash + " not found in LURL");
|
||||||
insertEntry(entry, searchedWord);
|
insertEntry(entry, searchedWord);
|
||||||
|
this.stored = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Entry(kelondroRow.Entry entry, indexURLEntry searchedWord) throws IOException {
|
public Entry(kelondroRow.Entry entry, indexURLEntry searchedWord) throws IOException {
|
||||||
assert (entry != null);
|
assert (entry != null);
|
||||||
insertEntry(entry, word);
|
insertEntry(entry, word);
|
||||||
|
this.stored = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void insertEntry(kelondroRow.Entry entry, indexURLEntry searchedWord) throws IOException {
|
private void insertEntry(kelondroRow.Entry entry, indexURLEntry searchedWord) throws IOException {
|
||||||
|
|
|
@ -460,6 +460,7 @@ public class plasmaCrawlNURL extends indexURL {
|
||||||
private int forkfactor; // sum of anchors of all ancestors
|
private int forkfactor; // sum of anchors of all ancestors
|
||||||
private bitfield flags;
|
private bitfield flags;
|
||||||
private int handle;
|
private int handle;
|
||||||
|
private boolean stored;;
|
||||||
|
|
||||||
public Entry(String initiator,
|
public Entry(String initiator,
|
||||||
URL url,
|
URL url,
|
||||||
|
@ -484,24 +485,10 @@ public class plasmaCrawlNURL extends indexURL {
|
||||||
this.forkfactor = forkfactor;
|
this.forkfactor = forkfactor;
|
||||||
this.flags = new bitfield(urlFlagLength);
|
this.flags = new bitfield(urlFlagLength);
|
||||||
this.handle = 0;
|
this.handle = 0;
|
||||||
|
this.stored = false;
|
||||||
store();
|
store();
|
||||||
}
|
}
|
||||||
|
|
||||||
public String toString() {
|
|
||||||
StringBuffer str = new StringBuffer();
|
|
||||||
str.append("hash: ").append(hash==null ? "null" : hash).append(" | ")
|
|
||||||
.append("initiator: ").append(initiator==null?"null":initiator).append(" | ")
|
|
||||||
.append("url: ").append(url==null?"null":url.toString()).append(" | ")
|
|
||||||
.append("referrer: ").append((referrer == null) ? dummyHash : referrer).append(" | ")
|
|
||||||
.append("name: ").append((name == null) ? "null" : name).append(" | ")
|
|
||||||
.append("loaddate: ").append((loaddate == null) ? new Date() : loaddate).append(" | ")
|
|
||||||
.append("profile: ").append(profileHandle==null?"null":profileHandle).append(" | ")
|
|
||||||
.append("depth: ").append(Integer.toString(depth)).append(" | ")
|
|
||||||
.append("forkfactor: ").append(Integer.toString(forkfactor)).append(" | ")
|
|
||||||
.append("flags: ").append((flags==null) ? "null" : flags.toString());
|
|
||||||
return str.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
public Entry(String hash) throws IOException {
|
public Entry(String hash) throws IOException {
|
||||||
// generates an plasmaNURLEntry using the url hash
|
// generates an plasmaNURLEntry using the url hash
|
||||||
// to speed up the access, the url-hashes are buffered
|
// to speed up the access, the url-hashes are buffered
|
||||||
|
@ -525,6 +512,7 @@ public class plasmaCrawlNURL extends indexURL {
|
||||||
this.forkfactor = (int) entry.getColLongB64E(9);
|
this.forkfactor = (int) entry.getColLongB64E(9);
|
||||||
this.flags = new bitfield(entry.getColBytes(10));
|
this.flags = new bitfield(entry.getColBytes(10));
|
||||||
this.handle = Integer.parseInt(entry.getColString(11, null), 16);
|
this.handle = Integer.parseInt(entry.getColString(11, null), 16);
|
||||||
|
this.stored = true;
|
||||||
return;
|
return;
|
||||||
//} catch (MalformedURLException e) {
|
//} catch (MalformedURLException e) {
|
||||||
// throw new IOException("plasmaCrawlNURL/Entry: " + e);
|
// throw new IOException("plasmaCrawlNURL/Entry: " + e);
|
||||||
|
@ -536,8 +524,9 @@ public class plasmaCrawlNURL extends indexURL {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void store() {
|
public void store() {
|
||||||
// stores the values from the object variables into the database
|
// stores the values from the object variables into the database
|
||||||
|
if (this.stored) return;
|
||||||
String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, urlDateLength);
|
String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, urlDateLength);
|
||||||
// store the hash in the hash cache
|
// store the hash in the hash cache
|
||||||
try {
|
try {
|
||||||
|
@ -557,6 +546,7 @@ public class plasmaCrawlNURL extends indexURL {
|
||||||
normalizeHandle(this.handle).getBytes()
|
normalizeHandle(this.handle).getBytes()
|
||||||
};
|
};
|
||||||
urlHashCache.put(urlHashCache.row().newEntry(entry));
|
urlHashCache.put(urlHashCache.row().newEntry(entry));
|
||||||
|
this.stored = true;
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
serverLog.logSevere("PLASMA", "INTERNAL ERROR AT plasmaNURL:store:" + e.toString() + ", resetting NURL-DB");
|
serverLog.logSevere("PLASMA", "INTERNAL ERROR AT plasmaNURL:store:" + e.toString() + ", resetting NURL-DB");
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
|
@ -568,6 +558,21 @@ public class plasmaCrawlNURL extends indexURL {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
StringBuffer str = new StringBuffer();
|
||||||
|
str.append("hash: ").append(hash==null ? "null" : hash).append(" | ")
|
||||||
|
.append("initiator: ").append(initiator==null?"null":initiator).append(" | ")
|
||||||
|
.append("url: ").append(url==null?"null":url.toString()).append(" | ")
|
||||||
|
.append("referrer: ").append((referrer == null) ? dummyHash : referrer).append(" | ")
|
||||||
|
.append("name: ").append((name == null) ? "null" : name).append(" | ")
|
||||||
|
.append("loaddate: ").append((loaddate == null) ? new Date() : loaddate).append(" | ")
|
||||||
|
.append("profile: ").append(profileHandle==null?"null":profileHandle).append(" | ")
|
||||||
|
.append("depth: ").append(Integer.toString(depth)).append(" | ")
|
||||||
|
.append("forkfactor: ").append(Integer.toString(forkfactor)).append(" | ")
|
||||||
|
.append("flags: ").append((flags==null) ? "null" : flags.toString());
|
||||||
|
return str.toString();
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* return a url-hash, based on the md5 algorithm
|
* return a url-hash, based on the md5 algorithm
|
||||||
* the result is a String of 12 bytes within a 72-bit space
|
* the result is a String of 12 bytes within a 72-bit space
|
||||||
|
|
|
@ -64,6 +64,7 @@ import de.anomic.kelondro.kelondroBase64Order;
|
||||||
import de.anomic.kelondro.kelondroException;
|
import de.anomic.kelondro.kelondroException;
|
||||||
import de.anomic.kelondro.kelondroRow;
|
import de.anomic.kelondro.kelondroRow;
|
||||||
import de.anomic.kelondro.kelondroTree;
|
import de.anomic.kelondro.kelondroTree;
|
||||||
|
import de.anomic.plasma.plasmaCrawlEURL;
|
||||||
import de.anomic.server.serverSemaphore;
|
import de.anomic.server.serverSemaphore;
|
||||||
import de.anomic.server.logging.serverLog;
|
import de.anomic.server.logging.serverLog;
|
||||||
import de.anomic.tools.bitfield;
|
import de.anomic.tools.bitfield;
|
||||||
|
@ -393,7 +394,7 @@ public final class plasmaCrawlStacker {
|
||||||
this.log.logSevere("URL '" + nexturlString + "' can neither be crawled local nor global.");
|
this.log.logSevere("URL '" + nexturlString + "' can neither be crawled local nor global.");
|
||||||
}
|
}
|
||||||
|
|
||||||
this.sb.urlPool.noticeURL.newEntry(initiatorHash, /* initiator, needed for p2p-feedback */
|
plasmaCrawlNURL.Entry ee = this.sb.urlPool.noticeURL.newEntry(initiatorHash, /* initiator, needed for p2p-feedback */
|
||||||
nexturl, /* url clear text string */
|
nexturl, /* url clear text string */
|
||||||
loadDate, /* load date */
|
loadDate, /* load date */
|
||||||
referrerHash, /* last url in crawling queue */
|
referrerHash, /* last url in crawling queue */
|
||||||
|
@ -405,7 +406,7 @@ public final class plasmaCrawlStacker {
|
||||||
((global) ? plasmaCrawlNURL.STACK_TYPE_LIMIT :
|
((global) ? plasmaCrawlNURL.STACK_TYPE_LIMIT :
|
||||||
((local) ? plasmaCrawlNURL.STACK_TYPE_CORE : plasmaCrawlNURL.STACK_TYPE_REMOTE)) /*local/remote stack*/
|
((local) ? plasmaCrawlNURL.STACK_TYPE_CORE : plasmaCrawlNURL.STACK_TYPE_REMOTE)) /*local/remote stack*/
|
||||||
);
|
);
|
||||||
|
ee.store();
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -937,16 +938,17 @@ public final class plasmaCrawlStacker {
|
||||||
String rejectReason = dequeue(this.theMsg);
|
String rejectReason = dequeue(this.theMsg);
|
||||||
|
|
||||||
if (rejectReason != null) {
|
if (rejectReason != null) {
|
||||||
plasmaCrawlStacker.this.sb.urlPool.errorURL.newEntry(
|
plasmaCrawlEURL.Entry ee = sb.urlPool.errorURL.newEntry(
|
||||||
new URL(this.theMsg.url()),
|
new URL(this.theMsg.url()),
|
||||||
this.theMsg.referrerHash(),
|
this.theMsg.referrerHash(),
|
||||||
this.theMsg.initiatorHash(),
|
this.theMsg.initiatorHash(),
|
||||||
yacyCore.seedDB.mySeed.hash,
|
yacyCore.seedDB.mySeed.hash,
|
||||||
this.theMsg.name,
|
this.theMsg.name,
|
||||||
rejectReason,
|
rejectReason,
|
||||||
new bitfield(indexURL.urlFlagLength),
|
new bitfield(indexURL.urlFlagLength)
|
||||||
false
|
|
||||||
);
|
);
|
||||||
|
ee.store();
|
||||||
|
sb.urlPool.errorURL.stackPushEntry(ee);
|
||||||
}
|
}
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
plasmaCrawlStacker.this.log.logWarning("Error while processing stackCrawl entry.\n" +
|
plasmaCrawlStacker.this.log.logWarning("Error while processing stackCrawl entry.\n" +
|
||||||
|
|
|
@ -312,16 +312,17 @@ public final class plasmaCrawlWorker extends Thread {
|
||||||
String hostlow = host.toLowerCase();
|
String hostlow = host.toLowerCase();
|
||||||
if (plasmaSwitchboard.urlBlacklist.isListed(hostlow, path)) {
|
if (plasmaSwitchboard.urlBlacklist.isListed(hostlow, path)) {
|
||||||
log.logInfo("CRAWLER Rejecting URL '" + url.toString() + "'. URL is in blacklist.");
|
log.logInfo("CRAWLER Rejecting URL '" + url.toString() + "'. URL is in blacklist.");
|
||||||
sb.urlPool.errorURL.newEntry(
|
plasmaCrawlEURL.Entry ee = sb.urlPool.errorURL.newEntry(
|
||||||
url,
|
url,
|
||||||
referer,
|
referer,
|
||||||
initiator,
|
initiator,
|
||||||
yacyCore.seedDB.mySeed.hash,
|
yacyCore.seedDB.mySeed.hash,
|
||||||
name,
|
name,
|
||||||
"denied_(url_in_blacklist)",
|
"denied_(url_in_blacklist)",
|
||||||
new bitfield(indexURL.urlFlagLength),
|
new bitfield(indexURL.urlFlagLength)
|
||||||
true
|
|
||||||
);
|
);
|
||||||
|
ee.store();
|
||||||
|
sb.urlPool.errorURL.stackPushEntry(ee);
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1563,10 +1563,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
log.logInfo("Not indexed any word in URL " + entry.url() + "; cause: " + noIndexReason);
|
log.logInfo("Not indexed any word in URL " + entry.url() + "; cause: " + noIndexReason);
|
||||||
urlPool.errorURL.newEntry(entry.url(), referrerHash,
|
plasmaCrawlEURL.Entry ee = urlPool.errorURL.newEntry(entry.url(), referrerHash,
|
||||||
((entry.proxy()) ? indexURL.dummyHash : entry.initiator()),
|
((entry.proxy()) ? indexURL.dummyHash : entry.initiator()),
|
||||||
yacyCore.seedDB.mySeed.hash,
|
yacyCore.seedDB.mySeed.hash,
|
||||||
descr, noIndexReason, new bitfield(indexURL.urlFlagLength), true);
|
descr, noIndexReason, new bitfield(indexURL.urlFlagLength));
|
||||||
|
ee.store();
|
||||||
|
urlPool.errorURL.stackPushEntry(ee);
|
||||||
if ((processCase == 6) && (initiator != null)) {
|
if ((processCase == 6) && (initiator != null)) {
|
||||||
yacyClient.crawlReceipt(initiator, "crawl", "rejected", noIndexReason, null, "");
|
yacyClient.crawlReceipt(initiator, "crawl", "rejected", noIndexReason, null, "");
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user