mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
some redesign of EURL storage
* store() is now called explicitly * more urls are written to the EURL table * the EURL stack does not store the complete entry any more, now only the URL hash git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2323 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
1ed3e2daef
commit
5f72be2a95
|
@ -196,8 +196,9 @@ public class IndexCreateIndexingQueue_p {
|
|||
plasmaCrawlEURL.Entry entry;
|
||||
yacySeed initiatorSeed, executorSeed;
|
||||
int j=0;
|
||||
for ( int i = switchboard.urlPool.errorURL.stackSize() - 1; i >= (switchboard.urlPool.errorURL.stackSize() - showRejectedCount); i--) {
|
||||
entry = switchboard.urlPool.errorURL.getStack(i);
|
||||
for (int i = switchboard.urlPool.errorURL.stackSize() - 1; i >= (switchboard.urlPool.errorURL.stackSize() - showRejectedCount); i--) {
|
||||
try {
|
||||
entry = switchboard.urlPool.errorURL.stackPopEntry(i);
|
||||
initiatorHash = entry.initiator();
|
||||
executorHash = entry.executor();
|
||||
url = entry.url().toString();
|
||||
|
@ -210,6 +211,9 @@ public class IndexCreateIndexingQueue_p {
|
|||
prop.put("rejected_list_"+j+"_dark", ((dark) ? 1 : 0));
|
||||
dark = !dark;
|
||||
j++;
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
prop.put("rejected_list", j);
|
||||
}
|
||||
|
|
|
@ -59,6 +59,7 @@ import de.anomic.htmlFilter.htmlFilterContentScraper;
|
|||
import de.anomic.htmlFilter.htmlFilterOutputStream;
|
||||
import de.anomic.http.httpHeader;
|
||||
import de.anomic.index.indexURL;
|
||||
import de.anomic.plasma.plasmaCrawlEURL;
|
||||
import de.anomic.plasma.plasmaCrawlProfile;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
import de.anomic.server.serverFileUtils;
|
||||
|
@ -195,8 +196,10 @@ public class IndexCreate_p {
|
|||
prop.put("error_crawlingURL", wikiCode.replaceHTML(((String) post.get("crawlingURL"))));
|
||||
prop.put("error_reasonString", reasonString);
|
||||
|
||||
switchboard.urlPool.errorURL.newEntry(crawlingStartURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
|
||||
crawlingStartURL.getHost(), reasonString, new bitfield(indexURL.urlFlagLength), false);
|
||||
plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(crawlingStartURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
|
||||
crawlingStartURL.getHost(), reasonString, new bitfield(indexURL.urlFlagLength));
|
||||
ee.store();
|
||||
switchboard.urlPool.errorURL.stackPushEntry(ee);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
// mist
|
||||
|
@ -259,8 +262,10 @@ public class IndexCreate_p {
|
|||
if (rejectReason == null) {
|
||||
c++;
|
||||
} else {
|
||||
switchboard.urlPool.errorURL.newEntry(nexturlURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
|
||||
(String) e.getValue(), rejectReason, new bitfield(indexURL.urlFlagLength), false);
|
||||
plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(nexturlURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
|
||||
(String) e.getValue(), rejectReason, new bitfield(indexURL.urlFlagLength));
|
||||
ee.store();
|
||||
switchboard.urlPool.errorURL.stackPushEntry(ee);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -52,6 +52,7 @@ import de.anomic.http.httpHeader;
|
|||
import de.anomic.index.indexURL;
|
||||
import de.anomic.plasma.plasmaCrawlNURL;
|
||||
import de.anomic.plasma.plasmaCrawlLURL;
|
||||
import de.anomic.plasma.plasmaCrawlEURL;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
import de.anomic.server.serverObjects;
|
||||
import de.anomic.server.serverSwitch;
|
||||
|
@ -148,7 +149,9 @@ public final class crawlReceipt {
|
|||
} else {
|
||||
try {
|
||||
plasmaCrawlNURL.Entry en = switchboard.urlPool.noticeURL.getEntry(receivedUrlhash);
|
||||
switchboard.urlPool.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new bitfield(indexURL.urlFlagLength), false);
|
||||
plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new bitfield(indexURL.urlFlagLength));
|
||||
ee.store();
|
||||
switchboard.urlPool.errorURL.stackPushEntry(ee);
|
||||
switchboard.urlPool.noticeURL.remove(receivedUrlhash);
|
||||
} catch (IOException e) {
|
||||
|
||||
|
|
|
@ -48,7 +48,6 @@ import java.io.IOException;
|
|||
import de.anomic.net.URL;
|
||||
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.Iterator;
|
||||
|
||||
|
@ -91,27 +90,22 @@ public class plasmaCrawlEURL extends indexURL {
|
|||
}
|
||||
|
||||
public synchronized Entry newEntry(URL url, String referrer, String initiator, String executor,
|
||||
String name, String failreason, bitfield flags, boolean retry) {
|
||||
String name, String failreason, bitfield flags) {
|
||||
if ((referrer == null) || (referrer.length() < urlHashLength)) referrer = dummyHash;
|
||||
if ((initiator == null) || (initiator.length() < urlHashLength)) initiator = dummyHash;
|
||||
if ((executor == null) || (executor.length() < urlHashLength)) executor = dummyHash;
|
||||
if (failreason == null) failreason = "unknown";
|
||||
return new Entry(url, referrer, initiator, executor, name, failreason, flags);
|
||||
}
|
||||
|
||||
// create a stack entry
|
||||
HashMap map = new HashMap();
|
||||
map.put("url", url);
|
||||
map.put("referrer", referrer);
|
||||
map.put("initiator", initiator);
|
||||
map.put("executor", executor);
|
||||
map.put("name", name);
|
||||
map.put("failreason", failreason);
|
||||
map.put("flags", flags);
|
||||
rejectedStack.add(map);
|
||||
Entry e = new Entry(url, referrer, initiator, executor, name, failreason, flags);
|
||||
public synchronized void stackPushEntry(Entry e) {
|
||||
rejectedStack.add(e.hash);
|
||||
}
|
||||
|
||||
// put in table
|
||||
if (retry) e.store();
|
||||
return e;
|
||||
public Entry stackPopEntry(int pos) throws IOException {
|
||||
String urlhash = (String) rejectedStack.get(pos);
|
||||
if (urlhash == null) return null;
|
||||
return new Entry(urlhash);
|
||||
}
|
||||
|
||||
public synchronized Entry getEntry(String hash) throws IOException {
|
||||
|
@ -134,12 +128,6 @@ public class plasmaCrawlEURL extends indexURL {
|
|||
return rejectedStack.size();
|
||||
}
|
||||
|
||||
public Entry getStack(int pos) {
|
||||
HashMap m = (HashMap) rejectedStack.get(pos);
|
||||
return new Entry((URL) m.get("url"), (String) m.get("referrer"), (String) m.get("initiator"), (String) m.get("executor"),
|
||||
(String) m.get("name"), (String) m.get("failreason"), (bitfield) m.get("flags"));
|
||||
}
|
||||
|
||||
public class Entry {
|
||||
|
||||
private String hash; // the url's hash
|
||||
|
@ -153,10 +141,11 @@ public class plasmaCrawlEURL extends indexURL {
|
|||
private int trycount; // number of tryings
|
||||
private String failreason; // string describing reason for load fail
|
||||
private bitfield flags; // extra space
|
||||
private boolean stored;
|
||||
|
||||
public Entry(URL url, String referrer, String initiator,
|
||||
String executor, String name, String failreason, bitfield flags) {
|
||||
// create new entry and store it into database
|
||||
// create new entry
|
||||
this.hash = urlHash(url);
|
||||
this.referrer = (referrer == null) ? dummyHash : referrer;
|
||||
this.initiator = initiator;
|
||||
|
@ -168,6 +157,7 @@ public class plasmaCrawlEURL extends indexURL {
|
|||
this.trycount = 0;
|
||||
this.failreason = failreason;
|
||||
this.flags = flags;
|
||||
this.stored = false;
|
||||
}
|
||||
|
||||
public Entry(String hash) throws IOException {
|
||||
|
@ -183,10 +173,12 @@ public class plasmaCrawlEURL extends indexURL {
|
|||
if (entry != null) {
|
||||
insertEntry(entry);
|
||||
}
|
||||
this.stored = true;
|
||||
}
|
||||
|
||||
/**
 * Creates an entry from an already available table row.
 * The row is handed to insertEntry(); afterwards the object is flagged as
 * not stored, so a later store() call will not take its early-return path.
 * NOTE(review): Entry(String hash) flags its result as stored, this
 * constructor does not -- presumably because the row may come from outside
 * the local table; confirm with the callers.
 *
 * @param entry the table row to read the entry's values from
 * @throws IOException as declared by insertEntry()
 */
public Entry(kelondroRow.Entry entry) throws IOException {
    insertEntry(entry);
    stored = false;
}
|
||||
|
||||
private void insertEntry(kelondroRow.Entry entry) throws IOException {
|
||||
|
@ -205,8 +197,9 @@ public class plasmaCrawlEURL extends indexURL {
|
|||
return;
|
||||
}
|
||||
|
||||
private void store() {
|
||||
public void store() {
|
||||
// stores the values from the object variables into the database
|
||||
if (this.stored) return;
|
||||
String initdatestr = kelondroBase64Order.enhancedCoder.encodeLong(initdate.getTime() / 86400000, urlDateLength);
|
||||
String trydatestr = kelondroBase64Order.enhancedCoder.encodeLong(trydate.getTime() / 86400000, urlDateLength);
|
||||
|
||||
|
@ -227,6 +220,7 @@ public class plasmaCrawlEURL extends indexURL {
|
|||
this.flags.getBytes()
|
||||
};
|
||||
urlHashCache.put(urlHashCache.row().newEntry(entry));
|
||||
this.stored = true;
|
||||
} catch (IOException e) {
|
||||
System.out.println("INTERNAL ERROR AT plasmaEURL:url2hash:" + e.toString());
|
||||
}
|
||||
|
|
|
@ -463,11 +463,13 @@ public final class plasmaCrawlLURL extends indexURL {
|
|||
kelondroRow.Entry entry = plasmaCrawlLURL.this.urlHashCache.get(urlHash.getBytes());
|
||||
if (entry == null) throw new IOException("url hash " + urlHash + " not found in LURL");
|
||||
insertEntry(entry, searchedWord);
|
||||
this.stored = true;
|
||||
}
|
||||
|
||||
/**
 * Creates an entry from an already loaded table row.
 * The object is flagged as not stored afterwards.
 *
 * @param entry        the table row to read the entry's values from; must not be null
 * @param searchedWord the word entry passed on to insertEntry() together with the row
 * @throws IOException as declared by insertEntry()
 */
public Entry(kelondroRow.Entry entry, indexURLEntry searchedWord) throws IOException {
    assert (entry != null);
    // Forward the caller's searchedWord. The previous code passed `word` here,
    // which left the searchedWord parameter unused, unlike the hash-based
    // constructor that forwards it to insertEntry().
    insertEntry(entry, searchedWord);
    this.stored = false;
}
|
||||
|
||||
private void insertEntry(kelondroRow.Entry entry, indexURLEntry searchedWord) throws IOException {
|
||||
|
|
|
@ -460,6 +460,7 @@ public class plasmaCrawlNURL extends indexURL {
|
|||
private int forkfactor; // sum of anchors of all ancestors
|
||||
private bitfield flags;
|
||||
private int handle;
|
||||
private boolean stored;;
|
||||
|
||||
public Entry(String initiator,
|
||||
URL url,
|
||||
|
@ -484,24 +485,10 @@ public class plasmaCrawlNURL extends indexURL {
|
|||
this.forkfactor = forkfactor;
|
||||
this.flags = new bitfield(urlFlagLength);
|
||||
this.handle = 0;
|
||||
this.stored = false;
|
||||
store();
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
StringBuffer str = new StringBuffer();
|
||||
str.append("hash: ").append(hash==null ? "null" : hash).append(" | ")
|
||||
.append("initiator: ").append(initiator==null?"null":initiator).append(" | ")
|
||||
.append("url: ").append(url==null?"null":url.toString()).append(" | ")
|
||||
.append("referrer: ").append((referrer == null) ? dummyHash : referrer).append(" | ")
|
||||
.append("name: ").append((name == null) ? "null" : name).append(" | ")
|
||||
.append("loaddate: ").append((loaddate == null) ? new Date() : loaddate).append(" | ")
|
||||
.append("profile: ").append(profileHandle==null?"null":profileHandle).append(" | ")
|
||||
.append("depth: ").append(Integer.toString(depth)).append(" | ")
|
||||
.append("forkfactor: ").append(Integer.toString(forkfactor)).append(" | ")
|
||||
.append("flags: ").append((flags==null) ? "null" : flags.toString());
|
||||
return str.toString();
|
||||
}
|
||||
|
||||
public Entry(String hash) throws IOException {
|
||||
// generates an plasmaNURLEntry using the url hash
|
||||
// to speed up the access, the url-hashes are buffered
|
||||
|
@ -525,6 +512,7 @@ public class plasmaCrawlNURL extends indexURL {
|
|||
this.forkfactor = (int) entry.getColLongB64E(9);
|
||||
this.flags = new bitfield(entry.getColBytes(10));
|
||||
this.handle = Integer.parseInt(entry.getColString(11, null), 16);
|
||||
this.stored = true;
|
||||
return;
|
||||
//} catch (MalformedURLException e) {
|
||||
// throw new IOException("plasmaCrawlNURL/Entry: " + e);
|
||||
|
@ -536,8 +524,9 @@ public class plasmaCrawlNURL extends indexURL {
|
|||
}
|
||||
}
|
||||
|
||||
private void store() {
|
||||
public void store() {
|
||||
// stores the values from the object variables into the database
|
||||
if (this.stored) return;
|
||||
String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, urlDateLength);
|
||||
// store the hash in the hash cache
|
||||
try {
|
||||
|
@ -557,6 +546,7 @@ public class plasmaCrawlNURL extends indexURL {
|
|||
normalizeHandle(this.handle).getBytes()
|
||||
};
|
||||
urlHashCache.put(urlHashCache.row().newEntry(entry));
|
||||
this.stored = true;
|
||||
} catch (IOException e) {
|
||||
serverLog.logSevere("PLASMA", "INTERNAL ERROR AT plasmaNURL:store:" + e.toString() + ", resetting NURL-DB");
|
||||
e.printStackTrace();
|
||||
|
@ -568,6 +558,21 @@ public class plasmaCrawlNURL extends indexURL {
|
|||
}
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
StringBuffer str = new StringBuffer();
|
||||
str.append("hash: ").append(hash==null ? "null" : hash).append(" | ")
|
||||
.append("initiator: ").append(initiator==null?"null":initiator).append(" | ")
|
||||
.append("url: ").append(url==null?"null":url.toString()).append(" | ")
|
||||
.append("referrer: ").append((referrer == null) ? dummyHash : referrer).append(" | ")
|
||||
.append("name: ").append((name == null) ? "null" : name).append(" | ")
|
||||
.append("loaddate: ").append((loaddate == null) ? new Date() : loaddate).append(" | ")
|
||||
.append("profile: ").append(profileHandle==null?"null":profileHandle).append(" | ")
|
||||
.append("depth: ").append(Integer.toString(depth)).append(" | ")
|
||||
.append("forkfactor: ").append(Integer.toString(forkfactor)).append(" | ")
|
||||
.append("flags: ").append((flags==null) ? "null" : flags.toString());
|
||||
return str.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* return a url-hash, based on the md5 algorithm
|
||||
* the result is a String of 12 bytes within a 72-bit space
|
||||
|
|
|
@ -64,6 +64,7 @@ import de.anomic.kelondro.kelondroBase64Order;
|
|||
import de.anomic.kelondro.kelondroException;
|
||||
import de.anomic.kelondro.kelondroRow;
|
||||
import de.anomic.kelondro.kelondroTree;
|
||||
import de.anomic.plasma.plasmaCrawlEURL;
|
||||
import de.anomic.server.serverSemaphore;
|
||||
import de.anomic.server.logging.serverLog;
|
||||
import de.anomic.tools.bitfield;
|
||||
|
@ -393,7 +394,7 @@ public final class plasmaCrawlStacker {
|
|||
this.log.logSevere("URL '" + nexturlString + "' can neither be crawled local nor global.");
|
||||
}
|
||||
|
||||
this.sb.urlPool.noticeURL.newEntry(initiatorHash, /* initiator, needed for p2p-feedback */
|
||||
plasmaCrawlNURL.Entry ee = this.sb.urlPool.noticeURL.newEntry(initiatorHash, /* initiator, needed for p2p-feedback */
|
||||
nexturl, /* url clear text string */
|
||||
loadDate, /* load date */
|
||||
referrerHash, /* last url in crawling queue */
|
||||
|
@ -405,7 +406,7 @@ public final class plasmaCrawlStacker {
|
|||
((global) ? plasmaCrawlNURL.STACK_TYPE_LIMIT :
|
||||
((local) ? plasmaCrawlNURL.STACK_TYPE_CORE : plasmaCrawlNURL.STACK_TYPE_REMOTE)) /*local/remote stack*/
|
||||
);
|
||||
|
||||
ee.store();
|
||||
return null;
|
||||
}
|
||||
|
||||
|
@ -937,16 +938,17 @@ public final class plasmaCrawlStacker {
|
|||
String rejectReason = dequeue(this.theMsg);
|
||||
|
||||
if (rejectReason != null) {
|
||||
plasmaCrawlStacker.this.sb.urlPool.errorURL.newEntry(
|
||||
plasmaCrawlEURL.Entry ee = sb.urlPool.errorURL.newEntry(
|
||||
new URL(this.theMsg.url()),
|
||||
this.theMsg.referrerHash(),
|
||||
this.theMsg.initiatorHash(),
|
||||
yacyCore.seedDB.mySeed.hash,
|
||||
this.theMsg.name,
|
||||
rejectReason,
|
||||
new bitfield(indexURL.urlFlagLength),
|
||||
false
|
||||
new bitfield(indexURL.urlFlagLength)
|
||||
);
|
||||
ee.store();
|
||||
sb.urlPool.errorURL.stackPushEntry(ee);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
plasmaCrawlStacker.this.log.logWarning("Error while processing stackCrawl entry.\n" +
|
||||
|
|
|
@ -312,16 +312,17 @@ public final class plasmaCrawlWorker extends Thread {
|
|||
String hostlow = host.toLowerCase();
|
||||
if (plasmaSwitchboard.urlBlacklist.isListed(hostlow, path)) {
|
||||
log.logInfo("CRAWLER Rejecting URL '" + url.toString() + "'. URL is in blacklist.");
|
||||
sb.urlPool.errorURL.newEntry(
|
||||
plasmaCrawlEURL.Entry ee = sb.urlPool.errorURL.newEntry(
|
||||
url,
|
||||
referer,
|
||||
initiator,
|
||||
yacyCore.seedDB.mySeed.hash,
|
||||
name,
|
||||
"denied_(url_in_blacklist)",
|
||||
new bitfield(indexURL.urlFlagLength),
|
||||
true
|
||||
new bitfield(indexURL.urlFlagLength)
|
||||
);
|
||||
ee.store();
|
||||
sb.urlPool.errorURL.stackPushEntry(ee);
|
||||
return null;
|
||||
}
|
||||
|
||||
|
|
|
@ -1563,10 +1563,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
|
|||
|
||||
} else {
|
||||
log.logInfo("Not indexed any word in URL " + entry.url() + "; cause: " + noIndexReason);
|
||||
urlPool.errorURL.newEntry(entry.url(), referrerHash,
|
||||
plasmaCrawlEURL.Entry ee = urlPool.errorURL.newEntry(entry.url(), referrerHash,
|
||||
((entry.proxy()) ? indexURL.dummyHash : entry.initiator()),
|
||||
yacyCore.seedDB.mySeed.hash,
|
||||
descr, noIndexReason, new bitfield(indexURL.urlFlagLength), true);
|
||||
descr, noIndexReason, new bitfield(indexURL.urlFlagLength));
|
||||
ee.store();
|
||||
urlPool.errorURL.stackPushEntry(ee);
|
||||
if ((processCase == 6) && (initiator != null)) {
|
||||
yacyClient.crawlReceipt(initiator, "crawl", "rejected", noIndexReason, null, "");
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user