some redesign of EURL storage

* store() is now called explicitly by the caller (newEntry() no longer stores the entry itself)
* more URLs are written to the EURL table
* the EURL stack no longer stores the complete entry, only the URL hash


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2323 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
orbiter 2006-07-24 15:25:47 +00:00
parent 1ed3e2daef
commit 5f72be2a95
9 changed files with 89 additions and 71 deletions

View File

@ -196,8 +196,9 @@ public class IndexCreateIndexingQueue_p {
plasmaCrawlEURL.Entry entry;
yacySeed initiatorSeed, executorSeed;
int j=0;
for ( int i = switchboard.urlPool.errorURL.stackSize() - 1; i >= (switchboard.urlPool.errorURL.stackSize() - showRejectedCount); i--) {
entry = switchboard.urlPool.errorURL.getStack(i);
for (int i = switchboard.urlPool.errorURL.stackSize() - 1; i >= (switchboard.urlPool.errorURL.stackSize() - showRejectedCount); i--) {
try {
entry = switchboard.urlPool.errorURL.stackPopEntry(i);
initiatorHash = entry.initiator();
executorHash = entry.executor();
url = entry.url().toString();
@ -210,6 +211,9 @@ public class IndexCreateIndexingQueue_p {
prop.put("rejected_list_"+j+"_dark", ((dark) ? 1 : 0));
dark = !dark;
j++;
} catch (IOException e) {
e.printStackTrace();
}
}
prop.put("rejected_list", j);
}

View File

@ -59,6 +59,7 @@ import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterOutputStream;
import de.anomic.http.httpHeader;
import de.anomic.index.indexURL;
import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverFileUtils;
@ -195,8 +196,10 @@ public class IndexCreate_p {
prop.put("error_crawlingURL", wikiCode.replaceHTML(((String) post.get("crawlingURL"))));
prop.put("error_reasonString", reasonString);
switchboard.urlPool.errorURL.newEntry(crawlingStartURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
crawlingStartURL.getHost(), reasonString, new bitfield(indexURL.urlFlagLength), false);
plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(crawlingStartURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
crawlingStartURL.getHost(), reasonString, new bitfield(indexURL.urlFlagLength));
ee.store();
switchboard.urlPool.errorURL.stackPushEntry(ee);
}
} catch (Exception e) {
// mist
@ -259,8 +262,10 @@ public class IndexCreate_p {
if (rejectReason == null) {
c++;
} else {
switchboard.urlPool.errorURL.newEntry(nexturlURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
(String) e.getValue(), rejectReason, new bitfield(indexURL.urlFlagLength), false);
plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(nexturlURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
(String) e.getValue(), rejectReason, new bitfield(indexURL.urlFlagLength));
ee.store();
switchboard.urlPool.errorURL.stackPushEntry(ee);
}
}

View File

@ -52,6 +52,7 @@ import de.anomic.http.httpHeader;
import de.anomic.index.indexURL;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@ -148,7 +149,9 @@ public final class crawlReceipt {
} else {
try {
plasmaCrawlNURL.Entry en = switchboard.urlPool.noticeURL.getEntry(receivedUrlhash);
switchboard.urlPool.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new bitfield(indexURL.urlFlagLength), false);
plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new bitfield(indexURL.urlFlagLength));
ee.store();
switchboard.urlPool.errorURL.stackPushEntry(ee);
switchboard.urlPool.noticeURL.remove(receivedUrlhash);
} catch (IOException e) {

View File

@ -48,7 +48,6 @@ import java.io.IOException;
import de.anomic.net.URL;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.Iterator;
@ -91,27 +90,22 @@ public class plasmaCrawlEURL extends indexURL {
}
public synchronized Entry newEntry(URL url, String referrer, String initiator, String executor,
String name, String failreason, bitfield flags, boolean retry) {
String name, String failreason, bitfield flags) {
if ((referrer == null) || (referrer.length() < urlHashLength)) referrer = dummyHash;
if ((initiator == null) || (initiator.length() < urlHashLength)) initiator = dummyHash;
if ((executor == null) || (executor.length() < urlHashLength)) executor = dummyHash;
if (failreason == null) failreason = "unknown";
return new Entry(url, referrer, initiator, executor, name, failreason, flags);
}
// create a stack entry
HashMap map = new HashMap();
map.put("url", url);
map.put("referrer", referrer);
map.put("initiator", initiator);
map.put("executor", executor);
map.put("name", name);
map.put("failreason", failreason);
map.put("flags", flags);
rejectedStack.add(map);
Entry e = new Entry(url, referrer, initiator, executor, name, failreason, flags);
public synchronized void stackPushEntry(Entry e) {
rejectedStack.add(e.hash);
}
// put in table
if (retry) e.store();
return e;
public Entry stackPopEntry(int pos) throws IOException {
String urlhash = (String) rejectedStack.get(pos);
if (urlhash == null) return null;
return new Entry(urlhash);
}
public synchronized Entry getEntry(String hash) throws IOException {
@ -134,12 +128,6 @@ public class plasmaCrawlEURL extends indexURL {
return rejectedStack.size();
}
public Entry getStack(int pos) {
HashMap m = (HashMap) rejectedStack.get(pos);
return new Entry((URL) m.get("url"), (String) m.get("referrer"), (String) m.get("initiator"), (String) m.get("executor"),
(String) m.get("name"), (String) m.get("failreason"), (bitfield) m.get("flags"));
}
public class Entry {
private String hash; // the url's hash
@ -153,10 +141,11 @@ public class plasmaCrawlEURL extends indexURL {
private int trycount; // number of tryings
private String failreason; // string describing reason for load fail
private bitfield flags; // extra space
private boolean stored;
public Entry(URL url, String referrer, String initiator,
String executor, String name, String failreason, bitfield flags) {
// create new entry and store it into database
// create new entry
this.hash = urlHash(url);
this.referrer = (referrer == null) ? dummyHash : referrer;
this.initiator = initiator;
@ -168,6 +157,7 @@ public class plasmaCrawlEURL extends indexURL {
this.trycount = 0;
this.failreason = failreason;
this.flags = flags;
this.stored = false;
}
public Entry(String hash) throws IOException {
@ -183,10 +173,12 @@ public class plasmaCrawlEURL extends indexURL {
if (entry != null) {
insertEntry(entry);
}
this.stored = true;
}
public Entry(kelondroRow.Entry entry) throws IOException {
insertEntry(entry);
this.stored = false;
}
private void insertEntry(kelondroRow.Entry entry) throws IOException {
@ -205,8 +197,9 @@ public class plasmaCrawlEURL extends indexURL {
return;
}
private void store() {
public void store() {
// stores the values from the object variables into the database
if (this.stored) return;
String initdatestr = kelondroBase64Order.enhancedCoder.encodeLong(initdate.getTime() / 86400000, urlDateLength);
String trydatestr = kelondroBase64Order.enhancedCoder.encodeLong(trydate.getTime() / 86400000, urlDateLength);
@ -227,6 +220,7 @@ public class plasmaCrawlEURL extends indexURL {
this.flags.getBytes()
};
urlHashCache.put(urlHashCache.row().newEntry(entry));
this.stored = true;
} catch (IOException e) {
System.out.println("INTERNAL ERROR AT plasmaEURL:url2hash:" + e.toString());
}

View File

@ -463,11 +463,13 @@ public final class plasmaCrawlLURL extends indexURL {
kelondroRow.Entry entry = plasmaCrawlLURL.this.urlHashCache.get(urlHash.getBytes());
if (entry == null) throw new IOException("url hash " + urlHash + " not found in LURL");
insertEntry(entry, searchedWord);
this.stored = true;
}
public Entry(kelondroRow.Entry entry, indexURLEntry searchedWord) throws IOException {
assert (entry != null);
insertEntry(entry, word);
this.stored = false;
}
private void insertEntry(kelondroRow.Entry entry, indexURLEntry searchedWord) throws IOException {

View File

@ -460,6 +460,7 @@ public class plasmaCrawlNURL extends indexURL {
private int forkfactor; // sum of anchors of all ancestors
private bitfield flags;
private int handle;
private boolean stored;;
public Entry(String initiator,
URL url,
@ -484,24 +485,10 @@ public class plasmaCrawlNURL extends indexURL {
this.forkfactor = forkfactor;
this.flags = new bitfield(urlFlagLength);
this.handle = 0;
this.stored = false;
store();
}
public String toString() {
StringBuffer str = new StringBuffer();
str.append("hash: ").append(hash==null ? "null" : hash).append(" | ")
.append("initiator: ").append(initiator==null?"null":initiator).append(" | ")
.append("url: ").append(url==null?"null":url.toString()).append(" | ")
.append("referrer: ").append((referrer == null) ? dummyHash : referrer).append(" | ")
.append("name: ").append((name == null) ? "null" : name).append(" | ")
.append("loaddate: ").append((loaddate == null) ? new Date() : loaddate).append(" | ")
.append("profile: ").append(profileHandle==null?"null":profileHandle).append(" | ")
.append("depth: ").append(Integer.toString(depth)).append(" | ")
.append("forkfactor: ").append(Integer.toString(forkfactor)).append(" | ")
.append("flags: ").append((flags==null) ? "null" : flags.toString());
return str.toString();
}
public Entry(String hash) throws IOException {
// generates an plasmaNURLEntry using the url hash
// to speed up the access, the url-hashes are buffered
@ -525,6 +512,7 @@ public class plasmaCrawlNURL extends indexURL {
this.forkfactor = (int) entry.getColLongB64E(9);
this.flags = new bitfield(entry.getColBytes(10));
this.handle = Integer.parseInt(entry.getColString(11, null), 16);
this.stored = true;
return;
//} catch (MalformedURLException e) {
// throw new IOException("plasmaCrawlNURL/Entry: " + e);
@ -536,8 +524,9 @@ public class plasmaCrawlNURL extends indexURL {
}
}
private void store() {
public void store() {
// stores the values from the object variables into the database
if (this.stored) return;
String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, urlDateLength);
// store the hash in the hash cache
try {
@ -557,6 +546,7 @@ public class plasmaCrawlNURL extends indexURL {
normalizeHandle(this.handle).getBytes()
};
urlHashCache.put(urlHashCache.row().newEntry(entry));
this.stored = true;
} catch (IOException e) {
serverLog.logSevere("PLASMA", "INTERNAL ERROR AT plasmaNURL:store:" + e.toString() + ", resetting NURL-DB");
e.printStackTrace();
@ -568,6 +558,21 @@ public class plasmaCrawlNURL extends indexURL {
}
}
public String toString() {
StringBuffer str = new StringBuffer();
str.append("hash: ").append(hash==null ? "null" : hash).append(" | ")
.append("initiator: ").append(initiator==null?"null":initiator).append(" | ")
.append("url: ").append(url==null?"null":url.toString()).append(" | ")
.append("referrer: ").append((referrer == null) ? dummyHash : referrer).append(" | ")
.append("name: ").append((name == null) ? "null" : name).append(" | ")
.append("loaddate: ").append((loaddate == null) ? new Date() : loaddate).append(" | ")
.append("profile: ").append(profileHandle==null?"null":profileHandle).append(" | ")
.append("depth: ").append(Integer.toString(depth)).append(" | ")
.append("forkfactor: ").append(Integer.toString(forkfactor)).append(" | ")
.append("flags: ").append((flags==null) ? "null" : flags.toString());
return str.toString();
}
/**
* return a url-hash, based on the md5 algorithm
* the result is a String of 12 bytes within a 72-bit space

View File

@ -64,6 +64,7 @@ import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroTree;
import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.server.serverSemaphore;
import de.anomic.server.logging.serverLog;
import de.anomic.tools.bitfield;
@ -393,7 +394,7 @@ public final class plasmaCrawlStacker {
this.log.logSevere("URL '" + nexturlString + "' can neither be crawled local nor global.");
}
this.sb.urlPool.noticeURL.newEntry(initiatorHash, /* initiator, needed for p2p-feedback */
plasmaCrawlNURL.Entry ee = this.sb.urlPool.noticeURL.newEntry(initiatorHash, /* initiator, needed for p2p-feedback */
nexturl, /* url clear text string */
loadDate, /* load date */
referrerHash, /* last url in crawling queue */
@ -405,7 +406,7 @@ public final class plasmaCrawlStacker {
((global) ? plasmaCrawlNURL.STACK_TYPE_LIMIT :
((local) ? plasmaCrawlNURL.STACK_TYPE_CORE : plasmaCrawlNURL.STACK_TYPE_REMOTE)) /*local/remote stack*/
);
ee.store();
return null;
}
@ -937,16 +938,17 @@ public final class plasmaCrawlStacker {
String rejectReason = dequeue(this.theMsg);
if (rejectReason != null) {
plasmaCrawlStacker.this.sb.urlPool.errorURL.newEntry(
plasmaCrawlEURL.Entry ee = sb.urlPool.errorURL.newEntry(
new URL(this.theMsg.url()),
this.theMsg.referrerHash(),
this.theMsg.initiatorHash(),
yacyCore.seedDB.mySeed.hash,
this.theMsg.name,
rejectReason,
new bitfield(indexURL.urlFlagLength),
false
new bitfield(indexURL.urlFlagLength)
);
ee.store();
sb.urlPool.errorURL.stackPushEntry(ee);
}
} catch (Exception e) {
plasmaCrawlStacker.this.log.logWarning("Error while processing stackCrawl entry.\n" +

View File

@ -312,16 +312,17 @@ public final class plasmaCrawlWorker extends Thread {
String hostlow = host.toLowerCase();
if (plasmaSwitchboard.urlBlacklist.isListed(hostlow, path)) {
log.logInfo("CRAWLER Rejecting URL '" + url.toString() + "'. URL is in blacklist.");
sb.urlPool.errorURL.newEntry(
plasmaCrawlEURL.Entry ee = sb.urlPool.errorURL.newEntry(
url,
referer,
initiator,
yacyCore.seedDB.mySeed.hash,
name,
"denied_(url_in_blacklist)",
new bitfield(indexURL.urlFlagLength),
true
new bitfield(indexURL.urlFlagLength)
);
ee.store();
sb.urlPool.errorURL.stackPushEntry(ee);
return null;
}

View File

@ -1563,10 +1563,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
} else {
log.logInfo("Not indexed any word in URL " + entry.url() + "; cause: " + noIndexReason);
urlPool.errorURL.newEntry(entry.url(), referrerHash,
plasmaCrawlEURL.Entry ee = urlPool.errorURL.newEntry(entry.url(), referrerHash,
((entry.proxy()) ? indexURL.dummyHash : entry.initiator()),
yacyCore.seedDB.mySeed.hash,
descr, noIndexReason, new bitfield(indexURL.urlFlagLength), true);
descr, noIndexReason, new bitfield(indexURL.urlFlagLength));
ee.store();
urlPool.errorURL.stackPushEntry(ee);
if ((processCase == 6) && (initiator != null)) {
yacyClient.crawlReceipt(initiator, "crawl", "rejected", noIndexReason, null, "");
}