some redesign of EURL storage

* store() is now called explicitly
* more urls are written to the EURL table
* the EURL stack no longer stores the complete entry; it now stores only the URL hash


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2323 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
orbiter 2006-07-24 15:25:47 +00:00
parent 1ed3e2daef
commit 5f72be2a95
9 changed files with 89 additions and 71 deletions

View File

@ -196,8 +196,9 @@ public class IndexCreateIndexingQueue_p {
plasmaCrawlEURL.Entry entry; plasmaCrawlEURL.Entry entry;
yacySeed initiatorSeed, executorSeed; yacySeed initiatorSeed, executorSeed;
int j=0; int j=0;
for ( int i = switchboard.urlPool.errorURL.stackSize() - 1; i >= (switchboard.urlPool.errorURL.stackSize() - showRejectedCount); i--) { for (int i = switchboard.urlPool.errorURL.stackSize() - 1; i >= (switchboard.urlPool.errorURL.stackSize() - showRejectedCount); i--) {
entry = switchboard.urlPool.errorURL.getStack(i); try {
entry = switchboard.urlPool.errorURL.stackPopEntry(i);
initiatorHash = entry.initiator(); initiatorHash = entry.initiator();
executorHash = entry.executor(); executorHash = entry.executor();
url = entry.url().toString(); url = entry.url().toString();
@ -210,6 +211,9 @@ public class IndexCreateIndexingQueue_p {
prop.put("rejected_list_"+j+"_dark", ((dark) ? 1 : 0)); prop.put("rejected_list_"+j+"_dark", ((dark) ? 1 : 0));
dark = !dark; dark = !dark;
j++; j++;
} catch (IOException e) {
e.printStackTrace();
}
} }
prop.put("rejected_list", j); prop.put("rejected_list", j);
} }

View File

@ -59,6 +59,7 @@ import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterOutputStream; import de.anomic.htmlFilter.htmlFilterOutputStream;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlProfile; import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverFileUtils; import de.anomic.server.serverFileUtils;
@ -195,8 +196,10 @@ public class IndexCreate_p {
prop.put("error_crawlingURL", wikiCode.replaceHTML(((String) post.get("crawlingURL")))); prop.put("error_crawlingURL", wikiCode.replaceHTML(((String) post.get("crawlingURL"))));
prop.put("error_reasonString", reasonString); prop.put("error_reasonString", reasonString);
switchboard.urlPool.errorURL.newEntry(crawlingStartURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash, plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(crawlingStartURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
crawlingStartURL.getHost(), reasonString, new bitfield(indexURL.urlFlagLength), false); crawlingStartURL.getHost(), reasonString, new bitfield(indexURL.urlFlagLength));
ee.store();
switchboard.urlPool.errorURL.stackPushEntry(ee);
} }
} catch (Exception e) { } catch (Exception e) {
// mist // mist
@ -259,8 +262,10 @@ public class IndexCreate_p {
if (rejectReason == null) { if (rejectReason == null) {
c++; c++;
} else { } else {
switchboard.urlPool.errorURL.newEntry(nexturlURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash, plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(nexturlURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
(String) e.getValue(), rejectReason, new bitfield(indexURL.urlFlagLength), false); (String) e.getValue(), rejectReason, new bitfield(indexURL.urlFlagLength));
ee.store();
switchboard.urlPool.errorURL.stackPushEntry(ee);
} }
} }

View File

@ -52,6 +52,7 @@ import de.anomic.http.httpHeader;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.plasma.plasmaCrawlNURL; import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects; import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch; import de.anomic.server.serverSwitch;
@ -148,7 +149,9 @@ public final class crawlReceipt {
} else { } else {
try { try {
plasmaCrawlNURL.Entry en = switchboard.urlPool.noticeURL.getEntry(receivedUrlhash); plasmaCrawlNURL.Entry en = switchboard.urlPool.noticeURL.getEntry(receivedUrlhash);
switchboard.urlPool.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new bitfield(indexURL.urlFlagLength), false); plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new bitfield(indexURL.urlFlagLength));
ee.store();
switchboard.urlPool.errorURL.stackPushEntry(ee);
switchboard.urlPool.noticeURL.remove(receivedUrlhash); switchboard.urlPool.noticeURL.remove(receivedUrlhash);
} catch (IOException e) { } catch (IOException e) {

View File

@ -48,7 +48,6 @@ import java.io.IOException;
import de.anomic.net.URL; import de.anomic.net.URL;
import java.util.Date; import java.util.Date;
import java.util.HashMap;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.Iterator; import java.util.Iterator;
@ -91,27 +90,22 @@ public class plasmaCrawlEURL extends indexURL {
} }
public synchronized Entry newEntry(URL url, String referrer, String initiator, String executor, public synchronized Entry newEntry(URL url, String referrer, String initiator, String executor,
String name, String failreason, bitfield flags, boolean retry) { String name, String failreason, bitfield flags) {
if ((referrer == null) || (referrer.length() < urlHashLength)) referrer = dummyHash; if ((referrer == null) || (referrer.length() < urlHashLength)) referrer = dummyHash;
if ((initiator == null) || (initiator.length() < urlHashLength)) initiator = dummyHash; if ((initiator == null) || (initiator.length() < urlHashLength)) initiator = dummyHash;
if ((executor == null) || (executor.length() < urlHashLength)) executor = dummyHash; if ((executor == null) || (executor.length() < urlHashLength)) executor = dummyHash;
if (failreason == null) failreason = "unknown"; if (failreason == null) failreason = "unknown";
return new Entry(url, referrer, initiator, executor, name, failreason, flags);
}
// create a stack entry public synchronized void stackPushEntry(Entry e) {
HashMap map = new HashMap(); rejectedStack.add(e.hash);
map.put("url", url); }
map.put("referrer", referrer);
map.put("initiator", initiator);
map.put("executor", executor);
map.put("name", name);
map.put("failreason", failreason);
map.put("flags", flags);
rejectedStack.add(map);
Entry e = new Entry(url, referrer, initiator, executor, name, failreason, flags);
// put in table public Entry stackPopEntry(int pos) throws IOException {
if (retry) e.store(); String urlhash = (String) rejectedStack.get(pos);
return e; if (urlhash == null) return null;
return new Entry(urlhash);
} }
public synchronized Entry getEntry(String hash) throws IOException { public synchronized Entry getEntry(String hash) throws IOException {
@ -134,12 +128,6 @@ public class plasmaCrawlEURL extends indexURL {
return rejectedStack.size(); return rejectedStack.size();
} }
public Entry getStack(int pos) {
HashMap m = (HashMap) rejectedStack.get(pos);
return new Entry((URL) m.get("url"), (String) m.get("referrer"), (String) m.get("initiator"), (String) m.get("executor"),
(String) m.get("name"), (String) m.get("failreason"), (bitfield) m.get("flags"));
}
public class Entry { public class Entry {
private String hash; // the url's hash private String hash; // the url's hash
@ -153,10 +141,11 @@ public class plasmaCrawlEURL extends indexURL {
private int trycount; // number of tryings private int trycount; // number of tryings
private String failreason; // string describing reason for load fail private String failreason; // string describing reason for load fail
private bitfield flags; // extra space private bitfield flags; // extra space
private boolean stored;
public Entry(URL url, String referrer, String initiator, public Entry(URL url, String referrer, String initiator,
String executor, String name, String failreason, bitfield flags) { String executor, String name, String failreason, bitfield flags) {
// create new entry and store it into database // create new entry
this.hash = urlHash(url); this.hash = urlHash(url);
this.referrer = (referrer == null) ? dummyHash : referrer; this.referrer = (referrer == null) ? dummyHash : referrer;
this.initiator = initiator; this.initiator = initiator;
@ -168,6 +157,7 @@ public class plasmaCrawlEURL extends indexURL {
this.trycount = 0; this.trycount = 0;
this.failreason = failreason; this.failreason = failreason;
this.flags = flags; this.flags = flags;
this.stored = false;
} }
public Entry(String hash) throws IOException { public Entry(String hash) throws IOException {
@ -183,10 +173,12 @@ public class plasmaCrawlEURL extends indexURL {
if (entry != null) { if (entry != null) {
insertEntry(entry); insertEntry(entry);
} }
this.stored = true;
} }
public Entry(kelondroRow.Entry entry) throws IOException { public Entry(kelondroRow.Entry entry) throws IOException {
insertEntry(entry); insertEntry(entry);
this.stored = false;
} }
private void insertEntry(kelondroRow.Entry entry) throws IOException { private void insertEntry(kelondroRow.Entry entry) throws IOException {
@ -205,8 +197,9 @@ public class plasmaCrawlEURL extends indexURL {
return; return;
} }
private void store() { public void store() {
// stores the values from the object variables into the database // stores the values from the object variables into the database
if (this.stored) return;
String initdatestr = kelondroBase64Order.enhancedCoder.encodeLong(initdate.getTime() / 86400000, urlDateLength); String initdatestr = kelondroBase64Order.enhancedCoder.encodeLong(initdate.getTime() / 86400000, urlDateLength);
String trydatestr = kelondroBase64Order.enhancedCoder.encodeLong(trydate.getTime() / 86400000, urlDateLength); String trydatestr = kelondroBase64Order.enhancedCoder.encodeLong(trydate.getTime() / 86400000, urlDateLength);
@ -227,6 +220,7 @@ public class plasmaCrawlEURL extends indexURL {
this.flags.getBytes() this.flags.getBytes()
}; };
urlHashCache.put(urlHashCache.row().newEntry(entry)); urlHashCache.put(urlHashCache.row().newEntry(entry));
this.stored = true;
} catch (IOException e) { } catch (IOException e) {
System.out.println("INTERNAL ERROR AT plasmaEURL:url2hash:" + e.toString()); System.out.println("INTERNAL ERROR AT plasmaEURL:url2hash:" + e.toString());
} }

View File

@ -463,11 +463,13 @@ public final class plasmaCrawlLURL extends indexURL {
kelondroRow.Entry entry = plasmaCrawlLURL.this.urlHashCache.get(urlHash.getBytes()); kelondroRow.Entry entry = plasmaCrawlLURL.this.urlHashCache.get(urlHash.getBytes());
if (entry == null) throw new IOException("url hash " + urlHash + " not found in LURL"); if (entry == null) throw new IOException("url hash " + urlHash + " not found in LURL");
insertEntry(entry, searchedWord); insertEntry(entry, searchedWord);
this.stored = true;
} }
public Entry(kelondroRow.Entry entry, indexURLEntry searchedWord) throws IOException { public Entry(kelondroRow.Entry entry, indexURLEntry searchedWord) throws IOException {
assert (entry != null); assert (entry != null);
insertEntry(entry, word); insertEntry(entry, word);
this.stored = false;
} }
private void insertEntry(kelondroRow.Entry entry, indexURLEntry searchedWord) throws IOException { private void insertEntry(kelondroRow.Entry entry, indexURLEntry searchedWord) throws IOException {

View File

@ -460,6 +460,7 @@ public class plasmaCrawlNURL extends indexURL {
private int forkfactor; // sum of anchors of all ancestors private int forkfactor; // sum of anchors of all ancestors
private bitfield flags; private bitfield flags;
private int handle; private int handle;
private boolean stored;;
public Entry(String initiator, public Entry(String initiator,
URL url, URL url,
@ -484,24 +485,10 @@ public class plasmaCrawlNURL extends indexURL {
this.forkfactor = forkfactor; this.forkfactor = forkfactor;
this.flags = new bitfield(urlFlagLength); this.flags = new bitfield(urlFlagLength);
this.handle = 0; this.handle = 0;
this.stored = false;
store(); store();
} }
public String toString() {
StringBuffer str = new StringBuffer();
str.append("hash: ").append(hash==null ? "null" : hash).append(" | ")
.append("initiator: ").append(initiator==null?"null":initiator).append(" | ")
.append("url: ").append(url==null?"null":url.toString()).append(" | ")
.append("referrer: ").append((referrer == null) ? dummyHash : referrer).append(" | ")
.append("name: ").append((name == null) ? "null" : name).append(" | ")
.append("loaddate: ").append((loaddate == null) ? new Date() : loaddate).append(" | ")
.append("profile: ").append(profileHandle==null?"null":profileHandle).append(" | ")
.append("depth: ").append(Integer.toString(depth)).append(" | ")
.append("forkfactor: ").append(Integer.toString(forkfactor)).append(" | ")
.append("flags: ").append((flags==null) ? "null" : flags.toString());
return str.toString();
}
public Entry(String hash) throws IOException { public Entry(String hash) throws IOException {
// generates an plasmaNURLEntry using the url hash // generates an plasmaNURLEntry using the url hash
// to speed up the access, the url-hashes are buffered // to speed up the access, the url-hashes are buffered
@ -525,6 +512,7 @@ public class plasmaCrawlNURL extends indexURL {
this.forkfactor = (int) entry.getColLongB64E(9); this.forkfactor = (int) entry.getColLongB64E(9);
this.flags = new bitfield(entry.getColBytes(10)); this.flags = new bitfield(entry.getColBytes(10));
this.handle = Integer.parseInt(entry.getColString(11, null), 16); this.handle = Integer.parseInt(entry.getColString(11, null), 16);
this.stored = true;
return; return;
//} catch (MalformedURLException e) { //} catch (MalformedURLException e) {
// throw new IOException("plasmaCrawlNURL/Entry: " + e); // throw new IOException("plasmaCrawlNURL/Entry: " + e);
@ -536,8 +524,9 @@ public class plasmaCrawlNURL extends indexURL {
} }
} }
private void store() { public void store() {
// stores the values from the object variables into the database // stores the values from the object variables into the database
if (this.stored) return;
String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, urlDateLength); String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, urlDateLength);
// store the hash in the hash cache // store the hash in the hash cache
try { try {
@ -557,6 +546,7 @@ public class plasmaCrawlNURL extends indexURL {
normalizeHandle(this.handle).getBytes() normalizeHandle(this.handle).getBytes()
}; };
urlHashCache.put(urlHashCache.row().newEntry(entry)); urlHashCache.put(urlHashCache.row().newEntry(entry));
this.stored = true;
} catch (IOException e) { } catch (IOException e) {
serverLog.logSevere("PLASMA", "INTERNAL ERROR AT plasmaNURL:store:" + e.toString() + ", resetting NURL-DB"); serverLog.logSevere("PLASMA", "INTERNAL ERROR AT plasmaNURL:store:" + e.toString() + ", resetting NURL-DB");
e.printStackTrace(); e.printStackTrace();
@ -568,6 +558,21 @@ public class plasmaCrawlNURL extends indexURL {
} }
} }
public String toString() {
StringBuffer str = new StringBuffer();
str.append("hash: ").append(hash==null ? "null" : hash).append(" | ")
.append("initiator: ").append(initiator==null?"null":initiator).append(" | ")
.append("url: ").append(url==null?"null":url.toString()).append(" | ")
.append("referrer: ").append((referrer == null) ? dummyHash : referrer).append(" | ")
.append("name: ").append((name == null) ? "null" : name).append(" | ")
.append("loaddate: ").append((loaddate == null) ? new Date() : loaddate).append(" | ")
.append("profile: ").append(profileHandle==null?"null":profileHandle).append(" | ")
.append("depth: ").append(Integer.toString(depth)).append(" | ")
.append("forkfactor: ").append(Integer.toString(forkfactor)).append(" | ")
.append("flags: ").append((flags==null) ? "null" : flags.toString());
return str.toString();
}
/** /**
* return a url-hash, based on the md5 algorithm * return a url-hash, based on the md5 algorithm
* the result is a String of 12 bytes within a 72-bit space * the result is a String of 12 bytes within a 72-bit space

View File

@ -64,6 +64,7 @@ import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroTree; import de.anomic.kelondro.kelondroTree;
import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.server.serverSemaphore; import de.anomic.server.serverSemaphore;
import de.anomic.server.logging.serverLog; import de.anomic.server.logging.serverLog;
import de.anomic.tools.bitfield; import de.anomic.tools.bitfield;
@ -393,7 +394,7 @@ public final class plasmaCrawlStacker {
this.log.logSevere("URL '" + nexturlString + "' can neither be crawled local nor global."); this.log.logSevere("URL '" + nexturlString + "' can neither be crawled local nor global.");
} }
this.sb.urlPool.noticeURL.newEntry(initiatorHash, /* initiator, needed for p2p-feedback */ plasmaCrawlNURL.Entry ee = this.sb.urlPool.noticeURL.newEntry(initiatorHash, /* initiator, needed for p2p-feedback */
nexturl, /* url clear text string */ nexturl, /* url clear text string */
loadDate, /* load date */ loadDate, /* load date */
referrerHash, /* last url in crawling queue */ referrerHash, /* last url in crawling queue */
@ -405,7 +406,7 @@ public final class plasmaCrawlStacker {
((global) ? plasmaCrawlNURL.STACK_TYPE_LIMIT : ((global) ? plasmaCrawlNURL.STACK_TYPE_LIMIT :
((local) ? plasmaCrawlNURL.STACK_TYPE_CORE : plasmaCrawlNURL.STACK_TYPE_REMOTE)) /*local/remote stack*/ ((local) ? plasmaCrawlNURL.STACK_TYPE_CORE : plasmaCrawlNURL.STACK_TYPE_REMOTE)) /*local/remote stack*/
); );
ee.store();
return null; return null;
} }
@ -937,16 +938,17 @@ public final class plasmaCrawlStacker {
String rejectReason = dequeue(this.theMsg); String rejectReason = dequeue(this.theMsg);
if (rejectReason != null) { if (rejectReason != null) {
plasmaCrawlStacker.this.sb.urlPool.errorURL.newEntry( plasmaCrawlEURL.Entry ee = sb.urlPool.errorURL.newEntry(
new URL(this.theMsg.url()), new URL(this.theMsg.url()),
this.theMsg.referrerHash(), this.theMsg.referrerHash(),
this.theMsg.initiatorHash(), this.theMsg.initiatorHash(),
yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
this.theMsg.name, this.theMsg.name,
rejectReason, rejectReason,
new bitfield(indexURL.urlFlagLength), new bitfield(indexURL.urlFlagLength)
false
); );
ee.store();
sb.urlPool.errorURL.stackPushEntry(ee);
} }
} catch (Exception e) { } catch (Exception e) {
plasmaCrawlStacker.this.log.logWarning("Error while processing stackCrawl entry.\n" + plasmaCrawlStacker.this.log.logWarning("Error while processing stackCrawl entry.\n" +

View File

@ -312,16 +312,17 @@ public final class plasmaCrawlWorker extends Thread {
String hostlow = host.toLowerCase(); String hostlow = host.toLowerCase();
if (plasmaSwitchboard.urlBlacklist.isListed(hostlow, path)) { if (plasmaSwitchboard.urlBlacklist.isListed(hostlow, path)) {
log.logInfo("CRAWLER Rejecting URL '" + url.toString() + "'. URL is in blacklist."); log.logInfo("CRAWLER Rejecting URL '" + url.toString() + "'. URL is in blacklist.");
sb.urlPool.errorURL.newEntry( plasmaCrawlEURL.Entry ee = sb.urlPool.errorURL.newEntry(
url, url,
referer, referer,
initiator, initiator,
yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
name, name,
"denied_(url_in_blacklist)", "denied_(url_in_blacklist)",
new bitfield(indexURL.urlFlagLength), new bitfield(indexURL.urlFlagLength)
true
); );
ee.store();
sb.urlPool.errorURL.stackPushEntry(ee);
return null; return null;
} }

View File

@ -1563,10 +1563,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
} else { } else {
log.logInfo("Not indexed any word in URL " + entry.url() + "; cause: " + noIndexReason); log.logInfo("Not indexed any word in URL " + entry.url() + "; cause: " + noIndexReason);
urlPool.errorURL.newEntry(entry.url(), referrerHash, plasmaCrawlEURL.Entry ee = urlPool.errorURL.newEntry(entry.url(), referrerHash,
((entry.proxy()) ? indexURL.dummyHash : entry.initiator()), ((entry.proxy()) ? indexURL.dummyHash : entry.initiator()),
yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
descr, noIndexReason, new bitfield(indexURL.urlFlagLength), true); descr, noIndexReason, new bitfield(indexURL.urlFlagLength));
ee.store();
urlPool.errorURL.stackPushEntry(ee);
if ((processCase == 6) && (initiator != null)) { if ((processCase == 6) && (initiator != null)) {
yacyClient.crawlReceipt(initiator, "crawl", "rejected", noIndexReason, null, ""); yacyClient.crawlReceipt(initiator, "crawl", "rejected", noIndexReason, null, "");
} }