added referrer to remote crawl url list

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4236 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
orbiter 2007-11-29 13:58:00 +00:00
parent 18e516317d
commit 9b0ae4b989
6 changed files with 55 additions and 14 deletions

View File

@ -35,6 +35,7 @@ import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacyNetwork;
import de.anomic.yacy.yacyURL;
public class urls {
@ -60,6 +61,7 @@ public class urls {
int count = Math.min(100, post.getInt("count", 0));
int c = 0;
plasmaCrawlEntry entry;
yacyURL referrer;
while ((count > 0) && (sb.crawlQueues.noticeURL.stackSize(stackType) > 0)) {
try {
entry = sb.crawlQueues.noticeURL.pop(stackType, false);
@ -67,11 +69,14 @@ public class urls {
break;
}
if (entry == null) break;
// find referrer, if there is one
referrer = sb.getURL(entry.referrerhash());
// record the url in the delegated-url db
sb.crawlQueues.delegatedURL.push(sb.crawlQueues.delegatedURL.newEntry(entry.url(), "client=____________"));
// create RSS entry
prop.put("item_" + c + "_title", "");
prop.putHTML("item_" + c + "_link", entry.url().toNormalform(true, false));
prop.putHTML("item_" + c + "_referrer", (referrer == null) ? "" : referrer.toNormalform(true, false));
prop.putHTML("item_" + c + "_description", entry.name());
prop.put("item_" + c + "_author", "");
prop.put("item_" + c + "_pubDate", serverDate.shortSecondTime(entry.appdate()));

View File

@ -21,6 +21,7 @@
<item>
<title>#[title]#</title>
<link>#[link]#</link>
<referrer>#[referrer]#</referrer>
<description>#[description]#</description>
<author>#[author]#</author>
<pubDate>#[pubDate]#</pubDate>

View File

@ -1454,10 +1454,16 @@ public final class httpc {
public void writeX(InputStream source, OutputStream procOS, OutputStream bufferOS) {
byte[] buffer = new byte[2048];
int l, c = 0;
lastIO = System.currentTimeMillis();
while (true) try {
io: while (true) try {
l = source.read(buffer, 0, buffer.length);
if (l <= 0) break;
if (l < 0) break;
if (l == 0) try {
if (System.currentTimeMillis() - lastIO > 30000) break;
this.wait(300);
continue io;
} catch (InterruptedException e) {} // may happen without EOF
lastIO = System.currentTimeMillis();
c += l;
if (procOS != null) procOS.write(buffer, 0, l);
@ -1479,10 +1485,16 @@ public final class httpc {
OutputStreamWriter bufferOSWriter = (bufferOS == null) ? null : new OutputStreamWriter(bufferOS,outputCharset);
char[] buffer = new char[2048];
int l, c= 0;
while (true) try{
lastIO = System.currentTimeMillis();
io: while (true) try{
l = sourceReader.read(buffer, 0, buffer.length);
if (l <= 0) break;
if (l < 0) break;
if (l == 0) try {
if (System.currentTimeMillis() - lastIO > 30000) break;
this.wait(300);
continue io;
} catch (InterruptedException e) {} // may happen without EOF
lastIO = System.currentTimeMillis();
c += l;
if (procOS != null) procOS.write(buffer, 0, l);

View File

@ -238,6 +238,21 @@ public class plasmaCrawlQueues {
return false;
}
if (sb.sbQueue.size() >= (int) sb.getConfigLong(plasmaSwitchboard.INDEXER_SLOTS, 30)) {
log.logFine("remoteCrawlLoaderJob: too many processes in indexing queue, dismissed (" + "sbQueueSize=" + sb.sbQueue.size() + ")");
return false;
}
if (this.size() >= sb.getConfigLong(plasmaSwitchboard.CRAWLER_THREADS_ACTIVE_MAX, 10)) {
log.logFine("remoteCrawlLoaderJob: too many processes in loader queue, dismissed (" + "cacheLoader=" + this.size() + ")");
return false;
}
if (sb.onlineCaution()) {
log.logFine("remoteCrawlLoaderJob: online caution, omitting processing");
return false;
}
// check if we have an entry in the provider list, otherwise fill the list
yacySeed seed;
if ((remoteCrawlProviderHashes.size() == 0) &&
@ -271,28 +286,32 @@ public class plasmaCrawlQueues {
if (reader == null) return true;
// parse the rss
rssReader.Item item;
yacyURL url, referrer;
Date loaddate;
for (int i = 0; i < reader.items(); i++) {
item = reader.getItem(i);
//System.out.println("URL=" + item.getLink() + ", desc=" + item.getDescription() + ", pubDate=" + item.getPubDate());
// put url on remote crawl stack
yacyURL url;
try {
url = new yacyURL(item.getLink(), null);
} catch (MalformedURLException e) {
url = null;
}
Date loaddate;
try {
referrer = new yacyURL(item.getReferrer(), null);
} catch (MalformedURLException e) {
referrer = null;
}
try {
loaddate = serverDate.parseShortSecondTime(item.getPubDate());
} catch (ParseException e) {
loaddate = new Date();
}
yacyURL referrer = null; // referrer needed!
if (sb.acceptURL(url)) {
// stack url
sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
String reasonString = sb.crawlStacker.stackCrawl(url, referrer, hash, "REMOTE-CRAWLING", loaddate, 0, sb.defaultRemoteProfile);
String reasonString = sb.crawlStacker.stackCrawl(url, referrer, hash, item.getDescription(), loaddate, 0, sb.defaultRemoteProfile);
if (reasonString == null) {
// done
@ -328,20 +347,18 @@ public class plasmaCrawlQueues {
return false;
}
if (sb.sbQueue.size() >= (int) sb.getConfigLong(plasmaSwitchboard.INDEXER_SLOTS, 30)) {
log.logFine("GlobalCrawl: too many processes in indexing queue, dismissed (" +
"sbQueueSize=" + sb.sbQueue.size() + ")");
log.logFine("GlobalCrawl: too many processes in indexing queue, dismissed (" + "sbQueueSize=" + sb.sbQueue.size() + ")");
return false;
}
if (this.size() >= sb.getConfigLong(plasmaSwitchboard.CRAWLER_THREADS_ACTIVE_MAX, 10)) {
log.logFine("GlobalCrawl: too many processes in loader queue, dismissed (" +
"cacheLoader=" + this.size() + ")");
log.logFine("GlobalCrawl: too many processes in loader queue, dismissed (" + "cacheLoader=" + this.size() + ")");
return false;
}
if (sb.onlineCaution()) {
log.logFine("GlobalCrawl: online caution, omitting processing");
return false;
}
// if crawling was paused, we have to wait until we are notified to continue
Object[] status = (Object[]) sb.crawlJobsStatus.get(plasmaSwitchboard.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
synchronized(status[plasmaSwitchboard.CRAWLJOB_SYNC]) {

View File

@ -1497,6 +1497,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
public yacyURL getURL(String urlhash) {
if (urlhash == null) return null;
if (urlhash.equals(yacyURL.dummyHash)) return null;
yacyURL ne = crawlQueues.getURL(urlhash);
if (ne != null) return ne;

View File

@ -53,6 +53,7 @@ public class rssReader extends DefaultHandler {
"category", //
"title", //
"link", //
"referrer", //
"language", //
"description", //
"creator", //
@ -245,6 +246,10 @@ public class rssReader extends DefaultHandler {
return (String) map.get("link");
}
/**
 * Looks up the entry stored under the key "referrer" in {@code map}
 * and hands it back as a String.
 *
 * NOTE(review): presumably returns null when no referrer value was
 * recorded for this item -- confirm against the declaration of map.
 *
 * @return the value mapped to "referrer", cast to String
 */
public String getReferrer() {
final Object referrerValue = map.get("referrer");
return (String) referrerValue;
}
/**
 * Looks up the entry stored under the key "language" in {@code map}
 * and hands it back as a String.
 *
 * NOTE(review): presumably returns null when no language value was
 * recorded for this item -- confirm against the declaration of map.
 *
 * @return the value mapped to "language", cast to String
 */
public String getLanguage() {
final Object languageValue = map.get("language");
return (String) languageValue;
}