mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
added referrer to remote crawl url list
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4236 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
18e516317d
commit
9b0ae4b989
|
@ -35,6 +35,7 @@ import de.anomic.server.serverObjects;
|
|||
import de.anomic.server.serverSwitch;
|
||||
import de.anomic.yacy.yacyCore;
|
||||
import de.anomic.yacy.yacyNetwork;
|
||||
import de.anomic.yacy.yacyURL;
|
||||
|
||||
public class urls {
|
||||
|
||||
|
@ -60,6 +61,7 @@ public class urls {
|
|||
int count = Math.min(100, post.getInt("count", 0));
|
||||
int c = 0;
|
||||
plasmaCrawlEntry entry;
|
||||
yacyURL referrer;
|
||||
while ((count > 0) && (sb.crawlQueues.noticeURL.stackSize(stackType) > 0)) {
|
||||
try {
|
||||
entry = sb.crawlQueues.noticeURL.pop(stackType, false);
|
||||
|
@ -67,11 +69,14 @@ public class urls {
|
|||
break;
|
||||
}
|
||||
if (entry == null) break;
|
||||
// find referrer, if there is one
|
||||
referrer = sb.getURL(entry.referrerhash());
|
||||
// record the url in the delegated-url db
|
||||
sb.crawlQueues.delegatedURL.push(sb.crawlQueues.delegatedURL.newEntry(entry.url(), "client=____________"));
|
||||
// create RSS entry
|
||||
prop.put("item_" + c + "_title", "");
|
||||
prop.putHTML("item_" + c + "_link", entry.url().toNormalform(true, false));
|
||||
prop.putHTML("item_" + c + "_referrer", (referrer == null) ? "" : referrer.toNormalform(true, false));
|
||||
prop.putHTML("item_" + c + "_description", entry.name());
|
||||
prop.put("item_" + c + "_author", "");
|
||||
prop.put("item_" + c + "_pubDate", serverDate.shortSecondTime(entry.appdate()));
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
<item>
|
||||
<title>#[title]#</title>
|
||||
<link>#[link]#</link>
|
||||
<referrer>#[referrer]#</referrer>
|
||||
<description>#[description]#</description>
|
||||
<author>#[author]#</author>
|
||||
<pubDate>#[pubDate]#</pubDate>
|
||||
|
|
|
@ -1454,10 +1454,16 @@ public final class httpc {
|
|||
public void writeX(InputStream source, OutputStream procOS, OutputStream bufferOS) {
|
||||
byte[] buffer = new byte[2048];
|
||||
int l, c = 0;
|
||||
lastIO = System.currentTimeMillis();
|
||||
|
||||
while (true) try {
|
||||
io: while (true) try {
|
||||
l = source.read(buffer, 0, buffer.length);
|
||||
if (l <= 0) break;
|
||||
if (l < 0) break;
|
||||
if (l == 0) try {
|
||||
if (System.currentTimeMillis() - lastIO > 30000) break;
|
||||
this.wait(300);
|
||||
continue io;
|
||||
} catch (InterruptedException e) {} // may happen without EOF
|
||||
lastIO = System.currentTimeMillis();
|
||||
c += l;
|
||||
if (procOS != null) procOS.write(buffer, 0, l);
|
||||
|
@ -1479,10 +1485,16 @@ public final class httpc {
|
|||
OutputStreamWriter bufferOSWriter = (bufferOS == null) ? null : new OutputStreamWriter(bufferOS,outputCharset);
|
||||
char[] buffer = new char[2048];
|
||||
int l, c= 0;
|
||||
lastIO = System.currentTimeMillis();
|
||||
|
||||
while (true) try{
|
||||
io: while (true) try{
|
||||
l = sourceReader.read(buffer, 0, buffer.length);
|
||||
if (l <= 0) break;
|
||||
if (l < 0) break;
|
||||
if (l == 0) try {
|
||||
if (System.currentTimeMillis() - lastIO > 30000) break;
|
||||
this.wait(300);
|
||||
continue io;
|
||||
} catch (InterruptedException e) {} // may happen without EOF
|
||||
lastIO = System.currentTimeMillis();
|
||||
c += l;
|
||||
if (procOS != null) procOS.write(buffer, 0, l);
|
||||
|
|
|
@ -238,6 +238,21 @@ public class plasmaCrawlQueues {
|
|||
return false;
|
||||
}
|
||||
|
||||
if (sb.sbQueue.size() >= (int) sb.getConfigLong(plasmaSwitchboard.INDEXER_SLOTS, 30)) {
|
||||
log.logFine("remoteCrawlLoaderJob: too many processes in indexing queue, dismissed (" + "sbQueueSize=" + sb.sbQueue.size() + ")");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (this.size() >= sb.getConfigLong(plasmaSwitchboard.CRAWLER_THREADS_ACTIVE_MAX, 10)) {
|
||||
log.logFine("remoteCrawlLoaderJob: too many processes in loader queue, dismissed (" + "cacheLoader=" + this.size() + ")");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (sb.onlineCaution()) {
|
||||
log.logFine("remoteCrawlLoaderJob: online caution, omitting processing");
|
||||
return false;
|
||||
}
|
||||
|
||||
// check if we have an entry in the provider list, otherwise fill the list
|
||||
yacySeed seed;
|
||||
if ((remoteCrawlProviderHashes.size() == 0) &&
|
||||
|
@ -271,28 +286,32 @@ public class plasmaCrawlQueues {
|
|||
if (reader == null) return true;
|
||||
// parse the rss
|
||||
rssReader.Item item;
|
||||
yacyURL url, referrer;
|
||||
Date loaddate;
|
||||
for (int i = 0; i < reader.items(); i++) {
|
||||
item = reader.getItem(i);
|
||||
//System.out.println("URL=" + item.getLink() + ", desc=" + item.getDescription() + ", pubDate=" + item.getPubDate());
|
||||
|
||||
// put url on remote crawl stack
|
||||
yacyURL url;
|
||||
try {
|
||||
url = new yacyURL(item.getLink(), null);
|
||||
} catch (MalformedURLException e) {
|
||||
url = null;
|
||||
}
|
||||
Date loaddate;
|
||||
try {
|
||||
referrer = new yacyURL(item.getReferrer(), null);
|
||||
} catch (MalformedURLException e) {
|
||||
referrer = null;
|
||||
}
|
||||
try {
|
||||
loaddate = serverDate.parseShortSecondTime(item.getPubDate());
|
||||
} catch (ParseException e) {
|
||||
loaddate = new Date();
|
||||
}
|
||||
yacyURL referrer = null; // referrer needed!
|
||||
if (sb.acceptURL(url)) {
|
||||
// stack url
|
||||
sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
|
||||
String reasonString = sb.crawlStacker.stackCrawl(url, referrer, hash, "REMOTE-CRAWLING", loaddate, 0, sb.defaultRemoteProfile);
|
||||
String reasonString = sb.crawlStacker.stackCrawl(url, referrer, hash, item.getDescription(), loaddate, 0, sb.defaultRemoteProfile);
|
||||
|
||||
if (reasonString == null) {
|
||||
// done
|
||||
|
@ -328,13 +347,11 @@ public class plasmaCrawlQueues {
|
|||
return false;
|
||||
}
|
||||
if (sb.sbQueue.size() >= (int) sb.getConfigLong(plasmaSwitchboard.INDEXER_SLOTS, 30)) {
|
||||
log.logFine("GlobalCrawl: too many processes in indexing queue, dismissed (" +
|
||||
"sbQueueSize=" + sb.sbQueue.size() + ")");
|
||||
log.logFine("GlobalCrawl: too many processes in indexing queue, dismissed (" + "sbQueueSize=" + sb.sbQueue.size() + ")");
|
||||
return false;
|
||||
}
|
||||
if (this.size() >= sb.getConfigLong(plasmaSwitchboard.CRAWLER_THREADS_ACTIVE_MAX, 10)) {
|
||||
log.logFine("GlobalCrawl: too many processes in loader queue, dismissed (" +
|
||||
"cacheLoader=" + this.size() + ")");
|
||||
log.logFine("GlobalCrawl: too many processes in loader queue, dismissed (" + "cacheLoader=" + this.size() + ")");
|
||||
return false;
|
||||
}
|
||||
if (sb.onlineCaution()) {
|
||||
|
|
|
@ -1497,6 +1497,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
|
|||
}
|
||||
|
||||
public yacyURL getURL(String urlhash) {
|
||||
if (urlhash == null) return null;
|
||||
if (urlhash.equals(yacyURL.dummyHash)) return null;
|
||||
yacyURL ne = crawlQueues.getURL(urlhash);
|
||||
if (ne != null) return ne;
|
||||
|
|
|
@ -53,6 +53,7 @@ public class rssReader extends DefaultHandler {
|
|||
"category", //
|
||||
"title", //
|
||||
"link", //
|
||||
"referrer", //
|
||||
"language", //
|
||||
"description", //
|
||||
"creator", //
|
||||
|
@ -245,6 +246,10 @@ public class rssReader extends DefaultHandler {
|
|||
return (String) map.get("link");
|
||||
}
|
||||
|
||||
public String getReferrer() {
|
||||
return (String) map.get("referrer");
|
||||
}
|
||||
|
||||
public String getLanguage() {
|
||||
return (String) map.get("language");
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user