mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
added new default profiles to distinguish snippet fetch for local and global search
the difference is that a local search will now not cause a re-indexing of loaded pages git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4731 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
2c0c8f0f0c
commit
e024e3b9cf
|
@ -200,7 +200,7 @@ public class Bookmarks {
|
|||
plasmaParserDocument document = null;
|
||||
if (urlentry != null) {
|
||||
indexURLReference.Components comp = urlentry.comp();
|
||||
document = plasmaSnippetCache.retrieveDocument(comp.url(), true, 5000, true);
|
||||
document = plasmaSnippetCache.retrieveDocument(comp.url(), true, 5000, true, false);
|
||||
prop.put("mode_edit", "0"); // create mode
|
||||
prop.put("mode_url", comp.url().toNormalform(false, true));
|
||||
prop.putHTML("mode_title", comp.dc_title());
|
||||
|
|
|
@ -115,8 +115,10 @@ public class IndexCreateWWWLocalQueue_p {
|
|||
final String name = entry.name();
|
||||
if (name.equals(plasmaSwitchboard.CRAWL_PROFILE_PROXY) ||
|
||||
name.equals(plasmaSwitchboard.CRAWL_PROFILE_REMOTE) ||
|
||||
name.equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_TEXT) ||
|
||||
name.equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_MEDIA))
|
||||
name.equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT) ||
|
||||
name.equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT) ||
|
||||
name.equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA) ||
|
||||
name.equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA))
|
||||
continue;
|
||||
if (compiledPattern.matcher(name).find()) {
|
||||
sb.profilesActiveCrawls.removeEntry(entry.handle());
|
||||
|
|
|
@ -166,7 +166,7 @@ public class ViewFile {
|
|||
if (resource == null) {
|
||||
plasmaHTCache.Entry entry = null;
|
||||
try {
|
||||
entry = sb.crawlQueues.loadResourceFromWeb(url, 5000, false, true);
|
||||
entry = sb.crawlQueues.loadResourceFromWeb(url, 5000, false, true, false);
|
||||
} catch (Exception e) {
|
||||
prop.put("error", "4");
|
||||
prop.putHTML("error_errorText", e.getMessage());
|
||||
|
|
|
@ -98,7 +98,7 @@ public class ViewImage {
|
|||
// getting the image as stream
|
||||
Image scaled = iconcache.get(urlString);
|
||||
if (scaled == null) {
|
||||
Object[] resource = plasmaSnippetCache.getResource(url, true, timeout, false);
|
||||
Object[] resource = plasmaSnippetCache.getResource(url, true, timeout, false, true);
|
||||
byte[] imgb = null;
|
||||
if (resource == null) {
|
||||
if (urlString.endsWith(".ico")) {
|
||||
|
|
|
@ -37,8 +37,10 @@ public class WatchWebStructure_p {
|
|||
e = it.next();
|
||||
if (e.name().equals(plasmaSwitchboard.CRAWL_PROFILE_PROXY) ||
|
||||
e.name().equals(plasmaSwitchboard.CRAWL_PROFILE_REMOTE) ||
|
||||
e.name().equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_TEXT) ||
|
||||
e.name().equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_MEDIA))
|
||||
e.name().equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT) ||
|
||||
e.name().equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT) ||
|
||||
e.name().equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA) ||
|
||||
e.name().equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA))
|
||||
continue;
|
||||
host = e.name();
|
||||
break; // take the first one
|
||||
|
|
|
@ -200,7 +200,7 @@ public class ymarks {
|
|||
plasmaParserDocument document = null;
|
||||
if (urlentry != null) {
|
||||
indexURLReference.Components comp = urlentry.comp();
|
||||
document = plasmaSnippetCache.retrieveDocument(comp.url(), true, 5000, true);
|
||||
document = plasmaSnippetCache.retrieveDocument(comp.url(), true, 5000, true, true);
|
||||
prop.put("mode_edit", "0"); // create mode
|
||||
prop.put("mode_url", comp.url().toNormalform(false, true));
|
||||
prop.putHTML("mode_title", comp.dc_title());
|
||||
|
|
|
@ -53,7 +53,7 @@ public class sidebar_history {
|
|||
if (visibleQueries.contains(query.queryString)) continue; // avoid doubles
|
||||
visibleQueries.add(query.queryString);
|
||||
prop.put("history_list_" + c + "_querystring", query.queryString);
|
||||
prop.put("history_list_" + c + "_searchdom", query.searchdom());
|
||||
prop.put("history_list_" + c + "_searchdom", ((query.isLocal()) ? "local" : "global"));
|
||||
prop.put("history_list_" + c + "_contentdom", query.contentdom());
|
||||
c++;
|
||||
if (c >= 10) break;
|
||||
|
|
|
@ -117,7 +117,7 @@ public class sidebar_navigation {
|
|||
prop.put("navigation_topwords_words_" + hintcount + "_count", theQuery.displayResults());
|
||||
prop.put("navigation_topwords_words_" + hintcount + "_offset", "0");
|
||||
prop.put("navigation_topwords_words_" + hintcount + "_contentdom", theQuery.contentdom());
|
||||
prop.put("navigation_topwords_words_" + hintcount + "_resource", theQuery.searchdom());
|
||||
prop.put("navigation_topwords_words_" + hintcount + "_resource", ((theQuery.isLocal()) ? "local" : "global"));
|
||||
prop.put("navigation_topwords_words_" + hintcount + "_zonecode", theQuery.zonecode);
|
||||
}
|
||||
hintcount++;
|
||||
|
@ -182,7 +182,7 @@ public class sidebar_navigation {
|
|||
"<a href=\"ysearch.html?search=" + theQuery.queryString() +
|
||||
"&count="+ theQuery.displayResults() +
|
||||
"&offset=" + (page * theQuery.displayResults()) +
|
||||
"&resource=" + theQuery.searchdom() +
|
||||
"&resource=" + ((theQuery.isLocal()) ? "local" : "global") +
|
||||
"&urlmaskfilter=" + theQuery.urlMask +
|
||||
"&prefermaskfilter=" + theQuery.prefer +
|
||||
"&cat=href&constraint=" + ((theQuery.constraint == null) ? "" : theQuery.constraint.exportB64()) +
|
||||
|
@ -195,7 +195,7 @@ public class sidebar_navigation {
|
|||
prop.putHTML("navigation_languagezone_" + zonename + "_search", theQuery.queryString.replace(' ', '+'));
|
||||
prop.put("navigation_languagezone_" + zonename + "_offset", "0");
|
||||
prop.put("navigation_languagezone_" + zonename + "_contentdom", theQuery.contentdom());
|
||||
prop.put("navigation_languagezone_" + zonename + "_resource", theQuery.searchdom());
|
||||
prop.put("navigation_languagezone_" + zonename + "_resource", ((theQuery.isLocal()) ? "local" : "global"));
|
||||
prop.put("navigation_languagezone_" + zonename, 1);
|
||||
}
|
||||
|
||||
|
|
|
@ -222,7 +222,7 @@ public class yacysearch {
|
|||
if (urlentry != null) {
|
||||
indexURLReference.Components comp = urlentry.comp();
|
||||
plasmaParserDocument document;
|
||||
document = plasmaSnippetCache.retrieveDocument(comp.url(), true, 5000, true);
|
||||
document = plasmaSnippetCache.retrieveDocument(comp.url(), true, 5000, true, false);
|
||||
if (document != null) {
|
||||
// create a news message
|
||||
HashMap<String, String> map = new HashMap<String, String>();
|
||||
|
@ -417,7 +417,7 @@ public class yacysearch {
|
|||
"&search=" + theQuery.queryString() +
|
||||
"&count="+ theQuery.displayResults() +
|
||||
"&offset=" + (page * theQuery.displayResults()) +
|
||||
"&resource=" + theQuery.searchdom() +
|
||||
"&resource=" + ((theQuery.isLocal()) ? "local" : "global") +
|
||||
"&urlmaskfilter=" + theQuery.urlMask +
|
||||
"&prefermaskfilter=" + theQuery.prefer +
|
||||
"&cat=href&constraint=" + ((theQuery.constraint == null) ? "" : theQuery.constraint.exportB64()) +
|
||||
|
|
|
@ -155,7 +155,7 @@ public class yacysearchitem {
|
|||
prop.put("references_words_" + hintcount + "_count", theQuery.displayResults());
|
||||
prop.put("references_words_" + hintcount + "_offset", "0");
|
||||
prop.put("references_words_" + hintcount + "_contentdom", theQuery.contentdom());
|
||||
prop.put("references_words_" + hintcount + "_resource", theQuery.searchdom());
|
||||
prop.put("references_words_" + hintcount + "_resource", ((theQuery.isLocal()) ? "local" : "global"));
|
||||
}
|
||||
prop.put("references_words", hintcount);
|
||||
if (hintcount++ > MAX_TOPWORDS) {
|
||||
|
|
|
@ -436,7 +436,8 @@ public class plasmaCrawlQueues {
|
|||
yacyURL url,
|
||||
int socketTimeout,
|
||||
boolean keepInMemory,
|
||||
boolean forText
|
||||
boolean forText,
|
||||
boolean global
|
||||
) {
|
||||
|
||||
plasmaCrawlEntry centry = new plasmaCrawlEntry(
|
||||
|
@ -445,7 +446,14 @@ public class plasmaCrawlQueues {
|
|||
null,
|
||||
"",
|
||||
new Date(),
|
||||
(forText) ? sb.defaultTextSnippetProfile.handle() : sb.defaultMediaSnippetProfile.handle(), // crawl profile
|
||||
(forText) ?
|
||||
((global) ?
|
||||
sb.defaultTextSnippetGlobalProfile.handle() :
|
||||
sb.defaultTextSnippetLocalProfile.handle())
|
||||
:
|
||||
((global) ?
|
||||
sb.defaultMediaSnippetGlobalProfile.handle() :
|
||||
sb.defaultMediaSnippetLocalProfile.handle()), // crawl profile
|
||||
0,
|
||||
0,
|
||||
0);
|
||||
|
|
|
@ -667,7 +667,7 @@ public final class plasmaParser {
|
|||
}
|
||||
|
||||
if (!documentCharset.equalsIgnoreCase(charset)) {
|
||||
this.theLogger.logInfo("Charset transformation needed from '" + documentCharset + "' to '" + charset + "'");
|
||||
this.theLogger.logInfo("Charset transformation needed from '" + documentCharset + "' to '" + charset + "' for URL = " + location.toNormalform(true, true));
|
||||
}
|
||||
|
||||
// parsing the content
|
||||
|
|
|
@ -350,7 +350,7 @@ public final class plasmaSearchEvent {
|
|||
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_TEXT) {
|
||||
// attach text snippet
|
||||
startTime = System.currentTimeMillis();
|
||||
plasmaSnippetCache.TextSnippet snippet = plasmaSnippetCache.retrieveTextSnippet(comp, snippetFetchWordHashes, (snippetFetchMode == 2), ((query.constraint != null) && (query.constraint.get(plasmaCondenser.flag_cat_indexof))), 180, 3000, (snippetFetchMode == 2) ? Integer.MAX_VALUE : 100000);
|
||||
plasmaSnippetCache.TextSnippet snippet = plasmaSnippetCache.retrieveTextSnippet(comp, snippetFetchWordHashes, (snippetFetchMode == 2), ((query.constraint != null) && (query.constraint.get(plasmaCondenser.flag_cat_indexof))), 180, 3000, (snippetFetchMode == 2) ? Integer.MAX_VALUE : 100000, query.isGlobal());
|
||||
long snippetComputationTime = System.currentTimeMillis() - startTime;
|
||||
serverLog.logInfo("SEARCH_EVENT", "text snippet load time for " + comp.url() + ": " + snippetComputationTime + ", " + ((snippet.getErrorCode() < 11) ? "snippet found" : ("no snippet found (" + snippet.getError() + ")")));
|
||||
|
||||
|
@ -370,7 +370,7 @@ public final class plasmaSearchEvent {
|
|||
} else {
|
||||
// attach media information
|
||||
startTime = System.currentTimeMillis();
|
||||
ArrayList<MediaSnippet> mediaSnippets = plasmaSnippetCache.retrieveMediaSnippets(comp.url(), snippetFetchWordHashes, query.contentdom, (snippetFetchMode == 2), 6000);
|
||||
ArrayList<MediaSnippet> mediaSnippets = plasmaSnippetCache.retrieveMediaSnippets(comp.url(), snippetFetchWordHashes, query.contentdom, (snippetFetchMode == 2), 6000, query.isGlobal());
|
||||
long snippetComputationTime = System.currentTimeMillis() - startTime;
|
||||
serverLog.logInfo("SEARCH_EVENT", "media snippet load time for " + comp.url() + ": " + snippetComputationTime);
|
||||
|
||||
|
|
|
@ -56,11 +56,11 @@ public final class plasmaSearchImages {
|
|||
|
||||
private HashMap<String, htmlFilterImageEntry> images;
|
||||
|
||||
public plasmaSearchImages(long maxTime, yacyURL url, int depth) {
|
||||
public plasmaSearchImages(long maxTime, yacyURL url, int depth, boolean indexing) {
|
||||
long start = System.currentTimeMillis();
|
||||
this.images = new HashMap<String, htmlFilterImageEntry>();
|
||||
if (maxTime > 10) {
|
||||
Object[] resource = plasmaSnippetCache.getResource(url, true, (int) maxTime, false);
|
||||
Object[] resource = plasmaSnippetCache.getResource(url, true, (int) maxTime, false, indexing);
|
||||
InputStream res = (InputStream) resource[0];
|
||||
Long resLength = (Long) resource[1];
|
||||
if (res != null) {
|
||||
|
@ -85,7 +85,7 @@ public final class plasmaSearchImages {
|
|||
while (i.hasNext()) {
|
||||
try {
|
||||
nexturlstring = i.next().toNormalform(true, true);
|
||||
addAll(new plasmaSearchImages(serverDate.remainingTime(start, maxTime, 10), new yacyURL(nexturlstring, null), depth - 1));
|
||||
addAll(new plasmaSearchImages(serverDate.remainingTime(start, maxTime, 10), new yacyURL(nexturlstring, null), depth - 1, indexing));
|
||||
} catch (MalformedURLException e1) {
|
||||
e1.printStackTrace();
|
||||
}
|
||||
|
|
|
@ -193,8 +193,12 @@ public final class plasmaSearchQuery {
|
|||
return "text";
|
||||
}
|
||||
|
||||
public String searchdom() {
|
||||
return (this.domType == SEARCHDOM_LOCAL) ? "local" : "global";
|
||||
public boolean isGlobal() {
|
||||
return this.domType != SEARCHDOM_LOCAL;
|
||||
}
|
||||
|
||||
public boolean isLocal() {
|
||||
return this.domType != SEARCHDOM_LOCAL;
|
||||
}
|
||||
|
||||
public static TreeSet<String> hashes2Set(String query) {
|
||||
|
|
|
@ -255,7 +255,7 @@ public class plasmaSnippetCache {
|
|||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public static TextSnippet retrieveTextSnippet(indexURLReference.Components comp, Set<String> queryhashes, boolean fetchOnline, boolean pre, int snippetMaxLength, int timeout, int maxDocLen) {
|
||||
public static TextSnippet retrieveTextSnippet(indexURLReference.Components comp, Set<String> queryhashes, boolean fetchOnline, boolean pre, int snippetMaxLength, int timeout, int maxDocLen, boolean reindexing) {
|
||||
// heise = "0OQUNU3JSs05"
|
||||
yacyURL url = comp.url();
|
||||
if (queryhashes.size() == 0) {
|
||||
|
@ -305,7 +305,7 @@ public class plasmaSnippetCache {
|
|||
// if not found try to download it
|
||||
|
||||
// download resource using the crawler and keep resource in memory if possible
|
||||
plasmaHTCache.Entry entry = plasmaSwitchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(url, timeout, true, true);
|
||||
plasmaHTCache.Entry entry = plasmaSwitchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(url, timeout, true, true, reindexing);
|
||||
|
||||
// place entry on crawl queue
|
||||
plasmaHTCache.push(entry);
|
||||
|
@ -398,9 +398,10 @@ public class plasmaSnippetCache {
|
|||
* @param fetchOnline specifies if the resource should be loaded from web if it's not available in the cache
|
||||
* @param timeout
|
||||
* @param forText
|
||||
* @param global the domain of the search. If global == true then the content is re-indexed
|
||||
* @return the parsed document as {@link plasmaParserDocument}
|
||||
*/
|
||||
public static plasmaParserDocument retrieveDocument(yacyURL url, boolean fetchOnline, int timeout, boolean forText) {
|
||||
public static plasmaParserDocument retrieveDocument(yacyURL url, boolean fetchOnline, int timeout, boolean forText, boolean global) {
|
||||
|
||||
// load resource
|
||||
long resContentLength = 0;
|
||||
|
@ -416,7 +417,7 @@ public class plasmaSnippetCache {
|
|||
// if not found try to download it
|
||||
|
||||
// download resource using the crawler and keep resource in memory if possible
|
||||
plasmaHTCache.Entry entry = plasmaSwitchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(url, timeout, true, forText);
|
||||
plasmaHTCache.Entry entry = plasmaSwitchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(url, timeout, true, forText, global);
|
||||
|
||||
// getting resource metadata (e.g. the http headers for http resources)
|
||||
if (entry != null) {
|
||||
|
@ -648,13 +649,13 @@ public class plasmaSnippetCache {
|
|||
}
|
||||
}
|
||||
|
||||
public static ArrayList<MediaSnippet> retrieveMediaSnippets(yacyURL url, Set<String> queryhashes, int mediatype, boolean fetchOnline, int timeout) {
|
||||
public static ArrayList<MediaSnippet> retrieveMediaSnippets(yacyURL url, Set<String> queryhashes, int mediatype, boolean fetchOnline, int timeout, boolean reindexing) {
|
||||
if (queryhashes.size() == 0) {
|
||||
serverLog.logFine("snippet fetch", "no query hashes given for url " + url);
|
||||
return new ArrayList<MediaSnippet>();
|
||||
}
|
||||
|
||||
plasmaParserDocument document = retrieveDocument(url, fetchOnline, timeout, false);
|
||||
plasmaParserDocument document = retrieveDocument(url, fetchOnline, timeout, false, reindexing);
|
||||
ArrayList<MediaSnippet> a = new ArrayList<MediaSnippet>();
|
||||
if (document != null) {
|
||||
if ((mediatype == plasmaSearchQuery.CONTENTDOM_ALL) || (mediatype == plasmaSearchQuery.CONTENTDOM_AUDIO)) a.addAll(computeMediaSnippets(document, queryhashes, plasmaSearchQuery.CONTENTDOM_AUDIO));
|
||||
|
@ -860,7 +861,7 @@ public class plasmaSnippetCache {
|
|||
* <tr><td>[1]</td><td>the content-length as {@link Long}</td></tr>
|
||||
* </table>
|
||||
*/
|
||||
public static Object[] getResource(yacyURL url, boolean fetchOnline, int socketTimeout, boolean forText) {
|
||||
public static Object[] getResource(yacyURL url, boolean fetchOnline, int socketTimeout, boolean forText, boolean reindexing) {
|
||||
// load the url as resource from the web
|
||||
long contentLength = -1;
|
||||
|
||||
|
@ -872,7 +873,7 @@ public class plasmaSnippetCache {
|
|||
// if the content is not available in cache try to download it from web
|
||||
|
||||
// try to download the resource using a crawler
|
||||
plasmaHTCache.Entry entry = plasmaSwitchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(url, (socketTimeout < 0) ? -1 : socketTimeout, true, forText);
|
||||
plasmaHTCache.Entry entry = plasmaSwitchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(url, (socketTimeout < 0) ? -1 : socketTimeout, true, forText, reindexing);
|
||||
if (entry == null) return null; // not found in web
|
||||
|
||||
// read resource body (if it is there)
|
||||
|
|
|
@ -212,8 +212,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
|
|||
public plasmaCrawlProfile profilesActiveCrawls, profilesPassiveCrawls;
|
||||
public plasmaCrawlProfile.entry defaultProxyProfile;
|
||||
public plasmaCrawlProfile.entry defaultRemoteProfile;
|
||||
public plasmaCrawlProfile.entry defaultTextSnippetProfile;
|
||||
public plasmaCrawlProfile.entry defaultMediaSnippetProfile;
|
||||
public plasmaCrawlProfile.entry defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile;
|
||||
public plasmaCrawlProfile.entry defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile;
|
||||
public boolean rankingOn;
|
||||
public plasmaRankingDistribution rankingOwnDistribution;
|
||||
public plasmaRankingDistribution rankingOtherDistribution;
|
||||
|
@ -648,10 +648,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
|
|||
// Miscellaneous settings
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
public static final String CRAWL_PROFILE_PROXY = "proxy";
|
||||
public static final String CRAWL_PROFILE_REMOTE = "remote";
|
||||
public static final String CRAWL_PROFILE_SNIPPET_TEXT = "snippetText";
|
||||
public static final String CRAWL_PROFILE_SNIPPET_MEDIA = "snippetMedia";
|
||||
public static final String CRAWL_PROFILE_PROXY = "proxy";
|
||||
public static final String CRAWL_PROFILE_REMOTE = "remote";
|
||||
public static final String CRAWL_PROFILE_SNIPPET_LOCAL_TEXT = "snippetLocalText";
|
||||
public static final String CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT = "snippetGlobalText";
|
||||
public static final String CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA = "snippetLocalMedia";
|
||||
public static final String CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA = "snippetGlobalMedia";
|
||||
|
||||
/**
|
||||
* <p><code>public static final String <strong>CRAWLER_THREADS_ACTIVE_MAX</strong> = "crawler.MaxActiveThreads"</code></p>
|
||||
|
@ -1511,8 +1513,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
|
|||
private void initActiveCrawlProfiles() {
|
||||
this.defaultProxyProfile = null;
|
||||
this.defaultRemoteProfile = null;
|
||||
this.defaultTextSnippetProfile = null;
|
||||
this.defaultMediaSnippetProfile = null;
|
||||
this.defaultTextSnippetLocalProfile = null;
|
||||
this.defaultTextSnippetGlobalProfile = null;
|
||||
this.defaultMediaSnippetLocalProfile = null;
|
||||
this.defaultMediaSnippetGlobalProfile = null;
|
||||
Iterator<plasmaCrawlProfile.entry> i = this.profilesActiveCrawls.profiles(true);
|
||||
plasmaCrawlProfile.entry profile;
|
||||
String name;
|
||||
|
@ -1521,8 +1525,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
|
|||
name = profile.name();
|
||||
if (name.equals(CRAWL_PROFILE_PROXY)) this.defaultProxyProfile = profile;
|
||||
if (name.equals(CRAWL_PROFILE_REMOTE)) this.defaultRemoteProfile = profile;
|
||||
if (name.equals(CRAWL_PROFILE_SNIPPET_TEXT)) this.defaultTextSnippetProfile = profile;
|
||||
if (name.equals(CRAWL_PROFILE_SNIPPET_MEDIA)) this.defaultMediaSnippetProfile = profile;
|
||||
if (name.equals(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT)) this.defaultTextSnippetLocalProfile = profile;
|
||||
if (name.equals(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT)) this.defaultTextSnippetGlobalProfile = profile;
|
||||
if (name.equals(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA)) this.defaultMediaSnippetLocalProfile = profile;
|
||||
if (name.equals(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)) this.defaultMediaSnippetGlobalProfile = profile;
|
||||
}
|
||||
if (this.defaultProxyProfile == null) {
|
||||
// generate new default entry for proxy crawling
|
||||
|
@ -1540,14 +1546,24 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
|
|||
defaultRemoteProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_REMOTE, null, ".*", ".*", 0, 0,
|
||||
-1, -1, -1, true, true, true, false, true, false, true, true, false);
|
||||
}
|
||||
if (this.defaultTextSnippetProfile == null) {
|
||||
if (this.defaultTextSnippetLocalProfile == null) {
|
||||
// generate new default entry for snippet fetch and optional crawling
|
||||
defaultTextSnippetProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_TEXT, null, ".*", ".*", 0, 0,
|
||||
defaultTextSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, ".*", ".*", 0, 0,
|
||||
60 * 24 * 30, -1, -1, true, false, false, false, false, false, true, true, false);
|
||||
}
|
||||
if (this.defaultTextSnippetGlobalProfile == null) {
|
||||
// generate new default entry for snippet fetch and optional crawling
|
||||
defaultTextSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, ".*", ".*", 0, 0,
|
||||
60 * 24 * 30, -1, -1, true, true, true, true, true, false, true, true, false);
|
||||
}
|
||||
if (this.defaultMediaSnippetProfile == null) {
|
||||
if (this.defaultMediaSnippetLocalProfile == null) {
|
||||
// generate new default entry for snippet fetch and optional crawling
|
||||
defaultMediaSnippetProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_MEDIA, null, ".*", ".*", 0, 0,
|
||||
defaultMediaSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, ".*", ".*", 0, 0,
|
||||
60 * 24 * 30, -1, -1, true, false, false, false, false, false, true, true, false);
|
||||
}
|
||||
if (this.defaultMediaSnippetGlobalProfile == null) {
|
||||
// generate new default entry for snippet fetch and optional crawling
|
||||
defaultMediaSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, ".*", ".*", 0, 0,
|
||||
60 * 24 * 30, -1, -1, true, false, true, true, true, false, true, true, false);
|
||||
}
|
||||
}
|
||||
|
@ -1598,8 +1614,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
|
|||
entry = iter.next();
|
||||
if (!((entry.name().equals(CRAWL_PROFILE_PROXY)) ||
|
||||
(entry.name().equals(CRAWL_PROFILE_REMOTE)) ||
|
||||
(entry.name().equals(CRAWL_PROFILE_SNIPPET_TEXT)) ||
|
||||
(entry.name().equals(CRAWL_PROFILE_SNIPPET_MEDIA)))) {
|
||||
(entry.name().equals(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT)) ||
|
||||
(entry.name().equals(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT)) ||
|
||||
(entry.name().equals(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA)) ||
|
||||
(entry.name().equals(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)))) {
|
||||
profilesPassiveCrawls.newEntry(entry.map());
|
||||
iter.remove();
|
||||
hasDoneSomething = true;
|
||||
|
@ -2253,7 +2271,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
|
|||
// check for interruption
|
||||
checkInterruption();
|
||||
|
||||
log.logInfo("Not indexed any word in URL " + entry.url() + "; cause: " + noIndexReason);
|
||||
log.logFine("Not indexed any word in URL " + entry.url() + "; cause: " + noIndexReason);
|
||||
addURLtoErrorDB(entry.url(), (referrerURL == null) ? null : referrerURL.hash(), entry.initiator(), dc_title, noIndexReason, new kelondroBitfield());
|
||||
/*
|
||||
if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) {
|
||||
|
@ -2400,7 +2418,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
|
|||
InputStream resourceContent = null;
|
||||
try {
|
||||
// get the resource content
|
||||
Object[] resource = plasmaSnippetCache.getResource(comp.url(), fetchOnline, 10000, true);
|
||||
Object[] resource = plasmaSnippetCache.getResource(comp.url(), fetchOnline, 10000, true, false);
|
||||
resourceContent = (InputStream) resource[0];
|
||||
Long resourceContentLength = (Long) resource[1];
|
||||
|
||||
|
|
|
@ -34,7 +34,6 @@ import java.io.FileOutputStream;
|
|||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
|
|
|
@ -78,7 +78,7 @@ public class ymageOSM {
|
|||
InputStream tileStream = plasmaHTCache.getResourceContentStream(tileURL);
|
||||
if (tileStream == null) {
|
||||
// download resource using the crawler and keep resource in memory if possible
|
||||
plasmaHTCache.Entry entry = plasmaSwitchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(tileURL, 20000, true, false);
|
||||
plasmaHTCache.Entry entry = plasmaSwitchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(tileURL, 20000, true, false, false);
|
||||
if ((entry == null) || (entry.cacheArray() == null)) return null;
|
||||
tileStream = new ByteArrayInputStream(entry.cacheArray());
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user