added an option to put image links into the crawl queue and handle these

like normal documents. With this option (currently on by default;
this might change soon), it is possible to get the EXIF data into the
search index for use in image search.
This commit is contained in:
Michael Peter Christen 2013-09-03 11:13:45 +02:00
parent e8e558a9b7
commit 69f85265e1
3 changed files with 10 additions and 2 deletions

View File

@ -796,6 +796,11 @@ search.excludehosth=
# the cases of nocache, iffresh and ifexist causes an index deletion
search.verify.delete = true
# images may be treated either as documents that are shown in search results or as objects
# that are only visible in special search environments, like image search
search.excludeintext.image = true
crawler.load.image = true
# remote search details
remotesearch.maxcount = 10
remotesearch.maxtime = 3000

View File

@ -336,9 +336,10 @@ public final class CrawlStacker {
// check availability of parser and maxfilesize
String warning = null;
boolean loadImages = Switchboard.getSwitchboard().getConfigBool("crawler.load.image", true);
if ((maxFileSize >= 0 && entry.size() > maxFileSize) ||
entry.url().getContentDomain() == ContentDomain.APP ||
entry.url().getContentDomain() == ContentDomain.IMAGE ||
(!loadImages && entry.url().getContentDomain() == ContentDomain.IMAGE) ||
entry.url().getContentDomain() == ContentDomain.AUDIO ||
entry.url().getContentDomain() == ContentDomain.VIDEO ||
entry.url().getContentDomain() == ContentDomain.CTRL) {

View File

@ -2512,10 +2512,12 @@ public final class Switchboard extends serverSwitch {
) {
// get the hyperlinks
final Map<DigestURI, String> hl = Document.getHyperlinks(documents);
boolean loadImages = getConfigBool("crawler.load.image", true);
if (loadImages) hl.putAll(Document.getImagelinks(documents));
// add all media links also to the crawl stack. They will be re-sorted to the NOLOAD queue and indexed afterwards as pure links
if (response.profile().directDocByURL()) {
hl.putAll(Document.getImagelinks(documents));
if (!loadImages) hl.putAll(Document.getImagelinks(documents));
hl.putAll(Document.getApplinks(documents));
hl.putAll(Document.getVideolinks(documents));
hl.putAll(Document.getAudiolinks(documents));