Added an option to put image links into the crawl queue and handle them like normal documents.

With this option (currently on by default; this might change soon), it is possible to get the EXIF data into the search index so that it can be used in image search.
2024-09-19 00:01:41 +02:00 · 2013-09-03 11:13:45 +02:00 · 2013-09-03 11:13:45 +02:00 · 69f85265e1
commit 69f85265e1
parent e8e558a9b7
3 changed files with 10 additions and 2 deletions
--- a/defaults/yacy.init
+++ b/defaults/yacy.init
@ -796,6 +796,11 @@ search.excludehosth=
 # the cases of nocache, iffresh and ifexist causes an index deletion
 search.verify.delete = true

+# images may be treated either as documents that are shown in search results or as objects
+# that are only visible in special search environments, like image search
+search.excludeintext.image = true
+crawler.load.image = true;
+
 # remote search details
 remotesearch.maxcount = 10
 remotesearch.maxtime = 3000
--- a/source/net/yacy/crawler/CrawlStacker.java
+++ b/source/net/yacy/crawler/CrawlStacker.java
@ -336,9 +336,10 @@ public final class CrawlStacker {

        // check availability of parser and maxfilesize
        String warning = null;
+        boolean loadImages = Switchboard.getSwitchboard().getConfigBool("crawler.load.image", true);
        if ((maxFileSize >= 0 && entry.size() > maxFileSize) ||
            entry.url().getContentDomain() == ContentDomain.APP  ||
-            entry.url().getContentDomain() == ContentDomain.IMAGE  ||
+            (!loadImages && entry.url().getContentDomain() == ContentDomain.IMAGE) ||
            entry.url().getContentDomain() == ContentDomain.AUDIO  ||
            entry.url().getContentDomain() == ContentDomain.VIDEO ||
            entry.url().getContentDomain() == ContentDomain.CTRL) {
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@ -2512,10 +2512,12 @@ public final class Switchboard extends serverSwitch {
           ) {
            // get the hyperlinks
            final Map<DigestURI, String> hl = Document.getHyperlinks(documents);
+            boolean loadImages = getConfigBool("crawler.load.image", true);
+            if (loadImages) hl.putAll(Document.getImagelinks(documents));
            
            // add all media links also to the crawl stack. They will be re-sorted to the NOLOAD queue and indexed afterwards as pure links
            if (response.profile().directDocByURL()) {
-                hl.putAll(Document.getImagelinks(documents));
+                if (!loadImages) hl.putAll(Document.getImagelinks(documents));
                hl.putAll(Document.getApplinks(documents));
                hl.putAll(Document.getVideolinks(documents));
                hl.putAll(Document.getAudiolinks(documents));