- enhanced recognition, parsing, management and double-occurrence-handling of image tags

- enhanced text parser (condenser): found and eliminated bad code parts; increased speed
- added handling of image preview using the image cache from HTCACHE
- some other minor changes

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4507 6c8d7289-2bf4-0310-a012-ef5d649a1542
2024-09-19 00:01:41 +02:00 · 2008-02-25 14:08:15 +00:00 · 2008-02-25 14:08:15 +00:00 · 87a8747ce3
commit 87a8747ce3
parent fcc919964b
17 changed files with 161 additions and 111 deletions
--- a/htroot/CacheAdmin_p.java
+++ b/htroot/CacheAdmin_p.java
@ -54,6 +54,7 @@ import java.io.File;
 import java.io.FilenameFilter;
 import java.io.IOException;
 import java.io.Writer;
+import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Map;
 import java.util.TreeSet;
@ -313,8 +314,8 @@ public class CacheAdmin_p {
        prop.put("info_type_use." + extension, (i == 0) ? 0 : 1);
    }

-    private static void formatImageAnchor(serverObjects prop, TreeSet<htmlFilterImageEntry> anchor) {
-        final Iterator<htmlFilterImageEntry> iter = anchor.iterator();
+    private static void formatImageAnchor(serverObjects prop, HashMap<String, htmlFilterImageEntry> anchor) {
+        final Iterator<htmlFilterImageEntry> iter = anchor.values().iterator();
        htmlFilterImageEntry ie;
        prop.put("info_type_use.images_images", anchor.size());
        int i = 0;
--- a/htroot/ViewFile.java
+++ b/htroot/ViewFile.java
@ -49,9 +49,9 @@ import java.io.InputStream;
 import java.io.UnsupportedEncodingException;
 import java.net.MalformedURLException;
 import java.net.URLDecoder;
+import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Map;
-import java.util.TreeSet;

 import de.anomic.data.htmlTools;
 import de.anomic.htmlFilter.htmlFilterImageEntry;
@ -339,8 +339,8 @@ public class ViewFile {
                i += putMediaInfo(prop, wordArray, i, document.getAudiolinks(), "audio", (i % 2 == 0));
                dark = (i % 2 == 0);
                
-                TreeSet<htmlFilterImageEntry> ts = document.getImages();
-                Iterator<htmlFilterImageEntry> tsi = ts.iterator();
+                HashMap<String, htmlFilterImageEntry> ts = document.getImages();
+                Iterator<htmlFilterImageEntry> tsi = ts.values().iterator();
                htmlFilterImageEntry entry;
                while (tsi.hasNext()) {
                    entry = tsi.next();
--- a/htroot/ViewImage.java
+++ b/htroot/ViewImage.java
@ -135,47 +135,47 @@ public class ViewImage {
            // find original size
            int h = image.getHeight(null);
            int w = image.getWidth(null);
-
-            // System.out.println("DEBUG: get access to image " +
-            // url.toNormalform() + " is " + ((auth) ? "authorized" : "NOT
-            // authorized"));
-
+            
            // in case of not-authorized access shrink the image to prevent
-            // copyright problems
-            // so that images are not larger than thumbnails
-            if ((!auth) && ((w > 16) || (h > 16))) {
+            // copyright problems, so that images are not larger than thumbnails
+            if (auth) {
+                maxwidth = (maxwidth == 0) ? w : maxwidth;
+                maxheight = (maxheight == 0) ? h : maxheight;
+            } else if ((w > 16) || (h > 16)) {
                maxwidth = (int) Math.min(64.0, w * 0.6);
                maxheight = (int) Math.min(64.0, h * 0.6);
+            } else {
+                maxwidth = 16;
+                maxheight = 16;
            }

            // calculate width & height from maxwidth & maxheight
-            if ((maxwidth != 0) || (maxheight != 0)) {
+            if ((maxwidth < w) || (maxheight < h)) {
+                // scale image
                double hs = (w <= maxwidth) ? 1.0 : ((double) maxwidth) / ((double) w);
                double vs = (h <= maxheight) ? 1.0 : ((double) maxheight) / ((double) h);
                double scale = Math.min(hs, vs);
                if (!auth) scale = Math.min(scale, 0.6); // this is for copyright purpose
                if (scale < 1.0) {
-                    width = (int) (w * scale);
-                    height = (int) (h * scale);
+                    width = Math.max(1, (int) (w * scale));
+                    height = Math.max(1, (int) (h * scale));
                } else {
-                    width = w;
-                    height = h;
+                    width = Math.max(1, w);
+                    height = Math.max(1, h);
                }
+                
+                // compute scaled image
+                scaled = ((w == width) && (h == height)) ? image : image.getScaledInstance(width, height, Image.SCALE_AREA_AVERAGING);
+                MediaTracker mediaTracker = new MediaTracker(new Container());
+                mediaTracker.addImage(scaled, 0);
+                try {mediaTracker.waitForID(0);} catch (InterruptedException e) {}
            } else {
+                // do not scale
                width = w;
                height = h;
+                scaled = image;
            }

-            // check for minimum values
-            width = Math.max(width, 1);
-            height = Math.max(height, 1);
-
-            // scale image
-            scaled = ((w == width) && (h == height)) ? image : image.getScaledInstance(width, height, Image.SCALE_AREA_AVERAGING);
-            MediaTracker mediaTracker = new MediaTracker(new Container());
-            mediaTracker.addImage(scaled, 0);
-            try {mediaTracker.waitForID(0);} catch (InterruptedException e) {}
-
            if ((height == 16) && (width == 16) && (resource != null)) {
                // this might be a favicon, store image to cache for faster re-load later on
                iconcache.put(urlString, scaled);
--- a/htroot/yacysearchitem.html
+++ b/htroot/yacysearchitem.html
@ -22,7 +22,7 @@
  ::
  #{items}#
  <div class="thumbcontainer">
-    <a href="#[href]#" class="thumblink" onclick="return hs.expand(this)">
+    <a href="#[hrefCache]#" class="thumblink" onclick="return hs.expand(this)">
      <img src="/ViewImage.png?maxwidth=96&maxheight=96&code=#[code]#" alt="#[name]#">
    </a>
    <div class="highslide-caption"><a href="#[href]#">#[name]#<br \><a href="#[source]#">#[sourcedom]#</a></a></div>
--- a/htroot/yacysearchitem.java
+++ b/htroot/yacysearchitem.java
@ -67,6 +67,7 @@ public class yacysearchitem {
        boolean rss = post.get("rss", "false").equals("true");
        boolean authenticated = sb.adminAuthenticated(header) >= 2;
        int item = post.getInt("item", -1);
+        boolean auth = ((String) header.get(httpHeader.CONNECTION_PROP_CLIENTIP, "")).equals("localhost") || sb.verifyAuthentication(header, true);
        
        // default settings for blank item
        prop.put("content", "0");
@ -233,6 +234,7 @@ public class yacysearchitem {
            if (ms == null) {
                prop.put("content_items", "0");
            } else {
+                prop.putHTML("content_items_0_hrefCache", (auth) ? "/ViewImage.png?url=" + ms.href.toNormalform(true, false) : ms.href.toNormalform(true, false));
                prop.putHTML("content_items_0_href", ms.href.toNormalform(true, false));
                prop.put("content_items_0_code", sb.licensedURLs.aquireLicense(ms.href));
                prop.putHTML("content_items_0_name", shorten(ms.name, namelength));
--- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
@ -54,6 +54,7 @@ import java.net.MalformedURLException;
 import java.text.Collator;
 import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
@ -102,7 +103,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen

    // class variables: collectors for links
    private HashMap<yacyURL, String> anchors;
-    private TreeSet<htmlFilterImageEntry> images; // String(absolute url)/ImageEntry relation
+    private HashMap<String, htmlFilterImageEntry> images; // urlhash/image relation
    private HashMap<String, String> metas;
    private String title;
    //private String headline;
@ -127,7 +128,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
        super(linkTags0, linkTags1);
        this.root = root;
        this.anchors = new HashMap<yacyURL, String>();
-        this.images = new TreeSet<htmlFilterImageEntry>();
+        this.images = new HashMap<String, htmlFilterImageEntry>();
        this.metas = new HashMap<String, String>();
        this.title = "";
        this.headlines = new ArrayList[4];
@ -178,7 +179,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
            } catch (NumberFormatException e) {}
            yacyURL url = absolutePath(tagopts.getProperty("src", ""));
            htmlFilterImageEntry ie = new htmlFilterImageEntry(url, tagopts.getProperty("alt",""), width, height);
-            images.add(ie);
+            addImage(images, ie);
        }
        if (tagname.equalsIgnoreCase("base")) try {
            root = new yacyURL(tagopts.getProperty("href", ""), null);
@ -212,7 +213,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen

                if (type.equalsIgnoreCase("shortcut icon")) {
                    htmlFilterImageEntry ie = new htmlFilterImageEntry(newLink, linktitle, -1,-1);
-                    images.add(ie);    
+                    images.put(ie.url().hash(), ie);    
                    this.favicon = newLink;
                } else if (!type.equalsIgnoreCase("stylesheet") && !type.equalsIgnoreCase("alternate stylesheet")) {
                    anchors.put(newLink, linktitle);
@ -234,12 +235,24 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
        // fire event
        fireScrapeTag0(tagname, tagopts);
    }
-
+    
    public void scrapeTag1(String tagname, Properties tagopts, char[] text) {
        // System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + new String(text));
        if ((tagname.equalsIgnoreCase("a")) && (text.length < 2048)) {
            String href = tagopts.getProperty("href", "");
-            if (href.length() > 0) anchors.put(absolutePath(href), super.stripAll(new serverCharBuffer(text)).trim().toString());
+            if (href.length() > 0) {
+                yacyURL url = absolutePath(href);
+                String f = url.getFile();
+                int p = f.lastIndexOf('.');
+                String type = (p < 0) ? "" : f.substring(p + 1);
+                if (type.equals("png") || type.equals("gif") || type.equals("jpg") || type.equals("jpeg")) {
+                    // special handling of such urls: put them to the image urls
+                    htmlFilterImageEntry ie = new htmlFilterImageEntry(url, super.stripAll(new serverCharBuffer(text)).trim().toString(), -1, -1);
+                    addImage(images, ie);
+                } else {
+                    anchors.put(url, super.stripAll(new serverCharBuffer(text)).trim().toString());
+                }
+            }
        }
        String h;
        if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) {
@ -348,7 +361,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
        return anchors;
    }

-    public TreeSet<htmlFilterImageEntry> getImages() {
+    public HashMap<String, htmlFilterImageEntry> getImages() {
        // this resturns a String(absolute url)/htmlFilterImageEntry - relation
        return images;
    }
@ -522,5 +535,24 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
        
        return scraper;
    }
+    
+    public static void addAllImages(HashMap<String, htmlFilterImageEntry> a, HashMap<String, htmlFilterImageEntry> b) {
+        Iterator<Map.Entry<String, htmlFilterImageEntry>> i = b.entrySet().iterator();
+        Map.Entry<String, htmlFilterImageEntry> ie;
+        while (i.hasNext()) {
+            ie = i.next();
+            addImage(a, ie.getValue());
+        }
+    }
+    
+    public static void addImage(HashMap<String, htmlFilterImageEntry> a, htmlFilterImageEntry ie) {
+        if (a.containsKey(ie.url().hash())) {
+            // in case of a collision, take that image that has the better image size tags
+            if ((ie.height() > 0) && (ie.width() > 0)) a.put(ie.url().hash(), ie);
+        } else {
+            a.put(ie.url().hash(), ie);
+        }
+    }
+    
 }

--- a/source/de/anomic/http/httpdFileHandler.java
+++ b/source/de/anomic/http/httpdFileHandler.java
@ -452,6 +452,7 @@ public final class httpdFileHandler {
                    sb.append("<html>\n<head>\n</head>\n<body>\n<h1>Index of " + path + "</h1>\n  <ul>\n");
                    File dir = new File(htDocsPath, path);
                    String[] list = dir.list();
+                    if (list == null) list = new String[0]; // should not occur!
                    File f;
                    String size;
                    long sz;
--- a/source/de/anomic/plasma/parser/rss/rssParser.java
+++ b/source/de/anomic/plasma/parser/rss/rssParser.java
@ -50,7 +50,6 @@ import java.util.HashMap;
 import java.util.Hashtable;
 import java.util.LinkedList;
 import java.util.Map;
-import java.util.TreeSet;

 import de.anomic.htmlFilter.htmlFilterAbstractScraper;
 import de.anomic.htmlFilter.htmlFilterContentScraper;
@ -97,7 +96,7 @@ public class rssParser extends AbstractParser implements Parser {
        try {
            LinkedList<String> feedSections = new LinkedList<String>();
            HashMap<yacyURL, String> anchors = new HashMap<yacyURL, String>();
-            TreeSet<htmlFilterImageEntry> images  = new TreeSet<htmlFilterImageEntry>();
+            HashMap<String, htmlFilterImageEntry> images  = new HashMap<String, htmlFilterImageEntry>();
            serverByteBuffer text = new serverByteBuffer();
            serverCharBuffer authors = new serverCharBuffer();
            
@ -114,7 +113,8 @@ public class rssParser extends AbstractParser implements Parser {
            String feedDescription = reader.getChannel().getDescription();
            
            if (reader.getImage() != null) {
-                images.add(new htmlFilterImageEntry(new yacyURL(reader.getImage(), null), feedTitle, -1, -1));
+                yacyURL imgURL = new yacyURL(reader.getImage(), null);
+                images.put(imgURL.hash(), new htmlFilterImageEntry(imgURL, feedTitle, -1, -1));
            }            
            
            // loop through the feed items
@ -154,9 +154,9 @@ public class rssParser extends AbstractParser implements Parser {
                            anchors.putAll(itemLinks);
                        }
                        
-                        TreeSet<htmlFilterImageEntry> itemImages = scraper.getImages();
+                        HashMap<String, htmlFilterImageEntry> itemImages = scraper.getImages();
                        if ((itemImages != null) && (itemImages.size() > 0)) {
-                            images.addAll(itemImages);
+                            htmlFilterContentScraper.addAllImages(images, itemImages);
                        }
                        
                        byte[] extractedText = scraper.getText();
--- a/source/de/anomic/plasma/parser/tar/tarParser.java
+++ b/source/de/anomic/plasma/parser/tar/tarParser.java
@ -53,12 +53,12 @@ import java.util.HashMap;
 import java.util.Hashtable;
 import java.util.LinkedList;
 import java.util.Map;
-import java.util.TreeSet;
 import java.util.zip.GZIPInputStream;

 import com.ice.tar.TarEntry;
 import com.ice.tar.TarInputStream;

+import de.anomic.htmlFilter.htmlFilterContentScraper;
 import de.anomic.htmlFilter.htmlFilterImageEntry;
 import de.anomic.plasma.plasmaParser;
 import de.anomic.plasma.plasmaParserDocument;
@ -132,7 +132,7 @@ public class tarParser extends AbstractParser implements Parser {
            StringBuffer docAbstrct = new StringBuffer();

            Map<yacyURL, String> docAnchors = new HashMap<yacyURL, String>();
-            TreeSet<htmlFilterImageEntry> docImages = new TreeSet<htmlFilterImageEntry>(); 
+            HashMap<String, htmlFilterImageEntry> docImages = new HashMap<String, htmlFilterImageEntry>(); 
                        
            // looping through the contained files
            TarEntry entry;
@ -193,7 +193,7 @@ public class tarParser extends AbstractParser implements Parser {
                }               
                
                docAnchors.putAll(subDoc.getAnchors());
-                docImages.addAll(subDoc.getImages());
+                htmlFilterContentScraper.addAllImages(docImages, subDoc.getImages());
                
                // release subdocument
                subDoc.close();
--- a/source/de/anomic/plasma/parser/zip/zipParser.java
+++ b/source/de/anomic/plasma/parser/zip/zipParser.java
@ -53,10 +53,10 @@ import java.util.HashMap;
 import java.util.Hashtable;
 import java.util.LinkedList;
 import java.util.Map;
-import java.util.TreeSet;
 import java.util.zip.ZipEntry;
 import java.util.zip.ZipInputStream;

+import de.anomic.htmlFilter.htmlFilterContentScraper;
 import de.anomic.htmlFilter.htmlFilterImageEntry;
 import de.anomic.plasma.plasmaParser;
 import de.anomic.plasma.plasmaParserDocument;
@ -115,7 +115,7 @@ public class zipParser extends AbstractParser implements Parser {
            LinkedList<String> docSections = new LinkedList<String>();
            StringBuffer docAbstrct = new StringBuffer();
            Map<yacyURL, String> docAnchors = new HashMap<yacyURL, String>();
-            TreeSet<htmlFilterImageEntry> docImages = new TreeSet<htmlFilterImageEntry>(); 
+            HashMap<String, htmlFilterImageEntry> docImages = new HashMap<String, htmlFilterImageEntry>(); 
            
            // creating a new parser class to parse the unzipped content
            plasmaParser theParser = new plasmaParser();            
@ -176,7 +176,7 @@ public class zipParser extends AbstractParser implements Parser {
                }
                
                docAnchors.putAll(subDoc.getAnchors());
-                docImages.addAll(subDoc.getImages());
+                htmlFilterContentScraper.addAllImages(docImages, subDoc.getImages());
                
                // release subdocument
                subDoc.close();
--- a/source/de/anomic/plasma/plasmaCondenser.java
+++ b/source/de/anomic/plasma/plasmaCondenser.java
@ -107,6 +107,19 @@ public final class plasmaCondenser {
    
    private final static int numlength = 5;

+    // initialize array of invisible characters
+    private static boolean[] invisibleChar = new boolean['z' - ' ' + 1];
+    static {
+        // initialize array of invisible charachters
+        String invisibleString = "\"$%&/()=`^+*#'-_:;,<>[]\\";
+        for (int i = ' '; i <= 'z'; i++) {
+            invisibleChar[i - ' '] = false;
+        }
+        for (int i = 0; i < invisibleString.length(); i++) {
+            invisibleChar[invisibleString.charAt(i) - ' '] = true;
+        }
+    }
+    
    //private Properties analysis;
    private TreeMap<String, wordStatProp> words; // a string (the words) to (wordStatProp) - relation
    private HashMap<StringBuffer, phraseStatProp> sentences;
@ -198,7 +211,7 @@ public final class plasmaCondenser {
            }

            // images
-            Iterator<htmlFilterImageEntry> j = document.getImages().iterator();
+            Iterator<htmlFilterImageEntry> j = document.getImages().values().iterator();
            htmlFilterImageEntry ientry;
            while (j.hasNext()) {
                ientry = j.next();
@ -659,7 +672,7 @@ public final class plasmaCondenser {
    public final static boolean invisible(char c) {
        // TODO: Bugfix for UTF-8: does this work for non ISO-8859-1 chars?
        if ((c < ' ') || (c > 'z')) return true;
-        return ("$%&/()=\"$%&/()=`^+*~#'-_:;,|<>[]\\".indexOf(c) >= 0);
+        return invisibleChar[c - ' '];
    }

    public static Enumeration<StringBuffer> wordTokenizer(String s, String charset, int minLength) {
@ -727,7 +740,7 @@ public final class plasmaCondenser {

        public unsievedWordsEnum(InputStream is, String charset) throws UnsupportedEncodingException {
            e = new sentencesFromInputStreamEnum(is, charset);
-            s = new StringBuffer();
+            s = new StringBuffer(20);
            buffer = nextElement0();
        }

@ -859,9 +872,9 @@ public final class plasmaCondenser {
    }

    static StringBuffer readSentence(Reader reader, boolean pre) throws IOException {
-        StringBuffer s = new StringBuffer();
+        StringBuffer s = new StringBuffer(20);
        int nextChar;
-        char c;
+        char c, lc = (char) 0;
        
        // find sentence end
        for (;;) {
@ -871,20 +884,14 @@ public final class plasmaCondenser {
                if (s.length() == 0) return null; else break;
            }
            c = (char) nextChar;
+            if (pre && ((c == (char) 10) || (c == (char) 13))) break;
+            if ((c == (char) 8) || (c == (char) 10) || (c == (char) 13)) c = ' ';
+            if ((lc == ' ') && (c == ' ')) continue; // ignore double spaces
            s.append(c);
-            if (pre) {
-                if ((c == (char) 10) || (c == (char) 13)) break;
-            } else {
-                if (htmlFilterContentScraper.punctuation(c)) break;
-            }
+            if (htmlFilterContentScraper.punctuation(c)) break;
+            lc = c;
        }
-
-        // replace line endings and tabs by blanks
-        for (int i = 0; i < s.length(); i++) {
-            if ((s.charAt(i) == (char) 10) || (s.charAt(i) == (char) 13) || (s.charAt(i) == (char) 8)) s.setCharAt(i, ' ');
-        }
-        // remove all double-spaces
-        int p; while ((p = s.indexOf("  ")) >= 0) s.deleteCharAt(p);
+        
        return s;
    }

--- a/source/de/anomic/plasma/plasmaCrawlBalancer.java
+++ b/source/de/anomic/plasma/plasmaCrawlBalancer.java
@ -130,7 +130,10 @@ public class plasmaCrawlBalancer {
    }
    
    public void finalize() {
-        if (urlFileStack != null) close();
+        if (urlFileStack != null) {
+            serverLog.logWarning("plasmaCrawlBalancer", "crawl stack " + stackname + " closed by finalizer");
+            close();
+        }
    }
    
    public synchronized void clear() {
--- a/source/de/anomic/plasma/plasmaCrawlNURL.java
+++ b/source/de/anomic/plasma/plasmaCrawlNURL.java
@ -49,6 +49,8 @@ import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.Iterator;

+import de.anomic.server.logging.serverLog;
+
 public class plasmaCrawlNURL {
    
    public static final int STACK_TYPE_NULL     =  0; // do not stack
@ -64,9 +66,9 @@ public class plasmaCrawlNURL {
    private static final long minimumGlobalDelta = 500; // the minimum time difference between access of the same global domain
    private static final long maximumDomAge =  60000; // the maximum age of a domain until it is used for another crawl attempt
    
-    private final plasmaCrawlBalancer coreStack;      // links found by crawling to depth-1
-    private final plasmaCrawlBalancer limitStack;     // links found by crawling at target depth
-    private final plasmaCrawlBalancer remoteStack;    // links from remote crawl orders
+    private plasmaCrawlBalancer coreStack;      // links found by crawling to depth-1
+    private plasmaCrawlBalancer limitStack;     // links found by crawling at target depth
+    private plasmaCrawlBalancer remoteStack;    // links from remote crawl orders
    //private final plasmaCrawlBalancer overhangStack;  // links found by crawling at depth+1
    //private kelondroStack imageStack;     // links pointing to image resources
    //private kelondroStack movieStack;     // links pointing to movie resources
@ -81,10 +83,26 @@ public class plasmaCrawlNURL {
    }

    public void close() {
-        coreStack.close();
-        limitStack.close();
+        if (coreStack != null) {
+            coreStack.close();
+            coreStack = null;
+        }
+        if (limitStack != null) {
+            limitStack.close();
+            limitStack = null;
+        }
        //overhangStack.close();
-        remoteStack.close();
+        if (remoteStack != null) {
+            remoteStack.close();
+            remoteStack = null;
+        }
+    }
+    
+    public void finalize() {
+        if ((coreStack != null) || (limitStack != null) || (remoteStack != null)) {
+            serverLog.logWarning("plasmaCrawlNURL", "NURL stack closed by finalizer");
+            close();
+        }
    }
    
    public boolean notEmpty() {
--- a/source/de/anomic/plasma/plasmaParser.java
+++ b/source/de/anomic/plasma/plasmaParser.java
@ -39,6 +39,7 @@ import java.io.UnsupportedEncodingException;
 import java.net.MalformedURLException;
 import java.net.URI;
 import java.util.Arrays;
+import java.util.Collection;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Hashtable;
@ -747,7 +748,7 @@ public final class plasmaParser {
        
    }
    
-    static Map<yacyURL, String> allReflinks(Set<?> links) {
+    static Map<yacyURL, String> allReflinks(Collection<?> links) {
        // links is either a Set of Strings (with urls) or htmlFilterImageEntries
        // we find all links that are part of a reference inside a url
        HashMap<yacyURL, String> v = new HashMap<yacyURL, String>();
@ -786,7 +787,7 @@ public final class plasmaParser {
        return v;
    }
    
-    static Map<yacyURL, String> allSubpaths(Set<?> links) {
+    static Map<yacyURL, String> allSubpaths(Collection<?> links) {
        // links is either a Set of Strings (urls) or a Set of htmlFilterImageEntries
        HashSet<String> h = new HashSet<String>();
        Iterator<?> i = links.iterator();
--- a/source/de/anomic/plasma/plasmaParserDocument.java
+++ b/source/de/anomic/plasma/plasmaParserDocument.java
@ -61,6 +61,7 @@ import java.util.List;
 import java.util.Map;
 import java.util.TreeSet;

+import de.anomic.htmlFilter.htmlFilterContentScraper;
 import de.anomic.htmlFilter.htmlFilterImageEntry;
 import de.anomic.plasma.parser.Parser;

@ -76,7 +77,7 @@ public class plasmaParserDocument {
    private StringBuffer description;   // an abstract, if present: short content description
    private Object text;            // the clear text, all that is visible
    private Map<yacyURL, String> anchors;    // all links embedded as clickeable entities (anchor tags)
-    private TreeSet<htmlFilterImageEntry> images;         // all visible pictures in document
+    private HashMap<String, htmlFilterImageEntry> images;         // all visible pictures in document
    // the anchors and images - Maps are URL-to-EntityDescription mappings.
    // The EntityDescription appear either as visible text in anchors or as alternative
    // text in image tags.
@ -89,7 +90,7 @@ public class plasmaParserDocument {
    protected plasmaParserDocument(yacyURL location, String mimeType, String charset,
                    String[] keywords, String title, String author,
                    String[] sections, String abstrct,
-                    Object text, Map<yacyURL, String> anchors, TreeSet<htmlFilterImageEntry> images) {
+                    Object text, Map<yacyURL, String> anchors, HashMap<String, htmlFilterImageEntry> images) {
        this.source = location;
        this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType;
        this.charset = charset;
@ -99,7 +100,7 @@ public class plasmaParserDocument {
        this.sections = (sections == null) ? new LinkedList<String>() : Arrays.asList(sections);
        this.description = (abstrct == null) ? new StringBuffer() : new StringBuffer(abstrct);
        this.anchors = (anchors == null) ? new HashMap<yacyURL, String>(0) : anchors;
-        this.images =  (images == null) ? new TreeSet<htmlFilterImageEntry>() : images;
+        this.images =  (images == null) ? new HashMap<String, htmlFilterImageEntry>() : images;
        this.hyperlinks = null;
        this.audiolinks = null;
        this.videolinks = null;
@ -124,21 +125,21 @@ public class plasmaParserDocument {
    public plasmaParserDocument(yacyURL location, String mimeType, String charset,
                    String[] keywords, String title, String author,
                    String[] sections, String abstrct,
-                    byte[] text, Map<yacyURL, String> anchors, TreeSet<htmlFilterImageEntry> images) {
+                    byte[] text, Map<yacyURL, String> anchors, HashMap<String, htmlFilterImageEntry> images) {
        this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
    }
    
    public plasmaParserDocument(yacyURL location, String mimeType, String charset,
            String[] keywords, String title, String author,
            String[] sections, String abstrct,
-            File text, Map<yacyURL, String> anchors, TreeSet<htmlFilterImageEntry> images) {
+            File text, Map<yacyURL, String> anchors, HashMap<String, htmlFilterImageEntry> images) {
        this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
    }
    
    public plasmaParserDocument(yacyURL location, String mimeType, String charset,
            String[] keywords, String title, String author,
            String[] sections, String abstrct,
-            serverCachedFileOutputStream text, Map<yacyURL, String> anchors, TreeSet<htmlFilterImageEntry> images) {
+            serverCachedFileOutputStream text, Map<yacyURL, String> anchors, HashMap<String, htmlFilterImageEntry> images) {
        this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
    }

@ -310,7 +311,7 @@ dc_rights
        return this.videolinks;
    }
    
-    public TreeSet<htmlFilterImageEntry> getImages() {
+    public HashMap<String, htmlFilterImageEntry> getImages() {
        // returns all links enbedded as pictures (visible in document)
        // this resturns a htmlFilterImageEntry collection
        if (!resorted) resortLinks();
@ -341,7 +342,7 @@ dc_rights
        audiolinks = new HashMap<yacyURL, String>();
        applinks   = new HashMap<yacyURL, String>();
        emaillinks = new HashMap<String, String>();
-        TreeSet<htmlFilterImageEntry> collectedImages = new TreeSet<htmlFilterImageEntry>(); // this is a set that is collected now and joined later to the imagelinks
+        HashMap<String, htmlFilterImageEntry> collectedImages = new HashMap<String, htmlFilterImageEntry>(); // this is a set that is collected now and joined later to the imagelinks
        Map.Entry<yacyURL, String> entry;
        while (i.hasNext()) {
            entry = i.next();
@ -361,7 +362,7 @@ dc_rights
                    if (plasmaParser.mediaExtContains(ext)) {
                        // this is not a normal anchor, its a media link
                        if (plasmaParser.imageExtContains(ext)) {
-                            collectedImages.add(new htmlFilterImageEntry(url, (String) entry.getValue(), -1, -1));
+                            htmlFilterContentScraper.addImage(collectedImages, new htmlFilterImageEntry(url, (String) entry.getValue(), -1, -1));
                        }
                        else if (plasmaParser.audioExtContains(ext)) audiolinks.put(url, (String)entry.getValue());
                        else if (plasmaParser.videoExtContains(ext)) videolinks.put(url, (String)entry.getValue());
@ -374,23 +375,18 @@ dc_rights
        }
        
        // add image links that we collected from the anchors to the image map
-        Iterator<htmlFilterImageEntry>  j = collectedImages.iterator();
-        htmlFilterImageEntry iEntry;
-        while (j.hasNext()) {
-            iEntry = (htmlFilterImageEntry) j.next();
-            if (!images.contains(iEntry)) images.add(iEntry);
-        }
+        htmlFilterContentScraper.addAllImages(images, collectedImages);
       
        // expand the hyperlinks:
        // we add artificial hyperlinks to the hyperlink set
        // that can be calculated from given hyperlinks and imagelinks
        
-        hyperlinks.putAll(plasmaParser.allReflinks(images));
+        hyperlinks.putAll(plasmaParser.allReflinks(images.values()));
        hyperlinks.putAll(plasmaParser.allReflinks(audiolinks.keySet()));
        hyperlinks.putAll(plasmaParser.allReflinks(videolinks.keySet()));
        hyperlinks.putAll(plasmaParser.allReflinks(applinks.keySet()));
        hyperlinks.putAll(plasmaParser.allSubpaths(hyperlinks.keySet()));
-        hyperlinks.putAll(plasmaParser.allSubpaths(images));
+        hyperlinks.putAll(plasmaParser.allSubpaths(images.values()));
        hyperlinks.putAll(plasmaParser.allSubpaths(audiolinks.keySet()));
        hyperlinks.putAll(plasmaParser.allSubpaths(videolinks.keySet()));
        hyperlinks.putAll(plasmaParser.allSubpaths(applinks.keySet()));
@ -417,7 +413,7 @@ dc_rights
        serverFileUtils.copy(doc.getText(), (serverCachedFileOutputStream)this.text);
        
        anchors.putAll(doc.getAnchors());
-        images.addAll(doc.getImages());
+        htmlFilterContentScraper.addAllImages(images, doc.getImages());
    }
    
    /**
--- a/source/de/anomic/plasma/plasmaSearchImages.java
+++ b/source/de/anomic/plasma/plasmaSearchImages.java
@ -43,9 +43,10 @@ package de.anomic.plasma;

 import java.io.InputStream;
 import java.net.MalformedURLException;
+import java.util.HashMap;
 import java.util.Iterator;
-import java.util.TreeSet;

+import de.anomic.htmlFilter.htmlFilterContentScraper;
 import de.anomic.htmlFilter.htmlFilterImageEntry;
 import de.anomic.plasma.parser.ParserException;
 import de.anomic.server.serverDate;
@ -53,11 +54,11 @@ import de.anomic.yacy.yacyURL;

 public final class plasmaSearchImages {

-    private TreeSet<htmlFilterImageEntry> images;
+    private HashMap<String, htmlFilterImageEntry> images;
    
    public plasmaSearchImages(long maxTime, yacyURL url, int depth) {
        long start = System.currentTimeMillis();
-        this.images = new TreeSet<htmlFilterImageEntry>();
+        this.images = new HashMap<String, htmlFilterImageEntry>();
        if (maxTime > 10) {
            Object[] resource = plasmaSnippetCache.getResource(url, true, (int) maxTime, false);
            InputStream res = (InputStream) resource[0];
@ -75,7 +76,7 @@ public final class plasmaSearchImages {
                if (document == null) return;
                
                // add the image links
-                this.addAll(document.getImages());
+                htmlFilterContentScraper.addAllImages(this.images, document.getImages());

                // add also links from pages one step deeper, if depth > 0
                if (depth > 0) {
@ -97,26 +98,13 @@ public final class plasmaSearchImages {
    
    public void addAll(plasmaSearchImages m) {
        synchronized (m.images) {
-            addAll(m.images);
-        }
-    }
-    
-    private void addAll(TreeSet<htmlFilterImageEntry> ts) {
-        Iterator<htmlFilterImageEntry> i = ts.iterator();
-        htmlFilterImageEntry ie;
-        while (i.hasNext()) {
-            ie = i.next();
-            if (images.contains(ie)) {
-                if ((ie.height() > 0) && (ie.width() > 0)) images.add(ie);
-            } else {
-                images.add(ie);
-            }
+            htmlFilterContentScraper.addAllImages(this.images, m.images);
        }
    }
    
    public Iterator<htmlFilterImageEntry> entries() {
        // returns htmlFilterImageEntry - Objects
-        return images.iterator();
+        return images.values().iterator();
    }
    
 }
--- a/source/de/anomic/plasma/plasmaSnippetCache.java
+++ b/source/de/anomic/plasma/plasmaSnippetCache.java
@ -697,7 +697,8 @@ public class plasmaSnippetCache {
    
    public static ArrayList<MediaSnippet> computeImageSnippets(plasmaParserDocument document, Set<String> queryhashes) {
        
-        TreeSet<htmlFilterImageEntry> images = document.getImages(); // iterates images in descending size order!
+        TreeSet<htmlFilterImageEntry> images = new TreeSet<htmlFilterImageEntry>();
+        images.addAll(document.getImages().values()); // iterates images in descending size order!
        // a measurement for the size of the images can be retrieved using the htmlFilterImageEntry.hashCode()
        
        Iterator<htmlFilterImageEntry> i = images.iterator();