- enhanced recognition, parsing, management, and handling of duplicate occurrences of image tags

- enhanced text parser (condenser): found and eliminated inefficient code, improving speed
- added handling of image preview using the image cache from HTCACHE
- some other minor changes

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4507 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 2008-02-25 14:08:15 +00:00
parent fcc919964b
commit 87a8747ce3
17 changed files with 161 additions and 111 deletions

View File

@@ -54,6 +54,7 @@ import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.Writer;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeSet;
@@ -313,8 +314,8 @@ public class CacheAdmin_p {
prop.put("info_type_use." + extension, (i == 0) ? 0 : 1);
}
private static void formatImageAnchor(serverObjects prop, TreeSet<htmlFilterImageEntry> anchor) {
final Iterator<htmlFilterImageEntry> iter = anchor.iterator();
private static void formatImageAnchor(serverObjects prop, HashMap<String, htmlFilterImageEntry> anchor) {
final Iterator<htmlFilterImageEntry> iter = anchor.values().iterator();
htmlFilterImageEntry ie;
prop.put("info_type_use.images_images", anchor.size());
int i = 0;

View File

@@ -49,9 +49,9 @@ import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URLDecoder;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeSet;
import de.anomic.data.htmlTools;
import de.anomic.htmlFilter.htmlFilterImageEntry;
@@ -339,8 +339,8 @@ public class ViewFile {
i += putMediaInfo(prop, wordArray, i, document.getAudiolinks(), "audio", (i % 2 == 0));
dark = (i % 2 == 0);
TreeSet<htmlFilterImageEntry> ts = document.getImages();
Iterator<htmlFilterImageEntry> tsi = ts.iterator();
HashMap<String, htmlFilterImageEntry> ts = document.getImages();
Iterator<htmlFilterImageEntry> tsi = ts.values().iterator();
htmlFilterImageEntry entry;
while (tsi.hasNext()) {
entry = tsi.next();

View File

@@ -135,47 +135,47 @@ public class ViewImage {
// find original size
int h = image.getHeight(null);
int w = image.getWidth(null);
// System.out.println("DEBUG: get access to image " +
// url.toNormalform() + " is " + ((auth) ? "authorized" : "NOT
// authorized"));
// in case of not-authorized access shrink the image to prevent
// copyright problems
// so that images are not larger than thumbnails
if ((!auth) && ((w > 16) || (h > 16))) {
// copyright problems, so that images are not larger than thumbnails
if (auth) {
maxwidth = (maxwidth == 0) ? w : maxwidth;
maxheight = (maxheight == 0) ? h : maxheight;
} else if ((w > 16) || (h > 16)) {
maxwidth = (int) Math.min(64.0, w * 0.6);
maxheight = (int) Math.min(64.0, h * 0.6);
} else {
maxwidth = 16;
maxheight = 16;
}
// calculate width & height from maxwidth & maxheight
if ((maxwidth != 0) || (maxheight != 0)) {
if ((maxwidth < w) || (maxheight < h)) {
// scale image
double hs = (w <= maxwidth) ? 1.0 : ((double) maxwidth) / ((double) w);
double vs = (h <= maxheight) ? 1.0 : ((double) maxheight) / ((double) h);
double scale = Math.min(hs, vs);
if (!auth) scale = Math.min(scale, 0.6); // this is for copyright purpose
if (scale < 1.0) {
width = (int) (w * scale);
height = (int) (h * scale);
width = Math.max(1, (int) (w * scale));
height = Math.max(1, (int) (h * scale));
} else {
width = w;
height = h;
width = Math.max(1, w);
height = Math.max(1, h);
}
// compute scaled image
scaled = ((w == width) && (h == height)) ? image : image.getScaledInstance(width, height, Image.SCALE_AREA_AVERAGING);
MediaTracker mediaTracker = new MediaTracker(new Container());
mediaTracker.addImage(scaled, 0);
try {mediaTracker.waitForID(0);} catch (InterruptedException e) {}
} else {
// do not scale
width = w;
height = h;
scaled = image;
}
// check for minimum values
width = Math.max(width, 1);
height = Math.max(height, 1);
// scale image
scaled = ((w == width) && (h == height)) ? image : image.getScaledInstance(width, height, Image.SCALE_AREA_AVERAGING);
MediaTracker mediaTracker = new MediaTracker(new Container());
mediaTracker.addImage(scaled, 0);
try {mediaTracker.waitForID(0);} catch (InterruptedException e) {}
if ((height == 16) && (width == 16) && (resource != null)) {
// this might be a favicon, store image to cache for faster re-load later on
iconcache.put(urlString, scaled);

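Note on the hunk above: the reworked logic first derives maxwidth/maxheight from the authorization state, then applies a single scale factor and clamps both dimensions to a 1-pixel minimum. A self-contained sketch of the computation (class and method names here are illustrative, not from the YaCy source):

    // Sketch of ViewImage's size computation; w/h are the original image
    // dimensions, maxwidth/maxheight the requested bounds (0 = unbounded).
    public class ScaleSketch {
        static int[] targetSize(int w, int h, int maxwidth, int maxheight, boolean auth) {
            if (auth) {
                if (maxwidth == 0) maxwidth = w;
                if (maxheight == 0) maxheight = h;
            } else if (w > 16 || h > 16) {
                // unauthorized access: shrink to thumbnail size (copyright)
                maxwidth = (int) Math.min(64.0, w * 0.6);
                maxheight = (int) Math.min(64.0, h * 0.6);
            } else {
                maxwidth = 16;
                maxheight = 16;
            }
            double hs = (w <= maxwidth) ? 1.0 : (double) maxwidth / (double) w;
            double vs = (h <= maxheight) ? 1.0 : (double) maxheight / (double) h;
            double scale = Math.min(hs, vs);
            if (!auth) scale = Math.min(scale, 0.6); // copyright shrink factor
            int width  = Math.max(1, (scale < 1.0) ? (int) (w * scale) : w);
            int height = Math.max(1, (scale < 1.0) ? (int) (h * scale) : h);
            return new int[] { width, height };
        }
        public static void main(String[] args) {
            int[] s = targetSize(800, 600, 0, 0, false);
            System.out.println(s[0] + "x" + s[1]); // 64x48
        }
    }

For an unauthorized request for an 800x600 image this yields scale = min(64/800, 64/600, 0.6) = 0.08, i.e. a 64x48 thumbnail.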
View File

@@ -22,7 +22,7 @@
::
#{items}#
<div class="thumbcontainer">
<a href="#[href]#" class="thumblink" onclick="return hs.expand(this)">
<a href="#[hrefCache]#" class="thumblink" onclick="return hs.expand(this)">
<img src="/ViewImage.png?maxwidth=96&maxheight=96&code=#[code]#" alt="#[name]#">
</a>
<div class="highslide-caption"><a href="#[href]#">#[name]#<br \><a href="#[source]#">#[sourcedom]#</a></a></div>

View File

@@ -67,6 +67,7 @@ public class yacysearchitem {
boolean rss = post.get("rss", "false").equals("true");
boolean authenticated = sb.adminAuthenticated(header) >= 2;
int item = post.getInt("item", -1);
boolean auth = ((String) header.get(httpHeader.CONNECTION_PROP_CLIENTIP, "")).equals("localhost") || sb.verifyAuthentication(header, true);
// default settings for blank item
prop.put("content", "0");
@@ -233,6 +234,7 @@
if (ms == null) {
prop.put("content_items", "0");
} else {
prop.putHTML("content_items_0_hrefCache", (auth) ? "/ViewImage.png?url=" + ms.href.toNormalform(true, false) : ms.href.toNormalform(true, false));
prop.putHTML("content_items_0_href", ms.href.toNormalform(true, false));
prop.put("content_items_0_code", sb.licensedURLs.aquireLicense(ms.href));
prop.putHTML("content_items_0_name", shorten(ms.name, namelength));

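This hunk, together with the template change above it, implements the commit message's "image preview using the image cache from HTCACHE": for an authorized (or localhost) user the thumbnail link target is rewritten to the local ViewImage servlet, which can serve the image from the proxy cache instead of linking straight to the remote original. A minimal sketch of the rewrite (buildPreviewHref is a hypothetical helper, not a YaCy method):

    // hypothetical helper mirroring the hrefCache logic above
    static String buildPreviewHref(String remoteUrl, boolean auth) {
        // authorized users fetch through /ViewImage.png, which can answer
        // from the local HTCACHE image cache; others get the remote URL
        return auth ? "/ViewImage.png?url=" + remoteUrl : remoteUrl;
    }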
View File

@@ -54,6 +54,7 @@ import java.net.MalformedURLException;
import java.text.Collator;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
@@ -102,7 +103,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
// class variables: collectors for links
private HashMap<yacyURL, String> anchors;
private TreeSet<htmlFilterImageEntry> images; // String(absolute url)/ImageEntry relation
private HashMap<String, htmlFilterImageEntry> images; // urlhash/image relation
private HashMap<String, String> metas;
private String title;
//private String headline;
@@ -127,7 +128,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
super(linkTags0, linkTags1);
this.root = root;
this.anchors = new HashMap<yacyURL, String>();
this.images = new TreeSet<htmlFilterImageEntry>();
this.images = new HashMap<String, htmlFilterImageEntry>();
this.metas = new HashMap<String, String>();
this.title = "";
this.headlines = new ArrayList[4];
@@ -178,7 +179,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
} catch (NumberFormatException e) {}
yacyURL url = absolutePath(tagopts.getProperty("src", ""));
htmlFilterImageEntry ie = new htmlFilterImageEntry(url, tagopts.getProperty("alt",""), width, height);
images.add(ie);
addImage(images, ie);
}
if (tagname.equalsIgnoreCase("base")) try {
root = new yacyURL(tagopts.getProperty("href", ""), null);
@@ -212,7 +213,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
if (type.equalsIgnoreCase("shortcut icon")) {
htmlFilterImageEntry ie = new htmlFilterImageEntry(newLink, linktitle, -1,-1);
images.add(ie);
images.put(ie.url().hash(), ie);
this.favicon = newLink;
} else if (!type.equalsIgnoreCase("stylesheet") && !type.equalsIgnoreCase("alternate stylesheet")) {
anchors.put(newLink, linktitle);
@@ -234,12 +235,24 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
// fire event
fireScrapeTag0(tagname, tagopts);
}
public void scrapeTag1(String tagname, Properties tagopts, char[] text) {
// System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + new String(text));
if ((tagname.equalsIgnoreCase("a")) && (text.length < 2048)) {
String href = tagopts.getProperty("href", "");
if (href.length() > 0) anchors.put(absolutePath(href), super.stripAll(new serverCharBuffer(text)).trim().toString());
if (href.length() > 0) {
yacyURL url = absolutePath(href);
String f = url.getFile();
int p = f.lastIndexOf('.');
String type = (p < 0) ? "" : f.substring(p + 1);
if (type.equals("png") || type.equals("gif") || type.equals("jpg") || type.equals("jpeg")) {
// special handling of such urls: add them to the image urls
htmlFilterImageEntry ie = new htmlFilterImageEntry(url, super.stripAll(new serverCharBuffer(text)).trim().toString(), -1, -1);
addImage(images, ie);
} else {
anchors.put(url, super.stripAll(new serverCharBuffer(text)).trim().toString());
}
}
}
String h;
if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) {
@@ -348,7 +361,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
return anchors;
}
public TreeSet<htmlFilterImageEntry> getImages() {
public HashMap<String, htmlFilterImageEntry> getImages() {
// this returns a urlhash/htmlFilterImageEntry relation
return images;
}
@@ -522,5 +535,24 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
return scraper;
}
public static void addAllImages(HashMap<String, htmlFilterImageEntry> a, HashMap<String, htmlFilterImageEntry> b) {
Iterator<Map.Entry<String, htmlFilterImageEntry>> i = b.entrySet().iterator();
Map.Entry<String, htmlFilterImageEntry> ie;
while (i.hasNext()) {
ie = i.next();
addImage(a, ie.getValue());
}
}
public static void addImage(HashMap<String, htmlFilterImageEntry> a, htmlFilterImageEntry ie) {
if (a.containsKey(ie.url().hash())) {
// in case of a collision, take the image that has the better size tags
if ((ie.height() > 0) && (ie.width() > 0)) a.put(ie.url().hash(), ie);
} else {
a.put(ie.url().hash(), ie);
}
}
}

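The new addImage/addAllImages helpers are the "duplicate occurrence" handling named in the commit message: images are keyed by the hash of their URL, and when the same URL is seen twice, the entry carrying explicit width/height tags wins. A self-contained sketch of that policy (SimpleImageEntry is a stand-in for htmlFilterImageEntry):

    import java.util.HashMap;

    class SimpleImageEntry {
        final String urlHash; final String alt; final int width, height;
        SimpleImageEntry(String urlHash, String alt, int width, int height) {
            this.urlHash = urlHash; this.alt = alt; this.width = width; this.height = height;
        }
    }

    public class ImageDedupSketch {
        // keep at most one entry per URL hash; prefer entries with size tags
        static void addImage(HashMap<String, SimpleImageEntry> a, SimpleImageEntry ie) {
            if (a.containsKey(ie.urlHash)) {
                if (ie.height > 0 && ie.width > 0) a.put(ie.urlHash, ie);
            } else {
                a.put(ie.urlHash, ie);
            }
        }
        public static void main(String[] args) {
            HashMap<String, SimpleImageEntry> images = new HashMap<String, SimpleImageEntry>();
            addImage(images, new SimpleImageEntry("h1", "logo", -1, -1));  // from an <a> to logo.png
            addImage(images, new SimpleImageEntry("h1", "logo", 120, 60)); // from an <img> with size attrs
            System.out.println(images.get("h1").width); // 120: the sized entry won
        }
    }

Note that the replacement is unconditional once the new entry has size tags, so when two sized entries collide the one scraped last wins; the old TreeSet simply rejected the second insert.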
View File

@@ -452,6 +452,7 @@ public final class httpdFileHandler {
sb.append("<html>\n<head>\n</head>\n<body>\n<h1>Index of " + path + "</h1>\n <ul>\n");
File dir = new File(htDocsPath, path);
String[] list = dir.list();
if (list == null) list = new String[0]; // should not occur!
File f;
String size;
long sz;

View File

@@ -50,7 +50,6 @@ import java.util.HashMap;
import java.util.Hashtable;
import java.util.LinkedList;
import java.util.Map;
import java.util.TreeSet;
import de.anomic.htmlFilter.htmlFilterAbstractScraper;
import de.anomic.htmlFilter.htmlFilterContentScraper;
@@ -97,7 +96,7 @@ public class rssParser extends AbstractParser implements Parser {
try {
LinkedList<String> feedSections = new LinkedList<String>();
HashMap<yacyURL, String> anchors = new HashMap<yacyURL, String>();
TreeSet<htmlFilterImageEntry> images = new TreeSet<htmlFilterImageEntry>();
HashMap<String, htmlFilterImageEntry> images = new HashMap<String, htmlFilterImageEntry>();
serverByteBuffer text = new serverByteBuffer();
serverCharBuffer authors = new serverCharBuffer();
@@ -114,7 +113,8 @@
String feedDescription = reader.getChannel().getDescription();
if (reader.getImage() != null) {
images.add(new htmlFilterImageEntry(new yacyURL(reader.getImage(), null), feedTitle, -1, -1));
yacyURL imgURL = new yacyURL(reader.getImage(), null);
images.put(imgURL.hash(), new htmlFilterImageEntry(imgURL, feedTitle, -1, -1));
}
// loop through the feed items
@@ -154,9 +154,9 @@
anchors.putAll(itemLinks);
}
TreeSet<htmlFilterImageEntry> itemImages = scraper.getImages();
HashMap<String, htmlFilterImageEntry> itemImages = scraper.getImages();
if ((itemImages != null) && (itemImages.size() > 0)) {
images.addAll(itemImages);
htmlFilterContentScraper.addAllImages(images, itemImages);
}
byte[] extractedText = scraper.getText();

View File

@@ -53,12 +53,12 @@ import java.util.HashMap;
import java.util.Hashtable;
import java.util.LinkedList;
import java.util.Map;
import java.util.TreeSet;
import java.util.zip.GZIPInputStream;
import com.ice.tar.TarEntry;
import com.ice.tar.TarInputStream;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaParserDocument;
@@ -132,7 +132,7 @@ public class tarParser extends AbstractParser implements Parser {
StringBuffer docAbstrct = new StringBuffer();
Map<yacyURL, String> docAnchors = new HashMap<yacyURL, String>();
TreeSet<htmlFilterImageEntry> docImages = new TreeSet<htmlFilterImageEntry>();
HashMap<String, htmlFilterImageEntry> docImages = new HashMap<String, htmlFilterImageEntry>();
// looping through the contained files
TarEntry entry;
@@ -193,7 +193,7 @@
}
docAnchors.putAll(subDoc.getAnchors());
docImages.addAll(subDoc.getImages());
htmlFilterContentScraper.addAllImages(docImages, subDoc.getImages());
// release subdocument
subDoc.close();

View File

@@ -53,10 +53,10 @@ import java.util.HashMap;
import java.util.Hashtable;
import java.util.LinkedList;
import java.util.Map;
import java.util.TreeSet;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaParserDocument;
@@ -115,7 +115,7 @@ public class zipParser extends AbstractParser implements Parser {
LinkedList<String> docSections = new LinkedList<String>();
StringBuffer docAbstrct = new StringBuffer();
Map<yacyURL, String> docAnchors = new HashMap<yacyURL, String>();
TreeSet<htmlFilterImageEntry> docImages = new TreeSet<htmlFilterImageEntry>();
HashMap<String, htmlFilterImageEntry> docImages = new HashMap<String, htmlFilterImageEntry>();
// creating a new parser class to parse the unzipped content
plasmaParser theParser = new plasmaParser();
@@ -176,7 +176,7 @@
}
docAnchors.putAll(subDoc.getAnchors());
docImages.addAll(subDoc.getImages());
htmlFilterContentScraper.addAllImages(docImages, subDoc.getImages());
// release subdocument
subDoc.close();

View File

@@ -107,6 +107,19 @@ public final class plasmaCondenser {
private final static int numlength = 5;
// initialize array of invisible characters
private static boolean[] invisibleChar = new boolean['z' - ' ' + 1];
static {
// initialize array of invisible characters
String invisibleString = "\"$%&/()=`^+*#'-_:;,<>[]\\";
for (int i = ' '; i <= 'z'; i++) {
invisibleChar[i - ' '] = false;
}
for (int i = 0; i < invisibleString.length(); i++) {
invisibleChar[invisibleString.charAt(i) - ' '] = true;
}
}
//private Properties analysis;
private TreeMap<String, wordStatProp> words; // a string (the words) to (wordStatProp) - relation
private HashMap<StringBuffer, phraseStatProp> sentences;
@@ -198,7 +211,7 @@
}
// images
Iterator<htmlFilterImageEntry> j = document.getImages().iterator();
Iterator<htmlFilterImageEntry> j = document.getImages().values().iterator();
htmlFilterImageEntry ientry;
while (j.hasNext()) {
ientry = j.next();
@@ -659,7 +672,7 @@ public final class plasmaCondenser {
public final static boolean invisible(char c) {
// TODO: Bugfix for UTF-8: does this work for non ISO-8859-1 chars?
if ((c < ' ') || (c > 'z')) return true;
return ("$%&/()=\"$%&/()=`^+*~#'-_:;,|<>[]\\".indexOf(c) >= 0);
return invisibleChar[c - ' '];
}
public static Enumeration<StringBuffer> wordTokenizer(String s, String charset, int minLength) {
@@ -727,7 +740,7 @@
public unsievedWordsEnum(InputStream is, String charset) throws UnsupportedEncodingException {
e = new sentencesFromInputStreamEnum(is, charset);
s = new StringBuffer();
s = new StringBuffer(20);
buffer = nextElement0();
}
@@ -859,9 +872,9 @@
}
static StringBuffer readSentence(Reader reader, boolean pre) throws IOException {
StringBuffer s = new StringBuffer();
StringBuffer s = new StringBuffer(20);
int nextChar;
char c;
char c, lc = (char) 0;
// find sentence end
for (;;) {
@@ -871,20 +884,14 @@
if (s.length() == 0) return null; else break;
}
c = (char) nextChar;
if (pre && ((c == (char) 10) || (c == (char) 13))) break;
if ((c == (char) 8) || (c == (char) 10) || (c == (char) 13)) c = ' ';
if ((lc == ' ') && (c == ' ')) continue; // ignore double spaces
s.append(c);
if (pre) {
if ((c == (char) 10) || (c == (char) 13)) break;
} else {
if (htmlFilterContentScraper.punctuation(c)) break;
}
if (htmlFilterContentScraper.punctuation(c)) break;
lc = c;
}
// replace line endings and tabs by blanks
for (int i = 0; i < s.length(); i++) {
if ((s.charAt(i) == (char) 10) || (s.charAt(i) == (char) 13) || (s.charAt(i) == (char) 8)) s.setCharAt(i, ' ');
}
// remove all double-spaces
int p; while ((p = s.indexOf("  ")) >= 0) s.deleteCharAt(p);
return s;
}

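Two condenser speedups are visible above: invisible() now answers from a precomputed boolean table instead of calling String.indexOf on a punctuation string for every character, and readSentence() folds the line-ending replacement and double-space removal into the single read loop instead of re-scanning the buffer twice afterwards. A self-contained sketch of the lookup-table idea (class name is illustrative):

    public class InvisibleCharSketch {
        private static final boolean[] INVISIBLE = new boolean['z' - ' ' + 1];
        static {
            String invisibleString = "\"$%&/()=`^+*#'-_:;,<>[]\\";
            for (int i = 0; i < invisibleString.length(); i++) {
                INVISIBLE[invisibleString.charAt(i) - ' '] = true;
            }
        }
        static boolean invisible(char c) {
            if (c < ' ' || c > 'z') return true; // outside the table
            return INVISIBLE[c - ' '];           // O(1) lookup, no string scan
        }
        public static void main(String[] args) {
            System.out.println(invisible('a')); // false
            System.out.println(invisible('%')); // true
        }
    }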
View File

@@ -130,7 +130,10 @@ public class plasmaCrawlBalancer {
}
public void finalize() {
if (urlFileStack != null) close();
if (urlFileStack != null) {
serverLog.logWarning("plasmaCrawlBalancer", "crawl stack " + stackname + " closed by finalizer");
close();
}
}
public synchronized void clear() {

View File

@@ -49,6 +49,8 @@ import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import de.anomic.server.logging.serverLog;
public class plasmaCrawlNURL {
public static final int STACK_TYPE_NULL = 0; // do not stack
@@ -64,9 +66,9 @@ public class plasmaCrawlNURL {
private static final long minimumGlobalDelta = 500; // the minimum time difference between access of the same global domain
private static final long maximumDomAge = 60000; // the maximum age of a domain until it is used for another crawl attempt
private final plasmaCrawlBalancer coreStack; // links found by crawling to depth-1
private final plasmaCrawlBalancer limitStack; // links found by crawling at target depth
private final plasmaCrawlBalancer remoteStack; // links from remote crawl orders
private plasmaCrawlBalancer coreStack; // links found by crawling to depth-1
private plasmaCrawlBalancer limitStack; // links found by crawling at target depth
private plasmaCrawlBalancer remoteStack; // links from remote crawl orders
//private final plasmaCrawlBalancer overhangStack; // links found by crawling at depth+1
//private kelondroStack imageStack; // links pointing to image resources
//private kelondroStack movieStack; // links pointing to movie resources
@@ -81,10 +83,26 @@ public class plasmaCrawlNURL {
}
public void close() {
coreStack.close();
limitStack.close();
if (coreStack != null) {
coreStack.close();
coreStack = null;
}
if (limitStack != null) {
limitStack.close();
limitStack = null;
}
//overhangStack.close();
remoteStack.close();
if (remoteStack != null) {
remoteStack.close();
remoteStack = null;
}
}
public void finalize() {
if ((coreStack != null) || (limitStack != null) || (remoteStack != null)) {
serverLog.logWarning("plasmaCrawlNURL", "NURL stack closed by finalizer");
close();
}
}
public boolean notEmpty() {

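The close()/finalize() rework above makes shutdown idempotent: each balancer reference is nulled once closed, so a finalizer running after an orderly close() is a no-op, while a finalizer that still finds open stacks logs a warning before cleaning up. A minimal sketch of the pattern, using java.io.Closeable as a stand-in for plasmaCrawlBalancer:

    import java.io.Closeable;
    import java.io.IOException;

    public class GuardedStacks {
        private Closeable coreStack; // stands in for plasmaCrawlBalancer

        public GuardedStacks(Closeable coreStack) { this.coreStack = coreStack; }

        public synchronized void close() throws IOException {
            if (coreStack != null) {
                coreStack.close();
                coreStack = null; // makes a second close() a no-op
            }
        }

        protected void finalize() throws Throwable {
            if (coreStack != null) {
                System.err.println("WARNING: stack closed by finalizer"); // should not happen
                close();
            }
        }
    }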
View File

@@ -39,6 +39,7 @@ import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URI;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Hashtable;
@@ -747,7 +748,7 @@ public final class plasmaParser {
}
static Map<yacyURL, String> allReflinks(Set<?> links) {
static Map<yacyURL, String> allReflinks(Collection<?> links) {
// links is either a Collection of Strings (urls) or of htmlFilterImageEntries
// we find all links that are part of a reference inside a url
HashMap<yacyURL, String> v = new HashMap<yacyURL, String>();
@ -786,7 +787,7 @@ public final class plasmaParser {
return v;
}
static Map<yacyURL, String> allSubpaths(Set<?> links) {
static Map<yacyURL, String> allSubpaths(Collection<?> links) {
// links is either a Collection of Strings (urls) or a Collection of htmlFilterImageEntries
HashSet<String> h = new HashSet<String>();
Iterator<?> i = links.iterator();

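Widening allReflinks/allSubpaths from Set<?> to Collection<?> is what allows the plasmaParserDocument hunk further below to pass images.values() directly, since Map.values() returns a Collection but not a Set, while the existing keySet() callers keep working unchanged. A minimal illustration of that point (toy method, not from the patch):

    import java.util.Collection;
    import java.util.HashMap;

    public class CollectionParamSketch {
        static int countLinks(Collection<?> links) { return links.size(); } // was Set<?>
        public static void main(String[] args) {
            HashMap<String, String> m = new HashMap<String, String>();
            m.put("k", "v");
            System.out.println(countLinks(m.keySet())); // a Set is a Collection
            System.out.println(countLinks(m.values())); // values() is only a Collection
        }
    }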
View File

@@ -61,6 +61,7 @@ import java.util.List;
import java.util.Map;
import java.util.TreeSet;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.plasma.parser.Parser;
@@ -76,7 +77,7 @@ public class plasmaParserDocument {
private StringBuffer description; // an abstract, if present: short content description
private Object text; // the clear text, all that is visible
private Map<yacyURL, String> anchors; // all links embedded as clickable entities (anchor tags)
private TreeSet<htmlFilterImageEntry> images; // all visible pictures in document
private HashMap<String, htmlFilterImageEntry> images; // all visible pictures in document
// the anchors and images - Maps are URL-to-EntityDescription mappings.
// The EntityDescription appears either as visible text in anchors or as alternative
// text in image tags.
@@ -89,7 +90,7 @@ public class plasmaParserDocument {
protected plasmaParserDocument(yacyURL location, String mimeType, String charset,
String[] keywords, String title, String author,
String[] sections, String abstrct,
Object text, Map<yacyURL, String> anchors, TreeSet<htmlFilterImageEntry> images) {
Object text, Map<yacyURL, String> anchors, HashMap<String, htmlFilterImageEntry> images) {
this.source = location;
this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType;
this.charset = charset;
@@ -99,7 +100,7 @@
this.sections = (sections == null) ? new LinkedList<String>() : Arrays.asList(sections);
this.description = (abstrct == null) ? new StringBuffer() : new StringBuffer(abstrct);
this.anchors = (anchors == null) ? new HashMap<yacyURL, String>(0) : anchors;
this.images = (images == null) ? new TreeSet<htmlFilterImageEntry>() : images;
this.images = (images == null) ? new HashMap<String, htmlFilterImageEntry>() : images;
this.hyperlinks = null;
this.audiolinks = null;
this.videolinks = null;
@@ -124,21 +125,21 @@
public plasmaParserDocument(yacyURL location, String mimeType, String charset,
String[] keywords, String title, String author,
String[] sections, String abstrct,
byte[] text, Map<yacyURL, String> anchors, TreeSet<htmlFilterImageEntry> images) {
byte[] text, Map<yacyURL, String> anchors, HashMap<String, htmlFilterImageEntry> images) {
this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
}
public plasmaParserDocument(yacyURL location, String mimeType, String charset,
String[] keywords, String title, String author,
String[] sections, String abstrct,
File text, Map<yacyURL, String> anchors, TreeSet<htmlFilterImageEntry> images) {
File text, Map<yacyURL, String> anchors, HashMap<String, htmlFilterImageEntry> images) {
this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
}
public plasmaParserDocument(yacyURL location, String mimeType, String charset,
String[] keywords, String title, String author,
String[] sections, String abstrct,
serverCachedFileOutputStream text, Map<yacyURL, String> anchors, TreeSet<htmlFilterImageEntry> images) {
serverCachedFileOutputStream text, Map<yacyURL, String> anchors, HashMap<String, htmlFilterImageEntry> images) {
this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
}
@@ -310,7 +311,7 @@ dc_rights
return this.videolinks;
}
public TreeSet<htmlFilterImageEntry> getImages() {
public HashMap<String, htmlFilterImageEntry> getImages() {
// returns all links embedded as pictures (visible in document)
// this returns a htmlFilterImageEntry collection
if (!resorted) resortLinks();
@@ -341,7 +342,7 @@ dc_rights
audiolinks = new HashMap<yacyURL, String>();
applinks = new HashMap<yacyURL, String>();
emaillinks = new HashMap<String, String>();
TreeSet<htmlFilterImageEntry> collectedImages = new TreeSet<htmlFilterImageEntry>(); // this is a set that is collected now and joined later to the imagelinks
HashMap<String, htmlFilterImageEntry> collectedImages = new HashMap<String, htmlFilterImageEntry>(); // this is a map that is collected now and joined later to the imagelinks
Map.Entry<yacyURL, String> entry;
while (i.hasNext()) {
entry = i.next();
@@ -361,7 +362,7 @@ dc_rights
if (plasmaParser.mediaExtContains(ext)) {
// this is not a normal anchor, its a media link
if (plasmaParser.imageExtContains(ext)) {
collectedImages.add(new htmlFilterImageEntry(url, (String) entry.getValue(), -1, -1));
htmlFilterContentScraper.addImage(collectedImages, new htmlFilterImageEntry(url, (String) entry.getValue(), -1, -1));
}
else if (plasmaParser.audioExtContains(ext)) audiolinks.put(url, (String)entry.getValue());
else if (plasmaParser.videoExtContains(ext)) videolinks.put(url, (String)entry.getValue());
@@ -374,23 +375,18 @@ dc_rights
}
// add image links that we collected from the anchors to the image map
Iterator<htmlFilterImageEntry> j = collectedImages.iterator();
htmlFilterImageEntry iEntry;
while (j.hasNext()) {
iEntry = (htmlFilterImageEntry) j.next();
if (!images.contains(iEntry)) images.add(iEntry);
}
htmlFilterContentScraper.addAllImages(images, collectedImages);
// expand the hyperlinks:
// we add artificial hyperlinks to the hyperlink set
// that can be calculated from given hyperlinks and imagelinks
hyperlinks.putAll(plasmaParser.allReflinks(images));
hyperlinks.putAll(plasmaParser.allReflinks(images.values()));
hyperlinks.putAll(plasmaParser.allReflinks(audiolinks.keySet()));
hyperlinks.putAll(plasmaParser.allReflinks(videolinks.keySet()));
hyperlinks.putAll(plasmaParser.allReflinks(applinks.keySet()));
hyperlinks.putAll(plasmaParser.allSubpaths(hyperlinks.keySet()));
hyperlinks.putAll(plasmaParser.allSubpaths(images));
hyperlinks.putAll(plasmaParser.allSubpaths(images.values()));
hyperlinks.putAll(plasmaParser.allSubpaths(audiolinks.keySet()));
hyperlinks.putAll(plasmaParser.allSubpaths(videolinks.keySet()));
hyperlinks.putAll(plasmaParser.allSubpaths(applinks.keySet()));
@@ -417,7 +413,7 @@ dc_rights
serverFileUtils.copy(doc.getText(), (serverCachedFileOutputStream)this.text);
anchors.putAll(doc.getAnchors());
images.addAll(doc.getImages());
htmlFilterContentScraper.addAllImages(images, doc.getImages());
}
/**

View File

@@ -43,9 +43,10 @@ package de.anomic.plasma;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.TreeSet;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.plasma.parser.ParserException;
import de.anomic.server.serverDate;
@@ -53,11 +54,11 @@ import de.anomic.yacy.yacyURL;
public final class plasmaSearchImages {
private TreeSet<htmlFilterImageEntry> images;
private HashMap<String, htmlFilterImageEntry> images;
public plasmaSearchImages(long maxTime, yacyURL url, int depth) {
long start = System.currentTimeMillis();
this.images = new TreeSet<htmlFilterImageEntry>();
this.images = new HashMap<String, htmlFilterImageEntry>();
if (maxTime > 10) {
Object[] resource = plasmaSnippetCache.getResource(url, true, (int) maxTime, false);
InputStream res = (InputStream) resource[0];
@@ -75,7 +76,7 @@ public final class plasmaSearchImages {
if (document == null) return;
// add the image links
this.addAll(document.getImages());
htmlFilterContentScraper.addAllImages(this.images, document.getImages());
// add also links from pages one step deeper, if depth > 0
if (depth > 0) {
@@ -97,26 +98,13 @@
public void addAll(plasmaSearchImages m) {
synchronized (m.images) {
addAll(m.images);
}
}
private void addAll(TreeSet<htmlFilterImageEntry> ts) {
Iterator<htmlFilterImageEntry> i = ts.iterator();
htmlFilterImageEntry ie;
while (i.hasNext()) {
ie = i.next();
if (images.contains(ie)) {
if ((ie.height() > 0) && (ie.width() > 0)) images.add(ie);
} else {
images.add(ie);
}
htmlFilterContentScraper.addAllImages(this.images, m.images);
}
}
public Iterator<htmlFilterImageEntry> entries() {
// returns htmlFilterImageEntry - Objects
return images.iterator();
return images.values().iterator();
}
}

View File

@@ -697,7 +697,8 @@ public class plasmaSnippetCache {
public static ArrayList<MediaSnippet> computeImageSnippets(plasmaParserDocument document, Set<String> queryhashes) {
TreeSet<htmlFilterImageEntry> images = document.getImages(); // iterates images in descending size order!
TreeSet<htmlFilterImageEntry> images = new TreeSet<htmlFilterImageEntry>();
images.addAll(document.getImages().values()); // iterates images in descending size order!
// a measure of the image size can be retrieved using htmlFilterImageEntry.hashCode()
Iterator<htmlFilterImageEntry> i = images.iterator();
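Since getImages() now returns an unordered HashMap, the size-descending iteration that the snippet scorer relies on is restored by pouring the values into a TreeSet. A self-contained sketch of that re-sort, assuming (as the comment above indicates) that the entry type orders by image size (SizedEntry is a stand-in for htmlFilterImageEntry):

    import java.util.HashMap;
    import java.util.TreeSet;

    class SizedEntry implements Comparable<SizedEntry> {
        final String urlHash; final int width, height;
        SizedEntry(String urlHash, int width, int height) {
            this.urlHash = urlHash; this.width = width; this.height = height;
        }
        public int compareTo(SizedEntry o) {
            return Integer.compare(o.width * o.height, width * height); // larger images first
        }
    }

    public class SnippetOrderSketch {
        public static void main(String[] args) {
            HashMap<String, SizedEntry> images = new HashMap<String, SizedEntry>();
            images.put("a", new SizedEntry("a", 16, 16));
            images.put("b", new SizedEntry("b", 640, 480));
            TreeSet<SizedEntry> ordered = new TreeSet<SizedEntry>(images.values());
            System.out.println(ordered.first().urlHash); // "b": largest image first
        }
    }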