mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
- enhanced recognition, parsing, management and double-occurrence-handling of image tags
- enhanced text parser (condenser): found and eliminated bad code parts; increase of speed
- added handling of image preview using the image cache from HTCACHE
- some other minor changes

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4507 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
fcc919964b
commit
87a8747ce3
|
@ -54,6 +54,7 @@ import java.io.File;
|
|||
import java.io.FilenameFilter;
|
||||
import java.io.IOException;
|
||||
import java.io.Writer;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.TreeSet;
|
||||
|
@ -313,8 +314,8 @@ public class CacheAdmin_p {
|
|||
prop.put("info_type_use." + extension, (i == 0) ? 0 : 1);
|
||||
}
|
||||
|
||||
private static void formatImageAnchor(serverObjects prop, TreeSet<htmlFilterImageEntry> anchor) {
|
||||
final Iterator<htmlFilterImageEntry> iter = anchor.iterator();
|
||||
private static void formatImageAnchor(serverObjects prop, HashMap<String, htmlFilterImageEntry> anchor) {
|
||||
final Iterator<htmlFilterImageEntry> iter = anchor.values().iterator();
|
||||
htmlFilterImageEntry ie;
|
||||
prop.put("info_type_use.images_images", anchor.size());
|
||||
int i = 0;
|
||||
|
|
|
@ -49,9 +49,9 @@ import java.io.InputStream;
|
|||
import java.io.UnsupportedEncodingException;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URLDecoder;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import de.anomic.data.htmlTools;
|
||||
import de.anomic.htmlFilter.htmlFilterImageEntry;
|
||||
|
@ -339,8 +339,8 @@ public class ViewFile {
|
|||
i += putMediaInfo(prop, wordArray, i, document.getAudiolinks(), "audio", (i % 2 == 0));
|
||||
dark = (i % 2 == 0);
|
||||
|
||||
TreeSet<htmlFilterImageEntry> ts = document.getImages();
|
||||
Iterator<htmlFilterImageEntry> tsi = ts.iterator();
|
||||
HashMap<String, htmlFilterImageEntry> ts = document.getImages();
|
||||
Iterator<htmlFilterImageEntry> tsi = ts.values().iterator();
|
||||
htmlFilterImageEntry entry;
|
||||
while (tsi.hasNext()) {
|
||||
entry = tsi.next();
|
||||
|
|
|
@ -135,47 +135,47 @@ public class ViewImage {
|
|||
// find original size
|
||||
int h = image.getHeight(null);
|
||||
int w = image.getWidth(null);
|
||||
|
||||
// System.out.println("DEBUG: get access to image " +
|
||||
// url.toNormalform() + " is " + ((auth) ? "authorized" : "NOT
|
||||
// authorized"));
|
||||
|
||||
|
||||
// in case of not-authorized access shrink the image to prevent
|
||||
// copyright problems
|
||||
// so that images are not larger than thumbnails
|
||||
if ((!auth) && ((w > 16) || (h > 16))) {
|
||||
// copyright problems, so that images are not larger than thumbnails
|
||||
if (auth) {
|
||||
maxwidth = (maxwidth == 0) ? w : maxwidth;
|
||||
maxheight = (maxheight == 0) ? h : maxheight;
|
||||
} else if ((w > 16) || (h > 16)) {
|
||||
maxwidth = (int) Math.min(64.0, w * 0.6);
|
||||
maxheight = (int) Math.min(64.0, h * 0.6);
|
||||
} else {
|
||||
maxwidth = 16;
|
||||
maxheight = 16;
|
||||
}
|
||||
|
||||
// calculate width & height from maxwidth & maxheight
|
||||
if ((maxwidth != 0) || (maxheight != 0)) {
|
||||
if ((maxwidth < w) || (maxheight < h)) {
|
||||
// scale image
|
||||
double hs = (w <= maxwidth) ? 1.0 : ((double) maxwidth) / ((double) w);
|
||||
double vs = (h <= maxheight) ? 1.0 : ((double) maxheight) / ((double) h);
|
||||
double scale = Math.min(hs, vs);
|
||||
if (!auth) scale = Math.min(scale, 0.6); // this is for copyright purpose
|
||||
if (scale < 1.0) {
|
||||
width = (int) (w * scale);
|
||||
height = (int) (h * scale);
|
||||
width = Math.max(1, (int) (w * scale));
|
||||
height = Math.max(1, (int) (h * scale));
|
||||
} else {
|
||||
width = w;
|
||||
height = h;
|
||||
width = Math.max(1, w);
|
||||
height = Math.max(1, h);
|
||||
}
|
||||
|
||||
// compute scaled image
|
||||
scaled = ((w == width) && (h == height)) ? image : image.getScaledInstance(width, height, Image.SCALE_AREA_AVERAGING);
|
||||
MediaTracker mediaTracker = new MediaTracker(new Container());
|
||||
mediaTracker.addImage(scaled, 0);
|
||||
try {mediaTracker.waitForID(0);} catch (InterruptedException e) {}
|
||||
} else {
|
||||
// do not scale
|
||||
width = w;
|
||||
height = h;
|
||||
scaled = image;
|
||||
}
|
||||
|
||||
// check for minimum values
|
||||
width = Math.max(width, 1);
|
||||
height = Math.max(height, 1);
|
||||
|
||||
// scale image
|
||||
scaled = ((w == width) && (h == height)) ? image : image.getScaledInstance(width, height, Image.SCALE_AREA_AVERAGING);
|
||||
MediaTracker mediaTracker = new MediaTracker(new Container());
|
||||
mediaTracker.addImage(scaled, 0);
|
||||
try {mediaTracker.waitForID(0);} catch (InterruptedException e) {}
|
||||
|
||||
if ((height == 16) && (width == 16) && (resource != null)) {
|
||||
// this might be a favicon, store image to cache for faster re-load later on
|
||||
iconcache.put(urlString, scaled);
|
||||
|
|
|
@ -22,7 +22,7 @@
|
|||
::
|
||||
#{items}#
|
||||
<div class="thumbcontainer">
|
||||
<a href="#[href]#" class="thumblink" onclick="return hs.expand(this)">
|
||||
<a href="#[hrefCache]#" class="thumblink" onclick="return hs.expand(this)">
|
||||
<img src="/ViewImage.png?maxwidth=96&maxheight=96&code=#[code]#" alt="#[name]#">
|
||||
</a>
|
||||
<div class="highslide-caption"><a href="#[href]#">#[name]#<br \><a href="#[source]#">#[sourcedom]#</a></a></div>
|
||||
|
|
|
@ -67,6 +67,7 @@ public class yacysearchitem {
|
|||
boolean rss = post.get("rss", "false").equals("true");
|
||||
boolean authenticated = sb.adminAuthenticated(header) >= 2;
|
||||
int item = post.getInt("item", -1);
|
||||
boolean auth = ((String) header.get(httpHeader.CONNECTION_PROP_CLIENTIP, "")).equals("localhost") || sb.verifyAuthentication(header, true);
|
||||
|
||||
// default settings for blank item
|
||||
prop.put("content", "0");
|
||||
|
@ -233,6 +234,7 @@ public class yacysearchitem {
|
|||
if (ms == null) {
|
||||
prop.put("content_items", "0");
|
||||
} else {
|
||||
prop.putHTML("content_items_0_hrefCache", (auth) ? "/ViewImage.png?url=" + ms.href.toNormalform(true, false) : ms.href.toNormalform(true, false));
|
||||
prop.putHTML("content_items_0_href", ms.href.toNormalform(true, false));
|
||||
prop.put("content_items_0_code", sb.licensedURLs.aquireLicense(ms.href));
|
||||
prop.putHTML("content_items_0_name", shorten(ms.name, namelength));
|
||||
|
|
|
@ -54,6 +54,7 @@ import java.net.MalformedURLException;
|
|||
import java.text.Collator;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
|
@ -102,7 +103,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
|
|||
|
||||
// class variables: collectors for links
|
||||
private HashMap<yacyURL, String> anchors;
|
||||
private TreeSet<htmlFilterImageEntry> images; // String(absolute url)/ImageEntry relation
|
||||
private HashMap<String, htmlFilterImageEntry> images; // urlhash/image relation
|
||||
private HashMap<String, String> metas;
|
||||
private String title;
|
||||
//private String headline;
|
||||
|
@ -127,7 +128,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
|
|||
super(linkTags0, linkTags1);
|
||||
this.root = root;
|
||||
this.anchors = new HashMap<yacyURL, String>();
|
||||
this.images = new TreeSet<htmlFilterImageEntry>();
|
||||
this.images = new HashMap<String, htmlFilterImageEntry>();
|
||||
this.metas = new HashMap<String, String>();
|
||||
this.title = "";
|
||||
this.headlines = new ArrayList[4];
|
||||
|
@ -178,7 +179,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
|
|||
} catch (NumberFormatException e) {}
|
||||
yacyURL url = absolutePath(tagopts.getProperty("src", ""));
|
||||
htmlFilterImageEntry ie = new htmlFilterImageEntry(url, tagopts.getProperty("alt",""), width, height);
|
||||
images.add(ie);
|
||||
addImage(images, ie);
|
||||
}
|
||||
if (tagname.equalsIgnoreCase("base")) try {
|
||||
root = new yacyURL(tagopts.getProperty("href", ""), null);
|
||||
|
@ -212,7 +213,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
|
|||
|
||||
if (type.equalsIgnoreCase("shortcut icon")) {
|
||||
htmlFilterImageEntry ie = new htmlFilterImageEntry(newLink, linktitle, -1,-1);
|
||||
images.add(ie);
|
||||
images.put(ie.url().hash(), ie);
|
||||
this.favicon = newLink;
|
||||
} else if (!type.equalsIgnoreCase("stylesheet") && !type.equalsIgnoreCase("alternate stylesheet")) {
|
||||
anchors.put(newLink, linktitle);
|
||||
|
@ -234,12 +235,24 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
|
|||
// fire event
|
||||
fireScrapeTag0(tagname, tagopts);
|
||||
}
|
||||
|
||||
|
||||
public void scrapeTag1(String tagname, Properties tagopts, char[] text) {
|
||||
// System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + new String(text));
|
||||
if ((tagname.equalsIgnoreCase("a")) && (text.length < 2048)) {
|
||||
String href = tagopts.getProperty("href", "");
|
||||
if (href.length() > 0) anchors.put(absolutePath(href), super.stripAll(new serverCharBuffer(text)).trim().toString());
|
||||
if (href.length() > 0) {
|
||||
yacyURL url = absolutePath(href);
|
||||
String f = url.getFile();
|
||||
int p = f.lastIndexOf('.');
|
||||
String type = (p < 0) ? "" : f.substring(p + 1);
|
||||
if (type.equals("png") || type.equals("gif") || type.equals("jpg") || type.equals("jpeg")) {
|
||||
// special handling of such urls: put them to the image urls
|
||||
htmlFilterImageEntry ie = new htmlFilterImageEntry(url, super.stripAll(new serverCharBuffer(text)).trim().toString(), -1, -1);
|
||||
addImage(images, ie);
|
||||
} else {
|
||||
anchors.put(url, super.stripAll(new serverCharBuffer(text)).trim().toString());
|
||||
}
|
||||
}
|
||||
}
|
||||
String h;
|
||||
if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) {
|
||||
|
@ -348,7 +361,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
|
|||
return anchors;
|
||||
}
|
||||
|
||||
public TreeSet<htmlFilterImageEntry> getImages() {
|
||||
public HashMap<String, htmlFilterImageEntry> getImages() {
|
||||
// this returns a String(absolute url)/htmlFilterImageEntry - relation
|
||||
return images;
|
||||
}
|
||||
|
@ -522,5 +535,24 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
|
|||
|
||||
return scraper;
|
||||
}
|
||||
|
||||
public static void addAllImages(HashMap<String, htmlFilterImageEntry> a, HashMap<String, htmlFilterImageEntry> b) {
|
||||
Iterator<Map.Entry<String, htmlFilterImageEntry>> i = b.entrySet().iterator();
|
||||
Map.Entry<String, htmlFilterImageEntry> ie;
|
||||
while (i.hasNext()) {
|
||||
ie = i.next();
|
||||
addImage(a, ie.getValue());
|
||||
}
|
||||
}
|
||||
|
||||
public static void addImage(HashMap<String, htmlFilterImageEntry> a, htmlFilterImageEntry ie) {
|
||||
if (a.containsKey(ie.url().hash())) {
|
||||
// in case of a collision, take that image that has the better image size tags
|
||||
if ((ie.height() > 0) && (ie.width() > 0)) a.put(ie.url().hash(), ie);
|
||||
} else {
|
||||
a.put(ie.url().hash(), ie);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -452,6 +452,7 @@ public final class httpdFileHandler {
|
|||
sb.append("<html>\n<head>\n</head>\n<body>\n<h1>Index of " + path + "</h1>\n <ul>\n");
|
||||
File dir = new File(htDocsPath, path);
|
||||
String[] list = dir.list();
|
||||
if (list == null) list = new String[0]; // should not occur!
|
||||
File f;
|
||||
String size;
|
||||
long sz;
|
||||
|
|
|
@ -50,7 +50,6 @@ import java.util.HashMap;
|
|||
import java.util.Hashtable;
|
||||
import java.util.LinkedList;
|
||||
import java.util.Map;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import de.anomic.htmlFilter.htmlFilterAbstractScraper;
|
||||
import de.anomic.htmlFilter.htmlFilterContentScraper;
|
||||
|
@ -97,7 +96,7 @@ public class rssParser extends AbstractParser implements Parser {
|
|||
try {
|
||||
LinkedList<String> feedSections = new LinkedList<String>();
|
||||
HashMap<yacyURL, String> anchors = new HashMap<yacyURL, String>();
|
||||
TreeSet<htmlFilterImageEntry> images = new TreeSet<htmlFilterImageEntry>();
|
||||
HashMap<String, htmlFilterImageEntry> images = new HashMap<String, htmlFilterImageEntry>();
|
||||
serverByteBuffer text = new serverByteBuffer();
|
||||
serverCharBuffer authors = new serverCharBuffer();
|
||||
|
||||
|
@ -114,7 +113,8 @@ public class rssParser extends AbstractParser implements Parser {
|
|||
String feedDescription = reader.getChannel().getDescription();
|
||||
|
||||
if (reader.getImage() != null) {
|
||||
images.add(new htmlFilterImageEntry(new yacyURL(reader.getImage(), null), feedTitle, -1, -1));
|
||||
yacyURL imgURL = new yacyURL(reader.getImage(), null);
|
||||
images.put(imgURL.hash(), new htmlFilterImageEntry(imgURL, feedTitle, -1, -1));
|
||||
}
|
||||
|
||||
// loop through the feed items
|
||||
|
@ -154,9 +154,9 @@ public class rssParser extends AbstractParser implements Parser {
|
|||
anchors.putAll(itemLinks);
|
||||
}
|
||||
|
||||
TreeSet<htmlFilterImageEntry> itemImages = scraper.getImages();
|
||||
HashMap<String, htmlFilterImageEntry> itemImages = scraper.getImages();
|
||||
if ((itemImages != null) && (itemImages.size() > 0)) {
|
||||
images.addAll(itemImages);
|
||||
htmlFilterContentScraper.addAllImages(images, itemImages);
|
||||
}
|
||||
|
||||
byte[] extractedText = scraper.getText();
|
||||
|
|
|
@ -53,12 +53,12 @@ import java.util.HashMap;
|
|||
import java.util.Hashtable;
|
||||
import java.util.LinkedList;
|
||||
import java.util.Map;
|
||||
import java.util.TreeSet;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
import com.ice.tar.TarEntry;
|
||||
import com.ice.tar.TarInputStream;
|
||||
|
||||
import de.anomic.htmlFilter.htmlFilterContentScraper;
|
||||
import de.anomic.htmlFilter.htmlFilterImageEntry;
|
||||
import de.anomic.plasma.plasmaParser;
|
||||
import de.anomic.plasma.plasmaParserDocument;
|
||||
|
@ -132,7 +132,7 @@ public class tarParser extends AbstractParser implements Parser {
|
|||
StringBuffer docAbstrct = new StringBuffer();
|
||||
|
||||
Map<yacyURL, String> docAnchors = new HashMap<yacyURL, String>();
|
||||
TreeSet<htmlFilterImageEntry> docImages = new TreeSet<htmlFilterImageEntry>();
|
||||
HashMap<String, htmlFilterImageEntry> docImages = new HashMap<String, htmlFilterImageEntry>();
|
||||
|
||||
// looping through the contained files
|
||||
TarEntry entry;
|
||||
|
@ -193,7 +193,7 @@ public class tarParser extends AbstractParser implements Parser {
|
|||
}
|
||||
|
||||
docAnchors.putAll(subDoc.getAnchors());
|
||||
docImages.addAll(subDoc.getImages());
|
||||
htmlFilterContentScraper.addAllImages(docImages, subDoc.getImages());
|
||||
|
||||
// release subdocument
|
||||
subDoc.close();
|
||||
|
|
|
@ -53,10 +53,10 @@ import java.util.HashMap;
|
|||
import java.util.Hashtable;
|
||||
import java.util.LinkedList;
|
||||
import java.util.Map;
|
||||
import java.util.TreeSet;
|
||||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipInputStream;
|
||||
|
||||
import de.anomic.htmlFilter.htmlFilterContentScraper;
|
||||
import de.anomic.htmlFilter.htmlFilterImageEntry;
|
||||
import de.anomic.plasma.plasmaParser;
|
||||
import de.anomic.plasma.plasmaParserDocument;
|
||||
|
@ -115,7 +115,7 @@ public class zipParser extends AbstractParser implements Parser {
|
|||
LinkedList<String> docSections = new LinkedList<String>();
|
||||
StringBuffer docAbstrct = new StringBuffer();
|
||||
Map<yacyURL, String> docAnchors = new HashMap<yacyURL, String>();
|
||||
TreeSet<htmlFilterImageEntry> docImages = new TreeSet<htmlFilterImageEntry>();
|
||||
HashMap<String, htmlFilterImageEntry> docImages = new HashMap<String, htmlFilterImageEntry>();
|
||||
|
||||
// creating a new parser class to parse the unzipped content
|
||||
plasmaParser theParser = new plasmaParser();
|
||||
|
@ -176,7 +176,7 @@ public class zipParser extends AbstractParser implements Parser {
|
|||
}
|
||||
|
||||
docAnchors.putAll(subDoc.getAnchors());
|
||||
docImages.addAll(subDoc.getImages());
|
||||
htmlFilterContentScraper.addAllImages(docImages, subDoc.getImages());
|
||||
|
||||
// release subdocument
|
||||
subDoc.close();
|
||||
|
|
|
@ -107,6 +107,19 @@ public final class plasmaCondenser {
|
|||
|
||||
private final static int numlength = 5;
|
||||
|
||||
// initialize array of invisible characters
|
||||
private static boolean[] invisibleChar = new boolean['z' - ' ' + 1];
|
||||
static {
|
||||
// initialize array of invisible characters
|
||||
String invisibleString = "\"$%&/()=`^+*#'-_:;,<>[]\\";
|
||||
for (int i = ' '; i <= 'z'; i++) {
|
||||
invisibleChar[i - ' '] = false;
|
||||
}
|
||||
for (int i = 0; i < invisibleString.length(); i++) {
|
||||
invisibleChar[invisibleString.charAt(i) - ' '] = true;
|
||||
}
|
||||
}
|
||||
|
||||
//private Properties analysis;
|
||||
private TreeMap<String, wordStatProp> words; // a string (the words) to (wordStatProp) - relation
|
||||
private HashMap<StringBuffer, phraseStatProp> sentences;
|
||||
|
@ -198,7 +211,7 @@ public final class plasmaCondenser {
|
|||
}
|
||||
|
||||
// images
|
||||
Iterator<htmlFilterImageEntry> j = document.getImages().iterator();
|
||||
Iterator<htmlFilterImageEntry> j = document.getImages().values().iterator();
|
||||
htmlFilterImageEntry ientry;
|
||||
while (j.hasNext()) {
|
||||
ientry = j.next();
|
||||
|
@ -659,7 +672,7 @@ public final class plasmaCondenser {
|
|||
public final static boolean invisible(char c) {
|
||||
// TODO: Bugfix for UTF-8: does this work for non ISO-8859-1 chars?
|
||||
if ((c < ' ') || (c > 'z')) return true;
|
||||
return ("$%&/()=\"$%&/()=`^+*~#'-_:;,|<>[]\\".indexOf(c) >= 0);
|
||||
return invisibleChar[c - ' '];
|
||||
}
|
||||
|
||||
public static Enumeration<StringBuffer> wordTokenizer(String s, String charset, int minLength) {
|
||||
|
@ -727,7 +740,7 @@ public final class plasmaCondenser {
|
|||
|
||||
public unsievedWordsEnum(InputStream is, String charset) throws UnsupportedEncodingException {
|
||||
e = new sentencesFromInputStreamEnum(is, charset);
|
||||
s = new StringBuffer();
|
||||
s = new StringBuffer(20);
|
||||
buffer = nextElement0();
|
||||
}
|
||||
|
||||
|
@ -859,9 +872,9 @@ public final class plasmaCondenser {
|
|||
}
|
||||
|
||||
static StringBuffer readSentence(Reader reader, boolean pre) throws IOException {
|
||||
StringBuffer s = new StringBuffer();
|
||||
StringBuffer s = new StringBuffer(20);
|
||||
int nextChar;
|
||||
char c;
|
||||
char c, lc = (char) 0;
|
||||
|
||||
// find sentence end
|
||||
for (;;) {
|
||||
|
@ -871,20 +884,14 @@ public final class plasmaCondenser {
|
|||
if (s.length() == 0) return null; else break;
|
||||
}
|
||||
c = (char) nextChar;
|
||||
if (pre && ((c == (char) 10) || (c == (char) 13))) break;
|
||||
if ((c == (char) 8) || (c == (char) 10) || (c == (char) 13)) c = ' ';
|
||||
if ((lc == ' ') && (c == ' ')) continue; // ignore double spaces
|
||||
s.append(c);
|
||||
if (pre) {
|
||||
if ((c == (char) 10) || (c == (char) 13)) break;
|
||||
} else {
|
||||
if (htmlFilterContentScraper.punctuation(c)) break;
|
||||
}
|
||||
if (htmlFilterContentScraper.punctuation(c)) break;
|
||||
lc = c;
|
||||
}
|
||||
|
||||
// replace line endings and tabs by blanks
|
||||
for (int i = 0; i < s.length(); i++) {
|
||||
if ((s.charAt(i) == (char) 10) || (s.charAt(i) == (char) 13) || (s.charAt(i) == (char) 8)) s.setCharAt(i, ' ');
|
||||
}
|
||||
// remove all double-spaces
|
||||
int p; while ((p = s.indexOf(" ")) >= 0) s.deleteCharAt(p);
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
|
|
|
@ -130,7 +130,10 @@ public class plasmaCrawlBalancer {
|
|||
}
|
||||
|
||||
public void finalize() {
|
||||
if (urlFileStack != null) close();
|
||||
if (urlFileStack != null) {
|
||||
serverLog.logWarning("plasmaCrawlBalancer", "crawl stack " + stackname + " closed by finalizer");
|
||||
close();
|
||||
}
|
||||
}
|
||||
|
||||
public synchronized void clear() {
|
||||
|
|
|
@ -49,6 +49,8 @@ import java.util.ArrayList;
|
|||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
|
||||
import de.anomic.server.logging.serverLog;
|
||||
|
||||
public class plasmaCrawlNURL {
|
||||
|
||||
public static final int STACK_TYPE_NULL = 0; // do not stack
|
||||
|
@ -64,9 +66,9 @@ public class plasmaCrawlNURL {
|
|||
private static final long minimumGlobalDelta = 500; // the minimum time difference between access of the same global domain
|
||||
private static final long maximumDomAge = 60000; // the maximum age of a domain until it is used for another crawl attempt
|
||||
|
||||
private final plasmaCrawlBalancer coreStack; // links found by crawling to depth-1
|
||||
private final plasmaCrawlBalancer limitStack; // links found by crawling at target depth
|
||||
private final plasmaCrawlBalancer remoteStack; // links from remote crawl orders
|
||||
private plasmaCrawlBalancer coreStack; // links found by crawling to depth-1
|
||||
private plasmaCrawlBalancer limitStack; // links found by crawling at target depth
|
||||
private plasmaCrawlBalancer remoteStack; // links from remote crawl orders
|
||||
//private final plasmaCrawlBalancer overhangStack; // links found by crawling at depth+1
|
||||
//private kelondroStack imageStack; // links pointing to image resources
|
||||
//private kelondroStack movieStack; // links pointing to movie resources
|
||||
|
@ -81,10 +83,26 @@ public class plasmaCrawlNURL {
|
|||
}
|
||||
|
||||
public void close() {
|
||||
coreStack.close();
|
||||
limitStack.close();
|
||||
if (coreStack != null) {
|
||||
coreStack.close();
|
||||
coreStack = null;
|
||||
}
|
||||
if (limitStack != null) {
|
||||
limitStack.close();
|
||||
limitStack = null;
|
||||
}
|
||||
//overhangStack.close();
|
||||
remoteStack.close();
|
||||
if (remoteStack != null) {
|
||||
remoteStack.close();
|
||||
remoteStack = null;
|
||||
}
|
||||
}
|
||||
|
||||
public void finalize() {
|
||||
if ((coreStack != null) || (limitStack != null) || (remoteStack != null)) {
|
||||
serverLog.logWarning("plasmaCrawlNURL", "NURL stack closed by finalizer");
|
||||
close();
|
||||
}
|
||||
}
|
||||
|
||||
public boolean notEmpty() {
|
||||
|
|
|
@ -39,6 +39,7 @@ import java.io.UnsupportedEncodingException;
|
|||
import java.net.MalformedURLException;
|
||||
import java.net.URI;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Hashtable;
|
||||
|
@ -747,7 +748,7 @@ public final class plasmaParser {
|
|||
|
||||
}
|
||||
|
||||
static Map<yacyURL, String> allReflinks(Set<?> links) {
|
||||
static Map<yacyURL, String> allReflinks(Collection<?> links) {
|
||||
// links is either a Set of Strings (with urls) or htmlFilterImageEntries
|
||||
// we find all links that are part of a reference inside a url
|
||||
HashMap<yacyURL, String> v = new HashMap<yacyURL, String>();
|
||||
|
@ -786,7 +787,7 @@ public final class plasmaParser {
|
|||
return v;
|
||||
}
|
||||
|
||||
static Map<yacyURL, String> allSubpaths(Set<?> links) {
|
||||
static Map<yacyURL, String> allSubpaths(Collection<?> links) {
|
||||
// links is either a Set of Strings (urls) or a Set of htmlFilterImageEntries
|
||||
HashSet<String> h = new HashSet<String>();
|
||||
Iterator<?> i = links.iterator();
|
||||
|
|
|
@ -61,6 +61,7 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import de.anomic.htmlFilter.htmlFilterContentScraper;
|
||||
import de.anomic.htmlFilter.htmlFilterImageEntry;
|
||||
import de.anomic.plasma.parser.Parser;
|
||||
|
||||
|
@ -76,7 +77,7 @@ public class plasmaParserDocument {
|
|||
private StringBuffer description; // an abstract, if present: short content description
|
||||
private Object text; // the clear text, all that is visible
|
||||
private Map<yacyURL, String> anchors; // all links embedded as clickeable entities (anchor tags)
|
||||
private TreeSet<htmlFilterImageEntry> images; // all visible pictures in document
|
||||
private HashMap<String, htmlFilterImageEntry> images; // all visible pictures in document
|
||||
// the anchors and images - Maps are URL-to-EntityDescription mappings.
|
||||
// The EntityDescription appear either as visible text in anchors or as alternative
|
||||
// text in image tags.
|
||||
|
@ -89,7 +90,7 @@ public class plasmaParserDocument {
|
|||
protected plasmaParserDocument(yacyURL location, String mimeType, String charset,
|
||||
String[] keywords, String title, String author,
|
||||
String[] sections, String abstrct,
|
||||
Object text, Map<yacyURL, String> anchors, TreeSet<htmlFilterImageEntry> images) {
|
||||
Object text, Map<yacyURL, String> anchors, HashMap<String, htmlFilterImageEntry> images) {
|
||||
this.source = location;
|
||||
this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType;
|
||||
this.charset = charset;
|
||||
|
@ -99,7 +100,7 @@ public class plasmaParserDocument {
|
|||
this.sections = (sections == null) ? new LinkedList<String>() : Arrays.asList(sections);
|
||||
this.description = (abstrct == null) ? new StringBuffer() : new StringBuffer(abstrct);
|
||||
this.anchors = (anchors == null) ? new HashMap<yacyURL, String>(0) : anchors;
|
||||
this.images = (images == null) ? new TreeSet<htmlFilterImageEntry>() : images;
|
||||
this.images = (images == null) ? new HashMap<String, htmlFilterImageEntry>() : images;
|
||||
this.hyperlinks = null;
|
||||
this.audiolinks = null;
|
||||
this.videolinks = null;
|
||||
|
@ -124,21 +125,21 @@ public class plasmaParserDocument {
|
|||
public plasmaParserDocument(yacyURL location, String mimeType, String charset,
|
||||
String[] keywords, String title, String author,
|
||||
String[] sections, String abstrct,
|
||||
byte[] text, Map<yacyURL, String> anchors, TreeSet<htmlFilterImageEntry> images) {
|
||||
byte[] text, Map<yacyURL, String> anchors, HashMap<String, htmlFilterImageEntry> images) {
|
||||
this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
|
||||
}
|
||||
|
||||
public plasmaParserDocument(yacyURL location, String mimeType, String charset,
|
||||
String[] keywords, String title, String author,
|
||||
String[] sections, String abstrct,
|
||||
File text, Map<yacyURL, String> anchors, TreeSet<htmlFilterImageEntry> images) {
|
||||
File text, Map<yacyURL, String> anchors, HashMap<String, htmlFilterImageEntry> images) {
|
||||
this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
|
||||
}
|
||||
|
||||
public plasmaParserDocument(yacyURL location, String mimeType, String charset,
|
||||
String[] keywords, String title, String author,
|
||||
String[] sections, String abstrct,
|
||||
serverCachedFileOutputStream text, Map<yacyURL, String> anchors, TreeSet<htmlFilterImageEntry> images) {
|
||||
serverCachedFileOutputStream text, Map<yacyURL, String> anchors, HashMap<String, htmlFilterImageEntry> images) {
|
||||
this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
|
||||
}
|
||||
|
||||
|
@ -310,7 +311,7 @@ dc_rights
|
|||
return this.videolinks;
|
||||
}
|
||||
|
||||
public TreeSet<htmlFilterImageEntry> getImages() {
|
||||
public HashMap<String, htmlFilterImageEntry> getImages() {
|
||||
// returns all links embedded as pictures (visible in document)
|
||||
// this returns a htmlFilterImageEntry collection
|
||||
if (!resorted) resortLinks();
|
||||
|
@ -341,7 +342,7 @@ dc_rights
|
|||
audiolinks = new HashMap<yacyURL, String>();
|
||||
applinks = new HashMap<yacyURL, String>();
|
||||
emaillinks = new HashMap<String, String>();
|
||||
TreeSet<htmlFilterImageEntry> collectedImages = new TreeSet<htmlFilterImageEntry>(); // this is a set that is collected now and joined later to the imagelinks
|
||||
HashMap<String, htmlFilterImageEntry> collectedImages = new HashMap<String, htmlFilterImageEntry>(); // this is a set that is collected now and joined later to the imagelinks
|
||||
Map.Entry<yacyURL, String> entry;
|
||||
while (i.hasNext()) {
|
||||
entry = i.next();
|
||||
|
@ -361,7 +362,7 @@ dc_rights
|
|||
if (plasmaParser.mediaExtContains(ext)) {
|
||||
// this is not a normal anchor, its a media link
|
||||
if (plasmaParser.imageExtContains(ext)) {
|
||||
collectedImages.add(new htmlFilterImageEntry(url, (String) entry.getValue(), -1, -1));
|
||||
htmlFilterContentScraper.addImage(collectedImages, new htmlFilterImageEntry(url, (String) entry.getValue(), -1, -1));
|
||||
}
|
||||
else if (plasmaParser.audioExtContains(ext)) audiolinks.put(url, (String)entry.getValue());
|
||||
else if (plasmaParser.videoExtContains(ext)) videolinks.put(url, (String)entry.getValue());
|
||||
|
@ -374,23 +375,18 @@ dc_rights
|
|||
}
|
||||
|
||||
// add image links that we collected from the anchors to the image map
|
||||
Iterator<htmlFilterImageEntry> j = collectedImages.iterator();
|
||||
htmlFilterImageEntry iEntry;
|
||||
while (j.hasNext()) {
|
||||
iEntry = (htmlFilterImageEntry) j.next();
|
||||
if (!images.contains(iEntry)) images.add(iEntry);
|
||||
}
|
||||
htmlFilterContentScraper.addAllImages(images, collectedImages);
|
||||
|
||||
// expand the hyperlinks:
|
||||
// we add artificial hyperlinks to the hyperlink set
|
||||
// that can be calculated from given hyperlinks and imagelinks
|
||||
|
||||
hyperlinks.putAll(plasmaParser.allReflinks(images));
|
||||
hyperlinks.putAll(plasmaParser.allReflinks(images.values()));
|
||||
hyperlinks.putAll(plasmaParser.allReflinks(audiolinks.keySet()));
|
||||
hyperlinks.putAll(plasmaParser.allReflinks(videolinks.keySet()));
|
||||
hyperlinks.putAll(plasmaParser.allReflinks(applinks.keySet()));
|
||||
hyperlinks.putAll(plasmaParser.allSubpaths(hyperlinks.keySet()));
|
||||
hyperlinks.putAll(plasmaParser.allSubpaths(images));
|
||||
hyperlinks.putAll(plasmaParser.allSubpaths(images.values()));
|
||||
hyperlinks.putAll(plasmaParser.allSubpaths(audiolinks.keySet()));
|
||||
hyperlinks.putAll(plasmaParser.allSubpaths(videolinks.keySet()));
|
||||
hyperlinks.putAll(plasmaParser.allSubpaths(applinks.keySet()));
|
||||
|
@ -417,7 +413,7 @@ dc_rights
|
|||
serverFileUtils.copy(doc.getText(), (serverCachedFileOutputStream)this.text);
|
||||
|
||||
anchors.putAll(doc.getAnchors());
|
||||
images.addAll(doc.getImages());
|
||||
htmlFilterContentScraper.addAllImages(images, doc.getImages());
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -43,9 +43,10 @@ package de.anomic.plasma;
|
|||
|
||||
import java.io.InputStream;
|
||||
import java.net.MalformedURLException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import de.anomic.htmlFilter.htmlFilterContentScraper;
|
||||
import de.anomic.htmlFilter.htmlFilterImageEntry;
|
||||
import de.anomic.plasma.parser.ParserException;
|
||||
import de.anomic.server.serverDate;
|
||||
|
@ -53,11 +54,11 @@ import de.anomic.yacy.yacyURL;
|
|||
|
||||
public final class plasmaSearchImages {
|
||||
|
||||
private TreeSet<htmlFilterImageEntry> images;
|
||||
private HashMap<String, htmlFilterImageEntry> images;
|
||||
|
||||
public plasmaSearchImages(long maxTime, yacyURL url, int depth) {
|
||||
long start = System.currentTimeMillis();
|
||||
this.images = new TreeSet<htmlFilterImageEntry>();
|
||||
this.images = new HashMap<String, htmlFilterImageEntry>();
|
||||
if (maxTime > 10) {
|
||||
Object[] resource = plasmaSnippetCache.getResource(url, true, (int) maxTime, false);
|
||||
InputStream res = (InputStream) resource[0];
|
||||
|
@ -75,7 +76,7 @@ public final class plasmaSearchImages {
|
|||
if (document == null) return;
|
||||
|
||||
// add the image links
|
||||
this.addAll(document.getImages());
|
||||
htmlFilterContentScraper.addAllImages(this.images, document.getImages());
|
||||
|
||||
// add also links from pages one step deeper, if depth > 0
|
||||
if (depth > 0) {
|
||||
|
@ -97,26 +98,13 @@ public final class plasmaSearchImages {
|
|||
|
||||
public void addAll(plasmaSearchImages m) {
|
||||
synchronized (m.images) {
|
||||
addAll(m.images);
|
||||
}
|
||||
}
|
||||
|
||||
private void addAll(TreeSet<htmlFilterImageEntry> ts) {
|
||||
Iterator<htmlFilterImageEntry> i = ts.iterator();
|
||||
htmlFilterImageEntry ie;
|
||||
while (i.hasNext()) {
|
||||
ie = i.next();
|
||||
if (images.contains(ie)) {
|
||||
if ((ie.height() > 0) && (ie.width() > 0)) images.add(ie);
|
||||
} else {
|
||||
images.add(ie);
|
||||
}
|
||||
htmlFilterContentScraper.addAllImages(this.images, m.images);
|
||||
}
|
||||
}
|
||||
|
||||
public Iterator<htmlFilterImageEntry> entries() {
|
||||
// returns htmlFilterImageEntry - Objects
|
||||
return images.iterator();
|
||||
return images.values().iterator();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -697,7 +697,8 @@ public class plasmaSnippetCache {
|
|||
|
||||
public static ArrayList<MediaSnippet> computeImageSnippets(plasmaParserDocument document, Set<String> queryhashes) {
|
||||
|
||||
TreeSet<htmlFilterImageEntry> images = document.getImages(); // iterates images in descending size order!
|
||||
TreeSet<htmlFilterImageEntry> images = new TreeSet<htmlFilterImageEntry>();
|
||||
images.addAll(document.getImages().values()); // iterates images in descending size order!
|
||||
// a measurement for the size of the images can be retrieved using the htmlFilterImageEntry.hashCode()
|
||||
|
||||
Iterator<htmlFilterImageEntry> i = images.iterator();
|
||||
|
|
Loading…
Reference in New Issue
Block a user