yacy_search_server/source/de/anomic/crawler/ResultImages.java
orbiter b6fb239e74 redesign of parser interface:
some file types are containers for several files. These containers had been parsed in such a way that the set of resulting parsed content was merged into one single document before parsing. Using this parser infrastructure it is not possible to parse document containers that contain individual files. An example is a rss file where the rss messages can be treated as individual documents with their own url reference. Another example is a surrogate file which was treated with a special operation outside of the parser infrastructure.
This commit introduces a redesigned parser interface and a new abstract parser implementation. The new parser interface now has only one entry point and always returns a set of parsed documents. For a single document, the parser method returns a set containing one document.
To be compliant with the new interface, the zip and tar parsers have also been completely redesigned. All parsers are now much simpler and cleaner in their structure. The switchboard operations have been extended to operate on sets of parsed files, not single parsed files.
additionally, parsing of jar manifest files had been added.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6955 6c8d7289-2bf4-0310-a012-ef5d649a1542
2010-06-29 19:20:45 +00:00

155 lines
6.3 KiB
Java
Executable File

// plasmaCrawlResultImages.java
// (C) 2008 by Detlef Reichl; detlef!reichl()gmx!org and Michael Peter Christen; mc@yacy.net
// first published 13.04.2008 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.crawler;
import java.util.HashMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.Document;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI;
/**
 * Static registry of images found in crawled documents.
 *
 * <p>Images are sorted into four queues (private/public crawl x high/low quality) at
 * registration time and handed out one at a time through {@link #next(boolean)}.
 * All state is held in static concurrent collections, so every caller in the process
 * shares the same queues.
 */
public class ResultImages {

    // we maintain two different queues for private and public crawls and divide both into two halves:
    // such images that appear to be good quality for an image monitor because their size is known, and other images
    // that are not declared with sizes.
    private static final ConcurrentLinkedQueue<OriginEntry> privateImageQueueHigh = new ConcurrentLinkedQueue<OriginEntry>();
    private static final ConcurrentLinkedQueue<OriginEntry> privateImageQueueLow = new ConcurrentLinkedQueue<OriginEntry>();
    private static final ConcurrentLinkedQueue<OriginEntry> publicImageQueueHigh = new ConcurrentLinkedQueue<OriginEntry>();
    private static final ConcurrentLinkedQueue<OriginEntry> publicImageQueueLow = new ConcurrentLinkedQueue<OriginEntry>();

    // we also check all links for a double-check so we don't get the same image more than once in any queue
    // image links may appear double here even if the pages where the image links are embedded already are checked for double-occurrence:
    // the same images may be linked from different pages
    private static final ConcurrentHashMap<MultiProtocolURI, Long> doubleCheck = new ConcurrentHashMap<MultiProtocolURI, Long>(); // (url, time) when the url appeared first

    /**
     * Registers every image of {@code document} that has not been seen before and
     * appends it to the matching queue.
     *
     * <p>Does nothing when {@code source} or {@code document} is {@code null}.
     *
     * @param source       URL stored as {@link OriginEntry#baseURL} for each registered image
     * @param document     the document whose images are registered
     * @param privateEntry {@code true} to use the private queues, {@code false} for the public ones
     */
    public static void registerImages(final DigestURI source, final Document document, final boolean privateEntry) {
        if (document == null) return;
        if (source == null) return;

        final HashMap<MultiProtocolURI, ImageEntry> images = document.getImages();
        // NOTE(review): getImages() is not visible from here; guard in case it can return null
        if (images == null) return;

        for (final ImageEntry image : images.values()) {
            final MultiProtocolURI url = image.url();

            // do a double-check; attention: this can be time-consuming since this possibly needs a DNS-lookup.
            // putIfAbsent tests and records the url in one atomic step, so two threads registering the
            // same image concurrently cannot both pass the check and enqueue it twice.
            if (doubleCheck.putIfAbsent(url, System.currentTimeMillis()) != null) continue;

            final String name = url.getFile();
            final ConcurrentLinkedQueue<OriginEntry> queue;
            if (isHighQuality(image, name)) {
                queue = privateEntry ? privateImageQueueHigh : publicImageQueueHigh;
            } else {
                queue = privateEntry ? privateImageQueueLow : publicImageQueueLow;
            }
            queue.add(new OriginEntry(image, source));
        }
    }

    /**
     * Decides whether an image belongs in a "high" queue: its width must lie strictly between
     * 120 and 1200, its height strictly between 100 and 1000, {@code name} must not contain
     * ".gif", and the longer side may be at most twice the shorter side.
     *
     * @param image the image whose declared dimensions are tested
     * @param name  the file part of the image url
     * @return {@code true} if all conditions above hold
     */
    private static boolean isHighQuality(final ImageEntry image, final String name) {
        if (image.width() <= 120 || image.height() <= 100) return false;
        if (image.width() >= 1200 || image.height() >= 1000) return false;
        if (name.lastIndexOf(".gif") != -1) return false;

        // longer side divided by shorter side; both are positive here, so the ratio is always >= 1
        final float w = (float) image.width();
        final float h = (float) image.height();
        final float ratio = Math.max(w, h) / Math.min(w, h);
        return ratio <= 2.0f;
    }

    /**
     * Removes and returns the next queued image.
     *
     * <p>With {@code privateEntryOnly} set, only the private queues are polled (high before low).
     * Otherwise the order is: public high, private high, public low, private low.
     *
     * @param privateEntryOnly {@code true} to restrict the lookup to the private queues
     * @return the next entry, or {@code null} if all consulted queues are empty
     */
    public static OriginEntry next(final boolean privateEntryOnly) {
        OriginEntry e = null;
        if (privateEntryOnly) {
            e = privateImageQueueHigh.poll();
            if (e == null) e = privateImageQueueLow.poll();
        } else {
            e = publicImageQueueHigh.poll();
            if (e == null) e = privateImageQueueHigh.poll();
            if (e == null) e = publicImageQueueLow.poll();
            if (e == null) e = privateImageQueueLow.poll();
        }
        return e;
    }

    /**
     * Counts the queued images.
     *
     * <p>Note that {@code ConcurrentLinkedQueue.size()} traverses the queue, so this is not a
     * constant-time operation.
     *
     * @param privateEntryOnly {@code true} to count only the private queues,
     *                         {@code false} to count private and public queues together
     * @return the summed size of the selected queues
     */
    public static int queueSize(final boolean privateEntryOnly) {
        int publicSize = 0;
        if (!privateEntryOnly) {
            publicSize = publicImageQueueHigh.size() + publicImageQueueLow.size();
        }
        return privateImageQueueHigh.size() + privateImageQueueLow.size() + publicSize;
    }

    /** @return the number of entries in the private high-quality queue */
    public static int privateQueueHighSize() {
        return privateImageQueueHigh.size();
    }

    /** @return the number of entries in the private low-quality queue */
    public static int privateQueueLowSize() {
        return privateImageQueueLow.size();
    }

    /** @return the number of entries in the public high-quality queue */
    public static int publicQueueHighSize() {
        return publicImageQueueHigh.size();
    }

    /** @return the number of entries in the public low-quality queue */
    public static int publicQueueLowSize() {
        return publicImageQueueLow.size();
    }

    /**
     * Empties all four queues and the duplicate-check map, so that previously seen
     * image urls can be registered again.
     */
    public static void clearQueues() {
        privateImageQueueHigh.clear();
        privateImageQueueLow.clear();
        publicImageQueueHigh.clear();
        publicImageQueueLow.clear();
        doubleCheck.clear();
    }

    /**
     * A queued image together with the source url that was passed to
     * {@link ResultImages#registerImages(DigestURI, Document, boolean)} when it was registered.
     */
    public static class OriginEntry {
        public ImageEntry imageEntry;
        public MultiProtocolURI baseURL;

        /**
         * @param imageEntry the registered image
         * @param baseURL    the source url the image was registered with
         */
        public OriginEntry(final ImageEntry imageEntry, final MultiProtocolURI baseURL) {
            this.imageEntry = imageEntry;
            this.baseURL = baseURL;
        }
    }
}