yacy_search_server/source/net/yacy/search/snippet/MediaSnippet.java
Michael Peter Christen 788288eb9e added the generation of 50 (!!) new solr field in the core 'webgraph'.
The default schema uses only some of them and the resting search index
has now the following properties:
- webgraph size will have about 40 times as much entries as default
index
- the complete index size will increase and may be about the double size
of current amount
As testing showed, not much indexing performance is lost. The default
index will be smaller (moved fields out of it); thus searching
can be faster.
The new index will cause that some old parts in YaCy can be removed,
i.e. specialized webgraph data and the noload crawler. The new index
will make it possible to:
- search within link texts of linked but not indexed documents (about 20
times of document index in size!!)
- get a very detailed link graph
- enhance ranking using a complete link graph

To get the full access to the new index, the API to solr has now two
access points: one with attribute core=collection1 for the default
search index and core=webgraph to the new webgraph search index. This is
also avaiable for p2p operation but client access is not yet
implemented.
2013-02-22 15:45:15 +01:00

272 lines
12 KiB
Java

// MediaSnippet.java
// -----------------
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2005
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.search.snippet;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeSet;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.NumberTools;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.data.ZURL.FailCategory;
import net.yacy.crawler.retrieval.Request;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.WordTokenizer;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.index.RowHandleSet;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.ByteArray;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard;
public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaSnippet> {
public ContentDomain type;
public DigestURI href, source;
public String name, attr, mime;
public long ranking;
public int width, height;
public long fileSize;
public MediaSnippet(final ContentDomain type, final DigestURI href, final String mime, final String name, final long fileSize, final String attr, final long ranking, final DigestURI source) {
this.type = type;
this.href = href;
this.mime = mime;
this.fileSize = fileSize;
this.source = source; // the web page where the media resource appeared
this.name = name;
this.attr = attr;
this.width = -1;
this.height = -1;
int p = 0;
if (attr != null && (p = attr.indexOf(" x ",0)) > 0) {
this.width = NumberTools.parseIntDecSubstring(attr, 0, p);
this.height = NumberTools.parseIntDecSubstring(attr, p + 3);
}
this.ranking = ranking; // the smaller the better! small values should be shown first
if ((this.name == null) || (this.name.isEmpty())) this.name = "_";
if ((this.attr == null) || (this.attr.isEmpty())) this.attr = "_";
}
public MediaSnippet(final ContentDomain type, final DigestURI href, final String mime, final String name, final long fileSize, final int width, final int height, final long ranking, final DigestURI source) {
this.type = type;
this.href = href;
this.mime = mime;
this.fileSize = fileSize;
this.source = source; // the web page where the media resource appeared
this.name = name;
this.attr = width + " x " + height;
this.width = width;
this.height = height;
this.ranking = ranking; // the smaller the better! small values should be shown first
if ((this.name == null) || (this.name.isEmpty())) this.name = "_";
if ((this.attr == null) || (this.attr.isEmpty())) this.attr = "_";
}
private int hashCache = Integer.MIN_VALUE; // if this is used in a compare method many times, a cache is useful
@Override
public int hashCode() {
if (this.hashCache == Integer.MIN_VALUE) {
this.hashCache = ByteArray.hashCode(this.href.hash());
}
return this.hashCache;
}
@Override
public String toString() {
return ASCII.String(this.href.hash());
}
@Override
public boolean equals(final Object obj) {
if (this == obj) return true;
if (obj == null) return false;
if (!(obj instanceof MediaSnippet)) return false;
final MediaSnippet other = (MediaSnippet) obj;
return Base64Order.enhancedCoder.equal(this.href.hash(), other.href.hash());
}
@Override
public int compareTo(final MediaSnippet o) {
return Base64Order.enhancedCoder.compare(this.href.hash(), o.href.hash());
}
@Override
public int compare(final MediaSnippet o1, final MediaSnippet o2) {
return o1.compareTo(o2);
}
public static List<MediaSnippet> retrieveMediaSnippets(final DigestURI url, final HandleSet queryhashes, final Classification.ContentDomain mediatype, final CacheStrategy cacheStrategy, final boolean reindexing) {
if (queryhashes.isEmpty()) {
Log.logFine("snippet fetch", "no query hashes given for url " + url);
return new ArrayList<MediaSnippet>();
}
Document document;
try {
document = Document.mergeDocuments(url, null, Switchboard.getSwitchboard().loader.loadDocuments(Switchboard.getSwitchboard().loader.request(url, false, reindexing), cacheStrategy, Integer.MAX_VALUE, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay));
} catch (final IOException e) {
Log.logFine("snippet fetch", "load error: " + e.getMessage());
return new ArrayList<MediaSnippet>();
} catch (final Parser.Failure e) {
Log.logFine("snippet fetch", "parser error: " + e.getMessage());
return new ArrayList<MediaSnippet>();
}
final ArrayList<MediaSnippet> a = new ArrayList<MediaSnippet>();
if (document != null) {
if ((mediatype == ContentDomain.ALL) || (mediatype == ContentDomain.AUDIO)) a.addAll(computeMediaSnippets(url, document, queryhashes, ContentDomain.AUDIO));
if ((mediatype == ContentDomain.ALL) || (mediatype == ContentDomain.VIDEO)) a.addAll(computeMediaSnippets(url, document, queryhashes, ContentDomain.VIDEO));
if ((mediatype == ContentDomain.ALL) || (mediatype == ContentDomain.APP)) a.addAll(computeMediaSnippets(url, document, queryhashes, ContentDomain.APP));
if ((mediatype == ContentDomain.ALL) || (mediatype == ContentDomain.IMAGE)) a.addAll(computeImageSnippets(url, document, queryhashes));
}
return a;
}
public static List<MediaSnippet> computeMediaSnippets(final DigestURI source, final Document document, final HandleSet queryhashes, final ContentDomain mediatype) {
if (document == null) return new ArrayList<MediaSnippet>();
Map<DigestURI, String> media = null;
if (mediatype == ContentDomain.AUDIO) media = document.getAudiolinks();
else if (mediatype == ContentDomain.VIDEO) media = document.getVideolinks();
else if (mediatype == ContentDomain.APP) media = document.getApplinks();
if (media == null) return null;
final Iterator<Map.Entry<DigestURI, String>> i = media.entrySet().iterator();
Map.Entry<DigestURI, String> entry;
DigestURI url;
String desc;
final List<MediaSnippet> result = new ArrayList<MediaSnippet>();
while (i.hasNext()) {
entry = i.next();
url = entry.getKey();
desc = entry.getValue();
if (isUrlBlacklisted(BlacklistType.SEARCH, url)) continue;
final int ranking = removeAppearanceHashes(url.toNormalform(true), queryhashes).size() +
removeAppearanceHashes(desc, queryhashes).size();
if (ranking < 2 * queryhashes.size()) {
result.add(new MediaSnippet(mediatype, url, Classification.url2mime(url), desc, document.getTextLength(), null, ranking, source));
}
}
return result;
}
public static List<MediaSnippet> computeImageSnippets(final DigestURI source, final Document document, final HandleSet queryhashes) {
final SortedSet<ImageEntry> images = new TreeSet<ImageEntry>();
images.addAll(document.getImages().values()); // iterates images in descending size order!
// a measurement for the size of the images can be retrieved using the htmlFilterImageEntry.hashCode()
final Iterator<ImageEntry> i = images.iterator();
ImageEntry ientry;
DigestURI url;
String desc;
final List<MediaSnippet> result = new ArrayList<MediaSnippet>();
while (i.hasNext()) {
ientry = i.next();
url = ientry.url();
final String u = url.toString();
if (isUrlBlacklisted(BlacklistType.SEARCH, url)) continue;
if (u.indexOf(".ico",0) >= 0 || u.indexOf("favicon",0) >= 0) continue;
if (ientry.height() > 0 && ientry.height() < 32) continue;
if (ientry.width() > 0 && ientry.width() < 32) continue;
desc = ientry.alt();
final int appcount = queryhashes.size() * 2 -
removeAppearanceHashes(url.toNormalform(true), queryhashes).size() -
removeAppearanceHashes(desc, queryhashes).size();
final long ranking = Long.MAX_VALUE - (ientry.height() + 1) * (ientry.width() + 1) * (appcount + 1);
result.add(new MediaSnippet(ContentDomain.IMAGE, url, Classification.url2mime(url), desc, ientry.fileSize(), ientry.width(), ientry.height(), ranking, source));
}
return result;
}
/**
* removed all word hashes that can be computed as tokens from a given sentence from a given hash set
* @param sentence
* @param queryhashes
* @return the given hash set minus the hashes from the tokenization of the given sentence
*/
private static HandleSet removeAppearanceHashes(final String sentence, final HandleSet queryhashes) {
// remove all hashes that appear in the sentence
if (sentence == null) return queryhashes;
final SortedMap<byte[], Integer> hs = WordTokenizer.hashSentence(sentence, null, 100);
final Iterator<byte[]> j = queryhashes.iterator();
byte[] hash;
Integer pos;
final HandleSet remaininghashes = new RowHandleSet(queryhashes.keylen(), queryhashes.comparator(), queryhashes.size());
while (j.hasNext()) {
hash = j.next();
pos = hs.get(hash);
if (pos == null) {
try {
remaininghashes.put(hash);
} catch (final SpaceExceededException e) {
Log.logException(e);
}
}
}
return remaininghashes;
}
/**
* Checks wether given URL is in blacklist for given blacklist type
*
* @param url The URL to check
* @param blacklistType Type of blacklist (see class Blacklist, BLACKLIST_FOO)
* @return isBlacklisted Wether the given URL is blacklisted
*/
private static boolean isUrlBlacklisted (final BlacklistType blacklistType, final DigestURI url) {
// Default is not blacklisted
boolean isBlacklisted = false;
// check if url is in blacklist
if (Switchboard.urlBlacklist.isListed(blacklistType, url.getHost().toLowerCase(), url.getFile())) {
Switchboard.getSwitchboard().crawlQueues.errorURL.push(new Request(url, null), ASCII.getBytes(Switchboard.getSwitchboard().peers.mySeed().hash), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
Log.logFine("snippet fetch", "MEDIA-SNIPPET Rejecting URL '" + url.toString() + "'. URL is in blacklist.");
isBlacklisted = true;
}
// Return result
return isBlacklisted;
}
}