2009-08-27 16:34:41 +02:00
// MediaSnippet.java
// -----------------
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2005
//
2009-09-05 22:41:21 +02:00
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
2009-08-27 16:34:41 +02:00
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
2011-09-25 18:59:06 +02:00
package net.yacy.search.snippet ;
2009-08-27 16:34:41 +02:00
2010-06-22 14:28:53 +02:00
import java.io.IOException ;
2009-08-27 16:34:41 +02:00
import java.util.ArrayList ;
2009-11-20 04:30:48 +01:00
import java.util.Comparator ;
2012-01-25 12:48:48 +01:00
import java.util.Date ;
2009-08-27 16:34:41 +02:00
import java.util.Iterator ;
2010-11-28 03:57:31 +01:00
import java.util.List ;
2009-08-27 16:34:41 +02:00
import java.util.Map ;
2010-11-28 03:57:31 +01:00
import java.util.SortedMap ;
import java.util.SortedSet ;
2009-08-27 16:34:41 +02:00
import java.util.TreeSet ;
2011-05-27 10:24:54 +02:00
import net.yacy.cora.document.ASCII ;
2012-04-21 21:31:13 +02:00
import net.yacy.cora.document.Classification ;
2012-04-22 00:04:36 +02:00
import net.yacy.cora.document.Classification.ContentDomain ;
2010-05-25 14:54:57 +02:00
import net.yacy.cora.document.MultiProtocolURI ;
2011-06-13 23:44:03 +02:00
import net.yacy.cora.services.federated.yacy.CacheStrategy ;
2012-05-21 17:52:30 +02:00
import net.yacy.cora.util.NumberTools ;
2009-10-18 02:53:43 +02:00
import net.yacy.document.Document ;
2010-06-29 21:20:45 +02:00
import net.yacy.document.Parser ;
2011-02-12 01:01:40 +01:00
import net.yacy.document.WordTokenizer ;
2009-10-18 02:53:43 +02:00
import net.yacy.document.parser.html.ImageEntry ;
2009-10-11 02:12:19 +02:00
import net.yacy.kelondro.data.meta.DigestURI ;
2010-04-15 15:22:59 +02:00
import net.yacy.kelondro.index.HandleSet ;
2010-09-22 22:50:02 +02:00
import net.yacy.kelondro.index.RowSpaceExceededException ;
2009-10-10 01:13:30 +02:00
import net.yacy.kelondro.logging.Log ;
2009-11-19 14:49:28 +01:00
import net.yacy.kelondro.order.Base64Order ;
2010-04-15 16:19:29 +02:00
import net.yacy.kelondro.util.ByteArray ;
2012-06-11 00:17:30 +02:00
import net.yacy.repository.Blacklist.BlacklistType ;
2011-09-25 18:59:06 +02:00
import net.yacy.search.Switchboard ;
2011-12-28 20:09:17 +01:00
import de.anomic.crawler.ZURL.FailCategory ;
2012-01-25 12:48:48 +01:00
import de.anomic.crawler.retrieval.Request ;
2009-10-10 01:13:30 +02:00
2009-08-27 16:34:41 +02:00
2009-11-20 04:30:48 +01:00
public class MediaSnippet implements Comparable < MediaSnippet > , Comparator < MediaSnippet > {
2009-11-19 00:56:05 +01:00
public ContentDomain type ;
2009-10-11 02:12:19 +02:00
public DigestURI href , source ;
2009-11-23 17:10:50 +01:00
public String name , attr , mime ;
2010-09-09 17:30:25 +02:00
public long ranking ;
2009-11-23 17:10:50 +01:00
public int width , height ;
public long fileSize ;
2009-09-05 22:41:21 +02:00
2010-09-09 17:30:25 +02:00
/**
 * Creates a media snippet whose dimensions are taken from an attribute string.
 *
 * If attr contains the separator " x " at an index greater than zero, the text
 * before the separator is parsed as the width and the text after it as the height.
 * In every other case width and height are set to -1.
 * A null or empty name or attr is replaced by a placeholder string.
 *
 * @param type     media category of the resource
 * @param href     address of the media resource
 * @param mime     mime type of the resource
 * @param name     descriptive text, may be null or empty
 * @param fileSize size value to store for the resource
 * @param attr     attribute text, optionally of the form "&lt;width&gt; x &lt;height&gt;"; may be null
 * @param ranking  ranking value; the smaller the better
 * @param source   the web page where the media resource appeared
 */
public MediaSnippet(final ContentDomain type, final DigestURI href, final String mime, final String name, final long fileSize, final String attr, final long ranking, final DigestURI source) {
    this.type = type;
    this.href = href;
    this.source = source; // the web page where the media resource appeared
    this.mime = mime;
    this.fileSize = fileSize;
    this.ranking = ranking; // the smaller the better! small values should be shown first

    // dimensions stay at -1 unless attr carries a " x " separator behind at least one character
    int parsedWidth = -1;
    int parsedHeight = -1;
    final int separator = (attr == null) ? -1 : attr.indexOf(" x ", 0);
    if (separator > 0) {
        parsedWidth = NumberTools.parseIntDecSubstring(attr, 0, separator);
        // skip the three characters of the separator
        parsedHeight = NumberTools.parseIntDecSubstring(attr, separator + 3);
    }
    this.width = parsedWidth;
    this.height = parsedHeight;

    // never expose null or empty texts
    this.name = (name == null || name.length() == 0) ? " _ " : name;
    this.attr = (attr == null || attr.length() == 0) ? " _ " : attr;
}
2011-06-13 23:44:03 +02:00
2010-09-09 17:30:25 +02:00
/**
 * Creates a media snippet from explicit numeric dimensions.
 *
 * The attribute text is derived from the dimensions as width + " x " + height,
 * so it can never be null or empty. A null or empty name is replaced by a
 * placeholder string.
 *
 * @param type     media category of the resource
 * @param href     address of the media resource
 * @param mime     mime type of the resource
 * @param name     descriptive text, may be null or empty
 * @param fileSize size value to store for the resource
 * @param width    width to store
 * @param height   height to store
 * @param ranking  ranking value; the smaller the better
 * @param source   the web page where the media resource appeared
 */
public MediaSnippet(final ContentDomain type, final DigestURI href, final String mime, final String name, final long fileSize, final int width, final int height, final long ranking, final DigestURI source) {
    this.type = type;
    this.href = href;
    this.source = source; // the web page where the media resource appeared
    this.mime = mime;
    this.fileSize = fileSize;
    this.ranking = ranking; // the smaller the better! small values should be shown first

    this.width = width;
    this.height = height;
    // always contains at least the separator, therefore no empty-check is needed
    this.attr = width + " x " + height;

    this.name = (name == null || name.length() == 0) ? " _ " : name;
}
2011-06-13 23:44:03 +02:00
2012-05-30 16:59:13 +02:00
private int hashCache = Integer . MIN_VALUE ; // if this is used in a compare method many times, a cache is useful
2009-09-05 22:41:21 +02:00
@Override
2009-08-27 16:34:41 +02:00
public int hashCode ( ) {
2012-05-30 16:59:13 +02:00
if ( this . hashCache = = Integer . MIN_VALUE ) {
this . hashCache = ByteArray . hashCode ( this . href . hash ( ) ) ;
}
return this . hashCache ;
2009-08-27 16:34:41 +02:00
}
2011-06-13 23:44:03 +02:00
2010-11-28 03:57:31 +01:00
@Override
2009-11-20 13:11:56 +01:00
public String toString ( ) {
2011-06-13 23:44:03 +02:00
return ASCII . String ( this . href . hash ( ) ) ;
2009-11-20 13:11:56 +01:00
}
2011-06-13 23:44:03 +02:00
2010-11-28 03:57:31 +01:00
@Override
2009-11-20 13:11:56 +01:00
public boolean equals ( final Object obj ) {
if ( this = = obj ) return true ;
if ( obj = = null ) return false ;
if ( ! ( obj instanceof MediaSnippet ) ) return false ;
2011-06-13 23:44:03 +02:00
final MediaSnippet other = ( MediaSnippet ) obj ;
2010-04-08 02:11:32 +02:00
return Base64Order . enhancedCoder . equal ( this . href . hash ( ) , other . href . hash ( ) ) ;
2009-11-19 14:49:28 +01:00
}
2011-06-13 23:44:03 +02:00
2012-01-25 12:48:48 +01:00
@Override
2011-06-13 23:44:03 +02:00
public int compareTo ( final MediaSnippet o ) {
2010-04-08 02:11:32 +02:00
return Base64Order . enhancedCoder . compare ( this . href . hash ( ) , o . href . hash ( ) ) ;
2009-11-19 14:49:28 +01:00
}
2011-06-13 23:44:03 +02:00
2012-01-25 12:48:48 +01:00
@Override
2011-06-13 23:44:03 +02:00
public int compare ( final MediaSnippet o1 , final MediaSnippet o2 ) {
2009-11-20 04:30:48 +01:00
return o1 . compareTo ( o2 ) ;
}
2011-06-13 23:44:03 +02:00
2012-07-05 10:44:30 +02:00
public static List < MediaSnippet > retrieveMediaSnippets ( final DigestURI url , final HandleSet queryhashes , final Classification . ContentDomain mediatype , final CacheStrategy cacheStrategy , final boolean reindexing ) {
2009-12-02 01:37:59 +01:00
if ( queryhashes . isEmpty ( ) ) {
2009-08-27 16:34:41 +02:00
Log . logFine ( " snippet fetch " , " no query hashes given for url " + url ) ;
return new ArrayList < MediaSnippet > ( ) ;
}
2011-06-13 23:44:03 +02:00
2010-06-22 14:28:53 +02:00
Document document ;
try {
2012-07-05 10:44:30 +02:00
document = Document . mergeDocuments ( url , null , Switchboard . getSwitchboard ( ) . loader . loadDocuments ( Switchboard . getSwitchboard ( ) . loader . request ( url , false , reindexing ) , cacheStrategy , Integer . MAX_VALUE , BlacklistType . SEARCH ) ) ;
2011-06-13 23:44:03 +02:00
} catch ( final IOException e ) {
2010-06-22 14:28:53 +02:00
Log . logFine ( " snippet fetch " , " load error: " + e . getMessage ( ) ) ;
return new ArrayList < MediaSnippet > ( ) ;
2011-06-13 23:44:03 +02:00
} catch ( final Parser . Failure e ) {
2010-06-22 14:28:53 +02:00
Log . logFine ( " snippet fetch " , " parser error: " + e . getMessage ( ) ) ;
return new ArrayList < MediaSnippet > ( ) ;
}
2009-08-27 16:34:41 +02:00
final ArrayList < MediaSnippet > a = new ArrayList < MediaSnippet > ( ) ;
if ( document ! = null ) {
2010-06-29 21:20:45 +02:00
if ( ( mediatype = = ContentDomain . ALL ) | | ( mediatype = = ContentDomain . AUDIO ) ) a . addAll ( computeMediaSnippets ( url , document , queryhashes , ContentDomain . AUDIO ) ) ;
if ( ( mediatype = = ContentDomain . ALL ) | | ( mediatype = = ContentDomain . VIDEO ) ) a . addAll ( computeMediaSnippets ( url , document , queryhashes , ContentDomain . VIDEO ) ) ;
if ( ( mediatype = = ContentDomain . ALL ) | | ( mediatype = = ContentDomain . APP ) ) a . addAll ( computeMediaSnippets ( url , document , queryhashes , ContentDomain . APP ) ) ;
if ( ( mediatype = = ContentDomain . ALL ) | | ( mediatype = = ContentDomain . IMAGE ) ) a . addAll ( computeImageSnippets ( url , document , queryhashes ) ) ;
2009-08-27 16:34:41 +02:00
}
return a ;
}
2011-06-13 23:44:03 +02:00
2010-11-28 03:57:31 +01:00
public static List < MediaSnippet > computeMediaSnippets ( final DigestURI source , final Document document , final HandleSet queryhashes , final ContentDomain mediatype ) {
2011-06-13 23:44:03 +02:00
2009-08-27 16:34:41 +02:00
if ( document = = null ) return new ArrayList < MediaSnippet > ( ) ;
2010-05-25 14:54:57 +02:00
Map < MultiProtocolURI , String > media = null ;
2009-11-19 00:56:05 +01:00
if ( mediatype = = ContentDomain . AUDIO ) media = document . getAudiolinks ( ) ;
else if ( mediatype = = ContentDomain . VIDEO ) media = document . getVideolinks ( ) ;
else if ( mediatype = = ContentDomain . APP ) media = document . getApplinks ( ) ;
2009-08-27 16:34:41 +02:00
if ( media = = null ) return null ;
2011-06-13 23:44:03 +02:00
2010-05-25 14:54:57 +02:00
final Iterator < Map . Entry < MultiProtocolURI , String > > i = media . entrySet ( ) . iterator ( ) ;
Map . Entry < MultiProtocolURI , String > entry ;
2009-10-11 02:12:19 +02:00
DigestURI url ;
2009-08-27 16:34:41 +02:00
String desc ;
2010-11-28 03:57:31 +01:00
final List < MediaSnippet > result = new ArrayList < MediaSnippet > ( ) ;
2009-08-27 16:34:41 +02:00
while ( i . hasNext ( ) ) {
entry = i . next ( ) ;
2010-05-25 14:54:57 +02:00
url = new DigestURI ( entry . getKey ( ) ) ;
2012-06-11 00:17:30 +02:00
desc = entry . getValue ( ) ;
if ( isUrlBlacklisted ( BlacklistType . SEARCH , url ) ) continue ;
2011-06-13 23:44:03 +02:00
final int ranking = removeAppearanceHashes ( url . toNormalform ( false , false ) , queryhashes ) . size ( ) +
2010-09-22 22:50:02 +02:00
removeAppearanceHashes ( desc , queryhashes ) . size ( ) ;
2009-11-20 14:19:12 +01:00
if ( ranking < 2 * queryhashes . size ( ) ) {
2011-09-01 18:05:00 +02:00
result . add ( new MediaSnippet ( mediatype , url , Classification . url2mime ( url ) , desc , document . getTextLength ( ) , null , ranking , source ) ) ;
2009-08-27 16:34:41 +02:00
}
}
return result ;
}
2011-06-13 23:44:03 +02:00
2010-11-28 03:57:31 +01:00
public static List < MediaSnippet > computeImageSnippets ( final DigestURI source , final Document document , final HandleSet queryhashes ) {
2011-06-13 23:44:03 +02:00
2010-11-28 03:57:31 +01:00
final SortedSet < ImageEntry > images = new TreeSet < ImageEntry > ( ) ;
2009-08-27 16:34:41 +02:00
images . addAll ( document . getImages ( ) . values ( ) ) ; // iterates images in descending size order!
// a measurement for the size of the images can be retrieved using the htmlFilterImageEntry.hashCode()
2011-06-13 23:44:03 +02:00
2009-08-27 16:34:41 +02:00
final Iterator < ImageEntry > i = images . iterator ( ) ;
ImageEntry ientry ;
2009-10-11 02:12:19 +02:00
DigestURI url ;
2009-08-27 16:34:41 +02:00
String desc ;
2010-11-28 03:57:31 +01:00
final List < MediaSnippet > result = new ArrayList < MediaSnippet > ( ) ;
2009-08-27 16:34:41 +02:00
while ( i . hasNext ( ) ) {
ientry = i . next ( ) ;
2010-05-25 14:54:57 +02:00
url = new DigestURI ( ientry . url ( ) ) ;
2012-06-11 00:17:30 +02:00
final String u = url . toString ( ) ;
if ( isUrlBlacklisted ( BlacklistType . SEARCH , url ) ) continue ;
2011-11-25 12:23:52 +01:00
if ( u . indexOf ( " .ico " , 0 ) > = 0 | | u . indexOf ( " favicon " , 0 ) > = 0 ) continue ;
2011-05-07 09:37:46 +02:00
if ( ientry . height ( ) > 0 & & ientry . height ( ) < 32 ) continue ;
if ( ientry . width ( ) > 0 & & ientry . width ( ) < 32 ) continue ;
2009-08-27 16:34:41 +02:00
desc = ientry . alt ( ) ;
2011-06-13 23:44:03 +02:00
final int appcount = queryhashes . size ( ) * 2 -
2010-09-22 22:50:02 +02:00
removeAppearanceHashes ( url . toNormalform ( false , false ) , queryhashes ) . size ( ) -
removeAppearanceHashes ( desc , queryhashes ) . size ( ) ;
2011-06-13 23:44:03 +02:00
final long ranking = Long . MAX_VALUE - ( ientry . height ( ) + 1 ) * ( ientry . width ( ) + 1 ) * ( appcount + 1 ) ;
2011-09-01 18:05:00 +02:00
result . add ( new MediaSnippet ( ContentDomain . IMAGE , url , Classification . url2mime ( url ) , desc , ientry . fileSize ( ) , ientry . width ( ) , ientry . height ( ) , ranking , source ) ) ;
2009-08-27 16:34:41 +02:00
}
return result ;
}
2011-06-13 23:44:03 +02:00
2010-09-22 22:50:02 +02:00
/ * *
* removed all word hashes that can be computed as tokens from a given sentence from a given hash set
* @param sentence
* @param queryhashes
* @return the given hash set minus the hashes from the tokenization of the given sentence
* /
private static HandleSet removeAppearanceHashes ( final String sentence , final HandleSet queryhashes ) {
// remove all hashes that appear in the sentence
if ( sentence = = null ) return queryhashes ;
2012-01-25 12:48:48 +01:00
final SortedMap < byte [ ] , Integer > hs = WordTokenizer . hashSentence ( sentence , null , 100 ) ;
2010-09-22 22:50:02 +02:00
final Iterator < byte [ ] > j = queryhashes . iterator ( ) ;
byte [ ] hash ;
Integer pos ;
final HandleSet remaininghashes = new HandleSet ( queryhashes . row ( ) . primaryKeyLength , queryhashes . comparator ( ) , queryhashes . size ( ) ) ;
while ( j . hasNext ( ) ) {
hash = j . next ( ) ;
pos = hs . get ( hash ) ;
if ( pos = = null ) {
try {
remaininghashes . put ( hash ) ;
2011-06-13 23:44:03 +02:00
} catch ( final RowSpaceExceededException e ) {
2010-09-22 22:50:02 +02:00
Log . logException ( e ) ;
}
2009-08-27 16:34:41 +02:00
}
}
2010-09-22 22:50:02 +02:00
return remaininghashes ;
2009-08-27 16:34:41 +02:00
}
2011-06-13 23:44:03 +02:00
2011-12-28 20:09:17 +01:00
/ * *
* Checks wether given URL is in blacklist for given blacklist type
*
* @param url The URL to check
* @param blacklistType Type of blacklist ( see class Blacklist , BLACKLIST_FOO )
* @return isBlacklisted Wether the given URL is blacklisted
2012-06-11 00:17:30 +02:00
* /
private static boolean isUrlBlacklisted ( final BlacklistType blacklistType , final DigestURI url ) {
2011-12-28 20:09:17 +01:00
// Default is not blacklisted
boolean isBlacklisted = false ;
// check if url is in blacklist
if ( Switchboard . urlBlacklist . isListed ( blacklistType , url . getHost ( ) . toLowerCase ( ) , url . getFile ( ) ) ) {
Switchboard . getSwitchboard ( ) . crawlQueues . errorURL . push ( new Request ( url , null ) , Switchboard . getSwitchboard ( ) . peers . mySeed ( ) . hash . getBytes ( ) , new Date ( ) , 1 , FailCategory . FINAL_LOAD_CONTEXT , " url in blacklist " , - 1 ) ;
Log . logFine ( " snippet fetch " , " MEDIA-SNIPPET Rejecting URL ' " + url . toString ( ) + " '. URL is in blacklist. " ) ;
isBlacklisted = true ;
}
// Return result
return isBlacklisted ;
}
2009-08-27 16:34:41 +02:00
}
2011-12-28 20:09:17 +01:00