2011-06-01 21:31:56 +02:00
//Document.java
2005-04-24 23:47:34 +02:00
//------------------------
//part of YaCy
2008-07-20 19:14:51 +02:00
//(C) by Michael Peter Christen; mc@yacy.net
2005-04-24 23:47:34 +02:00
//first published on http://www.anomic.de
//Frankfurt, Germany, 2005
//
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version.
//
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
2005-04-24 23:24:53 +02:00
2009-10-18 02:53:43 +02:00
package net.yacy.document ;
2005-04-24 23:24:53 +02:00
2006-09-30 11:31:53 +02:00
import java.io.BufferedInputStream ;
2005-06-02 03:33:10 +02:00
import java.io.ByteArrayInputStream ;
2010-01-19 15:59:58 +01:00
import java.io.ByteArrayOutputStream ;
2006-09-30 11:31:53 +02:00
import java.io.File ;
import java.io.FileInputStream ;
2007-05-19 01:13:44 +02:00
import java.io.IOException ;
2006-09-30 11:31:53 +02:00
import java.io.InputStream ;
2009-04-17 16:20:12 +02:00
import java.io.OutputStreamWriter ;
2010-05-11 13:14:05 +02:00
import java.io.UnsupportedEncodingException ;
2010-12-27 18:07:21 +01:00
import java.io.Writer ;
2009-07-10 00:25:31 +02:00
import java.net.MalformedURLException ;
2010-12-27 18:07:21 +01:00
import java.net.URL ;
2016-01-05 23:37:05 +01:00
import java.nio.charset.StandardCharsets ;
2010-09-22 22:50:02 +02:00
import java.util.ArrayList ;
2007-05-19 01:13:44 +02:00
import java.util.Arrays ;
2009-07-10 00:25:31 +02:00
import java.util.Collection ;
2008-03-26 20:51:05 +01:00
import java.util.Date ;
2005-04-24 23:24:53 +02:00
import java.util.HashMap ;
2009-07-10 00:25:31 +02:00
import java.util.HashSet ;
2005-04-24 23:24:53 +02:00
import java.util.Iterator ;
2013-09-15 00:30:23 +02:00
import java.util.LinkedHashMap ;
2012-08-31 10:30:43 +02:00
import java.util.LinkedHashSet ;
2007-05-19 01:13:44 +02:00
import java.util.LinkedList ;
import java.util.List ;
2005-04-24 23:24:53 +02:00
import java.util.Map ;
2008-09-20 00:19:11 +02:00
import java.util.Set ;
2013-11-10 18:50:34 +01:00
import java.util.regex.Matcher ;
import java.util.regex.Pattern ;
2005-04-24 23:24:53 +02:00
2011-01-03 21:52:54 +01:00
import net.yacy.cora.date.ISO8601Formatter ;
2012-11-21 18:46:49 +01:00
import net.yacy.cora.document.analysis.Classification ;
2013-10-23 00:16:54 +02:00
import net.yacy.cora.document.analysis.Classification.ContentDomain ;
2013-09-15 00:30:23 +02:00
import net.yacy.cora.document.encoding.UTF8 ;
import net.yacy.cora.document.id.AnchorURL ;
import net.yacy.cora.document.id.DigestURL ;
import net.yacy.cora.document.id.MultiProtocolURL ;
2012-06-11 23:49:30 +02:00
import net.yacy.cora.lod.vocabulary.Tagging ;
2013-08-20 15:46:04 +02:00
import net.yacy.cora.util.ByteBuffer ;
2013-07-09 14:28:25 +02:00
import net.yacy.cora.util.ConcurrentLog ;
2012-09-21 15:48:16 +02:00
import net.yacy.crawler.retrieval.Request ;
2009-10-18 02:53:43 +02:00
import net.yacy.document.parser.html.ContentScraper ;
import net.yacy.document.parser.html.ImageEntry ;
2009-10-10 03:14:19 +02:00
import net.yacy.kelondro.util.FileUtils ;
2006-09-30 00:27:20 +02:00
2009-07-08 23:48:08 +02:00
/**
 * A parsed document: the common, parser-independent representation of a resource
 * (web page, file, feed entry ...) that was fetched and parsed for indexing.
 * Carries the extracted text, metadata (Dublin Core style accessors below) and
 * all links/images found in the resource.
 */
public class Document {

    private DigestURL source;               // the source url
    private final String mimeType;          // mimeType as taken from http header
    private final String charset;           // the charset of the document
    private final Set<String> keywords;     // most resources provide a keyword field
    private List<String> titles;            // the document titles, taken from title and/or h1 tag; shall appear as headline of search result
    private final StringBuilder creator;    // author or copyright
    private final String publisher;         // publisher
    private final List<String> sections;    // if present: more titles/headlines appearing in the document
    private final List<String> descriptions; // an abstract, if present: short content description
    // NOTE(review): text is intentionally an Object — it may be a String, byte[],
    // File, InputStream or ByteArrayOutputStream; see getTextStream()/getTextString()
    private Object text;                    // the clear text, all that is visible
    private final Collection<AnchorURL> anchors; // all links embedded as clickeable entities (anchor tags)
    private final LinkedHashMap<DigestURL, String> rss; // all embedded rss feeds
    private final LinkedHashMap<DigestURL, ImageEntry> images; // all visible pictures in document

    // the anchors and images - Maps are URL-to-EntityDescription mappings.
    // The EntityDescription appear either as visible text in anchors or as alternative
    // text in image tags.
    // The following link maps are lazily computed by resortLinks() from the anchors above.
    private LinkedHashMap<AnchorURL, String> audiolinks, videolinks, applinks, hyperlinks; // TODO: check if redundant value (set to key.getNameProperty()) is needed
    private LinkedHashMap<DigestURL, String> inboundlinks, outboundlinks;
    private Set<AnchorURL> emaillinks;      // mailto: links
    private MultiProtocolURL favicon;
    private boolean resorted;               // true once resortLinks() has partitioned the anchors
    private final Set<String> languages;
    private boolean indexingDenied;
    private final double lon, lat;          // geolocation; 0.0/0.0 if unknown or out of range
    private final Parser parserObject;      // the source object that was used to create the Document
    private final Map<String, Set<String>> generic_facets; // a map from vocabulary names to the set of tags for that vocabulary which apply for this document
    private final Date lastModified;
    private int crawldepth;                 // 999 means "unknown yet"
2013-09-15 00:30:23 +02:00
/**
 * Create a parsed document.
 *
 * @param location the source url of the document
 * @param mimeType mime type as taken from the http header; defaults to application/octet-stream when null
 * @param charset the charset of the document (may be null if unknown)
 * @param parserObject the parser instance that produced this document
 * @param languages declared content languages (alpha-2 ISO 639 codes); may be null
 * @param keywords keyword list; may be null
 * @param titles document titles; may be null
 * @param author author or copyright holder; may be null
 * @param publisher publisher; may be null
 * @param sections additional titles/headlines in the document; may be null
 * @param abstrcts short content descriptions; may be null
 * @param lon longitude; ignored (set to 0.0) when the lat/lon pair is out of range
 * @param lat latitude; ignored (set to 0.0) when the lat/lon pair is out of range
 * @param text the clear text; may be a String, byte[], File, InputStream or ByteArrayOutputStream; null maps to ""
 * @param anchors all anchor links of the document; may be null
 * @param rss embedded rss feeds; may be null
 * @param images visible images of the document; may be null
 * @param indexingDenied true if indexing of this document is denied
 * @param lastModified last-modified date; null maps to "now"
 */
public Document(final DigestURL location, final String mimeType, final String charset,
        final Parser parserObject,
        final Set<String> languages,
        final String[] keywords,
        final List<String> titles,
        final String author, final String publisher,
        final String[] sections, final List<String> abstrcts,
        final double lon, final double lat,
        final Object text,
        final Collection<AnchorURL> anchors,
        final LinkedHashMap<DigestURL, String> rss,
        final LinkedHashMap<DigestURL, ImageEntry> images,
        final boolean indexingDenied,
        final Date lastModified) {
    this.source = location;
    this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType;
    this.charset = charset;
    this.parserObject = parserObject;
    this.keywords = new LinkedHashSet<String>();
    if (keywords != null) this.keywords.addAll(Arrays.asList(keywords));
    this.titles = (titles == null) ? new ArrayList<String>(1) : titles;
    this.creator = (author == null) ? new StringBuilder(0) : new StringBuilder(author);
    this.sections = new LinkedList<String>();
    if (sections != null) this.sections.addAll(Arrays.asList(sections));
    this.descriptions = (abstrcts == null) ? new ArrayList<String>() : abstrcts;
    if (lat >= -90.0d && lat <= 90.0d && lon >= -180.0d && lon <= 180.0d) {
        this.lon = lon;
        this.lat = lat;
    } else {
        // we ignore false values because otherwise solr will cause an error when we input the coordinates into the index
        this.lon = 0.0d;
        this.lat = 0.0d;
    }
    this.anchors = (anchors == null) ? new ArrayList<AnchorURL>(0) : anchors;
    this.rss = (rss == null) ? new LinkedHashMap<DigestURL, String>(0) : rss;
    this.images = (images == null) ? new LinkedHashMap<DigestURL, ImageEntry>() : images;
    this.publisher = publisher;
    // the partitioned link views stay null until resortLinks() fills them lazily
    this.hyperlinks = null;
    this.audiolinks = null;
    this.videolinks = null;
    this.applinks = null;
    this.emaillinks = null;
    this.resorted = false;
    this.inboundlinks = null;
    this.outboundlinks = null;
    this.languages = languages;
    this.indexingDenied = indexingDenied;
    this.text = text == null ? "" : text;
    this.generic_facets = new HashMap<String, Set<String>>();
    this.lastModified = lastModified == null ? new Date() : lastModified;
    this.crawldepth = 999; // unknown yet
}
2014-04-02 23:37:01 +02:00
2013-10-23 00:16:54 +02:00
/ * *
* Get the content domain of a document . This tries to get the content domain from the mime type
* and if this fails it uses alternatively the content domain from the file extension .
* @return the content domain which classifies the content type
* /
public ContentDomain getContentDomain ( ) {
ContentDomain contentDomain = Classification . getContentDomainFromMime ( this . mimeType ) ;
if ( contentDomain ! = ContentDomain . ALL ) return contentDomain ;
return this . dc_source ( ) . getContentDomainFromExt ( ) ;
}
2016-08-14 03:53:16 +02:00
/ * *
* The parser used to generate the document
* @return Parser
* /
public Parser getParserObject ( ) {
2011-04-21 15:58:49 +02:00
return this . parserObject ;
2009-10-18 03:38:07 +02:00
}
2011-06-01 21:31:56 +02:00
2016-08-14 03:53:16 +02:00
/ * *
* Confinient call to get the source / scraper object of the underlaying parser
* if the parser uses a scraper , like htmlParser
* @return scraper object typically of type ContentScraper but may also of type DCEntry
* /
public Object getScraperObject ( ) {
if ( this . parserObject instanceof AbstractParser ) {
if ( ( ( AbstractParser ) this . parserObject ) . scraperObject ! = null ) {
return ( ( AbstractParser ) this . parserObject ) . scraperObject ;
}
}
return null ;
}
2011-09-07 12:08:57 +02:00
public Set < String > getContentLanguages ( ) {
2011-11-25 12:23:52 +01:00
return this . languages ;
2011-11-23 00:50:49 +01:00
}
2011-11-25 12:23:52 +01:00
2013-06-25 16:27:20 +02:00
public String getFileName ( ) {
return this . source . getFileName ( ) ;
2011-09-07 12:08:57 +02:00
}
2012-12-18 02:29:03 +01:00
public Map < String , Set < String > > getGenericFacets ( ) {
return this . generic_facets ;
}
2008-09-20 00:19:11 +02:00
/ * *
* compute a set of languages that this document contains
* the language is not computed using a statistical analysis of the content , only from given metadata that came with the document
* if there are several languages defined in the document , the TLD is taken to check which one should be picked
* If there is no metadata at all , null is returned
* @return a string with a language name using the alpha - 2 code of ISO 639
* /
2009-04-17 16:20:12 +02:00
public String dc_language ( ) {
2008-09-20 00:19:11 +02:00
if ( this . languages = = null ) return null ;
2009-12-02 01:37:59 +01:00
if ( this . languages . isEmpty ( ) ) return null ;
2011-06-01 21:31:56 +02:00
if ( this . languages . size ( ) = = 1 ) return this . languages . iterator ( ) . next ( ) ;
2008-09-20 00:19:11 +02:00
if ( this . languages . contains ( this . source . language ( ) ) ) return this . source . language ( ) ;
// now we are confused: the declared languages differ all from the TLD
// just pick one of the languages that we have
2011-06-01 21:31:56 +02:00
return this . languages . iterator ( ) . next ( ) ;
2008-09-20 00:19:11 +02:00
}
2011-06-01 21:31:56 +02:00
2008-01-22 12:51:43 +01:00
/ *
DC according to rfc 5013
* dc_title
* dc_creator
* dc_subject
* dc_description
* dc_publisher
dc_contributor
dc_date
dc_type
* dc_format
* dc_identifier
* dc_source
dc_language
dc_relation
dc_coverage
dc_rights
* /
2011-06-01 21:31:56 +02:00
2015-11-13 01:48:28 +01:00
/ * *
* Get the main document title . This is the 1st in the list of titles .
* @return title_string ( may return null or empty string )
* /
2008-01-22 12:51:43 +01:00
public String dc_title ( ) {
2012-08-31 10:30:43 +02:00
return ( this . titles = = null | | this . titles . size ( ) = = 0 ) ? " " : this . titles . iterator ( ) . next ( ) ;
}
public List < String > titles ( ) {
return this . titles ;
2008-01-22 12:51:43 +01:00
}
2015-11-13 01:48:28 +01:00
/ * *
* Sets the title of the document , replacing any existing titles .
* @param title
* /
2011-06-01 21:31:56 +02:00
public void setTitle ( final String title ) {
2012-08-31 10:30:43 +02:00
this . titles = new ArrayList < String > ( ) ;
if ( title ! = null ) this . titles . add ( title ) ;
}
2012-12-18 02:29:03 +01:00
2011-06-01 21:31:56 +02:00
2008-01-22 12:51:43 +01:00
public String dc_creator ( ) {
2011-06-01 21:31:56 +02:00
return ( this . creator = = null ) ? " " : this . creator . toString ( ) ;
2006-12-01 17:21:17 +01:00
}
2011-04-21 15:58:49 +02:00
2012-01-15 22:17:57 +01:00
/ * *
* add the given words to the set of keywords .
* These keywords will appear in dc_subject
* @param tags
* /
public void addTags ( Set < String > tags ) {
for ( String s : tags ) {
2015-11-13 01:48:28 +01:00
if ( s ! = null & & ! s . isEmpty ( ) ) this . keywords . add ( s ) ;
2012-01-15 22:17:57 +01:00
}
}
2012-06-11 16:48:53 +02:00
/ * *
* add the given words to the set of keywords .
* These keywords will appear in dc_subject
2015-08-10 14:27:44 +02:00
* @param tags a map where the key is the navigator name and the value is the set of attributes as metatags
2012-06-11 16:48:53 +02:00
* /
2012-12-18 02:29:03 +01:00
protected void addMetatags ( Map < String , Set < Tagging . Metatag > > tags ) {
2015-07-02 00:23:50 +02:00
this . generic_facets . putAll ( computeGenericFacets ( tags ) ) ;
}
2015-08-10 14:27:44 +02:00
/ * *
* compute generic facets
* @param tags a map where the key is the navigator name and the value is the set of attributes as metatags
* @return a map where the key is the navigator name and the value is the set of attributes names
* /
2015-07-02 00:23:50 +02:00
public static Map < String , Set < String > > computeGenericFacets ( Map < String , Set < Tagging . Metatag > > tags ) {
Map < String , Set < String > > gf = new HashMap < String , Set < String > > ( ) ;
2012-06-11 23:49:30 +02:00
for ( Map . Entry < String , Set < Tagging . Metatag > > e : tags . entrySet ( ) ) {
2012-06-12 14:23:51 +02:00
Tagging vocabulary = LibraryProvider . autotagging . getVocabulary ( e . getKey ( ) ) ;
2012-06-14 18:50:35 +02:00
if ( vocabulary = = null ) continue ;
2012-12-18 02:29:03 +01:00
Set < String > objects = new HashSet < String > ( ) ;
2012-06-11 23:49:30 +02:00
for ( Tagging . Metatag s : e . getValue ( ) ) {
2012-12-18 02:29:03 +01:00
objects . add ( s . getObject ( ) ) ;
2012-06-11 16:48:53 +02:00
}
2015-07-02 00:23:50 +02:00
gf . put ( vocabulary . getName ( ) , objects ) ;
2012-06-11 16:48:53 +02:00
}
2015-07-02 00:23:50 +02:00
return gf ;
2012-06-11 16:48:53 +02:00
}
2015-11-13 01:48:28 +01:00
/ * *
* Get the set of keywords associated with the document
* @return set of unique keywords
* /
public Set < String > dc_subject ( ) {
return this . keywords ;
2011-04-21 15:58:49 +02:00
}
2011-06-01 21:31:56 +02:00
2015-11-13 01:48:28 +01:00
/ * *
* Get the set of keywords associated with the document and string
* each keyword separated by the separator character
*
* @param separator character
* @return string of keywords or empty string
* /
2011-04-21 15:58:49 +02:00
public String dc_subject ( final char separator ) {
2015-11-13 01:48:28 +01:00
if ( this . keywords . size ( ) = = 0 ) return " " ;
2008-01-22 12:51:43 +01:00
// generate a new list
2015-11-13 01:48:28 +01:00
final StringBuilder sb = new StringBuilder ( this . keywords . size ( ) * 8 ) ;
for ( final String s : this . keywords ) sb . append ( s ) . append ( separator ) ;
2008-01-22 12:51:43 +01:00
return sb . substring ( 0 , sb . length ( ) - 1 ) ;
}
2011-06-01 21:31:56 +02:00
2013-07-30 12:48:57 +02:00
public String [ ] dc_description ( ) {
if ( descriptions = = null ) return new String [ 0 ] ;
return this . descriptions . toArray ( new String [ this . descriptions . size ( ) ] ) ;
2008-01-22 12:51:43 +01:00
}
2011-06-01 21:31:56 +02:00
2008-01-22 12:51:43 +01:00
public String dc_publisher ( ) {
2010-05-25 14:54:57 +02:00
return this . publisher = = null ? " " : this . publisher ;
2008-01-22 12:51:43 +01:00
}
2011-06-01 21:31:56 +02:00
2008-01-22 12:51:43 +01:00
public String dc_format ( ) {
2005-09-05 12:34:34 +02:00
return this . mimeType ;
}
2011-06-01 21:31:56 +02:00
2008-01-22 12:51:43 +01:00
public String dc_identifier ( ) {
2012-10-10 11:46:22 +02:00
return this . source . toNormalform ( true ) ;
2008-01-22 12:51:43 +01:00
}
2011-06-01 21:31:56 +02:00
2013-09-15 00:30:23 +02:00
public DigestURL dc_source ( ) {
2008-01-22 12:51:43 +01:00
return this . source ;
}
2011-06-01 21:31:56 +02:00
2013-11-10 18:50:34 +01:00
/ * *
* rewrite the dc_source ; this can be used for normalization purpose
* @param pattern
* @param replacement
* /
public void rewrite_dc_source ( Pattern pattern , String replacement ) {
String u = this . source . toNormalform ( false ) ;
Matcher m = pattern . matcher ( u ) ;
if ( m . matches ( ) ) {
u = m . replaceAll ( replacement ) ;
try {
DigestURL du = new DigestURL ( u ) ;
this . source = du ;
} catch ( MalformedURLException e ) {
}
}
}
2006-09-15 15:18:12 +02:00
/ * *
* @return the supposed charset of this document or < code > null < / code > if unknown
* /
2006-12-01 17:21:17 +01:00
public String getCharset ( ) {
2006-09-15 15:18:12 +02:00
return this . charset ;
}
2011-06-01 21:31:56 +02:00
2005-04-24 23:24:53 +02:00
public String [ ] getSectionTitles ( ) {
2011-06-01 21:31:56 +02:00
if ( this . sections = = null ) {
2008-01-22 12:51:43 +01:00
return new String [ ] { dc_title ( ) } ;
2007-05-19 01:13:44 +02:00
}
2011-06-01 21:31:56 +02:00
return this . sections . toArray ( new String [ this . sections . size ( ) ] ) ;
2005-04-24 23:24:53 +02:00
}
2012-07-04 21:15:10 +02:00
/**
 * Get the clear text of the document as a stream.
 * The internal text field may hold one of several representations
 * (String, InputStream, File, byte[], ByteArrayOutputStream); this method
 * adapts whichever is present. On any error an empty stream is returned.
 * @return an InputStream over the document text, never null
 */
public InputStream getTextStream() {
    try {
        if (this.text == null) return new ByteArrayInputStream(UTF8.getBytes(""));
        if (this.text instanceof String) {
            //return new StreamReader((String) this.text);
            return new ByteArrayInputStream(UTF8.getBytes(((String) this.text)));
        } else if (this.text instanceof InputStream) {
            return (InputStream) this.text;
        } else if (this.text instanceof File) {
            return new BufferedInputStream(new FileInputStream((File) this.text));
        } else if (this.text instanceof byte[]) {
            return new ByteArrayInputStream((byte[]) this.text);
        } else if (this.text instanceof ByteArrayOutputStream) {
            return new ByteArrayInputStream(((ByteArrayOutputStream) this.text).toByteArray());
        }
        // unexpected representation of the text field
        assert false : this.text.getClass().toString();
        return null;
    } catch (final Exception e) {
        ConcurrentLog.logException(e);
    }
    // error fallback: empty stream instead of propagating the exception
    return new ByteArrayInputStream(UTF8.getBytes(""));
}
2011-06-01 21:31:56 +02:00
2012-07-04 21:15:10 +02:00
/**
 * Get the clear text of the document as a String.
 * Side effect: the internal text field is converted in place to a String
 * (consuming an InputStream or reading a File if that was the stored form),
 * so subsequent calls return the cached String. On any error "" is returned.
 * @return the document text, never null
 */
public String getTextString() {
    try {
        if (this.text == null) {
            this.text = "";
        } else if (this.text instanceof InputStream) {
            // NOTE: consumes the stream; afterwards the text is cached as a String
            this.text = UTF8.String(FileUtils.read((InputStream) this.text));
        } else if (this.text instanceof File) {
            this.text = UTF8.String(FileUtils.read((File) this.text));
        } else if (this.text instanceof byte[]) {
            this.text = UTF8.String((byte[]) this.text);
        } else if (this.text instanceof ByteArrayOutputStream) {
            this.text = UTF8.String(((ByteArrayOutputStream) this.text).toByteArray());
        }
        assert this.text instanceof String : this.text.getClass().toString();
        return (String) this.text;
    } catch (final Exception e) {
        ConcurrentLog.logException(e);
    }
    return "";
}
2011-06-01 21:31:56 +02:00
2006-09-30 11:31:53 +02:00
public long getTextLength ( ) {
2010-12-02 12:05:04 +01:00
try {
if ( this . text = = null ) return - 1 ;
if ( this . text instanceof String ) {
return ( ( String ) this . text ) . length ( ) ;
} else if ( this . text instanceof InputStream ) {
return ( ( InputStream ) this . text ) . available ( ) ;
} else if ( this . text instanceof File ) {
return ( ( File ) this . text ) . length ( ) ;
} else if ( this . text instanceof byte [ ] ) {
return ( ( byte [ ] ) this . text ) . length ;
} else if ( this . text instanceof ByteArrayOutputStream ) {
return ( ( ByteArrayOutputStream ) this . text ) . size ( ) ;
}
assert false : this . text . getClass ( ) . toString ( ) ;
return - 1 ;
} catch ( final Exception e ) {
2013-07-09 14:28:25 +02:00
ConcurrentLog . logException ( e ) ;
2007-05-19 01:13:44 +02:00
}
2011-06-01 21:31:56 +02:00
return - 1 ;
2005-04-24 23:24:53 +02:00
}
2011-06-01 21:31:56 +02:00
2010-09-22 22:50:02 +02:00
public List < StringBuilder > getSentences ( final boolean pre ) {
2012-07-05 00:20:58 +02:00
final SentenceReader sr = new SentenceReader ( getTextString ( ) , pre ) ;
2012-07-04 21:15:10 +02:00
List < StringBuilder > sentences = new ArrayList < StringBuilder > ( ) ;
while ( sr . hasNext ( ) ) {
sentences . add ( sr . next ( ) ) ;
2010-09-22 22:50:02 +02:00
}
return sentences ;
2005-06-02 03:33:10 +02:00
}
2011-06-01 21:31:56 +02:00
2015-12-21 04:42:26 +01:00
/ * *
* All anchor links of the document
* ( this includes mailto links )
* @return all links embedded as anchors ( clickeable entities )
* /
2013-09-15 00:30:23 +02:00
public Collection < AnchorURL > getAnchors ( ) {
2005-04-24 23:24:53 +02:00
// returns all links embedded as anchors (clickeable entities)
2006-12-08 03:14:56 +01:00
// this is a url(String)/text(String) map
2011-06-01 21:31:56 +02:00
return this . anchors ;
2005-04-24 23:24:53 +02:00
}
2011-06-01 21:31:56 +02:00
2014-07-07 13:37:17 +02:00
public LinkedHashMap < DigestURL , String > getRSS ( ) {
2010-08-25 20:24:54 +02:00
// returns all links embedded as anchors (clickeable entities)
// this is a url(String)/text(String) map
2011-06-01 21:31:56 +02:00
return this . rss ;
2010-08-25 20:24:54 +02:00
}
2011-06-01 21:31:56 +02:00
2005-04-24 23:24:53 +02:00
// the next three methods provide a calculated view on the getAnchors/getImages:
2011-06-01 21:31:56 +02:00
2015-12-21 04:42:26 +01:00
/ * *
* List of links to resources ( pages , images , files , media . . . )
* ( Hyperlinks do not include mailto : links )
* @return a subset of the getAnchor - set : only links to other hyperrefs
* /
2013-09-15 23:27:04 +02:00
public Map < AnchorURL , String > getHyperlinks ( ) {
2005-04-24 23:24:53 +02:00
// this is a subset of the getAnchor-set: only links to other hyperrefs
2011-06-01 21:31:56 +02:00
if ( ! this . resorted ) resortLinks ( ) ;
return this . hyperlinks ;
2005-04-24 23:24:53 +02:00
}
2011-06-01 21:31:56 +02:00
2013-09-15 23:27:04 +02:00
public Map < AnchorURL , String > getAudiolinks ( ) {
2011-06-01 21:31:56 +02:00
if ( ! this . resorted ) resortLinks ( ) ;
2006-11-28 16:00:15 +01:00
return this . audiolinks ;
}
2011-06-01 21:31:56 +02:00
2013-09-15 23:27:04 +02:00
public Map < AnchorURL , String > getVideolinks ( ) {
2011-06-01 21:31:56 +02:00
if ( ! this . resorted ) resortLinks ( ) ;
2006-11-28 16:00:15 +01:00
return this . videolinks ;
}
2011-06-01 21:31:56 +02:00
2014-08-01 12:04:15 +02:00
public LinkedHashMap < DigestURL , ImageEntry > getImages ( ) {
2006-12-01 17:21:17 +01:00
// returns all links enbedded as pictures (visible in document)
// this resturns a htmlFilterImageEntry collection
2011-06-01 21:31:56 +02:00
if ( ! this . resorted ) resortLinks ( ) ;
return this . images ;
2006-11-28 16:00:15 +01:00
}
2011-06-01 21:31:56 +02:00
2013-09-15 23:27:04 +02:00
public Map < AnchorURL , String > getApplinks ( ) {
2011-06-01 21:31:56 +02:00
if ( ! this . resorted ) resortLinks ( ) ;
2006-11-28 16:00:15 +01:00
return this . applinks ;
2005-04-24 23:24:53 +02:00
}
2011-06-01 21:31:56 +02:00
2015-12-18 02:35:44 +01:00
/ * *
* @return mailto links
* /
public Set < AnchorURL > getEmaillinks ( ) {
2005-04-24 23:24:53 +02:00
// this is part of the getAnchor-set: only links to email addresses
2011-06-01 21:31:56 +02:00
if ( ! this . resorted ) resortLinks ( ) ;
return this . emaillinks ;
2005-04-24 23:24:53 +02:00
}
2011-06-01 21:31:56 +02:00
2014-12-11 23:37:41 +01:00
public Date getLastModified ( ) {
return this . lastModified ;
2013-09-10 10:31:57 +02:00
}
2012-05-31 22:39:53 +02:00
public double lon ( ) {
2011-03-30 02:49:47 +02:00
return this . lon ;
}
2011-06-01 21:31:56 +02:00
2012-05-31 22:39:53 +02:00
public double lat ( ) {
2011-03-30 02:49:47 +02:00
return this . lat ;
}
2011-06-01 21:31:56 +02:00
2015-12-16 03:01:17 +01:00
/**
 * sorts all links (anchors) into individual collections:
 * mailto links, inbound/outbound links, hyperlinks, and per-media-type maps
 * (audio/video/app/image). Runs at most once; guarded by the resorted flag
 * with a double check under synchronization.
 */
private void resortLinks() {
    if (this.resorted) return;
    synchronized (this) {
        if (this.resorted) return; // double-checked: another thread may have sorted meanwhile
        // extract hyperlinks, medialinks and emaillinks from anchorlinks
        String u;
        int extpos, qpos;
        String ext = null;
        final String thishost = this.source.getHost();
        this.inboundlinks = new LinkedHashMap<DigestURL, String>();
        this.outboundlinks = new LinkedHashMap<DigestURL, String>();
        this.hyperlinks = new LinkedHashMap<AnchorURL, String>();
        this.videolinks = new LinkedHashMap<AnchorURL, String>();
        this.audiolinks = new LinkedHashMap<AnchorURL, String>();
        this.applinks = new LinkedHashMap<AnchorURL, String>();
        this.emaillinks = new LinkedHashSet<AnchorURL>();
        // this is a set that is collected now and joined later to the imagelinks
        final Map<AnchorURL, ImageEntry> collectedImages = new HashMap<AnchorURL, ImageEntry>();
        // classify the already-known images as inbound/outbound by host
        for (final Map.Entry<DigestURL, ImageEntry> entry : this.images.entrySet()) {
            if (entry.getKey() != null && entry.getKey().getHost() != null && entry.getKey().getHost().equals(thishost)) this.inboundlinks.put(entry.getKey(), "image"); else this.outboundlinks.put(entry.getKey(), "image");
        }
        for (final AnchorURL url : this.anchors) {
            if (url == null) continue;
            u = url.toNormalform(true);
            final String name = url.getNameProperty();
            // check mailto scheme first (not suppose to get into in/outboundlinks or hyperlinks -> crawler can't process)
            if (url.getProtocol().equals("mailto")) {
                this.emaillinks.add(url);
                continue;
            }
            final boolean noindex = url.getRelProperty().toLowerCase().indexOf("noindex", 0) >= 0;
            final boolean nofollow = url.getRelProperty().toLowerCase().indexOf("nofollow", 0) >= 0;
            // inbound when both hosts are unknown, or the link host matches this host
            // (also treating "www."-prefixed hosts as equal to the bare host)
            if ((thishost == null && url.getHost() == null) ||
                ((thishost != null && url.getHost() != null) &&
                 (url.getHost().endsWith(thishost) ||
                  (thishost.startsWith("www.") && url.getHost().endsWith(thishost.substring(4)))))) {
                this.inboundlinks.put(url, "anchor" + (noindex ? " noindex" : "") + (nofollow ? " nofollow" : ""));
            } else {
                this.outboundlinks.put(url, "anchor" + (noindex ? " noindex" : "") + (nofollow ? " nofollow" : ""));
            }
            // extract the file extension (ignoring any query string after '?')
            extpos = u.lastIndexOf('.');
            if (extpos > 0) {
                if (((qpos = u.indexOf('?')) >= 0) && (qpos > extpos)) {
                    ext = u.substring(extpos + 1, qpos).toLowerCase();
                } else {
                    ext = u.substring(extpos + 1).toLowerCase();
                }
                if (Classification.isMediaExtension(ext)) {
                    // this is not a normal anchor, its a media link
                    if (Classification.isImageExtension(ext)) { // TODO: guess on a-tag href extension (may not be correct)
                        collectedImages.put(url, new ImageEntry(url, name, -1, -1, -1));
                    } else if (Classification.isAudioExtension(ext)) this.audiolinks.put(url, name);
                    else if (Classification.isVideoExtension(ext)) this.videolinks.put(url, name);
                    else if (Classification.isApplicationExtension(ext)) this.applinks.put(url, name);
                }
            }
            // in any case we consider this as a link and let the parser decide if that link can be followed
            this.hyperlinks.put(url, name);
        }

        // add image links that we collected from the anchors to the image map
        this.images.putAll(collectedImages);

        // expand the hyperlinks:
        // we add artificial hyperlinks to the hyperlink set
        // that can be calculated from given hyperlinks and imagelinks
        this.hyperlinks.putAll(allReflinks(this.images.values()));
        this.hyperlinks.putAll(allReflinks(this.audiolinks.keySet()));
        this.hyperlinks.putAll(allReflinks(this.videolinks.keySet()));
        this.hyperlinks.putAll(allReflinks(this.applinks.keySet()));
        /*
        hyperlinks.putAll(allSubpaths(hyperlinks.keySet()));
        hyperlinks.putAll(allSubpaths(images.values()));
        hyperlinks.putAll(allSubpaths(audiolinks.keySet()));
        hyperlinks.putAll(allSubpaths(videolinks.keySet()));
        hyperlinks.putAll(allSubpaths(applinks.keySet()));
         */
        // don't do this again
        this.resorted = true;
    }
}
2011-06-01 21:31:56 +02:00
2013-09-15 00:30:23 +02:00
public static Map < MultiProtocolURL , String > allSubpaths ( final Collection < ? > links ) {
2009-07-10 00:25:31 +02:00
// links is either a Set of Strings (urls) or a Set of
// htmlFilterImageEntries
2010-12-27 18:07:21 +01:00
final Set < String > h = new HashSet < String > ( ) ;
2009-07-10 00:25:31 +02:00
Iterator < ? > i = links . iterator ( ) ;
Object o ;
2013-09-15 00:30:23 +02:00
MultiProtocolURL url ;
2009-07-10 00:25:31 +02:00
String u ;
int pos ;
int l ;
while ( i . hasNext ( ) )
try {
o = i . next ( ) ;
2013-09-15 00:30:23 +02:00
if ( o instanceof MultiProtocolURL ) url = ( MultiProtocolURL ) o ;
else if ( o instanceof String ) url = new MultiProtocolURL ( ( String ) o ) ;
2009-07-10 00:25:31 +02:00
else if ( o instanceof ImageEntry ) url = ( ( ImageEntry ) o ) . url ( ) ;
else {
assert false ;
continue ;
}
2012-10-10 11:46:22 +02:00
u = url . toNormalform ( true ) ;
2009-07-10 00:25:31 +02:00
if ( u . endsWith ( " / " ) )
u = u . substring ( 0 , u . length ( ) - 1 ) ;
pos = u . lastIndexOf ( '/' ) ;
while ( pos > 8 ) {
l = u . length ( ) ;
u = u . substring ( 0 , pos + 1 ) ;
h . add ( u ) ;
u = u . substring ( 0 , pos ) ;
assert ( u . length ( ) < l ) : " u = " + u ;
pos = u . lastIndexOf ( '/' ) ;
}
} catch ( final MalformedURLException e ) { }
// now convert the strings to yacyURLs
i = h . iterator ( ) ;
2013-09-15 00:30:23 +02:00
final Map < MultiProtocolURL , String > v = new HashMap < MultiProtocolURL , String > ( ) ;
2009-07-10 00:25:31 +02:00
while ( i . hasNext ( ) ) {
u = ( String ) i . next ( ) ;
try {
2013-09-15 00:30:23 +02:00
url = new MultiProtocolURL ( u ) ;
2009-07-10 00:25:31 +02:00
v . put ( url , " sub " ) ;
} catch ( final MalformedURLException e ) {
}
}
return v ;
}
2011-06-01 21:31:56 +02:00
2016-07-17 23:42:25 +02:00
/**
 * We find all links that are part of a reference inside a url
 * (i.e. "http://host/redirect?to=http://other/page" yields "http://other/page").
 *
 * NOTE: when an embedded reference is found, the originating element is removed
 * from the given collection via {@code i.remove()} — this method mutates its argument.
 *
 * @param links links is either a Set of AnchorURL, Strings (with urls) or htmlFilterImageEntries
 * @return map with contained urls as key and "ref" as value
 */
private static Map<AnchorURL, String> allReflinks(final Collection<?> links) {
    final Map<AnchorURL, String> v = new HashMap<AnchorURL, String>();
    final Iterator<?> i = links.iterator();
    Object o;
    AnchorURL url = null;
    String u;
    int pos;
    loop: while (i.hasNext())
        try {
            url = null;
            o = i.next();
            // normalize the heterogeneous input element to an AnchorURL
            if (o instanceof AnchorURL)
                url = (AnchorURL) o;
            else if (o instanceof String)
                url = new AnchorURL((String) o);
            else if (o instanceof ImageEntry)
                url = new AnchorURL(((ImageEntry) o).url());
            else {
                assert false;
                continue loop;
            }
            u = url.toNormalform(true);
            // find start of a referenced http url
            if ((pos = u.toLowerCase().indexOf("http://", 7)) > 0) { // 7 = skip the protocol part of the source url
                i.remove();
                u = u.substring(pos);
                // if several references are nested, keep only the innermost one
                while ((pos = u.toLowerCase().indexOf("http://", 7)) > 0)
                    u = u.substring(pos);
                url = new AnchorURL(u);
                if (!(v.containsKey(url)))
                    v.put(url, "ref");
                continue loop;
            }
            // find start of a referenced https url
            if ((pos = u.toLowerCase().indexOf("https://", 7)) > 0) { // 7 = skip the protocol part of the source url
                i.remove();
                u = u.substring(pos);
                while ((pos = u.toLowerCase().indexOf("https://", 7)) > 0)
                    u = u.substring(pos);
                url = new AnchorURL(u);
                if (!(v.containsKey(url)))
                    v.put(url, "ref");
                continue loop;
            }
            // find a protocol-less embedded host reference like ".../www.host.tld/...";
            // rebuild it with the protocol of the source url
            if ((pos = u.toLowerCase().indexOf("/www.", 11)) > 0) { // 11 = skip protocol part + www of source url "http://www."
                i.remove();
                u = url.getProtocol() + ":/" + u.substring(pos); // pos points at "/www.", so this forms "proto://www."
                while ((pos = u.toLowerCase().indexOf("/www.", 11)) > 0)
                    u = url.getProtocol() + ":/" + u.substring(pos);
                AnchorURL addurl = new AnchorURL(u);
                if (!(v.containsKey(addurl)))
                    v.put(addurl, "ref");
                continue loop;
            }
        } catch (final MalformedURLException e) {
        }
    return v;
}
2011-06-01 21:31:56 +02:00
2015-11-06 23:58:55 +01:00
/**
 * Adds the main content of subdocuments to this document.
 * This is useful if the document is a container for other documents (like zip or other archives)
 * to make the content of the subdocuments searcheable,
 * but has only one url (unlike container-urls as rss).
 *
 * This is similar to mergeDocuments but directly joins internal content variables,
 * uses less parsed details and keeps this documents crawl data (like crawldepth, lastmodified)
 *
 * @see mergeDocuments()
 * @param docs to be included
 * @throws IOException
 */
public void addSubDocuments(final Document[] docs) throws IOException {
    for (final Document doc: docs) {
        // merge the textual metadata of the subdocument into this document
        this.sections.addAll(doc.sections);
        this.titles.addAll(doc.titles());
        this.keywords.addAll(doc.dc_subject());
        for (String d: doc.dc_description()) this.descriptions.add(d);

        // ensure the text container is an in-memory buffer so we can append to it
        // NOTE(review): if this.text was previously a File or InputStream its old
        // content is discarded here, not copied into the new buffer — confirm intended
        if (!(this.text instanceof ByteArrayOutputStream)) {
            this.text = new ByteArrayOutputStream();
        }
        FileUtils.copy(doc.getTextStream(), (ByteArrayOutputStream) this.text);

        // merge the link structures of the subdocument
        this.anchors.addAll(doc.getAnchors());
        this.rss.putAll(doc.getRSS());
        this.images.putAll(doc.getImages());
    }
}
2011-06-01 21:31:56 +02:00
2007-06-09 17:22:37 +02:00
/**
 * @return the {@link URL} to the favicon that belongs to the document, or null if none was set
 */
public MultiProtocolURL getFavicon() {
    return this.favicon;
}
2011-06-01 21:31:56 +02:00
2007-06-09 17:22:37 +02:00
/**
 * @param faviconURL the {@link URL} to the favicon that belongs to the document
 */
public void setFavicon(final MultiProtocolURL faviconURL) {
    this.favicon = faviconURL;
}
2011-06-01 21:31:56 +02:00
2012-01-31 23:46:35 +01:00
public int inboundLinkNofollowCount ( ) {
2011-07-03 08:40:05 +02:00
if ( this . inboundlinks = = null ) resortLinks ( ) ;
if ( this . inboundlinks = = null ) return 0 ;
int c = 0 ;
for ( final String tag : this . inboundlinks . values ( ) ) {
2012-01-31 23:46:35 +01:00
if ( tag . contains ( " nofollow " ) ) c + + ;
2011-07-03 08:40:05 +02:00
}
return c ;
}
2012-01-31 23:46:35 +01:00
public int outboundLinkNofollowCount ( ) {
2011-07-03 08:40:05 +02:00
if ( this . outboundlinks = = null ) resortLinks ( ) ;
if ( this . outboundlinks = = null ) return 0 ;
int c = 0 ;
for ( final String tag : this . outboundlinks . values ( ) ) {
2012-01-31 23:46:35 +01:00
if ( tag . contains ( " nofollow " ) ) c + + ;
2011-07-03 08:40:05 +02:00
}
return c ;
}
2013-09-15 00:30:23 +02:00
public LinkedHashMap < DigestURL , String > inboundLinks ( ) {
2011-04-21 15:58:49 +02:00
if ( this . inboundlinks = = null ) resortLinks ( ) ;
2013-09-15 00:30:23 +02:00
return ( this . inboundlinks = = null ) ? null : this . inboundlinks ;
2011-04-21 15:58:49 +02:00
}
2011-06-01 21:31:56 +02:00
2013-09-15 00:30:23 +02:00
public LinkedHashMap < DigestURL , String > outboundLinks ( ) {
2011-04-21 15:58:49 +02:00
if ( this . outboundlinks = = null ) resortLinks ( ) ;
2013-09-15 00:30:23 +02:00
return ( this . outboundlinks = = null ) ? null : this . outboundlinks ;
2008-03-26 20:51:05 +01:00
}
2011-06-01 21:31:56 +02:00
2010-03-04 00:32:56 +01:00
/**
 * @return true if indexing of this document was denied (e.g. by a noindex meta rule)
 */
public boolean indexingDenied() {
    return this.indexingDenied;
}
2011-06-01 21:31:56 +02:00
2014-07-10 17:13:35 +02:00
/**
 * @param indexingDenied true to exclude this document from indexing
 */
public void setIndexingDenied(boolean indexingDenied) {
    this.indexingDenied = indexingDenied;
}
2014-04-02 23:37:01 +02:00
/**
 * @param depth the crawl depth at which this document was fetched
 */
public void setDepth(int depth) {
    this.crawldepth = depth;
}

/**
 * @return the crawl depth at which this document was fetched
 */
public int getDepth() {
    return this.crawldepth;
}
2015-12-17 02:53:10 +01:00
public void writeXML ( final Writer os ) throws IOException {
2009-04-17 16:20:12 +02:00
os . write ( " <record> \ n " ) ;
2011-06-01 21:31:56 +02:00
final String title = dc_title ( ) ;
2010-05-11 13:14:05 +02:00
if ( title ! = null & & title . length ( ) > 0 ) os . write ( " <dc:title><![CDATA[ " + title + " ]]></dc:title> \ n " ) ;
2011-06-01 21:31:56 +02:00
os . write ( " <dc:identifier> " + dc_identifier ( ) + " </dc:identifier> \ n " ) ;
final String creator = dc_creator ( ) ;
2010-05-11 13:14:05 +02:00
if ( creator ! = null & & creator . length ( ) > 0 ) os . write ( " <dc:creator><![CDATA[ " + creator + " ]]></dc:creator> \ n " ) ;
2011-06-01 21:31:56 +02:00
final String publisher = dc_publisher ( ) ;
2010-05-11 13:14:05 +02:00
if ( publisher ! = null & & publisher . length ( ) > 0 ) os . write ( " <dc:publisher><![CDATA[ " + publisher + " ]]></dc:publisher> \ n " ) ;
2011-06-01 21:31:56 +02:00
final String subject = this . dc_subject ( ';' ) ;
2010-05-11 13:14:05 +02:00
if ( subject ! = null & & subject . length ( ) > 0 ) os . write ( " <dc:subject><![CDATA[ " + subject + " ]]></dc:subject> \ n " ) ;
2010-01-19 15:59:58 +01:00
if ( this . text ! = null ) {
2010-05-11 13:14:05 +02:00
os . write ( " <dc:description><![CDATA[ " ) ;
2012-07-04 21:15:10 +02:00
os . write ( getTextString ( ) ) ;
2010-05-11 13:14:05 +02:00
os . write ( " ]]></dc:description> \ n " ) ;
2010-01-19 15:59:58 +01:00
}
2011-06-01 21:31:56 +02:00
final String language = dc_language ( ) ;
if ( language ! = null & & language . length ( ) > 0 ) os . write ( " <dc:language> " + dc_language ( ) + " </dc:language> \ n " ) ;
2015-12-17 02:53:10 +01:00
os . write ( " <dc:date> " + ISO8601Formatter . FORMATTER . format ( getLastModified ( ) ) + " </dc:date> \ n " ) ;
2013-01-14 03:06:24 +01:00
if ( this . lon ! = 0 . 0 & & this . lat ! = 0 . 0 ) os . write ( " <geo:Point><geo:long> " + this . lon + " </geo:long><geo:lat> " + this . lat + " </geo:lat></geo:Point> \ n " ) ;
2009-04-17 16:20:12 +02:00
os . write ( " </record> \ n " ) ;
}
2011-06-01 21:31:56 +02:00
2010-12-27 18:07:21 +01:00
@Override
2010-01-19 15:59:58 +01:00
public String toString ( ) {
2010-12-27 18:07:21 +01:00
final ByteArrayOutputStream baos = new ByteArrayOutputStream ( ) ;
2010-01-19 15:59:58 +01:00
try {
2016-01-05 23:37:05 +01:00
final Writer osw = new OutputStreamWriter ( baos , StandardCharsets . UTF_8 ) ;
2015-12-17 02:53:10 +01:00
writeXML ( osw ) ;
2010-01-19 15:59:58 +01:00
osw . close ( ) ;
2011-03-07 21:36:40 +01:00
return UTF8 . String ( baos . toByteArray ( ) ) ;
2011-06-01 21:31:56 +02:00
} catch ( final UnsupportedEncodingException e1 ) {
2010-05-11 13:14:05 +02:00
return " " ;
2011-06-01 21:31:56 +02:00
} catch ( final IOException e ) {
2010-01-19 15:59:58 +01:00
return " " ;
}
}
2011-06-01 21:31:56 +02:00
2012-05-14 07:41:55 +02:00
/**
 * Release the resources behind the document text.
 * The text field may hold one of several representations (in-memory buffer,
 * InputStream, or temporary File); streams are closed and temp files deleted.
 * After this call the text is no longer available.
 */
public synchronized void close() {
    if (this.text == null) return;

    // try close the output stream
    // (the finally block nulls the field even if close() throws)
    if (this.text instanceof InputStream) try {
        ((InputStream) this.text).close();
    } catch (final Exception e) {} finally {
        this.text = null;
    }

    // delete the temp file
    if (this.text instanceof File) try {
        FileUtils.deletedelete((File) this.text);
    } catch (final Exception e) {} finally {
        this.text = null;
    }
}
2011-06-01 21:31:56 +02:00
2010-06-29 21:20:45 +02:00
/**
 * merge documents: a helper method for all parsers that return multiple documents.
 * Textual metadata (authors, publishers, subjects) is concatenated, link and image
 * maps are unioned, the earliest last-modified date and the smallest crawl depth
 * of all documents are kept, and indexing is denied if any document denies it.
 *
 * @param location the common url of all merged documents
 * @param globalMime the mime type assigned to the merged document
 * @param docs the documents to merge; null entries are skipped
 * @return a single consolidated document, null for empty input,
 *         or the single element for a one-element input
 */
public static Document mergeDocuments(final DigestURL location, final String globalMime, final Document[] docs) {
    if (docs == null || docs.length == 0) return null;
    if (docs.length == 1) return docs[0];

    long docTextLength = 0;
    final ByteBuffer content = new ByteBuffer();
    final StringBuilder authors = new StringBuilder(80);
    final StringBuilder publishers = new StringBuilder(80);
    final StringBuilder subjects = new StringBuilder(80);
    final List<String> descriptions = new ArrayList<>();
    final Collection<String> titles = new LinkedHashSet<>();
    final Collection<String> sectionTitles = new LinkedHashSet<>();
    final List<AnchorURL> anchors = new ArrayList<>();
    final LinkedHashMap<DigestURL, String> rss = new LinkedHashMap<>();
    final LinkedHashMap<DigestURL, ImageEntry> images = new LinkedHashMap<>();
    final Set<String> languages = new HashSet<>();
    double lon = 0.0d, lat = 0.0d;
    boolean indexingDenied = false;
    // start with "now"; any older document date below replaces it
    Date date = new Date();
    String charset = null;

    int mindepth = 999;
    for (final Document doc: docs) {
        if (doc == null) continue;
        if (charset == null) charset = doc.charset; // TODO: uses this charset for merged content

        // concatenate creator/publisher/subject strings, comma-separated
        final String author = doc.dc_creator();
        if (author.length() > 0) {
            if (authors.length() > 0) authors.append(",");
            authors.append(author);
        }
        final String publisher = doc.dc_publisher();
        if (publisher.length() > 0) {
            if (publishers.length() > 0) publishers.append(",");
            publishers.append(publisher);
        }
        final String subject = doc.dc_subject(',');
        if (subject.length() > 0) {
            if (subjects.length() > 0) subjects.append(",");
            subjects.append(subject);
        }

        titles.addAll(doc.titles());
        sectionTitles.addAll(Arrays.asList(doc.getSectionTitles()));
        for (String d: doc.dc_description()) descriptions.add(d);

        // append the document text, newline-separated
        if (doc.getTextLength() > 0) {
            if (docTextLength > 0) content.write('\n');
            try {
                docTextLength += FileUtils.copy(doc.getTextStream(), content);
            } catch (final IOException e) {
                ConcurrentLog.logException(e);
            }
        }
        anchors.addAll(doc.getAnchors());
        rss.putAll(doc.getRSS());
        images.putAll(doc.getImages());
        // keep the last non-zero geolocation found
        if (doc.lon() != 0.0 && doc.lat() != 0.0) { lon = doc.lon(); lat = doc.lat(); }
        // keep the earliest last-modified date and the smallest crawl depth
        if (doc.lastModified.before(date)) date = doc.lastModified;
        if (doc.getDepth() < mindepth) mindepth = doc.getDepth();
        if (doc.dc_language() != null) languages.add(doc.dc_language());

        // any sub-document denying indexing denies it for the merged document
        indexingDenied |= doc.indexingDenied;
    }

    // clean up parser data
    for (final Document doc: docs) {
        Object scraper = doc.getScraperObject();
        if (scraper instanceof ContentScraper) {
            final ContentScraper html = (ContentScraper) scraper;
            html.close();
        }
    }

    // return consolidation
    ArrayList<String> titlesa = new ArrayList<String>();
    titlesa.addAll(titles);
    Document newDoc = new Document(
            location,
            globalMime,
            charset,
            null,
            languages,
            subjects.toString().split(" |,"),
            titlesa,
            authors.toString(),
            publishers.toString(),
            sectionTitles.toArray(new String[sectionTitles.size()]),
            descriptions,
            lon, lat,
            content.getBytes(),
            anchors,
            rss,
            images,
            indexingDenied,
            date);
    newDoc.setDepth(mindepth);
    return newDoc;
}
2011-06-01 21:31:56 +02:00
2014-04-18 06:51:46 +02:00
/** marker value for links discovered via a link rel="canonical" element */
public final static String CANONICAL_MARKER = "canonical";
/** marker value for links discovered inside an iframe element */
public final static String IFRAME_MARKER = "iframe";
/** marker value for links discovered inside a frame element */
public final static String FRAME_MARKER = "frame";
/** marker value for links discovered inside an embed element */
public final static String EMBED_MARKER = "embed";
2014-07-18 12:43:01 +02:00
public static Map < AnchorURL , String > getHyperlinks ( final Document [ ] documents , boolean includeNofollow ) {
final Map < AnchorURL , String > result = new HashMap < > ( ) ;
2010-12-27 18:07:21 +01:00
for ( final Document d : documents ) {
2014-07-18 12:43:01 +02:00
if ( includeNofollow ) {
result . putAll ( d . getHyperlinks ( ) ) ;
} else {
for ( Map . Entry < AnchorURL , String > entry : d . getHyperlinks ( ) . entrySet ( ) ) {
if ( ! entry . getKey ( ) . attachedNofollow ( ) ) result . put ( entry . getKey ( ) , entry . getValue ( ) ) ;
}
}
2016-08-14 03:53:16 +02:00
final Object scraper = d . getScraperObject ( ) ;
if ( scraper instanceof ContentScraper ) {
final ContentScraper html = ( ContentScraper ) scraper ;
2012-10-29 21:42:31 +01:00
String refresh = html . getRefreshPath ( ) ;
2014-07-18 12:43:01 +02:00
if ( refresh ! = null & & refresh . length ( ) > 0 ) try { result . put ( new AnchorURL ( refresh ) , " refresh " ) ; } catch ( final MalformedURLException e ) { }
AnchorURL canonical = html . getCanonical ( ) ;
2014-04-18 06:51:46 +02:00
if ( canonical ! = null ) {
result . put ( canonical , CANONICAL_MARKER ) ;
}
2015-01-06 14:14:25 +01:00
for ( AnchorURL u : html . getFrames ( ) ) result . put ( u , FRAME_MARKER ) ;
2015-01-02 02:44:03 +01:00
for ( AnchorURL u : html . getIFrames ( ) ) result . put ( u , IFRAME_MARKER ) ;
for ( AnchorURL u : html . getEmbeds ( ) . keySet ( ) ) result . put ( u , EMBED_MARKER ) ;
2012-10-29 21:42:31 +01:00
}
2010-12-27 18:07:21 +01:00
}
2010-06-29 21:20:45 +02:00
return result ;
}
2011-06-01 21:31:56 +02:00
2013-09-15 00:30:23 +02:00
public static Map < DigestURL , String > getImagelinks ( final Document [ ] documents ) {
final Map < DigestURL , String > result = new HashMap < DigestURL , String > ( ) ;
2010-12-27 18:07:21 +01:00
for ( final Document d : documents ) {
2011-06-01 21:31:56 +02:00
for ( final ImageEntry imageReference : d . getImages ( ) . values ( ) ) {
2012-04-24 16:07:03 +02:00
// construct a image name which contains the document title to enhance the search process for images
result . put ( imageReference . url ( ) , description ( d , imageReference . alt ( ) ) ) ;
2010-06-29 21:20:45 +02:00
}
}
return result ;
}
2009-08-27 16:34:41 +02:00
2013-09-15 00:30:23 +02:00
public static Map < DigestURL , String > getAudiolinks ( final Document [ ] documents ) {
final Map < DigestURL , String > result = new HashMap < DigestURL , String > ( ) ;
2012-04-24 16:07:03 +02:00
for ( final Document d : documents ) {
2013-09-15 23:27:04 +02:00
for ( Map . Entry < AnchorURL , String > e : d . audiolinks . entrySet ( ) ) {
2012-04-24 16:07:03 +02:00
result . put ( e . getKey ( ) , description ( d , e . getValue ( ) ) ) ;
}
}
2012-04-22 02:05:17 +02:00
return result ;
}
2013-09-15 00:30:23 +02:00
public static Map < DigestURL , String > getVideolinks ( final Document [ ] documents ) {
final Map < DigestURL , String > result = new HashMap < DigestURL , String > ( ) ;
2012-04-24 16:07:03 +02:00
for ( final Document d : documents ) {
2013-09-15 23:27:04 +02:00
for ( Map . Entry < AnchorURL , String > e : d . videolinks . entrySet ( ) ) {
2012-04-24 16:07:03 +02:00
result . put ( e . getKey ( ) , description ( d , e . getValue ( ) ) ) ;
}
}
2012-04-22 02:05:17 +02:00
return result ;
}
2013-09-15 00:30:23 +02:00
public static Map < DigestURL , String > getApplinks ( final Document [ ] documents ) {
final Map < DigestURL , String > result = new HashMap < DigestURL , String > ( ) ;
2012-04-24 16:07:03 +02:00
for ( final Document d : documents ) {
2013-09-15 23:27:04 +02:00
for ( Map . Entry < AnchorURL , String > e : d . applinks . entrySet ( ) ) {
2012-04-24 16:07:03 +02:00
result . put ( e . getKey ( ) , description ( d , e . getValue ( ) ) ) ;
}
}
2012-04-22 02:05:17 +02:00
return result ;
}
2011-06-01 21:31:56 +02:00
2012-04-24 16:07:03 +02:00
/**
 * Build a descriptive text for a media link, combining the document title,
 * its first description, its subjects and the given tag name, limited to
 * Request.descrLength characters with the tag name kept whenever possible.
 *
 * @param d the document the media link belongs to
 * @param tagname the anchor/alt text of the link; if empty, tokens of the source url are used
 * @return trimmed description text
 */
private static final String description(Document d, String tagname) {
    if (tagname == null || tagname.isEmpty()) {
        tagname = d.source.toTokens();
    }
    StringBuilder sb = new StringBuilder(60);
    sb.append(d.dc_title());
    // append the first description only if it adds information beyond the title
    // and still fits into the length budget reserved for the tagname
    if (d.dc_description().length > 0) {
        if (!d.dc_description()[0].equals(d.dc_title()) && sb.length() < Request.descrLength - tagname.length()) {
            sb.append(' ');
            sb.append(d.dc_description()[0]);
        }
    }
    if (sb.length() < Request.descrLength - tagname.length()) {
        sb.append(' ');
        sb.append(d.dc_subject(','));
    }
    if (tagname.length() > 0) {
        if (sb.length() > Request.descrLength - tagname.length() - 3) {
            // cut this off because otherwise the tagname is lost.
            if (tagname.length() > Request.descrLength) { // but in rare case tagname could be extreme long
                sb.setLength(0);
                sb.append(tagname.substring(0, Request.descrLength));
            } else {
                sb.setLength(Request.descrLength - tagname.length());
                sb.append(" - ");
                sb.append(tagname);
            }
        } else {
            sb.append(" - ");
            sb.append(tagname);
        }
    }
    return sb.toString().trim();
}
2005-04-29 00:04:57 +02:00
}