2005-04-24 23:47:34 +02:00
//plasmaParserDocument.java
//------------------------
//part of YaCy
2008-07-20 19:14:51 +02:00
//(C) by Michael Peter Christen; mc@yacy.net
2005-04-24 23:47:34 +02:00
//first published on http://www.anomic.de
//Frankfurt, Germany, 2005
//
//last major change: 24.04.2005
//
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version.
//
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
2005-04-24 23:24:53 +02:00
package de.anomic.plasma ;
2006-09-30 11:31:53 +02:00
import java.io.BufferedInputStream ;
2005-06-02 03:33:10 +02:00
import java.io.ByteArrayInputStream ;
2006-09-30 11:31:53 +02:00
import java.io.File ;
import java.io.FileInputStream ;
2007-05-19 01:13:44 +02:00
import java.io.IOException ;
2006-09-30 11:31:53 +02:00
import java.io.InputStream ;
2009-04-17 16:20:12 +02:00
import java.io.OutputStreamWriter ;
2007-05-19 01:13:44 +02:00
import java.util.Arrays ;
2008-03-26 20:51:05 +01:00
import java.util.Date ;
2005-04-24 23:24:53 +02:00
import java.util.HashMap ;
import java.util.Iterator ;
2007-05-19 01:13:44 +02:00
import java.util.LinkedList ;
import java.util.List ;
2005-04-24 23:24:53 +02:00
import java.util.Map ;
2008-09-20 00:19:11 +02:00
import java.util.Set ;
2006-04-04 16:36:01 +02:00
import java.util.TreeSet ;
2005-04-24 23:24:53 +02:00
2008-02-25 15:08:15 +01:00
import de.anomic.htmlFilter.htmlFilterContentScraper ;
2006-09-30 00:27:20 +02:00
import de.anomic.htmlFilter.htmlFilterImageEntry ;
2009-04-17 16:20:12 +02:00
import de.anomic.kelondro.util.DateFormatter ;
2009-01-31 02:06:56 +01:00
import de.anomic.kelondro.util.FileUtils ;
2007-05-19 01:13:44 +02:00
import de.anomic.plasma.parser.Parser ;
2009-04-03 15:23:45 +02:00
import de.anomic.plasma.parser.Condenser ;
2008-06-06 18:01:27 +02:00
import de.anomic.server.serverCachedFileOutputStream ;
import de.anomic.yacy.yacyURL ;
2006-09-30 00:27:20 +02:00
2005-04-24 23:24:53 +02:00
public class plasmaParserDocument {
2008-08-02 14:12:04 +02:00
private final yacyURL source ; // the source url
private final String mimeType ; // mimeType as taken from http header
private final String charset ; // the charset of the document
private final List < String > keywords ; // most resources provide a keyword field
2009-05-08 09:54:10 +02:00
private StringBuilder title ; // a document title, taken from title or h1 tag; shall appear as headline of search result
2008-12-04 13:54:16 +01:00
private final StringBuilder creator ; // author or copyright
private final List < String > sections ; // if present: more titles/headlines appearing in the document
private final StringBuilder description ; // an abstract, if present: short content description
2008-03-24 23:51:26 +01:00
private Object text ; // the clear text, all that is visible
2008-08-02 14:12:04 +02:00
private final Map < yacyURL , String > anchors ; // all links embedded as clickeable entities (anchor tags)
private final HashMap < String , htmlFilterImageEntry > images ; // all visible pictures in document
2005-04-24 23:24:53 +02:00
// the anchors and images - Maps are URL-to-EntityDescription mappings.
// The EntityDescription appear either as visible text in anchors or as alternative
// text in image tags.
2008-01-22 12:51:43 +01:00
private Map < yacyURL , String > hyperlinks , audiolinks , videolinks , applinks ;
2007-12-27 23:37:02 +01:00
private Map < String , String > emaillinks ;
2007-09-05 11:01:35 +02:00
private yacyURL favicon ;
2006-12-01 17:21:17 +01:00
private boolean resorted ;
2007-05-19 01:13:44 +02:00
private InputStream textStream ;
2008-03-26 20:51:05 +01:00
private int inboundLinks , outboundLinks ; // counters for inbound and outbound links, are counted after calling notifyWebStructure
2008-09-20 00:19:11 +02:00
private Set < String > languages ;
2007-05-19 01:13:44 +02:00
2008-09-20 00:19:11 +02:00
protected plasmaParserDocument ( final yacyURL location , final String mimeType , final String charset , final Set < String > languages ,
2008-08-02 14:12:04 +02:00
final String [ ] keywords , final String title , final String author ,
final String [ ] sections , final String abstrct ,
final Object text , final Map < yacyURL , String > anchors , final HashMap < String , htmlFilterImageEntry > images ) {
2008-01-22 12:51:43 +01:00
this . source = location ;
2007-05-19 01:13:44 +02:00
this . mimeType = ( mimeType = = null ) ? " application/octet-stream " : mimeType ;
2006-09-15 15:18:12 +02:00
this . charset = charset ;
2007-12-27 23:37:02 +01:00
this . keywords = ( keywords = = null ) ? new LinkedList < String > ( ) : Arrays . asList ( keywords ) ;
2008-12-04 13:54:16 +01:00
this . title = ( title = = null ) ? new StringBuilder ( 0 ) : new StringBuilder ( title ) ;
this . creator = ( author = = null ) ? new StringBuilder ( 0 ) : new StringBuilder ( author ) ;
2007-12-27 23:37:02 +01:00
this . sections = ( sections = = null ) ? new LinkedList < String > ( ) : Arrays . asList ( sections ) ;
2008-12-04 13:54:16 +01:00
this . description = ( abstrct = = null ) ? new StringBuilder ( 0 ) : new StringBuilder ( abstrct ) ;
2008-01-22 12:51:43 +01:00
this . anchors = ( anchors = = null ) ? new HashMap < yacyURL , String > ( 0 ) : anchors ;
2008-02-25 15:08:15 +01:00
this . images = ( images = = null ) ? new HashMap < String , htmlFilterImageEntry > ( ) : images ;
2005-04-24 23:24:53 +02:00
this . hyperlinks = null ;
2006-11-28 16:00:15 +01:00
this . audiolinks = null ;
this . videolinks = null ;
this . applinks = null ;
2005-04-24 23:24:53 +02:00
this . emaillinks = null ;
2006-04-04 16:36:01 +02:00
this . resorted = false ;
2008-03-26 20:51:05 +01:00
this . inboundLinks = - 1 ;
this . outboundLinks = - 1 ;
2008-09-20 00:19:11 +02:00
this . languages = languages ;
2007-05-19 01:13:44 +02:00
if ( text = = null ) try {
this . text = new serverCachedFileOutputStream ( Parser . MAX_KEEP_IN_MEMORY_SIZE ) ;
2008-08-02 14:12:04 +02:00
} catch ( final IOException e ) {
2007-05-19 01:13:44 +02:00
e . printStackTrace ( ) ;
2008-12-04 13:54:16 +01:00
this . text = new StringBuilder ( ) ;
2007-05-19 01:13:44 +02:00
} else {
this . text = text ;
}
}
2008-09-20 00:19:11 +02:00
public plasmaParserDocument ( final yacyURL location , final String mimeType , final String charset , final Set < String > languages ) {
this ( location , mimeType , charset , languages , null , null , null , null , null , ( Object ) null , null , null ) ;
2007-05-19 01:13:44 +02:00
}
2008-09-20 00:19:11 +02:00
public plasmaParserDocument ( final yacyURL location , final String mimeType , final String charset , final Set < String > languages ,
2008-08-02 14:12:04 +02:00
final String [ ] keywords , final String title , final String author ,
final String [ ] sections , final String abstrct ,
final byte [ ] text , final Map < yacyURL , String > anchors , final HashMap < String , htmlFilterImageEntry > images ) {
2008-09-20 00:19:11 +02:00
this ( location , mimeType , charset , languages , keywords , title , author , sections , abstrct , ( Object ) text , anchors , images ) ;
2005-04-24 23:24:53 +02:00
}
2006-09-30 11:31:53 +02:00
2008-09-20 00:19:11 +02:00
public plasmaParserDocument ( final yacyURL location , final String mimeType , final String charset , final Set < String > languages ,
2008-08-02 14:12:04 +02:00
final String [ ] keywords , final String title , final String author ,
final String [ ] sections , final String abstrct ,
final File text , final Map < yacyURL , String > anchors , final HashMap < String , htmlFilterImageEntry > images ) {
2008-09-20 00:19:11 +02:00
this ( location , mimeType , charset , languages , keywords , title , author , sections , abstrct , ( Object ) text , anchors , images ) ;
2007-05-19 01:13:44 +02:00
}
2008-09-20 00:19:11 +02:00
public plasmaParserDocument ( final yacyURL location , final String mimeType , final String charset , final Set < String > languages ,
2008-08-02 14:12:04 +02:00
final String [ ] keywords , final String title , final String author ,
final String [ ] sections , final String abstrct ,
final serverCachedFileOutputStream text , final Map < yacyURL , String > anchors , final HashMap < String , htmlFilterImageEntry > images ) {
2008-09-20 00:19:11 +02:00
this ( location , mimeType , charset , languages , keywords , title , author , sections , abstrct , ( Object ) text , anchors , images ) ;
2007-05-19 01:13:44 +02:00
}
2006-04-04 16:36:01 +02:00
2008-09-20 00:19:11 +02:00
/ * *
* compute a set of languages that this document contains
* the language is not computed using a statistical analysis of the content , only from given metadata that came with the document
* if there are several languages defined in the document , the TLD is taken to check which one should be picked
* If there is no metadata at all , null is returned
* @return a string with a language name using the alpha - 2 code of ISO 639
* /
2009-04-17 16:20:12 +02:00
public String dc_language ( ) {
2008-09-20 00:19:11 +02:00
if ( this . languages = = null ) return null ;
if ( this . languages . size ( ) = = 0 ) return null ;
if ( this . languages . size ( ) = = 1 ) return languages . iterator ( ) . next ( ) ;
if ( this . languages . contains ( this . source . language ( ) ) ) return this . source . language ( ) ;
// now we are confused: the declared languages differ all from the TLD
// just pick one of the languages that we have
return languages . iterator ( ) . next ( ) ;
}
2008-01-22 12:51:43 +01:00
/ *
DC according to rfc 5013
* dc_title
* dc_creator
* dc_subject
* dc_description
* dc_publisher
dc_contributor
dc_date
dc_type
* dc_format
* dc_identifier
* dc_source
dc_language
dc_relation
dc_coverage
dc_rights
* /
public String dc_title ( ) {
return title . toString ( ) ;
}
2009-05-08 09:54:10 +02:00
public void setTitle ( String title ) {
this . title = new StringBuilder ( title ) ;
}
2008-01-22 12:51:43 +01:00
public String dc_creator ( ) {
2008-08-02 15:57:00 +02:00
if ( creator = = null )
2008-08-06 21:43:12 +02:00
return " " ;
2008-08-02 15:57:00 +02:00
return creator . toString ( ) ;
2006-12-01 17:21:17 +01:00
}
2008-08-02 14:12:04 +02:00
public String dc_subject ( final char separator ) {
2008-01-22 12:51:43 +01:00
// sort out doubles and empty words
2008-08-02 14:12:04 +02:00
final TreeSet < String > hs = new TreeSet < String > ( ) ;
2008-01-22 12:51:43 +01:00
String s ;
for ( int i = 0 ; i < this . keywords . size ( ) ; i + + ) {
if ( this . keywords . get ( i ) = = null ) continue ;
2008-06-06 18:01:27 +02:00
s = ( this . keywords . get ( i ) ) . trim ( ) ;
2008-01-22 12:51:43 +01:00
if ( s . length ( ) > 0 ) hs . add ( s . toLowerCase ( ) ) ;
}
if ( hs . size ( ) = = 0 ) return " " ;
// generate a new list
2008-12-04 13:54:16 +01:00
final StringBuilder sb = new StringBuilder ( this . keywords . size ( ) * 6 ) ;
2008-08-02 14:12:04 +02:00
final Iterator < String > i = hs . iterator ( ) ;
2008-01-22 12:51:43 +01:00
while ( i . hasNext ( ) ) sb . append ( i . next ( ) ) . append ( separator ) ;
return sb . substring ( 0 , sb . length ( ) - 1 ) ;
}
public String dc_description ( ) {
2008-08-02 15:57:00 +02:00
if ( description = = null )
return dc_title ( ) ;
return description . toString ( ) ;
2008-01-22 12:51:43 +01:00
}
public String dc_publisher ( ) {
// if we don't have a publisher, simply return the host/domain name
return this . source . getHost ( ) ;
}
public String dc_format ( ) {
2005-09-05 12:34:34 +02:00
return this . mimeType ;
}
2008-01-22 12:51:43 +01:00
public String dc_identifier ( ) {
2009-04-17 16:20:12 +02:00
return this . source . toNormalform ( true , false ) ;
2008-01-22 12:51:43 +01:00
}
public yacyURL dc_source ( ) {
return this . source ;
}
2006-09-15 15:18:12 +02:00
/ * *
* @return the supposed charset of this document or < code > null < / code > if unknown
* /
2006-12-01 17:21:17 +01:00
public String getCharset ( ) {
2006-09-15 15:18:12 +02:00
return this . charset ;
}
2005-04-24 23:24:53 +02:00
public String [ ] getSectionTitles ( ) {
2008-08-02 15:57:00 +02:00
if ( sections = = null ) {
2008-01-22 12:51:43 +01:00
return new String [ ] { dc_title ( ) } ;
2007-05-19 01:13:44 +02:00
}
2008-08-02 15:57:00 +02:00
return sections . toArray ( new String [ this . sections . size ( ) ] ) ;
2005-04-24 23:24:53 +02:00
}
2006-09-30 11:31:53 +02:00
public InputStream getText ( ) {
try {
if ( this . text = = null ) return null ;
2006-09-30 12:09:01 +02:00
if ( this . text instanceof File ) {
this . textStream = new BufferedInputStream ( new FileInputStream ( ( File ) this . text ) ) ;
} else if ( this . text instanceof byte [ ] ) {
this . textStream = new ByteArrayInputStream ( ( byte [ ] ) this . text ) ;
2007-05-19 01:13:44 +02:00
} else if ( this . text instanceof serverCachedFileOutputStream ) {
return ( ( serverCachedFileOutputStream ) this . text ) . getContent ( ) ;
2006-09-30 12:09:01 +02:00
}
return this . textStream ;
2008-08-02 14:12:04 +02:00
} catch ( final Exception e ) {
2006-09-30 11:31:53 +02:00
e . printStackTrace ( ) ;
}
return null ;
}
public byte [ ] getTextBytes ( ) {
try {
if ( this . text = = null ) return new byte [ 0 ] ;
2007-05-19 01:13:44 +02:00
if ( this . text instanceof File ) {
2009-01-31 02:06:56 +01:00
return FileUtils . read ( ( File ) this . text ) ;
2007-05-19 01:13:44 +02:00
} else if ( this . text instanceof byte [ ] ) {
return ( byte [ ] ) this . text ;
} else if ( this . text instanceof serverCachedFileOutputStream ) {
2008-08-02 14:12:04 +02:00
final serverCachedFileOutputStream ffbaos = ( serverCachedFileOutputStream ) this . text ;
2007-05-19 01:13:44 +02:00
if ( ffbaos . isFallback ( ) ) {
2009-01-31 02:06:56 +01:00
return FileUtils . read ( ffbaos . getContent ( ) ) ;
2007-05-19 01:13:44 +02:00
}
2008-08-02 15:57:00 +02:00
return ffbaos . getContentBAOS ( ) ;
2007-05-19 01:13:44 +02:00
}
2008-08-02 14:12:04 +02:00
} catch ( final Exception e ) {
2006-09-30 11:31:53 +02:00
e . printStackTrace ( ) ;
}
return new byte [ 0 ] ;
}
public long getTextLength ( ) {
if ( this . text = = null ) return 0 ;
if ( this . text instanceof File ) return ( ( File ) this . text ) . length ( ) ;
else if ( this . text instanceof byte [ ] ) return ( ( byte [ ] ) this . text ) . length ;
2007-05-19 01:13:44 +02:00
else if ( this . text instanceof serverCachedFileOutputStream ) {
return ( ( serverCachedFileOutputStream ) this . text ) . getLength ( ) ;
}
2006-09-30 11:31:53 +02:00
return - 1 ;
2005-04-24 23:24:53 +02:00
}
2008-12-04 13:54:16 +01:00
public Iterator < StringBuilder > getSentences ( final boolean pre ) {
2006-10-07 02:06:09 +02:00
if ( this . text = = null ) return null ;
2009-04-03 15:23:45 +02:00
final Condenser . sentencesFromInputStreamEnum e = Condenser . sentencesFromInputStream ( getText ( ) ) ;
2006-11-28 16:00:15 +01:00
e . pre ( pre ) ;
return e ;
2005-06-02 03:33:10 +02:00
}
2007-12-27 23:37:02 +01:00
public List < String > getKeywords ( ) {
2007-05-19 01:13:44 +02:00
return this . keywords ;
}
2008-01-22 12:51:43 +01:00
public Map < yacyURL , String > getAnchors ( ) {
2005-04-24 23:24:53 +02:00
// returns all links embedded as anchors (clickeable entities)
2006-12-08 03:14:56 +01:00
// this is a url(String)/text(String) map
2005-04-24 23:24:53 +02:00
return anchors ;
}
// the next three methods provide a calculated view on the getAnchors/getImages:
2008-01-22 12:51:43 +01:00
public Map < yacyURL , String > getHyperlinks ( ) {
2005-04-24 23:24:53 +02:00
// this is a subset of the getAnchor-set: only links to other hyperrefs
2006-04-04 16:36:01 +02:00
if ( ! resorted ) resortLinks ( ) ;
2005-04-24 23:24:53 +02:00
return hyperlinks ;
}
2008-01-22 12:51:43 +01:00
public Map < yacyURL , String > getAudiolinks ( ) {
2006-04-04 16:36:01 +02:00
if ( ! resorted ) resortLinks ( ) ;
2006-11-28 16:00:15 +01:00
return this . audiolinks ;
}
2008-01-22 12:51:43 +01:00
public Map < yacyURL , String > getVideolinks ( ) {
2006-11-28 16:00:15 +01:00
if ( ! resorted ) resortLinks ( ) ;
return this . videolinks ;
}
2008-02-25 15:08:15 +01:00
public HashMap < String , htmlFilterImageEntry > getImages ( ) {
2006-12-01 17:21:17 +01:00
// returns all links enbedded as pictures (visible in document)
// this resturns a htmlFilterImageEntry collection
2006-11-28 16:00:15 +01:00
if ( ! resorted ) resortLinks ( ) ;
2006-12-01 17:21:17 +01:00
return images ;
2006-11-28 16:00:15 +01:00
}
2008-01-22 12:51:43 +01:00
public Map < yacyURL , String > getApplinks ( ) {
2006-11-28 16:00:15 +01:00
if ( ! resorted ) resortLinks ( ) ;
return this . applinks ;
2005-04-24 23:24:53 +02:00
}
2007-12-27 23:37:02 +01:00
public Map < String , String > getEmaillinks ( ) {
2005-04-24 23:24:53 +02:00
// this is part of the getAnchor-set: only links to email addresses
2006-04-04 16:36:01 +02:00
if ( ! resorted ) resortLinks ( ) ;
2005-04-24 23:24:53 +02:00
return emaillinks ;
}
private synchronized void resortLinks ( ) {
2009-02-17 10:12:47 +01:00
if ( this . resorted ) return ;
2006-04-04 16:36:01 +02:00
// extract hyperlinks, medialinks and emaillinks from anchorlinks
2007-09-05 11:01:35 +02:00
yacyURL url ;
2006-11-28 16:00:15 +01:00
String u ;
2005-05-07 23:11:18 +02:00
int extpos , qpos ;
2005-05-03 11:47:56 +02:00
String ext = null ;
2008-08-02 14:12:04 +02:00
final Iterator < Map . Entry < yacyURL , String > > i = anchors . entrySet ( ) . iterator ( ) ;
2008-01-22 12:51:43 +01:00
hyperlinks = new HashMap < yacyURL , String > ( ) ;
videolinks = new HashMap < yacyURL , String > ( ) ;
audiolinks = new HashMap < yacyURL , String > ( ) ;
applinks = new HashMap < yacyURL , String > ( ) ;
2007-12-27 23:37:02 +01:00
emaillinks = new HashMap < String , String > ( ) ;
2008-08-02 14:12:04 +02:00
final HashMap < String , htmlFilterImageEntry > collectedImages = new HashMap < String , htmlFilterImageEntry > ( ) ; // this is a set that is collected now and joined later to the imagelinks
2008-01-22 12:51:43 +01:00
Map . Entry < yacyURL , String > entry ;
2005-04-24 23:24:53 +02:00
while ( i . hasNext ( ) ) {
2008-01-18 18:14:02 +01:00
entry = i . next ( ) ;
2008-01-22 12:51:43 +01:00
url = entry . getKey ( ) ;
if ( url = = null ) continue ;
u = url . toNormalform ( true , false ) ;
2008-06-06 18:01:27 +02:00
if ( u . startsWith ( " mailto: " ) ) {
2008-01-18 18:14:02 +01:00
emaillinks . put ( u . substring ( 7 ) , entry . getValue ( ) ) ;
2005-04-24 23:24:53 +02:00
} else {
2006-11-28 16:00:15 +01:00
extpos = u . lastIndexOf ( " . " ) ;
2005-04-24 23:24:53 +02:00
if ( extpos > 0 ) {
2006-11-28 16:00:15 +01:00
if ( ( ( qpos = u . indexOf ( " ? " ) ) > = 0 ) & & ( qpos > extpos ) ) {
ext = u . substring ( extpos + 1 , qpos ) . toLowerCase ( ) ;
2005-05-03 11:47:56 +02:00
} else {
2006-11-28 16:00:15 +01:00
ext = u . substring ( extpos + 1 ) . toLowerCase ( ) ;
2005-05-03 11:47:56 +02:00
}
2008-01-22 12:51:43 +01:00
if ( plasmaParser . mediaExtContains ( ext ) ) {
// this is not a normal anchor, its a media link
if ( plasmaParser . imageExtContains ( ext ) ) {
2008-06-06 18:01:27 +02:00
htmlFilterContentScraper . addImage ( collectedImages , new htmlFilterImageEntry ( url , entry . getValue ( ) , - 1 , - 1 ) ) ;
2006-04-04 16:36:01 +02:00
}
2008-06-06 18:01:27 +02:00
else if ( plasmaParser . audioExtContains ( ext ) ) audiolinks . put ( url , entry . getValue ( ) ) ;
else if ( plasmaParser . videoExtContains ( ext ) ) videolinks . put ( url , entry . getValue ( ) ) ;
else if ( plasmaParser . appsExtContains ( ext ) ) applinks . put ( url , entry . getValue ( ) ) ;
2008-01-22 12:51:43 +01:00
} else {
2008-06-06 18:01:27 +02:00
hyperlinks . put ( url , entry . getValue ( ) ) ;
2005-04-24 23:24:53 +02:00
}
2008-06-11 11:54:58 +02:00
} else {
// a path to a directory
hyperlinks . put ( url , entry . getValue ( ) ) ;
2005-04-24 23:24:53 +02:00
}
}
}
2006-04-04 16:36:01 +02:00
2006-12-01 17:21:17 +01:00
// add image links that we collected from the anchors to the image map
2008-02-25 15:08:15 +01:00
htmlFilterContentScraper . addAllImages ( images , collectedImages ) ;
2007-12-27 23:37:02 +01:00
2006-12-01 17:21:17 +01:00
// expand the hyperlinks:
// we add artificial hyperlinks to the hyperlink set
// that can be calculated from given hyperlinks and imagelinks
2007-12-27 23:37:02 +01:00
2008-02-25 15:08:15 +01:00
hyperlinks . putAll ( plasmaParser . allReflinks ( images . values ( ) ) ) ;
2006-12-01 17:21:17 +01:00
hyperlinks . putAll ( plasmaParser . allReflinks ( audiolinks . keySet ( ) ) ) ;
hyperlinks . putAll ( plasmaParser . allReflinks ( videolinks . keySet ( ) ) ) ;
hyperlinks . putAll ( plasmaParser . allReflinks ( applinks . keySet ( ) ) ) ;
2009-05-01 09:30:53 +02:00
/ *
2006-12-01 17:21:17 +01:00
hyperlinks . putAll ( plasmaParser . allSubpaths ( hyperlinks . keySet ( ) ) ) ;
2008-02-25 15:08:15 +01:00
hyperlinks . putAll ( plasmaParser . allSubpaths ( images . values ( ) ) ) ;
2006-12-01 17:21:17 +01:00
hyperlinks . putAll ( plasmaParser . allSubpaths ( audiolinks . keySet ( ) ) ) ;
hyperlinks . putAll ( plasmaParser . allSubpaths ( videolinks . keySet ( ) ) ) ;
hyperlinks . putAll ( plasmaParser . allSubpaths ( applinks . keySet ( ) ) ) ;
2009-05-01 09:30:53 +02:00
* /
2006-04-04 16:36:01 +02:00
// don't do this again
this . resorted = true ;
2005-04-24 23:24:53 +02:00
}
2008-08-02 14:12:04 +02:00
public void addSubDocument ( final plasmaParserDocument doc ) throws IOException {
2007-05-19 01:13:44 +02:00
this . sections . addAll ( Arrays . asList ( doc . getSectionTitles ( ) ) ) ;
if ( this . title . length ( ) > 0 ) this . title . append ( '\n' ) ;
2008-01-22 12:51:43 +01:00
this . title . append ( doc . dc_title ( ) ) ;
2007-05-19 01:13:44 +02:00
this . keywords . addAll ( doc . getKeywords ( ) ) ;
2008-01-22 12:51:43 +01:00
if ( this . description . length ( ) > 0 ) this . description . append ( '\n' ) ;
this . description . append ( doc . dc_description ( ) ) ;
2007-05-19 01:13:44 +02:00
if ( ! ( this . text instanceof serverCachedFileOutputStream ) ) {
this . text = new serverCachedFileOutputStream ( Parser . MAX_KEEP_IN_MEMORY_SIZE ) ;
2009-01-31 02:06:56 +01:00
FileUtils . copy ( getText ( ) , ( serverCachedFileOutputStream ) this . text ) ;
2007-05-19 01:13:44 +02:00
}
2009-01-31 02:06:56 +01:00
FileUtils . copy ( doc . getText ( ) , ( serverCachedFileOutputStream ) this . text ) ;
2007-05-19 01:13:44 +02:00
anchors . putAll ( doc . getAnchors ( ) ) ;
2008-02-25 15:08:15 +01:00
htmlFilterContentScraper . addAllImages ( images , doc . getImages ( ) ) ;
2007-05-19 01:13:44 +02:00
}
2007-06-09 17:22:37 +02:00
/ * *
* @return the { @link URL } to the favicon that belongs to the document
* /
2007-09-05 11:01:35 +02:00
public yacyURL getFavicon ( ) {
2007-06-09 17:22:37 +02:00
return this . favicon ;
}
/ * *
* @param faviconURL the { @link URL } to the favicon that belongs to the document
* /
2008-08-02 14:12:04 +02:00
public void setFavicon ( final yacyURL faviconURL ) {
2007-06-09 17:22:37 +02:00
this . favicon = faviconURL ;
}
2009-04-03 15:23:45 +02:00
public void notifyWebStructure ( final plasmaWebStructure webStructure , final Condenser condenser , final Date docDate ) {
2008-08-02 14:12:04 +02:00
final Integer [ ] ioLinks = webStructure . generateCitationReference ( this , condenser , docDate ) ; // [outlinksSame, outlinksOther]
2008-03-26 20:51:05 +01:00
this . inboundLinks = ioLinks [ 0 ] . intValue ( ) ;
this . outboundLinks = ioLinks [ 1 ] . intValue ( ) ;
}
public int inboundLinks ( ) {
assert this . inboundLinks > = 0 ;
return ( this . inboundLinks < 0 ) ? 0 : this . inboundLinks ;
}
public int outboundLinks ( ) {
assert this . outboundLinks > = 0 ;
return ( this . outboundLinks < 0 ) ? 0 : this . outboundLinks ;
}
2009-04-17 16:20:12 +02:00
public void writeXML ( OutputStreamWriter os , Date date ) throws IOException {
os . write ( " <record> \ n " ) ;
os . write ( " <dc:Title><![CDATA[ " + this . dc_title ( ) + " ]]></dc:Title> \ n " ) ;
os . write ( " <dc:Identifier> " + this . dc_identifier ( ) + " </dc:Identifier> \ n " ) ;
os . write ( " <dc:Description><![CDATA[ " ) ;
byte [ ] buffer = new byte [ 1000 ] ;
int c = 0 ;
InputStream is = this . getText ( ) ;
while ( ( c = is . read ( buffer ) ) > 0 ) os . write ( new String ( buffer , 0 , c ) ) ;
is . close ( ) ;
os . write ( " ]]></dc:Description> \ n " ) ;
os . write ( " <dc:Language> " + this . dc_language ( ) + " </dc:Language> \ n " ) ;
os . write ( " <dc:Date> " + DateFormatter . formatISO8601 ( date ) + " </dc:Date> \ n " ) ;
os . write ( " </record> \ n " ) ;
}
2006-09-30 11:31:53 +02:00
public void close ( ) {
2006-09-30 12:09:01 +02:00
// try close the output stream
if ( this . textStream ! = null ) {
try {
this . textStream . close ( ) ;
2008-08-02 14:12:04 +02:00
} catch ( final Exception e ) {
2006-09-30 12:09:01 +02:00
/* ignore this */
} finally {
this . textStream = null ;
}
}
2006-09-30 11:31:53 +02:00
// delete the temp file
if ( ( this . text ! = null ) & & ( this . text instanceof File ) ) {
2006-09-30 12:09:01 +02:00
try {
2009-03-30 17:31:25 +02:00
FileUtils . deletedelete ( ( File ) this . text ) ;
2008-08-02 14:12:04 +02:00
} catch ( final Exception e ) {
2006-09-30 12:09:01 +02:00
/* ignore this */
} finally {
this . text = null ;
}
2006-09-30 11:31:53 +02:00
}
}
protected void finalize ( ) throws Throwable {
this . close ( ) ;
super . finalize ( ) ;
}
2005-04-29 00:04:57 +02:00
}