2011-04-14 22:12:14 +02:00
/ * *
2013-02-21 13:23:55 +01:00
* CollectionConfiguration
2011-04-14 22:12:14 +02:00
* Copyright 2011 by Michael Peter Christen
* First released 14 . 04 . 2011 at http : //yacy.net
*
* $LastChangedDate : 2011 - 04 - 14 22 : 05 : 04 + 0200 ( Do , 14 Apr 2011 ) $
* $LastChangedRevision : 7654 $
* $LastChangedBy : orbiter $
*
* This library is free software ; you can redistribute it and / or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation ; either
* version 2 . 1 of the License , or ( at your option ) any later version .
2011-06-30 17:49:21 +02:00
*
2011-04-14 22:12:14 +02:00
* This library is distributed in the hope that it will be useful ,
* but WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the GNU
* Lesser General Public License for more details .
2011-06-30 17:49:21 +02:00
*
2011-04-14 22:12:14 +02:00
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21 . txt
* If not , see < http : //www.gnu.org/licenses/>.
* /
2013-02-21 13:23:55 +01:00
package net.yacy.search.schema ;
2011-04-14 22:05:04 +02:00
2011-06-30 17:49:21 +02:00
import java.io.File ;
2012-05-14 14:56:21 +02:00
import java.io.IOException ;
2012-05-15 23:10:47 +02:00
import java.io.Serializable ;
2011-04-21 15:58:49 +02:00
import java.net.InetAddress ;
2011-09-13 16:39:41 +02:00
import java.net.MalformedURLException ;
2012-05-15 23:10:47 +02:00
import java.util.ArrayList ;
import java.util.Collection ;
import java.util.Date ;
2012-09-07 22:06:51 +02:00
import java.util.HashMap ;
2013-02-15 01:38:10 +01:00
import java.util.HashSet ;
2012-05-15 23:10:47 +02:00
import java.util.Iterator ;
2013-01-02 20:55:43 +01:00
import java.util.LinkedHashSet ;
2012-05-15 23:10:47 +02:00
import java.util.List ;
import java.util.Map ;
import java.util.Properties ;
import java.util.Set ;
2011-04-21 15:58:49 +02:00
2012-05-14 14:56:21 +02:00
import net.yacy.cora.document.ASCII ;
2011-06-30 17:49:21 +02:00
import net.yacy.cora.document.MultiProtocolURI ;
2012-08-05 15:49:27 +02:00
import net.yacy.cora.document.UTF8 ;
2013-02-21 13:23:55 +01:00
import net.yacy.cora.federate.solr.SchemaConfiguration ;
2012-11-23 14:00:30 +01:00
import net.yacy.cora.federate.solr.FailType ;
2013-01-02 20:55:43 +01:00
import net.yacy.cora.federate.solr.ProcessType ;
2013-02-21 13:23:55 +01:00
import net.yacy.cora.federate.solr.SchemaDeclaration ;
2012-08-28 16:58:06 +02:00
import net.yacy.cora.protocol.Domains ;
2011-05-27 14:35:08 +02:00
import net.yacy.cora.protocol.HeaderFramework ;
2011-04-21 15:58:49 +02:00
import net.yacy.cora.protocol.ResponseHeader ;
2012-11-26 13:11:55 +01:00
import net.yacy.cora.util.CommonPattern ;
2013-01-02 20:55:43 +01:00
import net.yacy.cora.util.SpaceExceededException ;
2012-09-21 15:48:16 +02:00
import net.yacy.crawler.data.CrawlProfile ;
import net.yacy.crawler.retrieval.Response ;
2012-08-05 15:49:27 +02:00
import net.yacy.document.Condenser ;
2011-04-14 22:05:04 +02:00
import net.yacy.document.Document ;
2011-04-21 15:58:49 +02:00
import net.yacy.document.parser.html.ContentScraper ;
import net.yacy.document.parser.html.ImageEntry ;
2013-01-02 20:55:43 +01:00
import net.yacy.kelondro.data.citation.CitationReference ;
2011-04-14 22:05:04 +02:00
import net.yacy.kelondro.data.meta.DigestURI ;
2012-09-11 20:15:54 +02:00
import net.yacy.kelondro.data.meta.URIMetadataRow ;
2013-01-02 20:55:43 +01:00
import net.yacy.kelondro.index.RowHandleSet ;
2012-01-13 11:25:15 +01:00
import net.yacy.kelondro.logging.Log ;
2013-01-02 20:55:43 +01:00
import net.yacy.kelondro.rwi.IndexCell ;
import net.yacy.kelondro.rwi.ReferenceContainer ;
2012-09-21 16:46:57 +02:00
import net.yacy.kelondro.util.Bitfield ;
2013-01-02 20:55:43 +01:00
import net.yacy.kelondro.util.ByteBuffer ;
2011-06-30 17:49:21 +02:00
2013-02-15 01:38:10 +01:00
import org.apache.solr.common.SolrDocument ;
2012-08-18 13:05:27 +02:00
import org.apache.solr.common.SolrInputDocument ;
2011-04-14 22:05:04 +02:00
2012-08-05 15:49:27 +02:00
2013-02-21 13:23:55 +01:00
public class CollectionConfiguration extends SchemaConfiguration implements Serializable {
2012-05-15 23:10:47 +02:00
// Version identifier for Java serialization; this class implements Serializable.
private static final long serialVersionUID = - 499100932212840385L ;
2011-06-30 17:49:21 +02:00
2012-06-27 12:17:58 +02:00
2011-06-30 17:49:21 +02:00
/**
 * Create a configuration that holds no entries.
 * The document builders of this class treat an empty configuration as
 * "write every index attribute" (they test {@code this.isEmpty()}).
 */
public CollectionConfiguration() {
    super();
}
2012-12-02 16:54:29 +01:00
2011-06-30 17:49:21 +02:00
/**
 * Initialize the schema with a given configuration file.
 * The configuration file simply contains a list of lines with keywords
 * or keyword = value lines (where value is a custom Solr field name).
 * Every entry is checked against the {@link CollectionSchema} enum: known entries
 * push their value into the enum constant as Solr field name, unknown entries are
 * logged and removed. Afterwards every enum constant that has no entry in the
 * configuration is reported with a warning.
 *
 * @param configurationFile the schema configuration file handed to the superclass
 * @param lazy value assigned to the inherited {@code lazy} flag
 */
public CollectionConfiguration(final File configurationFile, boolean lazy) {
    super(configurationFile);
    super.lazy = lazy;
    // an empty configuration needs no consistency check
    if (this.isEmpty()) return;

    // check consistency: compare the configured keys with the CollectionSchema enum.
    // hasNext() is tested before next() so that the last entry is validated as well
    // and it.remove() always refers to the entry that was just examined.
    final Iterator<Entry> it = this.entryIterator();
    while (it.hasNext()) {
        final SchemaConfiguration.Entry etr = it.next();
        try {
            CollectionSchema f = CollectionSchema.valueOf(etr.key());
            f.setSolrFieldName(etr.getValue());
        } catch (IllegalArgumentException e) {
            // the key does not name an enum constant: report it and drop the entry
            Log.logFine(" SolrCollectionWriter ", " solr schema file " + configurationFile.getAbsolutePath() + " defines unknown attribute ' " + etr.toString() + " ' ");
            it.remove();
        }
    }

    // check consistency the other way: look if all enum constants appear in the configuration file
    for (CollectionSchema field : CollectionSchema.values()) {
        if (this.get(field.name()) == null) {
            Log.logWarning(" SolrCollectionWriter ", " solr schema file " + configurationFile.getAbsolutePath() + " is missing declaration for ' " + field.name() + " ' ");
        }
    }
}
2012-08-27 14:41:33 +02:00
2012-05-15 22:34:02 +02:00
/ * *
* save configuration to file and update enum SolrFields
* @throws IOException
* /
public void commit ( ) throws IOException {
try {
super . commit ( ) ;
// make sure the enum SolrField.SolrFieldName is current
Iterator < Entry > it = this . entryIterator ( ) ;
2013-02-21 13:23:55 +01:00
for ( SchemaConfiguration . Entry etr = it . next ( ) ; it . hasNext ( ) ; etr = it . next ( ) ) {
2012-05-15 22:34:02 +02:00
try {
2013-02-21 13:23:55 +01:00
SchemaDeclaration f = CollectionSchema . valueOf ( etr . key ( ) ) ;
2012-05-15 22:34:02 +02:00
f . setSolrFieldName ( etr . getValue ( ) ) ;
} catch ( IllegalArgumentException e ) {
continue ;
}
}
} catch ( final IOException e ) { }
}
2012-08-10 13:26:51 +02:00
2013-02-21 13:23:55 +01:00
/ * *
* Convert a SolrDocument to a SolrInputDocument .
* This is useful if a document from the search index shall be modified and indexed again .
* This shall be used as replacement of ClientUtils . toSolrInputDocument because we remove some fields
* which are created automatically during the indexing process .
* @param doc the solr document
* @return a solr input document
* /
public SolrInputDocument toSolrInputDocument ( SolrDocument doc ) {
SolrInputDocument sid = new SolrInputDocument ( ) ;
Set < String > omitFields = new HashSet < String > ( ) ;
omitFields . add ( CollectionSchema . coordinate_p . getSolrFieldName ( ) + " _0_coordinate " ) ;
omitFields . add ( CollectionSchema . coordinate_p . getSolrFieldName ( ) + " _1_coordinate " ) ;
omitFields . add ( CollectionSchema . author_sxt . getSolrFieldName ( ) ) ;
for ( String name : doc . getFieldNames ( ) ) {
if ( this . contains ( name ) & & ! omitFields . contains ( name ) ) sid . addField ( name , doc . getFieldValue ( name ) , 1 . 0f ) ;
}
return sid ;
}
public SolrInputDocument metadata2solr ( final URIMetadataRow md ) {
2012-08-27 14:41:33 +02:00
2012-08-23 09:51:45 +02:00
final SolrInputDocument doc = new SolrInputDocument ( ) ;
2012-10-15 13:17:13 +02:00
final DigestURI digestURI = DigestURI . toDigestURI ( md . url ( ) ) ;
2012-08-05 15:49:27 +02:00
boolean allAttr = this . isEmpty ( ) ;
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . failreason_t ) ) add ( doc , CollectionSchema . failreason_t , " " ) ;
add ( doc , CollectionSchema . id , ASCII . String ( md . hash ( ) ) ) ;
2012-10-10 11:46:22 +02:00
String us = digestURI . toNormalform ( true ) ;
2013-02-21 13:23:55 +01:00
add ( doc , CollectionSchema . sku , us ) ;
if ( allAttr | | contains ( CollectionSchema . ip_s ) ) {
2012-08-05 15:49:27 +02:00
final InetAddress address = digestURI . getInetAddress ( ) ;
2013-02-21 13:23:55 +01:00
if ( address ! = null ) add ( doc , CollectionSchema . ip_s , address . getHostAddress ( ) ) ;
2012-08-05 15:49:27 +02:00
}
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . url_protocol_s ) ) add ( doc , CollectionSchema . url_protocol_s , digestURI . getProtocol ( ) ) ;
2012-08-29 16:11:23 +02:00
Map < String , String > searchpart = digestURI . getSearchpartMap ( ) ;
if ( searchpart = = null ) {
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . url_parameter_i ) ) add ( doc , CollectionSchema . url_parameter_i , 0 ) ;
2012-08-29 16:11:23 +02:00
} else {
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . url_parameter_i ) ) add ( doc , CollectionSchema . url_parameter_i , searchpart . size ( ) ) ;
if ( allAttr | | contains ( CollectionSchema . url_parameter_key_sxt ) ) add ( doc , CollectionSchema . url_parameter_key_sxt , searchpart . keySet ( ) . toArray ( new String [ searchpart . size ( ) ] ) ) ;
if ( allAttr | | contains ( CollectionSchema . url_parameter_value_sxt ) ) add ( doc , CollectionSchema . url_parameter_value_sxt , searchpart . values ( ) . toArray ( new String [ searchpart . size ( ) ] ) ) ;
2012-08-29 16:11:23 +02:00
}
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . url_chars_i ) ) add ( doc , CollectionSchema . url_chars_i , us . length ( ) ) ;
2012-08-28 16:58:06 +02:00
String host = null ;
if ( ( host = digestURI . getHost ( ) ) ! = null ) {
String dnc = Domains . getDNC ( host ) ;
2012-12-23 01:30:52 +01:00
String subdomOrga = host . length ( ) - dnc . length ( ) < = 0 ? " " : host . substring ( 0 , host . length ( ) - dnc . length ( ) - 1 ) ;
2012-08-28 16:58:06 +02:00
int p = subdomOrga . lastIndexOf ( '.' ) ;
String subdom = ( p < 0 ) ? " " : subdomOrga . substring ( 0 , p ) ;
String orga = ( p < 0 ) ? subdomOrga : subdomOrga . substring ( p + 1 ) ;
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . host_s ) ) add ( doc , CollectionSchema . host_s , host ) ;
if ( allAttr | | contains ( CollectionSchema . host_dnc_s ) ) add ( doc , CollectionSchema . host_dnc_s , dnc ) ;
if ( allAttr | | contains ( CollectionSchema . host_organization_s ) ) add ( doc , CollectionSchema . host_organization_s , orga ) ;
if ( allAttr | | contains ( CollectionSchema . host_organizationdnc_s ) ) add ( doc , CollectionSchema . host_organizationdnc_s , orga + '.' + dnc ) ;
if ( allAttr | | contains ( CollectionSchema . host_subdomain_s ) ) add ( doc , CollectionSchema . host_subdomain_s , subdom ) ;
2012-08-28 16:58:06 +02:00
}
2012-08-31 10:30:43 +02:00
String title = md . dc_title ( ) ;
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . title ) ) add ( doc , CollectionSchema . title , new String [ ] { title } ) ;
if ( allAttr | | contains ( CollectionSchema . title_count_i ) ) add ( doc , CollectionSchema . title_count_i , 1 ) ;
if ( allAttr | | contains ( CollectionSchema . title_chars_val ) ) {
2012-08-31 10:30:43 +02:00
Integer [ ] cv = new Integer [ ] { new Integer ( title . length ( ) ) } ;
2013-02-21 13:23:55 +01:00
add ( doc , CollectionSchema . title_chars_val , cv ) ;
2012-08-31 10:30:43 +02:00
}
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . title_words_val ) ) {
2012-11-26 13:11:55 +01:00
Integer [ ] cv = new Integer [ ] { new Integer ( CommonPattern . SPACE . split ( title ) . length ) } ;
2013-02-21 13:23:55 +01:00
add ( doc , CollectionSchema . title_words_val , cv ) ;
2012-08-31 10:30:43 +02:00
}
String description = md . snippet ( ) ; if ( description = = null ) description = " " ;
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . description ) ) add ( doc , CollectionSchema . description , description ) ;
if ( allAttr | | contains ( CollectionSchema . description_count_i ) ) add ( doc , CollectionSchema . description_count_i , 1 ) ;
if ( allAttr | | contains ( CollectionSchema . description_chars_val ) ) {
2012-08-31 10:30:43 +02:00
Integer [ ] cv = new Integer [ ] { new Integer ( description . length ( ) ) } ;
2013-02-21 13:23:55 +01:00
add ( doc , CollectionSchema . description_chars_val , cv ) ;
2012-08-31 10:30:43 +02:00
}
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . description_words_val ) ) {
2012-11-26 13:11:55 +01:00
Integer [ ] cv = new Integer [ ] { new Integer ( CommonPattern . SPACE . split ( description ) . length ) } ;
2013-02-21 13:23:55 +01:00
add ( doc , CollectionSchema . description_words_val , cv ) ;
2012-08-31 10:30:43 +02:00
}
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . author ) ) add ( doc , CollectionSchema . author , md . dc_creator ( ) ) ;
if ( allAttr | | contains ( CollectionSchema . content_type ) ) add ( doc , CollectionSchema . content_type , Response . doctype2mime ( digestURI . getFileExtension ( ) , md . doctype ( ) ) ) ;
if ( allAttr | | contains ( CollectionSchema . last_modified ) ) add ( doc , CollectionSchema . last_modified , md . moddate ( ) ) ;
if ( allAttr | | contains ( CollectionSchema . wordcount_i ) ) add ( doc , CollectionSchema . wordcount_i , md . wordCount ( ) ) ;
2012-08-27 14:41:33 +02:00
2012-08-18 19:36:21 +02:00
String keywords = md . dc_subject ( ) ;
Bitfield flags = md . flags ( ) ;
if ( flags . get ( Condenser . flag_cat_indexof ) ) {
if ( keywords = = null | | keywords . isEmpty ( ) ) keywords = " indexof " ; else {
if ( keywords . indexOf ( ',' ) > 0 ) keywords + = " , indexof " ; else keywords + = " indexof " ;
}
}
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . keywords ) ) {
add ( doc , CollectionSchema . keywords , keywords ) ;
2012-08-05 15:49:27 +02:00
}
2012-08-10 13:26:51 +02:00
2012-08-05 15:49:27 +02:00
// path elements of link
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . url_paths_sxt ) ) add ( doc , CollectionSchema . url_paths_sxt , digestURI . getPaths ( ) ) ;
if ( allAttr | | contains ( CollectionSchema . url_file_ext_s ) ) add ( doc , CollectionSchema . url_file_ext_s , digestURI . getFileExtension ( ) ) ;
2012-08-05 15:49:27 +02:00
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . imagescount_i ) ) add ( doc , CollectionSchema . imagescount_i , md . limage ( ) ) ;
if ( allAttr | | contains ( CollectionSchema . inboundlinkscount_i ) ) add ( doc , CollectionSchema . inboundlinkscount_i , md . llocal ( ) ) ;
if ( allAttr | | contains ( CollectionSchema . outboundlinkscount_i ) ) add ( doc , CollectionSchema . outboundlinkscount_i , md . lother ( ) ) ;
if ( allAttr | | contains ( CollectionSchema . charset_s ) ) add ( doc , CollectionSchema . charset_s , " UTF8 " ) ;
2012-08-05 15:49:27 +02:00
// coordinates
2013-01-14 03:06:24 +01:00
if ( md . lat ( ) ! = 0 . 0 & & md . lon ( ) ! = 0 . 0 ) {
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . coordinate_p ) ) add ( doc , CollectionSchema . coordinate_p , Double . toString ( md . lat ( ) ) + " , " + Double . toString ( md . lon ( ) ) ) ;
2012-08-05 15:49:27 +02:00
}
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . httpstatus_i ) ) add ( doc , CollectionSchema . httpstatus_i , 200 ) ;
2012-08-05 15:49:27 +02:00
// fields that are in URIMetadataRow additional to yacy2solr basic requirement
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . load_date_dt ) ) add ( doc , CollectionSchema . load_date_dt , md . loaddate ( ) ) ;
if ( allAttr | | contains ( CollectionSchema . fresh_date_dt ) ) add ( doc , CollectionSchema . fresh_date_dt , md . freshdate ( ) ) ;
if ( allAttr | | contains ( CollectionSchema . host_id_s ) ) add ( doc , CollectionSchema . host_id_s , md . hosthash ( ) ) ;
if ( ( allAttr | | contains ( CollectionSchema . referrer_id_txt ) ) & & md . referrerHash ( ) ! = null ) add ( doc , CollectionSchema . referrer_id_txt , new String [ ] { ASCII . String ( md . referrerHash ( ) ) } ) ;
if ( allAttr | | contains ( CollectionSchema . md5_s ) ) add ( doc , CollectionSchema . md5_s , md . md5 ( ) ) ;
if ( allAttr | | contains ( CollectionSchema . publisher_t ) ) add ( doc , CollectionSchema . publisher_t , md . dc_publisher ( ) ) ;
if ( ( allAttr | | contains ( CollectionSchema . language_s ) ) & & md . language ( ) ! = null ) add ( doc , CollectionSchema . language_s , UTF8 . String ( md . language ( ) ) ) ;
if ( allAttr | | contains ( CollectionSchema . size_i ) ) add ( doc , CollectionSchema . size_i , md . size ( ) ) ;
if ( allAttr | | contains ( CollectionSchema . audiolinkscount_i ) ) add ( doc , CollectionSchema . audiolinkscount_i , md . laudio ( ) ) ;
if ( allAttr | | contains ( CollectionSchema . videolinkscount_i ) ) add ( doc , CollectionSchema . videolinkscount_i , md . lvideo ( ) ) ;
if ( allAttr | | contains ( CollectionSchema . applinkscount_i ) ) add ( doc , CollectionSchema . applinkscount_i , md . lapp ( ) ) ;
if ( allAttr | | contains ( CollectionSchema . text_t ) ) {
2012-08-18 19:36:21 +02:00
// construct the text from other metadata parts.
// This is necessary here since that is used to search the link when no other data (parsed text body) is available
StringBuilder sb = new StringBuilder ( 120 ) ;
accText ( sb , md . dc_title ( ) ) ;
accText ( sb , md . dc_creator ( ) ) ;
accText ( sb , md . dc_publisher ( ) ) ;
accText ( sb , md . snippet ( ) ) ;
accText ( sb , digestURI . toTokens ( ) ) ;
accText ( sb , keywords ) ;
2013-02-21 13:23:55 +01:00
add ( doc , CollectionSchema . text_t , sb . toString ( ) ) ;
2012-08-18 19:36:21 +02:00
}
2012-08-27 14:41:33 +02:00
2012-08-23 09:51:45 +02:00
return doc ;
2012-08-05 15:49:27 +02:00
}
2012-08-27 14:41:33 +02:00
2012-08-18 19:36:21 +02:00
private static void accText ( final StringBuilder sb , String text ) {
if ( text = = null | | text . length ( ) = = 0 ) return ;
if ( sb . length ( ) ! = 0 ) sb . append ( ' ' ) ;
text = text . trim ( ) ;
2012-08-21 21:03:26 +02:00
if ( ! text . isEmpty ( ) & & text . charAt ( text . length ( ) - 1 ) = = '.' ) sb . append ( text ) ; else sb . append ( text ) . append ( '.' ) ;
2012-08-18 19:36:21 +02:00
}
2012-12-18 17:20:42 +01:00
2013-02-21 13:23:55 +01:00
public SolrInputDocument yacy2solr (
2013-01-02 20:55:43 +01:00
final String id , final CrawlProfile profile , final ResponseHeader responseHeader ,
final Document document , Condenser condenser , DigestURI referrerURL , String language ,
IndexCell < CitationReference > citations ) {
2013-02-15 01:38:10 +01:00
// we use the SolrCell design as index schema
2012-08-23 09:51:45 +02:00
final SolrInputDocument doc = new SolrInputDocument ( ) ;
2012-10-18 14:29:11 +02:00
final DigestURI digestURI = DigestURI . toDigestURI ( document . dc_source ( ) ) ;
2012-08-05 15:49:27 +02:00
boolean allAttr = this . isEmpty ( ) ;
2013-01-02 20:55:43 +01:00
Set < ProcessType > processTypes = new LinkedHashSet < ProcessType > ( ) ;
2013-02-21 13:23:55 +01:00
add ( doc , CollectionSchema . id , id ) ;
if ( allAttr | | contains ( CollectionSchema . failreason_t ) ) add ( doc , CollectionSchema . failreason_t , " " ) ; // overwrite a possible fail reason (in case that there was a fail reason before)
2012-12-18 17:20:42 +01:00
String docurl = digestURI . toNormalform ( true ) ;
2013-02-21 13:23:55 +01:00
add ( doc , CollectionSchema . sku , docurl ) ;
2012-12-18 17:20:42 +01:00
2013-02-21 13:23:55 +01:00
if ( ( allAttr | | contains ( CollectionSchema . clickdepth_i ) ) & & citations ! = null ) {
2013-01-03 19:21:21 +01:00
if ( digestURI . probablyRootURL ( ) ) {
boolean lc = this . lazy ; this . lazy = false ;
2013-02-21 13:23:55 +01:00
add ( doc , CollectionSchema . clickdepth_i , 0 ) ;
2013-01-03 19:21:21 +01:00
this . lazy = lc ;
2013-01-02 20:55:43 +01:00
} else {
// search the citations for references
int clickdepth = - 1 ;
try {
2013-01-03 19:21:21 +01:00
clickdepth = getClickDepth ( citations , digestURI ) ;
2013-01-02 20:55:43 +01:00
} catch ( IOException e ) {
2013-02-21 13:23:55 +01:00
add ( doc , CollectionSchema . clickdepth_i , - 1 ) ;
2013-01-02 20:55:43 +01:00
}
2013-02-21 13:23:55 +01:00
add ( doc , CollectionSchema . clickdepth_i , clickdepth ) ;
2013-01-02 20:55:43 +01:00
if ( clickdepth < 0 | | clickdepth > 1 ) {
processTypes . add ( ProcessType . CLICKDEPTH ) ; // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
}
}
2012-12-18 17:20:42 +01:00
}
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . ip_s ) ) {
2012-08-29 16:11:23 +02:00
final InetAddress address = digestURI . getInetAddress ( ) ;
2013-02-21 13:23:55 +01:00
if ( address ! = null ) add ( doc , CollectionSchema . ip_s , address . getHostAddress ( ) ) ;
2012-08-29 16:11:23 +02:00
}
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . collection_sxt ) & & profile ! = null ) add ( doc , CollectionSchema . collection_sxt , profile . collections ( ) ) ;
if ( allAttr | | contains ( CollectionSchema . url_protocol_s ) ) add ( doc , CollectionSchema . url_protocol_s , digestURI . getProtocol ( ) ) ;
2012-08-29 16:11:23 +02:00
Map < String , String > searchpart = digestURI . getSearchpartMap ( ) ;
if ( searchpart = = null ) {
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . url_parameter_i ) ) add ( doc , CollectionSchema . url_parameter_i , 0 ) ;
2012-08-29 16:11:23 +02:00
} else {
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . url_parameter_i ) ) add ( doc , CollectionSchema . url_parameter_i , searchpart . size ( ) ) ;
if ( allAttr | | contains ( CollectionSchema . url_parameter_key_sxt ) ) add ( doc , CollectionSchema . url_parameter_key_sxt , searchpart . keySet ( ) . toArray ( new String [ searchpart . size ( ) ] ) ) ;
if ( allAttr | | contains ( CollectionSchema . url_parameter_value_sxt ) ) add ( doc , CollectionSchema . url_parameter_value_sxt , searchpart . values ( ) . toArray ( new String [ searchpart . size ( ) ] ) ) ;
2012-08-05 15:49:27 +02:00
}
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . url_chars_i ) ) add ( doc , CollectionSchema . url_chars_i , docurl . length ( ) ) ;
2012-08-28 16:58:06 +02:00
String host = null ;
if ( ( host = digestURI . getHost ( ) ) ! = null ) {
String dnc = Domains . getDNC ( host ) ;
2012-12-10 07:22:42 +01:00
String subdomOrga = host . length ( ) - dnc . length ( ) < = 0 ? " " : host . substring ( 0 , host . length ( ) - dnc . length ( ) - 1 ) ;
2012-08-28 16:58:06 +02:00
int p = subdomOrga . lastIndexOf ( '.' ) ;
String subdom = ( p < 0 ) ? " " : subdomOrga . substring ( 0 , p ) ;
String orga = ( p < 0 ) ? subdomOrga : subdomOrga . substring ( p + 1 ) ;
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . host_s ) ) add ( doc , CollectionSchema . host_s , host ) ;
if ( allAttr | | contains ( CollectionSchema . host_dnc_s ) ) add ( doc , CollectionSchema . host_dnc_s , dnc ) ;
if ( allAttr | | contains ( CollectionSchema . host_organization_s ) ) add ( doc , CollectionSchema . host_organization_s , orga ) ;
if ( allAttr | | contains ( CollectionSchema . host_organizationdnc_s ) ) add ( doc , CollectionSchema . host_organizationdnc_s , orga + '.' + dnc ) ;
if ( allAttr | | contains ( CollectionSchema . host_subdomain_s ) ) add ( doc , CollectionSchema . host_subdomain_s , subdom ) ;
2012-08-28 16:58:06 +02:00
}
2012-08-31 10:30:43 +02:00
2012-10-18 14:29:11 +02:00
List < String > titles = document . titles ( ) ;
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . title ) ) add ( doc , CollectionSchema . title , titles ) ;
if ( allAttr | | contains ( CollectionSchema . title_count_i ) ) add ( doc , CollectionSchema . title_count_i , titles . size ( ) ) ;
if ( allAttr | | contains ( CollectionSchema . title_chars_val ) ) {
2012-08-31 10:30:43 +02:00
ArrayList < Integer > cv = new ArrayList < Integer > ( titles . size ( ) ) ;
for ( String s : titles ) cv . add ( new Integer ( s . length ( ) ) ) ;
2013-02-21 13:23:55 +01:00
add ( doc , CollectionSchema . title_chars_val , cv ) ;
2012-08-31 10:30:43 +02:00
}
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . title_words_val ) ) {
2012-08-31 10:30:43 +02:00
ArrayList < Integer > cv = new ArrayList < Integer > ( titles . size ( ) ) ;
2012-11-26 13:11:55 +01:00
for ( String s : titles ) cv . add ( new Integer ( CommonPattern . SPACE . split ( s ) . length ) ) ;
2013-02-21 13:23:55 +01:00
add ( doc , CollectionSchema . title_words_val , cv ) ;
2012-08-31 10:30:43 +02:00
}
2012-10-18 14:29:11 +02:00
String description = document . dc_description ( ) ;
2012-08-31 10:30:43 +02:00
List < String > descriptions = new ArrayList < String > ( ) ;
2012-11-26 13:11:55 +01:00
for ( String s : CommonPattern . NEWLINE . split ( description ) ) descriptions . add ( s ) ;
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . description ) ) add ( doc , CollectionSchema . description , description ) ;
if ( allAttr | | contains ( CollectionSchema . description_count_i ) ) add ( doc , CollectionSchema . description_count_i , descriptions . size ( ) ) ;
if ( allAttr | | contains ( CollectionSchema . description_chars_val ) ) {
2012-08-31 10:30:43 +02:00
ArrayList < Integer > cv = new ArrayList < Integer > ( descriptions . size ( ) ) ;
for ( String s : descriptions ) cv . add ( new Integer ( s . length ( ) ) ) ;
2013-02-21 13:23:55 +01:00
add ( doc , CollectionSchema . description_chars_val , cv ) ;
2012-08-31 10:30:43 +02:00
}
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . description_words_val ) ) {
2012-08-31 10:30:43 +02:00
ArrayList < Integer > cv = new ArrayList < Integer > ( descriptions . size ( ) ) ;
2012-11-26 13:11:55 +01:00
for ( String s : descriptions ) cv . add ( new Integer ( CommonPattern . SPACE . split ( s ) . length ) ) ;
2013-02-21 13:23:55 +01:00
add ( doc , CollectionSchema . description_words_val , cv ) ;
2012-08-31 10:30:43 +02:00
}
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . author ) ) {
2012-12-19 01:54:35 +01:00
String author = document . dc_creator ( ) ;
if ( author = = null | | author . length ( ) = = 0 ) author = document . dc_publisher ( ) ;
2013-02-21 13:23:55 +01:00
add ( doc , CollectionSchema . author , author ) ;
2012-12-19 01:54:35 +01:00
}
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . content_type ) ) add ( doc , CollectionSchema . content_type , new String [ ] { document . dc_format ( ) } ) ;
if ( allAttr | | contains ( CollectionSchema . last_modified ) ) add ( doc , CollectionSchema . last_modified , responseHeader = = null ? new Date ( ) : responseHeader . lastModified ( ) ) ;
if ( allAttr | | contains ( CollectionSchema . keywords ) ) add ( doc , CollectionSchema . keywords , document . dc_subject ( ' ' ) ) ;
2012-11-07 17:27:13 +01:00
String content = document . getTextString ( ) ;
if ( content = = null | | content . length ( ) = = 0 ) {
content = digestURI . toTokens ( ) ;
}
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . text_t ) ) add ( doc , CollectionSchema . text_t , content ) ;
if ( allAttr | | contains ( CollectionSchema . wordcount_i ) ) {
2012-11-24 22:30:05 +01:00
if ( content . length ( ) = = 0 ) {
2013-02-21 13:23:55 +01:00
add ( doc , CollectionSchema . wordcount_i , 0 ) ;
2012-11-24 22:30:05 +01:00
} else {
int contentwc = 1 ;
for ( int i = content . length ( ) - 1 ; i > = 0 ; i - - ) if ( content . charAt ( i ) = = ' ' ) contentwc + + ;
2013-02-21 13:23:55 +01:00
add ( doc , CollectionSchema . wordcount_i , contentwc ) ;
2012-11-24 22:30:05 +01:00
}
2011-06-30 17:49:21 +02:00
}
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . synonyms_sxt ) ) {
2012-10-02 11:13:06 +02:00
List < String > synonyms = condenser . synonyms ( ) ;
2013-02-21 13:23:55 +01:00
add ( doc , CollectionSchema . synonyms_sxt , synonyms ) ;
2012-10-02 00:02:50 +02:00
}
2013-02-21 13:23:55 +01:00
add ( doc , CollectionSchema . exact_signature_l , condenser . exactSignature ( ) ) ;
add ( doc , CollectionSchema . exact_signature_unique_b , true ) ; // this must be corrected afterwards!
add ( doc , CollectionSchema . fuzzy_signature_l , condenser . fuzzySignature ( ) ) ;
add ( doc , CollectionSchema . fuzzy_signature_text_t , condenser . fuzzySignatureText ( ) ) ;
add ( doc , CollectionSchema . fuzzy_signature_unique_b , true ) ; // this must be corrected afterwards!
2011-04-21 15:58:49 +02:00
// path elements of link
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . url_paths_sxt ) ) add ( doc , CollectionSchema . url_paths_sxt , digestURI . getPaths ( ) ) ;
if ( allAttr | | contains ( CollectionSchema . url_file_ext_s ) ) add ( doc , CollectionSchema . url_file_ext_s , digestURI . getFileExtension ( ) ) ;
2011-06-30 17:49:21 +02:00
2013-02-15 01:38:10 +01:00
// get list of all links; they will be shrinked by urls that appear in other fields of the solr schema
2012-10-18 14:29:11 +02:00
Set < MultiProtocolURI > inboundLinks = document . inboundLinks ( ) ;
Set < MultiProtocolURI > outboundLinks = document . outboundLinks ( ) ;
2011-08-25 17:52:25 +02:00
2012-04-27 16:48:51 +02:00
int c = 0 ;
2012-10-18 14:29:11 +02:00
final Object parser = document . getParserObject ( ) ;
2012-09-07 22:06:51 +02:00
Map < MultiProtocolURI , ImageEntry > images = new HashMap < MultiProtocolURI , ImageEntry > ( ) ;
2011-04-21 15:58:49 +02:00
if ( parser instanceof ContentScraper ) {
2011-06-30 17:49:21 +02:00
final ContentScraper html = ( ContentScraper ) parser ;
2012-09-07 22:06:51 +02:00
images = html . getImages ( ) ;
2011-06-30 17:49:21 +02:00
2011-04-21 15:58:49 +02:00
// header tags
int h = 0 ;
int f = 1 ;
2012-01-13 11:25:15 +01:00
String [ ] hs ;
2013-02-21 13:23:55 +01:00
hs = html . getHeadlines ( 1 ) ; h = h | ( hs . length > 0 ? f : 0 ) ; f = f * 2 ; add ( doc , CollectionSchema . h1_txt , hs ) ; add ( doc , CollectionSchema . h1_i , hs . length ) ;
hs = html . getHeadlines ( 2 ) ; h = h | ( hs . length > 0 ? f : 0 ) ; f = f * 2 ; add ( doc , CollectionSchema . h2_txt , hs ) ; add ( doc , CollectionSchema . h2_i , hs . length ) ;
hs = html . getHeadlines ( 3 ) ; h = h | ( hs . length > 0 ? f : 0 ) ; f = f * 2 ; add ( doc , CollectionSchema . h3_txt , hs ) ; add ( doc , CollectionSchema . h3_i , hs . length ) ;
hs = html . getHeadlines ( 4 ) ; h = h | ( hs . length > 0 ? f : 0 ) ; f = f * 2 ; add ( doc , CollectionSchema . h4_txt , hs ) ; add ( doc , CollectionSchema . h4_i , hs . length ) ;
hs = html . getHeadlines ( 5 ) ; h = h | ( hs . length > 0 ? f : 0 ) ; f = f * 2 ; add ( doc , CollectionSchema . h5_txt , hs ) ; add ( doc , CollectionSchema . h5_i , hs . length ) ;
hs = html . getHeadlines ( 6 ) ; h = h | ( hs . length > 0 ? f : 0 ) ; f = f * 2 ; add ( doc , CollectionSchema . h6_txt , hs ) ; add ( doc , CollectionSchema . h6_i , hs . length ) ;
2012-10-09 13:02:43 +02:00
2013-02-21 13:23:55 +01:00
add ( doc , CollectionSchema . htags_i , h ) ;
add ( doc , CollectionSchema . schema_org_breadcrumb_i , html . breadcrumbCount ( ) ) ;
2011-04-21 15:58:49 +02:00
2012-10-09 17:28:48 +02:00
// meta tags: Open Graph properties
String og ;
2013-02-21 13:23:55 +01:00
og = html . getMetas ( ) . get ( " og:title " ) ; if ( og ! = null ) add ( doc , CollectionSchema . opengraph_title_t , og ) ;
og = html . getMetas ( ) . get ( " og:type " ) ; if ( og ! = null ) add ( doc , CollectionSchema . opengraph_type_s , og ) ;
og = html . getMetas ( ) . get ( " og:url " ) ; if ( og ! = null ) add ( doc , CollectionSchema . opengraph_url_s , og ) ;
og = html . getMetas ( ) . get ( " og:image " ) ; if ( og ! = null ) add ( doc , CollectionSchema . opengraph_image_s , og ) ;
2012-10-09 17:28:48 +02:00
2011-09-30 15:39:01 +02:00
// noindex and nofollow attributes
// from HTML (meta-tag in HTML header: robots)
// and HTTP header (x-robots property)
// coded as binary value:
// bit 0: "all" contained in html header meta
// bit 1: "index" contained in html header meta
// bit 2: "noindex" contained in html header meta
// bit 3: "nofollow" contained in html header meta
// bit 8: "noarchive" contained in http header properties
// bit 9: "nosnippet" contained in http header properties
// bit 10: "noindex" contained in http header properties
// bit 11: "nofollow" contained in http header properties
// bit 12: "unavailable_after" contained in http header properties
int b = 0 ;
final String robots_meta = html . getMetas ( ) . get ( " robots " ) ;
// this tag may have values: all, index, noindex, nofollow
if ( robots_meta ! = null ) {
2011-11-25 12:23:52 +01:00
if ( robots_meta . indexOf ( " all " , 0 ) > = 0 ) b + = 1 ; // set bit 0
if ( robots_meta . indexOf ( " index " , 0 ) = = 0 | | robots_meta . indexOf ( " index " , 0 ) > = 0 | | robots_meta . indexOf ( " ,index " , 0 ) > = 0 ) b + = 2 ; // set bit 1
if ( robots_meta . indexOf ( " noindex " , 0 ) > = 0 ) b + = 4 ; // set bit 2
if ( robots_meta . indexOf ( " nofollow " , 0 ) > = 0 ) b + = 8 ; // set bit 3
2011-09-30 15:39:01 +02:00
}
2012-07-25 01:53:47 +02:00
String x_robots_tag = " " ;
2012-10-18 14:29:11 +02:00
if ( responseHeader ! = null ) {
x_robots_tag = responseHeader . get ( HeaderFramework . X_ROBOTS_TAG , " " ) ;
2012-07-25 01:53:47 +02:00
if ( x_robots_tag . isEmpty ( ) ) {
2012-10-18 14:29:11 +02:00
x_robots_tag = responseHeader . get ( HeaderFramework . X_ROBOTS , " " ) ;
2012-07-25 01:53:47 +02:00
}
}
if ( ! x_robots_tag . isEmpty ( ) ) {
2012-07-10 22:59:03 +02:00
// this tag may have values: noarchive, nosnippet, noindex, nofollow, unavailable_after
2011-11-25 12:23:52 +01:00
if ( x_robots_tag . indexOf ( " noarchive " , 0 ) > = 0 ) b + = 256 ; // set bit 8
if ( x_robots_tag . indexOf ( " nosnippet " , 0 ) > = 0 ) b + = 512 ; // set bit 9
if ( x_robots_tag . indexOf ( " noindex " , 0 ) > = 0 ) b + = 1024 ; // set bit 10
if ( x_robots_tag . indexOf ( " nofollow " , 0 ) > = 0 ) b + = 2048 ; // set bit 11
if ( x_robots_tag . indexOf ( " unavailable_after " , 0 ) > = 0 ) b + = 4096 ; // set bit 12
2011-09-30 15:39:01 +02:00
}
2013-02-21 13:23:55 +01:00
add ( doc , CollectionSchema . robots_i , b ) ;
2011-09-30 15:39:01 +02:00
// meta tags: generator
final String generator = html . getMetas ( ) . get ( " generator " ) ;
2013-02-21 13:23:55 +01:00
if ( generator ! = null ) add ( doc , CollectionSchema . metagenerator_t , generator ) ;
2011-06-30 17:49:21 +02:00
2011-04-21 15:58:49 +02:00
// bold, italic
2011-06-30 17:49:21 +02:00
final String [ ] bold = html . getBold ( ) ;
2013-02-21 13:23:55 +01:00
add ( doc , CollectionSchema . boldcount_i , bold . length ) ;
2011-04-28 15:09:01 +02:00
if ( bold . length > 0 ) {
2013-02-21 13:23:55 +01:00
add ( doc , CollectionSchema . bold_txt , bold ) ;
if ( allAttr | | contains ( CollectionSchema . bold_val ) ) {
add ( doc , CollectionSchema . bold_val , html . getBoldCount ( bold ) ) ;
2011-06-30 17:49:21 +02:00
}
2011-04-28 15:09:01 +02:00
}
2011-06-30 17:49:21 +02:00
final String [ ] italic = html . getItalic ( ) ;
2013-02-21 13:23:55 +01:00
add ( doc , CollectionSchema . italiccount_i , italic . length ) ;
2011-04-28 15:09:01 +02:00
if ( italic . length > 0 ) {
2013-02-21 13:23:55 +01:00
add ( doc , CollectionSchema . italic_txt , italic ) ;
if ( allAttr | | contains ( CollectionSchema . italic_val ) ) {
add ( doc , CollectionSchema . italic_val , html . getItalicCount ( italic ) ) ;
2011-06-30 17:49:21 +02:00
}
2011-04-28 15:09:01 +02:00
}
2012-10-01 14:16:49 +02:00
final String [ ] underline = html . getUnderline ( ) ;
2013-02-21 13:23:55 +01:00
add ( doc , CollectionSchema . underlinecount_i , underline . length ) ;
2012-10-01 14:16:49 +02:00
if ( underline . length > 0 ) {
2013-02-21 13:23:55 +01:00
add ( doc , CollectionSchema . underline_txt , underline ) ;
if ( allAttr | | contains ( CollectionSchema . underline_val ) ) {
add ( doc , CollectionSchema . underline_val , html . getUnderlineCount ( underline ) ) ;
2012-10-01 14:16:49 +02:00
}
}
2011-06-30 17:49:21 +02:00
final String [ ] li = html . getLi ( ) ;
2013-02-21 13:23:55 +01:00
add ( doc , CollectionSchema . licount_i , li . length ) ;
if ( li . length > 0 ) add ( doc , CollectionSchema . li_txt , li ) ;
2011-06-30 17:49:21 +02:00
2011-04-21 15:58:49 +02:00
// images
2012-09-07 22:06:51 +02:00
final Collection < ImageEntry > imagesc = images . values ( ) ;
2012-07-04 21:15:38 +02:00
final List < String > imgtags = new ArrayList < String > ( imagesc . size ( ) ) ;
final List < String > imgprots = new ArrayList < String > ( imagesc . size ( ) ) ;
final List < String > imgstubs = new ArrayList < String > ( imagesc . size ( ) ) ;
final List < String > imgalts = new ArrayList < String > ( imagesc . size ( ) ) ;
2012-09-07 21:33:45 +02:00
int withalt = 0 ;
2011-08-31 18:02:06 +02:00
for ( final ImageEntry ie : imagesc ) {
final MultiProtocolURI uri = ie . url ( ) ;
2012-04-27 16:48:51 +02:00
inboundLinks . remove ( uri ) ;
2012-09-07 22:06:51 +02:00
outboundLinks . remove ( uri ) ;
2012-07-04 21:15:38 +02:00
imgtags . add ( ie . toString ( ) ) ;
String protocol = uri . getProtocol ( ) ;
imgprots . add ( protocol ) ;
imgstubs . add ( uri . toString ( ) . substring ( protocol . length ( ) + 3 ) ) ;
imgalts . add ( ie . alt ( ) ) ;
2012-09-07 21:33:45 +02:00
if ( ie . alt ( ) ! = null & & ie . alt ( ) . length ( ) > 0 ) withalt + + ;
2011-06-30 17:49:21 +02:00
}
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . imagescount_i ) ) add ( doc , CollectionSchema . imagescount_i , imgtags . size ( ) ) ;
if ( allAttr | | contains ( CollectionSchema . images_tag_txt ) ) add ( doc , CollectionSchema . images_tag_txt , imgtags ) ;
if ( allAttr | | contains ( CollectionSchema . images_protocol_sxt ) ) add ( doc , CollectionSchema . images_protocol_sxt , protocolList2indexedList ( imgprots ) ) ;
if ( allAttr | | contains ( CollectionSchema . images_urlstub_txt ) ) add ( doc , CollectionSchema . images_urlstub_txt , imgstubs ) ;
if ( allAttr | | contains ( CollectionSchema . images_alt_txt ) ) add ( doc , CollectionSchema . images_alt_txt , imgalts ) ;
if ( allAttr | | contains ( CollectionSchema . images_withalt_i ) ) add ( doc , CollectionSchema . images_withalt_i , withalt ) ;
2011-04-21 15:58:49 +02:00
// style sheets
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . css_tag_txt ) ) {
2011-06-30 17:49:21 +02:00
final Map < MultiProtocolURI , String > csss = html . getCSS ( ) ;
2011-08-31 18:02:06 +02:00
final String [ ] css_tag = new String [ csss . size ( ) ] ;
final String [ ] css_url = new String [ csss . size ( ) ] ;
2011-06-30 17:49:21 +02:00
c = 0 ;
for ( final Map . Entry < MultiProtocolURI , String > entry : csss . entrySet ( ) ) {
2012-12-18 17:20:42 +01:00
final String cssurl = entry . getKey ( ) . toNormalform ( false ) ;
inboundLinks . remove ( cssurl ) ;
outboundLinks . remove ( cssurl ) ;
2011-08-31 18:02:06 +02:00
css_tag [ c ] =
2011-06-30 17:49:21 +02:00
" <link rel= \" stylesheet \" type= \" text/css \" media= \" " + entry . getValue ( ) + " \" " +
2012-12-18 17:20:42 +01:00
" href= \" " + cssurl + " \" /> " ;
css_url [ c ] = cssurl ;
2011-08-31 18:02:06 +02:00
c + + ;
2011-06-30 17:49:21 +02:00
}
2013-02-21 13:23:55 +01:00
add ( doc , CollectionSchema . csscount_i , css_tag . length ) ;
if ( css_tag . length > 0 ) add ( doc , CollectionSchema . css_tag_txt , css_tag ) ;
if ( css_url . length > 0 ) add ( doc , CollectionSchema . css_url_txt , css_url ) ;
2011-04-21 15:58:49 +02:00
}
2011-06-30 17:49:21 +02:00
2011-04-21 15:58:49 +02:00
// Scripts
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . scripts_txt ) ) {
2011-06-30 17:49:21 +02:00
final Set < MultiProtocolURI > scriptss = html . getScript ( ) ;
final String [ ] scripts = new String [ scriptss . size ( ) ] ;
c = 0 ;
2012-12-18 17:20:42 +01:00
for ( final MultiProtocolURI u : scriptss ) {
inboundLinks . remove ( u ) ;
outboundLinks . remove ( u ) ;
scripts [ c + + ] = u . toNormalform ( false ) ;
2011-06-30 17:49:21 +02:00
}
2013-02-21 13:23:55 +01:00
add ( doc , CollectionSchema . scriptscount_i , scripts . length ) ;
if ( scripts . length > 0 ) add ( doc , CollectionSchema . scripts_txt , scripts ) ;
2011-04-21 15:58:49 +02:00
}
2011-06-30 17:49:21 +02:00
2011-04-21 15:58:49 +02:00
// Frames
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . frames_txt ) ) {
2011-06-30 17:49:21 +02:00
final Set < MultiProtocolURI > framess = html . getFrames ( ) ;
final String [ ] frames = new String [ framess . size ( ) ] ;
c = 0 ;
2012-12-18 17:20:42 +01:00
for ( final MultiProtocolURI u : framess ) {
inboundLinks . remove ( u ) ;
outboundLinks . remove ( u ) ;
frames [ c + + ] = u . toNormalform ( false ) ;
2011-06-30 17:49:21 +02:00
}
2013-02-21 13:23:55 +01:00
add ( doc , CollectionSchema . framesscount_i , frames . length ) ;
if ( frames . length > 0 ) add ( doc , CollectionSchema . frames_txt , frames ) ;
2011-04-21 15:58:49 +02:00
}
2011-06-30 17:49:21 +02:00
2011-04-21 15:58:49 +02:00
// IFrames
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . iframes_txt ) ) {
2011-06-30 17:49:21 +02:00
final Set < MultiProtocolURI > iframess = html . getIFrames ( ) ;
final String [ ] iframes = new String [ iframess . size ( ) ] ;
c = 0 ;
2012-12-18 17:20:42 +01:00
for ( final MultiProtocolURI u : iframess ) {
inboundLinks . remove ( u ) ;
outboundLinks . remove ( u ) ;
iframes [ c + + ] = u . toNormalform ( false ) ;
2011-06-30 17:49:21 +02:00
}
2013-02-21 13:23:55 +01:00
add ( doc , CollectionSchema . iframesscount_i , iframes . length ) ;
if ( iframes . length > 0 ) add ( doc , CollectionSchema . iframes_txt , iframes ) ;
2011-04-21 15:58:49 +02:00
}
2011-06-30 17:49:21 +02:00
2012-06-26 14:51:57 +02:00
// canonical tag
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . canonical_t ) ) {
2012-06-26 14:51:57 +02:00
final MultiProtocolURI canonical = html . getCanonical ( ) ;
if ( canonical ! = null ) {
inboundLinks . remove ( canonical ) ;
2012-09-07 22:06:51 +02:00
outboundLinks . remove ( canonical ) ;
2013-02-21 13:23:55 +01:00
add ( doc , CollectionSchema . canonical_t , canonical . toNormalform ( false ) ) ;
2013-01-23 14:40:58 +01:00
// set a flag if this is equal to sku
2013-02-21 13:23:55 +01:00
if ( contains ( CollectionSchema . canonical_equal_sku_b ) & & canonical . equals ( docurl ) ) {
add ( doc , CollectionSchema . canonical_equal_sku_b , true ) ;
2013-01-23 14:40:58 +01:00
}
2012-06-26 14:51:57 +02:00
}
}
2012-06-28 13:27:45 +02:00
// meta refresh tag
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . refresh_s ) ) {
2012-06-28 13:27:45 +02:00
String refresh = html . getRefreshPath ( ) ;
if ( refresh ! = null & & refresh . length ( ) > 0 ) {
MultiProtocolURI refreshURL ;
try {
refreshURL = refresh . startsWith ( " http " ) ? new MultiProtocolURI ( html . getRefreshPath ( ) ) : new MultiProtocolURI ( digestURI , html . getRefreshPath ( ) ) ;
if ( refreshURL ! = null ) {
inboundLinks . remove ( refreshURL ) ;
2012-09-07 22:06:51 +02:00
outboundLinks . remove ( refreshURL ) ;
2013-02-21 13:23:55 +01:00
add ( doc , CollectionSchema . refresh_s , refreshURL . toNormalform ( false ) ) ;
2012-06-28 13:27:45 +02:00
}
} catch ( MalformedURLException e ) {
2013-02-21 13:23:55 +01:00
add ( doc , CollectionSchema . refresh_s , refresh ) ;
2012-06-28 13:27:45 +02:00
}
}
}
2011-04-21 15:58:49 +02:00
// flash embedded
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . flash_b ) ) {
2012-06-25 18:17:31 +02:00
MultiProtocolURI [ ] flashURLs = html . getFlash ( ) ;
for ( MultiProtocolURI u : flashURLs ) {
// remove all flash links from inbound/outbound links
inboundLinks . remove ( u ) ;
2012-09-07 22:06:51 +02:00
outboundLinks . remove ( u ) ;
2012-06-25 18:17:31 +02:00
}
2013-02-21 13:23:55 +01:00
add ( doc , CollectionSchema . flash_b , flashURLs . length > 0 ) ;
2012-06-25 18:17:31 +02:00
}
2011-06-30 17:49:21 +02:00
2011-04-28 15:09:01 +02:00
// generic evaluation pattern
2011-06-30 17:49:21 +02:00
for ( final String model : html . getEvaluationModelNames ( ) ) {
2012-08-05 15:49:27 +02:00
if ( allAttr | | contains ( " ext_ " + model + " _txt " ) ) {
2011-06-30 17:49:21 +02:00
final String [ ] scorenames = html . getEvaluationModelScoreNames ( model ) ;
if ( scorenames . length > 0 ) {
2013-02-21 13:23:55 +01:00
add ( doc , CollectionSchema . valueOf ( " ext_ " + model + " _txt " ) , scorenames ) ;
add ( doc , CollectionSchema . valueOf ( " ext_ " + model + " _val " ) , html . getEvaluationModelScoreCounts ( model , scorenames ) ) ;
2011-06-30 17:49:21 +02:00
}
2011-04-28 15:09:01 +02:00
}
}
2011-06-30 17:49:21 +02:00
2011-05-27 14:35:08 +02:00
// response time
2013-02-21 13:23:55 +01:00
add ( doc , CollectionSchema . responsetime_i , responseHeader = = null ? 0 : Integer . parseInt ( responseHeader . get ( HeaderFramework . RESPONSE_TIME_MILLIS , " 0 " ) ) ) ;
2011-04-21 15:58:49 +02:00
}
2012-04-27 16:48:51 +02:00
// list all links
2012-10-18 14:29:11 +02:00
final Map < MultiProtocolURI , Properties > alllinks = document . getAnchors ( ) ;
2012-04-27 16:48:51 +02:00
c = 0 ;
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . inboundlinkscount_i ) ) add ( doc , CollectionSchema . inboundlinkscount_i , inboundLinks . size ( ) ) ;
if ( allAttr | | contains ( CollectionSchema . inboundlinksnofollowcount_i ) ) add ( doc , CollectionSchema . inboundlinksnofollowcount_i , document . inboundLinkNofollowCount ( ) ) ;
2012-07-04 21:15:38 +02:00
final List < String > inboundlinksTag = new ArrayList < String > ( inboundLinks . size ( ) ) ;
final List < String > inboundlinksURLProtocol = new ArrayList < String > ( inboundLinks . size ( ) ) ;
final List < String > inboundlinksURLStub = new ArrayList < String > ( inboundLinks . size ( ) ) ;
final List < String > inboundlinksName = new ArrayList < String > ( inboundLinks . size ( ) ) ;
final List < String > inboundlinksRel = new ArrayList < String > ( inboundLinks . size ( ) ) ;
final List < String > inboundlinksText = new ArrayList < String > ( inboundLinks . size ( ) ) ;
2012-09-07 22:06:51 +02:00
final List < Integer > inboundlinksTextChars = new ArrayList < Integer > ( inboundLinks . size ( ) ) ;
final List < Integer > inboundlinksTextWords = new ArrayList < Integer > ( inboundLinks . size ( ) ) ;
final List < String > inboundlinksAltTag = new ArrayList < String > ( inboundLinks . size ( ) ) ;
2012-12-18 17:20:42 +01:00
for ( final MultiProtocolURI u : inboundLinks ) {
final Properties p = alllinks . get ( u ) ;
2012-07-04 21:15:38 +02:00
if ( p = = null ) continue ;
2012-04-27 16:48:51 +02:00
final String name = p . getProperty ( " name " , " " ) ; // the name attribute
final String rel = p . getProperty ( " rel " , " " ) ; // the rel-attribute
final String text = p . getProperty ( " text " , " " ) ; // the text between the <a></a> tag
2012-12-18 17:20:42 +01:00
final String urls = u . toNormalform ( false ) ;
2012-04-27 16:48:51 +02:00
final int pr = urls . indexOf ( " :// " , 0 ) ;
2012-07-04 21:15:38 +02:00
inboundlinksURLProtocol . add ( urls . substring ( 0 , pr ) ) ;
inboundlinksURLStub . add ( urls . substring ( pr + 3 ) ) ;
inboundlinksName . add ( name . length ( ) > 0 ? name : " " ) ;
inboundlinksRel . add ( rel . length ( ) > 0 ? rel : " " ) ;
inboundlinksText . add ( text . length ( ) > 0 ? text : " " ) ;
2012-09-07 22:06:51 +02:00
inboundlinksTextChars . add ( text . length ( ) > 0 ? text . length ( ) : 0 ) ;
2012-11-26 13:11:55 +01:00
inboundlinksTextWords . add ( text . length ( ) > 0 ? CommonPattern . SPACE . split ( text ) . length : 0 ) ;
2012-07-04 21:15:38 +02:00
inboundlinksTag . add (
2012-12-18 17:20:42 +01:00
" <a href= \" " + u . toNormalform ( false ) + " \" " +
2012-04-27 16:48:51 +02:00
( rel . length ( ) > 0 ? " rel= \" " + rel + " \" " : " " ) +
( name . length ( ) > 0 ? " name= \" " + name + " \" " : " " ) +
" > " +
2012-07-04 21:15:38 +02:00
( ( text . length ( ) > 0 ) ? text : " " ) + " </a> " ) ;
2012-12-18 17:20:42 +01:00
ImageEntry ientry = images . get ( u ) ;
2012-09-07 22:06:51 +02:00
inboundlinksAltTag . add ( ientry = = null ? " " : ientry . alt ( ) ) ;
2012-04-27 16:48:51 +02:00
c + + ;
}
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . inboundlinks_tag_txt ) ) add ( doc , CollectionSchema . inboundlinks_tag_txt , inboundlinksTag ) ;
if ( allAttr | | contains ( CollectionSchema . inboundlinks_protocol_sxt ) ) add ( doc , CollectionSchema . inboundlinks_protocol_sxt , protocolList2indexedList ( inboundlinksURLProtocol ) ) ;
if ( allAttr | | contains ( CollectionSchema . inboundlinks_urlstub_txt ) ) add ( doc , CollectionSchema . inboundlinks_urlstub_txt , inboundlinksURLStub ) ;
if ( allAttr | | contains ( CollectionSchema . inboundlinks_name_txt ) ) add ( doc , CollectionSchema . inboundlinks_name_txt , inboundlinksName ) ;
if ( allAttr | | contains ( CollectionSchema . inboundlinks_rel_sxt ) ) add ( doc , CollectionSchema . inboundlinks_rel_sxt , inboundlinksRel ) ;
if ( allAttr | | contains ( CollectionSchema . inboundlinks_relflags_val ) ) add ( doc , CollectionSchema . inboundlinks_relflags_val , relEval ( inboundlinksRel ) ) ;
if ( allAttr | | contains ( CollectionSchema . inboundlinks_text_txt ) ) add ( doc , CollectionSchema . inboundlinks_text_txt , inboundlinksText ) ;
if ( allAttr | | contains ( CollectionSchema . inboundlinks_text_chars_val ) ) add ( doc , CollectionSchema . inboundlinks_text_chars_val , inboundlinksTextChars ) ;
if ( allAttr | | contains ( CollectionSchema . inboundlinks_text_words_val ) ) add ( doc , CollectionSchema . inboundlinks_text_words_val , inboundlinksTextWords ) ;
if ( allAttr | | contains ( CollectionSchema . inboundlinks_alttag_txt ) ) add ( doc , CollectionSchema . inboundlinks_alttag_txt , inboundlinksAltTag ) ;
2012-04-27 16:48:51 +02:00
c = 0 ;
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . outboundlinkscount_i ) ) add ( doc , CollectionSchema . outboundlinkscount_i , outboundLinks . size ( ) ) ;
if ( allAttr | | contains ( CollectionSchema . outboundlinksnofollowcount_i ) ) add ( doc , CollectionSchema . outboundlinksnofollowcount_i , document . outboundLinkNofollowCount ( ) ) ;
2012-09-07 22:06:51 +02:00
final List < String > outboundlinksTag = new ArrayList < String > ( outboundLinks . size ( ) ) ;
final List < String > outboundlinksURLProtocol = new ArrayList < String > ( outboundLinks . size ( ) ) ;
final List < String > outboundlinksURLStub = new ArrayList < String > ( outboundLinks . size ( ) ) ;
final List < String > outboundlinksName = new ArrayList < String > ( outboundLinks . size ( ) ) ;
final List < String > outboundlinksRel = new ArrayList < String > ( outboundLinks . size ( ) ) ;
final List < Integer > outboundlinksTextChars = new ArrayList < Integer > ( outboundLinks . size ( ) ) ;
final List < Integer > outboundlinksTextWords = new ArrayList < Integer > ( outboundLinks . size ( ) ) ;
final List < String > outboundlinksText = new ArrayList < String > ( outboundLinks . size ( ) ) ;
final List < String > outboundlinksAltTag = new ArrayList < String > ( outboundLinks . size ( ) ) ;
2012-12-18 17:20:42 +01:00
for ( final MultiProtocolURI u : outboundLinks ) {
final Properties p = alllinks . get ( u ) ;
2012-07-04 21:15:38 +02:00
if ( p = = null ) continue ;
2012-04-27 16:48:51 +02:00
final String name = p . getProperty ( " name " , " " ) ; // the name attribute
final String rel = p . getProperty ( " rel " , " " ) ; // the rel-attribute
final String text = p . getProperty ( " text " , " " ) ; // the text between the <a></a> tag
2012-12-18 17:20:42 +01:00
final String urls = u . toNormalform ( false ) ;
2012-04-27 16:48:51 +02:00
final int pr = urls . indexOf ( " :// " , 0 ) ;
2012-07-04 21:15:38 +02:00
outboundlinksURLProtocol . add ( urls . substring ( 0 , pr ) ) ;
outboundlinksURLStub . add ( urls . substring ( pr + 3 ) ) ;
outboundlinksName . add ( name . length ( ) > 0 ? name : " " ) ;
outboundlinksRel . add ( rel . length ( ) > 0 ? rel : " " ) ;
outboundlinksText . add ( text . length ( ) > 0 ? text : " " ) ;
2012-09-07 22:06:51 +02:00
outboundlinksTextChars . add ( text . length ( ) > 0 ? text . length ( ) : 0 ) ;
2012-11-26 13:11:55 +01:00
outboundlinksTextWords . add ( text . length ( ) > 0 ? CommonPattern . SPACE . split ( text ) . length : 0 ) ;
2012-07-04 21:15:38 +02:00
outboundlinksTag . add (
2012-12-18 17:20:42 +01:00
" <a href= \" " + u . toNormalform ( false ) + " \" " +
2012-04-27 16:48:51 +02:00
( rel . length ( ) > 0 ? " rel= \" " + rel + " \" " : " " ) +
( name . length ( ) > 0 ? " name= \" " + name + " \" " : " " ) +
" > " +
2012-07-04 21:15:38 +02:00
( ( text . length ( ) > 0 ) ? text : " " ) + " </a> " ) ;
2012-12-18 17:20:42 +01:00
ImageEntry ientry = images . get ( u ) ;
2012-09-07 22:06:51 +02:00
inboundlinksAltTag . add ( ientry = = null ? " " : ientry . alt ( ) ) ;
2012-04-27 16:48:51 +02:00
c + + ;
}
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . outboundlinks_tag_txt ) ) add ( doc , CollectionSchema . outboundlinks_tag_txt , outboundlinksTag ) ;
if ( allAttr | | contains ( CollectionSchema . outboundlinks_protocol_sxt ) ) add ( doc , CollectionSchema . outboundlinks_protocol_sxt , protocolList2indexedList ( outboundlinksURLProtocol ) ) ;
if ( allAttr | | contains ( CollectionSchema . outboundlinks_urlstub_txt ) ) add ( doc , CollectionSchema . outboundlinks_urlstub_txt , outboundlinksURLStub ) ;
if ( allAttr | | contains ( CollectionSchema . outboundlinks_name_txt ) ) add ( doc , CollectionSchema . outboundlinks_name_txt , outboundlinksName ) ;
if ( allAttr | | contains ( CollectionSchema . outboundlinks_rel_sxt ) ) add ( doc , CollectionSchema . outboundlinks_rel_sxt , outboundlinksRel ) ;
if ( allAttr | | contains ( CollectionSchema . outboundlinks_relflags_val ) ) add ( doc , CollectionSchema . outboundlinks_relflags_val , relEval ( outboundlinksRel ) ) ;
if ( allAttr | | contains ( CollectionSchema . outboundlinks_text_txt ) ) add ( doc , CollectionSchema . outboundlinks_text_txt , outboundlinksText ) ;
if ( allAttr | | contains ( CollectionSchema . outboundlinks_text_chars_val ) ) add ( doc , CollectionSchema . outboundlinks_text_chars_val , outboundlinksTextChars ) ;
if ( allAttr | | contains ( CollectionSchema . outboundlinks_text_words_val ) ) add ( doc , CollectionSchema . outboundlinks_text_words_val , outboundlinksTextWords ) ;
if ( allAttr | | contains ( CollectionSchema . outboundlinks_alttag_txt ) ) add ( doc , CollectionSchema . outboundlinks_alttag_txt , outboundlinksAltTag ) ;
2012-04-27 16:48:51 +02:00
// charset
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . charset_s ) ) add ( doc , CollectionSchema . charset_s , document . getCharset ( ) ) ;
2012-04-27 16:48:51 +02:00
// coordinates
2013-01-14 03:06:24 +01:00
if ( document . lat ( ) ! = 0 . 0 & & document . lon ( ) ! = 0 . 0 ) {
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . coordinate_p ) ) add ( doc , CollectionSchema . coordinate_p , Double . toString ( document . lat ( ) ) + " , " + Double . toString ( document . lon ( ) ) ) ;
2012-04-27 16:48:51 +02:00
}
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . httpstatus_i ) ) add ( doc , CollectionSchema . httpstatus_i , responseHeader = = null ? 200 : responseHeader . getStatusCode ( ) ) ;
2012-10-18 14:29:11 +02:00
// fields that were additionally in URIMetadataRow
Date loadDate = new Date ( ) ;
2012-11-07 02:46:51 +01:00
Date modDate = responseHeader = = null ? new Date ( ) : responseHeader . lastModified ( ) ;
2012-10-18 14:29:11 +02:00
if ( modDate . getTime ( ) > loadDate . getTime ( ) ) modDate = loadDate ;
2012-11-07 02:46:51 +01:00
int size = ( int ) Math . max ( document . dc_source ( ) . length ( ) , responseHeader = = null ? 0 : responseHeader . getContentLength ( ) ) ;
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . load_date_dt ) ) add ( doc , CollectionSchema . load_date_dt , loadDate ) ;
if ( allAttr | | contains ( CollectionSchema . fresh_date_dt ) ) add ( doc , CollectionSchema . fresh_date_dt , new Date ( loadDate . getTime ( ) + Math . max ( 0 , loadDate . getTime ( ) - modDate . getTime ( ) ) / 2 ) ) ; // freshdate, computed with Proxy-TTL formula
if ( allAttr | | contains ( CollectionSchema . host_id_s ) ) add ( doc , CollectionSchema . host_id_s , document . dc_source ( ) . hosthash ( ) ) ;
if ( ( allAttr | | contains ( CollectionSchema . referrer_id_txt ) ) & & referrerURL ! = null ) add ( doc , CollectionSchema . referrer_id_txt , new String [ ] { ASCII . String ( referrerURL . hash ( ) ) } ) ;
2012-08-23 09:51:45 +02:00
//if (allAttr || contains(SolrField.md5_s)) add(solrdoc, SolrField.md5_s, new byte[0]);
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . publisher_t ) ) add ( doc , CollectionSchema . publisher_t , document . dc_publisher ( ) ) ;
if ( ( allAttr | | contains ( CollectionSchema . language_s ) ) & & language ! = null ) add ( doc , CollectionSchema . language_s , language ) ;
if ( allAttr | | contains ( CollectionSchema . size_i ) ) add ( doc , CollectionSchema . size_i , size ) ;
if ( allAttr | | contains ( CollectionSchema . audiolinkscount_i ) ) add ( doc , CollectionSchema . audiolinkscount_i , document . getAudiolinks ( ) . size ( ) ) ;
if ( allAttr | | contains ( CollectionSchema . videolinkscount_i ) ) add ( doc , CollectionSchema . videolinkscount_i , document . getVideolinks ( ) . size ( ) ) ;
if ( allAttr | | contains ( CollectionSchema . applinkscount_i ) ) add ( doc , CollectionSchema . applinkscount_i , document . getApplinks ( ) . size ( ) ) ;
2012-08-23 09:51:45 +02:00
2012-12-18 02:29:03 +01:00
// write generic navigation
// there are no pre-defined solr fields for navigation because the vocabulary is generic
// we use dynamically allocated solr fields for this.
// It must be a multi-value string/token field, therefore we use _sxt extensions for the field names
for ( Map . Entry < String , Set < String > > facet : document . getGenericFacets ( ) . entrySet ( ) ) {
String facetName = facet . getKey ( ) ;
Set < String > facetValues = facet . getValue ( ) ;
2013-02-21 13:23:55 +01:00
doc . setField ( CollectionSchema . VOCABULARY_PREFIX + facetName + CollectionSchema . VOCABULARY_SUFFIX , facetValues . toArray ( new String [ facetValues . size ( ) ] ) ) ;
2012-12-18 02:29:03 +01:00
}
2013-01-02 20:55:43 +01:00
2013-02-21 13:23:55 +01:00
if ( allAttr | | contains ( CollectionSchema . process_sxt ) ) {
2013-01-02 20:55:43 +01:00
List < String > p = new ArrayList < String > ( ) ;
for ( ProcessType t : processTypes ) p . add ( t . name ( ) ) ;
2013-02-21 13:23:55 +01:00
add ( doc , CollectionSchema . process_sxt , p ) ;
2013-01-02 20:55:43 +01:00
}
2012-08-23 09:51:45 +02:00
return doc ;
2011-04-14 22:05:04 +02:00
}
2011-06-30 17:49:21 +02:00
2013-01-02 20:55:43 +01:00
/ * *
* compute the click level using the citation reference database
* @param citations the citation database
* @param searchhash the hash of the url to be checked
* @return the clickdepth level or - 1 if the root url cannot be found or a recursion limit is reached
* @throws IOException
* /
2013-01-04 16:37:39 +01:00
public static int getClickDepth ( final IndexCell < CitationReference > citations , final DigestURI url ) throws IOException {
2013-01-02 20:55:43 +01:00
2013-01-03 19:21:21 +01:00
final byte [ ] searchhash = url . hash ( ) ;
RowHandleSet rootCandidates = url . getPossibleRootHashes ( ) ;
2013-01-02 20:55:43 +01:00
RowHandleSet ignore = new RowHandleSet ( URIMetadataRow . rowdef . primaryKeyLength , URIMetadataRow . rowdef . objectOrder , 100 ) ; // a set of urlhashes to be ignored. This is generated from all hashes that are seen during recursion to prevent enless loops
RowHandleSet levelhashes = new RowHandleSet ( URIMetadataRow . rowdef . primaryKeyLength , URIMetadataRow . rowdef . objectOrder , 1 ) ; // all hashes of a clickdepth. The first call contains the target hash only and therefore just one entry
try { levelhashes . put ( searchhash ) ; } catch ( SpaceExceededException e ) { throw new IOException ( e ) ; }
int leveldepth = 0 ; // the recursion depth and therefore the result depth-1. Shall be 0 for the first call
final byte [ ] hosthash = new byte [ 6 ] ; // the host of the url to be checked
System . arraycopy ( searchhash , 6 , hosthash , 0 , 6 ) ;
long timeout = System . currentTimeMillis ( ) + 10000 ;
for ( int maxdepth = 0 ; maxdepth < 10 & & System . currentTimeMillis ( ) < timeout ; maxdepth + + ) {
RowHandleSet checknext = new RowHandleSet ( URIMetadataRow . rowdef . primaryKeyLength , URIMetadataRow . rowdef . objectOrder , 100 ) ;
// loop over all hashes at this clickdepth; the first call to this loop should contain only one hash and a leveldepth = 0
checkloop : for ( byte [ ] urlhash : levelhashes ) {
// get all the citations for this url and iterate
ReferenceContainer < CitationReference > references = citations . get ( urlhash , null ) ;
if ( references = = null | | references . size ( ) = = 0 ) continue checkloop ; // don't know
Iterator < CitationReference > i = references . entries ( ) ;
nextloop : while ( i . hasNext ( ) ) {
CitationReference ref = i . next ( ) ;
if ( ref = = null ) continue nextloop ;
byte [ ] u = ref . urlhash ( ) ;
// check ignore
if ( ignore . has ( u ) ) continue nextloop ;
// check if this is from the same host
if ( ! ByteBuffer . equals ( u , 6 , hosthash , 0 , 6 ) ) continue nextloop ;
// check if the url is a root url
2013-01-03 19:21:21 +01:00
if ( rootCandidates . has ( u ) ) {
2013-01-02 20:55:43 +01:00
return leveldepth + 1 ;
}
// step to next depth level
try { checknext . put ( u ) ; } catch ( SpaceExceededException e ) { }
try { ignore . put ( u ) ; } catch ( SpaceExceededException e ) { }
}
}
leveldepth + + ;
levelhashes = checknext ;
}
return - 1 ;
}
2012-09-29 02:13:11 +02:00
/ * *
* this method compresses a list of protocol names to an indexed list .
* To do this , all ' http ' entries are removed and considered as default .
* The remaining entries are indexed as follows : a list of < i > - < p > entries is produced , where
* < i > is an index pointing to the original index of the protocol entry and < p > is the protocol entry itself .
* The < i > entry is formatted as a 3 - digit decimal number with leading zero digits .
* @param protocol
* @return a list of indexed protocol entries
* /
2012-07-04 21:15:38 +02:00
private static List < String > protocolList2indexedList ( List < String > protocol ) {
2012-02-01 18:12:59 +01:00
List < String > a = new ArrayList < String > ( ) ;
2012-07-04 21:15:38 +02:00
String p ;
for ( int i = 0 ; i < protocol . size ( ) ; i + + ) {
p = protocol . get ( i ) ;
if ( ! p . equals ( " http " ) ) {
2012-02-01 18:12:59 +01:00
String c = Integer . toString ( i ) ;
while ( c . length ( ) < 3 ) c = " 0 " + c ;
2012-07-04 21:15:38 +02:00
a . add ( c + " - " + p ) ;
2012-02-01 18:12:59 +01:00
}
}
2012-07-04 21:15:38 +02:00
return a ;
2012-02-01 18:12:59 +01:00
}
2012-09-29 02:13:11 +02:00
2012-10-18 14:29:11 +02:00
/ * *
* encode a string containing attributes from anchor rel properties binary :
* bit 0 : " me " contained in rel
* bit 1 : " nofollow " contained in rel
* @param rel
* @return binary encoded information about rel
* /
private static List < Integer > relEval ( final List < String > rel ) {
List < Integer > il = new ArrayList < Integer > ( rel . size ( ) ) ;
for ( final String s : rel ) {
int i = 0 ;
final String s0 = s . toLowerCase ( ) . trim ( ) ;
if ( " me " . equals ( s0 ) ) i + = 1 ;
if ( " nofollow " . equals ( s0 ) ) i + = 2 ;
il . add ( i ) ;
}
return il ;
}
2012-10-02 14:29:45 +02:00
2012-05-14 14:56:21 +02:00
/ * *
* register an entry as error document
* @param digestURI
* @param failReason
* @param httpstatus
* @throws IOException
* /
2012-11-23 14:00:30 +01:00
public SolrInputDocument err ( final DigestURI digestURI , final String failReason , final FailType failType , final int httpstatus ) throws IOException {
2012-08-23 09:51:45 +02:00
final SolrInputDocument solrdoc = new SolrInputDocument ( ) ;
2013-02-21 13:23:55 +01:00
add ( solrdoc , CollectionSchema . id , ASCII . String ( digestURI . hash ( ) ) ) ;
add ( solrdoc , CollectionSchema . sku , digestURI . toNormalform ( true ) ) ;
2012-05-14 14:56:21 +02:00
final InetAddress address = digestURI . getInetAddress ( ) ;
2013-02-21 13:23:55 +01:00
if ( contains ( CollectionSchema . ip_s ) & & address ! = null ) add ( solrdoc , CollectionSchema . ip_s , address . getHostAddress ( ) ) ;
if ( contains ( CollectionSchema . host_s ) & & digestURI . getHost ( ) ! = null ) add ( solrdoc , CollectionSchema . host_s , digestURI . getHost ( ) ) ;
2012-05-14 14:56:21 +02:00
// path elements of link
2013-02-21 13:23:55 +01:00
if ( contains ( CollectionSchema . url_paths_sxt ) ) add ( solrdoc , CollectionSchema . url_paths_sxt , digestURI . getPaths ( ) ) ;
if ( contains ( CollectionSchema . url_file_ext_s ) ) add ( solrdoc , CollectionSchema . url_file_ext_s , digestURI . getFileExtension ( ) ) ;
2012-09-11 22:46:39 +02:00
// fail reason and status
2013-02-21 13:23:55 +01:00
if ( contains ( CollectionSchema . failreason_t ) ) add ( solrdoc , CollectionSchema . failreason_t , failReason ) ;
if ( contains ( CollectionSchema . failtype_s ) ) add ( solrdoc , CollectionSchema . failtype_s , failType . name ( ) ) ;
if ( contains ( CollectionSchema . httpstatus_i ) ) add ( solrdoc , CollectionSchema . httpstatus_i , httpstatus ) ;
2012-05-14 14:56:21 +02:00
return solrdoc ;
}
2011-04-14 22:05:04 +02:00
/*
   Field definitions of the standard solr schema (for reference):

   <field name="name" type="textgen" indexed="true" stored="true"/>
   <field name="cat" type="string" indexed="true" stored="true" multiValued="true"/>
   <field name="features" type="text" indexed="true" stored="true" multiValued="true"/>
   <field name="includes" type="text" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true"/>
   <field name="weight" type="float" indexed="true" stored="true"/>
   <field name="price" type="float" indexed="true" stored="true"/>
   <field name="popularity" type="int" indexed="true" stored="true"/>
   <!-- Common metadata fields, named specifically to match up with
        SolrCell metadata when parsing rich documents such as Word, PDF.
        Some fields are multiValued only because Tika currently may return
        multiple values for them.
   -->
   <field name="title" type="text" indexed="true" stored="true" multiValued="true"/>
   <field name="subject" type="text" indexed="true" stored="true"/>
   <field name="description" type="text" indexed="true" stored="true"/>
   <field name="comments" type="text" indexed="true" stored="true"/>
   <field name="author" type="textgen" indexed="true" stored="true"/>
   <field name="keywords" type="textgen" indexed="true" stored="true"/>
   <field name="category" type="textgen" indexed="true" stored="true"/>
   <field name="content_type" type="string" indexed="true" stored="true" multiValued="true"/>
   <field name="last_modified" type="date" indexed="true" stored="true"/>
   <field name="links" type="string" indexed="true" stored="true" multiValued="true"/>
*/
}