/**
 *  Condenser.java
 *  Copyright 2004 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
 *  First released 09.01.2004 at http://yacy.net
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */
2005-04-07 21:19:42 +02:00
2009-10-18 02:53:43 +02:00
package net.yacy.document ;
2005-04-07 21:19:42 +02:00
2005-05-05 07:32:19 +02:00
import java.io.File ;
2006-11-23 03:16:30 +01:00
import java.io.FileInputStream ;
import java.io.FileNotFoundException ;
2005-05-05 07:32:19 +02:00
import java.io.IOException ;
2012-10-02 11:13:06 +02:00
import java.util.ArrayList ;
2005-05-05 07:32:19 +02:00
import java.util.HashMap ;
import java.util.HashSet ;
import java.util.Iterator ;
2012-10-02 11:13:06 +02:00
import java.util.LinkedHashSet ;
import java.util.List ;
2008-10-21 22:19:10 +02:00
import java.util.Locale ;
2005-05-05 07:32:19 +02:00
import java.util.Map ;
2006-11-23 03:16:30 +01:00
import java.util.Properties ;
2010-11-28 03:57:31 +01:00
import java.util.Set ;
import java.util.SortedSet ;
2005-05-05 07:32:19 +02:00
import java.util.TreeMap ;
2012-11-21 18:46:49 +01:00
import org.apache.solr.common.params.MapSolrParams ;
import org.apache.solr.update.processor.Lookup3Signature ;
2011-05-27 10:24:54 +02:00
import net.yacy.cora.document.ASCII ;
2012-09-20 19:38:22 +02:00
import net.yacy.cora.document.WordCache ;
2012-11-21 18:46:49 +01:00
import net.yacy.cora.document.analysis.Classification.ContentDomain ;
import net.yacy.cora.document.analysis.EnhancedTextProfileSignature ;
2010-05-25 14:54:57 +02:00
import net.yacy.cora.document.MultiProtocolURI ;
2012-12-02 16:54:29 +01:00
import net.yacy.cora.federate.solr.Boost ;
2012-10-02 00:02:50 +02:00
import net.yacy.cora.language.synonyms.SynonymLibrary ;
2012-06-11 23:49:30 +02:00
import net.yacy.cora.lod.vocabulary.Tagging ;
2009-10-18 02:53:43 +02:00
import net.yacy.document.language.Identificator ;
import net.yacy.document.parser.html.ImageEntry ;
2009-10-11 02:12:19 +02:00
import net.yacy.kelondro.data.word.Word ;
import net.yacy.kelondro.data.word.WordReferenceRow ;
2009-11-05 21:28:37 +01:00
import net.yacy.kelondro.logging.Log ;
2012-09-21 16:46:57 +02:00
import net.yacy.kelondro.util.Bitfield ;
2009-10-10 03:14:19 +02:00
import net.yacy.kelondro.util.SetTools ;
2009-10-10 01:22:22 +02:00
2005-04-07 21:19:42 +02:00
2009-04-03 15:23:45 +02:00
public final class Condenser {
2011-11-09 15:42:55 +01:00
2006-11-23 03:16:30 +01:00
// this is the page analysis class
2010-06-26 18:31:26 +02:00
public final static boolean pseudostemming = false ; // switch for removal of words that appear in shortened form
public final static int wordminsize = 2 ;
public final static int wordcut = 2 ;
2006-11-23 03:16:30 +01:00
// category flags that show how the page can be distinguished in different interest groups
public static final int flag_cat_indexof = 0 ; // a directory listing page (i.e. containing 'index of')
2011-03-31 11:41:30 +02:00
public static final int flag_cat_haslocation = 19 ; // the page has a location metadata attached
2006-12-01 17:21:17 +01:00
public static final int flag_cat_hasimage = 20 ; // the page refers to (at least one) images
public static final int flag_cat_hasaudio = 21 ; // the page refers to (at least one) audio file
public static final int flag_cat_hasvideo = 22 ; // the page refers to (at least one) videos
public static final int flag_cat_hasapp = 23 ; // the page refers to (at least one) application file
2011-11-09 15:42:55 +01:00
2005-11-07 13:33:02 +01:00
//private Properties analysis;
2011-11-09 15:42:55 +01:00
private final Map < String , Word > words ; // a string (the words) to (indexWord) - relation
2012-06-11 23:49:30 +02:00
private final Map < String , Set < Tagging . Metatag > > tags = new HashMap < String , Set < Tagging . Metatag > > ( ) ; // a set of tags, discovered from Autotagging
2012-10-02 00:02:50 +02:00
private final Set < String > synonyms ; // a set of synonyms to the words
2012-11-21 18:46:49 +01:00
private long fuzzy_signature = 0 , exact_signature = 0 ; // signatures for double-check detection
private String fuzzy_signature_text = null ; // signatures for double-check detection
2012-10-02 00:02:50 +02:00
2005-11-07 13:33:02 +01:00
public int RESULT_NUMB_WORDS = - 1 ;
public int RESULT_DIFF_WORDS = - 1 ;
public int RESULT_NUMB_SENTENCES = - 1 ;
public int RESULT_DIFF_SENTENCES = - 1 ;
2009-01-30 16:33:00 +01:00
public Bitfield RESULT_FLAGS = new Bitfield ( 4 ) ;
2011-11-09 15:42:55 +01:00
private final Identificator languageIdentificator ;
2009-05-29 12:03:35 +02:00
public Condenser (
2009-07-08 23:48:08 +02:00
final Document document ,
2009-05-29 12:03:35 +02:00
final boolean indexText ,
2010-10-18 13:35:09 +02:00
final boolean indexMedia ,
2012-06-04 15:37:39 +02:00
final WordCache meaningLib ,
2012-11-21 18:46:49 +01:00
final SynonymLibrary synlib ,
2012-06-04 15:37:39 +02:00
final boolean doAutotagging
2011-02-13 18:37:28 +01:00
) {
2011-05-06 00:37:06 +02:00
Thread . currentThread ( ) . setName ( " condenser- " + document . dc_identifier ( ) ) ; // for debugging
2006-12-08 03:14:56 +01:00
// if addMedia == true, then all the media links are also parsed and added to the words
2007-11-15 04:03:18 +01:00
// added media words are flagged with the appropriate media flag
2009-08-28 15:28:11 +02:00
this . words = new HashMap < String , Word > ( ) ;
2012-10-02 11:13:06 +02:00
this . synonyms = new LinkedHashSet < String > ( ) ;
2009-01-30 16:33:00 +01:00
this . RESULT_FLAGS = new Bitfield ( 4 ) ;
2009-01-04 03:27:29 +01:00
// construct flag set for document
2012-04-22 02:05:17 +02:00
if ( document . dc_source ( ) . getContentDomain ( ) = = ContentDomain . IMAGE | | ! document . getImages ( ) . isEmpty ( ) ) this . RESULT_FLAGS . set ( flag_cat_hasimage , true ) ;
if ( document . dc_source ( ) . getContentDomain ( ) = = ContentDomain . AUDIO | | ! document . getAudiolinks ( ) . isEmpty ( ) ) this . RESULT_FLAGS . set ( flag_cat_hasaudio , true ) ;
if ( document . dc_source ( ) . getContentDomain ( ) = = ContentDomain . VIDEO | | ! document . getVideolinks ( ) . isEmpty ( ) ) this . RESULT_FLAGS . set ( flag_cat_hasvideo , true ) ;
if ( document . dc_source ( ) . getContentDomain ( ) = = ContentDomain . APP | | ! document . getApplinks ( ) . isEmpty ( ) ) this . RESULT_FLAGS . set ( flag_cat_hasapp , true ) ;
2011-11-09 15:42:55 +01:00
if ( document . lat ( ) ! = 0 . 0f & & document . lon ( ) ! = 0 . 0f ) this . RESULT_FLAGS . set ( flag_cat_haslocation , true ) ;
2008-09-18 15:12:33 +02:00
this . languageIdentificator = new Identificator ( ) ;
2011-11-09 15:42:55 +01:00
2012-07-08 16:48:09 +02:00
// add the URL components to the word list
insertTextToWords ( new SentenceReader ( document . dc_source ( ) . toTokens ( ) ) , 0 , WordReferenceRow . flag_app_dc_identifier , this . RESULT_FLAGS , false , meaningLib ) ;
2011-11-09 15:42:55 +01:00
2010-05-25 14:54:57 +02:00
Map . Entry < MultiProtocolURI , String > entry ;
2006-12-19 04:10:46 +01:00
if ( indexText ) {
2012-07-04 21:15:10 +02:00
createCondensement ( document . getTextString ( ) , meaningLib , doAutotagging ) ;
2006-12-19 04:10:46 +01:00
// the phrase counter:
// phrase 0 are words taken from the URL
2007-03-18 13:33:19 +01:00
// phrase 1 is the MainTitle
// phrase 2 is <not used>
2006-12-19 04:10:46 +01:00
// phrase 3 is the Document Abstract
// phrase 4 is the Document Author
2010-05-11 13:14:05 +02:00
// phrase 5 is the Document Publisher
// phrase 6 are the tags specified in document
2006-12-19 04:10:46 +01:00
// phrase 10 and above are the section headlines/titles (88 possible)
2008-09-28 23:12:26 +02:00
// phrase 98 is taken from the embedded anchor/hyperlinks description (REMOVED!)
2006-12-19 04:10:46 +01:00
// phrase 99 is taken from the media Link url and anchor description
// phrase 100 and above are lines from the text
2012-07-04 21:15:10 +02:00
insertTextToWords ( new SentenceReader ( document . dc_title ( ) ) , 1 , WordReferenceRow . flag_app_dc_title , this . RESULT_FLAGS , true , meaningLib ) ;
insertTextToWords ( new SentenceReader ( document . dc_description ( ) ) , 3 , WordReferenceRow . flag_app_dc_description , this . RESULT_FLAGS , true , meaningLib ) ;
insertTextToWords ( new SentenceReader ( document . dc_creator ( ) ) , 4 , WordReferenceRow . flag_app_dc_creator , this . RESULT_FLAGS , true , meaningLib ) ;
insertTextToWords ( new SentenceReader ( document . dc_publisher ( ) ) , 5 , WordReferenceRow . flag_app_dc_creator , this . RESULT_FLAGS , true , meaningLib ) ;
insertTextToWords ( new SentenceReader ( document . dc_subject ( ' ' ) ) , 6 , WordReferenceRow . flag_app_dc_description , this . RESULT_FLAGS , true , meaningLib ) ;
2006-12-19 04:10:46 +01:00
// missing: tags!
2008-08-02 14:12:04 +02:00
final String [ ] titles = document . getSectionTitles ( ) ;
2006-12-19 04:10:46 +01:00
for ( int i = 0 ; i < titles . length ; i + + ) {
2012-07-04 21:15:10 +02:00
insertTextToWords ( new SentenceReader ( titles [ i ] ) , i + 10 , WordReferenceRow . flag_app_emphasized , this . RESULT_FLAGS , true , meaningLib ) ;
2006-12-19 04:10:46 +01:00
}
2011-11-09 15:42:55 +01:00
2008-09-21 22:25:47 +02:00
// anchors: for text indexing we add only the anchor description
2008-09-28 23:12:26 +02:00
// REMOVED! Reason:
// words from the anchor description should appear as normal text in the output from the parser
// to flag these words as appearance in dc_description would confuse, since the user expects such word as titles of
// pages that are shown in the search result. The words from the URLS should also not appear as part of the index, because they
// are not visible in the text and could be used to crate fake-content
/ *
2008-08-02 14:12:04 +02:00
final Iterator < Map . Entry < yacyURL , String > > i = document . getAnchors ( ) . entrySet ( ) . iterator ( ) ;
2006-12-19 04:10:46 +01:00
while ( i . hasNext ( ) ) {
2008-01-22 12:51:43 +01:00
entry = i . next ( ) ;
2008-01-22 20:03:47 +01:00
if ( ( entry = = null ) | | ( entry . getKey ( ) = = null ) ) continue ;
2008-09-21 22:25:47 +02:00
insertTextToWords ( entry . getValue ( ) , 98 , indexRWIEntry . flag_app_dc_description , RESULT_FLAGS , true ) ;
2006-12-19 04:10:46 +01:00
}
2008-09-28 23:12:26 +02:00
* /
2006-12-19 04:10:46 +01:00
} else {
this . RESULT_NUMB_WORDS = 0 ;
this . RESULT_DIFF_WORDS = 0 ;
this . RESULT_NUMB_SENTENCES = 0 ;
this . RESULT_DIFF_SENTENCES = 0 ;
2006-12-08 03:14:56 +01:00
}
2011-11-09 15:42:55 +01:00
2006-12-19 04:10:46 +01:00
if ( indexMedia ) {
2008-09-21 22:25:47 +02:00
// add anchor descriptions: here, we also add the url components
2006-12-19 04:10:46 +01:00
// audio
2010-05-25 14:54:57 +02:00
Iterator < Map . Entry < MultiProtocolURI , String > > i = document . getAudiolinks ( ) . entrySet ( ) . iterator ( ) ;
2006-12-19 04:10:46 +01:00
while ( i . hasNext ( ) ) {
2008-01-22 12:51:43 +01:00
entry = i . next ( ) ;
2012-10-10 11:46:22 +02:00
insertTextToWords ( new SentenceReader ( entry . getKey ( ) . toNormalform ( true ) ) , 99 , flag_cat_hasaudio , this . RESULT_FLAGS , false , meaningLib ) ;
2012-07-04 21:15:10 +02:00
insertTextToWords ( new SentenceReader ( entry . getValue ( ) ) , 99 , flag_cat_hasaudio , this . RESULT_FLAGS , true , meaningLib ) ;
2006-12-19 04:10:46 +01:00
}
2006-12-08 03:14:56 +01:00
2006-12-19 04:10:46 +01:00
// video
i = document . getVideolinks ( ) . entrySet ( ) . iterator ( ) ;
while ( i . hasNext ( ) ) {
2008-01-22 12:51:43 +01:00
entry = i . next ( ) ;
2012-10-10 11:46:22 +02:00
insertTextToWords ( new SentenceReader ( entry . getKey ( ) . toNormalform ( true ) ) , 99 , flag_cat_hasvideo , this . RESULT_FLAGS , false , meaningLib ) ;
2012-07-04 21:15:10 +02:00
insertTextToWords ( new SentenceReader ( entry . getValue ( ) ) , 99 , flag_cat_hasvideo , this . RESULT_FLAGS , true , meaningLib ) ;
2006-12-19 04:10:46 +01:00
}
2006-12-08 03:14:56 +01:00
2006-12-19 04:10:46 +01:00
// applications
i = document . getApplinks ( ) . entrySet ( ) . iterator ( ) ;
while ( i . hasNext ( ) ) {
2008-01-22 12:51:43 +01:00
entry = i . next ( ) ;
2012-10-10 11:46:22 +02:00
insertTextToWords ( new SentenceReader ( entry . getKey ( ) . toNormalform ( true ) ) , 99 , flag_cat_hasapp , this . RESULT_FLAGS , false , meaningLib ) ;
2012-07-04 21:15:10 +02:00
insertTextToWords ( new SentenceReader ( entry . getValue ( ) ) , 99 , flag_cat_hasapp , this . RESULT_FLAGS , true , meaningLib ) ;
2006-12-19 04:10:46 +01:00
}
2006-12-08 03:14:56 +01:00
2006-12-19 04:10:46 +01:00
// images
2009-07-08 23:48:08 +02:00
final Iterator < ImageEntry > j = document . getImages ( ) . values ( ) . iterator ( ) ;
ImageEntry ientry ;
2011-03-22 10:34:10 +01:00
MultiProtocolURI url ;
2008-01-22 12:51:43 +01:00
while ( j . hasNext ( ) ) {
ientry = j . next ( ) ;
2011-03-22 10:34:10 +01:00
url = ientry . url ( ) ;
if ( url = = null ) continue ;
2012-10-10 11:46:22 +02:00
insertTextToWords ( new SentenceReader ( url . toNormalform ( true ) ) , 99 , flag_cat_hasimage , this . RESULT_FLAGS , false , meaningLib ) ;
2012-07-04 21:15:10 +02:00
insertTextToWords ( new SentenceReader ( ientry . alt ( ) ) , 99 , flag_cat_hasimage , this . RESULT_FLAGS , true , meaningLib ) ;
2006-12-19 04:10:46 +01:00
}
2011-11-09 15:42:55 +01:00
2006-12-19 04:10:46 +01:00
// finally check all words for missing flag entry
2011-11-09 15:42:55 +01:00
final Iterator < Map . Entry < String , Word > > k = this . words . entrySet ( ) . iterator ( ) ;
2009-03-02 00:58:14 +01:00
Word wprop ;
Map . Entry < String , Word > we ;
2008-01-22 12:51:43 +01:00
while ( k . hasNext ( ) ) {
we = k . next ( ) ;
wprop = we . getValue ( ) ;
2006-12-19 04:10:46 +01:00
if ( wprop . flags = = null ) {
2011-11-09 15:42:55 +01:00
wprop . flags = this . RESULT_FLAGS . clone ( ) ;
this . words . put ( we . getKey ( ) , wprop ) ;
2006-12-19 04:10:46 +01:00
}
2006-12-08 03:14:56 +01:00
}
}
2012-01-15 22:17:57 +01:00
// extend the tags in the document object with autotagging tags
if ( ! this . tags . isEmpty ( ) ) {
2012-06-11 16:48:53 +02:00
document . addMetatags ( this . tags ) ;
2012-01-15 22:17:57 +01:00
}
2012-11-21 18:46:49 +01:00
if ( synlib ! = null ) {
for ( String word : this . words . keySet ( ) ) {
Set < String > syms = synlib . getSynonyms ( word ) ;
if ( syms ! = null ) this . synonyms . addAll ( syms ) ;
}
}
String text = document . getTextString ( ) ;
2012-10-02 00:02:50 +02:00
// create the synonyms set
2012-10-02 11:13:06 +02:00
if ( synonyms ! = null ) {
2012-10-02 00:02:50 +02:00
for ( String word : this . words . keySet ( ) ) {
2012-11-21 18:46:49 +01:00
Set < String > syms = synlib . getSynonyms ( word ) ;
2012-10-02 00:02:50 +02:00
if ( syms ! = null ) this . synonyms . addAll ( syms ) ;
}
}
2012-11-21 18:46:49 +01:00
// create hashes for duplicate detection
// check dups with http://localhost:8090/solr/select?q=*:*&start=0&rows=3&fl=sku,fuzzy_signature_text_t,fuzzy_signature_l,fuzzy_signature_unique_b
EnhancedTextProfileSignature fuzzySignatureFactory = new EnhancedTextProfileSignature ( ) ;
Map < String , String > sp = new HashMap < String , String > ( ) ;
2012-12-02 16:54:29 +01:00
sp . put ( " quantRate " , Float . toString ( Boost . RANKING . getQuantRate ( ) ) ) ; // for minTokenLen = 2 the value should not be below 0.24; for minTokenLen = 3 the value must be not below 0.5!
sp . put ( " minTokenLen " , Integer . toString ( Boost . RANKING . getMinTokenLen ( ) ) ) ;
2012-11-21 18:46:49 +01:00
fuzzySignatureFactory . init ( new MapSolrParams ( sp ) ) ;
fuzzySignatureFactory . add ( text ) ;
byte [ ] fuzzy_signature_hash = fuzzySignatureFactory . getSignature ( ) ;
long l = 0 ; for ( int i = 0 ; i < 8 ; i + + ) l = ( l < < 8 ) + ( fuzzy_signature_hash [ i ] & 0xff ) ;
this . fuzzy_signature = l ;
this . fuzzy_signature_text = fuzzySignatureFactory . getSignatureText ( ) . toString ( ) ;
Lookup3Signature exactSignatureFactory = new Lookup3Signature ( ) ;
exactSignatureFactory . add ( text ) ;
byte [ ] exact_signature_hash = exactSignatureFactory . getSignature ( ) ;
l = 0 ; for ( int i = 0 ; i < 8 ; i + + ) l = ( l < < 8 ) + ( exact_signature_hash [ i ] & 0xff ) ;
this . exact_signature = l ;
}
2012-12-18 02:29:03 +01:00
private Condenser ( final String text , final WordCache meaningLib , boolean doAutotagging ) {
2012-11-21 18:46:49 +01:00
this . languageIdentificator = null ; // we don't need that here
// analysis = new Properties();
this . words = new TreeMap < String , Word > ( ) ;
this . synonyms = new HashSet < String > ( ) ;
createCondensement ( text , meaningLib , doAutotagging ) ;
2006-12-01 17:21:17 +01:00
}
2011-11-09 15:42:55 +01:00
2010-10-18 13:35:09 +02:00
private void insertTextToWords (
2012-07-04 21:15:10 +02:00
final SentenceReader text ,
2010-10-18 13:35:09 +02:00
final int phrase ,
final int flagpos ,
final Bitfield flagstemplate ,
2010-11-28 03:57:31 +01:00
final boolean useForLanguageIdentification ,
2011-02-12 01:01:40 +01:00
final WordCache meaningLib ) {
2010-12-28 00:57:29 +01:00
if ( text = = null ) return ;
2006-12-08 03:14:56 +01:00
String word ;
2009-03-02 00:58:14 +01:00
Word wprop ;
2012-07-04 21:15:10 +02:00
WordTokenizer wordenum = new WordTokenizer ( text , meaningLib ) ;
2011-11-30 12:15:54 +01:00
try {
int pip = 0 ;
while ( wordenum . hasMoreElements ( ) ) {
word = ( wordenum . nextElement ( ) . toString ( ) ) . toLowerCase ( Locale . ENGLISH ) ;
if ( useForLanguageIdentification ) this . languageIdentificator . add ( word ) ;
if ( word . length ( ) < 2 ) continue ;
wprop = this . words . get ( word ) ;
if ( wprop = = null ) wprop = new Word ( 0 , pip , phrase ) ;
if ( wprop . flags = = null ) wprop . flags = flagstemplate . clone ( ) ;
wprop . flags . set ( flagpos , true ) ;
this . words . put ( word , wprop ) ;
pip + + ;
this . RESULT_NUMB_WORDS + + ;
this . RESULT_DIFF_WORDS + + ;
}
} finally {
wordenum . close ( ) ;
2006-12-08 03:14:56 +01:00
}
}
2010-11-28 03:57:31 +01:00
public int excludeWords ( final SortedSet < String > stopwords ) {
2005-04-07 21:19:42 +02:00
// subtracts the given stopwords from the word list
// the word list shrinkes. This returns the number of shrinked words
2011-11-09 15:42:55 +01:00
final int oldsize = this . words . size ( ) ;
SetTools . excludeDestructive ( this . words , stopwords ) ;
return oldsize - this . words . size ( ) ;
2005-04-07 21:19:42 +02:00
}
2009-03-02 00:58:14 +01:00
public Map < String , Word > words ( ) {
2008-03-26 16:37:49 +01:00
// returns the words as word/indexWord relation map
2011-11-09 15:42:55 +01:00
return this . words ;
2005-04-07 21:19:42 +02:00
}
2012-10-02 00:02:50 +02:00
2012-10-02 11:13:06 +02:00
public List < String > synonyms ( ) {
ArrayList < String > l = new ArrayList < String > ( this . synonyms . size ( ) ) ;
for ( String s : this . synonyms ) l . add ( s ) ;
return l ;
2012-10-02 00:02:50 +02:00
}
2011-11-09 15:42:55 +01:00
2012-11-21 18:46:49 +01:00
public long fuzzySignature ( ) {
return this . fuzzy_signature ;
}
public String fuzzySignatureText ( ) {
return this . fuzzy_signature_text ;
}
public long exactSignature ( ) {
return this . exact_signature ;
}
2008-09-18 15:12:33 +02:00
public String language ( ) {
return this . languageIdentificator . getLanguage ( ) ;
}
2006-01-19 15:13:39 +01:00
2012-07-04 21:15:10 +02:00
private void createCondensement ( final String text , final WordCache meaningLib , boolean doAutotagging ) {
assert text ! = null ;
2010-11-28 03:57:31 +01:00
final Set < String > currsentwords = new HashSet < String > ( ) ;
2006-01-19 13:24:35 +01:00
String word = " " ;
2012-06-16 19:40:27 +02:00
String [ ] wordcache = new String [ LibraryProvider . autotagging . getMaxWordsInTerm ( ) - 1 ] ;
for ( int i = 0 ; i < wordcache . length ; i + + ) wordcache [ i ] = " " ;
2012-06-11 16:48:53 +02:00
String k ;
2012-06-11 23:49:30 +02:00
Tagging . Metatag tag ;
2006-01-19 13:24:35 +01:00
int wordlen ;
2011-11-09 15:42:55 +01:00
Word wsp ;
final Word wsp1 ;
2006-01-19 13:24:35 +01:00
int wordHandle ;
int wordHandleCount = 0 ;
2011-11-09 15:42:55 +01:00
final int sentenceHandleCount = 0 ;
2006-01-19 13:24:35 +01:00
int allwordcounter = 0 ;
2011-11-09 15:42:55 +01:00
final int allsentencecounter = 0 ;
2006-01-19 15:13:39 +01:00
int wordInSentenceCounter = 1 ;
2006-11-28 16:00:15 +01:00
boolean comb_indexof = false , last_last = false , last_index = false ;
2010-11-28 03:57:31 +01:00
final Map < StringBuilder , Phrase > sentences = new HashMap < StringBuilder , Phrase > ( 100 ) ;
2012-07-10 22:59:03 +02:00
if ( LibraryProvider . autotagging . isEmpty ( ) ) doAutotagging = false ;
2012-07-08 16:48:09 +02:00
2006-01-19 13:24:35 +01:00
// read source
2012-07-04 21:15:10 +02:00
final WordTokenizer wordenum = new WordTokenizer ( new SentenceReader ( text ) , meaningLib ) ;
2011-11-30 12:15:54 +01:00
try {
while ( wordenum . hasMoreElements ( ) ) {
word = wordenum . nextElement ( ) . toString ( ) . toLowerCase ( Locale . ENGLISH ) ;
if ( this . languageIdentificator ! = null ) this . languageIdentificator . add ( word ) ;
if ( word . length ( ) < wordminsize ) continue ;
2012-01-13 11:24:08 +01:00
2012-01-15 22:17:57 +01:00
// get tags from autotagging
2012-06-04 15:37:39 +02:00
if ( doAutotagging ) {
2012-06-16 19:40:27 +02:00
for ( int wordc = 1 ; wordc < = wordcache . length + 1 ; wordc + + ) {
// wordc is number of words that are tested
StringBuilder sb = new StringBuilder ( ) ;
if ( wordc = = 1 ) {
sb . append ( word ) ;
} else {
for ( int w = 0 ; w < wordc - 1 ; w + + ) {
sb . append ( wordcache [ wordcache . length - wordc + w + 1 ] ) . append ( ' ' ) ;
}
sb . append ( word ) ;
}
String testterm = sb . toString ( ) . trim ( ) ;
//System.out.println("Testing: " + testterm);
tag = LibraryProvider . autotagging . getTagFromTerm ( testterm ) ;
if ( tag ! = null ) {
2012-12-18 02:29:03 +01:00
String navigatorName = tag . getVocabularyName ( ) ;
Set < Tagging . Metatag > tagset = this . tags . get ( navigatorName ) ;
2012-06-16 19:40:27 +02:00
if ( tagset = = null ) {
tagset = new HashSet < Tagging . Metatag > ( ) ;
2012-12-18 02:29:03 +01:00
this . tags . put ( navigatorName , tagset ) ;
2012-06-16 19:40:27 +02:00
}
2012-06-11 23:49:30 +02:00
tagset . add ( tag ) ;
2012-06-16 19:40:27 +02:00
}
}
2012-06-04 15:37:39 +02:00
}
2012-06-16 19:40:27 +02:00
// shift wordcache
System . arraycopy ( wordcache , 1 , wordcache , 0 , wordcache . length - 1 ) ;
wordcache [ wordcache . length - 1 ] = word ;
2012-01-15 22:17:57 +01:00
2011-11-30 12:15:54 +01:00
// distinguish punctuation and words
wordlen = word . length ( ) ;
if ( wordlen = = 1 & & SentenceReader . punctuation ( word . charAt ( 0 ) ) ) {
// store sentence
currsentwords . clear ( ) ;
wordInSentenceCounter = 1 ;
} else {
// check index.of detection
if ( last_last & & comb_indexof & & word . equals ( " modified " ) ) {
this . RESULT_FLAGS . set ( flag_cat_indexof , true ) ;
wordenum . pre ( true ) ; // parse lines as they come with CRLF
}
if ( last_index & & ( wordminsize > 2 | | word . equals ( " of " ) ) ) comb_indexof = true ;
last_last = word . equals ( " last " ) ;
last_index = word . equals ( " index " ) ;
2012-01-13 11:24:08 +01:00
2011-11-30 12:15:54 +01:00
// store word
allwordcounter + + ;
currsentwords . add ( word ) ;
wsp = this . words . get ( word ) ;
if ( wsp ! = null ) {
// word already exists
wordHandle = wsp . posInText ;
wsp . inc ( ) ;
} else {
// word does not yet exist, create new word entry
wordHandle = wordHandleCount + + ;
wsp = new Word ( wordHandle , wordInSentenceCounter , sentences . size ( ) + 100 ) ;
wsp . flags = this . RESULT_FLAGS . clone ( ) ;
this . words . put ( word , wsp ) ;
}
// we now have the unique handle of the word, put it into the sentence:
wordInSentenceCounter + + ;
}
}
} finally {
wordenum . close ( ) ;
2006-01-19 13:24:35 +01:00
}
2005-04-07 21:19:42 +02:00
2009-11-24 12:13:11 +01:00
if ( pseudostemming ) {
Map . Entry < String , Word > entry ;
// we search for similar words and reorganize the corresponding sentences
// a word is similar, if a shortened version is equal
2011-11-09 15:42:55 +01:00
final Iterator < Map . Entry < String , Word > > wi = this . words . entrySet ( ) . iterator ( ) ; // enumerates the keys in descending order
2009-11-24 12:13:11 +01:00
wordsearch : while ( wi . hasNext ( ) ) {
entry = wi . next ( ) ;
word = entry . getKey ( ) ;
wordlen = word . length ( ) ;
wsp = entry . getValue ( ) ;
for ( int i = wordcut ; i > 0 ; i - - ) {
if ( wordlen > i ) {
k = word . substring ( 0 , wordlen - i ) ;
2011-11-09 15:42:55 +01:00
if ( this . words . containsKey ( k ) ) {
2009-11-24 12:13:11 +01:00
// update word counter
wsp1 . count = wsp1 . count + wsp . count ;
2011-11-09 15:42:55 +01:00
this . words . put ( k , wsp1 ) ;
2009-11-24 12:13:11 +01:00
// remove current word
wi . remove ( ) ;
continue wordsearch ;
2006-01-19 13:24:35 +01:00
}
}
}
}
}
// store result
2006-11-28 16:00:15 +01:00
//this.RESULT_NUMB_TEXT_BYTES = wordenum.count();
2006-01-19 13:24:35 +01:00
this . RESULT_NUMB_WORDS = allwordcounter ;
this . RESULT_DIFF_WORDS = wordHandleCount ;
this . RESULT_NUMB_SENTENCES = allsentencecounter ;
this . RESULT_DIFF_SENTENCES = sentenceHandleCount ;
2005-04-07 21:19:42 +02:00
}
2011-11-09 15:42:55 +01:00
2011-02-12 01:01:40 +01:00
public static Map < String , Word > getWords ( final String text , final WordCache meaningLib ) {
2008-03-26 16:37:49 +01:00
// returns a word/indexWord relation map
2006-12-08 03:14:56 +01:00
if ( text = = null ) return null ;
2012-07-04 21:15:10 +02:00
return new Condenser ( text , meaningLib , false ) . words ( ) ;
2005-04-15 16:18:14 +02:00
}
2011-11-09 15:42:55 +01:00
2008-08-02 14:12:04 +02:00
public static void main ( final String [ ] args ) {
2008-01-08 21:12:31 +01:00
// read a property file and convert them into configuration lines
2006-11-23 03:16:30 +01:00
try {
2008-08-02 14:12:04 +02:00
final File f = new File ( args [ 0 ] ) ;
final Properties p = new Properties ( ) ;
2006-11-23 03:16:30 +01:00
p . load ( new FileInputStream ( f ) ) ;
2008-12-04 13:54:16 +01:00
final StringBuilder sb = new StringBuilder ( ) ;
2006-11-23 03:16:30 +01:00
sb . append ( " { \ n " ) ;
for ( int i = 0 ; i < = 15 ; i + + ) {
sb . append ( '"' ) ;
2008-08-02 14:12:04 +02:00
final String s = p . getProperty ( " keywords " + i ) ;
final String [ ] l = s . split ( " , " ) ;
2011-11-09 15:42:55 +01:00
for ( final String element : l ) {
sb . append ( ASCII . String ( Word . word2hash ( element ) ) ) ;
2006-11-23 03:16:30 +01:00
}
if ( i < 15 ) sb . append ( " , \ n " ) ;
}
sb . append ( " } \ n " ) ;
2011-03-07 21:36:40 +01:00
System . out . println ( sb . toString ( ) ) ;
2008-08-02 14:12:04 +02:00
} catch ( final FileNotFoundException e ) {
2009-11-05 21:28:37 +01:00
Log . logException ( e ) ;
2008-08-02 14:12:04 +02:00
} catch ( final IOException e ) {
2009-11-05 21:28:37 +01:00
Log . logException ( e ) ;
2006-11-23 03:16:30 +01:00
}
2011-11-09 15:42:55 +01:00
2005-04-07 21:19:42 +02:00
}
}