mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
90f75c8c3d
during surrogate reading: those attributes from the dump are removed during the import process and replaced by new detected attributes according to the setting of the YaCy peer. This may cause that all such attributes are removed if the importing peer has no synonyms and/or no vocabularies defined.
246 lines
12 KiB
Java
246 lines
12 KiB
Java
/**
|
|
* Annotation.java
|
|
* Copyright 2004 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
|
|
* First released 09.01.2004 at http://yacy.net
|
|
*
|
|
* This library is free software; you can redistribute it and/or
|
|
 * modify it under the terms of the GNU Lesser General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 2.1 of the License, or (at your option) any later version.
|
|
*
|
|
* This library is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
 * Lesser General Public License for more details.
|
|
*
|
|
 * You should have received a copy of the GNU Lesser General Public License
|
|
* along with this program in the file lgpl21.txt
|
|
* If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
package net.yacy.document;
|
|
|
|
import java.io.IOException;
|
|
import java.util.ArrayList;
|
|
import java.util.HashMap;
|
|
import java.util.HashSet;
|
|
import java.util.Iterator;
|
|
import java.util.LinkedHashSet;
|
|
import java.util.List;
|
|
import java.util.Locale;
|
|
import java.util.Map;
|
|
import java.util.Set;
|
|
import java.util.TreeMap;
|
|
|
|
import net.yacy.cora.document.WordCache;
|
|
import net.yacy.cora.document.id.DigestURL;
|
|
import net.yacy.cora.language.synonyms.SynonymLibrary;
|
|
import net.yacy.cora.lod.vocabulary.Tagging;
|
|
import net.yacy.cora.order.NaturalOrder;
|
|
import net.yacy.kelondro.data.word.Word;
|
|
import net.yacy.kelondro.util.Bitfield;
|
|
|
|
public class Tokenizer {
|
|
|
|
// this is the page analysis class
|
|
public final static boolean pseudostemming = false; // switch for removal of words that appear in shortened form
|
|
public final static int wordminsize = 2;
|
|
public final static int wordcut = 2;
|
|
|
|
// category flags that show how the page can be distinguished in different interest groups
|
|
public static final int flag_cat_indexof = 0; // a directory listing page (i.e. containing 'index of')
|
|
public static final int flag_cat_haslocation = 19; // the page has a location metadata attached
|
|
public static final int flag_cat_hasimage = 20; // the page refers to (at least one) images
|
|
public static final int flag_cat_hasaudio = 21; // the page refers to (at least one) audio file
|
|
public static final int flag_cat_hasvideo = 22; // the page refers to (at least one) videos
|
|
public static final int flag_cat_hasapp = 23; // the page refers to (at least one) application file
|
|
|
|
//private Properties analysis;
|
|
protected final Map<String, Word> words; // a string (the words) to (indexWord) - relation
|
|
private final Set<String> synonyms; // a set of synonyms to the words
|
|
protected final Map<String, Set<Tagging.Metatag>> tags = new HashMap<String, Set<Tagging.Metatag>>(); // a set of tags, discovered from Autotagging
|
|
|
|
public int RESULT_NUMB_WORDS = -1;
|
|
public int RESULT_NUMB_SENTENCES = -1;
|
|
public Bitfield RESULT_FLAGS = new Bitfield(4);
|
|
|
|
public Tokenizer(final DigestURL root, final String text, final WordCache meaningLib, boolean doAutotagging, final VocabularyScraper scraper) {
|
|
this.words = new TreeMap<String, Word>(NaturalOrder.naturalComparator);
|
|
this.synonyms = new LinkedHashSet<String>();
|
|
assert text != null;
|
|
final Set<String> currsentwords = new HashSet<String>();
|
|
String[] wordcache = new String[LibraryProvider.autotagging.getMaxWordsInTerm() - 1];
|
|
for (int i = 0; i < wordcache.length; i++) wordcache[i] = "";
|
|
String k;
|
|
Tagging.Metatag tag;
|
|
int wordlen;
|
|
int wordHandle;
|
|
int wordHandleCount = 0;
|
|
//final int sentenceHandleCount = 0;
|
|
int allwordcounter = 0;
|
|
final int allsentencecounter = 0;
|
|
int wordInSentenceCounter = 1;
|
|
boolean comb_indexof = false, last_last = false, last_index = false;
|
|
//final Map<StringBuilder, Phrase> sentences = new HashMap<StringBuilder, Phrase>(100);
|
|
if (LibraryProvider.autotagging.isEmpty()) doAutotagging = false;
|
|
|
|
// read source
|
|
WordTokenizer wordenum = new WordTokenizer(new SentenceReader(text), meaningLib);
|
|
try {
|
|
while (wordenum.hasMoreElements()) {
|
|
String word = wordenum.nextElement().toString().toLowerCase(Locale.ENGLISH);
|
|
if (word.length() < wordminsize) continue;
|
|
|
|
// get tags from autotagging
|
|
if (doAutotagging) {
|
|
Set<String> vocabularyNames = LibraryProvider.autotagging.getVocabularyNames();
|
|
//Collection<Tagging> vocabularies = LibraryProvider.autotagging.getVocabularies();
|
|
//assert vocabularyNames.size() == vocabularies.size();
|
|
Map<String, String> vocMap = scraper == null ? null : scraper.removeVocMap(root);
|
|
if (vocMap != null && vocMap.size() > 0) {
|
|
for (Map.Entry<String, String> entry: vocMap.entrySet()) {
|
|
String navigatorName = entry.getKey();
|
|
String term = entry.getValue();
|
|
vocabularyNames.remove(navigatorName); // prevent that this is used again for auto-annotation
|
|
Tagging vocabulary = LibraryProvider.autotagging.getVocabulary(navigatorName);
|
|
if (vocabulary != null) {
|
|
// extend the vocabulary
|
|
String obj = vocabulary.getObjectlink(term);
|
|
if (obj == null) try {vocabulary.put(term, "", root.toNormalform(true));} catch (IOException e) {} // this makes IO, be careful!
|
|
// create annotation
|
|
tag = vocabulary.getMetatagFromTerm(term);
|
|
Set<Tagging.Metatag> tagset = new HashSet<>();
|
|
tagset.add(tag);
|
|
this.tags.put(navigatorName, tagset);
|
|
}
|
|
}
|
|
}
|
|
if (vocabularyNames.size() > 0) for (int wordc = 1; wordc <= wordcache.length + 1; wordc++) {
|
|
// wordc is number of words that are tested
|
|
StringBuilder sb = new StringBuilder();
|
|
if (wordc == 1) {
|
|
sb.append(word);
|
|
} else {
|
|
for (int w = 0; w < wordc - 1; w++) {
|
|
sb.append(wordcache[wordcache.length - wordc + w + 1]).append(' ');
|
|
}
|
|
sb.append(word);
|
|
}
|
|
String testterm = sb.toString().trim();
|
|
//System.out.println("Testing: " + testterm);
|
|
tag = LibraryProvider.autotagging.getTagFromTerm(vocabularyNames, testterm);
|
|
if (tag != null) {
|
|
String navigatorName = tag.getVocabularyName();
|
|
Set<Tagging.Metatag> tagset = this.tags.get(navigatorName);
|
|
if (tagset == null) {
|
|
tagset = new HashSet<Tagging.Metatag>();
|
|
this.tags.put(navigatorName, tagset);
|
|
}
|
|
tagset.add(tag);
|
|
}
|
|
}
|
|
}
|
|
// shift wordcache
|
|
System.arraycopy(wordcache, 1, wordcache, 0, wordcache.length - 1);
|
|
wordcache[wordcache.length - 1] = word;
|
|
|
|
// distinguish punctuation and words
|
|
wordlen = word.length();
|
|
if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) { // TODO: wordlen == 1 never true (see earlier if < wordminsize )
|
|
// store sentence
|
|
currsentwords.clear();
|
|
wordInSentenceCounter = 1;
|
|
} else {
|
|
// check index.of detection
|
|
if (last_last && comb_indexof && word.equals("modified")) {
|
|
this.RESULT_FLAGS.set(flag_cat_indexof, true);
|
|
wordenum.pre(true); // parse lines as they come with CRLF
|
|
}
|
|
if (last_index && (wordminsize > 2 || word.equals("of"))) comb_indexof = true;
|
|
last_last = word.equals("last");
|
|
last_index = word.equals("index");
|
|
|
|
// store word
|
|
allwordcounter++;
|
|
currsentwords.add(word);
|
|
Word wsp = this.words.get(word);
|
|
if (wsp != null) {
|
|
// word already exists
|
|
wordHandle = wsp.posInText;
|
|
wsp.inc();
|
|
} else {
|
|
// word does not yet exist, create new word entry
|
|
wordHandle = wordHandleCount++;
|
|
wsp = new Word(wordHandle, wordInSentenceCounter, /* sentences.size() + */ 100);
|
|
wsp.flags = this.RESULT_FLAGS.clone();
|
|
this.words.put(word.toLowerCase(), wsp);
|
|
}
|
|
// we now have the unique handle of the word, put it into the sentence:
|
|
wordInSentenceCounter++;
|
|
}
|
|
}
|
|
} finally {
|
|
wordenum.close();
|
|
wordenum = null;
|
|
}
|
|
|
|
if (pseudostemming) {
|
|
// we search for similar words and reorganize the corresponding sentences
|
|
// a word is similar, if a shortened version is equal
|
|
Iterator<Map.Entry<String, Word>> wi = this.words.entrySet().iterator(); // enumerates the keys in descending order?
|
|
Map.Entry<String, Word> entry;
|
|
wordsearch: while (wi.hasNext()) {
|
|
entry = wi.next();
|
|
String word = entry.getKey();
|
|
wordlen = word.length();
|
|
Word wsp = entry.getValue();
|
|
for (int i = wordcut; i > 0; i--) {
|
|
if (wordlen > i) {
|
|
k = word.substring(0, wordlen - i);
|
|
Word wsp1 = this.words.get(k);
|
|
if (wsp1 != null) {
|
|
wsp1.count = wsp1.count + wsp.count; // update word counter
|
|
wi.remove(); // remove current word
|
|
continue wordsearch;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// create the synonyms set
|
|
if (SynonymLibrary.size() > 0) {
|
|
for (String word: this.words.keySet()) {
|
|
Set<String> syms = SynonymLibrary.getSynonyms(word);
|
|
if (syms != null) this.synonyms.addAll(syms);
|
|
}
|
|
}
|
|
|
|
// store result
|
|
this.RESULT_NUMB_WORDS = allwordcounter;
|
|
this.RESULT_NUMB_SENTENCES = allsentencecounter;
|
|
}
|
|
|
|
public Map<String, Word> words() {
|
|
// returns the words as word/indexWord relation map
|
|
return this.words;
|
|
}
|
|
|
|
public static Map<String, Word> getWords(final String text, final WordCache meaningLib) {
|
|
// returns a word/indexWord relation map
|
|
if (text == null) return null;
|
|
return new Tokenizer(null, text, meaningLib, false, null).words();
|
|
}
|
|
|
|
public List<String> synonyms() {
|
|
ArrayList<String> l = new ArrayList<String>(this.synonyms.size());
|
|
for (String s: this.synonyms) l.add(s);
|
|
return l;
|
|
}
|
|
|
|
public Map<String, Set<Tagging.Metatag>> tags() {
|
|
return this.tags;
|
|
}
|
|
|
|
}
|