adjust Tokenizer sentence count to ignore repeated punctuation (like !!!!)

+ remove unused sentenceword map (we use only the count)
+ update test case for sentence count
This commit is contained in:
reger 2016-10-06 03:41:07 +02:00
parent b5eb7a9217
commit ae3717d087
2 changed files with 25 additions and 5 deletions

View File

@ -68,7 +68,6 @@ public class Tokenizer {
this.words = new TreeMap<String, Word>(NaturalOrder.naturalComparator);
this.synonyms = new LinkedHashSet<String>();
assert text != null;
final Set<String> currsentwords = new HashSet<String>();
String[] wordcache = new String[LibraryProvider.autotagging.getMaxWordsInTerm() - 1];
for (int i = 0; i < wordcache.length; i++) wordcache[i] = "";
String k;
@ -89,9 +88,9 @@ public class Tokenizer {
// handle punktuation (start new sentence)
if (word.length() == 1 && SentenceReader.punctuation(word.charAt(0))) {
// store sentence
currsentwords.clear();
if (wordInSentenceCounter > 1) // if no word in sentence repeated punktuation ".....", don't count as sentence
allsentencecounter++;
wordInSentenceCounter = 1;
allsentencecounter++;
continue;
}
if (word.length() < wordminsize) continue;
@ -160,7 +159,6 @@ public class Tokenizer {
// store word
allwordcounter++;
currsentwords.add(word);
Word wsp = this.words.get(word);
if (wsp != null) {
// word already exists
@ -214,7 +212,7 @@ public class Tokenizer {
// store result
this.RESULT_NUMB_WORDS = allwordcounter;
// if text doesn't end with punktuation but has words after last found sentence, inc sentence count for trailing text.
this.RESULT_NUMB_SENTENCES = allsentencecounter + (currsentwords.size() > 0 ? 1 : 0);
this.RESULT_NUMB_SENTENCES = allsentencecounter + (wordInSentenceCounter > 1 ? 1 : 0);
}
public Map<String, Word> words() {

View File

@ -2,7 +2,9 @@
package net.yacy.document;
import java.net.MalformedURLException;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import net.yacy.cora.document.WordCache;
import net.yacy.kelondro.data.word.Word;
import org.junit.Test;
@ -36,4 +38,24 @@ public class TokenizerTest {
assertEquals("occurence of 'words' ", 2, w.occurrences());
}
/**
 * Test of RESULT_NUMB_SENTENCES, of class Tokenizer.
 *
 * Every sample text below is expected to be counted as exactly 5 sentences.
 * The samples cover: a run of repeated punctuation at the end of the text,
 * a last sentence without closing punctuation, and standalone/repeated
 * punctuation before the first and after the last sentence.
 */
@Test
public void testNumberOfSentences() {
    // explicit type argument instead of a raw HashSet (avoids an unchecked warning)
    final Set<String> testText = new HashSet<String>();
    // text with 5 sentences
    testText.add("Sentence One. Sentence Two. Comment on this. This is sentence four! Good By................");
    testText.add("Sentence One. Sentence two. Sentence 3? Sentence 4! Sentence w/o punktuation at end of text");
    testText.add("!!! ! ! ! Sentence One. Sentence two. Sentence 3? Sentence 4! Sentence 5 ! ! ! !!!");

    final WordCache meaningLib = new WordCache(null);
    final boolean doAutotagging = false;
    final VocabularyScraper scraper = null;

    for (final String text : testText) {
        final Tokenizer t = new Tokenizer(null, text, meaningLib, doAutotagging, scraper);
        // each sample is checked independently; no debug output is written to stdout
        assertEquals("Tokenizer.RESULT_NUMB_SENTENCES", 5, t.RESULT_NUMB_SENTENCES);
    }
}
}