mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
adjust Tokenizer sentence count to ignore repeated punctuation (like !!!!)
+ remove unused sentenceword map (we use only the count) + update test case for sentence count
This commit is contained in:
parent
b5eb7a9217
commit
ae3717d087
|
@ -68,7 +68,6 @@ public class Tokenizer {
|
|||
this.words = new TreeMap<String, Word>(NaturalOrder.naturalComparator);
|
||||
this.synonyms = new LinkedHashSet<String>();
|
||||
assert text != null;
|
||||
final Set<String> currsentwords = new HashSet<String>();
|
||||
String[] wordcache = new String[LibraryProvider.autotagging.getMaxWordsInTerm() - 1];
|
||||
for (int i = 0; i < wordcache.length; i++) wordcache[i] = "";
|
||||
String k;
|
||||
|
@ -89,9 +88,9 @@ public class Tokenizer {
|
|||
// handle punctuation (start new sentence)
|
||||
if (word.length() == 1 && SentenceReader.punctuation(word.charAt(0))) {
|
||||
// store sentence
|
||||
currsentwords.clear();
|
||||
if (wordInSentenceCounter > 1) // if no word in sentence repeated punktuation ".....", don't count as sentence
|
||||
allsentencecounter++;
|
||||
wordInSentenceCounter = 1;
|
||||
allsentencecounter++;
|
||||
continue;
|
||||
}
|
||||
if (word.length() < wordminsize) continue;
|
||||
|
@ -160,7 +159,6 @@ public class Tokenizer {
|
|||
|
||||
// store word
|
||||
allwordcounter++;
|
||||
currsentwords.add(word);
|
||||
Word wsp = this.words.get(word);
|
||||
if (wsp != null) {
|
||||
// word already exists
|
||||
|
@ -214,7 +212,7 @@ public class Tokenizer {
|
|||
// store result
|
||||
this.RESULT_NUMB_WORDS = allwordcounter;
|
||||
// if the text doesn't end with punctuation but has words after the last counted sentence, increment the sentence count for the trailing text.
|
||||
this.RESULT_NUMB_SENTENCES = allsentencecounter + (currsentwords.size() > 0 ? 1 : 0);
|
||||
this.RESULT_NUMB_SENTENCES = allsentencecounter + (wordInSentenceCounter > 1 ? 1 : 0);
|
||||
}
|
||||
|
||||
public Map<String, Word> words() {
|
||||
|
|
|
@ -2,7 +2,9 @@
|
|||
package net.yacy.document;
|
||||
|
||||
import java.net.MalformedURLException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import net.yacy.cora.document.WordCache;
|
||||
import net.yacy.kelondro.data.word.Word;
|
||||
import org.junit.Test;
|
||||
|
@ -36,4 +38,24 @@ public class TokenizerTest {
|
|||
assertEquals("occurence of 'words' ", 2, w.occurrences());
|
||||
}
|
||||
|
||||
/**
|
||||
* Test of RESULT_NUMB_SENTENCES, of class Tokenizer.
|
||||
*/
|
||||
@Test
|
||||
public void testNumberOfSentences() {
|
||||
Set<String> testText = new HashSet();
|
||||
// text with 5 sentences
|
||||
testText.add("Sentence One. Sentence Two. Comment on this. This is sentence four! Good By................");
|
||||
testText.add("Sentence One. Sentence two. Sentence 3? Sentence 4! Sentence w/o punktuation at end of text");
|
||||
testText.add("!!! ! ! ! Sentence One. Sentence two. Sentence 3? Sentence 4! Sentence 5 ! ! ! !!!");
|
||||
|
||||
WordCache meaningLib = new WordCache(null);
|
||||
boolean doAutotagging = false;
|
||||
VocabularyScraper scraper = null;
|
||||
for (String text : testText) {
|
||||
Tokenizer t = new Tokenizer(null, text, meaningLib, doAutotagging, scraper);
|
||||
System.out.println(t.RESULT_NUMB_WORDS);
|
||||
assertEquals("Tokenizer.RESULT_NUMB_SENTENCES", 5, t.RESULT_NUMB_SENTENCES);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user