yacy_search_server/test/java/net/yacy/document/WordTokenizerTest.java
reger 272cdd496a reactivate sentence counter in WordTokenizer for phrasepos ranking,
by counting punktuation (delivered as 1 char word) again.
2016-09-07 02:16:16 +02:00

38 lines
1.2 KiB
Java

package net.yacy.document;
import org.junit.Test;
import static org.junit.Assert.*;
public class WordTokenizerTest {
/**
* Test of nextElement method, of class WordTokenizer.
*/
@Test
public void testNextElement() {
// test sentences containing 10x the word "word"
String[] testTxtArr = new String[]{
" word word..... (word) [word] . 'word word' \"word word\" word ? word! ",
"word-word word . word.word@word.word ....word... word,word "
};
for (String testTxt : testTxtArr) {
SentenceReader sr = new SentenceReader(testTxt);
WordTokenizer wt = new WordTokenizer(sr, null);
int cnt = 0;
while (wt.hasMoreElements()) {
StringBuilder sb = wt.nextElement();
if (sb.length() > 1) { // skip punktuation
assertEquals("word", sb.toString());
cnt++;
} else {
assertTrue("punktuation", SentenceReader.punctuation(sb.charAt(0)));
}
}
wt.close();
assertEquals(10, cnt);
}
}
}