mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
Merge branch 'master' of https://github.com/yacy/yacy_search_server.git
This commit is contained in:
commit
b73d2db914
|
@ -78,7 +78,7 @@ public class Tokenizer {
|
|||
int wordHandleCount = 0;
|
||||
//final int sentenceHandleCount = 0;
|
||||
int allwordcounter = 0;
|
||||
final int allsentencecounter = 0;
|
||||
int allsentencecounter = 0;
|
||||
int wordInSentenceCounter = 1;
|
||||
boolean comb_indexof = false, last_last = false, last_index = false;
|
||||
//final Map<StringBuilder, Phrase> sentences = new HashMap<StringBuilder, Phrase>(100);
|
||||
|
@ -89,6 +89,14 @@ public class Tokenizer {
|
|||
try {
|
||||
while (wordenum.hasMoreElements()) {
|
||||
String word = wordenum.nextElement().toString().toLowerCase(Locale.ENGLISH);
|
||||
// handle punctuation (start new sentence)
|
||||
if (word.length() == 1 && SentenceReader.punctuation(word.charAt(0))) {
|
||||
// store sentence
|
||||
currsentwords.clear();
|
||||
wordInSentenceCounter = 1;
|
||||
allsentencecounter++;
|
||||
continue;
|
||||
}
|
||||
if (word.length() < wordminsize) continue;
|
||||
|
||||
// get tags from autotagging
|
||||
|
@ -144,40 +152,32 @@ public class Tokenizer {
|
|||
System.arraycopy(wordcache, 1, wordcache, 0, wordcache.length - 1);
|
||||
wordcache[wordcache.length - 1] = word;
|
||||
|
||||
// distinguish punctuation and words
|
||||
wordlen = word.length();
|
||||
if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) { // TODO: wordlen == 1 never true (see earlier if < wordminsize )
|
||||
// store sentence
|
||||
currsentwords.clear();
|
||||
wordInSentenceCounter = 1;
|
||||
} else {
|
||||
// check index.of detection
|
||||
if (last_last && comb_indexof && word.equals("modified")) {
|
||||
this.RESULT_FLAGS.set(flag_cat_indexof, true);
|
||||
wordenum.pre(true); // parse lines as they come with CRLF
|
||||
}
|
||||
if (last_index && (wordminsize > 2 || word.equals("of"))) comb_indexof = true;
|
||||
last_last = word.equals("last");
|
||||
last_index = word.equals("index");
|
||||
|
||||
// store word
|
||||
allwordcounter++;
|
||||
currsentwords.add(word);
|
||||
Word wsp = this.words.get(word);
|
||||
if (wsp != null) {
|
||||
// word already exists
|
||||
wordHandle = wsp.posInText;
|
||||
wsp.inc();
|
||||
} else {
|
||||
// word does not yet exist, create new word entry
|
||||
wordHandle = ++wordHandleCount; // let start pos with 1
|
||||
wsp = new Word(wordHandle, wordInSentenceCounter, /* sentences.size() + */ 100);
|
||||
wsp.flags = this.RESULT_FLAGS.clone();
|
||||
this.words.put(word.toLowerCase(), wsp);
|
||||
}
|
||||
// we now have the unique handle of the word, put it into the sentence:
|
||||
wordInSentenceCounter++;
|
||||
// check index.of detection
|
||||
if (last_last && comb_indexof && word.equals("modified")) {
|
||||
this.RESULT_FLAGS.set(flag_cat_indexof, true);
|
||||
wordenum.pre(true); // parse lines as they come with CRLF
|
||||
}
|
||||
if (last_index && (wordminsize > 2 || word.equals("of"))) comb_indexof = true;
|
||||
last_last = word.equals("last");
|
||||
last_index = word.equals("index");
|
||||
|
||||
// store word
|
||||
allwordcounter++;
|
||||
currsentwords.add(word);
|
||||
Word wsp = this.words.get(word);
|
||||
if (wsp != null) {
|
||||
// word already exists
|
||||
wordHandle = wsp.posInText;
|
||||
wsp.inc();
|
||||
} else {
|
||||
// word does not yet exist, create new word entry
|
||||
wordHandle = ++wordHandleCount; // let start pos with 1
|
||||
wsp = new Word(wordHandle, wordInSentenceCounter, allsentencecounter + 100); // normal sentences start at 100!
|
||||
wsp.flags = this.RESULT_FLAGS.clone();
|
||||
this.words.put(word.toLowerCase(), wsp);
|
||||
}
|
||||
// we now have the unique handle of the word, put it into the sentence:
|
||||
wordInSentenceCounter++;
|
||||
}
|
||||
} finally {
|
||||
wordenum.close();
|
||||
|
|
|
@ -56,7 +56,7 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
|
|||
private StringBuilder nextElement0() {
|
||||
StringBuilder s;
|
||||
while (this.e.hasMoreElements()) {
|
||||
s = this.e.nextElement(); // next word (punctuation and invisible chars filtered)
|
||||
s = this.e.nextElement(); // next word (invisible chars filtered)
|
||||
return s;
|
||||
}
|
||||
return null;
|
||||
|
@ -118,7 +118,13 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
|
|||
for (int i = 0; i < r.length(); i++) { // tokenize one sentence
|
||||
c = r.charAt(i);
|
||||
if (SentenceReader.punctuation(c)) { // punctuation check is simple/quick, do it before invisible
|
||||
if (sb.length() > 0) {this.s.add(sb); sb = new StringBuilder(20);}
|
||||
if (sb.length() > 0) {
|
||||
this.s.add(sb);
|
||||
sb = new StringBuilder(1);
|
||||
}
|
||||
sb.append(c);
|
||||
this.s.add(sb);
|
||||
sb = new StringBuilder(20);
|
||||
} else if (SentenceReader.invisible(c)) { // ! currently punctuation again checked by invisible()
|
||||
if (sb.length() > 0) {this.s.add(sb); sb = new StringBuilder(20);}
|
||||
} else {
|
||||
|
|
|
@ -22,8 +22,12 @@ public class WordTokenizerTest {
|
|||
int cnt = 0;
|
||||
while (wt.hasMoreElements()) {
|
||||
StringBuilder sb = wt.nextElement();
|
||||
assertEquals("word", sb.toString());
|
||||
cnt++;
|
||||
if (sb.length() > 1) { // skip punctuation
|
||||
assertEquals("word", sb.toString());
|
||||
cnt++;
|
||||
} else {
|
||||
assertTrue("punktuation", SentenceReader.punctuation(sb.charAt(0)));
|
||||
}
|
||||
}
|
||||
wt.close();
|
||||
assertEquals(10, cnt);
|
||||
|
|
Loading…
Reference in New Issue
Block a user