mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
stop tokenizing punctuation as words in WordTokenizer
remove unused variables in condenser related to Tokenizer
This commit is contained in:
parent
f07392ff17
commit
5790c7242e
|
@ -80,9 +80,9 @@ public final class Condenser {
|
|||
private String fuzzy_signature_text = null; // signatures for double-check detection
|
||||
|
||||
public int RESULT_NUMB_WORDS = -1;
|
||||
public int RESULT_DIFF_WORDS = -1;
|
||||
//public int RESULT_DIFF_WORDS = -1;
|
||||
public int RESULT_NUMB_SENTENCES = -1;
|
||||
public int RESULT_DIFF_SENTENCES = -1;
|
||||
//public int RESULT_DIFF_SENTENCES = -1;
|
||||
public Bitfield RESULT_FLAGS = new Bitfield(4);
|
||||
private final Identificator languageIdentificator;
|
||||
|
||||
|
@ -157,9 +157,9 @@ public final class Condenser {
|
|||
*/
|
||||
} else {
|
||||
this.RESULT_NUMB_WORDS = 0;
|
||||
this.RESULT_DIFF_WORDS = 0;
|
||||
//this.RESULT_DIFF_WORDS = 0;
|
||||
this.RESULT_NUMB_SENTENCES = 0;
|
||||
this.RESULT_DIFF_SENTENCES = 0;
|
||||
//this.RESULT_DIFF_SENTENCES = 0;
|
||||
}
|
||||
|
||||
if (indexMedia) {
|
||||
|
@ -274,7 +274,7 @@ public final class Condenser {
|
|||
this.words.put(word.toLowerCase(), wprop);
|
||||
pip++;
|
||||
this.RESULT_NUMB_WORDS++;
|
||||
this.RESULT_DIFF_WORDS++;
|
||||
//this.RESULT_DIFF_WORDS++;
|
||||
}
|
||||
} finally {
|
||||
wordenum.close();
|
||||
|
@ -330,12 +330,12 @@ public final class Condenser {
|
|||
final Word wsp1;
|
||||
int wordHandle;
|
||||
int wordHandleCount = 0;
|
||||
final int sentenceHandleCount = 0;
|
||||
//final int sentenceHandleCount = 0;
|
||||
int allwordcounter = 0;
|
||||
final int allsentencecounter = 0;
|
||||
int wordInSentenceCounter = 1;
|
||||
boolean comb_indexof = false, last_last = false, last_index = false;
|
||||
final Map<StringBuilder, Phrase> sentences = new HashMap<StringBuilder, Phrase>(100);
|
||||
//final Map<StringBuilder, Phrase> sentences = new HashMap<StringBuilder, Phrase>(100);
|
||||
if (LibraryProvider.autotagging.isEmpty()) doAutotagging = false;
|
||||
|
||||
// read source
|
||||
|
@ -379,7 +379,7 @@ public final class Condenser {
|
|||
|
||||
// distinguish punctuation and words
|
||||
wordlen = word.length();
|
||||
if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) {
|
||||
if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) { // TODO: wordlen == 1 never true (see earlier if < wordminsize )
|
||||
// store sentence
|
||||
currsentwords.clear();
|
||||
wordInSentenceCounter = 1;
|
||||
|
@ -404,7 +404,7 @@ public final class Condenser {
|
|||
} else {
|
||||
// word does not yet exist, create new word entry
|
||||
wordHandle = wordHandleCount++;
|
||||
wsp = new Word(wordHandle, wordInSentenceCounter, sentences.size() + 100);
|
||||
wsp = new Word(wordHandle, wordInSentenceCounter, /* sentences.size() + */ 100);
|
||||
wsp.flags = this.RESULT_FLAGS.clone();
|
||||
this.words.put(word.toLowerCase(), wsp);
|
||||
}
|
||||
|
@ -446,9 +446,9 @@ public final class Condenser {
|
|||
// store result
|
||||
//this.RESULT_NUMB_TEXT_BYTES = wordenum.count();
|
||||
this.RESULT_NUMB_WORDS = allwordcounter;
|
||||
this.RESULT_DIFF_WORDS = wordHandleCount;
|
||||
//this.RESULT_DIFF_WORDS = wordHandleCount;
|
||||
this.RESULT_NUMB_SENTENCES = allsentencecounter;
|
||||
this.RESULT_DIFF_SENTENCES = sentenceHandleCount;
|
||||
//this.RESULT_DIFF_SENTENCES = sentenceHandleCount;
|
||||
}
|
||||
|
||||
public static Map<String, Word> getWords(final String text, final WordCache meaningLib) {
|
||||
|
|
|
@ -55,12 +55,8 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
|
|||
|
||||
private StringBuilder nextElement0() {
|
||||
StringBuilder s;
|
||||
loop: while (this.e.hasMoreElements()) {
|
||||
s = this.e.nextElement();
|
||||
if ((s.length() == 1) && (SentenceReader.punctuation(s.charAt(0)))) return s;
|
||||
for (int i = 0; i < s.length(); i++) {
|
||||
if (SentenceReader.invisible(s.charAt(i))) continue loop;
|
||||
}
|
||||
while (this.e.hasMoreElements()) {
|
||||
s = this.e.nextElement(); // next word (punctuation and invisible chars filtered)
|
||||
return s;
|
||||
}
|
||||
return null;
|
||||
|
@ -86,7 +82,7 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
|
|||
this.buffer = null;
|
||||
}
|
||||
|
||||
private static class unsievedWordsEnum implements Enumeration<StringBuilder> {
|
||||
private class unsievedWordsEnum implements Enumeration<StringBuilder> {
|
||||
// returns an enumeration of StringBuilder Objects
|
||||
private StringBuilder buffer = null;
|
||||
private SentenceReader sr;
|
||||
|
@ -115,19 +111,16 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
|
|||
}
|
||||
while (this.s.isEmpty()) {
|
||||
if (!this.sr.hasNext()) return null;
|
||||
r = this.sr.next();
|
||||
r = this.sr.next(); // read next sentence (incl. ending punctuation)
|
||||
if (r == null) return null;
|
||||
r = trim(r);
|
||||
sb = new StringBuilder(20);
|
||||
for (int i = 0; i < r.length(); i++) {
|
||||
for (int i = 0; i < r.length(); i++) { // tokenize one sentence
|
||||
c = r.charAt(i);
|
||||
if (SentenceReader.invisible(c)) {
|
||||
if (SentenceReader.punctuation(c)) { // punctuation check is simple/quick, do it before invisible
|
||||
if (sb.length() > 0) {this.s.add(sb); sb = new StringBuilder(20);}
|
||||
} else if (SentenceReader.invisible(c)) { // ! currently punctuation again checked by invisible()
|
||||
if (sb.length() > 0) {this.s.add(sb); sb = new StringBuilder(20);}
|
||||
} else if (SentenceReader.punctuation(c)) {
|
||||
if (sb.length() > 0) {this.s.add(sb); sb = new StringBuilder(1);}
|
||||
sb.append(c);
|
||||
this.s.add(sb);
|
||||
sb = new StringBuilder(20);
|
||||
} else {
|
||||
sb = sb.append(c);
|
||||
}
|
||||
|
@ -157,8 +150,8 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
|
|||
this.sIndex = 0;
|
||||
this.s.clear();
|
||||
this.s = null;
|
||||
this.sr.close();
|
||||
this.sr = null;
|
||||
this.sr.close();
|
||||
this.sr = null;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user