skip tokenizing punctuation as words in WordTokenizer

remove unused variables in condenser related to Tokenizer
This commit is contained in:
reger 2014-11-29 17:16:05 +01:00
parent f07392ff17
commit 5790c7242e
2 changed files with 21 additions and 28 deletions

View File

@ -80,9 +80,9 @@ public final class Condenser {
private String fuzzy_signature_text = null; // signatures for double-check detection
public int RESULT_NUMB_WORDS = -1;
public int RESULT_DIFF_WORDS = -1;
//public int RESULT_DIFF_WORDS = -1;
public int RESULT_NUMB_SENTENCES = -1;
public int RESULT_DIFF_SENTENCES = -1;
//public int RESULT_DIFF_SENTENCES = -1;
public Bitfield RESULT_FLAGS = new Bitfield(4);
private final Identificator languageIdentificator;
@ -157,9 +157,9 @@ public final class Condenser {
*/
} else {
this.RESULT_NUMB_WORDS = 0;
this.RESULT_DIFF_WORDS = 0;
//this.RESULT_DIFF_WORDS = 0;
this.RESULT_NUMB_SENTENCES = 0;
this.RESULT_DIFF_SENTENCES = 0;
//this.RESULT_DIFF_SENTENCES = 0;
}
if (indexMedia) {
@ -274,7 +274,7 @@ public final class Condenser {
this.words.put(word.toLowerCase(), wprop);
pip++;
this.RESULT_NUMB_WORDS++;
this.RESULT_DIFF_WORDS++;
//this.RESULT_DIFF_WORDS++;
}
} finally {
wordenum.close();
@ -330,12 +330,12 @@ public final class Condenser {
final Word wsp1;
int wordHandle;
int wordHandleCount = 0;
final int sentenceHandleCount = 0;
//final int sentenceHandleCount = 0;
int allwordcounter = 0;
final int allsentencecounter = 0;
int wordInSentenceCounter = 1;
boolean comb_indexof = false, last_last = false, last_index = false;
final Map<StringBuilder, Phrase> sentences = new HashMap<StringBuilder, Phrase>(100);
//final Map<StringBuilder, Phrase> sentences = new HashMap<StringBuilder, Phrase>(100);
if (LibraryProvider.autotagging.isEmpty()) doAutotagging = false;
// read source
@ -379,7 +379,7 @@ public final class Condenser {
// distinguish punctuation and words
wordlen = word.length();
if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) {
if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) { // TODO: wordlen == 1 never true (see earlier if < wordminsize )
// store sentence
currsentwords.clear();
wordInSentenceCounter = 1;
@ -404,7 +404,7 @@ public final class Condenser {
} else {
// word does not yet exist, create new word entry
wordHandle = wordHandleCount++;
wsp = new Word(wordHandle, wordInSentenceCounter, sentences.size() + 100);
wsp = new Word(wordHandle, wordInSentenceCounter, /* sentences.size() + */ 100);
wsp.flags = this.RESULT_FLAGS.clone();
this.words.put(word.toLowerCase(), wsp);
}
@ -446,9 +446,9 @@ public final class Condenser {
// store result
//this.RESULT_NUMB_TEXT_BYTES = wordenum.count();
this.RESULT_NUMB_WORDS = allwordcounter;
this.RESULT_DIFF_WORDS = wordHandleCount;
//this.RESULT_DIFF_WORDS = wordHandleCount;
this.RESULT_NUMB_SENTENCES = allsentencecounter;
this.RESULT_DIFF_SENTENCES = sentenceHandleCount;
//this.RESULT_DIFF_SENTENCES = sentenceHandleCount;
}
public static Map<String, Word> getWords(final String text, final WordCache meaningLib) {

View File

@ -55,12 +55,8 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
private StringBuilder nextElement0() {
StringBuilder s;
loop: while (this.e.hasMoreElements()) {
s = this.e.nextElement();
if ((s.length() == 1) && (SentenceReader.punctuation(s.charAt(0)))) return s;
for (int i = 0; i < s.length(); i++) {
if (SentenceReader.invisible(s.charAt(i))) continue loop;
}
while (this.e.hasMoreElements()) {
s = this.e.nextElement(); // next word (punctuation and invisible chars filtered)
return s;
}
return null;
@ -86,7 +82,7 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
this.buffer = null;
}
private static class unsievedWordsEnum implements Enumeration<StringBuilder> {
private class unsievedWordsEnum implements Enumeration<StringBuilder> {
// returns an enumeration of StringBuilder Objects
private StringBuilder buffer = null;
private SentenceReader sr;
@ -115,19 +111,16 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
}
while (this.s.isEmpty()) {
if (!this.sr.hasNext()) return null;
r = this.sr.next();
r = this.sr.next(); // read next sentence (incl. ending punctuation)
if (r == null) return null;
r = trim(r);
sb = new StringBuilder(20);
for (int i = 0; i < r.length(); i++) {
for (int i = 0; i < r.length(); i++) { // tokenize one sentence
c = r.charAt(i);
if (SentenceReader.invisible(c)) {
if (SentenceReader.punctuation(c)) { // punctuation check is simple/quick, do it before invisible
if (sb.length() > 0) {this.s.add(sb); sb = new StringBuilder(20);}
} else if (SentenceReader.invisible(c)) { // ! currently punctuation again checked by invisible()
if (sb.length() > 0) {this.s.add(sb); sb = new StringBuilder(20);}
} else if (SentenceReader.punctuation(c)) {
if (sb.length() > 0) {this.s.add(sb); sb = new StringBuilder(1);}
sb.append(c);
this.s.add(sb);
sb = new StringBuilder(20);
} else {
sb = sb.append(c);
}
@ -157,8 +150,8 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
this.sIndex = 0;
this.s.clear();
this.s = null;
this.sr.close();
this.sr = null;
this.sr.close();
this.sr = null;
}
}