skip tokenizing punctuation as words in WordTokenizer

remove unused variables in condenser related to Tokenizer
2024-09-19 00:01:41 +02:00 · 2014-11-29 17:16:05 +01:00 · 2014-11-29 17:16:05 +01:00 · 5790c7242e
commit 5790c7242e
parent f07392ff17
2 changed files with 21 additions and 28 deletions
--- a/source/net/yacy/document/Condenser.java
+++ b/source/net/yacy/document/Condenser.java
@ -80,9 +80,9 @@ public final class Condenser {
    private String fuzzy_signature_text = null; // signatures for double-check detection
    
    public int RESULT_NUMB_WORDS = -1;
-    public int RESULT_DIFF_WORDS = -1;
+    //public int RESULT_DIFF_WORDS = -1;
    public int RESULT_NUMB_SENTENCES = -1;
-    public int RESULT_DIFF_SENTENCES = -1;
+    //public int RESULT_DIFF_SENTENCES = -1;
    public Bitfield RESULT_FLAGS = new Bitfield(4);
    private final Identificator languageIdentificator;

@ -157,9 +157,9 @@ public final class Condenser {
            */
        } else {
            this.RESULT_NUMB_WORDS = 0;
-            this.RESULT_DIFF_WORDS = 0;
+            //this.RESULT_DIFF_WORDS = 0;
            this.RESULT_NUMB_SENTENCES = 0;
-            this.RESULT_DIFF_SENTENCES = 0;
+            //this.RESULT_DIFF_SENTENCES = 0;
        }

        if (indexMedia) {
@ -274,7 +274,7 @@ public final class Condenser {
 	            this.words.put(word.toLowerCase(), wprop);
 	            pip++;
 	            this.RESULT_NUMB_WORDS++;
-	            this.RESULT_DIFF_WORDS++;
+	            //this.RESULT_DIFF_WORDS++;
 	        }
        } finally {
        	wordenum.close();
@ -330,12 +330,12 @@ public final class Condenser {
        final Word wsp1;
        int wordHandle;
        int wordHandleCount = 0;
-        final int sentenceHandleCount = 0;
+        //final int sentenceHandleCount = 0;
        int allwordcounter = 0;
        final int allsentencecounter = 0;
        int wordInSentenceCounter = 1;
        boolean comb_indexof = false, last_last = false, last_index = false;
-        final Map<StringBuilder, Phrase> sentences = new HashMap<StringBuilder, Phrase>(100);
+        //final Map<StringBuilder, Phrase> sentences = new HashMap<StringBuilder, Phrase>(100);
        if (LibraryProvider.autotagging.isEmpty()) doAutotagging = false;

        // read source
@ -379,7 +379,7 @@ public final class Condenser {

 	            // distinguish punctuation and words
 	            wordlen = word.length();
-	            if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) {
+	            if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) { // TODO: wordlen == 1 never true (see earlier if < wordminsize )
 	                // store sentence
 	                currsentwords.clear();
 	                wordInSentenceCounter = 1;
@ -404,7 +404,7 @@ public final class Condenser {
 	                } else {
 	                    // word does not yet exist, create new word entry
 	                    wordHandle = wordHandleCount++;
-	                    wsp = new Word(wordHandle, wordInSentenceCounter, sentences.size() + 100);
+	                    wsp = new Word(wordHandle, wordInSentenceCounter, /* sentences.size() + */ 100);
 	                    wsp.flags = this.RESULT_FLAGS.clone();
 	                    this.words.put(word.toLowerCase(), wsp);
 	                }
@ -446,9 +446,9 @@ public final class Condenser {
        // store result
        //this.RESULT_NUMB_TEXT_BYTES = wordenum.count();
        this.RESULT_NUMB_WORDS = allwordcounter;
-        this.RESULT_DIFF_WORDS = wordHandleCount;
+        //this.RESULT_DIFF_WORDS = wordHandleCount;
        this.RESULT_NUMB_SENTENCES = allsentencecounter;
-        this.RESULT_DIFF_SENTENCES = sentenceHandleCount;
+        //this.RESULT_DIFF_SENTENCES = sentenceHandleCount;
    }

    public static Map<String, Word> getWords(final String text, final WordCache meaningLib) {
--- a/source/net/yacy/document/WordTokenizer.java
+++ b/source/net/yacy/document/WordTokenizer.java
@ -55,12 +55,8 @@ public class WordTokenizer implements Enumeration<StringBuilder> {

    private StringBuilder nextElement0() {
        StringBuilder s;
-        loop: while (this.e.hasMoreElements()) {
-            s = this.e.nextElement();
-            if ((s.length() == 1) && (SentenceReader.punctuation(s.charAt(0)))) return s;
-            for (int i = 0; i < s.length(); i++) {
-                if (SentenceReader.invisible(s.charAt(i))) continue loop;
-            }
+        while (this.e.hasMoreElements()) {
+            s = this.e.nextElement(); // next word (punctuation and invisible chars filtered)
            return s;
        }
        return null;
@ -86,7 +82,7 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
    	this.buffer = null;
    }

-    private static class unsievedWordsEnum implements Enumeration<StringBuilder> {
+    private class unsievedWordsEnum implements Enumeration<StringBuilder> {
        // returns an enumeration of StringBuilder Objects
        private StringBuilder buffer = null;
        private SentenceReader sr;
@ -115,19 +111,16 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
            }
            while (this.s.isEmpty()) {
                if (!this.sr.hasNext()) return null;
-                r = this.sr.next();
+                r = this.sr.next(); // read next sentence (incl. ending punctuation)
                if (r == null) return null;
                r = trim(r);
                sb = new StringBuilder(20);
-                for (int i = 0; i < r.length(); i++) {
+                for (int i = 0; i < r.length(); i++) { // tokenize one sentence
                    c = r.charAt(i);
-                    if (SentenceReader.invisible(c)) {
+                    if (SentenceReader.punctuation(c)) { // punctuation check is simple/quick, do it before invisible
+                        if (sb.length() > 0) {this.s.add(sb); sb = new StringBuilder(20);}
+                    } else if (SentenceReader.invisible(c)) { // ! currently punctuation again checked by invisible()
                        if (sb.length() > 0) {this.s.add(sb); sb = new StringBuilder(20);}
-                    } else if (SentenceReader.punctuation(c)) {
-                        if (sb.length() > 0) {this.s.add(sb); sb = new StringBuilder(1);}
-                        sb.append(c);
-                        this.s.add(sb);
-                        sb = new StringBuilder(20);
                    } else {
                        sb = sb.append(c);
                    }
@ -157,8 +150,8 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
            this.sIndex = 0;
            this.s.clear();
            this.s = null;
-        	this.sr.close();
-        	this.sr = null;
+            this.sr.close();
+            this.sr = null;
        }
    }