fixed a problem with attribute flags on RWI entries that prevented proper selection of the index-of constraint

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5437 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 2009-01-04 02:27:29 +00:00
parent 6072831235
commit c4c4c223b9
5 changed files with 55 additions and 24 deletions
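
The substance of the fix, as far as the diffs below show, is an ordering change in plasmaCondenser: the block that sets the document category flags (has-image, has-audio, has-video, has-app) on RESULT_FLAGS was moved from the end of the constructor to a point before any call to insertTextToWords, and the index-of detection no longer insists on literally seeing the token "of". The likely reason for the move is that each per-word RWI entry takes its copy of the flag template at insertion time, so flags set afterwards never reach the entries and a flag-based constraint such as index-of can never select them. A minimal sketch of that ordering effect, using hypothetical stand-ins (SimpleBitfield and WordEntry are illustrations, not YaCy's kelondroBitfield or indexRWIEntry):

import java.util.ArrayList;
import java.util.List;

public class FlagOrderingSketch {

    // minimal stand-in for a flag bitfield; YaCy's kelondroBitfield is richer
    static class SimpleBitfield {
        private boolean[] bits = new boolean[32];
        void set(int pos, boolean value) { bits[pos] = value; }
        boolean get(int pos) { return bits[pos]; }
        SimpleBitfield copy() {
            SimpleBitfield c = new SimpleBitfield();
            c.bits = this.bits.clone();
            return c;
        }
    }

    // minimal stand-in for a per-word entry that snapshots the flag template
    static class WordEntry {
        final String word;
        final SimpleBitfield flags;
        WordEntry(String word, SimpleBitfield template) {
            this.word = word;
            this.flags = template.copy(); // flags are fixed at insertion time
        }
    }

    static final int FLAG_HAS_IMAGE = 7; // arbitrary bit position for the sketch

    public static void main(String[] args) {
        List<WordEntry> index = new ArrayList<WordEntry>();

        // old order: insert words first, set the document flags afterwards
        SimpleBitfield lateFlags = new SimpleBitfield();
        index.add(new WordEntry("index", lateFlags));
        lateFlags.set(FLAG_HAS_IMAGE, true);
        System.out.println(index.get(0).flags.get(FLAG_HAS_IMAGE)); // false: the entry missed the flag

        // fixed order: populate the template before any word is inserted
        SimpleBitfield earlyFlags = new SimpleBitfield();
        earlyFlags.set(FLAG_HAS_IMAGE, true);
        index.add(new WordEntry("index", earlyFlags));
        System.out.println(index.get(1).flags.get(FLAG_HAS_IMAGE)); // true: a constraint can match it
    }
}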

View File

@@ -31,6 +31,7 @@
<option value="plain"#(vMode-plain)#:: selected="selected"#(/vMode-plain)#>Plain Text</option>
<option value="parsed"#(vMode-parsed)#:: selected="selected"#(/vMode-parsed)#>Parsed Text</option>
<option value="sentences"#(vMode-sentences)#:: selected="selected"#(/vMode-sentences)#>Parsed Sentences</option>
<option value="words"#(vMode-words)#:: selected="selected"#(/vMode-words)#>Parsed Tokens/Words</option>
<option value="links"#(vMode-links)#:: selected="selected"#(/vMode-links)#>Link List</option>
</select>
<input type="submit" name="show" value="Show" />
@@ -93,6 +94,12 @@
<td class="tt">#[attr]#</td>
</tr>#{/links}#
</table>
:: <!-- 6 -->
<fieldset><legend>Parsed Tokens</legend>
<ol>#{words}#
<li class="tt">#[word]#</li>#{/words}#
</ol>
</fieldset>
#(/viewMode)#
</p>
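
For context on the template syntax: in YaCy servlet templates #[name]# is replaced by a single value, #{list}#...#{/list}# repeats once per numbered entry, and #(switch)#...::...#(/switch)# picks one alternative by the integer stored under the switch key; the new block above is alternative 6 of #(viewMode)#. A hedged sketch of the serverObjects keys the servlet is expected to write for it (the put(...) calls mirror those in the ViewFile.java diff below; the surrounding servlet scaffolding is omitted):

// prop stands for the servlet's serverObjects instance, as in ViewFile.java below
prop.put("viewMode", VIEW_MODE_AS_PARSED_WORDS); // selects the ":: <!-- 6 -->" alternative above
prop.put("viewMode_words", 3);                   // number of #{words}# repetitions
prop.put("viewMode_words_0_word", "Index");      // fills #[word]# of list entry 0
prop.put("viewMode_words_1_word", "of");
prop.put("viewMode_words_2_word", "/pub");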

View File

@@ -29,6 +29,7 @@ import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URLDecoder;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
@@ -59,6 +60,7 @@ public class ViewFile {
public static final int VIEW_MODE_AS_PARSED_SENTENCES = 3;
public static final int VIEW_MODE_AS_IFRAME = 4;
public static final int VIEW_MODE_AS_LINKLIST = 5;
public static final int VIEW_MODE_AS_PARSED_WORDS = 6;
private static final String HIGHLIGHT_CSS = "searchHighlight";
private static final int MAX_HIGHLIGHTS = 6;
@@ -240,7 +242,7 @@ public class ViewFile {
prop.put("viewMode", VIEW_MODE_AS_IFRAME);
prop.put("viewMode_url", url.toNormalform(false, true));
} else if (viewMode.equals("parsed") || viewMode.equals("sentences") || viewMode.equals("links")) {
} else if (viewMode.equals("parsed") || viewMode.equals("sentences") || viewMode.equals("words") || viewMode.equals("links")) {
// parsing the resource content
plasmaParserDocument document = null;
try {
@@ -297,6 +299,33 @@
}
prop.put("viewMode_sentences", i);
} else if (viewMode.equals("words")) {
prop.put("viewMode", VIEW_MODE_AS_PARSED_WORDS);
final Iterator<StringBuilder> sentences = document.getSentences(pre);
boolean dark = true;
int i = 0;
String sentence, token;
if (sentences != null) {
// Search word highlighting
while (sentences.hasNext()) {
sentence = sentences.next().toString();
Enumeration<StringBuilder> tokens = plasmaCondenser.wordTokenizer(sentence, "UTF-8");
while (tokens.hasMoreElements()) {
token = tokens.nextElement().toString();
if (token.length() > 0) {
prop.put("viewMode_words_" + i + "_nr", i + 1);
prop.put("viewMode_words_" + i + "_word", token);
prop.put("viewMode_words_" + i + "_dark", dark ? "1" : "0");
dark = !dark;
i++;
}
}
}
}
prop.put("viewMode_words", i);
} else if (viewMode.equals("links")) {
prop.put("viewMode", VIEW_MODE_AS_LINKLIST);
boolean dark = true;
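
The "words" branch above is essentially sentence iteration plus plasmaCondenser.wordTokenizer. A stripped-down usage sketch of just that loop, relying only on the signatures visible in this commit (not standalone, it needs plasmaCondenser on the classpath; note that the charset argument only takes effect after the plasmaCondenser change further down, the old code always decoded as UTF-8):

import java.util.Enumeration;

final String sentence = "Index of /pub/mirrors";
final Enumeration<StringBuilder> tokens = plasmaCondenser.wordTokenizer(sentence, "UTF-8");
if (tokens != null) { // wordTokenizer returns null when decoding with the charset fails
    while (tokens.hasMoreElements()) {
        final String token = tokens.nextElement().toString();
        if (token.length() > 0) System.out.println(token);
    }
}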

View File

@@ -108,13 +108,15 @@ public final class plasmaCondenser {
this.wordcut = 2;
this.words = new TreeMap<String, indexWord>();
this.RESULT_FLAGS = new kelondroBitfield(4);
// construct flag set for document
if (document.getImages().size() > 0) RESULT_FLAGS.set(flag_cat_hasimage, true);
if (document.getAudiolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasaudio, true);
if (document.getVideolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasvideo, true);
if (document.getApplinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasapp, true);
this.languageIdentificator = new Identificator();
//System.out.println("DEBUG: condensing " + document.getMainLongTitle() + ", indexText=" + Boolean.toString(indexText) + ", indexMedia=" + Boolean.toString(indexMedia));
// add the URL components to the word list
insertTextToWords(document.dc_source().toNormalform(false, true), 0, indexRWIEntry.flag_app_dc_identifier, RESULT_FLAGS, false);
Map.Entry<yacyURL, String> entry;
if (indexText) {
@@ -161,6 +163,9 @@
this.RESULT_DIFF_SENTENCES = 0;
}
// add the URL components to the word list
insertTextToWords(document.dc_source().toNormalform(false, true), 0, indexRWIEntry.flag_app_dc_identifier, RESULT_FLAGS, false);
if (indexMedia) {
// add anchor descriptions: here, we also add the url components
// audio
@@ -209,12 +214,6 @@
}
}
}
// construct flag set for document
if (document.getImages().size() > 0) RESULT_FLAGS.set(flag_cat_hasimage, true);
if (document.getAudiolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasaudio, true);
if (document.getVideolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasvideo, true);
if (document.getApplinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasapp, true);
}
private void insertTextToWords(final String text, final int phrase, final int flagpos, final kelondroBitfield flagstemplate, boolean useForLanguageIdentification) {
@@ -360,7 +359,7 @@
this.RESULT_FLAGS.set(flag_cat_indexof, true);
wordenum.pre(true); // parse lines as they come with CRLF
}
if ((last_index) && (word.equals("of"))) comb_indexof = true;
if ((last_index) && (wordminsize > 2 || (word.equals("of")))) comb_indexof = true;
last_last = word.equals("last");
last_index = word.equals("index");
@@ -491,10 +490,10 @@
else
return true;
}
public static Enumeration<StringBuilder> wordTokenizer(final String s, final String charset) {
try {
return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes("UTF-8")));
return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes(charset)));
} catch (final Exception e) {
return null;
}
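
The one-line change to the comb_indexof condition above is the part that matches the commit message most directly: when wordminsize is larger than 2 the sieved word enumerator never delivers the two-letter token "of", so the old test could not fire and "Index of" listings were never flagged. The new condition accepts a preceding "index" on its own in that case. A simplified, hedged restatement of the heuristic outside the condenser (the token stream and wordminsize value are made up for the example):

boolean lastIndex = false;
boolean combIndexof = false;
final int wordminsize = 3; // a sieve this strict drops two-letter words such as "of"
for (final String word : new String[] {"index", "mirrors", "pub"}) {
    // old rule: lastIndex && word.equals("of") -- unreachable once "of" is sieved out
    // new rule: a strict sieve plus a preceding "index" is already enough
    if (lastIndex && (wordminsize > 2 || word.equals("of"))) combIndexof = true;
    lastIndex = word.equals("index");
}
System.out.println(combIndexof); // true under the new rule, false under the old one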

View File

@@ -330,13 +330,9 @@ public class plasmaSnippetCache {
// trying to load the resource from the cache
resContent = plasmaHTCache.getResourceContentStream(url);
responseHeader = plasmaHTCache.loadResponseHeader(url);
if (resContent != null) {
// if the content was found
resContentLength = plasmaHTCache.getResourceContentLength(url);
if ((resContentLength > maxDocLen) && (!fetchOnline)) {
// content may be too large to be parsed here. To be fast, we omit calculation of snippet here
return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "resource available, but too large: " + resContentLength + " bytes");
}
if (resContent != null && ((resContentLength = plasmaHTCache.getResourceContentLength(url)) > maxDocLen) && (!fetchOnline)) {
// content may be too large to be parsed here. To be fast, we omit calculation of snippet here
return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "resource available, but too large: " + resContentLength + " bytes");
} else if (containsAllHashes(comp.dc_title(), queryhashes)) {
// try to create the snippet from information given in the url itself
return new TextSnippet(url, (comp.dc_subject().length() > 0) ? comp.dc_creator() : comp.dc_subject(), SOURCE_METADATA, null, null, faviconCache.get(url.hash()));
@@ -346,7 +342,7 @@
} else if (containsAllHashes(comp.dc_subject(), queryhashes)) {
// try to create the snippet from information given in the subject metadata
return new TextSnippet(url, (comp.dc_creator().length() > 0) ? comp.dc_creator() : comp.dc_subject(), SOURCE_METADATA, null, null, faviconCache.get(url.hash()));
} else if (containsAllHashes(comp.url().toNormalform(true, true), queryhashes)) {
} else if (containsAllHashes(comp.url().toNormalform(true, true).replace('-', ' '), queryhashes)) {
// try to create the snippet from information given in the subject metadata
return new TextSnippet(url, (comp.dc_creator().length() > 0) ? comp.dc_creator() : comp.dc_subject(), SOURCE_METADATA, null, null, faviconCache.get(url.hash()));
} else if (fetchOnline) {
@@ -673,7 +669,7 @@
final int newlen = Math.max(10, maxpos - minpos + 10);
final int around = (maxLength - newlen) / 2;
assert minpos - around < sentence.length() : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length();
assert ((maxpos + around) <= sentence.length()) && ((maxpos + around) <= sentence.length()) : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length();
//assert ((maxpos + around) <= sentence.length()) && ((maxpos + around) <= sentence.length()) : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length();
sentence = "[..] " + sentence.substring(minpos - around, ((maxpos + around) > sentence.length()) ? sentence.length() : (maxpos + around)).trim() + " [..]";
minpos = around;
maxpos = sentence.length() - around - 5;
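
The only non-cosmetic change in the metadata branch above is the replace('-', ' ') on the normalized URL before the hash containment test, apparently so that hyphen-separated path segments can satisfy individual query words. A rough analogue with plain lower-case words instead of YaCy word hashes (containsAllHashes itself is not reproduced, and the URL is a made-up example):

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

final String normalizedUrl = "http://example.com/open-source-search";
final Set<String> queryWords = new HashSet<String>(Arrays.asList("open", "source"));
// split only on URL delimiters and whitespace, assuming the hasher treats a
// hyphenated segment as one word unless the hyphen is turned into a space first
final Set<String> urlWords = new HashSet<String>(
        Arrays.asList(normalizedUrl.replace('-', ' ').toLowerCase().split("[\\s/:.]+")));
System.out.println(urlWords.containsAll(queryWords)); // true; drop the replace('-', ' ') and this becomes false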

View File

@@ -100,7 +100,7 @@ public class yacyPeerSelection {
this.remaining = max;
this.doublecheck = new HashSet<String>();
this.nextSeed = nextInternal();
this.alsoMyOwn = alsoMyOwn && (kelondroBase64Order.enhancedCoder.compare(seedDB.mySeed().hash.getBytes(), nextSeed.hash.getBytes()) > 0);
this.alsoMyOwn = alsoMyOwn && nextSeed != null && (kelondroBase64Order.enhancedCoder.compare(seedDB.mySeed().hash.getBytes(), nextSeed.hash.getBytes()) > 0);
}
public boolean hasNext() {