fixed a problem with attribute flags on RWI entries that prevented proper selection of the index-of constraint

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5437 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 2009-01-04 02:27:29 +00:00
parent 6072831235
commit c4c4c223b9
5 changed files with 55 additions and 24 deletions
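
The substance of the fix, as far as the diffs below show, is an ordering change in plasmaCondenser: the block that sets the document category flags (has-image, has-audio, has-video, has-app) on RESULT_FLAGS was moved from the end of the constructor to a point before any call to insertTextToWords, and the index-of detection no longer insists on literally seeing the token "of". The likely reason for the move is that each per-word RWI entry takes its copy of the flag template at insertion time, so flags set afterwards never reach the entries and a flag-based constraint such as index-of can never select them. A minimal sketch of that ordering effect, using hypothetical stand-ins (SimpleBitfield and WordEntry are illustrations, not YaCy's kelondroBitfield or indexRWIEntry):

import java.util.ArrayList;
import java.util.List;

public class FlagOrderingSketch {

    // minimal stand-in for a flag bitfield; YaCy's kelondroBitfield is richer
    static class SimpleBitfield {
        private boolean[] bits = new boolean[32];
        void set(int pos, boolean value) { bits[pos] = value; }
        boolean get(int pos) { return bits[pos]; }
        SimpleBitfield copy() {
            SimpleBitfield c = new SimpleBitfield();
            c.bits = this.bits.clone();
            return c;
        }
    }

    // minimal stand-in for a per-word entry that snapshots the flag template
    static class WordEntry {
        final String word;
        final SimpleBitfield flags;
        WordEntry(String word, SimpleBitfield template) {
            this.word = word;
            this.flags = template.copy(); // flags are fixed at insertion time
        }
    }

    static final int FLAG_HAS_IMAGE = 7; // arbitrary bit position for the sketch

    public static void main(String[] args) {
        List<WordEntry> index = new ArrayList<WordEntry>();

        // old order: insert words first, set the document flags afterwards
        SimpleBitfield lateFlags = new SimpleBitfield();
        index.add(new WordEntry("index", lateFlags));
        lateFlags.set(FLAG_HAS_IMAGE, true);
        System.out.println(index.get(0).flags.get(FLAG_HAS_IMAGE)); // false: the entry missed the flag

        // fixed order: populate the template before any word is inserted
        SimpleBitfield earlyFlags = new SimpleBitfield();
        earlyFlags.set(FLAG_HAS_IMAGE, true);
        index.add(new WordEntry("index", earlyFlags));
        System.out.println(index.get(1).flags.get(FLAG_HAS_IMAGE)); // true: a constraint can match it
    }
}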

View File

@@ -31,6 +31,7 @@
<option value="plain"#(vMode-plain)#:: selected="selected"#(/vMode-plain)#>Plain Text</option>
<option value="parsed"#(vMode-parsed)#:: selected="selected"#(/vMode-parsed)#>Parsed Text</option>
<option value="sentences"#(vMode-sentences)#:: selected="selected"#(/vMode-sentences)#>Parsed Sentences</option>
<option value="words"#(vMode-words)#:: selected="selected"#(/vMode-words)#>Parsed Tokens/Words</option>
<option value="links"#(vMode-links)#:: selected="selected"#(/vMode-links)#>Link List</option>
</select>
<input type="submit" name="show" value="Show" />
@@ -93,6 +94,12 @@
<td class="tt">#[attr]#</td>
</tr>#{/links}#
</table>
:: <!-- 6 -->
<fieldset><legend>Parsed Tokens</legend>
<ol>#{words}#
<li class="tt">#[word]#</li>#{/words}#
</ol>
</fieldset>
#(/viewMode)#
</p>
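
For context on the template syntax: in YaCy servlet templates #[name]# is replaced by a single value, #{list}#...#{/list}# repeats once per numbered entry, and #(switch)#...::...#(/switch)# picks one alternative by the integer stored under the switch key; the new block above is alternative 6 of #(viewMode)#. A hedged sketch of the serverObjects keys the servlet is expected to write for it (the put(...) calls mirror those in the ViewFile.java diff below; the surrounding servlet scaffolding is omitted):

// prop stands for the servlet's serverObjects instance, as in ViewFile.java below
prop.put("viewMode", VIEW_MODE_AS_PARSED_WORDS); // selects the ":: <!-- 6 -->" alternative above
prop.put("viewMode_words", 3);                   // number of #{words}# repetitions
prop.put("viewMode_words_0_word", "Index");      // fills #[word]# of list entry 0
prop.put("viewMode_words_1_word", "of");
prop.put("viewMode_words_2_word", "/pub");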

View File

@@ -29,6 +29,7 @@ import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URLDecoder;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
@@ -59,6 +60,7 @@ public class ViewFile {
public static final int VIEW_MODE_AS_PARSED_SENTENCES = 3;
public static final int VIEW_MODE_AS_IFRAME = 4;
public static final int VIEW_MODE_AS_LINKLIST = 5;
public static final int VIEW_MODE_AS_PARSED_WORDS = 6;
private static final String HIGHLIGHT_CSS = "searchHighlight";
private static final int MAX_HIGHLIGHTS = 6;
@@ -240,7 +242,7 @@ public class ViewFile {
prop.put("viewMode", VIEW_MODE_AS_IFRAME);
prop.put("viewMode_url", url.toNormalform(false, true));
} else if (viewMode.equals("parsed") || viewMode.equals("sentences") || viewMode.equals("links")) {
} else if (viewMode.equals("parsed") || viewMode.equals("sentences") || viewMode.equals("words") || viewMode.equals("links")) {
// parsing the resource content
plasmaParserDocument document = null;
try {
@@ -297,6 +299,33 @@
}
prop.put("viewMode_sentences", i);
} else if (viewMode.equals("words")) {
prop.put("viewMode", VIEW_MODE_AS_PARSED_WORDS);
final Iterator<StringBuilder> sentences = document.getSentences(pre);
boolean dark = true;
int i = 0;
String sentence, token;
if (sentences != null) {
// Search word highlighting
while (sentences.hasNext()) {
sentence = sentences.next().toString();
Enumeration<StringBuilder> tokens = plasmaCondenser.wordTokenizer(sentence, "UTF-8");
while (tokens.hasMoreElements()) {
token = tokens.nextElement().toString();
if (token.length() > 0) {
prop.put("viewMode_words_" + i + "_nr", i + 1);
prop.put("viewMode_words_" + i + "_word", token);
prop.put("viewMode_words_" + i + "_dark", dark ? "1" : "0");
dark = !dark;
i++;
}
}
}
}
prop.put("viewMode_words", i);
} else if (viewMode.equals("links")) {
prop.put("viewMode", VIEW_MODE_AS_LINKLIST);
boolean dark = true;
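
The "words" branch above is essentially sentence iteration plus plasmaCondenser.wordTokenizer. A stripped-down usage sketch of just that loop, relying only on the signatures visible in this commit (not standalone, it needs plasmaCondenser on the classpath; note that the charset argument only takes effect after the plasmaCondenser change further down, the old code always decoded as UTF-8):

import java.util.Enumeration;

final String sentence = "Index of /pub/mirrors";
final Enumeration<StringBuilder> tokens = plasmaCondenser.wordTokenizer(sentence, "UTF-8");
if (tokens != null) { // wordTokenizer returns null when decoding with the charset fails
    while (tokens.hasMoreElements()) {
        final String token = tokens.nextElement().toString();
        if (token.length() > 0) System.out.println(token);
    }
}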

View File

@@ -108,13 +108,15 @@ public final class plasmaCondenser {
this.wordcut = 2;
this.words = new TreeMap<String, indexWord>();
this.RESULT_FLAGS = new kelondroBitfield(4);
// construct flag set for document
if (document.getImages().size() > 0) RESULT_FLAGS.set(flag_cat_hasimage, true);
if (document.getAudiolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasaudio, true);
if (document.getVideolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasvideo, true);
if (document.getApplinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasapp, true);
this.languageIdentificator = new Identificator();
//System.out.println("DEBUG: condensing " + document.getMainLongTitle() + ", indexText=" + Boolean.toString(indexText) + ", indexMedia=" + Boolean.toString(indexMedia));
// add the URL components to the word list
insertTextToWords(document.dc_source().toNormalform(false, true), 0, indexRWIEntry.flag_app_dc_identifier, RESULT_FLAGS, false);
Map.Entry<yacyURL, String> entry;
if (indexText) {
@@ -161,6 +163,9 @@
this.RESULT_DIFF_SENTENCES = 0;
}
// add the URL components to the word list
insertTextToWords(document.dc_source().toNormalform(false, true), 0, indexRWIEntry.flag_app_dc_identifier, RESULT_FLAGS, false);
if (indexMedia) {
// add anchor descriptions: here, we also add the url components
// audio
@@ -209,12 +214,6 @@
}
}
}
// construct flag set for document
if (document.getImages().size() > 0) RESULT_FLAGS.set(flag_cat_hasimage, true);
if (document.getAudiolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasaudio, true);
if (document.getVideolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasvideo, true);
if (document.getApplinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasapp, true);
}
private void insertTextToWords(final String text, final int phrase, final int flagpos, final kelondroBitfield flagstemplate, boolean useForLanguageIdentification) {
@@ -360,7 +359,7 @@
this.RESULT_FLAGS.set(flag_cat_indexof, true);
wordenum.pre(true); // parse lines as they come with CRLF
}
if ((last_index) && (word.equals("of"))) comb_indexof = true;
if ((last_index) && (wordminsize > 2 || (word.equals("of")))) comb_indexof = true;
last_last = word.equals("last");
last_index = word.equals("index");
@@ -491,10 +490,10 @@
else
return true;
}
public static Enumeration<StringBuilder> wordTokenizer(final String s, final String charset) {
try {
return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes("UTF-8")));
return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes(charset)));
} catch (final Exception e) {
return null;
}
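
The one-line change to the comb_indexof condition above is the part that matches the commit message most directly: when wordminsize is larger than 2 the sieved word enumerator never delivers the two-letter token "of", so the old test could not fire and "Index of" listings were never flagged. The new condition accepts a preceding "index" on its own in that case. A simplified, hedged restatement of the heuristic outside the condenser (the token stream and wordminsize value are made up for the example):

boolean lastIndex = false;
boolean combIndexof = false;
final int wordminsize = 3; // a sieve this strict drops two-letter words such as "of"
for (final String word : new String[] {"index", "mirrors", "pub"}) {
    // old rule: lastIndex && word.equals("of") -- unreachable once "of" is sieved out
    // new rule: a strict sieve plus a preceding "index" is already enough
    if (lastIndex && (wordminsize > 2 || word.equals("of"))) combIndexof = true;
    lastIndex = word.equals("index");
}
System.out.println(combIndexof); // true under the new rule, false under the old one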

View File

@@ -330,13 +330,9 @@ public class plasmaSnippetCache {
// trying to load the resource from the cache
resContent = plasmaHTCache.getResourceContentStream(url);
responseHeader = plasmaHTCache.loadResponseHeader(url);
if (resContent != null) {
// if the content was found
resContentLength = plasmaHTCache.getResourceContentLength(url);
if ((resContentLength > maxDocLen) && (!fetchOnline)) {
// content may be too large to be parsed here. To be fast, we omit calculation of snippet here
return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "resource available, but too large: " + resContentLength + " bytes");
}
if (resContent != null && ((resContentLength = plasmaHTCache.getResourceContentLength(url)) > maxDocLen) && (!fetchOnline)) {
// content may be too large to be parsed here. To be fast, we omit calculation of snippet here
return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "resource available, but too large: " + resContentLength + " bytes");
} else if (containsAllHashes(comp.dc_title(), queryhashes)) {
// try to create the snippet from information given in the url itself
return new TextSnippet(url, (comp.dc_subject().length() > 0) ? comp.dc_creator() : comp.dc_subject(), SOURCE_METADATA, null, null, faviconCache.get(url.hash()));
@@ -346,7 +342,7 @@
} else if (containsAllHashes(comp.dc_subject(), queryhashes)) {
// try to create the snippet from information given in the subject metadata
return new TextSnippet(url, (comp.dc_creator().length() > 0) ? comp.dc_creator() : comp.dc_subject(), SOURCE_METADATA, null, null, faviconCache.get(url.hash()));
} else if (containsAllHashes(comp.url().toNormalform(true, true), queryhashes)) {
} else if (containsAllHashes(comp.url().toNormalform(true, true).replace('-', ' '), queryhashes)) {
// try to create the snippet from information given in the subject metadata
return new TextSnippet(url, (comp.dc_creator().length() > 0) ? comp.dc_creator() : comp.dc_subject(), SOURCE_METADATA, null, null, faviconCache.get(url.hash()));
} else if (fetchOnline) {
@@ -673,7 +669,7 @@
final int newlen = Math.max(10, maxpos - minpos + 10);
final int around = (maxLength - newlen) / 2;
assert minpos - around < sentence.length() : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length();
assert ((maxpos + around) <= sentence.length()) && ((maxpos + around) <= sentence.length()) : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length();
//assert ((maxpos + around) <= sentence.length()) && ((maxpos + around) <= sentence.length()) : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length();
sentence = "[..] " + sentence.substring(minpos - around, ((maxpos + around) > sentence.length()) ? sentence.length() : (maxpos + around)).trim() + " [..]";
minpos = around;
maxpos = sentence.length() - around - 5;
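
The only non-cosmetic change in the metadata branch above is the replace('-', ' ') on the normalized URL before the hash containment test, apparently so that hyphen-separated path segments can satisfy individual query words. A rough analogue with plain lower-case words instead of YaCy word hashes (containsAllHashes itself is not reproduced, and the URL is a made-up example):

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

final String normalizedUrl = "http://example.com/open-source-search";
final Set<String> queryWords = new HashSet<String>(Arrays.asList("open", "source"));
// split only on URL delimiters and whitespace, assuming the hasher treats a
// hyphenated segment as one word unless the hyphen is turned into a space first
final Set<String> urlWords = new HashSet<String>(
        Arrays.asList(normalizedUrl.replace('-', ' ').toLowerCase().split("[\\s/:.]+")));
System.out.println(urlWords.containsAll(queryWords)); // true; drop the replace('-', ' ') and this becomes false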

View File

@@ -100,7 +100,7 @@ public class yacyPeerSelection {
this.remaining = max;
this.doublecheck = new HashSet<String>();
this.nextSeed = nextInternal();
this.alsoMyOwn = alsoMyOwn && (kelondroBase64Order.enhancedCoder.compare(seedDB.mySeed().hash.getBytes(), nextSeed.hash.getBytes()) > 0);
this.alsoMyOwn = alsoMyOwn && nextSeed != null && (kelondroBase64Order.enhancedCoder.compare(seedDB.mySeed().hash.getBytes(), nextSeed.hash.getBytes()) > 0);
}
public boolean hasNext() {