mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
Fixed a problem with attribute flags on RWI entries that prevented proper selection of the index-of constraint.
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5437 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
6072831235
commit
c4c4c223b9
|
@ -31,6 +31,7 @@
|
|||
<option value="plain"#(vMode-plain)#:: selected="selected"#(/vMode-plain)#>Plain Text</option>
|
||||
<option value="parsed"#(vMode-parsed)#:: selected="selected"#(/vMode-parsed)#>Parsed Text</option>
|
||||
<option value="sentences"#(vMode-sentences)#:: selected="selected"#(/vMode-sentences)#>Parsed Sentences</option>
|
||||
<option value="words"#(vMode-words)#:: selected="selected"#(/vMode-words)#>Parsed Tokens/Words</option>
|
||||
<option value="links"#(vMode-links)#:: selected="selected"#(/vMode-links)#>Link List</option>
|
||||
</select>
|
||||
<input type="submit" name="show" value="Show" />
|
||||
|
@ -93,6 +94,12 @@
|
|||
<td class="tt">#[attr]#</tt></td>
|
||||
</tr>#{/links}#
|
||||
</table>
|
||||
:: <!-- 6 -->
|
||||
<fieldset><legend>Parsed Tokens</legend>
|
||||
<ol>#{words}#
|
||||
<li class="tt">#[word]#</li>#{/words}#
|
||||
</ol>
|
||||
</fieldset>
|
||||
#(/viewMode)#
|
||||
</p>
|
||||
|
||||
|
|
|
@ -29,6 +29,7 @@ import java.io.InputStream;
|
|||
import java.io.UnsupportedEncodingException;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URLDecoder;
|
||||
import java.util.Enumeration;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
|
@ -59,6 +60,7 @@ public class ViewFile {
|
|||
public static final int VIEW_MODE_AS_PARSED_SENTENCES = 3;
|
||||
public static final int VIEW_MODE_AS_IFRAME = 4;
|
||||
public static final int VIEW_MODE_AS_LINKLIST = 5;
|
||||
public static final int VIEW_MODE_AS_PARSED_WORDS = 6;
|
||||
|
||||
private static final String HIGHLIGHT_CSS = "searchHighlight";
|
||||
private static final int MAX_HIGHLIGHTS = 6;
|
||||
|
@ -240,7 +242,7 @@ public class ViewFile {
|
|||
prop.put("viewMode", VIEW_MODE_AS_IFRAME);
|
||||
prop.put("viewMode_url", url.toNormalform(false, true));
|
||||
|
||||
} else if (viewMode.equals("parsed") || viewMode.equals("sentences") || viewMode.equals("links")) {
|
||||
} else if (viewMode.equals("parsed") || viewMode.equals("sentences") || viewMode.equals("words") || viewMode.equals("links")) {
|
||||
// parsing the resource content
|
||||
plasmaParserDocument document = null;
|
||||
try {
|
||||
|
@ -297,6 +299,33 @@ public class ViewFile {
|
|||
}
|
||||
prop.put("viewMode_sentences", i);
|
||||
|
||||
} else if (viewMode.equals("words")) {
|
||||
prop.put("viewMode", VIEW_MODE_AS_PARSED_WORDS);
|
||||
final Iterator<StringBuilder> sentences = document.getSentences(pre);
|
||||
|
||||
boolean dark = true;
|
||||
int i = 0;
|
||||
String sentence, token;
|
||||
if (sentences != null) {
|
||||
|
||||
// Search word highlighting
|
||||
while (sentences.hasNext()) {
|
||||
sentence = sentences.next().toString();
|
||||
Enumeration<StringBuilder> tokens = plasmaCondenser.wordTokenizer(sentence, "UTF-8");
|
||||
while (tokens.hasMoreElements()) {
|
||||
token = tokens.nextElement().toString();
|
||||
if (token.length() > 0) {
|
||||
prop.put("viewMode_words_" + i + "_nr", i + 1);
|
||||
prop.put("viewMode_words_" + i + "_word", token);
|
||||
prop.put("viewMode_words_" + i + "_dark", dark ? "1" : "0");
|
||||
dark = !dark;
|
||||
i++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
prop.put("viewMode_words", i);
|
||||
|
||||
} else if (viewMode.equals("links")) {
|
||||
prop.put("viewMode", VIEW_MODE_AS_LINKLIST);
|
||||
boolean dark = true;
|
||||
|
|
|
@ -108,13 +108,15 @@ public final class plasmaCondenser {
|
|||
this.wordcut = 2;
|
||||
this.words = new TreeMap<String, indexWord>();
|
||||
this.RESULT_FLAGS = new kelondroBitfield(4);
|
||||
|
||||
// construct flag set for document
|
||||
if (document.getImages().size() > 0) RESULT_FLAGS.set(flag_cat_hasimage, true);
|
||||
if (document.getAudiolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasaudio, true);
|
||||
if (document.getVideolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasvideo, true);
|
||||
if (document.getApplinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasapp, true);
|
||||
|
||||
this.languageIdentificator = new Identificator();
|
||||
|
||||
//System.out.println("DEBUG: condensing " + document.getMainLongTitle() + ", indexText=" + Boolean.toString(indexText) + ", indexMedia=" + Boolean.toString(indexMedia));
|
||||
|
||||
// add the URL components to the word list
|
||||
insertTextToWords(document.dc_source().toNormalform(false, true), 0, indexRWIEntry.flag_app_dc_identifier, RESULT_FLAGS, false);
|
||||
|
||||
Map.Entry<yacyURL, String> entry;
|
||||
if (indexText) {
|
||||
|
@ -161,6 +163,9 @@ public final class plasmaCondenser {
|
|||
this.RESULT_DIFF_SENTENCES = 0;
|
||||
}
|
||||
|
||||
// add the URL components to the word list
|
||||
insertTextToWords(document.dc_source().toNormalform(false, true), 0, indexRWIEntry.flag_app_dc_identifier, RESULT_FLAGS, false);
|
||||
|
||||
if (indexMedia) {
|
||||
// add anchor descriptions: here, we also add the url components
|
||||
// audio
|
||||
|
@ -209,12 +214,6 @@ public final class plasmaCondenser {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
// construct flag set for document
|
||||
if (document.getImages().size() > 0) RESULT_FLAGS.set(flag_cat_hasimage, true);
|
||||
if (document.getAudiolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasaudio, true);
|
||||
if (document.getVideolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasvideo, true);
|
||||
if (document.getApplinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasapp, true);
|
||||
}
|
||||
|
||||
private void insertTextToWords(final String text, final int phrase, final int flagpos, final kelondroBitfield flagstemplate, boolean useForLanguageIdentification) {
|
||||
|
@ -360,7 +359,7 @@ public final class plasmaCondenser {
|
|||
this.RESULT_FLAGS.set(flag_cat_indexof, true);
|
||||
wordenum.pre(true); // parse lines as they come with CRLF
|
||||
}
|
||||
if ((last_index) && (word.equals("of"))) comb_indexof = true;
|
||||
if ((last_index) && (wordminsize > 2 || (word.equals("of")))) comb_indexof = true;
|
||||
last_last = word.equals("last");
|
||||
last_index = word.equals("index");
|
||||
|
||||
|
@ -491,10 +490,10 @@ public final class plasmaCondenser {
|
|||
else
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
public static Enumeration<StringBuilder> wordTokenizer(final String s, final String charset) {
|
||||
try {
|
||||
return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes("UTF-8")));
|
||||
return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes(charset)));
|
||||
} catch (final Exception e) {
|
||||
return null;
|
||||
}
|
||||
|
|
|
@ -330,13 +330,9 @@ public class plasmaSnippetCache {
|
|||
// trying to load the resource from the cache
|
||||
resContent = plasmaHTCache.getResourceContentStream(url);
|
||||
responseHeader = plasmaHTCache.loadResponseHeader(url);
|
||||
if (resContent != null) {
|
||||
// if the content was found
|
||||
resContentLength = plasmaHTCache.getResourceContentLength(url);
|
||||
if ((resContentLength > maxDocLen) && (!fetchOnline)) {
|
||||
// content may be too large to be parsed here. To be fast, we omit calculation of snippet here
|
||||
return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "resource available, but too large: " + resContentLength + " bytes");
|
||||
}
|
||||
if (resContent != null && ((resContentLength = plasmaHTCache.getResourceContentLength(url)) > maxDocLen) && (!fetchOnline)) {
|
||||
// content may be too large to be parsed here. To be fast, we omit calculation of snippet here
|
||||
return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "resource available, but too large: " + resContentLength + " bytes");
|
||||
} else if (containsAllHashes(comp.dc_title(), queryhashes)) {
|
||||
// try to create the snippet from information given in the url itself
|
||||
return new TextSnippet(url, (comp.dc_subject().length() > 0) ? comp.dc_creator() : comp.dc_subject(), SOURCE_METADATA, null, null, faviconCache.get(url.hash()));
|
||||
|
@ -346,7 +342,7 @@ public class plasmaSnippetCache {
|
|||
} else if (containsAllHashes(comp.dc_subject(), queryhashes)) {
|
||||
// try to create the snippet from information given in the subject metadata
|
||||
return new TextSnippet(url, (comp.dc_creator().length() > 0) ? comp.dc_creator() : comp.dc_subject(), SOURCE_METADATA, null, null, faviconCache.get(url.hash()));
|
||||
} else if (containsAllHashes(comp.url().toNormalform(true, true), queryhashes)) {
|
||||
} else if (containsAllHashes(comp.url().toNormalform(true, true).replace('-', ' '), queryhashes)) {
|
||||
// try to create the snippet from information given in the subject metadata
|
||||
return new TextSnippet(url, (comp.dc_creator().length() > 0) ? comp.dc_creator() : comp.dc_subject(), SOURCE_METADATA, null, null, faviconCache.get(url.hash()));
|
||||
} else if (fetchOnline) {
|
||||
|
@ -673,7 +669,7 @@ public class plasmaSnippetCache {
|
|||
final int newlen = Math.max(10, maxpos - minpos + 10);
|
||||
final int around = (maxLength - newlen) / 2;
|
||||
assert minpos - around < sentence.length() : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length();
|
||||
assert ((maxpos + around) <= sentence.length()) && ((maxpos + around) <= sentence.length()) : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length();
|
||||
//assert ((maxpos + around) <= sentence.length()) && ((maxpos + around) <= sentence.length()) : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length();
|
||||
sentence = "[..] " + sentence.substring(minpos - around, ((maxpos + around) > sentence.length()) ? sentence.length() : (maxpos + around)).trim() + " [..]";
|
||||
minpos = around;
|
||||
maxpos = sentence.length() - around - 5;
|
||||
|
|
|
@ -100,7 +100,7 @@ public class yacyPeerSelection {
|
|||
this.remaining = max;
|
||||
this.doublecheck = new HashSet<String>();
|
||||
this.nextSeed = nextInternal();
|
||||
this.alsoMyOwn = alsoMyOwn && (kelondroBase64Order.enhancedCoder.compare(seedDB.mySeed().hash.getBytes(), nextSeed.hash.getBytes()) > 0);
|
||||
this.alsoMyOwn = alsoMyOwn && nextSeed != null && (kelondroBase64Order.enhancedCoder.compare(seedDB.mySeed().hash.getBytes(), nextSeed.hash.getBytes()) > 0);
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
|
|
Loading…
Reference in New Issue
Block a user