yacy_search_server/htroot/xml/snippet.java
orbiter 861f41e67e redesigned NURL-handling:
- the general NURL-index for all crawl stack types was split into separate indexes for these stacks
- the new NURL-index is managed by the crawl balancer
- the crawl balancer does not need an internal index any more, it is replaced by the NURL-index
- the NURL.Entry was generalized and is now a new class plasmaCrawlEntry
- the new class plasmaCrawlEntry also replaces the preNURL.Entry class, and will also replace the switchboardEntry class in the future
- the new class plasmaCrawlEntry is more accurate for date entries (holds milliseconds) and can contain larger 'name' entries (anchor tag names)
- the EURL object was replaced by a new ZURL object, which is a container for the plasmaCrawlEntry and some tracking information
- the EURL index is now filled with ZURL objects
- a new index delegatedURL holds ZURL objects about plasmaCrawlEntry objects to track which url is handed over to other peers
- redesigned handling of plasmaCrawlEntry - handover, because there is no need any more to convert one entry object into another
- found and fixed numerous bugs in the context of crawl state handling
- fixed a serious bug in kelondroCache which prevented entries from being removed
- fixed some bugs in online interface and adapted monitor output to new entry objects
- adapted yacy protocol to handle new delegatedURL entries
all old crawl queues will disappear after this update!

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3483 6c8d7289-2bf4-0310-a012-ef5d649a1542
2007-03-16 13:25:56 +00:00

95 lines
4.4 KiB
Java

package xml;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Set;
import java.util.TreeSet;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaURL;
import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCondenser;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
/**
 * Servlet class behind the <code>xml/snippet</code> page.
 *
 * Given a URL and a search string, it asks the switchboard's snippet cache
 * for either a text snippet or a list of media links and writes the result
 * into the template properties returned to the caller.
 */
public class snippet {

    /** Fallback used when the "timeout_media" setting is missing or not a number. */
    private static final int DEFAULT_MEDIASNIPPET_TIMEOUT = 15000;

    /** Fallback used when the "timeout_text" setting is missing or not a number. */
    private static final int DEFAULT_TEXTSNIPPET_TIMEOUT = 10000;

    /**
     * Builds the template properties for a snippet request.
     *
     * Request parameters read from <code>post</code>:
     * "url" (document to take the snippet from), "search" (query string,
     * optionally wrapped in double quotes), "media" (defaults to "text"),
     * "pre" and "remove" (both treated as true only for the literal "true").
     *
     * Properties written: "urlHash", "text", "link", "links", and for the
     * text case "status"; for the media case one "link_i_type/href/name/attr"
     * group per media snippet.
     *
     * @param header the request header; not used by this method
     * @param post   the request parameters
     *               (NOTE(review): dereferenced without a null check, as before - confirm callers never pass null)
     * @param env    the server environment; cast to plasmaSwitchboard
     * @return the properties used to fill the page template
     * @throws MalformedURLException if the "url" parameter cannot be parsed by the URL constructor
     */
    public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) throws MalformedURLException {
        // return variable that accumulates replacements
        final plasmaSwitchboard switchboard = (plasmaSwitchboard) env;
        final serverObjects prop = new serverObjects();

        // get the timeouts for snippet-fetching; each snippet type reads its OWN key
        // (the keys used to be swapped, so text fetches ran with the media timeout and vice versa)
        final int mediasnippet_timeout = configuredTimeout(env, "timeout_media", DEFAULT_MEDIASNIPPET_TIMEOUT);
        final int textsnippet_timeout = configuredTimeout(env, "timeout_text", DEFAULT_TEXTSNIPPET_TIMEOUT);

        // getting url
        final String urlString = post.get("url", "");
        final URL url = new URL(urlString);
        prop.put("urlHash", plasmaURL.urlHash(url));

        // if 'remove' is set to true, then RWI references to URLs that do not have the snippet are removed
        final boolean remove = post.get("remove", "false").equals("true");

        // passed through to retrieveTextSnippet unchanged
        final boolean pre = post.get("pre", "false").equals("true");

        // type of media
        final String media = post.get("media", "text");

        // strip one pair of enclosing double quotes from the search string
        String querystring = post.get("search", "").trim();
        if ((querystring.length() > 2) && (querystring.charAt(0) == '"') && (querystring.charAt(querystring.length() - 1) == '"')) {
            querystring = querystring.substring(1, querystring.length() - 1).trim();
        }
        final TreeSet query = plasmaSearchQuery.cleanQuery(querystring);
        final Set queryHashes = plasmaCondenser.words2hashes(query);

        // filter out stopwords
        // NOTE(review): queryHashes is computed before this filtering and 'query' is not read
        // again below, so the removal appears to have no effect on the snippet lookup - confirm
        // whether the hashes were meant to be computed after this step.
        final TreeSet filtered = kelondroMSetTools.joinConstructive(query, plasmaSwitchboard.stopwords);
        if (filtered.size() > 0) {
            kelondroMSetTools.excludeDestructive(query, plasmaSwitchboard.stopwords);
        }

        // find snippet
        if (media.equals("text")) {
            // attach text snippet
            final plasmaSnippetCache.TextSnippet snippet = switchboard.snippetCache.retrieveTextSnippet(url, queryHashes, true, pre, 260, textsnippet_timeout);
            prop.put("status", snippet.getErrorCode());
            if (snippet.getErrorCode() < 11) {
                // no problems occurred
                prop.putASIS("text", (snippet.exists()) ? snippet.getLineMarked(queryHashes) : "unknown"); //FIXME: the ASIS should not be needed, but we have still htmlcode in .java files
            } else {
                // problems with snippet fetch
                prop.put("text", (remove) ? switchboard.snippetCache.failConsequences(snippet, queryHashes) : snippet.getError());
            }
            prop.put("link", 0);
            prop.put("links", 0);
        } else {
            // attach media information
            final ArrayList mediaSnippets = switchboard.snippetCache.retrieveMediaSnippets(url, queryHashes, media, true, mediasnippet_timeout);
            plasmaSnippetCache.MediaSnippet ms;
            for (int i = 0; i < mediaSnippets.size(); i++) {
                ms = (plasmaSnippetCache.MediaSnippet) mediaSnippets.get(i);
                prop.put("link_" + i + "_type", ms.type);
                prop.put("link_" + i + "_href", ms.href);
                prop.put("link_" + i + "_name", ms.name);
                prop.put("link_" + i + "_attr", ms.attr);
            }
            prop.put("text", "");
            prop.put("link", mediaSnippets.size());
            prop.put("links", mediaSnippets.size());
        }

        // return rewrite properties
        return prop;
    }

    /**
     * Reads an integer timeout from the server configuration.
     *
     * @param env      the server environment to read the setting from
     * @param key      the configuration key
     * @param fallback the value to use when the setting is absent or not a valid integer
     * @return the configured value, or <code>fallback</code> if it cannot be parsed
     */
    private static int configuredTimeout(serverSwitch env, String key, int fallback) {
        try {
            return Integer.parseInt(env.getConfig(key, Integer.toString(fallback)));
        } catch (NumberFormatException e) {
            // a broken config value should not abort the request; use the built-in default
            return fallback;
        }
    }
}