2006-02-23 14:29:07 +01:00
package xml ;
import java.net.MalformedURLException ;
2006-12-11 02:31:23 +01:00
import java.util.ArrayList ;
2006-02-23 14:29:07 +01:00
import java.util.Set ;
import java.util.TreeSet ;
import de.anomic.http.httpHeader ;
2006-11-10 02:13:33 +01:00
import de.anomic.plasma.plasmaURL ;
2006-02-23 14:29:07 +01:00
import de.anomic.kelondro.kelondroMSetTools ;
2006-09-30 00:27:20 +02:00
import de.anomic.net.URL ;
2006-11-23 03:16:30 +01:00
import de.anomic.plasma.plasmaCondenser ;
2006-02-23 14:29:07 +01:00
import de.anomic.plasma.plasmaSearchQuery ;
import de.anomic.plasma.plasmaSnippetCache ;
import de.anomic.plasma.plasmaSwitchboard ;
import de.anomic.server.serverObjects ;
import de.anomic.server.serverSwitch ;
2006-12-09 03:13:43 +01:00
import de.anomic.server.logging.serverLog ;
2006-02-23 14:29:07 +01:00
public class snippet {
public static serverObjects respond ( httpHeader header , serverObjects post , serverSwitch env ) throws MalformedURLException {
// return variable that accumulates replacements
plasmaSwitchboard switchboard = ( plasmaSwitchboard ) env ;
serverObjects prop = new serverObjects ( ) ;
2007-01-15 20:45:15 +01:00
//get the timeout for snippet-fetching
int mediasnippet_timeout = 15000 ;
int textsnippet_timeout = 10000 ;
mediasnippet_timeout = Integer . parseInt ( ( env . getConfig ( " timeout_text " , " 15000 " ) ) ) ;
textsnippet_timeout = Integer . parseInt ( ( env . getConfig ( " timeout_media " , " 10000 " ) ) ) ;
2006-02-23 14:29:07 +01:00
// getting url
String urlString = post . get ( " url " , " " ) ;
URL url = new URL ( urlString ) ;
2006-12-19 04:10:46 +01:00
prop . put ( " urlHash " , plasmaURL . urlHash ( url ) ) ;
2006-02-23 14:29:07 +01:00
2006-11-25 01:38:09 +01:00
// if 'remove' is set to true, then RWI references to URLs that do not have the snippet are removed
boolean remove = post . get ( " remove " , " false " ) . equals ( " true " ) ;
2006-11-28 16:00:15 +01:00
// boolean line_end_with_punctuation
boolean pre = post . get ( " pre " , " false " ) . equals ( " true " ) ;
2006-12-19 04:10:46 +01:00
// type of media
String media = post . get ( " media " , " text " ) ;
2006-02-23 14:29:07 +01:00
String querystring = post . get ( " search " , " " ) . trim ( ) ;
if ( ( querystring . length ( ) > 2 ) & & ( querystring . charAt ( 0 ) = = '"' ) & & ( querystring . charAt ( querystring . length ( ) - 1 ) = = '"' ) ) {
querystring = querystring . substring ( 1 , querystring . length ( ) - 1 ) . trim ( ) ;
}
final TreeSet query = plasmaSearchQuery . cleanQuery ( querystring ) ;
2006-12-19 04:10:46 +01:00
Set queryHashes = plasmaCondenser . words2hashes ( query ) ;
2006-02-23 14:29:07 +01:00
// filter out stopwords
final TreeSet filtered = kelondroMSetTools . joinConstructive ( query , plasmaSwitchboard . stopwords ) ;
if ( filtered . size ( ) > 0 ) {
kelondroMSetTools . excludeDestructive ( query , plasmaSwitchboard . stopwords ) ;
}
2006-11-28 16:00:15 +01:00
// find snippet
2006-12-19 04:10:46 +01:00
if ( media . equals ( " text " ) ) {
// attach text snippet
2007-01-15 20:45:15 +01:00
plasmaSnippetCache . TextSnippet snippet = switchboard . snippetCache . retrieveTextSnippet ( url , queryHashes , true , pre , 260 , textsnippet_timeout ) ;
2006-12-19 04:10:46 +01:00
prop . put ( " status " , snippet . getSource ( ) ) ;
if ( snippet . getSource ( ) < 11 ) {
//prop.put("text", (snippet.exists()) ? snippet.getLineMarked(queryHashes) : "unknown");
2007-01-16 22:35:25 +01:00
prop . putASIS ( " text " , ( snippet . exists ( ) ) ? snippet . getLineMarked ( queryHashes ) : " unknown " ) ; //FIXME: the ASIS should not be needed, but we have still htmlcode in .java files
2006-12-19 04:10:46 +01:00
} else {
String error = snippet . getError ( ) ;
if ( ( remove ) & & ( error . equals ( " no matching snippet found " ) ) ) {
serverLog . logInfo ( " snippet-fetch " , " no snippet found, remove words ' " + querystring + " ' for url = " + url . toNormalform ( ) ) ;
switchboard . wordIndex . removeReferences ( query , plasmaURL . urlHash ( url ) ) ;
}
prop . put ( " text " , error ) ;
}
prop . put ( " link " , 0 ) ;
prop . put ( " links " , 0 ) ;
2006-02-23 14:29:07 +01:00
} else {
2006-12-19 04:10:46 +01:00
// attach media information
2007-01-15 20:45:15 +01:00
ArrayList mediaSnippets = switchboard . snippetCache . retrieveMediaSnippets ( url , queryHashes , media , true , mediasnippet_timeout ) ;
2006-12-19 04:10:46 +01:00
plasmaSnippetCache . MediaSnippet ms ;
for ( int i = 0 ; i < mediaSnippets . size ( ) ; i + + ) {
ms = ( plasmaSnippetCache . MediaSnippet ) mediaSnippets . get ( i ) ;
prop . put ( " link_ " + i + " _type " , ms . type ) ;
prop . put ( " link_ " + i + " _href " , ms . href ) ;
prop . put ( " link_ " + i + " _name " , ms . name ) ;
prop . put ( " link_ " + i + " _attr " , ms . attr ) ;
2006-10-02 03:40:52 +02:00
}
2006-12-20 16:44:29 +01:00
//System.out.println("DEBUG: " + mediaSnippets.size() + " ENTRIES IN MEDIA SNIPPET LINKS for url " + urlString);
2006-12-19 04:10:46 +01:00
prop . put ( " text " , " " ) ;
prop . put ( " link " , mediaSnippets . size ( ) ) ;
prop . put ( " links " , mediaSnippets . size ( ) ) ;
2006-12-11 02:31:23 +01:00
}
2006-12-10 00:15:58 +01:00
2006-02-23 14:29:07 +01:00
// return rewrite properties
return prop ;
}
}