- enhanced description on search front page

- fixed language and heuristic modifier
- added a hint to the crawl start page that ftp and smb crawls are also possible
- added a protocol extension to remote crawls to transport all search modifiers to remote peers

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@8108 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
orbiter 2011-11-26 13:40:33 +00:00
parent 2512119e5f
commit ebd840ebf6
10 changed files with 126 additions and 62 deletions

View File

@ -41,7 +41,7 @@
<td>
<table cellpadding="0" cellspacing="0">
<tr>
<td><label for="url"><span class="nobr">From URL</span></label>:</td>
<td><label for="url"><span class="nobr">From URL<br/>(http/https/ftp/smb/file)</span></label>:</td>
<td><input type="radio" name="crawlingMode" id="url" value="url" checked="checked" /></td>
<td>
<input name="crawlingURL" id="crawlingURL" type="text" size="41" maxlength="256" value="#[starturl]#" onkeypress="changed()" onfocus="check('url')" />

View File

@ -39,7 +39,7 @@
<dd>
<table border="0" cellpadding="0" cellspacing="0"><tr valign="top">
<td valign="top"><input type="radio" name="crawlingMode" id="url" value="url" checked="checked"
onmousedown="document.getElementById('rangeDomain').disabled=false;document.getElementById('rangeSubpath').disabled=false;document.getElementById('crawlingDomMaxCheck').disabled=false;document.getElementById('crawlingDomMaxPages').disabled=false;document.getElementById('crawlingQ').disabled=false;"/>Start URL</td>
onmousedown="document.getElementById('rangeDomain').disabled=false;document.getElementById('rangeSubpath').disabled=false;document.getElementById('crawlingDomMaxCheck').disabled=false;document.getElementById('crawlingDomMaxPages').disabled=false;document.getElementById('crawlingQ').disabled=false;"/>Start URL<br/>(http/https/ftp/smb/file)</td>
<td valign="top">
<input name="crawlingURL" id="crawlingURL" type="text" size="50" maxlength="256" value="#[starturl]#" onkeypress="changed()" onfocus="check('url')" style="font-size:16px"/><br/>
<input name="bookmarkTitle" id="bookmarkTitle" type="text" size="50" maxlength="256" value="" readonly="readonly" style="background:transparent; border:0px"/>

View File

@ -98,7 +98,7 @@
<table>
<tr>
<td><label for="count">Max. number of results</label>:</td>
<td><label for="count">Results per page</label>:</td>
<td>
<input type="radio" name="maximumRecords" value="10" #(count-10)#::checked="checked"#(/count-10)#/>10
<input type="radio" name="maximumRecords" value="50" #(count-50)#::checked="checked"#(/count-50)#/>50
@ -109,8 +109,8 @@
#(resource-select)#::
<td><label for="resource">Resource</label>:</td>
<td>
<input type="radio" name="resource" value="global" #(global)#::checked="checked"#(/global)# #(global-disabled)#::disabled="disabled"#(/global-disabled)#/>global
<input type="radio" name="resource" value="local" #(local)#::checked="checked"#(/local)#/>local
<input type="radio" name="resource" value="global" #(global)#::checked="checked"#(/global)# #(global-disabled)#::disabled="disabled"#(/global-disabled)#/>the peer-to-peer network
<input type="radio" name="resource" value="local" #(local)#::checked="checked"#(/local)#/>only the local index
</td>
#(/resource-select)#
</tr>
@ -145,22 +145,14 @@
<input type="checkbox" id="indexof" name="indexof" #[indexofChecked]# /> <label for="indexof">only index pages</label>
</td>
</tr>
<tr><td></td><td></td></tr>
<tr>
<td>
<a href="http://www.yacy-websuche.de/wiki/index.php/En:SearchParameters">help: advanced parameters</a>
</td>
<td>
</td>
</tr>
</table>
<h4>Query Operators</h4>
<dl style="width:800px">
<dt>restrictions</dt>
<dd>
<dl style="width:600px">
<dl style="width:700px">
<dt style="width:100px">restrictions</dt>
<dd>
<dl style="width:500px">
<dt>inurl:&lt;phrase&gt;</dt>
<dd>only urls with the &lt;phrase&gt; in the url</dd>
<dt>filetype:&lt;ext&gt;</dt>
@ -171,39 +163,64 @@
<dd>only pages with as-author-anotated &lt;author&gt;</dd>
<dt>tld:&lt;tld&gt;</dt>
<dd>only pages from top-level-domains &lt;tld&gt;</dd>
<dt>/http or /https</dt>
<dt>/http</dt>
<dd>only resources from http or https servers</dd>
<dt>/ftp</dt>
<dd>only resources from ftp servers</dd>
<dd>only resources from ftp servers (they are rare, <a href="/CrawlStartSite_p.html">crawl them yourself</a>)</dd>
<dt>/smb</dt>
<dd>only resources from smb server</dd>
<dd>only resources from smb servers (<a href="/ConfigBasic.html">Intranet Indexing</a> must be selected)</dd>
<dt>/file</dt>
<dd>only files from the local file system (intranet mode must be selected)</dd>
<dd>only files from a local file system (<a href="/ConfigBasic.html">Intranet Indexing</a> must be selected)</dd>
</dl>
</dd>
<dt>ranking modifier</dt>
<dd>
<dl style="width:600px">
<dt style="width:100px">ranking modifier</dt>
<dd>
<dl style="width:500px">
<dt>/date</dt>
<dd>sort by date (latest first)</dd>
<dt>/near</dt>
<dd>multiple words shall appear near</dd>
<dt>"" (doublequotes)</dt>
<dd>multiple words shall appear near</dd>
<dt>/language/&lt;2-character language code&gt;</dt>
<dd>prefer given language</dd>
<dt>/language/&lt;lang&gt;</dt>
<dd>prefer given language (an ISO639 2-letter code)</dd>
</dl>
</dd>
<dt>heuristics</dt>
<dd>
<dl style="width:600px">
<dt>heuristic:scroogle</dt>
<dt style="width:100px">heuristics</dt>
<dd>
<dl style="width:500px">
<dt>/heuristic/scroogle</dt>
<dd>add search results from scroogle</dd>
<dt>heuristic:blekko</dt>
<dt>/heuristic/blekko</dt>
<dd>add search results from blekko</dd>
</dl>
</dd>
</dl>
<h4>Search Navigation</h4>
<dl style="width:700px">
<dt style="width:100px">keyboard shortcuts</dt>
<dd>
<dl style="width:500px">
<dt>tab or page-up</dt>
<dd>next result page</dd>
<dt>page-down</dt>
<dd>previous result page</dd>
</dl>
</dd>
<dt style="width:100px">automatic result retrieval</dt>
<dd>
<dl style="width:500px">
<dt>browser integration</dt>
<dd>after searching, click-open on the default search engine in the upper right search field of your browser and select 'Add "YaCy Search.."'</dd>
<dt>search as rss feed</dt>
<dd>click on the red icon in the upper right after a search. this works well in combination with the '/date' ranking modifier. See an <a href="http://localhost:8090/yacysearch.rss?query=news+%2Fdate&Enter=Search&verify=cacheonly&contentdom=text&nav=hosts%2Cauthors%2Cnamespace%2Ctopics%2Cfiletype%2Cprotocol&startRecord=0&indexof=off&meanCount=5&maximumRecords=10&resource=global&urlmaskfilter=.*&prefermaskfilter=">example</a>.</dd>
<dt>json search results</dt>
<dd>for ajax developers: get the search rss feed and replace the '.rss' extension in the search result url with '.json'</dd>
</dl>
</dd>
</dl>
#(/searchoptions)#
</form>
<script type="text/javascript">
@ -215,8 +232,6 @@
::
#%env/templates/simplefooter.template%#
#(/topmenu)#
<p>
<img src="cytag.png?icon=invisible&amp;nick=yacyh_#[clientid]#&amp;tag=search" alt=""/>
</p>
</body>
</html>

View File

@ -116,6 +116,7 @@ public final class search {
final long maxtime = Math.min((int) sb.getConfigLong(SwitchboardConstants.REMOTESEARCH_MAXTIME_DEFAULT, 3000), post.getLong("time", 3000)); // maximum waiting time
final int maxdist= post.getInt("maxdist", Integer.MAX_VALUE);
final String prefer = post.get("prefer", "");
final String modifier = post.get("modifier", "").trim();
final String contentdom = post.get("contentdom", "text");
final String filter = post.get("filter", ".*"); // a filter on the url
final Pattern snippetPattern = Pattern.compile(post.get("snippet", ".*")); // a filter on the snippet
@ -228,6 +229,7 @@ public final class search {
null,
snippetPattern,
null,
modifier,
maxdist,
prefer,
ContentDomain.contentdomParser(contentdom),
@ -288,6 +290,7 @@ public final class search {
null,
snippetPattern,
null,
modifier,
maxdist,
prefer,
ContentDomain.contentdomParser(contentdom),

View File

@ -278,7 +278,7 @@ public class yacysearch {
}
}
if ((!block) && (post == null || post.get("cat", "href").equals("href"))) {
if (!block && (post == null || post.get("cat", "href").equals("href"))) {
String urlmask = null;
// check available memory and clean up if necessary
@ -288,34 +288,42 @@ public class yacysearch {
}
final RankingProfile ranking = sb.getRanking();
final StringBuilder modifier = new StringBuilder(20);
if (querystring.indexOf("/near",0) >= 0) {
querystring = querystring.replace("/near", "");
ranking.coeff_worddistance = RankingProfile.COEFF_MAX;
modifier.append("/near ");
}
if (querystring.indexOf("/date",0) >= 0) {
querystring = querystring.replace("/date", "");
ranking.coeff_date = RankingProfile.COEFF_MAX;
modifier.append("/date ");
}
if (querystring.indexOf("/http",0) >= 0) {
querystring = querystring.replace("/http", "");
urlmask = "https?://.*";
modifier.append("/http ");
}
if (querystring.indexOf("/https",0) >= 0) {
querystring = querystring.replace("/https", "");
urlmask = "https?://.*";
modifier.append("/https ");
}
if (querystring.indexOf("/ftp",0) >= 0) {
querystring = querystring.replace("/ftp", "");
urlmask = "ftp://.*";
modifier.append("/ftp ");
}
if (querystring.indexOf("/smb",0) >= 0) {
querystring = querystring.replace("/smb", "");
urlmask = "smb://.*";
modifier.append("/smb ");
}
if (querystring.indexOf("/file",0) >= 0) {
querystring = querystring.replace("/file", "");
urlmask = "file://.*";
modifier.append("/file ");
}
if (querystring.indexOf("/location",0) >= 0) {
querystring = querystring.replace("/location", "");
@ -323,16 +331,17 @@ public class yacysearch {
constraint = new Bitfield(4);
}
constraint.set(Condenser.flag_cat_haslocation, true);
modifier.append("/location ");
}
final int lrp = querystring.indexOf("/language/",0);
String lr = "";
String language = "";
if (lrp >= 0) {
if (querystring.length() >= (lrp + 11)) {
lr = querystring.substring(lrp + 9, lrp + 11);
if (querystring.length() >= (lrp + 12)) {
language = querystring.substring(lrp + 10, lrp + 12);
}
querystring = querystring.replace("/language/" + lr, "");
lr = lr.toLowerCase();
querystring = querystring.replace("/language/" + language, "");
language = language.toLowerCase();
modifier.append("/language/").append(language).append(" ");
}
final int inurl = querystring.indexOf("inurl:",0);
if (inurl >= 0) {
@ -343,8 +352,9 @@ public class yacysearch {
final String urlstr = querystring.substring(inurl + 6, ftb);
querystring = querystring.replace("inurl:" + urlstr, "");
if (!urlstr.isEmpty()) {
urlmask = ".*" + urlstr + ".*";
urlmask = urlmask == null ? ".*" + urlstr + ".*" : urlmask + urlstr + ".*";
}
modifier.append("inurl:").append(urlstr).append(" ");
}
final int filetype = querystring.indexOf("filetype:",0);
if (filetype >= 0) {
@ -362,6 +372,7 @@ public class yacysearch {
urlmask = urlmask + ".*\\." + ft;
}
}
modifier.append("filetype:").append(ft).append(" ");
}
String tenant = null;
if (post.containsKey("tenant")) {
@ -392,16 +403,19 @@ public class yacysearch {
sitehost = sitehost.substring(0, sitehost.length() - 1);
}
sitehash = DigestURI.hosthash(sitehost);
modifier.append("site:").append(sitehost).append(" ");
}
final int heuristicScroogle = querystring.indexOf("heuristic:scroogle",0);
final int heuristicScroogle = querystring.indexOf("/heuristic/scroogle",0);
if (heuristicScroogle >= 0) {
querystring = querystring.replace("heuristic:scroogle", "");
querystring = querystring.replace("/heuristic/scroogle", "");
modifier.append("/heuristic/scroogle ");
}
final int heuristicBlekko = querystring.indexOf("heuristic:blekko",0);
final int heuristicBlekko = querystring.indexOf("/heuristic/blekko",0);
if (heuristicBlekko >= 0) {
querystring = querystring.replace("heuristic:blekko", "");
querystring = querystring.replace("/heuristic/blekko", "");
modifier.append("/heuristic/blekko ");
}
final int authori = querystring.indexOf("author:",0);
@ -417,6 +431,7 @@ public class yacysearch {
}
author = querystring.substring(authori + 8, ftb);
querystring = querystring.replace("author:'" + author + "'", "");
modifier.append("author:'").append(author).append("' ");
} else {
int ftb = querystring.indexOf(' ', authori);
if (ftb == -1) {
@ -424,6 +439,7 @@ public class yacysearch {
}
author = querystring.substring(authori + 7, ftb);
querystring = querystring.replace("author:" + author, "");
modifier.append("author:").append(author).append(" ");
}
authorhash = ASCII.String(Word.word2hash(author));
}
@ -435,6 +451,7 @@ public class yacysearch {
}
String domain = querystring.substring(tld + 4, ftb);
querystring = querystring.replace("tld:" + domain, "");
modifier.append("tld:").append(domain).append(" ");
while (domain.length() > 0 && domain.charAt(0) == '.') {
domain = domain.substring(1);
}
@ -451,7 +468,7 @@ public class yacysearch {
// read the language from the language-restrict option 'lr'
// if no one is given, use the user agent or the system language as default
String language = (post == null) ? lr : post.get("lr", lr);
language = (post == null) ? language : post.get("lr", language);
if (language.startsWith("lang_")) {
language = language.substring(5);
}
@ -586,6 +603,7 @@ public class yacysearch {
Word.words2hashesHandles(query[2]),
snippetPattern,
tenant,
modifier.toString().trim(),
maxDistance,
prefermask,
contentdom,

View File

@ -40,8 +40,12 @@ public class ImageParser {
} else if ((filename.endsWith(".ico")) && (icoParser.isICO(source))) {
// parse image with ICO parser
icoParser icoparser;
icoparser = new icoParser(source);
image = icoparser.getImage(0);
try {
icoparser = new icoParser(source);
image = icoparser.getImage(0);
} catch (final Throwable e) {
image = null;
}
if (image == null) return null;
} else {
// awt can handle jpg, png and gif formats, try it

View File

@ -435,6 +435,7 @@ public final class Protocol {
final Pattern prefer,
final Pattern filter,
final Pattern snippet,
final String modifier,
final String language,
final String sitehash,
final String authorhash,
@ -471,7 +472,7 @@ public final class Protocol {
try {
result = new SearchResult(
basicRequestParts(Switchboard.getSwitchboard(), target.hash, crypt.randomSalt()),
mySeed, wordhashes, excludehashes, urlhashes, prefer, filter, snippet, language,
mySeed, wordhashes, excludehashes, urlhashes, prefer, filter, snippet, modifier, language,
sitehash, authorhash, count, time, maxDistance, global, partitions, target.getHexHash() + ".yacyh", target.getClusterAddress(),
secondarySearchSuperviser, rankingProfile, constraint);
} catch (final IOException e) {
@ -643,6 +644,7 @@ public final class Protocol {
final Pattern prefer,
final Pattern filter,
final Pattern snippet,
final String modifier,
final String language,
final String sitehash,
final String authorhash,
@ -694,6 +696,7 @@ public final class Protocol {
parts.put("prefer", UTF8.StringBody(prefer.pattern()));
parts.put("filter", UTF8.StringBody(filter.pattern()));
parts.put("snippet", UTF8.StringBody(snippet.pattern()));
parts.put("modifier", UTF8.StringBody(modifier));
parts.put("language", UTF8.StringBody(language));
parts.put("sitehash", UTF8.StringBody(sitehash));
parts.put("authorhash", UTF8.StringBody(authorhash));
@ -1157,6 +1160,7 @@ public final class Protocol {
QueryParams.matchnothing_pattern, // prefer,
QueryParams.catchall_pattern, // filter,
QueryParams.catchall_pattern, // snippet,
"", // modifier
"", // language,
"", // sitehash,
"", // authorhash,

View File

@ -58,6 +58,7 @@ public class RemoteSearch extends Thread {
private final long time;
final private RankingProfile rankingProfile;
final private Pattern prefer, filter, snippet;
final private QueryParams.Modifier modifier;
final private String language;
final private Bitfield constraint;
final private SeedDB peers;
@ -68,6 +69,7 @@ public class RemoteSearch extends Thread {
final Pattern prefer,
final Pattern filter,
final Pattern snippet,
final QueryParams.Modifier modifier,
final String language,
final String sitehash, final String authorhash,
final int count, final long time, final int maxDistance,
@ -89,6 +91,7 @@ public class RemoteSearch extends Thread {
this.prefer = prefer;
this.filter = filter;
this.snippet = snippet;
this.modifier = modifier;
this.language = language;
this.sitehash = sitehash;
this.authorhash = authorhash;
@ -114,7 +117,7 @@ public class RemoteSearch extends Thread {
this.urls = Protocol.search(
this.peers.mySeed(),
this.wordhashes, this.excludehashes, this.urlhashes,
this.prefer, this.filter, this.snippet,
this.prefer, this.filter, this.snippet, this.modifier.getModifier(),
this.language, this.sitehash, this.authorhash,
this.count, this.time, this.maxDistance, this.global, this.partitions,
this.targetPeer, this.indexSegment, this.containerCache, this.secondarySearchSuperviser,
@ -156,6 +159,7 @@ public class RemoteSearch extends Thread {
public static RemoteSearch[] primaryRemoteSearches(
final String wordhashes, final String excludehashes,
final Pattern prefer, final Pattern filter, final Pattern snippet,
final QueryParams.Modifier modifier,
final String language,
final String sitehash,
final String authorhash,
@ -193,7 +197,7 @@ public class RemoteSearch extends Thread {
if (targetPeers[i] == null || targetPeers[i].hash == null) continue;
try {
searchThreads[i] = new RemoteSearch(
wordhashes, excludehashes, "", prefer, filter, snippet,
wordhashes, excludehashes, "", prefer, filter, snippet, modifier,
language, sitehash, authorhash,
count, time, maxDist, true, targets, targetPeers[i],
indexSegment, peers, containerCache, secondarySearchSuperviser, blacklist, rankingProfile, constraint);
@ -227,7 +231,7 @@ public class RemoteSearch extends Thread {
if (targetPeer == null || targetPeer.hash == null) return null;
if (clusterselection != null) targetPeer.setAlternativeAddress(clusterselection.get(ASCII.getBytes(targetPeer.hash)));
final RemoteSearch searchThread = new RemoteSearch(
wordhashes, "", urlhashes, QueryParams.matchnothing_pattern, QueryParams.catchall_pattern, QueryParams.catchall_pattern, "", "", "", 20, time, 9999, true, 0, targetPeer,
wordhashes, "", urlhashes, QueryParams.matchnothing_pattern, QueryParams.catchall_pattern, QueryParams.catchall_pattern, new QueryParams.Modifier(""), "", "", "", 20, time, 9999, true, 0, targetPeer,
indexSegment, peers, containerCache, null, blacklist, rankingProfile, constraint);
searchThread.start();
return searchThread;

View File

@ -88,6 +88,17 @@ public final class QueryParams {
FETCH_AND_VERIFY_ONLINE;
}
public static class Modifier {

    /** The raw query-modifier string (e.g. "/date /language/en site:example.org"); never null. */
    private final String s;

    /**
     * Wraps the search-query modifier string so it can be carried in
     * QueryParams and transported to remote peers via the search protocol.
     *
     * @param modifier the modifier string; a null value is normalized to the
     *        empty string so that downstream consumers (e.g. the protocol
     *        layer, which wraps this value in a StringBody) never see null
     */
    public Modifier(final String modifier) {
        this.s = modifier == null ? "" : modifier;
    }

    /**
     * @return the modifier string, never null
     */
    public String getModifier() {
        return this.s;
    }
}
public static final Bitfield empty_constraint = new Bitfield(4, "AAAAAA");
public static final Pattern catchall_pattern = Pattern.compile(".*");
public static final Pattern matchnothing_pattern = Pattern.compile("");
@ -115,6 +126,7 @@ public final class QueryParams {
public final String sitehash; // this is a domain hash, 6 bytes long or null
public final String authorhash;
public final String tenant;
public final Modifier modifier;
public Seed remotepeer;
public final Long time;
// values that are set after a search:
@ -152,6 +164,7 @@ public final class QueryParams {
this.snippetMatcher = QueryParams.catchall_pattern;
this.ranking = ranking;
this.tenant = null;
this.modifier = new Modifier("");
this.maxDistance = Integer.MAX_VALUE;
this.urlMask = catchall_pattern;
this.urlMask_isCatchall = true;
@ -186,6 +199,7 @@ public final class QueryParams {
final HandleSet fullqueryHashes,
final Pattern snippetMatcher,
final String tenant,
final String modifier,
final int maxDistance, final String prefer, final ContentDomain contentdom,
final String language,
final String navigators,
@ -209,6 +223,7 @@ public final class QueryParams {
this.fullqueryHashes = fullqueryHashes;
this.snippetMatcher = snippetMatcher;
this.tenant = (tenant != null && tenant.length() == 0) ? null : tenant;
this.modifier = new Modifier(modifier == null ? "" : modifier);
this.ranking = ranking;
this.maxDistance = maxDistance;
this.contentdom = contentdom;

View File

@ -134,6 +134,7 @@ public final class SearchEvent {
this.query.prefer,
this.query.urlMask,
this.query.snippetMatcher,
this.query.modifier,
this.query.targetlang == null ? "" : this.query.targetlang,
this.query.sitehash == null ? "" : this.query.sitehash,
this.query.authorhash == null ? "" : this.query.authorhash,