enhanced location search:

Search is now done using verify=false (instead of verify=cacheonly), which allows many more targets to be found.
This exposed a bug where no location information (and other metadata) was taken from the document metadata when verify=false was requested. The bug was fixed.

Also added location parsing for Wikimedia dumps: a Wikipedia dump can now serve as a source for a location search as well.
Fixed many smaller bugs in connection with location search.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7657 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
orbiter 2011-04-15 15:54:19 +00:00
parent 8d63f3b70f
commit 958ff4778e
10 changed files with 101 additions and 31 deletions

View File

@ -79,24 +79,23 @@
</head>
<body id="yacysearch_location" onload="init();">
#(display)#
#%env/templates/simpleheader.template%#
::
#%env/templates/header.template%#
::
#(topmenu)#
#%env/templates/embeddedheader.template%#
#(/display)#
::
<div id="api">
<a href="yacysearch_location.rss" id="apilink"><img src="/env/grafics/api.png" width="60" height="40" alt="API"/></a>
<script type="text/javascript">
//<![CDATA[
document.getElementById('apilink').setAttribute('href', 'yacysearch_location.rss?' + window.location.search.substring(1));
document.getElementById('apilink').setAttribute('href', 'yacysearch_location.rss?dom=metatag|alltext&' + window.location.search.substring(1));
//]]>
</script>
<span>The information that is presented on this page can also be retrieved as XML
Click the API icon to see the XML.
To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de/wiki/index.php/Dev:API">API wiki page</a>.</span>
</div>
#%env/templates/simpleheader.template%#
#(/topmenu)#
<form class="search small" onsubmit="return false;" class="search small" accept-charset="UTF-8">
<h2>#[promoteSearchPageGreeting]#</h2>
<div class="yacylogo"><a href="#[promoteSearchPageGreeting.homepage]#" class="yacylogo"><img src="#[promoteSearchPageGreeting.smallImage]#" alt="yacysearch" /></a></div>

View File

@ -29,7 +29,6 @@ import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.services.federated.opensearch.SRURSSConnector;
import net.yacy.document.LibraryProvider;
import net.yacy.document.geolocalization.Location;
import de.anomic.crawler.CrawlProfile;
import de.anomic.search.Switchboard;
import de.anomic.search.SwitchboardConstants;
import de.anomic.server.serverCore;
@ -94,7 +93,7 @@ public class yacysearch_location {
// get a queue of search results
String rssSearchServiceURL = "http://127.0.0.1:" + sb.getConfig("port", "8090") + "/yacysearch.rss";
BlockingQueue<RSSMessage> results = new LinkedBlockingQueue<RSSMessage>();
SRURSSConnector.searchSRURSS(results, rssSearchServiceURL, query, maximumTime, Integer.MAX_VALUE, CrawlProfile.CacheStrategy.NOCACHE, false, null);
SRURSSConnector.searchSRURSS(results, rssSearchServiceURL, query, maximumTime, Integer.MAX_VALUE, null, false, null);
// take the results and compute some locations
RSSMessage message;
@ -164,10 +163,7 @@ public class yacysearch_location {
}
if (header.get(HeaderFramework.CONNECTION_PROP_EXT, "").equals("html")) {
final boolean authenticated = sb.adminAuthenticated(header) >= 2;
int display = (post == null) ? 0 : post.getInt("display", 0);
if (!authenticated) display = 2;
prop.put("display", display);
prop.put("topmenu", sb.getConfigBool("publicTopmenu", true) ? 1 : 0);
prop.put("promoteSearchPageGreeting", sb.getConfig(SwitchboardConstants.GREETING, ""));
prop.put("promoteSearchPageGreeting.homepage", sb.getConfig(SwitchboardConstants.GREETING_HOMEPAGE, ""));
prop.put("promoteSearchPageGreeting.smallImage", sb.getConfig(SwitchboardConstants.GREETING_SMALL_IMAGE, ""));

View File

@ -249,6 +249,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
IFFRESH(1), // use the cache if the cache exists and is fresh using the proxy-fresh rules
IFEXIST(2), // use the cache if the cache exist. Do no check freshness. Otherwise use online source.
CACHEONLY(3); // never go online, use all content from cache. If no cache entry exist, consider content nevertheless as available
// the fifth case may be that the CacheStrategy object is assigned NULL. That means that no snippet creation is wanted.
public int code;
private CacheStrategy(int code) {
this.code = code;

View File

@ -107,6 +107,8 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
private static final String WIKI_CLOSE_LINK = "]]";
private static final String WIKI_OPEN_LINK = "[[";
private static final String WIKI_CLOSE_METADATA = "}}";
private static final String WIKI_OPEN_METADATA = "{{";
private static final String WIKI_CLOSE_EXTERNAL_LINK = "]";
private static final String WIKI_OPEN_EXTERNAL_LINK = "[";
private static final String WIKI_CLOSE_PRE_ESCAPED = "&lt;/pre&gt;";
@ -926,6 +928,7 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
*/
public String processLineOfWikiCode(String hostport, String line) {
//If HTML has not been replaced yet (can happen if method gets called in recursion), replace now!
line = processMetadata(line);
if ((!replacedHtmlAlready || preformattedSpanning) && line.indexOf(WIKI_CLOSE_PRE_ESCAPED) < 0) {
line = CharacterCoding.unicode2html(line, true);
replacedHtmlAlready = true;
@ -974,6 +977,7 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
line = tagReplace(line, Tags.STRIKE);
line = processUnorderedList(line);
line = processOrderedList(line);
line = processDefinitionList(line);
@ -991,6 +995,58 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
return line;
}
/**
 * Scan one line of wiki markup for metadata templates ("{{...}}") and replace
 * recognized geographical coordinate templates with a plain-text degree/minute
 * notation that the html content scraper can understand. Unrecognized or
 * malformed templates are left in place.
 *
 * Recognized forms (see the examples below, taken from real dumps):
 *   {{coordinate|NS=52.205944|EW=0.117593|region=GB-CAM|type=landmark}}
 *   {{coordinate|NS=43/50/29/N|EW=73/23/17/W|type=landmark|region=US-NY}}
 *
 * @param line a single line of wiki markup (may contain several templates)
 * @return the line with every recognized coordinate template replaced
 */
public String processMetadata(String line) {
    int p, q, s = 0;
    while ((p = line.indexOf(WIKI_OPEN_METADATA, s)) >= 0 && (q = line.indexOf(WIKI_CLOSE_METADATA, p + 1)) >= 0) {
        s = q; // by default continue the scan behind this (unrecognized) template
        String a = line.substring(p + 2, q);
        if (a.toLowerCase().startsWith("coordinate")) {
            // parse Geographical Coordinates as described in
            // http://en.wikipedia.org/wiki/Wikipedia:Manual_of_Style_%28dates_and_numbers%29#Geographical_coordinates
            // looks like:
            // {{Coord|57|18|22.5|N|4|27|32.7|W|display=title}}
            // however, such information does not appear as defined above but as:
            // {{coordinate|NS=52.205944|EW=0.117593|region=GB-CAM|type=landmark}}
            // {{coordinate|NS=43/50/29/N|EW=73/23/17/W|type=landmark|region=US-NY}}
            // and if passed through this parser:
            // {{Coordinate |NS 45/37/43.0/N |EW. 07/58/41.0/E |type=landmark |region=IT-BI}} ## means: degree/minute/second
            // {{Coordinate |NS 51.48994 |EW. 7.33249 |type=landmark |region=DE-NW}}
            String b[] = a.split("\\|");
            float lon = 0.0f, lat = 0.0f;   // degrees (integral part)
            float lonm = 0.0f, latm = 0.0f; // minutes (including seconds folded in)
            String lono = "E", lato = "N";  // hemisphere markers used in the output
            String name = "";
            try {
                for (String c: b) {
                    if (c.toLowerCase().startsWith("name=")) {
                        name = c.substring(5);
                    }
                    if (c.toUpperCase().startsWith("NS=")) {
                        String d[] = c.substring(3).split("/");
                        // a trailing hemisphere letter (e.g. "43/50/29/N") is not a numeric
                        // component; strip it before parsing. The original code only handled
                        // 1-3 components and therefore never parsed this documented form.
                        int parts = d.length;
                        String dir = d[parts - 1].trim();
                        boolean hasLetter = dir.equalsIgnoreCase("N") || dir.equalsIgnoreCase("S");
                        if (hasLetter) parts--;
                        if (parts == 1) {float l = Float.parseFloat(d[0]); if (l < 0) {lato = "S"; l = -l;} lat = (float) Math.floor(l); latm = 60.0f * (l - lat);}
                        else if (parts == 2) {lat = Float.parseFloat(d[0]); latm = Float.parseFloat(d[1]);}
                        else if (parts >= 3) {lat = Float.parseFloat(d[0]); latm = Float.parseFloat(d[1]) + Float.parseFloat(d[2]) / 60.0f;}
                        // fix: the original left this branch empty, so southern coordinates
                        // were emitted with a "N" marker
                        if (hasLetter && dir.equalsIgnoreCase("S")) lato = "S";
                    }
                    if (c.toUpperCase().startsWith("EW=")) {
                        String d[] = c.substring(3).split("/");
                        int parts = d.length;
                        String dir = d[parts - 1].trim();
                        boolean hasLetter = dir.equalsIgnoreCase("E") || dir.equalsIgnoreCase("W");
                        if (hasLetter) parts--;
                        if (parts == 1) {float l = Float.parseFloat(d[0]); if (l < 0) {lono = "W"; l = -l;} lon = (float) Math.floor(l); lonm = 60.0f * (l - lon);}
                        else if (parts == 2) {lon = Float.parseFloat(d[0]); lonm = Float.parseFloat(d[1]);}
                        else if (parts >= 3) {lon = Float.parseFloat(d[0]); lonm = Float.parseFloat(d[1]) + Float.parseFloat(d[2]) / 60.0f;}
                        // fix: the original compared an upper-cased string against lowercase "w"
                        // (never true) and negated lon/lonm, which would have contradicted the
                        // hemisphere marker written into the output below
                        if (hasLetter && dir.equalsIgnoreCase("W")) lono = "W";
                    }
                }
            } catch (final NumberFormatException e) {
                // best-effort scanner: a malformed numeric component must not abort
                // parsing of the whole line; leave this template untouched and scan on
                continue;
            }
            if (lon != 0.0f && lat != 0.0f) {
                // replace this with a format that the html parser can understand
                line = line.substring(0, p) + (name.length() > 0 ? (" " + name) : "") + " <nobr> " + lato + " " + lat + "\u00B0 " + latm + "'</nobr><nobr>" + lono + " " + lon + "\u00B0 " + lonm + "'</nobr> " + line.substring(q + WIKI_CLOSE_METADATA.length());
                s = p; // re-scan from the replacement point (the line just shrank)
                continue;
            }
        }
    }
    return line;
}
private class TableOfContent {
private final List<String> toc = new ArrayList<String>(); // needs to be list which ensures order

View File

@ -385,7 +385,16 @@ public class ResultFetcher {
final long dbRetrievalTime = System.currentTimeMillis() - startTime;
if (cacheStrategy == null) {
return new ResultEntry(page, query.getSegment(), peers, null, null, dbRetrievalTime, 0); // result without snippet
final TextSnippet snippet = new TextSnippet(
null,
metadata,
snippetFetchWordHashes,
null,
((query.constraint != null) && (query.constraint.get(Condenser.flag_cat_indexof))),
180,
Integer.MAX_VALUE,
!query.isLocal());
return new ResultEntry(page, query.getSegment(), peers, snippet, null, dbRetrievalTime, 0); // result without snippet
}
// load snippet

View File

@ -167,7 +167,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
String loc;
boolean noCacheUsage = url.isFile() || url.isSMB();
boolean objectWasInCache = (noCacheUsage) ? false : de.anomic.http.client.Cache.has(url);
boolean useMetadata = !objectWasInCache && !cacheStrategy.mustBeOffline();
boolean useMetadata = !objectWasInCache && (cacheStrategy == null || !cacheStrategy.mustBeOffline());
if (useMetadata && containsAllHashes(loc = comp.dc_title(), queryhashes)) {
// try to create the snippet from information given in the url itself
init(url.hash(), loc, ResultClass.SOURCE_METADATA, null);
@ -186,10 +186,10 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
return;
} else {
// try to load the resource from the cache
response = loader.load(loader.request(url, true, reindexing), noCacheUsage ? CrawlProfile.CacheStrategy.NOCACHE : cacheStrategy, Long.MAX_VALUE, true);
response = loader == null ? null : loader.load(loader.request(url, true, reindexing), noCacheUsage ? CrawlProfile.CacheStrategy.NOCACHE : cacheStrategy, Long.MAX_VALUE, true);
if (response == null) {
// in case that we did not get any result we can still return a success when we are not allowed to go online
if (cacheStrategy.mustBeOffline()) {
if (cacheStrategy == null || cacheStrategy.mustBeOffline()) {
init(url.hash(), null, ResultClass.ERROR_SOURCE_LOADING, "omitted network load (not allowed), no cache entry");
return;
}

View File

@ -199,7 +199,7 @@ public class SRURSSConnector extends Thread implements SearchAccumulator {
parts.put("query", UTF8.StringBody(query));
parts.put("startRecord", UTF8.StringBody(Integer.toString(startRecord)));
parts.put("maximumRecords", UTF8.StringBody(Long.toString(maximumRecords)));
parts.put("verify", UTF8.StringBody(cacheStrategy.toName()));
parts.put("verify", cacheStrategy == null ? UTF8.StringBody("false") : UTF8.StringBody(cacheStrategy.toName()));
parts.put("resource", UTF8.StringBody(global ? "global" : "local"));
parts.put("nav", UTF8.StringBody("none"));
result = HTTPConnector.getConnector(userAgent == null ? MultiProtocolURI.yacybotUserAgent : userAgent).post(new MultiProtocolURI(rssSearchServiceURL), (int) timeout, uri.getHost(), parts);

View File

@ -608,6 +608,7 @@ dc_rights
String language = this.dc_language();
if (language != null && language.length() > 0) os.write("<dc:language>" + this.dc_language() + "</dc:language>\n");
os.write("<dc:date>" + ISO8601Formatter.FORMATTER.format(date) + "</dc:date>\n");
if (this.lon != 0.0f && this.lat != 0.0f) os.write("<geo:long>" + this.lon +"</geo:long><geo:lat>" + this.lat + "</geo:lat>\n");
os.write("</record>\n");
}

View File

@ -504,7 +504,8 @@ public class MediawikiImporter extends Thread implements Importer {
public void genDocument() throws Parser.Failure {
try {
url = new DigestURI(urlStub + title);
document = Document.mergeDocuments(url, "text/html", TextParser.parseSource(url, "text/html", "UTF-8", UTF8.getBytes(html)));
Document[] parsed = TextParser.parseSource(url, "text/html", "UTF-8", UTF8.getBytes(html));
document = Document.mergeDocuments(url, "text/html", parsed);
// the wiki parser is not able to find the proper title in the source text, so it must be set here
document.setTitle(title);
} catch (MalformedURLException e1) {

View File

@ -124,19 +124,22 @@ public class ContentScraper extends AbstractScraper implements Scraper {
public void scrapeText(final char[] newtext, final String insideTag) {
// System.out.println("SCRAPE: " + UTF8.String(newtext));
int p, q, s = 0;
int p, pl, q, s = 0;
// try to find location information in text
// Opencaching:
// <nobr>N 50o 05.453&#039;</nobr><nobr>E 008o 30.191&#039;</nobr>
// N 52o 28.025 E 013o 20.299
location: while (s < newtext.length) {
pl = 1;
p = CharBuffer.indexOf(newtext, s, degree);
if (p < 0) {p = CharBuffer.indexOf(newtext, s, "&deg;".toCharArray()); if (p >= 0) pl = 5;}
if (p < 0) break location;
// try to find a coordinate
// <nobr>N 50o 05.453&#039;</nobr><nobr>E 008o 30.191&#039;</nobr>
// N 52o 28.025 E 013o 20.299
q = CharBuffer.indexOf(newtext, p, minuteCharsHTML);
if (q < 0) q = CharBuffer.indexOf(newtext, p, " E".toCharArray());
if (q < 0) q = CharBuffer.indexOf(newtext, p, " W".toCharArray());
if (q < 0 && newtext.length - p == 8) q = newtext.length;
q = CharBuffer.indexOf(newtext, p + pl, minuteCharsHTML);
if (q < 0) q = CharBuffer.indexOf(newtext, p + pl, "'".toCharArray());
if (q < 0) q = CharBuffer.indexOf(newtext, p + pl, " E".toCharArray());
if (q < 0) q = CharBuffer.indexOf(newtext, p + pl, " W".toCharArray());
if (q < 0 && newtext.length - p == 7 + pl) q = newtext.length;
if (q < 0) break location;
int r = p;
while (r-- > 1) {
@ -144,25 +147,29 @@ public class ContentScraper extends AbstractScraper implements Scraper {
r--;
if (newtext[r] == 'N') {
this.lat = Float.parseFloat(new String(newtext, r + 2, p - r - 2)) +
Float.parseFloat(new String(newtext, p + 2, q - p - 2)) / 60.0f;
Float.parseFloat(new String(newtext, p + pl + 1, q - p - pl - 1)) / 60.0f;
if (this.lon != 0.0f) break location;
s = q + 6;
continue location;
}
if (newtext[r] == 'S') {
this.lat = -Float.parseFloat(new String(newtext, r + 2, p - r - 2)) -
Float.parseFloat(new String(newtext, p + 2, q - p - 2)) / 60.0f;
Float.parseFloat(new String(newtext, p + pl + 1, q - p - pl - 1)) / 60.0f;
if (this.lon != 0.0f) break location;
s = q + 6;
continue location;
}
if (newtext[r] == 'E') {
this.lon = Float.parseFloat(new String(newtext, r + 2, p - r - 2)) +
Float.parseFloat(new String(newtext, p + 2, q - p - 2)) / 60.0f;
Float.parseFloat(new String(newtext, p + pl + 1, q - p - pl - 1)) / 60.0f;
if (this.lat != 0.0f) break location;
s = q + 6;
continue location;
}
if (newtext[r] == 'W') {
this.lon = -Float.parseFloat(new String(newtext, r + 2, p - r - 2)) -
Float.parseFloat(new String(newtext, p + 2, q - p - 2)) / 60.0f;
Float.parseFloat(new String(newtext, p + 2, q - p - pl - 1)) / 60.0f;
if (this.lat != 0.0f) break location;
s = q + 6;
continue location;
}