mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
enhanced location search:
The search is now done using verify=false (instead of verify=cacheonly), which allows many more targets to be found. This exposed a bug where no location information (and no other metadata information) was taken from the metadata if cache=false is requested. The bug was fixed. Also added location parsing from wikimedia dumps, so a wikipedia dump can now also be a source for a location search. Fixed many smaller bugs in connection with location search. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7657 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
8d63f3b70f
commit
958ff4778e
|
@ -79,24 +79,23 @@
|
|||
|
||||
</head>
|
||||
<body id="yacysearch_location" onload="init();">
|
||||
#(display)#
|
||||
#%env/templates/simpleheader.template%#
|
||||
::
|
||||
#%env/templates/header.template%#
|
||||
::
|
||||
#(topmenu)#
|
||||
#%env/templates/embeddedheader.template%#
|
||||
#(/display)#
|
||||
::
|
||||
<div id="api">
|
||||
<a href="yacysearch_location.rss" id="apilink"><img src="/env/grafics/api.png" width="60" height="40" alt="API"/></a>
|
||||
<script type="text/javascript">
|
||||
//<![CDATA[
|
||||
document.getElementById('apilink').setAttribute('href', 'yacysearch_location.rss?' + window.location.search.substring(1));
|
||||
document.getElementById('apilink').setAttribute('href', 'yacysearch_location.rss?dom=metatag|alltext&' + window.location.search.substring(1));
|
||||
//]]>
|
||||
</script>
|
||||
<span>The information that is presented on this page can also be retrieved as XML
|
||||
Click the API icon to see the XML.
|
||||
To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de/wiki/index.php/Dev:API">API wiki page</a>.</span>
|
||||
</div>
|
||||
#%env/templates/simpleheader.template%#
|
||||
#(/topmenu)#
|
||||
|
||||
<form class="search small" onsubmit="return false;" class="search small" accept-charset="UTF-8">
|
||||
<h2>#[promoteSearchPageGreeting]#</h2>
|
||||
<div class="yacylogo"><a href="#[promoteSearchPageGreeting.homepage]#" class="yacylogo"><img src="#[promoteSearchPageGreeting.smallImage]#" alt="yacysearch" /></a></div>
|
||||
|
|
|
@ -29,7 +29,6 @@ import net.yacy.cora.protocol.RequestHeader;
|
|||
import net.yacy.cora.services.federated.opensearch.SRURSSConnector;
|
||||
import net.yacy.document.LibraryProvider;
|
||||
import net.yacy.document.geolocalization.Location;
|
||||
import de.anomic.crawler.CrawlProfile;
|
||||
import de.anomic.search.Switchboard;
|
||||
import de.anomic.search.SwitchboardConstants;
|
||||
import de.anomic.server.serverCore;
|
||||
|
@ -94,7 +93,7 @@ public class yacysearch_location {
|
|||
// get a queue of search results
|
||||
String rssSearchServiceURL = "http://127.0.0.1:" + sb.getConfig("port", "8090") + "/yacysearch.rss";
|
||||
BlockingQueue<RSSMessage> results = new LinkedBlockingQueue<RSSMessage>();
|
||||
SRURSSConnector.searchSRURSS(results, rssSearchServiceURL, query, maximumTime, Integer.MAX_VALUE, CrawlProfile.CacheStrategy.NOCACHE, false, null);
|
||||
SRURSSConnector.searchSRURSS(results, rssSearchServiceURL, query, maximumTime, Integer.MAX_VALUE, null, false, null);
|
||||
|
||||
// take the results and compute some locations
|
||||
RSSMessage message;
|
||||
|
@ -164,10 +163,7 @@ public class yacysearch_location {
|
|||
|
||||
}
|
||||
if (header.get(HeaderFramework.CONNECTION_PROP_EXT, "").equals("html")) {
|
||||
final boolean authenticated = sb.adminAuthenticated(header) >= 2;
|
||||
int display = (post == null) ? 0 : post.getInt("display", 0);
|
||||
if (!authenticated) display = 2;
|
||||
prop.put("display", display);
|
||||
prop.put("topmenu", sb.getConfigBool("publicTopmenu", true) ? 1 : 0);
|
||||
prop.put("promoteSearchPageGreeting", sb.getConfig(SwitchboardConstants.GREETING, ""));
|
||||
prop.put("promoteSearchPageGreeting.homepage", sb.getConfig(SwitchboardConstants.GREETING_HOMEPAGE, ""));
|
||||
prop.put("promoteSearchPageGreeting.smallImage", sb.getConfig(SwitchboardConstants.GREETING_SMALL_IMAGE, ""));
|
||||
|
|
|
@ -249,6 +249,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
|
|||
IFFRESH(1), // use the cache if the cache exists and is fresh using the proxy-fresh rules
|
||||
IFEXIST(2), // use the cache if the cache exist. Do no check freshness. Otherwise use online source.
|
||||
CACHEONLY(3); // never go online, use all content from cache. If no cache entry exist, consider content nevertheless as available
|
||||
// A fifth case is a CacheStrategy reference that is null (no enum constant at all). That means that no snippet creation is wanted.
|
||||
public int code;
|
||||
private CacheStrategy(int code) {
|
||||
this.code = code;
|
||||
|
|
|
@ -107,6 +107,8 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
|
|||
|
||||
private static final String WIKI_CLOSE_LINK = "]]";
|
||||
private static final String WIKI_OPEN_LINK = "[[";
|
||||
private static final String WIKI_CLOSE_METADATA = "}}";
|
||||
private static final String WIKI_OPEN_METADATA = "{{";
|
||||
private static final String WIKI_CLOSE_EXTERNAL_LINK = "]";
|
||||
private static final String WIKI_OPEN_EXTERNAL_LINK = "[";
|
||||
private static final String WIKI_CLOSE_PRE_ESCAPED = "</pre>";
|
||||
|
@ -926,6 +928,7 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
|
|||
*/
|
||||
public String processLineOfWikiCode(String hostport, String line) {
|
||||
//If HTML has not been replaced yet (can happen if method gets called in recursion), replace now!
|
||||
line = processMetadata(line);
|
||||
if ((!replacedHtmlAlready || preformattedSpanning) && line.indexOf(WIKI_CLOSE_PRE_ESCAPED) < 0) {
|
||||
line = CharacterCoding.unicode2html(line, true);
|
||||
replacedHtmlAlready = true;
|
||||
|
@ -974,6 +977,7 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
|
|||
|
||||
line = tagReplace(line, Tags.STRIKE);
|
||||
|
||||
|
||||
line = processUnorderedList(line);
|
||||
line = processOrderedList(line);
|
||||
line = processDefinitionList(line);
|
||||
|
@ -991,6 +995,58 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
|
|||
return line;
|
||||
}
|
||||
|
||||
|
||||
public String processMetadata(String line) {
|
||||
int p, q, s = 0;
|
||||
while ((p = line.indexOf(WIKI_OPEN_METADATA, s)) >= 0 && (q = line.indexOf(WIKI_CLOSE_METADATA, p + 1)) >= 0) {
|
||||
s = q; // continue with next position
|
||||
String a = line.substring(p + 2, q);
|
||||
if (a.toLowerCase().startsWith("coordinate")) {
|
||||
// parse Geographical Coordinates as described in
|
||||
// http://en.wikipedia.org/wiki/Wikipedia:Manual_of_Style_%28dates_and_numbers%29#Geographical_coordinates
|
||||
// looks like:
|
||||
// {{Coord|57|18|22.5|N|4|27|32.7|W|display=title}}
|
||||
// however, such information does not appear as defined above but as:
|
||||
// {{coordinate|NS=52.205944|EW=0.117593|region=GB-CAM|type=landmark}}
|
||||
// {{coordinate|NS=43/50/29/N|EW=73/23/17/W|type=landmark|region=US-NY}}
|
||||
// and if passed through this parser:
|
||||
// {{Coordinate |NS 45/37/43.0/N |EW. 07/58/41.0/E |type=landmark |region=IT-BI}} ## means: degree/minute/second
|
||||
// {{Coordinate |NS 51.48994 |EW. 7.33249 |type=landmark |region=DE-NW}}
|
||||
String b[] = a.split("\\|");
|
||||
float lon = 0.0f, lat = 0.0f;
|
||||
float lonm = 0.0f, latm = 0.0f;
|
||||
String lono = "E", lato = "N";
|
||||
String name = "";
|
||||
for (String c: b) {
|
||||
if (c.toLowerCase().startsWith("name=")) {
|
||||
name = c.substring(5);
|
||||
}
|
||||
if (c.toUpperCase().startsWith("NS=")) {
|
||||
String d[] = c.substring(3).split("/");
|
||||
if (d.length == 1) {float l = Float.parseFloat(d[0]); if (l < 0) {lato = "S"; l = -l;} lat = (float) Math.floor(l); latm = 60.0f * (l - lat);}
|
||||
else if (d.length == 2) {lat = Float.parseFloat(d[0]); latm = Float.parseFloat(d[1]);}
|
||||
else if (d.length == 3) {lat = Float.parseFloat(d[0]); latm = Float.parseFloat(d[1]) + Float.parseFloat(d[2]) / 60.0f;}
|
||||
if (d[d.length-1].toUpperCase().equals("S")) {}
|
||||
}
|
||||
if (c.toUpperCase().startsWith("EW=")) {
|
||||
String d[] = c.substring(3).split("/");
|
||||
if (d.length == 1) {float l = Float.parseFloat(d[0]); if (l < 0) {lono = "W"; l = -l;} lon = (float) Math.floor(l); lonm = 60.0f * (l - lon);}
|
||||
else if (d.length == 2) {lon = Float.parseFloat(d[0]); lonm = Float.parseFloat(d[1]);}
|
||||
else if (d.length == 3) {lon = Float.parseFloat(d[0]); lonm = Float.parseFloat(d[1]) + Float.parseFloat(d[2]) / 60.0f;}
|
||||
if (d[d.length-1].toUpperCase().equals("w")) {lon = -lon; lonm = -lonm;}
|
||||
}
|
||||
}
|
||||
if (lon != 0.0f && lat != 0.0f) {
|
||||
// replace this with a format that the html parser can understand
|
||||
line = line.substring(0, p) + (name.length() > 0 ? (" " + name) : "") + " <nobr> " + lato + " " + lat + "\u00B0 " + latm + "'</nobr><nobr>" + lono + " " + lon + "\u00B0 " + lonm + "'</nobr> " + line.substring(q + WIKI_CLOSE_METADATA.length());
|
||||
s = p;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
return line;
|
||||
}
|
||||
|
||||
private class TableOfContent {
|
||||
|
||||
private final List<String> toc = new ArrayList<String>(); // needs to be list which ensures order
|
||||
|
|
|
@ -385,7 +385,16 @@ public class ResultFetcher {
|
|||
final long dbRetrievalTime = System.currentTimeMillis() - startTime;
|
||||
|
||||
if (cacheStrategy == null) {
|
||||
return new ResultEntry(page, query.getSegment(), peers, null, null, dbRetrievalTime, 0); // result without snippet
|
||||
final TextSnippet snippet = new TextSnippet(
|
||||
null,
|
||||
metadata,
|
||||
snippetFetchWordHashes,
|
||||
null,
|
||||
((query.constraint != null) && (query.constraint.get(Condenser.flag_cat_indexof))),
|
||||
180,
|
||||
Integer.MAX_VALUE,
|
||||
!query.isLocal());
|
||||
return new ResultEntry(page, query.getSegment(), peers, snippet, null, dbRetrievalTime, 0); // result without snippet
|
||||
}
|
||||
|
||||
// load snippet
|
||||
|
|
|
@ -167,7 +167,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
|
|||
String loc;
|
||||
boolean noCacheUsage = url.isFile() || url.isSMB();
|
||||
boolean objectWasInCache = (noCacheUsage) ? false : de.anomic.http.client.Cache.has(url);
|
||||
boolean useMetadata = !objectWasInCache && !cacheStrategy.mustBeOffline();
|
||||
boolean useMetadata = !objectWasInCache && (cacheStrategy == null || !cacheStrategy.mustBeOffline());
|
||||
if (useMetadata && containsAllHashes(loc = comp.dc_title(), queryhashes)) {
|
||||
// try to create the snippet from information given in the url itself
|
||||
init(url.hash(), loc, ResultClass.SOURCE_METADATA, null);
|
||||
|
@ -186,10 +186,10 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
|
|||
return;
|
||||
} else {
|
||||
// try to load the resource from the cache
|
||||
response = loader.load(loader.request(url, true, reindexing), noCacheUsage ? CrawlProfile.CacheStrategy.NOCACHE : cacheStrategy, Long.MAX_VALUE, true);
|
||||
response = loader == null ? null : loader.load(loader.request(url, true, reindexing), noCacheUsage ? CrawlProfile.CacheStrategy.NOCACHE : cacheStrategy, Long.MAX_VALUE, true);
|
||||
if (response == null) {
|
||||
// in case that we did not get any result we can still return a success when we are not allowed to go online
|
||||
if (cacheStrategy.mustBeOffline()) {
|
||||
if (cacheStrategy == null || cacheStrategy.mustBeOffline()) {
|
||||
init(url.hash(), null, ResultClass.ERROR_SOURCE_LOADING, "omitted network load (not allowed), no cache entry");
|
||||
return;
|
||||
}
|
||||
|
|
|
@ -199,7 +199,7 @@ public class SRURSSConnector extends Thread implements SearchAccumulator {
|
|||
parts.put("query", UTF8.StringBody(query));
|
||||
parts.put("startRecord", UTF8.StringBody(Integer.toString(startRecord)));
|
||||
parts.put("maximumRecords", UTF8.StringBody(Long.toString(maximumRecords)));
|
||||
parts.put("verify", UTF8.StringBody(cacheStrategy.toName()));
|
||||
parts.put("verify", cacheStrategy == null ? UTF8.StringBody("false") : UTF8.StringBody(cacheStrategy.toName()));
|
||||
parts.put("resource", UTF8.StringBody(global ? "global" : "local"));
|
||||
parts.put("nav", UTF8.StringBody("none"));
|
||||
result = HTTPConnector.getConnector(userAgent == null ? MultiProtocolURI.yacybotUserAgent : userAgent).post(new MultiProtocolURI(rssSearchServiceURL), (int) timeout, uri.getHost(), parts);
|
||||
|
|
|
@ -608,6 +608,7 @@ dc_rights
|
|||
String language = this.dc_language();
|
||||
if (language != null && language.length() > 0) os.write("<dc:language>" + this.dc_language() + "</dc:language>\n");
|
||||
os.write("<dc:date>" + ISO8601Formatter.FORMATTER.format(date) + "</dc:date>\n");
|
||||
if (this.lon != 0.0f && this.lat != 0.0f) os.write("<geo:long>" + this.lon +"</geo:long><geo:lat>" + this.lat + "</geo:lat>\n");
|
||||
os.write("</record>\n");
|
||||
}
|
||||
|
||||
|
|
|
@ -504,7 +504,8 @@ public class MediawikiImporter extends Thread implements Importer {
|
|||
public void genDocument() throws Parser.Failure {
|
||||
try {
|
||||
url = new DigestURI(urlStub + title);
|
||||
document = Document.mergeDocuments(url, "text/html", TextParser.parseSource(url, "text/html", "UTF-8", UTF8.getBytes(html)));
|
||||
Document[] parsed = TextParser.parseSource(url, "text/html", "UTF-8", UTF8.getBytes(html));
|
||||
document = Document.mergeDocuments(url, "text/html", parsed);
|
||||
// the wiki parser is not able to find the proper title in the source text, so it must be set here
|
||||
document.setTitle(title);
|
||||
} catch (MalformedURLException e1) {
|
||||
|
|
|
@ -124,19 +124,22 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
|
||||
public void scrapeText(final char[] newtext, final String insideTag) {
|
||||
// System.out.println("SCRAPE: " + UTF8.String(newtext));
|
||||
int p, q, s = 0;
|
||||
int p, pl, q, s = 0;
|
||||
|
||||
// try to find location information in text
|
||||
// Opencaching:
|
||||
// <nobr>N 50o 05.453'</nobr><nobr>E 008o 30.191'</nobr>
|
||||
// N 52o 28.025 E 013o 20.299
|
||||
location: while (s < newtext.length) {
|
||||
pl = 1;
|
||||
p = CharBuffer.indexOf(newtext, s, degree);
|
||||
if (p < 0) {p = CharBuffer.indexOf(newtext, s, "°".toCharArray()); if (p >= 0) pl = 5;}
|
||||
if (p < 0) break location;
|
||||
// try to find a coordinate
|
||||
// <nobr>N 50o 05.453'</nobr><nobr>E 008o 30.191'</nobr>
|
||||
// N 52o 28.025 E 013o 20.299
|
||||
q = CharBuffer.indexOf(newtext, p, minuteCharsHTML);
|
||||
if (q < 0) q = CharBuffer.indexOf(newtext, p, " E".toCharArray());
|
||||
if (q < 0) q = CharBuffer.indexOf(newtext, p, " W".toCharArray());
|
||||
if (q < 0 && newtext.length - p == 8) q = newtext.length;
|
||||
q = CharBuffer.indexOf(newtext, p + pl, minuteCharsHTML);
|
||||
if (q < 0) q = CharBuffer.indexOf(newtext, p + pl, "'".toCharArray());
|
||||
if (q < 0) q = CharBuffer.indexOf(newtext, p + pl, " E".toCharArray());
|
||||
if (q < 0) q = CharBuffer.indexOf(newtext, p + pl, " W".toCharArray());
|
||||
if (q < 0 && newtext.length - p == 7 + pl) q = newtext.length;
|
||||
if (q < 0) break location;
|
||||
int r = p;
|
||||
while (r-- > 1) {
|
||||
|
@ -144,25 +147,29 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
r--;
|
||||
if (newtext[r] == 'N') {
|
||||
this.lat = Float.parseFloat(new String(newtext, r + 2, p - r - 2)) +
|
||||
Float.parseFloat(new String(newtext, p + 2, q - p - 2)) / 60.0f;
|
||||
Float.parseFloat(new String(newtext, p + pl + 1, q - p - pl - 1)) / 60.0f;
|
||||
if (this.lon != 0.0f) break location;
|
||||
s = q + 6;
|
||||
continue location;
|
||||
}
|
||||
if (newtext[r] == 'S') {
|
||||
this.lat = -Float.parseFloat(new String(newtext, r + 2, p - r - 2)) -
|
||||
Float.parseFloat(new String(newtext, p + 2, q - p - 2)) / 60.0f;
|
||||
Float.parseFloat(new String(newtext, p + pl + 1, q - p - pl - 1)) / 60.0f;
|
||||
if (this.lon != 0.0f) break location;
|
||||
s = q + 6;
|
||||
continue location;
|
||||
}
|
||||
if (newtext[r] == 'E') {
|
||||
this.lon = Float.parseFloat(new String(newtext, r + 2, p - r - 2)) +
|
||||
Float.parseFloat(new String(newtext, p + 2, q - p - 2)) / 60.0f;
|
||||
Float.parseFloat(new String(newtext, p + pl + 1, q - p - pl - 1)) / 60.0f;
|
||||
if (this.lat != 0.0f) break location;
|
||||
s = q + 6;
|
||||
continue location;
|
||||
}
|
||||
if (newtext[r] == 'W') {
|
||||
this.lon = -Float.parseFloat(new String(newtext, r + 2, p - r - 2)) -
|
||||
Float.parseFloat(new String(newtext, p + 2, q - p - 2)) / 60.0f;
|
||||
Float.parseFloat(new String(newtext, p + 2, q - p - pl - 1)) / 60.0f;
|
||||
if (this.lat != 0.0f) break location;
|
||||
s = q + 6;
|
||||
continue location;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user