The location search no longer shows re-evaluated locations; it now shows only locations that are attached as metadata to web pages.

- added parser for geo-locations appearing in text
- added geo-locations to rss search result
- added evaluation of metadata-attached geo-locations in yacysearch_location to show search results within a map


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7631 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
orbiter 2011-03-30 23:26:36 +00:00
parent 8412f8787d
commit 0430a94eaa
9 changed files with 118 additions and 13 deletions

View File

@ -6,6 +6,7 @@
xmlns:media="http://search.yahoo.com/mrss/"
xmlns:atom="http://www.w3.org/2005/Atom"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#"
>
<!-- YaCy Search Engine; http://yacy.net -->
<channel>

View File

@ -9,11 +9,11 @@
var map;
var searchLayer_md = null;
var searchLayer_co = null;
var path_mdsearch = 'yacysearch_location.rss?dom=title,publisher,creator,subject&query=';
var path_mdsearch = 'yacysearch_location.rss?dom=metatag&query=';
var path_cosearch = 'yacysearch_location.rss?dom=query&query=';
var marker_md = new OpenLayers.Icon("/env/grafics/marker_red.png", new OpenLayers.Size(11,16));
var marker_co = new OpenLayers.Icon("/env/grafics/star_yellow.png", new OpenLayers.Size(25,25));
// possible values for dom: query,mdall,title,publisher,creator,subject
// possible values for dom: query,metatag,alltext,title,publisher,creator,subject
function init() {
map = new OpenLayers.Map('map', {

View File

@ -55,13 +55,13 @@ public class yacysearch_location {
prop.put("kml", 1);
if (post == null) return prop;
String query = post.get("query", "");
boolean search_all = !post.containsKey("dom") || post.get("dom", "").equals("all");
boolean search_query = search_all || post.get("dom", "").indexOf("query") >= 0;
boolean search_mdall = search_all || post.get("dom", "").indexOf("mdall") >= 0;
boolean search_title = search_mdall || post.get("dom", "").indexOf("title") >= 0;
boolean search_publisher = search_mdall || post.get("dom", "").indexOf("publisher") >= 0;
boolean search_creator = search_mdall || post.get("dom", "").indexOf("creator") >= 0;
boolean search_subject = search_mdall || post.get("dom", "").indexOf("subject") >= 0;
boolean search_query = post.get("dom", "").indexOf("query") >= 0;
boolean metatag = post.get("dom", "").indexOf("metatag") >= 0;
boolean alltext = post.get("dom", "").indexOf("alltext") >= 0;
boolean search_title = alltext || post.get("dom", "").indexOf("title") >= 0;
boolean search_publisher = alltext || post.get("dom", "").indexOf("publisher") >= 0;
boolean search_creator = alltext || post.get("dom", "").indexOf("creator") >= 0;
boolean search_subject = alltext || post.get("dom", "").indexOf("subject") >= 0;
long maximumTime = post.getLong("maximumTime", 3000);
int maximumRecords = post.getInt("maximumRecords", 200);
//i.e. http://localhost:8090/yacysearch_location.kml?query=berlin&maximumTime=2000&maximumRecords=100
@ -89,7 +89,7 @@ public class yacysearch_location {
}
}
if (search_title || search_publisher || search_creator || search_subject) try {
if (metatag || search_title || search_publisher || search_creator || search_subject) try {
// get a queue of search results
String rssSearchServiceURL = "http://127.0.0.1:" + sb.getConfig("port", "8090") + "/yacysearch.rss";
BlockingQueue<RSSMessage> results = new LinkedBlockingQueue<RSSMessage>();
@ -98,6 +98,7 @@ public class yacysearch_location {
// take the results and compute some locations
RSSMessage message;
loop: while ((message = results.poll(maximumTime, TimeUnit.MILLISECONDS)) != RSSMessage.POISON) {
// find all associated locations
Set<Location> locations = new HashSet<Location>();
StringBuilder words = new StringBuilder(120);
@ -112,6 +113,13 @@ public class yacysearch_location {
for (int i = 0; i < wordlist.length - 1; i++) locations.addAll(LibraryProvider.geoLoc.find(wordlist[i] + space + wordlist[i + 1], true));
for (int i = 0; i < wordlist.length - 2; i++) locations.addAll(LibraryProvider.geoLoc.find(wordlist[i] + space + wordlist[i + 1] + space + wordlist[i + 2], true));
// add locations from metatag
if (metatag) {
if (message.getLat() != 0.0f && message.getLon() != 0.0f) {
locations.add(new Location(message.getLon(), message.getLat(), message.getTitle().trim()));
}
}
for (Location location: locations) {
// write for all locations a point to this message
prop.put("kml_placemark_" + placemarkCounter + "_location", location.getName());

View File

@ -193,6 +193,13 @@ public class yacysearchitem {
} else {
prop.put("content_code", "");
}
if (result.lat() == 0.0f || result.lon() == 0.0f) {
prop.put("content_loc", 0);
} else {
prop.put("content_loc", 1);
prop.put("content_loc_lat", result.lat());
prop.put("content_loc_lon", result.lon());
}
theQuery.transmitcount = item + 1;
return prop;
}

View File

@ -12,6 +12,7 @@
<yacy:path>#[path]#</yacy:path>
<yacy:file>#[file]#</yacy:file>
<guid isPermaLink="false">#[urlhash]#</guid>
#(loc)#::<geo:lat>#[lat]#</geo:lat><geo:long>#[lon]#</geo:long>#(/loc)#
</item>::
#(item)#::<item>
<title>#[name]#</title>

View File

@ -168,6 +168,12 @@ public class ResultEntry implements Comparable<ResultEntry>, Comparator<ResultEn
public int lapp() {
return urlentry.lapp();
}
/**
 * Returns the latitude recorded in the metadata of the underlying URL entry.
 * NOTE(review): presumably decimal degrees, with 0.0f used when no location
 * is known -- confirm against the metadata class.
 *
 * @return the value of {@code urlentry.metadata().lat()}
 */
public float lat() {
return urlentry.metadata().lat();
}
/**
 * Returns the longitude recorded in the metadata of the underlying URL entry.
 * NOTE(review): presumably decimal degrees, with 0.0f used when no location
 * is known -- confirm against the metadata class.
 *
 * @return the value of {@code urlentry.metadata().lon()}
 */
public float lon() {
return urlentry.metadata().lon();
}
public WordReferenceVars word() {
final Reference word = urlentry.word();
assert word instanceof WordReferenceVars;

View File

@ -54,6 +54,9 @@ import net.yacy.kelondro.util.ISO639;
public class ContentScraper extends AbstractScraper implements Scraper {
private final char degree = '\u00B0';
private final char[] minuteCharsHTML = "&#039;".toCharArray();
// statics: for initialization of the HTMLFilterAbstractScraper
private static final Set<String> linkTags0 = new HashSet<String>(9,0.99f);
private static final Set<String> linkTags1 = new HashSet<String>(7,0.99f);
@ -121,6 +124,55 @@ public class ContentScraper extends AbstractScraper implements Scraper {
public void scrapeText(final char[] newtext, final String insideTag) {
// System.out.println("SCRAPE: " + UTF8.String(newtext));
int p, q, s = 0;
// try to find location information in text
location: while (s < newtext.length) {
p = CharBuffer.indexOf(newtext, s, degree);
if (p < 0) break location;
// try to find a coordinate
// <nobr>N 50o 05.453&#039;</nobr><nobr>E 008o 30.191&#039;</nobr>
// N 52o 28.025 E 013o 20.299
q = CharBuffer.indexOf(newtext, p, minuteCharsHTML);
if (q < 0) q = CharBuffer.indexOf(newtext, p, " E".toCharArray());
if (q < 0) q = CharBuffer.indexOf(newtext, p, " W".toCharArray());
if (q < 0 && newtext.length - p == 8) q = newtext.length;
if (q < 0) break location;
int r = p;
while (r-- > 1) {
if (newtext[r] == ' ') {
r--;
if (newtext[r] == 'N') {
this.lat = Float.parseFloat(new String(newtext, r + 2, p - r - 2)) +
Float.parseFloat(new String(newtext, p + 2, q - p - 2)) / 60.0f;
s = q + 6;
continue location;
}
if (newtext[r] == 'S') {
this.lat = -Float.parseFloat(new String(newtext, r + 2, p - r - 2)) -
Float.parseFloat(new String(newtext, p + 2, q - p - 2)) / 60.0f;
s = q + 6;
continue location;
}
if (newtext[r] == 'E') {
this.lon = Float.parseFloat(new String(newtext, r + 2, p - r - 2)) +
Float.parseFloat(new String(newtext, p + 2, q - p - 2)) / 60.0f;
s = q + 6;
continue location;
}
if (newtext[r] == 'W') {
this.lon = -Float.parseFloat(new String(newtext, r + 2, p - r - 2)) -
Float.parseFloat(new String(newtext, p + 2, q - p - 2)) / 60.0f;
s = q + 6;
continue location;
}
break location;
}
}
break location;
}
// find tags inside text
String b = cleanLine(super.stripAll(newtext));
if ((insideTag != null) && (!(insideTag.equals("a")))) {
// texts inside tags sometimes have no punctuation at the line end
@ -132,7 +184,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
//System.out.println("*** Appended dot: " + b.toString());
}
// find http links inside text
int p, q, s = 0;
s = 0;
String u;
MultiProtocolURI url;
while (s < b.length()) {

View File

@ -191,8 +191,8 @@ public class URIMetadataRow implements URIMetadata {
final CharBuffer s = new CharBuffer(360);
s.append(url.toNormalform(false, true)).append(10);
s.append(dc_title).append(10);
s.append(dc_creator).append(10);
s.append(dc_subject).append(10);
s.append(dc_creator.length() > 80 ? dc_creator.substring(0, 80) : dc_creator).append(10);
s.append(dc_subject.length() > 120 ? dc_subject.substring(0, 120) : dc_subject).append(10);
s.append(dc_publisher).append(10);
if (lon == 0.0f && lat == 0.0f) s.append(10); else s.append(Float.toString(lat)).append(',').append(Float.toString(lon)).append(10);
return UTF8.getBytes(s.toString());

View File

@ -247,6 +247,36 @@ public final class CharBuffer extends Writer {
return -1;
}
/**
 * Finds the first occurrence of a character in a char array, scanning
 * from the beginning of the array.
 *
 * @param b the array to search
 * @param c the character to look for
 * @return the index of the first element equal to {@code c}, or -1 if there is none
 */
public static int indexOf(final char[] b, final char c) {
    final int fromBeginning = 0;
    return indexOf(b, fromBeginning, c);
}
/**
 * Finds the first occurrence of a character in a char array, scanning
 * forward from a given position.
 *
 * @param b      the array to search
 * @param offset the index at which the scan starts; an offset at or beyond
 *               the end of the array yields -1
 * @param c      the character to look for
 * @return the index of the first element at or after {@code offset} that
 *         equals {@code c}, or -1 if there is none
 */
public static int indexOf(final char[] b, final int offset, final char c) {
    int pos = offset;
    while (pos < b.length) {
        if (b[pos] == c) {
            return pos;
        }
        pos++;
    }
    return -1;
}
/**
 * Finds the first occurrence of a char sequence in a char array, scanning
 * from the beginning of the array.
 *
 * @param b the array to search
 * @param s the sequence to look for
 * @return the index at which the first match of {@code s} starts, or -1 if there is none
 */
public static int indexOf(final char[] b, final char[] s) {
    final int fromBeginning = 0;
    return indexOf(b, fromBeginning, s);
}
/**
 * Finds the first occurrence of a char sequence in a char array, scanning
 * forward from a given position.
 *
 * Edge cases follow {@link String#indexOf(String, int)} where the previous
 * implementation threw an ArrayIndexOutOfBoundsException: a negative
 * {@code start} is treated as 0, and an empty pattern matches immediately
 * at the (clamped) start position.
 *
 * @param b     the array to search
 * @param start the index at which the scan starts
 * @param bs    the sequence to look for
 * @return the index at which the first match of {@code bs} at or after
 *         {@code start} begins, or -1 if the pattern does not fit into the
 *         remaining part of {@code b} or does not occur there
 */
public static int indexOf(final char[] b, final int start, final char[] bs) {
    // last index at which a match could still begin; negative if bs is longer than b
    final int last = b.length - bs.length;
    // written as a subtraction so that a huge start value cannot overflow
    if (start > last) return -1;
    final int from = Math.max(0, start);
    // an empty pattern matches everywhere; do not touch bs[0] in that case
    if (bs.length == 0) return from;
    loop: for (int i = from; i <= last; i++) {
        for (int j = 0; j < bs.length; j++) {
            if (b[i + j] != bs[j]) continue loop;
        }
        // all chars of the pattern matched
        return i;
    }
    return -1;
}
public int lastIndexOf(final char b) {
for (int i = length - 1; i >= 0; i--) if (buffer[offset + i] == b) return i;
return -1;