mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
The location search no longer shows re-evaluated locations; it now shows only locations that are attached as metadata to web pages
- added a parser for geo-locations that appear in page text - added geo-locations to the RSS search result - added evaluation of metadata-attached geo-locations in yacysearch_location to show search results on a map git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7631 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
8412f8787d
commit
0430a94eaa
|
@ -6,6 +6,7 @@
|
|||
xmlns:media="http://search.yahoo.com/mrss/"
|
||||
xmlns:atom="http://www.w3.org/2005/Atom"
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#"
|
||||
>
|
||||
<!-- YaCy Search Engine; http://yacy.net -->
|
||||
<channel>
|
||||
|
|
|
@ -9,11 +9,11 @@
|
|||
var map;
|
||||
var searchLayer_md = null;
|
||||
var searchLayer_co = null;
|
||||
var path_mdsearch = 'yacysearch_location.rss?dom=title,publisher,creator,subject&query=';
|
||||
var path_mdsearch = 'yacysearch_location.rss?dom=metatag&query=';
|
||||
var path_cosearch = 'yacysearch_location.rss?dom=query&query=';
|
||||
var marker_md = new OpenLayers.Icon("/env/grafics/marker_red.png", new OpenLayers.Size(11,16));
|
||||
var marker_co = new OpenLayers.Icon("/env/grafics/star_yellow.png", new OpenLayers.Size(25,25));
|
||||
// possible values for dom: query,mdall,title,publisher,creator,subject
|
||||
// possible values for dom: query,metatag,alltext,title,publisher,creator,subject
|
||||
|
||||
function init() {
|
||||
map = new OpenLayers.Map('map', {
|
||||
|
|
|
@ -55,13 +55,13 @@ public class yacysearch_location {
|
|||
prop.put("kml", 1);
|
||||
if (post == null) return prop;
|
||||
String query = post.get("query", "");
|
||||
boolean search_all = !post.containsKey("dom") || post.get("dom", "").equals("all");
|
||||
boolean search_query = search_all || post.get("dom", "").indexOf("query") >= 0;
|
||||
boolean search_mdall = search_all || post.get("dom", "").indexOf("mdall") >= 0;
|
||||
boolean search_title = search_mdall || post.get("dom", "").indexOf("title") >= 0;
|
||||
boolean search_publisher = search_mdall || post.get("dom", "").indexOf("publisher") >= 0;
|
||||
boolean search_creator = search_mdall || post.get("dom", "").indexOf("creator") >= 0;
|
||||
boolean search_subject = search_mdall || post.get("dom", "").indexOf("subject") >= 0;
|
||||
boolean search_query = post.get("dom", "").indexOf("query") >= 0;
|
||||
boolean metatag = post.get("dom", "").indexOf("metatag") >= 0;
|
||||
boolean alltext = post.get("dom", "").indexOf("alltext") >= 0;
|
||||
boolean search_title = alltext || post.get("dom", "").indexOf("title") >= 0;
|
||||
boolean search_publisher = alltext || post.get("dom", "").indexOf("publisher") >= 0;
|
||||
boolean search_creator = alltext || post.get("dom", "").indexOf("creator") >= 0;
|
||||
boolean search_subject = alltext || post.get("dom", "").indexOf("subject") >= 0;
|
||||
long maximumTime = post.getLong("maximumTime", 3000);
|
||||
int maximumRecords = post.getInt("maximumRecords", 200);
|
||||
//i.e. http://localhost:8090/yacysearch_location.kml?query=berlin&maximumTime=2000&maximumRecords=100
|
||||
|
@ -89,7 +89,7 @@ public class yacysearch_location {
|
|||
}
|
||||
}
|
||||
|
||||
if (search_title || search_publisher || search_creator || search_subject) try {
|
||||
if (metatag || search_title || search_publisher || search_creator || search_subject) try {
|
||||
// get a queue of search results
|
||||
String rssSearchServiceURL = "http://127.0.0.1:" + sb.getConfig("port", "8090") + "/yacysearch.rss";
|
||||
BlockingQueue<RSSMessage> results = new LinkedBlockingQueue<RSSMessage>();
|
||||
|
@ -98,6 +98,7 @@ public class yacysearch_location {
|
|||
// take the results and compute some locations
|
||||
RSSMessage message;
|
||||
loop: while ((message = results.poll(maximumTime, TimeUnit.MILLISECONDS)) != RSSMessage.POISON) {
|
||||
|
||||
// find all associated locations
|
||||
Set<Location> locations = new HashSet<Location>();
|
||||
StringBuilder words = new StringBuilder(120);
|
||||
|
@ -112,6 +113,13 @@ public class yacysearch_location {
|
|||
for (int i = 0; i < wordlist.length - 1; i++) locations.addAll(LibraryProvider.geoLoc.find(wordlist[i] + space + wordlist[i + 1], true));
|
||||
for (int i = 0; i < wordlist.length - 2; i++) locations.addAll(LibraryProvider.geoLoc.find(wordlist[i] + space + wordlist[i + 1] + space + wordlist[i + 2], true));
|
||||
|
||||
// add locations from metatag
|
||||
if (metatag) {
|
||||
if (message.getLat() != 0.0f && message.getLon() != 0.0f) {
|
||||
locations.add(new Location(message.getLon(), message.getLat(), message.getTitle().trim()));
|
||||
}
|
||||
}
|
||||
|
||||
for (Location location: locations) {
|
||||
// write for all locations a point to this message
|
||||
prop.put("kml_placemark_" + placemarkCounter + "_location", location.getName());
|
||||
|
|
|
@ -193,6 +193,13 @@ public class yacysearchitem {
|
|||
} else {
|
||||
prop.put("content_code", "");
|
||||
}
|
||||
if (result.lat() == 0.0f || result.lon() == 0.0f) {
|
||||
prop.put("content_loc", 0);
|
||||
} else {
|
||||
prop.put("content_loc", 1);
|
||||
prop.put("content_loc_lat", result.lat());
|
||||
prop.put("content_loc_lon", result.lon());
|
||||
}
|
||||
theQuery.transmitcount = item + 1;
|
||||
return prop;
|
||||
}
|
||||
|
|
|
@ -12,6 +12,7 @@
|
|||
<yacy:path>#[path]#</yacy:path>
|
||||
<yacy:file>#[file]#</yacy:file>
|
||||
<guid isPermaLink="false">#[urlhash]#</guid>
|
||||
#(loc)#::<geo:lat>#[lat]#</geo:lat><geo:long>#[lon]#</geo:long>#(/loc)#
|
||||
</item>::
|
||||
#(item)#::<item>
|
||||
<title>#[name]#</title>
|
||||
|
|
|
@ -168,6 +168,12 @@ public class ResultEntry implements Comparable<ResultEntry>, Comparator<ResultEn
|
|||
public int lapp() {
|
||||
return urlentry.lapp();
|
||||
}
|
||||
public float lat() {
|
||||
return urlentry.metadata().lat();
|
||||
}
|
||||
public float lon() {
|
||||
return urlentry.metadata().lon();
|
||||
}
|
||||
public WordReferenceVars word() {
|
||||
final Reference word = urlentry.word();
|
||||
assert word instanceof WordReferenceVars;
|
||||
|
|
|
@ -54,6 +54,9 @@ import net.yacy.kelondro.util.ISO639;
|
|||
|
||||
public class ContentScraper extends AbstractScraper implements Scraper {
|
||||
|
||||
private final char degree = '\u00B0';
|
||||
private final char[] minuteCharsHTML = "'".toCharArray();
|
||||
|
||||
// statics: for initialization of the HTMLFilterAbstractScraper
|
||||
private static final Set<String> linkTags0 = new HashSet<String>(9,0.99f);
|
||||
private static final Set<String> linkTags1 = new HashSet<String>(7,0.99f);
|
||||
|
@ -121,6 +124,55 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
|
||||
public void scrapeText(final char[] newtext, final String insideTag) {
|
||||
// System.out.println("SCRAPE: " + UTF8.String(newtext));
|
||||
int p, q, s = 0;
|
||||
|
||||
// try to find location information in text
|
||||
location: while (s < newtext.length) {
|
||||
p = CharBuffer.indexOf(newtext, s, degree);
|
||||
if (p < 0) break location;
|
||||
// try to find a coordinate
|
||||
// <nobr>N 50o 05.453'</nobr><nobr>E 008o 30.191'</nobr>
|
||||
// N 52o 28.025 E 013o 20.299
|
||||
q = CharBuffer.indexOf(newtext, p, minuteCharsHTML);
|
||||
if (q < 0) q = CharBuffer.indexOf(newtext, p, " E".toCharArray());
|
||||
if (q < 0) q = CharBuffer.indexOf(newtext, p, " W".toCharArray());
|
||||
if (q < 0 && newtext.length - p == 8) q = newtext.length;
|
||||
if (q < 0) break location;
|
||||
int r = p;
|
||||
while (r-- > 1) {
|
||||
if (newtext[r] == ' ') {
|
||||
r--;
|
||||
if (newtext[r] == 'N') {
|
||||
this.lat = Float.parseFloat(new String(newtext, r + 2, p - r - 2)) +
|
||||
Float.parseFloat(new String(newtext, p + 2, q - p - 2)) / 60.0f;
|
||||
s = q + 6;
|
||||
continue location;
|
||||
}
|
||||
if (newtext[r] == 'S') {
|
||||
this.lat = -Float.parseFloat(new String(newtext, r + 2, p - r - 2)) -
|
||||
Float.parseFloat(new String(newtext, p + 2, q - p - 2)) / 60.0f;
|
||||
s = q + 6;
|
||||
continue location;
|
||||
}
|
||||
if (newtext[r] == 'E') {
|
||||
this.lon = Float.parseFloat(new String(newtext, r + 2, p - r - 2)) +
|
||||
Float.parseFloat(new String(newtext, p + 2, q - p - 2)) / 60.0f;
|
||||
s = q + 6;
|
||||
continue location;
|
||||
}
|
||||
if (newtext[r] == 'W') {
|
||||
this.lon = -Float.parseFloat(new String(newtext, r + 2, p - r - 2)) -
|
||||
Float.parseFloat(new String(newtext, p + 2, q - p - 2)) / 60.0f;
|
||||
s = q + 6;
|
||||
continue location;
|
||||
}
|
||||
break location;
|
||||
}
|
||||
}
|
||||
break location;
|
||||
}
|
||||
|
||||
// find tags inside text
|
||||
String b = cleanLine(super.stripAll(newtext));
|
||||
if ((insideTag != null) && (!(insideTag.equals("a")))) {
|
||||
// texts inside tags sometimes have no punctuation at the line end
|
||||
|
@ -132,7 +184,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
//System.out.println("*** Appended dot: " + b.toString());
|
||||
}
|
||||
// find http links inside text
|
||||
int p, q, s = 0;
|
||||
s = 0;
|
||||
String u;
|
||||
MultiProtocolURI url;
|
||||
while (s < b.length()) {
|
||||
|
|
|
@ -191,8 +191,8 @@ public class URIMetadataRow implements URIMetadata {
|
|||
final CharBuffer s = new CharBuffer(360);
|
||||
s.append(url.toNormalform(false, true)).append(10);
|
||||
s.append(dc_title).append(10);
|
||||
s.append(dc_creator).append(10);
|
||||
s.append(dc_subject).append(10);
|
||||
s.append(dc_creator.length() > 80 ? dc_creator.substring(0, 80) : dc_creator).append(10);
|
||||
s.append(dc_subject.length() > 120 ? dc_subject.substring(0, 120) : dc_subject).append(10);
|
||||
s.append(dc_publisher).append(10);
|
||||
if (lon == 0.0f && lat == 0.0f) s.append(10); else s.append(Float.toString(lat)).append(',').append(Float.toString(lon)).append(10);
|
||||
return UTF8.getBytes(s.toString());
|
||||
|
|
|
@ -246,6 +246,36 @@ public final class CharBuffer extends Writer {
|
|||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
public static int indexOf(final char[] b, final char c) {
|
||||
return indexOf(b, 0, c);
|
||||
}
|
||||
|
||||
public static int indexOf(final char[] b, final int offset, final char c) {
|
||||
for (int i = offset; i < b.length; i++) if (b[i] == c) return i;
|
||||
return -1;
|
||||
}
|
||||
|
||||
public static int indexOf(final char[] b, final char[] s) {
|
||||
return indexOf(b, 0, s);
|
||||
}
|
||||
|
||||
public static int indexOf(final char[] b, final int start, final char[] bs) {
|
||||
if (start + bs.length > b.length) return -1;
|
||||
loop: for (int i = start; i <= b.length - bs.length; i++) {
|
||||
// first test only first char
|
||||
if (b[i] != bs[0]) continue loop;
|
||||
|
||||
// then test all remaining char
|
||||
for (int j = 1; j < bs.length; j++) {
|
||||
if (b[i + j] != bs[j]) continue loop;
|
||||
}
|
||||
|
||||
// found hit
|
||||
return i;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
public int lastIndexOf(final char b) {
|
||||
for (int i = length - 1; i >= 0; i--) if (buffer[offset + i] == b) return i;
|
||||
|
|
Loading…
Reference in New Issue
Block a user