mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
Sort out double keywords (dc_subject) early in parsed documents
- by directly using a Set instead of a List - remove the no longer needed String[] getter
This commit is contained in:
parent
47d70732f6
commit
52a9040ae6
|
@ -114,12 +114,11 @@ public class getpageinfo {
|
||||||
prop.put("favicon", (scraper.getFavicon()==null) ? "" : scraper.getFavicon().toString());
|
prop.put("favicon", (scraper.getFavicon()==null) ? "" : scraper.getFavicon().toString());
|
||||||
|
|
||||||
// put keywords
|
// put keywords
|
||||||
final String list[] = scraper.dc_subject();
|
final Set<String> list = scraper.dc_subject();
|
||||||
int count = 0;
|
int count = 0;
|
||||||
for (final String element: list) {
|
for (final String element: list) {
|
||||||
final String tag = element;
|
if (!element.equals("")) {
|
||||||
if (!tag.equals("")) {
|
prop.putXML("tags_"+count+"_tag", element);
|
||||||
prop.putXML("tags_"+count+"_tag", tag);
|
|
||||||
count++;
|
count++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -114,12 +114,11 @@ public class getpageinfo_p {
|
||||||
prop.put("favicon", (scraper.getFavicon()==null) ? "" : scraper.getFavicon().toString());
|
prop.put("favicon", (scraper.getFavicon()==null) ? "" : scraper.getFavicon().toString());
|
||||||
|
|
||||||
// put keywords
|
// put keywords
|
||||||
final String list[] = scraper.dc_subject();
|
final Set<String> list = scraper.dc_subject();
|
||||||
int count = 0;
|
int count = 0;
|
||||||
for (final String element: list) {
|
for (final String element: list) {
|
||||||
final String tag = element;
|
if (!element.equals("")) {
|
||||||
if (!tag.equals("")) {
|
prop.putXML("tags_"+count+"_tag", element);
|
||||||
prop.putXML("tags_"+count+"_tag", tag);
|
|
||||||
count++;
|
count++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -46,7 +46,6 @@ import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.TreeSet;
|
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
@ -71,7 +70,7 @@ public class Document {
|
||||||
private DigestURL source; // the source url
|
private DigestURL source; // the source url
|
||||||
private final String mimeType; // mimeType as taken from http header
|
private final String mimeType; // mimeType as taken from http header
|
||||||
private final String charset; // the charset of the document
|
private final String charset; // the charset of the document
|
||||||
private final List<String> keywords; // most resources provide a keyword field
|
private final Set<String> keywords; // most resources provide a keyword field
|
||||||
private List<String> titles; // the document titles, taken from title and/or h1 tag; shall appear as headline of search result
|
private List<String> titles; // the document titles, taken from title and/or h1 tag; shall appear as headline of search result
|
||||||
private final StringBuilder creator; // author or copyright
|
private final StringBuilder creator; // author or copyright
|
||||||
private final String publisher; // publisher
|
private final String publisher; // publisher
|
||||||
|
@ -115,7 +114,7 @@ public class Document {
|
||||||
this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType;
|
this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType;
|
||||||
this.charset = charset;
|
this.charset = charset;
|
||||||
this.parserObject = parserObject;
|
this.parserObject = parserObject;
|
||||||
this.keywords = new LinkedList<String>();
|
this.keywords = new LinkedHashSet<String>();
|
||||||
if (keywords != null) this.keywords.addAll(Arrays.asList(keywords));
|
if (keywords != null) this.keywords.addAll(Arrays.asList(keywords));
|
||||||
this.titles = (titles == null) ? new ArrayList<String>(1) : titles;
|
this.titles = (titles == null) ? new ArrayList<String>(1) : titles;
|
||||||
this.creator = (author == null) ? new StringBuilder(0) : new StringBuilder(author);
|
this.creator = (author == null) ? new StringBuilder(0) : new StringBuilder(author);
|
||||||
|
@ -214,6 +213,10 @@ dc_coverage
|
||||||
dc_rights
|
dc_rights
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the main document title. This is the 1st in the list of titles.
|
||||||
|
* @return title_string (may return null or empty string)
|
||||||
|
*/
|
||||||
public String dc_title() {
|
public String dc_title() {
|
||||||
return (this.titles == null || this.titles.size() == 0) ? "" : this.titles.iterator().next();
|
return (this.titles == null || this.titles.size() == 0) ? "" : this.titles.iterator().next();
|
||||||
}
|
}
|
||||||
|
@ -222,6 +225,10 @@ dc_rights
|
||||||
return this.titles;
|
return this.titles;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets the title of the document, replacing any existing titles.
|
||||||
|
* @param title
|
||||||
|
*/
|
||||||
public void setTitle(final String title) {
|
public void setTitle(final String title) {
|
||||||
this.titles = new ArrayList<String>();
|
this.titles = new ArrayList<String>();
|
||||||
if (title != null) this.titles.add(title);
|
if (title != null) this.titles.add(title);
|
||||||
|
@ -239,11 +246,8 @@ dc_rights
|
||||||
* @param tags
|
* @param tags
|
||||||
*/
|
*/
|
||||||
public void addTags(Set<String> tags) {
|
public void addTags(Set<String> tags) {
|
||||||
for (String s: this.keywords) {
|
|
||||||
tags.remove(s);
|
|
||||||
}
|
|
||||||
for (String s: tags) {
|
for (String s: tags) {
|
||||||
this.keywords.add(s);
|
if (s != null && !s.isEmpty()) this.keywords.add(s);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -274,28 +278,27 @@ dc_rights
|
||||||
}
|
}
|
||||||
return gf;
|
return gf;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String[] dc_subject() {
|
/**
|
||||||
// sort out doubles and empty words
|
* Get the set of keywords associated with the document
|
||||||
final TreeSet<String> hs = new TreeSet<String>();
|
* @return set of unique keywords
|
||||||
String s;
|
*/
|
||||||
for (int i = 0; i < this.keywords.size(); i++) {
|
public Set<String> dc_subject() {
|
||||||
if (this.keywords.get(i) == null) continue;
|
return this.keywords;
|
||||||
s = (this.keywords.get(i)).trim();
|
|
||||||
if (!s.isEmpty()) hs.add(s);
|
|
||||||
}
|
|
||||||
final String[] t = new String[hs.size()];
|
|
||||||
int i = 0;
|
|
||||||
for (final String u: hs) t[i++] = u;
|
|
||||||
return t;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the set of keywords associated with the document and string
|
||||||
|
* each keyword separated by the separator character
|
||||||
|
*
|
||||||
|
* @param separator character
|
||||||
|
* @return string of keywords or empty string
|
||||||
|
*/
|
||||||
public String dc_subject(final char separator) {
|
public String dc_subject(final char separator) {
|
||||||
final String[] t = dc_subject();
|
if (this.keywords.size() == 0) return "";
|
||||||
if (t.length == 0) return "";
|
|
||||||
// generate a new list
|
// generate a new list
|
||||||
final StringBuilder sb = new StringBuilder(t.length * 8);
|
final StringBuilder sb = new StringBuilder(this.keywords.size() * 8);
|
||||||
for (final String s: t) sb.append(s).append(separator);
|
for (final String s: this.keywords) sb.append(s).append(separator);
|
||||||
return sb.substring(0, sb.length() - 1);
|
return sb.substring(0, sb.length() - 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -427,10 +430,6 @@ dc_rights
|
||||||
return sentences;
|
return sentences;
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<String> getKeywords() {
|
|
||||||
return this.keywords;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Collection<AnchorURL> getAnchors() {
|
public Collection<AnchorURL> getAnchors() {
|
||||||
// returns all links embedded as anchors (clickeable entities)
|
// returns all links embedded as anchors (clickeable entities)
|
||||||
// this is a url(String)/text(String) map
|
// this is a url(String)/text(String) map
|
||||||
|
@ -688,7 +687,7 @@ dc_rights
|
||||||
for (final Document doc: docs) {
|
for (final Document doc: docs) {
|
||||||
this.sections.addAll(doc.sections);
|
this.sections.addAll(doc.sections);
|
||||||
this.titles.addAll(doc.titles());
|
this.titles.addAll(doc.titles());
|
||||||
this.keywords.addAll(doc.getKeywords());
|
this.keywords.addAll(doc.dc_subject());
|
||||||
for (String d: doc.dc_description()) this.descriptions.add(d);
|
for (String d: doc.dc_description()) this.descriptions.add(d);
|
||||||
|
|
||||||
if (!(this.text instanceof ByteArrayOutputStream)) {
|
if (!(this.text instanceof ByteArrayOutputStream)) {
|
||||||
|
|
|
@ -3205,7 +3205,7 @@ public final class Switchboard extends serverSwitch {
|
||||||
//final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
|
//final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
|
||||||
final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString("/crawlStart"));
|
final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString("/crawlStart"));
|
||||||
tags.add("crawlStart");
|
tags.add("crawlStart");
|
||||||
final String[] keywords = scraper.dc_subject();
|
final Set<String> keywords = scraper.dc_subject();
|
||||||
if (keywords != null) {
|
if (keywords != null) {
|
||||||
for (final String k: keywords) {
|
for (final String k: keywords) {
|
||||||
final String kk = BookmarkHelper.cleanTagsString(k);
|
final String kk = BookmarkHelper.cleanTagsString(k);
|
||||||
|
|
Loading…
Reference in New Issue
Block a user