Sort out duplicate keywords (dc_subject) early in parsed documents

- by directly using a Set instead of a List
- remove the no longer needed String[] getter
This commit is contained in:
reger 2015-11-13 01:48:28 +01:00
parent 47d70732f6
commit 52a9040ae6
4 changed files with 36 additions and 39 deletions

View File

@ -114,12 +114,11 @@ public class getpageinfo {
prop.put("favicon", (scraper.getFavicon()==null) ? "" : scraper.getFavicon().toString()); prop.put("favicon", (scraper.getFavicon()==null) ? "" : scraper.getFavicon().toString());
// put keywords // put keywords
final String list[] = scraper.dc_subject(); final Set<String> list = scraper.dc_subject();
int count = 0; int count = 0;
for (final String element: list) { for (final String element: list) {
final String tag = element; if (!element.equals("")) {
if (!tag.equals("")) { prop.putXML("tags_"+count+"_tag", element);
prop.putXML("tags_"+count+"_tag", tag);
count++; count++;
} }
} }

View File

@ -114,12 +114,11 @@ public class getpageinfo_p {
prop.put("favicon", (scraper.getFavicon()==null) ? "" : scraper.getFavicon().toString()); prop.put("favicon", (scraper.getFavicon()==null) ? "" : scraper.getFavicon().toString());
// put keywords // put keywords
final String list[] = scraper.dc_subject(); final Set<String> list = scraper.dc_subject();
int count = 0; int count = 0;
for (final String element: list) { for (final String element: list) {
final String tag = element; if (!element.equals("")) {
if (!tag.equals("")) { prop.putXML("tags_"+count+"_tag", element);
prop.putXML("tags_"+count+"_tag", tag);
count++; count++;
} }
} }

View File

@ -46,7 +46,6 @@ import java.util.LinkedList;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
@ -71,7 +70,7 @@ public class Document {
private DigestURL source; // the source url private DigestURL source; // the source url
private final String mimeType; // mimeType as taken from http header private final String mimeType; // mimeType as taken from http header
private final String charset; // the charset of the document private final String charset; // the charset of the document
private final List<String> keywords; // most resources provide a keyword field private final Set<String> keywords; // most resources provide a keyword field
private List<String> titles; // the document titles, taken from title and/or h1 tag; shall appear as headline of search result private List<String> titles; // the document titles, taken from title and/or h1 tag; shall appear as headline of search result
private final StringBuilder creator; // author or copyright private final StringBuilder creator; // author or copyright
private final String publisher; // publisher private final String publisher; // publisher
@ -115,7 +114,7 @@ public class Document {
this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType; this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType;
this.charset = charset; this.charset = charset;
this.parserObject = parserObject; this.parserObject = parserObject;
this.keywords = new LinkedList<String>(); this.keywords = new LinkedHashSet<String>();
if (keywords != null) this.keywords.addAll(Arrays.asList(keywords)); if (keywords != null) this.keywords.addAll(Arrays.asList(keywords));
this.titles = (titles == null) ? new ArrayList<String>(1) : titles; this.titles = (titles == null) ? new ArrayList<String>(1) : titles;
this.creator = (author == null) ? new StringBuilder(0) : new StringBuilder(author); this.creator = (author == null) ? new StringBuilder(0) : new StringBuilder(author);
@ -214,6 +213,10 @@ dc_coverage
dc_rights dc_rights
*/ */
/**
* Get the main document title. This is the 1st in the list of titles.
* @return title_string (may return null or empty string)
*/
public String dc_title() { public String dc_title() {
return (this.titles == null || this.titles.size() == 0) ? "" : this.titles.iterator().next(); return (this.titles == null || this.titles.size() == 0) ? "" : this.titles.iterator().next();
} }
@ -222,6 +225,10 @@ dc_rights
return this.titles; return this.titles;
} }
/**
* Sets the title of the document, replacing any existing titles.
* @param title
*/
public void setTitle(final String title) { public void setTitle(final String title) {
this.titles = new ArrayList<String>(); this.titles = new ArrayList<String>();
if (title != null) this.titles.add(title); if (title != null) this.titles.add(title);
@ -239,11 +246,8 @@ dc_rights
* @param tags * @param tags
*/ */
public void addTags(Set<String> tags) { public void addTags(Set<String> tags) {
for (String s: this.keywords) {
tags.remove(s);
}
for (String s: tags) { for (String s: tags) {
this.keywords.add(s); if (s != null && !s.isEmpty()) this.keywords.add(s);
} }
} }
@ -274,28 +278,27 @@ dc_rights
} }
return gf; return gf;
} }
public String[] dc_subject() { /**
// sort out doubles and empty words * Get the set of keywords associated with the document
final TreeSet<String> hs = new TreeSet<String>(); * @return set of unique keywords
String s; */
for (int i = 0; i < this.keywords.size(); i++) { public Set<String> dc_subject() {
if (this.keywords.get(i) == null) continue; return this.keywords;
s = (this.keywords.get(i)).trim();
if (!s.isEmpty()) hs.add(s);
}
final String[] t = new String[hs.size()];
int i = 0;
for (final String u: hs) t[i++] = u;
return t;
} }
/**
* Get the keywords associated with the document as a single string,
* with the keywords separated from each other by the separator character
*
* @param separator character
* @return string of keywords or empty string
*/
public String dc_subject(final char separator) { public String dc_subject(final char separator) {
final String[] t = dc_subject(); if (this.keywords.size() == 0) return "";
if (t.length == 0) return "";
// generate a new list // generate a new list
final StringBuilder sb = new StringBuilder(t.length * 8); final StringBuilder sb = new StringBuilder(this.keywords.size() * 8);
for (final String s: t) sb.append(s).append(separator); for (final String s: this.keywords) sb.append(s).append(separator);
return sb.substring(0, sb.length() - 1); return sb.substring(0, sb.length() - 1);
} }
@ -427,10 +430,6 @@ dc_rights
return sentences; return sentences;
} }
public List<String> getKeywords() {
return this.keywords;
}
public Collection<AnchorURL> getAnchors() { public Collection<AnchorURL> getAnchors() {
// returns all links embedded as anchors (clickeable entities) // returns all links embedded as anchors (clickeable entities)
// this is a url(String)/text(String) map // this is a url(String)/text(String) map
@ -688,7 +687,7 @@ dc_rights
for (final Document doc: docs) { for (final Document doc: docs) {
this.sections.addAll(doc.sections); this.sections.addAll(doc.sections);
this.titles.addAll(doc.titles()); this.titles.addAll(doc.titles());
this.keywords.addAll(doc.getKeywords()); this.keywords.addAll(doc.dc_subject());
for (String d: doc.dc_description()) this.descriptions.add(d); for (String d: doc.dc_description()) this.descriptions.add(d);
if (!(this.text instanceof ByteArrayOutputStream)) { if (!(this.text instanceof ByteArrayOutputStream)) {

View File

@ -3205,7 +3205,7 @@ public final class Switchboard extends serverSwitch {
//final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart"))); //final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString("/crawlStart")); final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString("/crawlStart"));
tags.add("crawlStart"); tags.add("crawlStart");
final String[] keywords = scraper.dc_subject(); final Set<String> keywords = scraper.dc_subject();
if (keywords != null) { if (keywords != null) {
for (final String k: keywords) { for (final String k: keywords) {
final String kk = BookmarkHelper.cleanTagsString(k); final String kk = BookmarkHelper.cleanTagsString(k);