//Document.java //------------------------ //part of YaCy //(C) by Michael Peter Christen; mc@yacy.net //first published on http://www.anomic.de //Frankfurt, Germany, 2005 // // $LastChangedDate$ // $LastChangedRevision$ // $LastChangedBy$ // //This program is free software; you can redistribute it and/or modify //it under the terms of the GNU General Public License as published by //the Free Software Foundation; either version 2 of the License, or //(at your option) any later version. // //This program is distributed in the hope that it will be useful, //but WITHOUT ANY WARRANTY; without even the implied warranty of //MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //GNU General Public License for more details. // //You should have received a copy of the GNU General Public License //along with this program; if not, write to the Free Software //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA package net.yacy.document; import java.io.BufferedInputStream; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStreamWriter; import java.io.UnsupportedEncodingException; import java.io.Writer; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Properties; import java.util.Set; import java.util.TreeSet; import net.yacy.cora.date.ISO8601Formatter; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.lod.vocabulary.Tagging; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.retrieval.Request; import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.ImageEntry; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.ByteBuffer; import net.yacy.kelondro.util.FileUtils; public class Document { private final DigestURI source; // the source url private final String mimeType; // mimeType as taken from http header private final String charset; // the charset of the document private final List keywords; // most resources provide a keyword field private List titles; // the document titles, taken from title and/or h1 tag; shall appear as headline of search result private final StringBuilder creator; // author or copyright private final String publisher; // publisher private final List sections; // if present: more titles/headlines appearing in the document private final StringBuilder description; // an abstract, if present: short content description private Object text; // the clear text, all that is visible private final Map anchors; // all links embedded as clickeable entities (anchor tags) private final Map rss; // all embedded rss feeds private final Map images; // all visible pictures in document // the anchors and images - Maps are URL-to-EntityDescription mappings. // The EntityDescription appear either as visible text in anchors or as alternative // text in image tags. private Map audiolinks, videolinks, applinks, hyperlinks; private Map inboundlinks, outboundlinks; private Map emaillinks; private MultiProtocolURI favicon; private boolean resorted; private final Set languages; private final boolean indexingDenied; private final double lon, lat; private final Object parserObject; // the source object that was used to create the Document private final Map> generic_facets; // a map from vocabulary names to the set of tags for that vocabulary which apply for this document public Document(final DigestURI location, final String mimeType, final String charset, final Object parserObject, final Set languages, final String[] keywords, final List titles, final String author, final String publisher, final String[] sections, final String abstrct, final double lon, final double lat, final Object text, final Map anchors, final Map rss, final Map images, final boolean indexingDenied) { this.source = location; this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType; this.charset = charset; this.parserObject = parserObject; this.keywords = new LinkedList(); if (keywords != null) this.keywords.addAll(Arrays.asList(keywords)); this.titles = (titles == null) ? new ArrayList(1) : titles; this.creator = (author == null) ? new StringBuilder(0) : new StringBuilder(author); this.sections = new LinkedList() ; if (sections != null) this.sections.addAll(Arrays.asList(sections)); this.description = (abstrct == null) ? new StringBuilder(0) : new StringBuilder(abstrct); if (lat >= -90.0d && lat <= 90.0d && lon >= -180.0d && lon <= 180.0d) { this.lon = lon; this.lat = lat; } else { // we ignore false values because otherwise solr will cause an error when we input the coordinates into the index this.lon = 0.0d; this.lat = 0.0d; } this.anchors = (anchors == null) ? new HashMap(0) : anchors; this.rss = (rss == null) ? new HashMap(0) : rss; this.images = (images == null) ? new HashMap() : images; this.publisher = publisher; this.hyperlinks = null; this.audiolinks = null; this.videolinks = null; this.applinks = null; this.emaillinks = null; this.resorted = false; this.inboundlinks = null; this.outboundlinks = null; this.languages = languages; this.indexingDenied = indexingDenied; this.text = text == null ? "" : text; this.generic_facets = new HashMap>(); } public Object getParserObject() { return this.parserObject; } public Set getContentLanguages() { return this.languages; } public String getFileName() { return this.source.getFileName(); } public Map> getGenericFacets() { return this.generic_facets; } /** * compute a set of languages that this document contains * the language is not computed using a statistical analysis of the content, only from given metadata that came with the document * if there are several languages defined in the document, the TLD is taken to check which one should be picked * If there is no metadata at all, null is returned * @return a string with a language name using the alpha-2 code of ISO 639 */ public String dc_language() { if (this.languages == null) return null; if (this.languages.isEmpty()) return null; if (this.languages.size() == 1) return this.languages.iterator().next(); if (this.languages.contains(this.source.language())) return this.source.language(); // now we are confused: the declared languages differ all from the TLD // just pick one of the languages that we have return this.languages.iterator().next(); } /* DC according to rfc 5013 * dc_title * dc_creator * dc_subject * dc_description * dc_publisher dc_contributor dc_date dc_type * dc_format * dc_identifier * dc_source dc_language dc_relation dc_coverage dc_rights */ public String dc_title() { return (this.titles == null || this.titles.size() == 0) ? "" : this.titles.iterator().next(); } public List titles() { return this.titles; } public void setTitle(final String title) { this.titles = new ArrayList(); if (title != null) this.titles.add(title); } public String dc_creator() { return (this.creator == null) ? "" : this.creator.toString(); } /** * add the given words to the set of keywords. * These keywords will appear in dc_subject * @param tags */ public void addTags(Set tags) { for (String s: this.keywords) { tags.remove(s); } for (String s: tags) { this.keywords.add(s); } } /** * add the given words to the set of keywords. * These keywords will appear in dc_subject * @param tags */ protected void addMetatags(Map> tags) { //String subject = YaCyMetadata.hashURI(this.source.hash()); //for (String s: this.keywords) { // tags.remove(s); //} for (Map.Entry> e: tags.entrySet()) { Tagging vocabulary = LibraryProvider.autotagging.getVocabulary(e.getKey()); if (vocabulary == null) continue; //String objectspace = vocabulary.getObjectspace(); //StringBuilder sb = new StringBuilder(e.getValue().size() * 20); Set objects = new HashSet(); for (Tagging.Metatag s: e.getValue()) { objects.add(s.getObject()); //sb.append(',').append(s.getObject()); /* String objectlink = vocabulary.getObjectlink(s.getObject()); if ((objectspace != null && objectspace.length() > 0) || (objectlink != null && objectlink.length() > 0)) { JenaTripleStore.addTriple(subject, DCTerms.references.getPredicate(), objectlink == null || objectlink.isEmpty() ? objectspace + s.getObject() + "#" + s.getObject() : objectlink + "#" + s.getObject()); } */ } // put to triplestore //JenaTripleStore.addTriple(subject, Owl.SameAs.getPredicate(), this.source.toNormalform(true)); //JenaTripleStore.addTriple(subject, vocabulary.getPredicate(), sb.substring(1)); // superfluous with the generic_facets this.generic_facets.put(vocabulary.getName(), objects); } } public String[] dc_subject() { // sort out doubles and empty words final TreeSet hs = new TreeSet(); String s; for (int i = 0; i < this.keywords.size(); i++) { if (this.keywords.get(i) == null) continue; s = (this.keywords.get(i)).trim(); if (!s.isEmpty()) hs.add(s); } final String[] t = new String[hs.size()]; int i = 0; for (final String u: hs) t[i++] = u; return t; } public String dc_subject(final char separator) { final String[] t = dc_subject(); if (t.length == 0) return ""; // generate a new list final StringBuilder sb = new StringBuilder(t.length * 8); for (final String s: t) sb.append(s).append(separator); return sb.substring(0, sb.length() - 1); } public String dc_description() { if (this.description == null) return dc_title(); return this.description.toString(); } public String dc_publisher() { return this.publisher == null ? "" : this.publisher; } public String dc_format() { return this.mimeType; } public String dc_identifier() { return this.source.toNormalform(true); } public DigestURI dc_source() { return this.source; } /** * @return the supposed charset of this document or null if unknown */ public String getCharset() { return this.charset; } public String[] getSectionTitles() { if (this.sections == null) { return new String[] { dc_title() }; } return this.sections.toArray(new String[this.sections.size()]); } public InputStream getTextStream() { try { if (this.text == null) return new ByteArrayInputStream(UTF8.getBytes("")); if (this.text instanceof String) { //return new StreamReader((String) this.text); return new ByteArrayInputStream(UTF8.getBytes(((String) this.text))); } else if (this.text instanceof InputStream) { return (InputStream) this.text; } else if (this.text instanceof File) { return new BufferedInputStream(new FileInputStream((File)this.text)); } else if (this.text instanceof byte[]) { return new ByteArrayInputStream((byte[]) this.text); } else if (this.text instanceof ByteArrayOutputStream) { return new ByteArrayInputStream(((ByteArrayOutputStream) this.text).toByteArray()); } assert false : this.text.getClass().toString(); return null; } catch (final Exception e) { ConcurrentLog.logException(e); } return new ByteArrayInputStream(UTF8.getBytes("")); } public String getTextString() { try { if (this.text == null) { this.text = ""; } else if (this.text instanceof InputStream) { this.text = UTF8.String(FileUtils.read((InputStream) this.text)); } else if (this.text instanceof File) { this.text = UTF8.String(FileUtils.read((File) this.text)); } else if (this.text instanceof byte[]) { this.text = UTF8.String((byte[]) this.text); } else if (this.text instanceof ByteArrayOutputStream) { this.text = UTF8.String(((ByteArrayOutputStream) this.text).toByteArray()); } assert this.text instanceof String : this.text.getClass().toString(); return (String) this.text; } catch (final Exception e) { ConcurrentLog.logException(e); } return ""; } public long getTextLength() { try { if (this.text == null) return -1; if (this.text instanceof String) { return ((String) this.text).length(); } else if (this.text instanceof InputStream) { return ((InputStream) this.text).available(); } else if (this.text instanceof File) { return ((File) this.text).length(); } else if (this.text instanceof byte[]) { return ((byte[]) this.text).length; } else if (this.text instanceof ByteArrayOutputStream) { return ((ByteArrayOutputStream) this.text).size(); } assert false : this.text.getClass().toString(); return -1; } catch (final Exception e) { ConcurrentLog.logException(e); } return -1; } public List getSentences(final boolean pre) { final SentenceReader sr = new SentenceReader(getTextString(), pre); List sentences = new ArrayList(); while (sr.hasNext()) { sentences.add(sr.next()); } return sentences; } public List getKeywords() { return this.keywords; } public Map getAnchors() { // returns all links embedded as anchors (clickeable entities) // this is a url(String)/text(String) map return this.anchors; } public Map getRSS() { // returns all links embedded as anchors (clickeable entities) // this is a url(String)/text(String) map return this.rss; } // the next three methods provide a calculated view on the getAnchors/getImages: public Map getHyperlinks() { // this is a subset of the getAnchor-set: only links to other hyperrefs if (!this.resorted) resortLinks(); return this.hyperlinks; } public Map getAudiolinks() { if (!this.resorted) resortLinks(); return this.audiolinks; } public Map getVideolinks() { if (!this.resorted) resortLinks(); return this.videolinks; } public Map getImages() { // returns all links enbedded as pictures (visible in document) // this resturns a htmlFilterImageEntry collection if (!this.resorted) resortLinks(); return this.images; } public Map getApplinks() { if (!this.resorted) resortLinks(); return this.applinks; } public Map getEmaillinks() { // this is part of the getAnchor-set: only links to email addresses if (!this.resorted) resortLinks(); return this.emaillinks; } public double lon() { return this.lon; } public double lat() { return this.lat; } private void resortLinks() { if (this.resorted) return; synchronized (this) { if (this.resorted) return; // extract hyperlinks, medialinks and emaillinks from anchorlinks DigestURI url; String u; int extpos, qpos; String ext = null; final String thishost = this.source.getHost(); this.inboundlinks = new HashMap(); this.outboundlinks = new HashMap(); this.hyperlinks = new HashMap(); this.videolinks = new HashMap(); this.audiolinks = new HashMap(); this.applinks = new HashMap(); this.emaillinks = new HashMap(); final Map collectedImages = new HashMap(); // this is a set that is collected now and joined later to the imagelinks for (final Map.Entry entry: collectedImages.entrySet()) { if (entry.getKey().getHost().equals(thishost)) this.inboundlinks.put(entry.getKey(), "image"); else this.outboundlinks.put(entry.getKey(), "image"); } for (final Map.Entry entry: this.anchors.entrySet()) { url = entry.getKey(); if (url == null) continue; final boolean noindex = entry.getValue().getProperty("rel", "").toLowerCase().indexOf("noindex",0) >= 0; final boolean nofollow = entry.getValue().getProperty("rel", "").toLowerCase().indexOf("nofollow",0) >= 0; if ((thishost == null && url.getHost() == null) || ((thishost != null && url.getHost() != null) && (url.getHost().endsWith(thishost) || (thishost.startsWith("www.") && url.getHost().endsWith(thishost.substring(4)))))) { this.inboundlinks.put(url, "anchor" + (noindex ? " noindex" : "") + (nofollow ? " nofollow" : "")); } else { this.outboundlinks.put(url, "anchor" + (noindex ? " noindex" : "") + (nofollow ? " nofollow" : "")); } u = url.toNormalform(true); final String name = entry.getValue().getProperty("name", ""); if (u.startsWith("mailto:")) { this.emaillinks.put(u.substring(7), name); } else { extpos = u.lastIndexOf('.'); if (extpos > 0) { if (((qpos = u.indexOf('?')) >= 0) && (qpos > extpos)) { ext = u.substring(extpos + 1, qpos).toLowerCase(); } else { ext = u.substring(extpos + 1).toLowerCase(); } if (Classification.isMediaExtension(ext)) { // this is not a normal anchor, its a media link if (Classification.isImageExtension(ext)) { ContentScraper.addImage(collectedImages, new ImageEntry(url, name, -1, -1, -1)); } else if (Classification.isAudioExtension(ext)) this.audiolinks.put(url, name); else if (Classification.isVideoExtension(ext)) this.videolinks.put(url, name); else if (Classification.isApplicationExtension(ext)) this.applinks.put(url, name); } } // in any case we consider this as a link and let the parser decide if that link can be followed this.hyperlinks.put(url, name); } } // add image links that we collected from the anchors to the image map ContentScraper.addAllImages(this.images, collectedImages); // expand the hyperlinks: // we add artificial hyperlinks to the hyperlink set // that can be calculated from given hyperlinks and imagelinks this.hyperlinks.putAll(allReflinks(this.images.values())); this.hyperlinks.putAll(allReflinks(this.audiolinks.keySet())); this.hyperlinks.putAll(allReflinks(this.videolinks.keySet())); this.hyperlinks.putAll(allReflinks(this.applinks.keySet())); /* hyperlinks.putAll(allSubpaths(hyperlinks.keySet())); hyperlinks.putAll(allSubpaths(images.values())); hyperlinks.putAll(allSubpaths(audiolinks.keySet())); hyperlinks.putAll(allSubpaths(videolinks.keySet())); hyperlinks.putAll(allSubpaths(applinks.keySet())); */ // don't do this again this.resorted = true; } } public static Map allSubpaths(final Collection links) { // links is either a Set of Strings (urls) or a Set of // htmlFilterImageEntries final Set h = new HashSet(); Iterator i = links.iterator(); Object o; MultiProtocolURI url; String u; int pos; int l; while (i.hasNext()) try { o = i.next(); if (o instanceof MultiProtocolURI) url = (MultiProtocolURI) o; else if (o instanceof String) url = new MultiProtocolURI((String) o); else if (o instanceof ImageEntry) url = ((ImageEntry) o).url(); else { assert false; continue; } u = url.toNormalform(true); if (u.endsWith("/")) u = u.substring(0, u.length() - 1); pos = u.lastIndexOf('/'); while (pos > 8) { l = u.length(); u = u.substring(0, pos + 1); h.add(u); u = u.substring(0, pos); assert (u.length() < l) : "u = " + u; pos = u.lastIndexOf('/'); } } catch (final MalformedURLException e) { } // now convert the strings to yacyURLs i = h.iterator(); final Map v = new HashMap(); while (i.hasNext()) { u = (String) i.next(); try { url = new MultiProtocolURI(u); v.put(url, "sub"); } catch (final MalformedURLException e) { } } return v; } private static Map allReflinks(final Collection links) { // links is either a Set of Strings (with urls) or // htmlFilterImageEntries // we find all links that are part of a reference inside a url final Map v = new HashMap(); final Iterator i = links.iterator(); Object o; DigestURI url = null; String u; int pos; loop: while (i.hasNext()) try { o = i.next(); if (o instanceof DigestURI) url = (DigestURI) o; else if (o instanceof String) url = new DigestURI((String) o); else if (o instanceof ImageEntry) url = ((ImageEntry) o).url(); else { assert false; continue loop; } if (url == null) continue loop; u = url.toNormalform(true); if ((pos = u.toLowerCase().indexOf("http://", 7)) > 0) { i.remove(); u = u.substring(pos); while ((pos = u.toLowerCase().indexOf("http://", 7)) > 0) u = u.substring(pos); url = new DigestURI(u); if (!(v.containsKey(url))) v.put(url, "ref"); continue loop; } if ((pos = u.toLowerCase().indexOf("/www.", 7)) > 0) { i.remove(); u = "http:/" + u.substring(pos); while ((pos = u.toLowerCase().indexOf("/www.", 7)) > 0) u = "http:/" + u.substring(pos); url = new DigestURI(u); if (!(v.containsKey(url))) v.put(url, "ref"); continue loop; } } catch (final MalformedURLException e) { } return v; } public void addSubDocuments(final Document[] docs) throws IOException { for (final Document doc: docs) { this.sections.addAll(doc.sections); this.titles.addAll(doc.titles()); this.keywords.addAll(doc.getKeywords()); if (this.description.length() > 0) this.description.append('\n'); this.description.append(doc.dc_description()); if (!(this.text instanceof ByteArrayOutputStream)) { this.text = new ByteArrayOutputStream(); } FileUtils.copy(doc.getTextStream(), (ByteArrayOutputStream) this.text); this.anchors.putAll(doc.getAnchors()); this.rss.putAll(doc.getRSS()); ContentScraper.addAllImages(this.images, doc.getImages()); } } /** * @return the {@link URL} to the favicon that belongs to the document */ public MultiProtocolURI getFavicon() { return this.favicon; } /** * @param faviconURL the {@link URL} to the favicon that belongs to the document */ public void setFavicon(final MultiProtocolURI faviconURL) { this.favicon = faviconURL; } public int inboundLinkNofollowCount() { if (this.inboundlinks == null) resortLinks(); if (this.inboundlinks == null) return 0; int c = 0; for (final String tag: this.inboundlinks.values()) { if (tag.contains("nofollow")) c++; } return c; } public int outboundLinkNofollowCount() { if (this.outboundlinks == null) resortLinks(); if (this.outboundlinks == null) return 0; int c = 0; for (final String tag: this.outboundlinks.values()) { if (tag.contains("nofollow")) c++; } return c; } public Set inboundLinks() { if (this.inboundlinks == null) resortLinks(); return (this.inboundlinks == null) ? null : this.inboundlinks.keySet(); } public Set outboundLinks() { if (this.outboundlinks == null) resortLinks(); return (this.outboundlinks == null) ? null : this.outboundlinks.keySet(); } public boolean indexingDenied() { return this.indexingDenied; } public void writeXML(final Writer os, final Date date) throws IOException { os.write("\n"); final String title = dc_title(); if (title != null && title.length() > 0) os.write("\n"); os.write("" + dc_identifier() + "\n"); final String creator = dc_creator(); if (creator != null && creator.length() > 0) os.write("\n"); final String publisher = dc_publisher(); if (publisher != null && publisher.length() > 0) os.write("\n"); final String subject = this.dc_subject(';'); if (subject != null && subject.length() > 0) os.write("\n"); if (this.text != null) { os.write("\n"); } final String language = dc_language(); if (language != null && language.length() > 0) os.write("" + dc_language() + "\n"); os.write("" + ISO8601Formatter.FORMATTER.format(date) + "\n"); if (this.lon != 0.0 && this.lat != 0.0) os.write("" + this.lon +"" + this.lat + "\n"); os.write("\n"); } @Override public String toString() { final ByteArrayOutputStream baos = new ByteArrayOutputStream(); try { final Writer osw = new OutputStreamWriter(baos, "UTF-8"); writeXML(osw, new Date()); osw.close(); return UTF8.String(baos.toByteArray()); } catch (final UnsupportedEncodingException e1) { return ""; } catch (final IOException e) { return ""; } } public synchronized void close() { if (this.text == null) return; // try close the output stream if (this.text instanceof InputStream) try { ((InputStream) this.text).close(); } catch (final Exception e) {} finally { this.text = null; } // delete the temp file if (this.text instanceof File) try { FileUtils.deletedelete((File) this.text); } catch (final Exception e) {} finally { this.text = null; } } /** * merge documents: a helper method for all parsers that return multiple documents * @param docs * @return */ public static Document mergeDocuments(final DigestURI location, final String globalMime, final Document[] docs) { if (docs == null || docs.length == 0) return null; if (docs.length == 1) return docs[0]; long docTextLength = 0; final ByteBuffer content = new ByteBuffer(); final StringBuilder authors = new StringBuilder(80); final StringBuilder publishers = new StringBuilder(80); final StringBuilder subjects = new StringBuilder(80); final StringBuilder description = new StringBuilder(80); final Collection titles = new LinkedHashSet(); final Collection sectionTitles = new LinkedHashSet(); final Map anchors = new HashMap(); final Map rss = new HashMap(); final Map images = new HashMap(); double lon = 0.0d, lat = 0.0d; for (final Document doc: docs) { if (doc == null) continue; final String author = doc.dc_creator(); if (author.length() > 0) { if (authors.length() > 0) authors.append(","); subjects.append(author); } final String publisher = doc.dc_publisher(); if (publisher.length() > 0) { if (publishers.length() > 0) publishers.append(","); publishers.append(publisher); } final String subject = doc.dc_subject(','); if (subject.length() > 0) { if (subjects.length() > 0) subjects.append(","); subjects.append(subject); } titles.addAll(doc.titles()); sectionTitles.addAll(Arrays.asList(doc.getSectionTitles())); if (description.length() > 0) description.append("\n"); description.append(doc.dc_description()); if (doc.getTextLength() > 0) { if (docTextLength > 0) content.write('\n'); try { docTextLength += FileUtils.copy(doc.getTextStream(), content); } catch (final IOException e) { ConcurrentLog.logException(e); } } anchors.putAll(doc.getAnchors()); rss.putAll(doc.getRSS()); ContentScraper.addAllImages(images, doc.getImages()); if (doc.lon() != 0.0 && doc.lat() != 0.0) { lon = doc.lon(); lat = doc.lat(); } } // clean up parser data for (final Document doc: docs) { Object parserObject = doc.getParserObject(); if (parserObject instanceof ContentScraper) { final ContentScraper html = (ContentScraper) parserObject; html.close(); } } // return consolidation ArrayList titlesa = new ArrayList(); titlesa.addAll(titles); return new Document( location, globalMime, null, null, null, subjects.toString().split(" |,"), titlesa, authors.toString(), publishers.toString(), sectionTitles.toArray(new String[sectionTitles.size()]), description.toString(), lon, lat, content.getBytes(), anchors, rss, images, false); } public static Map getHyperlinks(final Document[] documents) { final Map result = new HashMap(); for (final Document d: documents) { result.putAll(d.getHyperlinks()); final Object parser = d.getParserObject(); if (parser instanceof ContentScraper) { final ContentScraper html = (ContentScraper) parser; String refresh = html.getRefreshPath(); if (refresh != null && refresh.length() > 0) try {result.put(new DigestURI(refresh), "refresh");} catch (final MalformedURLException e) {} DigestURI canonical = html.getCanonical(); if (canonical != null) result.put(canonical, "canonical"); } } return result; } public static Map getImagelinks(final Document[] documents) { final Map result = new HashMap(); for (final Document d: documents) { for (final ImageEntry imageReference : d.getImages().values()) { // construct a image name which contains the document title to enhance the search process for images result.put(imageReference.url(), description(d, imageReference.alt())); } } return result; } public static Map getAudiolinks(final Document[] documents) { final Map result = new HashMap(); for (final Document d: documents) { for (Map.Entry e: d.audiolinks.entrySet()) { result.put(e.getKey(), description(d, e.getValue())); } } return result; } public static Map getVideolinks(final Document[] documents) { final Map result = new HashMap(); for (final Document d: documents) { for (Map.Entry e: d.videolinks.entrySet()) { result.put(e.getKey(), description(d, e.getValue())); } } return result; } public static Map getApplinks(final Document[] documents) { final Map result = new HashMap(); for (final Document d: documents) { for (Map.Entry e: d.applinks.entrySet()) { result.put(e.getKey(), description(d, e.getValue())); } } return result; } private static final String description(Document d, String tagname) { if (tagname == null || tagname.isEmpty()) { tagname = d.source.toTokens(); } StringBuilder sb = new StringBuilder(60); sb.append(d.dc_title()); if (!d.dc_description().equals(d.dc_title()) && sb.length() < Request.descrLength - tagname.length()) { sb.append(' '); sb.append(d.dc_description()); } if (sb.length() < Request.descrLength - tagname.length()) { sb.append(' '); sb.append(d.dc_subject(',')); } if (tagname.length() > 0) { if (sb.length() > Request.descrLength - tagname.length() - 3) { // cut this off because otherwise the tagname is lost. sb.setLength(Request.descrLength - tagname.length() - 3); } sb.append(" - "); sb.append(tagname); } return sb.toString().trim(); } }