yacy_search_server/source/de/anomic/document/Document.java
orbiter 1d8d51075c refactoring:
- removed the plasma package. The name of that package came from a very early pre-version of YaCy, even before YaCy was named AnomicHTTPProxy. The Proxy project introduced search for cache contents using class files that had been developed during the plasma project. Information from 2002 about plasma can be found here:
http://web.archive.org/web/20020802110827/http://anomic.de/AnomicPlasma/index.html
We stil have one class that comes mostly unchanged from the plasma project, the Condenser class. But this is now part of the document package and all other classes in the plasma package can be assigned to other packages.
- cleaned up the http package: better structure of that class and clean isolation of server and client classes. The old HTCache becomes part of the client sub-package of http.
- because the plasmaSwitchboard is now part of the search package all servlets had to be touched to declare a different package source.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6232 6c8d7289-2bf4-0310-a012-ef5d649a1542
2009-07-19 20:37:44 +00:00

611 lines
24 KiB
Java

//plasmaParserDocument.java
//------------------------
//part of YaCy
//(C) by Michael Peter Christen; mc@yacy.net
//first published on http://www.anomic.de
//Frankfurt, Germany, 2005
//
//last major change: 24.04.2005
//
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version.
//
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.net.MalformedURLException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import de.anomic.document.parser.html.ContentScraper;
import de.anomic.document.parser.html.ImageEntry;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.server.serverCachedFileOutputStream;
import de.anomic.yacy.yacyURL;
import de.anomic.ymage.WebStructureGraph;
public class Document {
private final yacyURL source; // the source url
private final String mimeType; // mimeType as taken from http header
private final String charset; // the charset of the document
private final List<String> keywords; // most resources provide a keyword field
private StringBuilder title; // a document title, taken from title or h1 tag; shall appear as headline of search result
private final StringBuilder creator; // author or copyright
private final List<String> sections; // if present: more titles/headlines appearing in the document
private final StringBuilder description; // an abstract, if present: short content description
private Object text; // the clear text, all that is visible
private final Map<yacyURL, String> anchors; // all links embedded as clickeable entities (anchor tags)
private final HashMap<String, ImageEntry> images; // all visible pictures in document
// the anchors and images - Maps are URL-to-EntityDescription mappings.
// The EntityDescription appear either as visible text in anchors or as alternative
// text in image tags.
private Map<yacyURL, String> hyperlinks, audiolinks, videolinks, applinks;
private Map<String, String> emaillinks;
private yacyURL favicon;
private boolean resorted;
private InputStream textStream;
private int inboundLinks, outboundLinks; // counters for inbound and outbound links, are counted after calling notifyWebStructure
private Set<String> languages;
protected Document(final yacyURL location, final String mimeType, final String charset, final Set<String> languages,
final String[] keywords, final String title, final String author,
final String[] sections, final String abstrct,
final Object text, final Map<yacyURL, String> anchors, final HashMap<String, ImageEntry> images) {
this.source = location;
this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType;
this.charset = charset;
this.keywords = (keywords == null) ? new LinkedList<String>() : Arrays.asList(keywords);
this.title = (title == null) ? new StringBuilder(0) : new StringBuilder(title);
this.creator = (author == null) ? new StringBuilder(0) : new StringBuilder(author);
this.sections = (sections == null) ? new LinkedList<String>() : Arrays.asList(sections);
this.description = (abstrct == null) ? new StringBuilder(0) : new StringBuilder(abstrct);
this.anchors = (anchors == null) ? new HashMap<yacyURL, String>(0) : anchors;
this.images = (images == null) ? new HashMap<String, ImageEntry>() : images;
this.hyperlinks = null;
this.audiolinks = null;
this.videolinks = null;
this.applinks = null;
this.emaillinks = null;
this.resorted = false;
this.inboundLinks = -1;
this.outboundLinks = -1;
this.languages = languages;
if (text == null) try {
this.text = new serverCachedFileOutputStream(Idiom.MAX_KEEP_IN_MEMORY_SIZE);
} catch (final IOException e) {
e.printStackTrace();
this.text = new StringBuilder();
} else {
this.text = text;
}
}
public Document(final yacyURL location, final String mimeType, final String charset, final Set<String> languages) {
this(location, mimeType, charset, languages, null, null, null, null, null, (Object)null, null, null);
}
public Document(final yacyURL location, final String mimeType, final String charset, final Set<String> languages,
final String[] keywords, final String title, final String author,
final String[] sections, final String abstrct,
final byte[] text, final Map<yacyURL, String> anchors, final HashMap<String, ImageEntry> images) {
this(location, mimeType, charset, languages, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
}
public Document(final yacyURL location, final String mimeType, final String charset, final Set<String> languages,
final String[] keywords, final String title, final String author,
final String[] sections, final String abstrct,
final File text, final Map<yacyURL, String> anchors, final HashMap<String, ImageEntry> images) {
this(location, mimeType, charset, languages, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
}
public Document(final yacyURL location, final String mimeType, final String charset, final Set<String> languages,
final String[] keywords, final String title, final String author,
final String[] sections, final String abstrct,
final serverCachedFileOutputStream text, final Map<yacyURL, String> anchors, final HashMap<String, ImageEntry> images) {
this(location, mimeType, charset, languages, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
}
/**
* compute a set of languages that this document contains
* the language is not computed using a statistical analysis of the content, only from given metadata that came with the document
* if there are several languages defined in the document, the TLD is taken to check which one should be picked
* If there is no metadata at all, null is returned
* @return a string with a language name using the alpha-2 code of ISO 639
*/
public String dc_language() {
if (this.languages == null) return null;
if (this.languages.size() == 0) return null;
if (this.languages.size() == 1) return languages.iterator().next();
if (this.languages.contains(this.source.language())) return this.source.language();
// now we are confused: the declared languages differ all from the TLD
// just pick one of the languages that we have
return languages.iterator().next();
}
/*
DC according to rfc 5013
* dc_title
* dc_creator
* dc_subject
* dc_description
* dc_publisher
dc_contributor
dc_date
dc_type
* dc_format
* dc_identifier
* dc_source
dc_language
dc_relation
dc_coverage
dc_rights
*/
public String dc_title() {
return title.toString();
}
public void setTitle(String title) {
this.title = new StringBuilder(title);
}
public String dc_creator() {
if (creator == null)
return "";
return creator.toString();
}
public String dc_subject(final char separator) {
// sort out doubles and empty words
final TreeSet<String> hs = new TreeSet<String>();
String s;
for (int i = 0; i < this.keywords.size(); i++) {
if (this.keywords.get(i) == null) continue;
s = (this.keywords.get(i)).trim();
if (s.length() > 0) hs.add(s.toLowerCase());
}
if (hs.size() == 0) return "";
// generate a new list
final StringBuilder sb = new StringBuilder(this.keywords.size() * 6);
final Iterator<String> i = hs.iterator();
while (i.hasNext()) sb.append(i.next()).append(separator);
return sb.substring(0, sb.length() - 1);
}
public String dc_description() {
if (description == null)
return dc_title();
return description.toString();
}
public String dc_publisher() {
// if we don't have a publisher, simply return the host/domain name
return this.source.getHost();
}
public String dc_format() {
return this.mimeType;
}
public String dc_identifier() {
return this.source.toNormalform(true, false);
}
public yacyURL dc_source() {
return this.source;
}
/**
* @return the supposed charset of this document or <code>null</code> if unknown
*/
public String getCharset() {
return this.charset;
}
public String[] getSectionTitles() {
if (sections == null) {
return new String[] { dc_title() };
}
return sections.toArray(new String[this.sections.size()]);
}
public InputStream getText() {
try {
if (this.text == null) return null;
if (this.text instanceof File) {
this.textStream = new BufferedInputStream(new FileInputStream((File)this.text));
} else if (this.text instanceof byte[]) {
this.textStream = new ByteArrayInputStream((byte[])this.text);
} else if (this.text instanceof serverCachedFileOutputStream) {
return ((serverCachedFileOutputStream)this.text).getContent();
}
return this.textStream;
} catch (final Exception e) {
e.printStackTrace();
}
return null;
}
public byte[] getTextBytes() {
try {
if (this.text == null) return new byte[0];
if (this.text instanceof File) {
return FileUtils.read((File)this.text);
} else if (this.text instanceof byte[]) {
return (byte[])this.text;
} else if (this.text instanceof serverCachedFileOutputStream) {
final serverCachedFileOutputStream ffbaos = (serverCachedFileOutputStream)this.text;
if (ffbaos.isFallback()) {
return FileUtils.read(ffbaos.getContent());
}
return ffbaos.getContentBAOS();
}
} catch (final Exception e) {
e.printStackTrace();
}
return new byte[0];
}
public long getTextLength() {
if (this.text == null) return 0;
if (this.text instanceof File) return ((File)this.text).length();
else if (this.text instanceof byte[]) return ((byte[])this.text).length;
else if (this.text instanceof serverCachedFileOutputStream) {
return ((serverCachedFileOutputStream)this.text).getLength();
}
return -1;
}
public Iterator<StringBuilder> getSentences(final boolean pre) {
if (this.text == null) return null;
final Condenser.sentencesFromInputStreamEnum e = Condenser.sentencesFromInputStream(getText());
e.pre(pre);
return e;
}
public List<String> getKeywords() {
return this.keywords;
}
public Map<yacyURL, String> getAnchors() {
// returns all links embedded as anchors (clickeable entities)
// this is a url(String)/text(String) map
return anchors;
}
// the next three methods provide a calculated view on the getAnchors/getImages:
public Map<yacyURL, String> getHyperlinks() {
// this is a subset of the getAnchor-set: only links to other hyperrefs
if (!resorted) resortLinks();
return hyperlinks;
}
public Map<yacyURL, String> getAudiolinks() {
if (!resorted) resortLinks();
return this.audiolinks;
}
public Map<yacyURL, String> getVideolinks() {
if (!resorted) resortLinks();
return this.videolinks;
}
public HashMap<String, ImageEntry> getImages() {
// returns all links enbedded as pictures (visible in document)
// this resturns a htmlFilterImageEntry collection
if (!resorted) resortLinks();
return images;
}
public Map<yacyURL, String> getApplinks() {
if (!resorted) resortLinks();
return this.applinks;
}
public Map<String, String> getEmaillinks() {
// this is part of the getAnchor-set: only links to email addresses
if (!resorted) resortLinks();
return emaillinks;
}
private synchronized void resortLinks() {
if (this.resorted) return;
// extract hyperlinks, medialinks and emaillinks from anchorlinks
yacyURL url;
String u;
int extpos, qpos;
String ext = null;
final Iterator<Map.Entry<yacyURL, String>> i = anchors.entrySet().iterator();
hyperlinks = new HashMap<yacyURL, String>();
videolinks = new HashMap<yacyURL, String>();
audiolinks = new HashMap<yacyURL, String>();
applinks = new HashMap<yacyURL, String>();
emaillinks = new HashMap<String, String>();
final HashMap<String, ImageEntry> collectedImages = new HashMap<String, ImageEntry>(); // this is a set that is collected now and joined later to the imagelinks
Map.Entry<yacyURL, String> entry;
while (i.hasNext()) {
entry = i.next();
url = entry.getKey();
if (url == null) continue;
u = url.toNormalform(true, false);
if (u.startsWith("mailto:")) {
emaillinks.put(u.substring(7), entry.getValue());
} else {
extpos = u.lastIndexOf(".");
if (extpos > 0) {
if (((qpos = u.indexOf("?")) >= 0) && (qpos > extpos)) {
ext = u.substring(extpos + 1, qpos).toLowerCase();
} else {
ext = u.substring(extpos + 1).toLowerCase();
}
if (Classification.isMediaExtension(ext)) {
// this is not a normal anchor, its a media link
if (Classification.isImageExtension(ext)) {
ContentScraper.addImage(collectedImages, new ImageEntry(url, entry.getValue(), -1, -1));
}
else if (Classification.isAudioExtension(ext)) audiolinks.put(url, entry.getValue());
else if (Classification.isVideoExtension(ext)) videolinks.put(url, entry.getValue());
else if (Classification.isApplicationExtension(ext)) applinks.put(url, entry.getValue());
} else {
hyperlinks.put(url, entry.getValue());
}
} else {
// a path to a directory
hyperlinks.put(url, entry.getValue());
}
}
}
// add image links that we collected from the anchors to the image map
ContentScraper.addAllImages(images, collectedImages);
// expand the hyperlinks:
// we add artificial hyperlinks to the hyperlink set
// that can be calculated from given hyperlinks and imagelinks
hyperlinks.putAll(allReflinks(images.values()));
hyperlinks.putAll(allReflinks(audiolinks.keySet()));
hyperlinks.putAll(allReflinks(videolinks.keySet()));
hyperlinks.putAll(allReflinks(applinks.keySet()));
/*
hyperlinks.putAll(allSubpaths(hyperlinks.keySet()));
hyperlinks.putAll(allSubpaths(images.values()));
hyperlinks.putAll(allSubpaths(audiolinks.keySet()));
hyperlinks.putAll(allSubpaths(videolinks.keySet()));
hyperlinks.putAll(allSubpaths(applinks.keySet()));
*/
// don't do this again
this.resorted = true;
}
public static Map<yacyURL, String> allSubpaths(final Collection<?> links) {
// links is either a Set of Strings (urls) or a Set of
// htmlFilterImageEntries
final HashSet<String> h = new HashSet<String>();
Iterator<?> i = links.iterator();
Object o;
yacyURL url;
String u;
int pos;
int l;
while (i.hasNext())
try {
o = i.next();
if (o instanceof yacyURL) url = (yacyURL) o;
else if (o instanceof String) url = new yacyURL((String) o, null);
else if (o instanceof ImageEntry) url = ((ImageEntry) o).url();
else {
assert false;
continue;
}
u = url.toNormalform(true, true);
if (u.endsWith("/"))
u = u.substring(0, u.length() - 1);
pos = u.lastIndexOf('/');
while (pos > 8) {
l = u.length();
u = u.substring(0, pos + 1);
h.add(u);
u = u.substring(0, pos);
assert (u.length() < l) : "u = " + u;
pos = u.lastIndexOf('/');
}
} catch (final MalformedURLException e) { }
// now convert the strings to yacyURLs
i = h.iterator();
final HashMap<yacyURL, String> v = new HashMap<yacyURL, String>();
while (i.hasNext()) {
u = (String) i.next();
try {
url = new yacyURL(u, null);
v.put(url, "sub");
} catch (final MalformedURLException e) {
}
}
return v;
}
public static Map<yacyURL, String> allReflinks(final Collection<?> links) {
// links is either a Set of Strings (with urls) or
// htmlFilterImageEntries
// we find all links that are part of a reference inside a url
final HashMap<yacyURL, String> v = new HashMap<yacyURL, String>();
final Iterator<?> i = links.iterator();
Object o;
yacyURL url;
String u;
int pos;
loop: while (i.hasNext())
try {
o = i.next();
if (o instanceof yacyURL)
url = (yacyURL) o;
else if (o instanceof String)
url = new yacyURL((String) o, null);
else if (o instanceof ImageEntry)
url = ((ImageEntry) o).url();
else {
assert false;
continue;
}
u = url.toNormalform(true, true);
if ((pos = u.toLowerCase().indexOf("http://", 7)) > 0) {
i.remove();
u = u.substring(pos);
while ((pos = u.toLowerCase().indexOf("http://", 7)) > 0)
u = u.substring(pos);
url = new yacyURL(u, null);
if (!(v.containsKey(url)))
v.put(url, "ref");
continue loop;
}
if ((pos = u.toLowerCase().indexOf("/www.", 7)) > 0) {
i.remove();
u = "http:/" + u.substring(pos);
while ((pos = u.toLowerCase().indexOf("/www.", 7)) > 0)
u = "http:/" + u.substring(pos);
url = new yacyURL(u, null);
if (!(v.containsKey(url)))
v.put(url, "ref");
continue loop;
}
} catch (final MalformedURLException e) {
}
return v;
}
public void addSubDocument(final Document doc) throws IOException {
this.sections.addAll(Arrays.asList(doc.getSectionTitles()));
if (this.title.length() > 0) this.title.append('\n');
this.title.append(doc.dc_title());
this.keywords.addAll(doc.getKeywords());
if (this.description.length() > 0) this.description.append('\n');
this.description.append(doc.dc_description());
if (!(this.text instanceof serverCachedFileOutputStream)) {
this.text = new serverCachedFileOutputStream(Idiom.MAX_KEEP_IN_MEMORY_SIZE);
FileUtils.copy(getText(), (serverCachedFileOutputStream)this.text);
}
FileUtils.copy(doc.getText(), (serverCachedFileOutputStream)this.text);
anchors.putAll(doc.getAnchors());
ContentScraper.addAllImages(images, doc.getImages());
}
/**
* @return the {@link URL} to the favicon that belongs to the document
*/
public yacyURL getFavicon() {
return this.favicon;
}
/**
* @param faviconURL the {@link URL} to the favicon that belongs to the document
*/
public void setFavicon(final yacyURL faviconURL) {
this.favicon = faviconURL;
}
public void notifyWebStructure(final WebStructureGraph webStructure, final Condenser condenser, final Date docDate) {
final Integer[] ioLinks = webStructure.generateCitationReference(this, condenser, docDate); // [outlinksSame, outlinksOther]
this.inboundLinks = ioLinks[0].intValue();
this.outboundLinks = ioLinks[1].intValue();
}
public int inboundLinks() {
assert this.inboundLinks >= 0;
return (this.inboundLinks < 0) ? 0 : this.inboundLinks;
}
public int outboundLinks() {
assert this.outboundLinks >= 0;
return (this.outboundLinks < 0) ? 0 : this.outboundLinks;
}
public void writeXML(OutputStreamWriter os, Date date) throws IOException {
os.write("<record>\n");
os.write("<dc:Title><![CDATA[" + this.dc_title() + "]]></dc:Title>\n");
os.write("<dc:Identifier>" + this.dc_identifier() + "</dc:Identifier>\n");
os.write("<dc:Creator><![CDATA[" + this.dc_creator() + "]]></dc:Creator>\n");
os.write("<dc:Description><![CDATA[");
byte[] buffer = new byte[1000];
int c = 0;
InputStream is = this.getText();
while ((c = is.read(buffer)) > 0) os.write(new String(buffer, 0, c));
is.close();
os.write("]]></dc:Description>\n");
os.write("<dc:Language>" + this.dc_language() + "</dc:Language>\n");
os.write("<dc:Date>" + DateFormatter.formatISO8601(date) + "</dc:Date>\n");
os.write("</record>\n");
}
public void close() {
// try close the output stream
if (this.textStream != null) {
try {
this.textStream.close();
} catch (final Exception e) {
/* ignore this */
} finally {
this.textStream = null;
}
}
// delete the temp file
if ((this.text != null) && (this.text instanceof File)) {
try {
FileUtils.deletedelete((File) this.text);
} catch (final Exception e) {
/* ignore this */
} finally {
this.text = null;
}
}
}
protected void finalize() throws Throwable {
this.close();
super.finalize();
}
}