yacy_search_server/source/de/anomic/plasma/plasmaParserDocument.java
orbiter daf0f74361 joined anomic.net.URL, plasmaURL and url hash computation:
Search profiling showed that a major amount of time is wasted on computing URL hashes. The computation does an intranet check, which needs a DNS lookup. As a result, each URL hash computation needed 100-200 milliseconds, which delayed remote searches by at least 1 second more than necessary. The solution to this problem is to attach a URL hash to the URL data structure, because that means that the URL hash value can be filled in after retrieval of the URL from the database. The redesign of the url/urlhash management caused a major redesign of many parts of the software. Since it had been decided to give up some parts, they were removed during this change to avoid unnecessary maintenance of unused code.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4074 6c8d7289-2bf4-0310-a012-ef5d649a1542
2007-09-05 09:01:35 +00:00

443 lines
17 KiB
Java

//plasmaParserDocument.java
//------------------------
//part of YaCy
//(C) by Michael Peter Christen; mc@anomic.de
//first published on http://www.anomic.de
//Frankfurt, Germany, 2005
//
//last major change: 24.04.2005
//
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version.
//
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
//Using this software in any meaning (reading, learning, copying, compiling,
//running) means that you agree that the Author(s) is (are) not responsible
//for cost, loss of data or any harm that may be caused directly or indirectly
//by usage of this softare or this documentation. The usage of this software
//is on your own risk. The installation and usage (starting/running) of this
//software may allow other people or application to access your computer and
//any attached devices and is highly dependent on the configuration of the
//software which must be done by the user of the software; the author(s) is
//(are) also not responsible for proper configuration and usage of the
//software, even if provoked by documentation provided together with
//the software.
//
//Any changes to this file according to the GPL as documented in the file
//gpl.txt aside this file in the shipment you received can be done to the
//lines that follows this copyright notice here, but changes must not be
//done inside the copyright notive above. A re-distribution must contain
//the intact and unchanged copyright notice.
//Contributions and changes to the program code must be marked as such.
package de.anomic.plasma;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import de.anomic.server.serverCachedFileOutputStream;
import de.anomic.server.serverFileUtils;
import de.anomic.yacy.yacyURL;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.plasma.parser.Parser;
/**
 * Container for everything that was extracted from one parsed resource:
 * its location, mime type and charset, the descriptive meta data (keywords,
 * title, author, section titles, abstract), the clear text and the embedded
 * anchors and images.
 *
 * <p>The clear text is held as an untyped object. The accessors of this class
 * understand a <code>byte[]</code>, a {@link File} and a
 * <code>serverCachedFileOutputStream</code>. A <code>File</code> is treated as
 * owned by this document: {@link #close()} deletes it.</p>
 *
 * <p>The link views (hyperlinks, audio, video, application and email links)
 * are computed lazily from the anchor map on first access.</p>
 */
public class plasmaParserDocument {

    private yacyURL location;       // the source url
    private String mimeType;        // mimeType as taken from http header
    private String charset;         // the charset of the document
    private List keywords;          // most resources provide a keyword field
    private StringBuffer title;     // a document title, taken from title or h1 tag; shall appear as headline of search result
    private StringBuffer author;    // author or copyright
    private List sections;          // if present: more titles/headlines appearing in the document
    private StringBuffer abstrct;   // an abstract, if present: short content description
    private Object text;            // the clear text, all that is visible
    private Map anchors;            // all links embedded as clickable entities (anchor tags)
    private TreeSet images;         // all visible pictures in document

    // the anchors and images - Maps are URL-to-EntityDescription mappings.
    // The EntityDescription appear either as visible text in anchors or as alternative
    // text in image tags.
    private Map hyperlinks, audiolinks, videolinks, applinks;
    private Map emaillinks;
    private yacyURL favicon;
    private boolean resorted;       // true once the link views above reflect the current anchors
    private InputStream textStream; // the stream most recently opened by getText() for File/byte[] text

    /**
     * Creates a document from already extracted data.
     *
     * @param location the source url
     * @param mimeType the mime type; <code>null</code> is replaced by
     *        <code>application/octet-stream</code>
     * @param charset  the charset of the document, may be <code>null</code>
     * @param keywords keywords, may be <code>null</code>
     * @param title    the title, may be <code>null</code>
     * @param author   the author, may be <code>null</code>
     * @param sections section titles, may be <code>null</code>
     * @param abstrct  the abstract, may be <code>null</code>
     * @param text     the clear text; if <code>null</code>, an empty
     *        <code>serverCachedFileOutputStream</code> is created (or, if that
     *        fails with an IOException, an empty StringBuffer)
     * @param anchors  the anchor map, may be <code>null</code>; a given map is
     *        used directly, not copied
     * @param images   the image set, may be <code>null</code>; a given set is
     *        used directly, not copied
     */
    protected plasmaParserDocument(yacyURL location, String mimeType, String charset,
                                   String[] keywords, String title, String author,
                                   String[] sections, String abstrct,
                                   Object text, Map anchors, TreeSet images) {
        this.location = location;
        this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType;
        this.charset = charset;
        // Arrays.asList() only returns a fixed-size view of the array; addSubDocument()
        // has to append to these lists, so the elements are copied into growable lists
        this.keywords = (keywords == null) ? new LinkedList() : new LinkedList(Arrays.asList(keywords));
        this.title = (title == null) ? new StringBuffer() : new StringBuffer(title);
        this.author = (author == null) ? new StringBuffer() : new StringBuffer(author);
        this.sections = (sections == null) ? new LinkedList() : new LinkedList(Arrays.asList(sections));
        this.abstrct = (abstrct == null) ? new StringBuffer() : new StringBuffer(abstrct);
        this.anchors = (anchors == null) ? new HashMap(0) : anchors;
        this.images = (images == null) ? new TreeSet() : images;
        this.hyperlinks = null;
        this.audiolinks = null;
        this.videolinks = null;
        this.applinks = null;
        this.emaillinks = null;
        this.resorted = false;

        if (text == null) {
            Object emptyText;
            try {
                emptyText = new serverCachedFileOutputStream(Parser.MAX_KEEP_IN_MEMORY_SIZE);
            } catch (IOException e) {
                e.printStackTrace();
                emptyText = new StringBuffer();
            }
            this.text = emptyText;
        } else {
            this.text = text;
        }
    }

    /**
     * Creates a document without meta data and with an empty text.
     */
    public plasmaParserDocument(yacyURL location, String mimeType, String charset) {
        this(location, mimeType, charset, null, null, null, null, null, (Object) null, null, null);
    }

    /**
     * Creates a document whose clear text is given as a byte array.
     */
    public plasmaParserDocument(yacyURL location, String mimeType, String charset,
                                String[] keywords, String title, String author,
                                String[] sections, String abstrct,
                                byte[] text, Map anchors, TreeSet images) {
        this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object) text, anchors, images);
    }

    /**
     * Creates a document whose clear text is stored in a file.
     * The file is deleted when the document is closed.
     */
    public plasmaParserDocument(yacyURL location, String mimeType, String charset,
                                String[] keywords, String title, String author,
                                String[] sections, String abstrct,
                                File text, Map anchors, TreeSet images) {
        this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object) text, anchors, images);
    }

    /**
     * Creates a document whose clear text is held by a
     * <code>serverCachedFileOutputStream</code>.
     */
    public plasmaParserDocument(yacyURL location, String mimeType, String charset,
                                String[] keywords, String title, String author,
                                String[] sections, String abstrct,
                                serverCachedFileOutputStream text, Map anchors, TreeSet images) {
        this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object) text, anchors, images);
    }

    /**
     * @return the source url of this document
     */
    public yacyURL getLocation() {
        return this.location;
    }

    /**
     * @return the mime type of this document, never <code>null</code>
     */
    public String getMimeType() {
        return this.mimeType;
    }

    /**
     * @return the supposed charset of this document or <code>null</code> if unknown
     */
    public String getCharset() {
        return this.charset;
    }

    /**
     * @return the document title; sub-document titles are appended on separate lines
     */
    public String getTitle() {
        return title.toString();
    }

    /**
     * @return the section titles; if there is no section list, the document
     *         title is returned as the only element
     */
    public String[] getSectionTitles() {
        if (sections != null) {
            return (String[]) sections.toArray(new String[this.sections.size()]);
        }
        return new String[] { getTitle() };
    }

    /**
     * @return the abstract, or the title if there is no abstract buffer
     */
    public String getAbstract() {
        return (abstrct != null) ? abstrct.toString() : getTitle();
    }

    /**
     * @return the author, or an empty string if there is no author buffer
     */
    public String getAuthor() {
        return (author != null) ? author.toString() : "";
    }

    /**
     * Opens the clear text for reading.
     *
     * <p>For <code>File</code> and <code>byte[]</code> text every call opens a
     * new stream; only the most recently opened one is remembered and closed
     * by {@link #close()}.</p>
     *
     * @return a stream on the clear text, or <code>null</code> if there is no
     *         text, the text cannot be opened, or it is held in a form this
     *         method cannot stream (and no stream was opened before)
     */
    public InputStream getText() {
        try {
            if (this.text == null) return null;

            if (this.text instanceof File) {
                this.textStream = new BufferedInputStream(new FileInputStream((File) this.text));
            } else if (this.text instanceof byte[]) {
                this.textStream = new ByteArrayInputStream((byte[]) this.text);
            } else if (this.text instanceof serverCachedFileOutputStream) {
                return ((serverCachedFileOutputStream) this.text).getContent();
            }
            return this.textStream;
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * @return the complete clear text as bytes; an empty array if there is no
     *         text, reading fails, or the text is held in an unsupported form.
     *         For <code>byte[]</code> text the internal array itself is returned.
     */
    public byte[] getTextBytes() {
        try {
            if (this.text == null) return new byte[0];

            if (this.text instanceof File) {
                return serverFileUtils.read((File) this.text);
            } else if (this.text instanceof byte[]) {
                return (byte[]) this.text;
            } else if (this.text instanceof serverCachedFileOutputStream) {
                serverCachedFileOutputStream cached = (serverCachedFileOutputStream) this.text;
                // NOTE(review): isFallback() presumably means the content is no longer
                // held in memory - confirm against serverCachedFileOutputStream
                if (cached.isFallback()) {
                    return serverFileUtils.read(cached.getContent());
                }
                return cached.getContentBAOS();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return new byte[0];
    }

    /**
     * @return the length of the clear text, 0 if there is no text, or -1 if
     *         the text is held in a form whose length cannot be determined here
     */
    public long getTextLength() {
        if (this.text == null) return 0;
        if (this.text instanceof File) return ((File) this.text).length();
        if (this.text instanceof byte[]) return ((byte[]) this.text).length;
        if (this.text instanceof serverCachedFileOutputStream) {
            return ((serverCachedFileOutputStream) this.text).getLength();
        }
        return -1;
    }

    /**
     * Creates a sentence iterator over the clear text.
     *
     * @param pre flag that is handed to the sentence enumeration unchanged
     * @return the sentence iterator, or <code>null</code> if there is no text
     */
    public Iterator getSentences(boolean pre) {
        if (this.text == null) return null;
        plasmaCondenser.sentencesFromInputStreamEnum sentences =
                plasmaCondenser.sentencesFromInputStream(getText(), this.charset);
        sentences.pre(pre);
        return sentences;
    }

    /**
     * Joins the keywords into one string.
     *
     * @param separator the character that is put between two keywords
     * @return the trimmed, lower-cased keywords without duplicates and empty
     *         entries, in sorted order; an empty string if none remain
     */
    public String getKeywords(char separator) {
        // sort out doubles and empty words
        TreeSet unique = new TreeSet();
        Iterator k = this.keywords.iterator(); // iterate; indexed access on a linked list is quadratic
        while (k.hasNext()) {
            String word = (String) k.next();
            if (word == null) continue;
            word = word.trim();
            if (word.length() > 0) unique.add(word.toLowerCase());
        }
        if (unique.isEmpty()) return "";

        // generate a new list
        StringBuffer joined = new StringBuffer(this.keywords.size() * 6);
        Iterator i = unique.iterator();
        while (i.hasNext()) {
            if (joined.length() > 0) joined.append(separator);
            joined.append((String) i.next());
        }
        return joined.toString();
    }

    /**
     * @return the internal keyword list (not a copy)
     */
    public List getKeywords() {
        return this.keywords;
    }

    /**
     * @return all links embedded as anchors (clickable entities);
     *         this is a url(String)/text(String) map
     */
    public Map getAnchors() {
        return anchors;
    }

    // the following methods provide a calculated view on the getAnchors/getImages:

    /**
     * @return a subset of the anchors: only links to other hyperrefs, extended
     *         by the links that plasmaParser derives from all known links
     */
    public Map getHyperlinks() {
        if (!resorted) resortLinks();
        return hyperlinks;
    }

    /**
     * @return the anchors with an audio file extension
     */
    public Map getAudiolinks() {
        if (!resorted) resortLinks();
        return this.audiolinks;
    }

    /**
     * @return the anchors with a video file extension
     */
    public Map getVideolinks() {
        if (!resorted) resortLinks();
        return this.videolinks;
    }

    /**
     * @return all links embedded as pictures (visible in document), including
     *         the anchors with an image file extension; this is a
     *         htmlFilterImageEntry collection
     */
    public TreeSet getImages() {
        if (!resorted) resortLinks();
        return images;
    }

    /**
     * @return the anchors with an application file extension
     */
    public Map getApplinks() {
        if (!resorted) resortLinks();
        return this.applinks;
    }

    /**
     * @return the part of the anchors that link to email addresses; the keys
     *         are the addresses without the <code>mailto:</code> prefix
     */
    public Map getEmaillinks() {
        if (!resorted) resortLinks();
        return emaillinks;
    }

    /**
     * Extracts hyperlinks, media links and email links from the anchor map and
     * joins the image anchors into the image set.
     */
    private synchronized void resortLinks() {
        // a thread that waited for the lock must not repeat the work
        if (this.resorted) return;

        hyperlinks = new HashMap();
        videolinks = new HashMap();
        audiolinks = new HashMap();
        applinks = new HashMap();
        emaillinks = new HashMap();
        TreeSet collectedImages = new TreeSet(); // this is a set that is collected now and joined later to the imagelinks

        Iterator i = anchors.entrySet().iterator();
        while (i.hasNext()) {
            Map.Entry entry = (Map.Entry) i.next();
            String u = (String) entry.getKey();
            if (u == null) continue; // a null key is neither an address nor a url

            if (u.startsWith("mailto:")) {
                emaillinks.put(u.substring(7), entry.getValue());
                continue;
            }

            // only links that contain a '.' (after the first character) are classified
            int extpos = u.lastIndexOf(".");
            if (extpos <= 0) continue;

            // the extension ends at a '?' that follows the last '.', otherwise at the end of the link
            int qpos = u.indexOf("?");
            String ext = (qpos > extpos)
                    ? u.substring(extpos + 1, qpos).toLowerCase()
                    : u.substring(extpos + 1).toLowerCase();

            try {
                yacyURL url = new yacyURL(u, null);
                u = url.toNormalform(true, true);
                if (plasmaParser.mediaExtContains(ext)) {
                    // this is not a normal anchor, its a media link
                    if (plasmaParser.imageExtContains(ext)) {
                        collectedImages.add(new htmlFilterImageEntry(url, (String) entry.getValue(), -1, -1));
                    } else if (plasmaParser.audioExtContains(ext)) {
                        audiolinks.put(u, entry.getValue());
                    } else if (plasmaParser.videoExtContains(ext)) {
                        videolinks.put(u, entry.getValue());
                    } else if (plasmaParser.appsExtContains(ext)) {
                        applinks.put(u, entry.getValue());
                    }
                } else {
                    hyperlinks.put(u, entry.getValue());
                }
            } catch (MalformedURLException e) {
                // deliberately ignored: a link that is not a valid url is left out of all views
            }
        }

        // add image links that we collected from the anchors to the image set;
        // entries that are already present are kept as they are
        images.addAll(collectedImages);

        // expand the hyperlinks:
        // we add artificial hyperlinks to the hyperlink set
        // that can be calculated from given hyperlinks and imagelinks
        hyperlinks.putAll(plasmaParser.allReflinks(hyperlinks.keySet()));
        hyperlinks.putAll(plasmaParser.allReflinks(images));
        hyperlinks.putAll(plasmaParser.allReflinks(audiolinks.keySet()));
        hyperlinks.putAll(plasmaParser.allReflinks(videolinks.keySet()));
        hyperlinks.putAll(plasmaParser.allReflinks(applinks.keySet()));
        hyperlinks.putAll(plasmaParser.allSubpaths(hyperlinks.keySet()));
        hyperlinks.putAll(plasmaParser.allSubpaths(images));
        hyperlinks.putAll(plasmaParser.allSubpaths(audiolinks.keySet()));
        hyperlinks.putAll(plasmaParser.allSubpaths(videolinks.keySet()));
        hyperlinks.putAll(plasmaParser.allSubpaths(applinks.keySet()));

        // don't do this again
        this.resorted = true;
    }

    /**
     * Merges another document into this one: section titles, title, keywords,
     * abstract, text, anchors and images of <code>doc</code> are appended.
     *
     * @param doc the document to merge into this one
     * @throws IOException if the text of one of the documents cannot be copied
     */
    public void addSubDocument(plasmaParserDocument doc) throws IOException {
        this.sections.addAll(Arrays.asList(doc.getSectionTitles()));

        if (this.title.length() > 0) this.title.append('\n');
        this.title.append(doc.getTitle());

        this.keywords.addAll(doc.getKeywords());

        if (this.abstrct.length() > 0) this.abstrct.append('\n');
        this.abstrct.append(doc.getAbstract());

        if (!(this.text instanceof serverCachedFileOutputStream)) {
            // the own text must be opened BEFORE this.text is replaced, otherwise
            // getText() would already read from the new, still empty stream
            Object previousText = this.text;
            InputStream ownText = getText();
            serverCachedFileOutputStream merged = new serverCachedFileOutputStream(Parser.MAX_KEEP_IN_MEMORY_SIZE);
            if (ownText != null) {
                try {
                    serverFileUtils.copy(ownText, merged);
                } finally {
                    try {
                        ownText.close();
                    } catch (IOException e) {
                        /* ignore this */
                    }
                    if (this.textStream == ownText) this.textStream = null;
                }
            }
            this.text = merged;
            // close() deletes a text file; after the switch it would no longer
            // see the file, so the file is removed here
            if (previousText instanceof File) ((File) previousText).delete();
        }

        InputStream subText = doc.getText();
        if (subText != null) {
            serverFileUtils.copy(subText, (serverCachedFileOutputStream) this.text);
        }

        anchors.putAll(doc.getAnchors());
        images.addAll(doc.getImages());

        // the link views are derived from the anchors and must be recomputed
        this.resorted = false;
    }

    /**
     * @return the {@link yacyURL} to the favicon that belongs to the document
     */
    public yacyURL getFavicon() {
        return this.favicon;
    }

    /**
     * @param faviconURL the {@link yacyURL} to the favicon that belongs to the document
     */
    public void setFavicon(yacyURL faviconURL) {
        this.favicon = faviconURL;
    }

    /**
     * Releases the resources of this document: closes the remembered text
     * stream and, if the text is stored in a file, deletes that file.
     * Errors are ignored.
     */
    public void close() {
        // try close the text stream
        if (this.textStream != null) {
            try {
                this.textStream.close();
            } catch (Exception e) {
                /* ignore this */
            } finally {
                this.textStream = null;
            }
        }

        // delete the temp file
        if (this.text instanceof File) {
            try {
                ((File) this.text).delete();
            } catch (Exception e) {
                /* ignore this */
            } finally {
                this.text = null;
            }
        }
    }

    /**
     * Closes the document when it is garbage collected.
     */
    protected void finalize() throws Throwable {
        try {
            this.close();
        } finally {
            // must run even if close() fails
            super.finalize();
        }
    }
}