*) plasmaHTCache:

- method loadResourceContent is now marked as deprecated. 
     Please do not use this function to avoid OutOfMemory Exceptions 
     when loading large files
   - new function getResourceContentStream to get an inputstream of a cache file
   - new function getResourceContentLength to get the size of a cached file
*) httpc.java:
   - Bugfix: resource content was loaded into memory even if this was not requested
*) Crawler:
   - new option to hold loaded resource content in memory
   - adding option to use the worker class without the worker pool 
     (needed by the snippet fetcher)
*) plasmaSnippetCache
   - snippet loader does not use a crawl-worker from pool but uses
     a newly created instance to avoid blocking by normal crawling
     activity.
   - now operates on streams instead of byte arrays to avoid OutOfMemory 
     Exceptions when operating on large files 
   - snippet loader now forces the crawl-worker to keep the loaded
     resource in memory to avoid IO 
*) plasmaCondenser: adding new function getWords that can directly operate on input streams
*) Parsers
   - keep resource in memory whenever possible (to avoid IO)
   - when parsing from stream the content length must be passed to the parser function now.
     this length value is needed by the parsers to decide if the parsed resource content is too large
     to hold it in memory and must be stored to file 
   - AbstractParser.java: new function to pass the contentLength of a resource to the parsers
   


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2701 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
theli 2006-10-03 11:05:48 +00:00
parent 630a955674
commit f17ce28b6d
31 changed files with 466 additions and 221 deletions

View File

@ -45,6 +45,7 @@
//if the shell's current path is HTROOT
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.net.URLEncoder;
@ -60,6 +61,7 @@ import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.plasma.crawler.plasmaCrawlerException;
import de.anomic.plasma.parser.ParserException;
import de.anomic.plasma.plasmaCrawlLURL.Entry;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@ -121,18 +123,20 @@ public class ViewFile {
}
// loading the resource content as byte array
byte[] resource = null;
InputStream resource = null;
long resourceLength = -1;
IResourceInfo resInfo = null;
String resMime = null;
try {
// trying to load the resource body
resource = sb.cacheManager.loadResourceContent(url);
resource = sb.cacheManager.getResourceContentStream(url);
resourceLength = sb.cacheManager.getResourceContentLength(url);
// if the resource body was not cached we try to load it from web
if (resource == null) {
plasmaHTCache.Entry entry = null;
try {
entry = sb.snippetCache.loadResourceFromWeb(url, 5000);
entry = sb.snippetCache.loadResourceFromWeb(url, 5000, false);
} catch (plasmaCrawlerException e) {
prop.put("error",4);
prop.put("error_errorText",e.getMessage());
@ -142,11 +146,13 @@ public class ViewFile {
if (entry != null) {
resInfo = entry.getDocumentInfo();
resource = sb.cacheManager.loadResourceContent(url);
resource = sb.cacheManager.getResourceContentStream(url);
resourceLength = sb.cacheManager.getResourceContentLength(url);
}
if (resource == null) {
prop.put("error",4);
prop.put("error_errorText","No resource available");
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
@ -172,6 +178,15 @@ public class ViewFile {
httpHeader responseHeader = httpc.whead(url,url.getHost(),5000,null,null,sb.remoteProxyConfig);
if (responseHeader == null) {
prop.put("error",4);
prop.put("error_errorText","Unable to load resource metadata.");
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
try {
resInfo = sb.cacheManager.getResourceInfoFactory().buildResourceInfoObj(url, responseHeader);
} catch (Exception e) {
prop.put("error",4);
prop.put("error_errorText",e.getMessage());
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
@ -181,12 +196,28 @@ public class ViewFile {
resMime = resInfo.getMimeType();
}
} catch (IOException e) {
if (resource != null) try { resource.close(); } catch (Exception ex) {/* ignore this */}
prop.put("error",4);
prop.put("error_errorText",e.getMessage());
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
if (viewMode.equals("plain")) {
String content = new String(resource);
// TODO: how to handle very large files here ?
String content;
try {
content = new String(serverFileUtils.read(resource),"UTF-8");
} catch (Exception e) {
prop.put("error",4);
prop.put("error_errorText",e.getMessage());
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
} finally {
if (resource != null) try { resource.close(); } catch (Exception e) {/* ignore this */}
}
content = content.replaceAll("<","&lt;")
.replaceAll(">","&gt;")
.replaceAll("\"","&quot;")
@ -196,11 +227,14 @@ public class ViewFile {
prop.put("error",0);
prop.put("viewMode",VIEW_MODE_AS_PLAIN_TEXT);
prop.put("viewMode_plainText",content);
} else if (viewMode.equals("parsed") || viewMode.equals("sentences") || viewMode.equals("iframe")) {
} else if (viewMode.equals("iframe")) {
prop.put("viewMode",VIEW_MODE_AS_IFRAME);
prop.put("viewMode_url",url.toString());
} else if (viewMode.equals("parsed") || viewMode.equals("sentences")) {
// parsing the resource content
plasmaParserDocument document = null;
try {
document = sb.snippetCache.parseDocument(url, resource,resInfo);
document = sb.snippetCache.parseDocument(url, resourceLength, resource,resInfo);
if (document == null) {
prop.put("error",5);
prop.put("error_errorText","Unknown error");
@ -212,7 +246,10 @@ public class ViewFile {
prop.put("error_errorText",e.getMessage());
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
} finally {
if (resource != null) try { resource.close(); } catch (Exception e) {/* ignore this */}
}
resMime = document.getMimeType();
if (viewMode.equals("parsed")) {
@ -223,9 +260,6 @@ public class ViewFile {
prop.put("viewMode",VIEW_MODE_AS_PARSED_TEXT);
prop.put("viewMode_parsedText",content);
} else if (viewMode.equals("iframe")) {
prop.put("viewMode",VIEW_MODE_AS_IFRAME);
prop.put("viewMode_url",url.toString());
} else {
prop.put("viewMode",VIEW_MODE_AS_PARSED_SENTENCES);
String[] sentences = document.getSentences();

View File

@ -43,11 +43,14 @@ import java.awt.Container;
import java.awt.Image;
import java.awt.MediaTracker;
import java.awt.Toolkit;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import de.anomic.http.httpHeader;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@ -70,9 +73,20 @@ public class ViewImage {
int maxheight = post.getInt("maxheight", 0);
int timeout = post.getInt("timeout", 5000);
// load image
byte[] imgb = sb.snippetCache.getResource(url, true, timeout);
if (imgb == null) return null;
// getting the image as stream
InputStream imgStream = (InputStream) sb.snippetCache.getResource(url, true, timeout)[0];
if (imgStream == null) return null;
// read image data
byte[] imgb = null;
try {
imgb = serverFileUtils.read(imgStream);
} catch (IOException e) {
return null;
} finally {
try { imgStream.close(); } catch (Exception e) {/* ignore this */}
}
// create image
MediaTracker mediaTracker = new MediaTracker(new Container());

View File

@ -1837,9 +1837,13 @@ do upload
* @return
* @throws IOException
*/
public byte[] writeContent(Object procOS) throws IOException {
public byte[] writeContent(Object procOS, boolean returnByteArray) throws IOException {
serverByteBuffer sbb = null;
if (returnByteArray) {
int contentLength = (int) this.responseHeader.contentLength();
serverByteBuffer sbb = new serverByteBuffer((contentLength==-1)?8192:contentLength);
sbb = new serverByteBuffer((contentLength==-1)?8192:contentLength);
}
if (procOS instanceof OutputStream) {
//writeContentX(httpc.this.clientInput, this.gzip, this.responseHeader.contentLength(), procOS, sbb);
@ -1852,7 +1856,7 @@ do upload
throw new IllegalArgumentException("Invalid procOS object type '" + procOS.getClass().getName() + "'");
}
return sbb.getBytes();
return (sbb==null)?null:sbb.getBytes();
}
/**

View File

@ -662,7 +662,7 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
if ((contentLength > 0) && (contentLength < 1048576)) // if the length is known and < 1 MB
{
// ok, we don't write actually into a file, only to RAM, and schedule writing the file.
byte[] cacheArray = res.writeContent(hfos);
byte[] cacheArray = res.writeContent(hfos,true);
this.theLogger.logFine("writeContent of " + url + " produced cacheArray = " + ((cacheArray == null) ? "null" : ("size=" + cacheArray.length)));
if (hfos instanceof htmlFilterWriter) ((htmlFilterWriter) hfos).finalize();

View File

@ -80,6 +80,7 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
*/
protected boolean done = false;
/* ============================================================
* Crawl job specific variables
* ============================================================ */
@ -92,6 +93,7 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
protected long startdate;
protected plasmaCrawlProfile.entry profile;
protected boolean acceptAllContent;
protected boolean keepInMemory;
protected String errorMessage;
@ -159,8 +161,9 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
try {
// The thread keeps running.
while (!this.stopped && !this.isInterrupted() && !this.myPool.isClosed) {
while (!this.stopped && !this.isInterrupted()) {
if (this.done) {
if (this.myPool != null && !this.myPool.isClosed) {
synchronized (this) {
// return thread back into pool
this.myPool.returnObject(this.protocol,this);
@ -170,11 +173,15 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
this.wait();
}
}
} else {
this.stopped = true;
}
} else {
try {
// executing the new task
execute();
} finally {
// free memory
reset();
}
}
@ -231,6 +238,7 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
this.depth = theNewMsg.depth;
this.profile = theNewMsg.profile;
this.acceptAllContent = theNewMsg.acceptAllContent;
this.keepInMemory = theNewMsg.keepInMemory;
this.startdate = System.currentTimeMillis();
@ -260,6 +268,7 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
public void reset() {
this.theMsg = null;
this.url = null;
this.name = null;
this.refererURLString = null;
@ -268,6 +277,8 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
this.startdate = 0;
this.profile = null;
this.acceptAllContent = false;
this.keepInMemory = false;
this.errorMessage = null;
}

View File

@ -262,8 +262,9 @@ public final class CrawlWorker extends AbstractCrawlWorker {
}
// we write the new cache entry to file system directly
res.writeContent(fos);
htCache.setCacheArray(null);
byte[] cacheArray = null;
cacheArray = res.writeContent(fos,this.keepInMemory);
htCache.setCacheArray(cacheArray);
this.cacheManager.writeFileAnnouncement(cacheFile);
} finally {
if (fos!=null)try{fos.close();}catch(Exception e){/* ignore this */}

View File

@ -84,10 +84,14 @@ public final class plasmaCrawlerFactory implements KeyedPoolableObjectFactory {
this.thePool = pool;
}
public Object makeObject(Object key) throws Exception {
return makeObject(key, true);
}
/**
* @see org.apache.commons.pool.PoolableObjectFactory#makeObject()
*/
public Object makeObject(Object key) throws Exception {
public Object makeObject(Object key, boolean usePool) throws Exception {
if (!(key instanceof String))
throw new IllegalArgumentException("The object key must be of type string.");
@ -109,7 +113,7 @@ public final class plasmaCrawlerFactory implements KeyedPoolableObjectFactory {
// instantiating class
plasmaCrawlWorker theCrawlWorker = (plasmaCrawlWorker) classConstructor.newInstance(new Object[] {
this.theThreadGroup,
this.thePool,
(usePool)?this.thePool:null,
this.sb,
this.cacheManager,
this.theLog

View File

@ -52,15 +52,22 @@ import org.apache.commons.pool.impl.GenericKeyedObjectPool;
import de.anomic.server.logging.serverLog;
public final class plasmaCrawlerPool extends GenericKeyedObjectPool {
private plasmaCrawlerFactory theFactory;
private final ThreadGroup theThreadGroup;
public boolean isClosed = false;
public plasmaCrawlerPool(plasmaCrawlerFactory objFactory, GenericKeyedObjectPool.Config config, ThreadGroup threadGroup) {
super(objFactory, config);
this.theFactory = objFactory;
this.theThreadGroup = threadGroup;
objFactory.setPool(this);
}
public plasmaCrawlerFactory getFactory() {
return this.theFactory;
}
public Object borrowObject(Object key) throws Exception {
return super.borrowObject(key);
}

View File

@ -90,7 +90,7 @@ public abstract class AbstractParser implements Parser{
* The source file size in bytes if the source document was passed
* in as file
*/
protected long fileSize = -1;
protected long contentLength = -1;
/**
* The Constructor of this class.
@ -100,6 +100,15 @@ public abstract class AbstractParser implements Parser{
this.libxDependencies = libxDependencies;
}
/**
* Set the content length of the source file.
* This value is needed by some parsers to decide
* if the parsed text can be held in memory
*/
public void setContentLength(long length) {
this.contentLength = length;
}
/**
* Check if the parser was interrupted.
* @throws InterruptedException if the parser was interrupted
@ -185,7 +194,7 @@ public abstract class AbstractParser implements Parser{
BufferedInputStream contentInputStream = null;
try {
// getting the file size of the document
this.fileSize = sourceFile.length();
this.contentLength = sourceFile.length();
// create a stream from the file
contentInputStream = new BufferedInputStream(new FileInputStream(sourceFile));
@ -242,4 +251,8 @@ public abstract class AbstractParser implements Parser{
public String getName() {
return this.parserName;
}
public void reset() {
this.contentLength = -1;
}
}

View File

@ -117,6 +117,8 @@ public interface Parser {
*/
public void reset();
public void setContentLength(long length);
/**
* @return Returns a list of library names that are needed by this parser
*/

View File

@ -139,6 +139,6 @@ public class bzipParser extends AbstractParser implements Parser {
public void reset() {
// Nothing todo here at the moment
super.reset();
}
}

View File

@ -118,6 +118,7 @@ implements Parser {
public void reset() {
// Nothing todo here at the moment
super.reset();
}
}

View File

@ -123,6 +123,6 @@ public class gzipParser extends AbstractParser implements Parser {
public void reset() {
// Nothing todo here at the moment
super.reset();
}
}

View File

@ -187,8 +187,7 @@ implements Parser {
}
}
public plasmaParserDocument parse(URL location, String mimeType,String charset,
InputStream source) throws ParserException, InterruptedException {
public plasmaParserDocument parse(URL location, String mimeType,String charset, InputStream source) throws ParserException, InterruptedException {
File dstFile = null;
try {
dstFile = File.createTempFile("mimeTypeParser",".tmp");
@ -208,6 +207,7 @@ implements Parser {
public void reset() {
// Nothing todo here at the moment
super.reset();
}
}

View File

@ -198,7 +198,7 @@ public class odtParser extends AbstractParser implements Parser {
public void reset() {
// Nothing todo here at the moment
super.reset();
}
public static void main(String[] args) {

View File

@ -132,7 +132,7 @@ public class pdfParser extends AbstractParser implements Parser {
}
// creating a writer for output
if ((this.fileSize != -1) && (this.fileSize > Parser.MAX_KEEP_IN_MEMORY_SIZE)) {
if ((this.contentLength == -1) || (this.contentLength > Parser.MAX_KEEP_IN_MEMORY_SIZE)) {
writerFile = File.createTempFile("pdfParser",".tmp");
writer = new OutputStreamWriter(new FileOutputStream(writerFile),"UTF-8");
} else {
@ -199,7 +199,8 @@ public class pdfParser extends AbstractParser implements Parser {
}
public void reset() {
this.fileSize = -1;
// Nothing todo here at the moment
super.reset();
}
}

View File

@ -170,7 +170,7 @@ public class rpmParser extends AbstractParser implements Parser {
public void reset() {
// Nothing todo here at the moment
super.reset();
}
public static void main(String[] args) {

View File

@ -211,8 +211,8 @@ public class rssParser extends AbstractParser implements Parser {
}
public void reset() {
// TODO Auto-generated method stub
// Nothing todo here at the moment
super.reset();
}
}

View File

@ -124,6 +124,7 @@ implements Parser {
public void reset() {
// Nothing todo here at the moment
super.reset();
}
}

View File

@ -104,7 +104,7 @@ public class tarParser extends AbstractParser implements Parser {
File outputFile = null;
plasmaParserDocument subDoc = null;
try {
if ((this.fileSize != -1) && (this.fileSize > Parser.MAX_KEEP_IN_MEMORY_SIZE)) {
if ((this.contentLength == -1) || (this.contentLength > Parser.MAX_KEEP_IN_MEMORY_SIZE)) {
outputFile = File.createTempFile("zipParser",".tmp");
docText = new BufferedOutputStream(new FileOutputStream(outputFile));
} else {
@ -252,6 +252,6 @@ public class tarParser extends AbstractParser implements Parser {
public void reset() {
// Nothing todo here at the moment
super.reset();
}
}

View File

@ -263,7 +263,7 @@ public class vcfParser extends AbstractParser implements Parser {
public void reset() {
// Nothing todo here at the moment
super.reset();
}
public static void main(String[] args) {

View File

@ -102,7 +102,7 @@ public class zipParser extends AbstractParser implements Parser {
File outputFile = null;
plasmaParserDocument subDoc = null;
try {
if ((this.fileSize != -1) && (this.fileSize > Parser.MAX_KEEP_IN_MEMORY_SIZE)) {
if ((this.contentLength == -1) || (this.contentLength > Parser.MAX_KEEP_IN_MEMORY_SIZE)) {
outputFile = File.createTempFile("zipParser",".tmp");
docText = new BufferedOutputStream(new FileOutputStream(outputFile));
} else {
@ -236,6 +236,6 @@ public class zipParser extends AbstractParser implements Parser {
public void reset() {
// Nothing todo here at the moment
super.reset();
}
}

View File

@ -671,11 +671,16 @@ public final class plasmaCondenser {
}
*/
public static Iterator getWords(InputStream input) {
if (input == null) return null;
plasmaCondenser condenser = new plasmaCondenser(input);
return condenser.words();
}
public static Iterator getWords(byte[] text) {
if (text == null) return null;
ByteArrayInputStream buffer = new ByteArrayInputStream(text);
plasmaCondenser condenser = new plasmaCondenser(buffer);
return condenser.words();
return getWords(buffer);
}
public static void main(String[] args) {

View File

@ -163,15 +163,23 @@ public final class plasmaCrawlLoader extends Thread {
return this.theThreadGroup;
}
private void execute(plasmaCrawlLoaderMessage theMsg) throws Exception {
private void execute(plasmaCrawlLoaderMessage theMsg, boolean useThreadPool) throws Exception {
// getting the protocol of the next URL
String protocol = theMsg.url.getProtocol();
// TODO: remove this
if (protocol.equals("https")) protocol = "http";
// get a new worker thread
plasmaCrawlWorker theWorker = null;
if (useThreadPool) {
// getting a new crawler from the crawler pool
plasmaCrawlWorker theWorker = (plasmaCrawlWorker) this.crawlwerPool.borrowObject(protocol);
theWorker = (plasmaCrawlWorker) this.crawlwerPool.borrowObject(protocol);
} else {
// create a new one
theWorker = (plasmaCrawlWorker) this.crawlwerPool.getFactory().makeObject(protocol,false);
}
if (theWorker == null) {
this.log.logWarning("Unsupported protocol '" + protocol + "' in url " + theMsg.url);
} else {
@ -187,7 +195,7 @@ public final class plasmaCrawlLoader extends Thread {
plasmaCrawlLoaderMessage theMsg = this.theQueue.waitForMessage();
// start new crawl job
this.execute(theMsg);
this.execute(theMsg, true);
} catch (InterruptedException e) {
Thread.interrupted();
@ -218,7 +226,8 @@ public final class plasmaCrawlLoader extends Thread {
String initiator,
int depth,
plasmaCrawlProfile.entry profile,
int timeout
int timeout,
boolean keepInMemory
) throws plasmaCrawlerException {
plasmaHTCache.Entry result = null;
@ -235,13 +244,14 @@ public final class plasmaCrawlLoader extends Thread {
profile,
crawlingPriority,
true,
timeout
timeout,
keepInMemory
);
try {
// start new crawl job
this.execute(theMsg);
this.execute(theMsg, false);
// wait for the crawl job result
result = theMsg.waitForResult();
@ -283,7 +293,8 @@ public final class plasmaCrawlLoader extends Thread {
profile, // crawling profile
crawlingPriority, // crawling priority
false, // only download documents whose mimetypes are enabled for the crawler
-1 // use default crawler timeout
-1, // use default crawler timeout
false // resource should not be kept in memory
);
// adding the message to the queue

View File

@ -56,6 +56,7 @@ public final class plasmaCrawlLoaderMessage {
public final plasmaCrawlProfile.entry profile;
public final boolean acceptAllContent;
public final int timeout;
public final boolean keepInMemory;
private serverSemaphore resultSync = null;
private plasmaHTCache.Entry result;
@ -71,7 +72,8 @@ public final class plasmaCrawlLoaderMessage {
plasmaCrawlProfile.entry profile,
int crawlingPriority,
boolean acceptAllContent,
int timeout
int timeout,
boolean keepInMemory
) {
this.url = url;
this.name = name;
@ -82,6 +84,7 @@ public final class plasmaCrawlLoaderMessage {
this.crawlingPriority = crawlingPriority;
this.acceptAllContent = acceptAllContent;
this.timeout = timeout;
this.keepInMemory = keepInMemory;
this.resultSync = new serverSemaphore(0);
this.result = null;

View File

@ -53,9 +53,12 @@
package de.anomic.plasma;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.InetAddress;
import java.net.MalformedURLException;
import java.util.Date;
@ -701,10 +704,17 @@ public final class plasmaHTCache {
return null;
}
/**
* @param url
* @return
*
* @deprecated don't use this function, to avoid OutOfMemory exceptions.
* Use {@link #getResourceContentStream(URL)} instead
*/
public byte[] loadResourceContent(URL url) {
// load the url as resource from the cache
File f = getCachePath(url);
if (f.exists()) try {
if (f.exists() && f.canRead()) try {
return serverFileUtils.read(f);
} catch (IOException e) {
return null;
@ -712,6 +722,34 @@ public final class plasmaHTCache {
return null;
}
/**
* Returns the content of a cached resource as {@link InputStream}
* @param url the requested resource
* @return the resource content as {@link InputStream}. If no data
* is available or the cached file is not readable, <code>null</code>
* is returned.
*/
public InputStream getResourceContentStream(URL url) {
// load the url as resource from the cache
File f = getCachePath(url);
if (f.exists() && f.canRead()) try {
return new BufferedInputStream(new FileInputStream(f));
} catch (IOException e) {
this.log.logSevere("Unable to create a BufferedInputStream from file " + f,e);
return null;
}
return null;
}
public long getResourceContentLength(URL url) {
// load the url as resource from the cache
File f = getCachePath(url);
if (f.exists() && f.canRead()) {
return f.length();
}
return 0;
}
public static boolean isPOST(String urlString) {
return (urlString.indexOf("?") >= 0 ||
urlString.indexOf("&") >= 0);

View File

@ -45,11 +45,13 @@
package de.anomic.plasma;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URI;
import java.util.Arrays;
@ -465,16 +467,25 @@ public final class plasmaParser {
} catch (Exception e) {/* ignore this */}
}
public plasmaParserDocument parseSource(URL location, String mimeType, String charset, byte[] source)
public plasmaParserDocument parseSource(URL location, String mimeType, String charset, byte[] sourceArray)
throws InterruptedException, ParserException {
File tempFile = null;
ByteArrayInputStream byteIn = null;
try {
// creating a temp file to store the byte array
tempFile = File.createTempFile("parseSource", ".tmp");
serverFileUtils.write(source, tempFile);
if (this.theLogger.isFine())
this.theLogger.logFine("Parsing '" + location + "' from byte-array");
// testing if the resource is not empty
if (sourceArray == null || sourceArray.length == 0) {
String errorMsg = "No resource content available.";
this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT);
}
// creating an InputStream
byteIn = new ByteArrayInputStream(sourceArray);
// parsing the temp file
return parseSource(location, mimeType, charset, tempFile);
return parseSource(location, mimeType, charset, sourceArray.length, byteIn);
} catch (Exception e) {
// Interrupted- and Parser-Exceptions should pass through
@ -482,20 +493,65 @@ public final class plasmaParser {
if (e instanceof ParserException) throw (ParserException) e;
// log unexpected error
this.theLogger.logSevere("Unexpected exception in parseSource1: " + e.getMessage(), e);
this.theLogger.logSevere("Unexpected exception in parseSource from byte-array: " + e.getMessage(), e);
throw new ParserException("Unexpected exception while parsing " + location,location, e);
} finally {
if (tempFile != null) try { tempFile.delete(); } catch (Exception ex){/* ignore this */}
if (byteIn != null) try { byteIn.close(); } catch (Exception ex){/* ignore this */}
}
}
public plasmaParserDocument parseSource(URL location, String theMimeType, String theDocumentCharset, File sourceFile)
throws InterruptedException, ParserException {
public plasmaParserDocument parseSource(URL location, String theMimeType, String theDocumentCharset, File sourceFile) throws InterruptedException, ParserException {
BufferedInputStream sourceStream = null;
try {
if (this.theLogger.isFine())
this.theLogger.logFine("Parsing '" + location + "' from file");
// testing if the resource is not empty
if (!(sourceFile.exists() && sourceFile.canRead() && sourceFile.length() > 0)) {
String errorMsg = sourceFile.exists() ? "Empty resource file." : "No resource content available.";
this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT);
}
// create a new InputStream
sourceStream = new BufferedInputStream(new FileInputStream(sourceFile));
// parsing the data
return this.parseSource(location, theMimeType, theDocumentCharset, sourceFile.length(), sourceStream);
} catch (Exception e) {
// Interrupted- and Parser-Exceptions should pass through
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof ParserException) throw (ParserException) e;
// log unexpected error
this.theLogger.logSevere("Unexpected exception in parseSource from File: " + e.getMessage(), e);
throw new ParserException("Unexpected exception while parsing " + location,location, e);
} finally {
if (sourceStream != null) try { sourceStream.close(); } catch (Exception ex){/* ignore this */}
}
}
/**
* To parse a resource from an {@link InputStream}
* @param location the URL of the resource
* @param theMimeType the resource mimetype (<code>null</code> if unknown)
* @param theDocumentCharset the charset of the resource (<code>null</code> if unknown)
* @param contentLength the content length of the resource (<code>-1</code> if unknown)
* @param sourceStream an {@link InputStream} containing the resource body
* @return the parsed {@link plasmaParserDocument document}
* @throws InterruptedException
* @throws ParserException
*/
public plasmaParserDocument parseSource(URL location, String theMimeType, String theDocumentCharset, long contentLength, InputStream sourceStream) throws InterruptedException, ParserException {
Parser theParser = null;
String mimeType = null;
try {
if (this.theLogger.isFine())
this.theLogger.logFine("Parsing '" + location + "' from stream");
// getting the mimetype of the document
mimeType = getRealMimeType(theMimeType);
@ -513,76 +569,22 @@ public final class plasmaParser {
throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_WRONG_MIMETYPE_OR_EXT);
}
// testing if the resource is not empty
if (!(sourceFile.exists() && sourceFile.canRead() && sourceFile.length() > 0)) {
String errorMsg = sourceFile.exists() ? "Empty resource file." : "No resource content available.";
this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT);
}
if (this.theLogger.isFine())
this.theLogger.logInfo("Parsing " + location + " with mimeType '" + mimeType +
"' and file extension '" + fileExt + "'.");
/*
* There are some problematic mimeType - fileExtension combination where we have to enforce
* a mimeType detection to get the proper parser for the content
*
* - application/zip + .odt
* - text/plain + .odt
* - text/plain + .vcf
* - text/xml + .rss
* - text/xml + .atom
*
* In all these cases we can trust the fileExtension and have to determine the proper mimeType.
*
*/
// // Handling of not trustable mimeTypes
// // - text/plain
// // - text/xml
// // - application/octet-stream
// // - application/zip
// if (
// (mimeType.equalsIgnoreCase("text/plain") && !fileExt.equalsIgnoreCase("txt")) ||
// (mimeType.equalsIgnoreCase("text/xml") && !fileExt.equalsIgnoreCase("txt"))
// ) {
// if (this.theLogger.isFine())
// this.theLogger.logFine("Document " + location + " has an mimeType '" + mimeType +
// "' that seems not to be correct for file extension '" + fileExt + "'.");
//
// if (enabledParserList.containsKey("application/octet-stream")) {
// theParser = this.getParser("application/octet-stream");
// Object newMime = theParser.getClass().getMethod("getMimeType", new Class[]{File.class}).invoke(theParser, sourceFile);
// if (newMime == null)
// if (newMime instanceof String) {
// String newMimeType = (String)newMime;
// if ((newMimeType.equals("application/octet-stream")) {
// return null;
// }
// mimeType = newMimeType;
// }
// } else {
// return null;
// }
// } else if (mimeType.equalsIgnoreCase("application/zip") && fileExt.equalsIgnoreCase("odt")){
// if (enabledParserList.containsKey("application/vnd.oasis.opendocument.text")) {
// mimeType = "application/vnd.oasis.opendocument.text";
// } else {
// return null;
// }
// }
// getting the correct parser for the given mimeType
theParser = this.getParser(mimeType);
// if a parser was found we use it ...
plasmaParserDocument doc = null;
if (theParser != null) {
doc = theParser.parse(location, mimeType,documentCharset,sourceFile);
// set the content length of the resource
theParser.setContentLength(contentLength);
// parse the resource
doc = theParser.parse(location, mimeType,documentCharset,sourceStream);
} else if (realtimeParsableMimeTypesContains(mimeType)) {
doc = parseHtml(location, mimeType, documentCharset, sourceFile);
doc = parseHtml(location, mimeType, documentCharset, sourceStream);
} else {
String errorMsg = "No parser available to parse mimetype '" + mimeType + "'";
this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
@ -614,11 +616,10 @@ public final class plasmaParser {
}
}
private plasmaParserDocument parseHtml(URL location, String mimeType, String documentCharset, File sourceFile) throws IOException, ParserException {
private plasmaParserDocument parseHtml(URL location, String mimeType, String documentCharset, InputStream sourceStream) throws IOException, ParserException {
// ...otherwise we make a scraper and transformer
FileInputStream fileIn = new FileInputStream(sourceFile);
htmlFilterInputStream htmlFilter = new htmlFilterInputStream(fileIn,documentCharset,location,null,false);
htmlFilterInputStream htmlFilter = new htmlFilterInputStream(sourceStream,documentCharset,location,null,false);
String charset = htmlFilter.detectCharset();
if (charset == null) {
charset = documentCharset;
@ -763,7 +764,7 @@ public final class plasmaParser {
//javac -classpath lib/commons-collections.jar:lib/commons-pool-1.2.jar -sourcepath source source/de/anomic/plasma/plasmaParser.java
//java -cp source:lib/commons-collections.jar:lib/commons-pool-1.2.jar de.anomic.plasma.plasmaParser bug.html bug.out
try {
File contentFile = null;
Object content = null;
URL contentURL = null;
String contentMimeType = "application/octet-stream";
String charSet = "UTF-8";
@ -774,17 +775,13 @@ public final class plasmaParser {
String mode = args[0];
if (mode.equalsIgnoreCase("-f")) {
contentFile = new File(args[1]);
contentURL = new URL(contentFile);
content = new File(args[1]);
contentURL = new URL((File)content);
} else if (mode.equalsIgnoreCase("-u")) {
contentURL = new URL(args[1]);
// downloading the document content
byte[] contentBytes = httpc.singleGET(contentURL, contentURL.getHost(), 10000, null, null, null);
contentFile = File.createTempFile("content",".tmp");
contentFile.deleteOnExit();
serverFileUtils.write(contentBytes, contentFile);
content = httpc.singleGET(contentURL, contentURL.getHost(), 10000, null, null, null);
}
if ((args.length >= 4)&&(args[2].equalsIgnoreCase("-m"))) {
@ -805,7 +802,12 @@ public final class plasmaParser {
plasmaParser.enableAllParsers(PARSER_MODE_PROXY);
// parsing the content
plasmaParserDocument document = theParser.parseSource(contentURL, contentMimeType, charSet, contentFile);
plasmaParserDocument document = null;
if (content instanceof byte[]) {
document = theParser.parseSource(contentURL, contentMimeType, charSet, (byte[])content);
} else if (content instanceof File) {
document = theParser.parseSource(contentURL, contentMimeType, charSet, (File)content);
}
// printing out all parsed sentences
if (document != null) {

View File

@ -41,6 +41,7 @@
package de.anomic.plasma;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.util.Iterator;
import java.util.Map;
@ -59,13 +60,18 @@ public final class plasmaSearchImages {
long start = System.currentTimeMillis();
this.images = new TreeSet();
if (maxTime > 10) {
byte[] res = sc.getResource(url, true, (int) maxTime);
Object[] resource = sc.getResource(url, true, (int) maxTime);
InputStream res = (InputStream) resource[0];
Long resLength = (Long) resource[1];
if (res != null) {
plasmaParserDocument document = null;
try {
document = sc.parseDocument(url, res);
// parse the document
document = sc.parseDocument(url, resLength.longValue(), res);
} catch (ParserException e) {
// parsing failed
} finally {
try { res.close(); } catch (Exception e) {/* ignore this */}
}
if (document == null) return;

View File

@ -44,7 +44,9 @@
package de.anomic.plasma;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
@ -187,46 +189,62 @@ public class plasmaSnippetCache {
* LOADING RESOURCE DATA
* =========================================================================== */
// if the snippet is not in the cache, we can try to get it from the htcache
byte[] resource = null;
IResourceInfo docInfo = null;
long resContentLength = 0;
InputStream resContent = null;
IResourceInfo resInfo = null;
try {
// trying to load the resource from the cache
resource = this.cacheManager.loadResourceContent(url);
resContent = this.cacheManager.getResourceContentStream(url);
if (resContent != null) {
// if the content was found
resContentLength = this.cacheManager.getResourceContentLength(url);
// getting resource metadata
resInfo = this.cacheManager.loadResourceInfo(url);
} else if (fetchOnline) {
// if not found try to download it
if ((resource == null) && (fetchOnline)) {
// download resource using the crawler
plasmaHTCache.Entry entry = loadResourceFromWeb(url, timeout);
// download resource using the crawler and keep resource in memory if possible
plasmaHTCache.Entry entry = loadResourceFromWeb(url, timeout, true);
// getting resource metadata (e.g. the http headers for http resources)
if (entry != null) docInfo = entry.getDocumentInfo();
if (entry != null) {
resInfo = entry.getDocumentInfo();
// read resource body (if it is there)
resource = entry.cacheArray();
byte []resourceArray = entry.cacheArray();
if (resourceArray != null) {
resContent = new ByteArrayInputStream(resourceArray);
resContentLength = resourceArray.length;
} else {
resContent = this.cacheManager.getResourceContentStream(url);
resContentLength = this.cacheManager.getResourceContentLength(url);
}
}
// in case that the reosurce was not in ram, read it from disk
if (resource == null) resource = this.cacheManager.loadResourceContent(url);
// if it is still not available, throw exception
if (resource == null) return new Snippet(null, ERROR_RESOURCE_LOADING, "error loading resource, plasmaHTCache.Entry cache is NULL");
// if it is still not available, report an error
if (resContent == null) return new Snippet(null, ERROR_RESOURCE_LOADING, "error loading resource, plasmaHTCache.Entry cache is NULL");
source = SOURCE_WEB;
} else {
return new Snippet(null, ERROR_SOURCE_LOADING, "no resource available");
}
} catch (Exception e) {
if (!(e instanceof plasmaCrawlerException)) e.printStackTrace();
return new Snippet(null, ERROR_SOURCE_LOADING, "error loading resource: " + e.getMessage());
}
if (resource == null) return new Snippet(null, ERROR_SOURCE_LOADING, "no resource available");
/* ===========================================================================
* PARSING RESOURCE
* =========================================================================== */
plasmaParserDocument document = null;
try {
document = parseDocument(url, resource, docInfo);
document = parseDocument(url, resContentLength, resContent, resInfo);
} catch (ParserException e) {
return new Snippet(null, ERROR_PARSER_FAILED, e.getMessage()); // cannot be parsed
} finally {
try { resContent.close(); } catch (Exception e) {/* ignore this */}
}
if (document == null) return new Snippet(null, ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed
@ -263,30 +281,40 @@ public class plasmaSnippetCache {
* @return the parsed document as {@link plasmaParserDocument}
*/
public plasmaParserDocument retrieveDocument(URL url, boolean fetchOnline) {
byte[] resource = null;
IResourceInfo docInfo = null;
try {
// trying to load the resource body from cache
resource = this.cacheManager.loadResourceContent(url);
InputStream content = this.cacheManager.getResourceContentStream(url);
long resourceLength = this.cacheManager.getResourceContentLength(url);
// if not available try to load resource from web
if ((fetchOnline) && (resource == null)) {
if ((fetchOnline) && (content == null)) {
// download resource using crawler
plasmaHTCache.Entry entry = loadResourceFromWeb(url, 5000);
plasmaHTCache.Entry entry = loadResourceFromWeb(url, 5000, true);
// fetching metadata of the resource (e.g. http headers for http resource)
if (entry != null) docInfo = entry.getDocumentInfo();
if (entry != null) {
docInfo = entry.getDocumentInfo();
// getting the resource body from the cache
resource = this.cacheManager.loadResourceContent(url);
byte[] resourceArray = entry.cacheArray();
if (resourceArray != null) {
// read resource body (if it is there)
content = new ByteArrayInputStream(resourceArray);
resourceLength = resourceArray.length;
} else {
// in case that the resource was not in ram, read it from disk
content = this.cacheManager.getResourceContentStream(url);
resourceLength = this.cacheManager.getResourceContentLength(url);
}
}
} else {
// trying to load resource metadata
docInfo = this.cacheManager.loadResourceInfo(url);
}
// parsing document
if (resource == null) return null;
return parseDocument(url, resource, docInfo);
if (content == null) return null;
return parseDocument(url, resourceLength, content, docInfo);
} catch (ParserException e) {
this.log.logWarning("Unable to parse resource. " + e.getMessage());
return null;
@ -446,15 +474,24 @@ public class plasmaSnippetCache {
return map;
}
public plasmaParserDocument parseDocument(URL url, byte[] resource) throws ParserException {
return parseDocument(url, resource, null);
public plasmaParserDocument parseDocument(URL url, long contentLength, InputStream resourceStream) throws ParserException {
return parseDocument(url, contentLength, resourceStream, null);
}
public plasmaParserDocument parseDocument(URL url, byte[] resource, IResourceInfo docInfo) throws ParserException {
/**
* Parse the resource
* @param url the URL of the resource
* @param contentLength the contentLength of the resource
* @param resourceStream the resource body as stream
* @param docInfo metadata about the resource
* @return the extracted data
* @throws ParserException
*/
public plasmaParserDocument parseDocument(URL url, long contentLength, InputStream resourceStream, IResourceInfo docInfo) throws ParserException {
try {
if (resource == null) return null;
if (resourceStream == null) return null;
// if no resource metadata is available, try to load it
// STEP 1: if no resource metadata is available, try to load it from cache
if (docInfo == null) {
// try to get the header from the htcache directory
try {
@ -464,9 +501,11 @@ public class plasmaSnippetCache {
}
}
// TODO: we need a better solution here
// encapsulate this in the crawlLoader class
// STEP 2: if the metadata is still null try to download it from web
if ((docInfo == null) && (url.getProtocol().startsWith("http"))) {
// TODO: we need a better solution here
// e.g. encapsulate this in the crawlLoader class
// getting URL mimeType
try {
httpHeader header = httpc.whead(url, url.getHost(), 10000, null, null, this.sb.remoteProxyConfig);
@ -476,6 +515,7 @@ public class plasmaSnippetCache {
}
}
// STEP 3: if the metadata is still null try to guess the mimeType of the resource
if (docInfo == null) {
String filename = this.cacheManager.getCachePath(url).getName();
int p = filename.lastIndexOf('.');
@ -495,12 +535,12 @@ public class plasmaSnippetCache {
supposedMime = plasmaParser.getMimeTypeByFileExt(filename.substring(p + 1));
}
return this.parser.parseSource(url, supposedMime, null, resource);
return this.parser.parseSource(url, supposedMime, null, contentLength, resourceStream);
}
return null;
}
if (plasmaParser.supportedMimeTypesContains(docInfo.getMimeType())) {
return this.parser.parseSource(url, docInfo.getMimeType(), docInfo.getCharacterEncoding(), resource);
return this.parser.parseSource(url, docInfo.getMimeType(), docInfo.getCharacterEncoding(), contentLength, resourceStream);
}
return null;
} catch (InterruptedException e) {
@ -509,27 +549,57 @@ public class plasmaSnippetCache {
}
}
public byte[] getResource(URL url, boolean fetchOnline, int socketTimeout) {
/**
*
* @param url
* @param fetchOnline
* @param socketTimeout
* @return an Object array containing
* <table>
* <tr><td>[0]</td><td>the content as {@link InputStream}</td></tr>
* <tr><td>[1]</td><td>the content-length as {@link Integer}</td></tr>
* </table>
*/
public Object[] getResource(URL url, boolean fetchOnline, int socketTimeout) {
// load the url as resource from the web
try {
long contentLength = -1;
// trying to load the resource body from cache
byte[] resource = cacheManager.loadResourceContent(url);
InputStream resource = this.cacheManager.getResourceContentStream(url);
if (resource != null) {
contentLength = this.cacheManager.getResourceContentLength(url);
} else if (fetchOnline) {
// if the content is not available in cache try to download it from web
if ((fetchOnline) && (resource == null)) {
// try to download the resource using a crawler
loadResourceFromWeb(url, (socketTimeout < 0) ? -1 : socketTimeout);
// get the content from cache
resource = cacheManager.loadResourceContent(url);
// try to download the resource using a crawler
plasmaHTCache.Entry entry = loadResourceFromWeb(url, (socketTimeout < 0) ? -1 : socketTimeout, true);
// read resource body (if it is there)
byte[] resourceArray = entry.cacheArray();
// in case that the resource was not in ram, read it from disk
if (resourceArray == null) {
resource = this.cacheManager.getResourceContentStream(url);
contentLength = this.cacheManager.getResourceContentLength(url);
} else {
resource = new ByteArrayInputStream(resourceArray);
contentLength = resourceArray.length;
}
return resource;
} else {
return null;
}
return new Object[]{resource,new Long(contentLength)};
} catch (IOException e) {
return null;
}
}
public plasmaHTCache.Entry loadResourceFromWeb(URL url, int socketTimeout) throws plasmaCrawlerException {
public plasmaHTCache.Entry loadResourceFromWeb(
URL url,
int socketTimeout,
boolean keepInMemory
) throws plasmaCrawlerException {
plasmaHTCache.Entry result = this.sb.cacheLoader.loadSync(
url,
@ -538,7 +608,8 @@ public class plasmaSnippetCache {
null,
0,
null,
socketTimeout
socketTimeout,
keepInMemory
);
return result;

View File

@ -105,6 +105,7 @@ package de.anomic.plasma;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.lang.reflect.Constructor;
import java.net.InetAddress;
import java.net.MalformedURLException;
@ -2181,17 +2182,32 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
URL url = entry.url();
if (url == null) return 0;
InputStream resourceContent = null;
try {
// get set of words
// Set words = plasmaCondenser.getWords(getText(getResource(url, fetchOnline)));
Iterator witer = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline, 10000)).getTextBytes());
// get the resource content
Object[] resource = snippetCache.getResource(url, fetchOnline, 10000);
resourceContent = (InputStream) resource[0];
Long resourceContentLength = (Long) resource[1];
// parse the resource
plasmaParserDocument document = snippetCache.parseDocument(url, resourceContentLength.longValue(), resourceContent);
// getting parsed body input stream
InputStream docBodyInputStream = document.getText();
// getting word iterator
Iterator witer = plasmaCondenser.getWords(docBodyInputStream);
// delete all word references
int count = removeReferences(urlhash, witer);
// finally delete the url entry itself
urlPool.loadedURL.remove(urlhash);
return count;
} catch (ParserException e) {
return 0;
} finally {
if (resourceContent != null) try { resourceContent.close(); } catch (Exception e) {/* ignore this */}
}
}

View File

@ -144,7 +144,7 @@ public final class serverFileUtils {
public static void writeX(InputStream source, String inputCharset, Writer procOS, OutputStream bufferOS, String outputCharset) throws IOException {
InputStreamReader sourceReader = new InputStreamReader(source,inputCharset);
OutputStreamWriter bufferOSWriter = new OutputStreamWriter(bufferOS,outputCharset);
OutputStreamWriter bufferOSWriter = (bufferOS==null)?null:new OutputStreamWriter(bufferOS,outputCharset);
writeX(sourceReader,procOS,bufferOSWriter);
}