mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
*) plasmaHTCache:
- method loadResourceContent defined as deprecated. Please do not use this function, in order to avoid OutOfMemory exceptions when loading large files - new function getResourceContentStream to get an InputStream of a cached file - new function getResourceContentLength to get the size of a cached file *) httpc.java: - Bugfix: resource content was loaded into memory even if this was not requested *) Crawler: - new option to hold loaded resource content in memory - adding option to use the worker class without the worker pool (needed by the snippet fetcher) *) plasmaSnippetCache - snippet loader does not use a crawl-worker from the pool but uses a newly created instance to avoid being blocked by normal crawling activity. - now operates on streams instead of byte arrays to avoid OutOfMemory exceptions when operating on large files - snippet loader now forces the crawl-worker to keep the loaded resource in memory to avoid IO *) plasmaCondenser: adding new function getWords that can directly operate on input streams *) Parsers - keep resource in memory whenever possible (to avoid IO) - when parsing from a stream, the content length must now be passed to the parser function. This length value is needed by the parsers to decide whether the parsed resource content is too large to hold in memory and must be stored to a file - AbstractParser.java: new function to pass the contentLength of a resource to the parsers git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2701 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
630a955674
commit
f17ce28b6d
|
@ -45,6 +45,7 @@
|
|||
//if the shell's current path is HTROOT
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.net.URLDecoder;
|
||||
import java.net.URLEncoder;
|
||||
|
@ -60,6 +61,7 @@ import de.anomic.plasma.cache.IResourceInfo;
|
|||
import de.anomic.plasma.crawler.plasmaCrawlerException;
|
||||
import de.anomic.plasma.parser.ParserException;
|
||||
import de.anomic.plasma.plasmaCrawlLURL.Entry;
|
||||
import de.anomic.server.serverFileUtils;
|
||||
import de.anomic.server.serverObjects;
|
||||
import de.anomic.server.serverSwitch;
|
||||
|
||||
|
@ -121,18 +123,20 @@ public class ViewFile {
|
|||
}
|
||||
|
||||
// loading the resource content as stream
|
||||
byte[] resource = null;
|
||||
InputStream resource = null;
|
||||
long resourceLength = -1;
|
||||
IResourceInfo resInfo = null;
|
||||
String resMime = null;
|
||||
try {
|
||||
// trying to load the resource body
|
||||
resource = sb.cacheManager.loadResourceContent(url);
|
||||
resource = sb.cacheManager.getResourceContentStream(url);
|
||||
resourceLength = sb.cacheManager.getResourceContentLength(url);
|
||||
|
||||
// if the resource body was not cached we try to load it from web
|
||||
if (resource == null) {
|
||||
plasmaHTCache.Entry entry = null;
|
||||
try {
|
||||
entry = sb.snippetCache.loadResourceFromWeb(url, 5000);
|
||||
entry = sb.snippetCache.loadResourceFromWeb(url, 5000, false);
|
||||
} catch (plasmaCrawlerException e) {
|
||||
prop.put("error",4);
|
||||
prop.put("error_errorText",e.getMessage());
|
||||
|
@ -142,11 +146,13 @@ public class ViewFile {
|
|||
|
||||
if (entry != null) {
|
||||
resInfo = entry.getDocumentInfo();
|
||||
resource = sb.cacheManager.loadResourceContent(url);
|
||||
resource = sb.cacheManager.getResourceContentStream(url);
|
||||
resourceLength = sb.cacheManager.getResourceContentLength(url);
|
||||
}
|
||||
|
||||
if (resource == null) {
|
||||
prop.put("error",4);
|
||||
prop.put("error_errorText","No resource available");
|
||||
prop.put("viewMode",VIEW_MODE_NO_TEXT);
|
||||
return prop;
|
||||
}
|
||||
|
@ -172,6 +178,15 @@ public class ViewFile {
|
|||
httpHeader responseHeader = httpc.whead(url,url.getHost(),5000,null,null,sb.remoteProxyConfig);
|
||||
if (responseHeader == null) {
|
||||
prop.put("error",4);
|
||||
prop.put("error_errorText","Unable to load resource metadata.");
|
||||
prop.put("viewMode",VIEW_MODE_NO_TEXT);
|
||||
return prop;
|
||||
}
|
||||
try {
|
||||
resInfo = sb.cacheManager.getResourceInfoFactory().buildResourceInfoObj(url, responseHeader);
|
||||
} catch (Exception e) {
|
||||
prop.put("error",4);
|
||||
prop.put("error_errorText",e.getMessage());
|
||||
prop.put("viewMode",VIEW_MODE_NO_TEXT);
|
||||
return prop;
|
||||
}
|
||||
|
@ -181,12 +196,28 @@ public class ViewFile {
|
|||
resMime = resInfo.getMimeType();
|
||||
}
|
||||
} catch (IOException e) {
|
||||
if (resource != null) try { resource.close(); } catch (Exception ex) {/* ignore this */}
|
||||
prop.put("error",4);
|
||||
prop.put("error_errorText",e.getMessage());
|
||||
prop.put("viewMode",VIEW_MODE_NO_TEXT);
|
||||
return prop;
|
||||
}
|
||||
|
||||
if (viewMode.equals("plain")) {
|
||||
String content = new String(resource);
|
||||
|
||||
// TODO: how to handle very large files here ?
|
||||
String content;
|
||||
try {
|
||||
content = new String(serverFileUtils.read(resource),"UTF-8");
|
||||
} catch (Exception e) {
|
||||
prop.put("error",4);
|
||||
prop.put("error_errorText",e.getMessage());
|
||||
prop.put("viewMode",VIEW_MODE_NO_TEXT);
|
||||
return prop;
|
||||
} finally {
|
||||
if (resource != null) try { resource.close(); } catch (Exception e) {/* ignore this */}
|
||||
}
|
||||
|
||||
content = content.replaceAll("<","<")
|
||||
.replaceAll(">",">")
|
||||
.replaceAll("\"",""")
|
||||
|
@ -196,11 +227,14 @@ public class ViewFile {
|
|||
prop.put("error",0);
|
||||
prop.put("viewMode",VIEW_MODE_AS_PLAIN_TEXT);
|
||||
prop.put("viewMode_plainText",content);
|
||||
} else if (viewMode.equals("parsed") || viewMode.equals("sentences") || viewMode.equals("iframe")) {
|
||||
} else if (viewMode.equals("iframe")) {
|
||||
prop.put("viewMode",VIEW_MODE_AS_IFRAME);
|
||||
prop.put("viewMode_url",url.toString());
|
||||
} else if (viewMode.equals("parsed") || viewMode.equals("sentences")) {
|
||||
// parsing the resource content
|
||||
plasmaParserDocument document = null;
|
||||
try {
|
||||
document = sb.snippetCache.parseDocument(url, resource,resInfo);
|
||||
document = sb.snippetCache.parseDocument(url, resourceLength, resource,resInfo);
|
||||
if (document == null) {
|
||||
prop.put("error",5);
|
||||
prop.put("error_errorText","Unknown error");
|
||||
|
@ -212,7 +246,10 @@ public class ViewFile {
|
|||
prop.put("error_errorText",e.getMessage());
|
||||
prop.put("viewMode",VIEW_MODE_NO_TEXT);
|
||||
return prop;
|
||||
} finally {
|
||||
if (resource != null) try { resource.close(); } catch (Exception e) {/* ignore this */}
|
||||
}
|
||||
|
||||
resMime = document.getMimeType();
|
||||
|
||||
if (viewMode.equals("parsed")) {
|
||||
|
@ -223,9 +260,6 @@ public class ViewFile {
|
|||
|
||||
prop.put("viewMode",VIEW_MODE_AS_PARSED_TEXT);
|
||||
prop.put("viewMode_parsedText",content);
|
||||
} else if (viewMode.equals("iframe")) {
|
||||
prop.put("viewMode",VIEW_MODE_AS_IFRAME);
|
||||
prop.put("viewMode_url",url.toString());
|
||||
} else {
|
||||
prop.put("viewMode",VIEW_MODE_AS_PARSED_SENTENCES);
|
||||
String[] sentences = document.getSentences();
|
||||
|
|
|
@ -43,11 +43,14 @@ import java.awt.Container;
|
|||
import java.awt.Image;
|
||||
import java.awt.MediaTracker;
|
||||
import java.awt.Toolkit;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.MalformedURLException;
|
||||
|
||||
import de.anomic.http.httpHeader;
|
||||
import de.anomic.net.URL;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
import de.anomic.server.serverFileUtils;
|
||||
import de.anomic.server.serverObjects;
|
||||
import de.anomic.server.serverSwitch;
|
||||
|
||||
|
@ -70,9 +73,20 @@ public class ViewImage {
|
|||
int maxheight = post.getInt("maxheight", 0);
|
||||
int timeout = post.getInt("timeout", 5000);
|
||||
|
||||
// load image
|
||||
byte[] imgb = sb.snippetCache.getResource(url, true, timeout);
|
||||
if (imgb == null) return null;
|
||||
// getting the image as stream
|
||||
InputStream imgStream = (InputStream) sb.snippetCache.getResource(url, true, timeout)[0];
|
||||
if (imgStream == null) return null;
|
||||
|
||||
// read image data
|
||||
byte[] imgb = null;
|
||||
try {
|
||||
imgb = serverFileUtils.read(imgStream);
|
||||
} catch (IOException e) {
|
||||
return null;
|
||||
} finally {
|
||||
try { imgStream.close(); } catch (Exception e) {/* ignore this */}
|
||||
}
|
||||
|
||||
|
||||
// create image
|
||||
MediaTracker mediaTracker = new MediaTracker(new Container());
|
||||
|
|
|
@ -1837,9 +1837,13 @@ do upload
|
|||
* @return
|
||||
* @throws IOException
|
||||
*/
|
||||
public byte[] writeContent(Object procOS) throws IOException {
|
||||
public byte[] writeContent(Object procOS, boolean returnByteArray) throws IOException {
|
||||
serverByteBuffer sbb = null;
|
||||
|
||||
if (returnByteArray) {
|
||||
int contentLength = (int) this.responseHeader.contentLength();
|
||||
serverByteBuffer sbb = new serverByteBuffer((contentLength==-1)?8192:contentLength);
|
||||
sbb = new serverByteBuffer((contentLength==-1)?8192:contentLength);
|
||||
}
|
||||
|
||||
if (procOS instanceof OutputStream) {
|
||||
//writeContentX(httpc.this.clientInput, this.gzip, this.responseHeader.contentLength(), procOS, sbb);
|
||||
|
@ -1852,7 +1856,7 @@ do upload
|
|||
throw new IllegalArgumentException("Invalid procOS object type '" + procOS.getClass().getName() + "'");
|
||||
}
|
||||
|
||||
return sbb.getBytes();
|
||||
return (sbb==null)?null:sbb.getBytes();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -662,7 +662,7 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
|
|||
if ((contentLength > 0) && (contentLength < 1048576)) // if the length is known and < 1 MB
|
||||
{
|
||||
// ok, we don't actually write into a file, only to RAM, and schedule writing the file.
|
||||
byte[] cacheArray = res.writeContent(hfos);
|
||||
byte[] cacheArray = res.writeContent(hfos,true);
|
||||
this.theLogger.logFine("writeContent of " + url + " produced cacheArray = " + ((cacheArray == null) ? "null" : ("size=" + cacheArray.length)));
|
||||
|
||||
if (hfos instanceof htmlFilterWriter) ((htmlFilterWriter) hfos).finalize();
|
||||
|
|
|
@ -80,6 +80,7 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
|
|||
*/
|
||||
protected boolean done = false;
|
||||
|
||||
|
||||
/* ============================================================
|
||||
* Crawl job specific variables
|
||||
* ============================================================ */
|
||||
|
@ -92,6 +93,7 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
|
|||
protected long startdate;
|
||||
protected plasmaCrawlProfile.entry profile;
|
||||
protected boolean acceptAllContent;
|
||||
protected boolean keepInMemory;
|
||||
|
||||
protected String errorMessage;
|
||||
|
||||
|
@ -159,8 +161,9 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
|
|||
|
||||
try {
|
||||
// The thread keeps running.
|
||||
while (!this.stopped && !this.isInterrupted() && !this.myPool.isClosed) {
|
||||
while (!this.stopped && !this.isInterrupted()) {
|
||||
if (this.done) {
|
||||
if (this.myPool != null && !this.myPool.isClosed) {
|
||||
synchronized (this) {
|
||||
// return thread back into pool
|
||||
this.myPool.returnObject(this.protocol,this);
|
||||
|
@ -170,11 +173,15 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
|
|||
this.wait();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
this.stopped = true;
|
||||
}
|
||||
} else {
|
||||
try {
|
||||
// executing the new task
|
||||
execute();
|
||||
} finally {
|
||||
// free memory
|
||||
reset();
|
||||
}
|
||||
}
|
||||
|
@ -231,6 +238,7 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
|
|||
this.depth = theNewMsg.depth;
|
||||
this.profile = theNewMsg.profile;
|
||||
this.acceptAllContent = theNewMsg.acceptAllContent;
|
||||
this.keepInMemory = theNewMsg.keepInMemory;
|
||||
|
||||
this.startdate = System.currentTimeMillis();
|
||||
|
||||
|
@ -260,6 +268,7 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
|
|||
|
||||
public void reset() {
|
||||
this.theMsg = null;
|
||||
|
||||
this.url = null;
|
||||
this.name = null;
|
||||
this.refererURLString = null;
|
||||
|
@ -268,6 +277,8 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
|
|||
this.startdate = 0;
|
||||
this.profile = null;
|
||||
this.acceptAllContent = false;
|
||||
this.keepInMemory = false;
|
||||
|
||||
this.errorMessage = null;
|
||||
}
|
||||
|
||||
|
|
|
@ -262,8 +262,9 @@ public final class CrawlWorker extends AbstractCrawlWorker {
|
|||
}
|
||||
|
||||
// we write the new cache entry to file system directly
|
||||
res.writeContent(fos);
|
||||
htCache.setCacheArray(null);
|
||||
byte[] cacheArray = null;
|
||||
cacheArray = res.writeContent(fos,this.keepInMemory);
|
||||
htCache.setCacheArray(cacheArray);
|
||||
this.cacheManager.writeFileAnnouncement(cacheFile);
|
||||
} finally {
|
||||
if (fos!=null)try{fos.close();}catch(Exception e){/* ignore this */}
|
||||
|
|
|
@ -84,10 +84,14 @@ public final class plasmaCrawlerFactory implements KeyedPoolableObjectFactory {
|
|||
this.thePool = pool;
|
||||
}
|
||||
|
||||
public Object makeObject(Object key) throws Exception {
|
||||
return makeObject(key, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* @see org.apache.commons.pool.PoolableObjectFactory#makeObject()
|
||||
*/
|
||||
public Object makeObject(Object key) throws Exception {
|
||||
public Object makeObject(Object key, boolean usePool) throws Exception {
|
||||
if (!(key instanceof String))
|
||||
throw new IllegalArgumentException("The object key must be of type string.");
|
||||
|
||||
|
@ -109,7 +113,7 @@ public final class plasmaCrawlerFactory implements KeyedPoolableObjectFactory {
|
|||
// instantiating class
|
||||
plasmaCrawlWorker theCrawlWorker = (plasmaCrawlWorker) classConstructor.newInstance(new Object[] {
|
||||
this.theThreadGroup,
|
||||
this.thePool,
|
||||
(usePool)?this.thePool:null,
|
||||
this.sb,
|
||||
this.cacheManager,
|
||||
this.theLog
|
||||
|
|
|
@ -52,15 +52,22 @@ import org.apache.commons.pool.impl.GenericKeyedObjectPool;
|
|||
import de.anomic.server.logging.serverLog;
|
||||
|
||||
public final class plasmaCrawlerPool extends GenericKeyedObjectPool {
|
||||
|
||||
private plasmaCrawlerFactory theFactory;
|
||||
private final ThreadGroup theThreadGroup;
|
||||
public boolean isClosed = false;
|
||||
|
||||
public plasmaCrawlerPool(plasmaCrawlerFactory objFactory, GenericKeyedObjectPool.Config config, ThreadGroup threadGroup) {
|
||||
super(objFactory, config);
|
||||
this.theFactory = objFactory;
|
||||
this.theThreadGroup = threadGroup;
|
||||
objFactory.setPool(this);
|
||||
}
|
||||
|
||||
public plasmaCrawlerFactory getFactory() {
|
||||
return this.theFactory;
|
||||
}
|
||||
|
||||
public Object borrowObject(Object key) throws Exception {
|
||||
return super.borrowObject(key);
|
||||
}
|
||||
|
|
|
@ -90,7 +90,7 @@ public abstract class AbstractParser implements Parser{
|
|||
* The source file size in bytes if the source document was passed
|
||||
* in as file
|
||||
*/
|
||||
protected long fileSize = -1;
|
||||
protected long contentLength = -1;
|
||||
|
||||
/**
|
||||
* The Constructor of this class.
|
||||
|
@ -100,6 +100,15 @@ public abstract class AbstractParser implements Parser{
|
|||
this.libxDependencies = libxDependencies;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the content length of the source file.
|
||||
* This value is needed by some parsers to decide
|
||||
* if the parsed text can be held in memory
|
||||
*/
|
||||
public void setContentLength(long length) {
|
||||
this.contentLength = length;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if the parser was interrupted.
|
||||
* @throws InterruptedException if the parser was interrupted
|
||||
|
@ -185,7 +194,7 @@ public abstract class AbstractParser implements Parser{
|
|||
BufferedInputStream contentInputStream = null;
|
||||
try {
|
||||
// getting the file size of the document
|
||||
this.fileSize = sourceFile.length();
|
||||
this.contentLength = sourceFile.length();
|
||||
|
||||
// create a stream from the file
|
||||
contentInputStream = new BufferedInputStream(new FileInputStream(sourceFile));
|
||||
|
@ -242,4 +251,8 @@ public abstract class AbstractParser implements Parser{
|
|||
public String getName() {
|
||||
return this.parserName;
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
this.contentLength = -1;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -117,6 +117,8 @@ public interface Parser {
|
|||
*/
|
||||
public void reset();
|
||||
|
||||
public void setContentLength(long length);
|
||||
|
||||
/**
|
||||
* @return Returns a list of library names that are needed by this parser
|
||||
*/
|
||||
|
|
|
@ -139,6 +139,6 @@ public class bzipParser extends AbstractParser implements Parser {
|
|||
|
||||
public void reset() {
|
||||
// Nothing to do here at the moment
|
||||
|
||||
super.reset();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -118,6 +118,7 @@ implements Parser {
|
|||
|
||||
public void reset() {
|
||||
// Nothing to do here at the moment
|
||||
super.reset();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -123,6 +123,6 @@ public class gzipParser extends AbstractParser implements Parser {
|
|||
|
||||
public void reset() {
|
||||
// Nothing to do here at the moment
|
||||
|
||||
super.reset();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -187,8 +187,7 @@ implements Parser {
|
|||
}
|
||||
}
|
||||
|
||||
public plasmaParserDocument parse(URL location, String mimeType,String charset,
|
||||
InputStream source) throws ParserException, InterruptedException {
|
||||
public plasmaParserDocument parse(URL location, String mimeType,String charset, InputStream source) throws ParserException, InterruptedException {
|
||||
File dstFile = null;
|
||||
try {
|
||||
dstFile = File.createTempFile("mimeTypeParser",".tmp");
|
||||
|
@ -208,6 +207,7 @@ implements Parser {
|
|||
|
||||
public void reset() {
|
||||
// Nothing to do here at the moment
|
||||
super.reset();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -198,7 +198,7 @@ public class odtParser extends AbstractParser implements Parser {
|
|||
|
||||
public void reset() {
|
||||
// Nothing to do here at the moment
|
||||
|
||||
super.reset();
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
|
|
|
@ -132,7 +132,7 @@ public class pdfParser extends AbstractParser implements Parser {
|
|||
}
|
||||
|
||||
// creating a writer for output
|
||||
if ((this.fileSize != -1) && (this.fileSize > Parser.MAX_KEEP_IN_MEMORY_SIZE)) {
|
||||
if ((this.contentLength == -1) || (this.contentLength > Parser.MAX_KEEP_IN_MEMORY_SIZE)) {
|
||||
writerFile = File.createTempFile("pdfParser",".tmp");
|
||||
writer = new OutputStreamWriter(new FileOutputStream(writerFile),"UTF-8");
|
||||
} else {
|
||||
|
@ -199,7 +199,8 @@ public class pdfParser extends AbstractParser implements Parser {
|
|||
}
|
||||
|
||||
public void reset() {
|
||||
this.fileSize = -1;
|
||||
// Nothing to do here at the moment
|
||||
super.reset();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -170,7 +170,7 @@ public class rpmParser extends AbstractParser implements Parser {
|
|||
|
||||
public void reset() {
|
||||
// Nothing to do here at the moment
|
||||
|
||||
super.reset();
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
|
|
|
@ -211,8 +211,8 @@ public class rssParser extends AbstractParser implements Parser {
|
|||
}
|
||||
|
||||
public void reset() {
|
||||
// TODO Auto-generated method stub
|
||||
|
||||
// Nothing to do here at the moment
|
||||
super.reset();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -124,6 +124,7 @@ implements Parser {
|
|||
|
||||
public void reset() {
|
||||
// Nothing to do here at the moment
|
||||
super.reset();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -104,7 +104,7 @@ public class tarParser extends AbstractParser implements Parser {
|
|||
File outputFile = null;
|
||||
plasmaParserDocument subDoc = null;
|
||||
try {
|
||||
if ((this.fileSize != -1) && (this.fileSize > Parser.MAX_KEEP_IN_MEMORY_SIZE)) {
|
||||
if ((this.contentLength == -1) || (this.contentLength > Parser.MAX_KEEP_IN_MEMORY_SIZE)) {
|
||||
outputFile = File.createTempFile("zipParser",".tmp");
|
||||
docText = new BufferedOutputStream(new FileOutputStream(outputFile));
|
||||
} else {
|
||||
|
@ -252,6 +252,6 @@ public class tarParser extends AbstractParser implements Parser {
|
|||
|
||||
public void reset() {
|
||||
// Nothing to do here at the moment
|
||||
|
||||
super.reset();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -263,7 +263,7 @@ public class vcfParser extends AbstractParser implements Parser {
|
|||
|
||||
public void reset() {
|
||||
// Nothing to do here at the moment
|
||||
|
||||
super.reset();
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
|
|
|
@ -102,7 +102,7 @@ public class zipParser extends AbstractParser implements Parser {
|
|||
File outputFile = null;
|
||||
plasmaParserDocument subDoc = null;
|
||||
try {
|
||||
if ((this.fileSize != -1) && (this.fileSize > Parser.MAX_KEEP_IN_MEMORY_SIZE)) {
|
||||
if ((this.contentLength == -1) || (this.contentLength > Parser.MAX_KEEP_IN_MEMORY_SIZE)) {
|
||||
outputFile = File.createTempFile("zipParser",".tmp");
|
||||
docText = new BufferedOutputStream(new FileOutputStream(outputFile));
|
||||
} else {
|
||||
|
@ -236,6 +236,6 @@ public class zipParser extends AbstractParser implements Parser {
|
|||
|
||||
public void reset() {
|
||||
// Nothing to do here at the moment
|
||||
|
||||
super.reset();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -671,11 +671,16 @@ public final class plasmaCondenser {
|
|||
}
|
||||
*/
|
||||
|
||||
public static Iterator getWords(InputStream input) {
|
||||
if (input == null) return null;
|
||||
plasmaCondenser condenser = new plasmaCondenser(input);
|
||||
return condenser.words();
|
||||
}
|
||||
|
||||
public static Iterator getWords(byte[] text) {
|
||||
if (text == null) return null;
|
||||
ByteArrayInputStream buffer = new ByteArrayInputStream(text);
|
||||
plasmaCondenser condenser = new plasmaCondenser(buffer);
|
||||
return condenser.words();
|
||||
return getWords(buffer);
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
|
|
|
@ -163,15 +163,23 @@ public final class plasmaCrawlLoader extends Thread {
|
|||
return this.theThreadGroup;
|
||||
}
|
||||
|
||||
private void execute(plasmaCrawlLoaderMessage theMsg) throws Exception {
|
||||
private void execute(plasmaCrawlLoaderMessage theMsg, boolean useThreadPool) throws Exception {
|
||||
// getting the protocol of the next URL
|
||||
String protocol = theMsg.url.getProtocol();
|
||||
|
||||
// TODO: remove this
|
||||
if (protocol.equals("https")) protocol = "http";
|
||||
|
||||
// get a new worker thread
|
||||
plasmaCrawlWorker theWorker = null;
|
||||
if (useThreadPool) {
|
||||
// getting a new crawler from the crawler pool
|
||||
plasmaCrawlWorker theWorker = (plasmaCrawlWorker) this.crawlwerPool.borrowObject(protocol);
|
||||
theWorker = (plasmaCrawlWorker) this.crawlwerPool.borrowObject(protocol);
|
||||
} else {
|
||||
// create a new one
|
||||
theWorker = (plasmaCrawlWorker) this.crawlwerPool.getFactory().makeObject(protocol,false);
|
||||
}
|
||||
|
||||
if (theWorker == null) {
|
||||
this.log.logWarning("Unsupported protocol '" + protocol + "' in url " + theMsg.url);
|
||||
} else {
|
||||
|
@ -187,7 +195,7 @@ public final class plasmaCrawlLoader extends Thread {
|
|||
plasmaCrawlLoaderMessage theMsg = this.theQueue.waitForMessage();
|
||||
|
||||
// start new crawl job
|
||||
this.execute(theMsg);
|
||||
this.execute(theMsg, true);
|
||||
|
||||
} catch (InterruptedException e) {
|
||||
Thread.interrupted();
|
||||
|
@ -218,7 +226,8 @@ public final class plasmaCrawlLoader extends Thread {
|
|||
String initiator,
|
||||
int depth,
|
||||
plasmaCrawlProfile.entry profile,
|
||||
int timeout
|
||||
int timeout,
|
||||
boolean keepInMemory
|
||||
) throws plasmaCrawlerException {
|
||||
|
||||
plasmaHTCache.Entry result = null;
|
||||
|
@ -235,13 +244,14 @@ public final class plasmaCrawlLoader extends Thread {
|
|||
profile,
|
||||
crawlingPriority,
|
||||
true,
|
||||
timeout
|
||||
timeout,
|
||||
keepInMemory
|
||||
);
|
||||
|
||||
|
||||
try {
|
||||
// start new crawl job
|
||||
this.execute(theMsg);
|
||||
this.execute(theMsg, false);
|
||||
|
||||
// wait for the crawl job result
|
||||
result = theMsg.waitForResult();
|
||||
|
@ -283,7 +293,8 @@ public final class plasmaCrawlLoader extends Thread {
|
|||
profile, // crawling profile
|
||||
crawlingPriority, // crawling priority
|
||||
false, // only download documents whose mimetypes are enabled for the crawler
|
||||
-1 // use default crawler timeout
|
||||
-1, // use default crawler timeout
|
||||
false // resource should not be kept in memory
|
||||
);
|
||||
|
||||
// adding the message to the queue
|
||||
|
|
|
@ -56,6 +56,7 @@ public final class plasmaCrawlLoaderMessage {
|
|||
public final plasmaCrawlProfile.entry profile;
|
||||
public final boolean acceptAllContent;
|
||||
public final int timeout;
|
||||
public final boolean keepInMemory;
|
||||
|
||||
private serverSemaphore resultSync = null;
|
||||
private plasmaHTCache.Entry result;
|
||||
|
@ -71,7 +72,8 @@ public final class plasmaCrawlLoaderMessage {
|
|||
plasmaCrawlProfile.entry profile,
|
||||
int crawlingPriority,
|
||||
boolean acceptAllContent,
|
||||
int timeout
|
||||
int timeout,
|
||||
boolean keepInMemory
|
||||
) {
|
||||
this.url = url;
|
||||
this.name = name;
|
||||
|
@ -82,6 +84,7 @@ public final class plasmaCrawlLoaderMessage {
|
|||
this.crawlingPriority = crawlingPriority;
|
||||
this.acceptAllContent = acceptAllContent;
|
||||
this.timeout = timeout;
|
||||
this.keepInMemory = keepInMemory;
|
||||
|
||||
this.resultSync = new serverSemaphore(0);
|
||||
this.result = null;
|
||||
|
|
|
@ -53,9 +53,12 @@
|
|||
|
||||
package de.anomic.plasma;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.InetAddress;
|
||||
import java.net.MalformedURLException;
|
||||
import java.util.Date;
|
||||
|
@ -701,10 +704,17 @@ public final class plasmaHTCache {
|
|||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param url
|
||||
* @return
|
||||
*
|
||||
* @deprecated don't use this function, to avoid OutOfMemory-Exceptions.
|
||||
* Use {@link #getResourceContentStream(URL)} instead
|
||||
*/
|
||||
public byte[] loadResourceContent(URL url) {
|
||||
// load the url as resource from the cache
|
||||
File f = getCachePath(url);
|
||||
if (f.exists()) try {
|
||||
if (f.exists() && f.canRead()) try {
|
||||
return serverFileUtils.read(f);
|
||||
} catch (IOException e) {
|
||||
return null;
|
||||
|
@ -712,6 +722,34 @@ public final class plasmaHTCache {
|
|||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the content of a cached resource as {@link InputStream}
|
||||
* @param url the requested resource
|
||||
* @return the resource content as {@link InputStream}. If no data
|
||||
* is available or the cached file is not readable, <code>null</code>
|
||||
* is returned.
|
||||
*/
|
||||
public InputStream getResourceContentStream(URL url) {
|
||||
// load the url as resource from the cache
|
||||
File f = getCachePath(url);
|
||||
if (f.exists() && f.canRead()) try {
|
||||
return new BufferedInputStream(new FileInputStream(f));
|
||||
} catch (IOException e) {
|
||||
this.log.logSevere("Unable to create a BufferedInputStream from file " + f,e);
|
||||
return null;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public long getResourceContentLength(URL url) {
|
||||
// load the url as resource from the cache
|
||||
File f = getCachePath(url);
|
||||
if (f.exists() && f.canRead()) {
|
||||
return f.length();
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
public static boolean isPOST(String urlString) {
|
||||
return (urlString.indexOf("?") >= 0 ||
|
||||
urlString.indexOf("&") >= 0);
|
||||
|
|
|
@ -45,11 +45,13 @@
|
|||
package de.anomic.plasma;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.File;
|
||||
import java.io.FileFilter;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FilenameFilter;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URI;
|
||||
import java.util.Arrays;
|
||||
|
@ -465,16 +467,25 @@ public final class plasmaParser {
|
|||
} catch (Exception e) {/* ignore this */}
|
||||
}
|
||||
|
||||
public plasmaParserDocument parseSource(URL location, String mimeType, String charset, byte[] source)
|
||||
public plasmaParserDocument parseSource(URL location, String mimeType, String charset, byte[] sourceArray)
|
||||
throws InterruptedException, ParserException {
|
||||
File tempFile = null;
|
||||
ByteArrayInputStream byteIn = null;
|
||||
try {
|
||||
// creating a temp file to store the byte array
|
||||
tempFile = File.createTempFile("parseSource", ".tmp");
|
||||
serverFileUtils.write(source, tempFile);
|
||||
if (this.theLogger.isFine())
|
||||
this.theLogger.logFine("Parsing '" + location + "' from byte-array");
|
||||
|
||||
// testing if the resource is not empty
|
||||
if (sourceArray == null || sourceArray.length == 0) {
|
||||
String errorMsg = "No resource content available.";
|
||||
this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
|
||||
throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT);
|
||||
}
|
||||
|
||||
// creating an InputStream
|
||||
byteIn = new ByteArrayInputStream(sourceArray);
|
||||
|
||||
// parsing the byte-array input stream
|
||||
return parseSource(location, mimeType, charset, tempFile);
|
||||
return parseSource(location, mimeType, charset, sourceArray.length, byteIn);
|
||||
|
||||
} catch (Exception e) {
|
||||
// Interrupted- and Parser-Exceptions should pass through
|
||||
|
@ -482,20 +493,65 @@ public final class plasmaParser {
|
|||
if (e instanceof ParserException) throw (ParserException) e;
|
||||
|
||||
// log unexpected error
|
||||
this.theLogger.logSevere("Unexpected exception in parseSource1: " + e.getMessage(), e);
|
||||
this.theLogger.logSevere("Unexpected exception in parseSource from byte-array: " + e.getMessage(), e);
|
||||
throw new ParserException("Unexpected exception while parsing " + location,location, e);
|
||||
} finally {
|
||||
if (tempFile != null) try { tempFile.delete(); } catch (Exception ex){/* ignore this */}
|
||||
if (byteIn != null) try { byteIn.close(); } catch (Exception ex){/* ignore this */}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public plasmaParserDocument parseSource(URL location, String theMimeType, String theDocumentCharset, File sourceFile)
|
||||
throws InterruptedException, ParserException {
|
||||
public plasmaParserDocument parseSource(URL location, String theMimeType, String theDocumentCharset, File sourceFile) throws InterruptedException, ParserException {
|
||||
|
||||
BufferedInputStream sourceStream = null;
|
||||
try {
|
||||
if (this.theLogger.isFine())
|
||||
this.theLogger.logFine("Parsing '" + location + "' from file");
|
||||
|
||||
// testing if the resource is not empty
|
||||
if (!(sourceFile.exists() && sourceFile.canRead() && sourceFile.length() > 0)) {
|
||||
String errorMsg = sourceFile.exists() ? "Empty resource file." : "No resource content available.";
|
||||
this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
|
||||
throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT);
|
||||
}
|
||||
|
||||
// create a new InputStream
|
||||
sourceStream = new BufferedInputStream(new FileInputStream(sourceFile));
|
||||
|
||||
// parsing the data
|
||||
return this.parseSource(location, theMimeType, theDocumentCharset, sourceFile.length(), sourceStream);
|
||||
|
||||
} catch (Exception e) {
|
||||
// Interrupted- and Parser-Exceptions should pass through
|
||||
if (e instanceof InterruptedException) throw (InterruptedException) e;
|
||||
if (e instanceof ParserException) throw (ParserException) e;
|
||||
|
||||
// log unexpected error
|
||||
this.theLogger.logSevere("Unexpected exception in parseSource from File: " + e.getMessage(), e);
|
||||
throw new ParserException("Unexpected exception while parsing " + location,location, e);
|
||||
} finally {
|
||||
if (sourceStream != null) try { sourceStream.close(); } catch (Exception ex){/* ignore this */}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* To parse a resource from an {@link InputStream}
|
||||
* @param location the URL of the resource
|
||||
* @param theMimeType the resource mimetype (<code>null</code> if unknown)
|
||||
* @param theDocumentCharset the charset of the resource (<code>null</code> if unknown)
|
||||
* @param contentLength the content length of the resource (<code>-1</code> if unknown)
|
||||
* @param sourceStream an {@link InputStream} containing the resource body
|
||||
* @return the parsed {@link plasmaParserDocument document}
|
||||
* @throws InterruptedException
|
||||
* @throws ParserException
|
||||
*/
|
||||
public plasmaParserDocument parseSource(URL location, String theMimeType, String theDocumentCharset, long contentLength, InputStream sourceStream) throws InterruptedException, ParserException {
|
||||
Parser theParser = null;
|
||||
String mimeType = null;
|
||||
try {
|
||||
if (this.theLogger.isFine())
|
||||
this.theLogger.logFine("Parsing '" + location + "' from stream");
|
||||
|
||||
// getting the mimetype of the document
|
||||
mimeType = getRealMimeType(theMimeType);
|
||||
|
||||
|
@ -513,76 +569,22 @@ public final class plasmaParser {
|
|||
throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_WRONG_MIMETYPE_OR_EXT);
|
||||
}
|
||||
|
||||
// testing if the resource is not empty
|
||||
if (!(sourceFile.exists() && sourceFile.canRead() && sourceFile.length() > 0)) {
|
||||
String errorMsg = sourceFile.exists() ? "Empty resource file." : "No resource content available.";
|
||||
this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
|
||||
throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT);
|
||||
}
|
||||
|
||||
|
||||
if (this.theLogger.isFine())
|
||||
this.theLogger.logInfo("Parsing " + location + " with mimeType '" + mimeType +
|
||||
"' and file extension '" + fileExt + "'.");
|
||||
|
||||
/*
|
||||
* There are some problematic mimeType - fileExtension combinations where we have to enforce
|
||||
* a mimeType detection to get the proper parser for the content
|
||||
*
|
||||
* - application/zip + .odt
|
||||
* - text/plain + .odt
|
||||
* - text/plain + .vcf
|
||||
* - text/xml + .rss
|
||||
* - text/xml + .atom
|
||||
*
|
||||
* In all these cases we can trust the fileExtension and have to determine the proper mimeType.
|
||||
*
|
||||
*/
|
||||
|
||||
// // Handling of not trustable mimeTypes
|
||||
// // - text/plain
|
||||
// // - text/xml
|
||||
// // - application/octet-stream
|
||||
// // - application/zip
|
||||
// if (
|
||||
// (mimeType.equalsIgnoreCase("text/plain") && !fileExt.equalsIgnoreCase("txt")) ||
|
||||
// (mimeType.equalsIgnoreCase("text/xml") && !fileExt.equalsIgnoreCase("txt"))
|
||||
// ) {
|
||||
// if (this.theLogger.isFine())
|
||||
// this.theLogger.logFine("Document " + location + " has an mimeType '" + mimeType +
|
||||
// "' that seems not to be correct for file extension '" + fileExt + "'.");
|
||||
//
|
||||
// if (enabledParserList.containsKey("application/octet-stream")) {
|
||||
// theParser = this.getParser("application/octet-stream");
|
||||
// Object newMime = theParser.getClass().getMethod("getMimeType", new Class[]{File.class}).invoke(theParser, sourceFile);
|
||||
// if (newMime == null)
|
||||
// if (newMime instanceof String) {
|
||||
// String newMimeType = (String)newMime;
|
||||
// if ((newMimeType.equals("application/octet-stream")) {
|
||||
// return null;
|
||||
// }
|
||||
// mimeType = newMimeType;
|
||||
// }
|
||||
// } else {
|
||||
// return null;
|
||||
// }
|
||||
// } else if (mimeType.equalsIgnoreCase("application/zip") && fileExt.equalsIgnoreCase("odt")){
|
||||
// if (enabledParserList.containsKey("application/vnd.oasis.opendocument.text")) {
|
||||
// mimeType = "application/vnd.oasis.opendocument.text";
|
||||
// } else {
|
||||
// return null;
|
||||
// }
|
||||
// }
|
||||
|
||||
// getting the correct parser for the given mimeType
|
||||
theParser = this.getParser(mimeType);
|
||||
|
||||
// if a parser was found we use it ...
|
||||
plasmaParserDocument doc = null;
|
||||
if (theParser != null) {
|
||||
doc = theParser.parse(location, mimeType,documentCharset,sourceFile);
|
||||
// set the content length of the resource
|
||||
theParser.setContentLength(contentLength);
|
||||
// parse the resource
|
||||
doc = theParser.parse(location, mimeType,documentCharset,sourceStream);
|
||||
} else if (realtimeParsableMimeTypesContains(mimeType)) {
|
||||
doc = parseHtml(location, mimeType, documentCharset, sourceFile);
|
||||
doc = parseHtml(location, mimeType, documentCharset, sourceStream);
|
||||
} else {
|
||||
String errorMsg = "No parser available to parse mimetype '" + mimeType + "'";
|
||||
this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
|
||||
|
@ -614,11 +616,10 @@ public final class plasmaParser {
|
|||
}
|
||||
}
|
||||
|
||||
private plasmaParserDocument parseHtml(URL location, String mimeType, String documentCharset, File sourceFile) throws IOException, ParserException {
|
||||
private plasmaParserDocument parseHtml(URL location, String mimeType, String documentCharset, InputStream sourceStream) throws IOException, ParserException {
|
||||
|
||||
// ...otherwise we make a scraper and transformer
|
||||
FileInputStream fileIn = new FileInputStream(sourceFile);
|
||||
htmlFilterInputStream htmlFilter = new htmlFilterInputStream(fileIn,documentCharset,location,null,false);
|
||||
htmlFilterInputStream htmlFilter = new htmlFilterInputStream(sourceStream,documentCharset,location,null,false);
|
||||
String charset = htmlFilter.detectCharset();
|
||||
if (charset == null) {
|
||||
charset = documentCharset;
|
||||
|
@ -763,7 +764,7 @@ public final class plasmaParser {
|
|||
//javac -classpath lib/commons-collections.jar:lib/commons-pool-1.2.jar -sourcepath source source/de/anomic/plasma/plasmaParser.java
|
||||
//java -cp source:lib/commons-collections.jar:lib/commons-pool-1.2.jar de.anomic.plasma.plasmaParser bug.html bug.out
|
||||
try {
|
||||
File contentFile = null;
|
||||
Object content = null;
|
||||
URL contentURL = null;
|
||||
String contentMimeType = "application/octet-stream";
|
||||
String charSet = "UTF-8";
|
||||
|
@ -774,17 +775,13 @@ public final class plasmaParser {
|
|||
|
||||
String mode = args[0];
|
||||
if (mode.equalsIgnoreCase("-f")) {
|
||||
contentFile = new File(args[1]);
|
||||
contentURL = new URL(contentFile);
|
||||
content = new File(args[1]);
|
||||
contentURL = new URL((File)content);
|
||||
} else if (mode.equalsIgnoreCase("-u")) {
|
||||
contentURL = new URL(args[1]);
|
||||
|
||||
// downloading the document content
|
||||
byte[] contentBytes = httpc.singleGET(contentURL, contentURL.getHost(), 10000, null, null, null);
|
||||
|
||||
contentFile = File.createTempFile("content",".tmp");
|
||||
contentFile.deleteOnExit();
|
||||
serverFileUtils.write(contentBytes, contentFile);
|
||||
content = httpc.singleGET(contentURL, contentURL.getHost(), 10000, null, null, null);
|
||||
}
|
||||
|
||||
if ((args.length >= 4)&&(args[2].equalsIgnoreCase("-m"))) {
|
||||
|
@ -805,7 +802,12 @@ public final class plasmaParser {
|
|||
plasmaParser.enableAllParsers(PARSER_MODE_PROXY);
|
||||
|
||||
// parsing the content
|
||||
plasmaParserDocument document = theParser.parseSource(contentURL, contentMimeType, charSet, contentFile);
|
||||
plasmaParserDocument document = null;
|
||||
if (content instanceof byte[]) {
|
||||
document = theParser.parseSource(contentURL, contentMimeType, charSet, (byte[])content);
|
||||
} else if (content instanceof File) {
|
||||
document = theParser.parseSource(contentURL, contentMimeType, charSet, (File)content);
|
||||
}
|
||||
|
||||
// printing out all parsed sentences
|
||||
if (document != null) {
|
||||
|
|
|
@ -41,6 +41,7 @@
|
|||
|
||||
package de.anomic.plasma;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.net.MalformedURLException;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
|
@ -59,13 +60,18 @@ public final class plasmaSearchImages {
|
|||
long start = System.currentTimeMillis();
|
||||
this.images = new TreeSet();
|
||||
if (maxTime > 10) {
|
||||
byte[] res = sc.getResource(url, true, (int) maxTime);
|
||||
Object[] resource = sc.getResource(url, true, (int) maxTime);
|
||||
InputStream res = (InputStream) resource[0];
|
||||
Long resLength = (Long) resource[1];
|
||||
if (res != null) {
|
||||
plasmaParserDocument document = null;
|
||||
try {
|
||||
document = sc.parseDocument(url, res);
|
||||
// parse the document
|
||||
document = sc.parseDocument(url, resLength.longValue(), res);
|
||||
} catch (ParserException e) {
|
||||
// parsing failed
|
||||
} finally {
|
||||
try { res.close(); } catch (Exception e) {/* ignore this */}
|
||||
}
|
||||
if (document == null) return;
|
||||
|
||||
|
|
|
@ -44,7 +44,9 @@
|
|||
|
||||
package de.anomic.plasma;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.Enumeration;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
|
@ -187,46 +189,62 @@ public class plasmaSnippetCache {
|
|||
* LOADING RESOURCE DATA
|
||||
* =========================================================================== */
|
||||
// if the snippet is not in the cache, we can try to get it from the htcache
|
||||
byte[] resource = null;
|
||||
IResourceInfo docInfo = null;
|
||||
long resContentLength = 0;
|
||||
InputStream resContent = null;
|
||||
IResourceInfo resInfo = null;
|
||||
try {
|
||||
// trying to load the resource from the cache
|
||||
resource = this.cacheManager.loadResourceContent(url);
|
||||
resContent = this.cacheManager.getResourceContentStream(url);
|
||||
if (resContent != null) {
|
||||
// if the content was found
|
||||
resContentLength = this.cacheManager.getResourceContentLength(url);
|
||||
|
||||
// getting resource metadata
|
||||
resInfo = this.cacheManager.loadResourceInfo(url);
|
||||
|
||||
} else if (fetchOnline) {
|
||||
// if not found try to download it
|
||||
if ((resource == null) && (fetchOnline)) {
|
||||
// download resource using the crawler
|
||||
plasmaHTCache.Entry entry = loadResourceFromWeb(url, timeout);
|
||||
|
||||
// download resource using the crawler and keep resource in memory if possible
|
||||
plasmaHTCache.Entry entry = loadResourceFromWeb(url, timeout, true);
|
||||
|
||||
// getting resource metadata (e.g. the http headers for http resources)
|
||||
if (entry != null) docInfo = entry.getDocumentInfo();
|
||||
if (entry != null) {
|
||||
resInfo = entry.getDocumentInfo();
|
||||
|
||||
// read resource body (if it is there)
|
||||
resource = entry.cacheArray();
|
||||
byte []resourceArray = entry.cacheArray();
|
||||
if (resourceArray != null) {
|
||||
resContent = new ByteArrayInputStream(resourceArray);
|
||||
resContentLength = resourceArray.length;
|
||||
} else {
|
||||
resContent = this.cacheManager.getResourceContentStream(url);
|
||||
resContentLength = this.cacheManager.getResourceContentLength(url);
|
||||
}
|
||||
}
|
||||
|
||||
// in case the resource was not in RAM, read it from disk
|
||||
if (resource == null) resource = this.cacheManager.loadResourceContent(url);
|
||||
|
||||
// if it is still not available, throw exception
|
||||
if (resource == null) return new Snippet(null, ERROR_RESOURCE_LOADING, "error loading resource, plasmaHTCache.Entry cache is NULL");
|
||||
// if it is still not available, report an error
|
||||
if (resContent == null) return new Snippet(null, ERROR_RESOURCE_LOADING, "error loading resource, plasmaHTCache.Entry cache is NULL");
|
||||
|
||||
source = SOURCE_WEB;
|
||||
} else {
|
||||
return new Snippet(null, ERROR_SOURCE_LOADING, "no resource available");
|
||||
}
|
||||
} catch (Exception e) {
|
||||
if (!(e instanceof plasmaCrawlerException)) e.printStackTrace();
|
||||
return new Snippet(null, ERROR_SOURCE_LOADING, "error loading resource: " + e.getMessage());
|
||||
}
|
||||
|
||||
if (resource == null) return new Snippet(null, ERROR_SOURCE_LOADING, "no resource available");
|
||||
|
||||
/* ===========================================================================
|
||||
* PARSING RESOURCE
|
||||
* =========================================================================== */
|
||||
plasmaParserDocument document = null;
|
||||
try {
|
||||
document = parseDocument(url, resource, docInfo);
|
||||
document = parseDocument(url, resContentLength, resContent, resInfo);
|
||||
} catch (ParserException e) {
|
||||
return new Snippet(null, ERROR_PARSER_FAILED, e.getMessage()); // cannot be parsed
|
||||
} finally {
|
||||
try { resContent.close(); } catch (Exception e) {/* ignore this */}
|
||||
}
|
||||
if (document == null) return new Snippet(null, ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed
|
||||
|
||||
|
@ -263,30 +281,40 @@ public class plasmaSnippetCache {
|
|||
* @return the parsed document as {@link plasmaParserDocument}
|
||||
*/
|
||||
public plasmaParserDocument retrieveDocument(URL url, boolean fetchOnline) {
|
||||
byte[] resource = null;
|
||||
IResourceInfo docInfo = null;
|
||||
try {
|
||||
// trying to load the resource body from cache
|
||||
resource = this.cacheManager.loadResourceContent(url);
|
||||
InputStream content = this.cacheManager.getResourceContentStream(url);
|
||||
long resourceLength = this.cacheManager.getResourceContentLength(url);
|
||||
|
||||
// if not available try to load resource from web
|
||||
if ((fetchOnline) && (resource == null)) {
|
||||
if ((fetchOnline) && (content == null)) {
|
||||
// download resource using crawler
|
||||
plasmaHTCache.Entry entry = loadResourceFromWeb(url, 5000);
|
||||
plasmaHTCache.Entry entry = loadResourceFromWeb(url, 5000, true);
|
||||
|
||||
// fetching metadata of the resource (e.g. http headers for http resource)
|
||||
if (entry != null) docInfo = entry.getDocumentInfo();
|
||||
if (entry != null) {
|
||||
docInfo = entry.getDocumentInfo();
|
||||
|
||||
// getting the resource body from the cache
|
||||
resource = this.cacheManager.loadResourceContent(url);
|
||||
byte[] resourceArray = entry.cacheArray();
|
||||
if (resourceArray != null) {
|
||||
// read resource body (if it is there)
|
||||
content = new ByteArrayInputStream(resourceArray);
|
||||
resourceLength = resourceArray.length;
|
||||
} else {
|
||||
// in case the resource was not in RAM, read it from disk
|
||||
content = this.cacheManager.getResourceContentStream(url);
|
||||
resourceLength = this.cacheManager.getResourceContentLength(url);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// trying to load resource metadata
|
||||
docInfo = this.cacheManager.loadResourceInfo(url);
|
||||
}
|
||||
|
||||
// parsing document
|
||||
if (resource == null) return null;
|
||||
return parseDocument(url, resource, docInfo);
|
||||
if (content == null) return null;
|
||||
return parseDocument(url, resourceLength, content, docInfo);
|
||||
} catch (ParserException e) {
|
||||
this.log.logWarning("Unable to parse resource. " + e.getMessage());
|
||||
return null;
|
||||
|
@ -446,15 +474,24 @@ public class plasmaSnippetCache {
|
|||
return map;
|
||||
}
|
||||
|
||||
public plasmaParserDocument parseDocument(URL url, byte[] resource) throws ParserException {
|
||||
return parseDocument(url, resource, null);
|
||||
public plasmaParserDocument parseDocument(URL url, long contentLength, InputStream resourceStream) throws ParserException {
|
||||
return parseDocument(url, contentLength, resourceStream, null);
|
||||
}
|
||||
|
||||
public plasmaParserDocument parseDocument(URL url, byte[] resource, IResourceInfo docInfo) throws ParserException {
|
||||
/**
|
||||
* Parse the resource
|
||||
* @param url the URL of the resource
|
||||
* @param contentLength the contentLength of the resource
|
||||
* @param resourceStream the resource body as stream
|
||||
* @param docInfo metadata about the resource
|
||||
* @return the extracted data
|
||||
* @throws ParserException
|
||||
*/
|
||||
public plasmaParserDocument parseDocument(URL url, long contentLength, InputStream resourceStream, IResourceInfo docInfo) throws ParserException {
|
||||
try {
|
||||
if (resource == null) return null;
|
||||
if (resourceStream == null) return null;
|
||||
|
||||
// if no resource metadata is available, try to load it
|
||||
// STEP 1: if no resource metadata is available, try to load it from cache
|
||||
if (docInfo == null) {
|
||||
// try to get the header from the htcache directory
|
||||
try {
|
||||
|
@ -464,9 +501,11 @@ public class plasmaSnippetCache {
|
|||
}
|
||||
}
|
||||
|
||||
// TODO: we need a better solution here
|
||||
// encapsulate this in the crawlLoader class
|
||||
// STEP 2: if the metadata is still null try to download it from web
|
||||
if ((docInfo == null) && (url.getProtocol().startsWith("http"))) {
|
||||
// TODO: we need a better solution here
|
||||
// e.g. encapsulate this in the crawlLoader class
|
||||
|
||||
// getting URL mimeType
|
||||
try {
|
||||
httpHeader header = httpc.whead(url, url.getHost(), 10000, null, null, this.sb.remoteProxyConfig);
|
||||
|
@ -476,6 +515,7 @@ public class plasmaSnippetCache {
|
|||
}
|
||||
}
|
||||
|
||||
// STEP 3: if the metadata is still null try to guess the mimeType of the resource
|
||||
if (docInfo == null) {
|
||||
String filename = this.cacheManager.getCachePath(url).getName();
|
||||
int p = filename.lastIndexOf('.');
|
||||
|
@ -495,12 +535,12 @@ public class plasmaSnippetCache {
|
|||
supposedMime = plasmaParser.getMimeTypeByFileExt(filename.substring(p + 1));
|
||||
}
|
||||
|
||||
return this.parser.parseSource(url, supposedMime, null, resource);
|
||||
return this.parser.parseSource(url, supposedMime, null, contentLength, resourceStream);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
if (plasmaParser.supportedMimeTypesContains(docInfo.getMimeType())) {
|
||||
return this.parser.parseSource(url, docInfo.getMimeType(), docInfo.getCharacterEncoding(), resource);
|
||||
return this.parser.parseSource(url, docInfo.getMimeType(), docInfo.getCharacterEncoding(), contentLength, resourceStream);
|
||||
}
|
||||
return null;
|
||||
} catch (InterruptedException e) {
|
||||
|
@ -509,27 +549,57 @@ public class plasmaSnippetCache {
|
|||
}
|
||||
}
|
||||
|
||||
public byte[] getResource(URL url, boolean fetchOnline, int socketTimeout) {
|
||||
/**
|
||||
*
|
||||
* @param url
|
||||
* @param fetchOnline
|
||||
* @param socketTimeout
|
||||
* @return an Object array containing
|
||||
* <table>
|
||||
* <tr><td>[0]</td><td>the content as {@link InputStream}</td></tr>
|
||||
* <tr><td>[1]</td><td>the content-length as {@link Long} (<code>-1</code> if unknown)</td></tr>
|
||||
* </table>
|
||||
*/
|
||||
public Object[] getResource(URL url, boolean fetchOnline, int socketTimeout) {
|
||||
// load the url as resource from the web
|
||||
try {
|
||||
long contentLength = -1;
|
||||
|
||||
// trying to load the resource body from cache
|
||||
byte[] resource = cacheManager.loadResourceContent(url);
|
||||
|
||||
InputStream resource = this.cacheManager.getResourceContentStream(url);
|
||||
if (resource != null) {
|
||||
contentLength = this.cacheManager.getResourceContentLength(url);
|
||||
} else if (fetchOnline) {
|
||||
// if the content is not available in cache try to download it from web
|
||||
if ((fetchOnline) && (resource == null)) {
|
||||
// try to download the resource using a crawler
|
||||
loadResourceFromWeb(url, (socketTimeout < 0) ? -1 : socketTimeout);
|
||||
|
||||
// get the content from cache
|
||||
resource = cacheManager.loadResourceContent(url);
|
||||
// try to download the resource using a crawler
|
||||
plasmaHTCache.Entry entry = loadResourceFromWeb(url, (socketTimeout < 0) ? -1 : socketTimeout, true);
|
||||
|
||||
// read resource body (if it is there)
|
||||
byte[] resourceArray = entry.cacheArray();
|
||||
|
||||
// in case the resource was not in RAM, read it from disk
|
||||
if (resourceArray == null) {
|
||||
resource = this.cacheManager.getResourceContentStream(url);
|
||||
contentLength = this.cacheManager.getResourceContentLength(url);
|
||||
} else {
|
||||
resource = new ByteArrayInputStream(resourceArray);
|
||||
contentLength = resourceArray.length;
|
||||
}
|
||||
return resource;
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
return new Object[]{resource,new Long(contentLength)};
|
||||
} catch (IOException e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
public plasmaHTCache.Entry loadResourceFromWeb(URL url, int socketTimeout) throws plasmaCrawlerException {
|
||||
public plasmaHTCache.Entry loadResourceFromWeb(
|
||||
URL url,
|
||||
int socketTimeout,
|
||||
boolean keepInMemory
|
||||
) throws plasmaCrawlerException {
|
||||
|
||||
plasmaHTCache.Entry result = this.sb.cacheLoader.loadSync(
|
||||
url,
|
||||
|
@ -538,7 +608,8 @@ public class plasmaSnippetCache {
|
|||
null,
|
||||
0,
|
||||
null,
|
||||
socketTimeout
|
||||
socketTimeout,
|
||||
keepInMemory
|
||||
);
|
||||
|
||||
return result;
|
||||
|
|
|
@ -105,6 +105,7 @@ package de.anomic.plasma;
|
|||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.lang.reflect.Constructor;
|
||||
import java.net.InetAddress;
|
||||
import java.net.MalformedURLException;
|
||||
|
@ -2181,17 +2182,32 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
|
|||
URL url = entry.url();
|
||||
if (url == null) return 0;
|
||||
|
||||
InputStream resourceContent = null;
|
||||
try {
|
||||
// get set of words
|
||||
// Set words = plasmaCondenser.getWords(getText(getResource(url, fetchOnline)));
|
||||
Iterator witer = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline, 10000)).getTextBytes());
|
||||
// get the resource content
|
||||
Object[] resource = snippetCache.getResource(url, fetchOnline, 10000);
|
||||
resourceContent = (InputStream) resource[0];
|
||||
Long resourceContentLength = (Long) resource[1];
|
||||
|
||||
// parse the resource
|
||||
plasmaParserDocument document = snippetCache.parseDocument(url, resourceContentLength.longValue(), resourceContent);
|
||||
|
||||
// getting parsed body input stream
|
||||
InputStream docBodyInputStream = document.getText();
|
||||
|
||||
// getting word iterator
|
||||
Iterator witer = plasmaCondenser.getWords(docBodyInputStream);
|
||||
|
||||
// delete all word references
|
||||
int count = removeReferences(urlhash, witer);
|
||||
|
||||
// finally delete the url entry itself
|
||||
urlPool.loadedURL.remove(urlhash);
|
||||
return count;
|
||||
} catch (ParserException e) {
|
||||
return 0;
|
||||
} finally {
|
||||
if (resourceContent != null) try { resourceContent.close(); } catch (Exception e) {/* ignore this */}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -144,7 +144,7 @@ public final class serverFileUtils {
|
|||
|
||||
public static void writeX(InputStream source, String inputCharset, Writer procOS, OutputStream bufferOS, String outputCharset) throws IOException {
|
||||
InputStreamReader sourceReader = new InputStreamReader(source,inputCharset);
|
||||
OutputStreamWriter bufferOSWriter = new OutputStreamWriter(bufferOS,outputCharset);
|
||||
OutputStreamWriter bufferOSWriter = (bufferOS==null)?null:new OutputStreamWriter(bufferOS,outputCharset);
|
||||
writeX(sourceReader,procOS,bufferOSWriter);
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user