mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
memory hacks
This commit is contained in:
parent
b4409cc803
commit
4540174fe0
|
@ -143,7 +143,7 @@ public class BookmarkHelper {
|
|||
//load the links
|
||||
final ContentScraper scraper = new ContentScraper(baseURL);
|
||||
//OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
|
||||
final Writer writer= new TransformerWriter(null,null,scraper, null, false);
|
||||
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
|
||||
FileUtils.copy(input,writer);
|
||||
writer.close();
|
||||
links = scraper.getAnchors();
|
||||
|
|
|
@ -1039,18 +1039,18 @@ public final class HTTPDFileHandler {
|
|||
|
||||
if (mimeType.startsWith("text")) {
|
||||
// every text-file distributed by yacy is UTF-8
|
||||
if(!path.startsWith("/repository")) {
|
||||
if (!path.startsWith("/repository")) {
|
||||
mimeType = mimeType + "; charset=UTF-8";
|
||||
} else {
|
||||
// detect charset of html-files
|
||||
if((path.endsWith("html") || path.endsWith("htm"))) {
|
||||
if ((path.endsWith("html") || path.endsWith("htm"))) {
|
||||
// save position
|
||||
fis.mark(1000);
|
||||
// scrape document to look up charset
|
||||
final ScraperInputStream htmlFilter = new ScraperInputStream(fis,"UTF-8",new DigestURI("http://localhost"),null,false);
|
||||
final ScraperInputStream htmlFilter = new ScraperInputStream(fis, "UTF-8", new DigestURI("http://localhost"), null, false);
|
||||
final String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset());
|
||||
if(charset != null)
|
||||
mimeType = mimeType + "; charset="+charset;
|
||||
htmlFilter.close();
|
||||
if (charset != null) mimeType = mimeType + "; charset="+charset;
|
||||
// reset position
|
||||
fis.reset();
|
||||
}
|
||||
|
|
|
@ -485,17 +485,24 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
final TransformerWriter writer = new TransformerWriter(null, null, scraper, null, false);
|
||||
try {
|
||||
FileUtils.copy(new CharArrayReader(inlineHtml), writer);
|
||||
writer.close();
|
||||
} catch (final IOException e) {
|
||||
Log.logException(e);
|
||||
return cleanLine(super.stripAll(inlineHtml));
|
||||
} finally {
|
||||
scraper.close();
|
||||
try {
|
||||
writer.close();
|
||||
} catch (IOException e) {
|
||||
}
|
||||
}
|
||||
for (final Map.Entry<MultiProtocolURI, Properties> entry: scraper.getAnchors().entrySet()) {
|
||||
mergeAnchors(entry.getKey(), entry.getValue());
|
||||
}
|
||||
this.images.putAll(scraper.images);
|
||||
|
||||
return cleanLine(super.stripAll(scraper.content.getChars()));
|
||||
String line = cleanLine(super.stripAll(scraper.content.getChars()));
|
||||
scraper.close();
|
||||
return line;
|
||||
}
|
||||
|
||||
private final static String cleanLine(final String s) {
|
||||
|
@ -885,14 +892,14 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
// scrape document to look up charset
|
||||
final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page),"UTF-8", new MultiProtocolURI("http://localhost"),null,false);
|
||||
String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset());
|
||||
if(charset == null)
|
||||
charset = Charset.defaultCharset().toString();
|
||||
htmlFilter.close();
|
||||
if (charset == null) charset = Charset.defaultCharset().toString();
|
||||
|
||||
// scrape content
|
||||
final ContentScraper scraper = new ContentScraper(new MultiProtocolURI("http://localhost"));
|
||||
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
|
||||
FileUtils.copy(new ByteArrayInputStream(page), writer, Charset.forName(charset));
|
||||
|
||||
writer.close();
|
||||
return scraper;
|
||||
}
|
||||
|
||||
|
|
|
@ -34,7 +34,6 @@ import java.util.TreeSet;
|
|||
|
||||
import net.yacy.cora.document.ASCII;
|
||||
import net.yacy.kelondro.io.CharBuffer;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
|
||||
public class ContentTransformer extends AbstractTransformer implements Transformer {
|
||||
|
||||
|
@ -90,11 +89,7 @@ public class ContentTransformer extends AbstractTransformer implements Transform
|
|||
}
|
||||
bb.append("</FONT> ");
|
||||
final char[] result = bb.getChars();
|
||||
try {
|
||||
bb.close();
|
||||
} catch (IOException e) {
|
||||
Log.logException(e);
|
||||
}
|
||||
bb.close();
|
||||
return result;
|
||||
}
|
||||
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
// $LastChangedBy$
|
||||
//
|
||||
// LICENSE
|
||||
//
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
|
@ -39,11 +39,11 @@ import net.yacy.cora.document.MultiProtocolURI;
|
|||
|
||||
|
||||
public class ScraperInputStream extends InputStream implements ScraperListener {
|
||||
|
||||
|
||||
private static final int MODE_PRESCAN = 0;
|
||||
private static final int MODE_PRESCAN_FINISHED = 1;
|
||||
private int mode = 1;
|
||||
|
||||
|
||||
private static final long preBufferSize = 4096;
|
||||
private long preRead = 0;
|
||||
private final BufferedInputStream bufferedIn;
|
||||
|
@ -51,10 +51,10 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
|
|||
private String detectedCharset;
|
||||
private boolean charsetChanged = false;
|
||||
private boolean endOfHead = false;
|
||||
|
||||
|
||||
private Reader reader;
|
||||
private Writer writer;
|
||||
|
||||
|
||||
public ScraperInputStream(
|
||||
final InputStream inStream,
|
||||
final String inputStreamCharset,
|
||||
|
@ -65,10 +65,10 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
|
|||
// create a input stream for buffereing
|
||||
this.bufferedIn = new BufferedInputStream(inStream, (int) preBufferSize);
|
||||
this.bufferedIn.mark((int) preBufferSize);
|
||||
|
||||
|
||||
final ContentScraper scraper = new ContentScraper(rooturl);
|
||||
scraper.registerHtmlFilterEventListener(this);
|
||||
|
||||
|
||||
try {
|
||||
this.reader = (inputStreamCharset == null) ? new InputStreamReader(this) : new InputStreamReader(this,inputStreamCharset);
|
||||
} catch (UnsupportedEncodingException e) {
|
||||
|
@ -78,17 +78,17 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
|
|||
// how is that possible?
|
||||
this.reader = new InputStreamReader(this);
|
||||
}
|
||||
}
|
||||
}
|
||||
this.writer = new TransformerWriter(null,null,scraper,transformer,passbyIfBinarySuspect);
|
||||
}
|
||||
|
||||
private static String extractCharsetFromMimetypeHeader(final String mimeType) {
|
||||
if (mimeType == null) return null;
|
||||
|
||||
|
||||
final String[] parts = mimeType.split(";");
|
||||
if (parts == null || parts.length <= 1) return null;
|
||||
|
||||
for (int i=1; i < parts.length; i++) {
|
||||
|
||||
for (int i=1; i < parts.length; i++) {
|
||||
final String param = parts[i].trim();
|
||||
if (param.startsWith("charset=")) {
|
||||
String charset = param.substring("charset=".length()).trim();
|
||||
|
@ -97,13 +97,14 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
|
|||
return charset.trim();
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void scrapeTag0(final String tagname, final Properties tagopts) {
|
||||
if (tagname == null || tagname.length() == 0) return;
|
||||
|
||||
|
||||
if (tagname.equalsIgnoreCase("meta")) {
|
||||
if (tagopts.containsKey("http-equiv")) {
|
||||
final String value = tagopts.getProperty("http-equiv");
|
||||
|
@ -113,7 +114,7 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
|
|||
this.detectedCharset = extractCharsetFromMimetypeHeader(contentType);
|
||||
if (this.detectedCharset != null && this.detectedCharset.length() > 0) {
|
||||
this.charsetChanged = true;
|
||||
} else if (tagopts.containsKey("charset")) {
|
||||
} else if (tagopts.containsKey("charset")) {
|
||||
// sometimes the charset property is configured as extra attribut. try it ...
|
||||
this.detectedCharset = tagopts.getProperty("charset");
|
||||
this.charsetChanged = true;
|
||||
|
@ -123,48 +124,54 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
|
|||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void scrapeTag1(final String tagname, final Properties tagopts, final char[] text) {
|
||||
if (tagname == null || tagname.length() == 0) return;
|
||||
|
||||
|
||||
if (tagname.equalsIgnoreCase("head")) {
|
||||
this.endOfHead = true;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public String detectCharset() throws IOException {
|
||||
this.mode = MODE_PRESCAN;
|
||||
|
||||
this.mode = MODE_PRESCAN;
|
||||
|
||||
// loop until we have detected the header element or the charset data
|
||||
int c;
|
||||
while ((c = this.reader.read())!= -1) {
|
||||
this.writer.write(c);
|
||||
if (this.charsetChanged) break; // thats enough
|
||||
}
|
||||
|
||||
|
||||
// free writer
|
||||
this.writer = null;
|
||||
// don't close writer here, otherwise it will shutdown our source stream
|
||||
this.writer = null;
|
||||
// don't close writer here, otherwise it will shutdown our source stream
|
||||
|
||||
// reset the buffer if not already done
|
||||
if (this.mode != MODE_PRESCAN_FINISHED) {
|
||||
this.mode++;
|
||||
this.bufferedIn.reset();
|
||||
}
|
||||
|
||||
|
||||
// return scanning result
|
||||
return (this.charsetChanged) ? this.detectedCharset : null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read() throws IOException {
|
||||
// mode 0 is called from within the detectCharset function
|
||||
if (this.mode == MODE_PRESCAN) {
|
||||
if (this.mode == MODE_PRESCAN) {
|
||||
if (this.endOfHead || this.charsetChanged || this.preRead >= preBufferSize - 1) {
|
||||
return -1;
|
||||
return -1;
|
||||
}
|
||||
this.preRead++;
|
||||
}
|
||||
this.preRead++;
|
||||
}
|
||||
return this.bufferedIn.read();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
if (this.writer != null) this.writer.close();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -127,11 +127,7 @@ public final class TransformerWriter extends Writer {
|
|||
}
|
||||
bb.append('>');
|
||||
final char[] result = bb.getChars();
|
||||
try {
|
||||
bb.close();
|
||||
} catch (final IOException e) {
|
||||
Log.logException(e);
|
||||
}
|
||||
bb.close();
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -147,11 +143,7 @@ public final class TransformerWriter extends Writer {
|
|||
bb.append(text);
|
||||
bb.append('<').append('/').append(tagname).append('>');
|
||||
final char[] result = bb.getChars();
|
||||
try {
|
||||
bb.close();
|
||||
} catch (final IOException e) {
|
||||
Log.logException(e);
|
||||
}
|
||||
bb.close();
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -165,11 +157,7 @@ public final class TransformerWriter extends Writer {
|
|||
}
|
||||
bb.append('>');
|
||||
final char[] result = bb.getChars();
|
||||
try {
|
||||
bb.close();
|
||||
} catch (final IOException e) {
|
||||
Log.logException(e);
|
||||
}
|
||||
bb.close();
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -178,11 +166,7 @@ public final class TransformerWriter extends Writer {
|
|||
final CharBuffer cb = new CharBuffer(ContentScraper.MAX_DOCSIZE, gt0, gt0.length + text.length + tagname.length() + 3);
|
||||
cb.append(text).append('<').append('/').append(tagname).append('>');
|
||||
final char[] result = cb.getChars();
|
||||
try {
|
||||
cb.close();
|
||||
} catch (final IOException e) {
|
||||
Log.logException(e);
|
||||
}
|
||||
cb.close();
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -202,11 +186,7 @@ public final class TransformerWriter extends Writer {
|
|||
result = bb.getChars(1);
|
||||
else
|
||||
result = bb.getChars();
|
||||
try {
|
||||
bb.close();
|
||||
} catch (final IOException ex) {
|
||||
Log.logException(ex);
|
||||
}
|
||||
bb.close();
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -227,12 +207,7 @@ public final class TransformerWriter extends Writer {
|
|||
// this single tag is collected at once here
|
||||
final CharBuffer charBuffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
|
||||
this.scraper.scrapeTag0(tag, charBuffer.propParser());
|
||||
try {
|
||||
charBuffer.close();
|
||||
} catch (final IOException e) {
|
||||
// TODO Auto-generated catch block
|
||||
Log.logException(e);
|
||||
}
|
||||
charBuffer.close();
|
||||
}
|
||||
if ((this.transformer != null) && (this.transformer.isTag0(tag))) {
|
||||
// this single tag is collected at once here
|
||||
|
@ -240,11 +215,7 @@ public final class TransformerWriter extends Writer {
|
|||
try {
|
||||
return this.transformer.transformTag0(tag, scb.propParser(), quotechar);
|
||||
} finally {
|
||||
try {
|
||||
scb.close();
|
||||
} catch (final IOException e) {
|
||||
Log.logException(e);
|
||||
}
|
||||
scb.close();
|
||||
}
|
||||
} else if (((this.scraper != null) && (this.scraper.isTag1(tag))) ||
|
||||
((this.transformer != null) && (this.transformer.isTag1(tag)))) {
|
||||
|
@ -252,11 +223,7 @@ public final class TransformerWriter extends Writer {
|
|||
this.filterTag = tag;
|
||||
final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
|
||||
this.filterOpts = scb.propParser();
|
||||
try {
|
||||
scb.close();
|
||||
} catch (final IOException e) {
|
||||
Log.logException(e);
|
||||
}
|
||||
scb.close();
|
||||
if (this.filterCont == null) this.filterCont = new CharBuffer(ContentScraper.MAX_DOCSIZE, Math.max(100, content.length)); else this.filterCont.reset();
|
||||
return new char[0];
|
||||
} else {
|
||||
|
|
|
@ -144,14 +144,13 @@ public class pdfParser extends AbstractParser implements Parser {
|
|||
try {
|
||||
writer.append(stripper.getText(pdfDoc));
|
||||
} catch (final Throwable e) {}
|
||||
}
|
||||
};
|
||||
}
|
||||
};
|
||||
t.start();
|
||||
t.join(3000);
|
||||
if (t.isAlive()) t.interrupt();
|
||||
pdfDoc.close();
|
||||
contentBytes = writer.getBytes(); // get final text before closing writer
|
||||
writer.close();
|
||||
contentBytes = writer.getBytes(); // get final text before closing writer
|
||||
} catch (final IOException e) {
|
||||
// close the writer
|
||||
if (writer != null) try { writer.close(); } catch (final Exception ex) {}
|
||||
|
@ -166,6 +165,7 @@ public class pdfParser extends AbstractParser implements Parser {
|
|||
//throw new Parser.Failure(e.getMessage(), location);
|
||||
} finally {
|
||||
try {pdfDoc.close();} catch (final IOException e) {}
|
||||
writer.close();
|
||||
}
|
||||
|
||||
String[] docKeywords = null;
|
||||
|
@ -175,7 +175,7 @@ public class pdfParser extends AbstractParser implements Parser {
|
|||
if (docTitle == null) {
|
||||
docTitle = docSubject;
|
||||
}
|
||||
|
||||
|
||||
// clear resources in pdfbox. they say that is resolved but it's not. see:
|
||||
// https://issues.apache.org/jira/browse/PDFBOX-313
|
||||
// https://issues.apache.org/jira/browse/PDFBOX-351
|
||||
|
|
|
@ -189,7 +189,7 @@ public class URIMetadataRow implements URIMetadata {
|
|||
final String dc_publisher,
|
||||
final float lat,
|
||||
final float lon) {
|
||||
final CharBuffer s = new CharBuffer(20000, 360);
|
||||
final CharBuffer s = new CharBuffer(3600, 360);
|
||||
s.append(url.toNormalform(false, true)).appendLF();
|
||||
s.append(dc_title).appendLF();
|
||||
if (dc_creator.length() > 80) s.append(dc_creator, 0, 80); else s.append(dc_creator);
|
||||
|
|
|
@ -130,7 +130,7 @@ public final class CharBuffer extends Writer {
|
|||
}
|
||||
|
||||
private void grow(int minSize) {
|
||||
int newsize = 2 * Math.max(this.buffer.length, minSize);
|
||||
int newsize = 12 * Math.max(this.buffer.length, minSize) / 10; // grow by 20%
|
||||
char[] tmp = new char[newsize];
|
||||
System.arraycopy(this.buffer, this.offset, tmp, 0, this.length);
|
||||
this.buffer = tmp;
|
||||
|
@ -478,15 +478,12 @@ public final class CharBuffer extends Writer {
|
|||
this.offset = 0;
|
||||
}
|
||||
|
||||
public void reset(final int newSize) {
|
||||
this.resize(newSize);
|
||||
this.reset();
|
||||
}
|
||||
|
||||
public void resize(final int newSize) {
|
||||
if(newSize < 0) throw new IllegalArgumentException("Illegal array size: " + newSize);
|
||||
final char[] v = new char[newSize];
|
||||
System.arraycopy(this.buffer,0,v,0,newSize > this.buffer.length ? this.buffer.length : newSize);
|
||||
/**
|
||||
* call trimToSize() whenever a CharBuffer is not extended any more and is kept to store the content permanently
|
||||
*/
|
||||
public void trimToSize() {
|
||||
final char[] v = new char[this.length];
|
||||
System.arraycopy(this.buffer, this.offset, v, 0, this.length);
|
||||
this.buffer = v;
|
||||
}
|
||||
|
||||
|
@ -497,13 +494,15 @@ public final class CharBuffer extends Writer {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
public void close() {
|
||||
this.length = 0;
|
||||
this.offset = 0;
|
||||
this.buffer = null; // assist with garbage collection
|
||||
}
|
||||
|
||||
@Override
|
||||
public void flush() throws IOException {
|
||||
// TODO Auto-generated method stub
|
||||
public void flush() {
|
||||
trimToSize();
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue
Block a user