mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
Merge branch 'master' of https://github.com/yacy/yacy_search_server
This commit is contained in:
commit
d481653202
|
@ -671,6 +671,19 @@ dc_rights
|
|||
return v;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds the main content of subdocuments to this document.
|
||||
* This is useful if the document is a container for other documents (like zip or other archives)
|
||||
* to make the content of the subdocuments searchable,
|
||||
* but has only one URL (unlike container URLs such as RSS feeds).
|
||||
*
|
||||
* This is similar to mergeDocuments but directly joins internal content variables,
|
||||
* uses fewer parsed details and keeps this document's crawl data (like crawldepth, lastmodified)
|
||||
*
|
||||
* @see mergeDocuments()
|
||||
* @param docs to be included
|
||||
* @throws IOException
|
||||
*/
|
||||
public void addSubDocuments(final Document[] docs) throws IOException {
|
||||
for (final Document doc: docs) {
|
||||
this.sections.addAll(doc.sections);
|
||||
|
|
|
@ -30,6 +30,7 @@ package net.yacy.document.parser;
|
|||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.InputStream;
|
||||
import java.util.Date;
|
||||
|
||||
import net.yacy.cora.document.id.AnchorURL;
|
||||
import net.yacy.cora.document.id.DigestURL;
|
||||
|
@ -43,7 +44,10 @@ import net.yacy.kelondro.util.FileUtils;
|
|||
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
|
||||
import org.apache.commons.compress.compressors.bzip2.BZip2Utils;
|
||||
|
||||
|
||||
/**
|
||||
* Parses a bz2 archive.
|
||||
* Unzips and parses the content and adds it to the created main document
|
||||
*/
|
||||
public class bzipParser extends AbstractParser implements Parser {
|
||||
|
||||
public bzipParser() {
|
||||
|
@ -69,7 +73,7 @@ public class bzipParser extends AbstractParser implements Parser {
|
|||
throws Parser.Failure, InterruptedException {
|
||||
|
||||
File tempFile = null;
|
||||
Document[] docs;
|
||||
Document maindoc = null;
|
||||
try {
|
||||
int read = 0;
|
||||
final byte[] data = new byte[1024];
|
||||
|
@ -82,18 +86,38 @@ public class bzipParser extends AbstractParser implements Parser {
|
|||
// creating a temp file to store the uncompressed data
|
||||
final FileOutputStream out = new FileOutputStream(tempFile);
|
||||
|
||||
// reading gzip file and store it uncompressed
|
||||
// reading bzip file and store it uncompressed
|
||||
while((read = zippedContent.read(data, 0, 1024)) != -1) {
|
||||
out.write(data, 0, read);
|
||||
}
|
||||
zippedContent.close();
|
||||
out.close();
|
||||
|
||||
// create maindoc for this bzip container, register with supplied url & mime
|
||||
maindoc = new Document(
|
||||
location,
|
||||
mimeType,
|
||||
charset,
|
||||
this,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
0.0d, 0.0d,
|
||||
(Object) null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
false,
|
||||
new Date());
|
||||
// creating a new parser class to parse the unzipped content
|
||||
final String contentfilename = BZip2Utils.getUncompressedFilename(location.getFileName());
|
||||
final String mime = TextParser.mimeOf(DigestURL.getFileExtension(contentfilename));
|
||||
docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, 999, tempFile);
|
||||
// TODO: this could return null from content parsing, even if the bz2 archive was read successfully (see zipParser for alternative coding)
|
||||
final Document[] docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, 999, tempFile);
|
||||
if (docs != null) maindoc.addSubDocuments(docs);
|
||||
} catch (final Exception e) {
|
||||
if (e instanceof InterruptedException) throw (InterruptedException) e;
|
||||
if (e instanceof Parser.Failure) throw (Parser.Failure) e;
|
||||
|
@ -102,6 +126,6 @@ public class bzipParser extends AbstractParser implements Parser {
|
|||
} finally {
|
||||
if (tempFile != null) FileUtils.deletedelete(tempFile);
|
||||
}
|
||||
return docs;
|
||||
return maindoc == null ? null : new Document[]{maindoc};
|
||||
}
|
||||
}
|
||||
|
|
|
@ -30,17 +30,23 @@ package net.yacy.document.parser;
|
|||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.InputStream;
|
||||
import java.util.Date;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
import net.yacy.cora.document.id.AnchorURL;
|
||||
import net.yacy.cora.document.id.DigestURL;
|
||||
import net.yacy.document.AbstractParser;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.document.Parser;
|
||||
import net.yacy.document.TextParser;
|
||||
import net.yacy.document.VocabularyScraper;
|
||||
import net.yacy.kelondro.util.FileUtils;
|
||||
import org.apache.commons.compress.compressors.gzip.GzipUtils;
|
||||
|
||||
|
||||
/**
|
||||
* Parses a gz archive.
|
||||
* Unzips and parses the content and adds it to the created main document
|
||||
*/
|
||||
public class gzipParser extends AbstractParser implements Parser {
|
||||
|
||||
public gzipParser() {
|
||||
|
@ -65,7 +71,7 @@ public class gzipParser extends AbstractParser implements Parser {
|
|||
final InputStream source) throws Parser.Failure, InterruptedException {
|
||||
|
||||
File tempFile = null;
|
||||
Document[] docs = null;
|
||||
Document maindoc = null;
|
||||
try {
|
||||
int read = 0;
|
||||
final byte[] data = new byte[1024];
|
||||
|
@ -84,9 +90,31 @@ public class gzipParser extends AbstractParser implements Parser {
|
|||
}
|
||||
zippedContent.close();
|
||||
out.close();
|
||||
|
||||
// create maindoc for this gzip container, register with supplied url & mime
|
||||
maindoc = new Document(
|
||||
location,
|
||||
mimeType,
|
||||
charset,
|
||||
this,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
0.0d, 0.0d,
|
||||
(Object) null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
false,
|
||||
new Date());
|
||||
// creating a new parser class to parse the unzipped content
|
||||
docs = TextParser.parseSource(location, null, null, scraper, timezoneOffset, 999, tempFile);
|
||||
final String contentfilename = GzipUtils.getUncompressedFilename(location.getFileName());
|
||||
final String mime = TextParser.mimeOf(DigestURL.getFileExtension(contentfilename));
|
||||
Document[] docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, 999, tempFile);
|
||||
if (docs != null) maindoc.addSubDocuments(docs);
|
||||
} catch (final Exception e) {
|
||||
if (e instanceof InterruptedException) throw (InterruptedException) e;
|
||||
if (e instanceof Parser.Failure) throw (Parser.Failure) e;
|
||||
|
@ -95,7 +123,7 @@ public class gzipParser extends AbstractParser implements Parser {
|
|||
} finally {
|
||||
if (tempFile != null) FileUtils.deletedelete(tempFile);
|
||||
}
|
||||
return docs;
|
||||
return maindoc == null ? null : new Document[]{maindoc};
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -29,8 +29,7 @@ import java.io.FileNotFoundException;
|
|||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.RandomAccessFile;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Date;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
import net.yacy.cora.document.encoding.UTF8;
|
||||
|
@ -47,7 +46,10 @@ import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
|
|||
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
|
||||
|
||||
// this is a new implementation of this parser idiom using multiple documents as result set
|
||||
|
||||
/**
|
||||
* Parses the tar file and each contained file,
|
||||
* returns one document with combined content.
|
||||
*/
|
||||
public class tarParser extends AbstractParser implements Parser {
|
||||
|
||||
private final static String MAGIC = "ustar"; // A magic for a tar archive, may appear at #101h-#105
|
||||
|
@ -70,8 +72,6 @@ public class tarParser extends AbstractParser implements Parser {
|
|||
final int timezoneOffset,
|
||||
InputStream source) throws Parser.Failure, InterruptedException {
|
||||
|
||||
final List<Document> docacc = new ArrayList<Document>();
|
||||
Document[] subDocs = null;
|
||||
final String ext = MultiProtocolURL.getFileExtension(location.getFileName());
|
||||
if (ext.equals("gz") || ext.equals("tgz")) {
|
||||
try {
|
||||
|
@ -82,11 +82,31 @@ public class tarParser extends AbstractParser implements Parser {
|
|||
}
|
||||
TarArchiveEntry entry;
|
||||
final TarArchiveInputStream tis = new TarArchiveInputStream(source);
|
||||
File tmp = null;
|
||||
|
||||
|
||||
// create maindoc for this tar container
|
||||
Document maindoc = new Document(
|
||||
location,
|
||||
mimeType,
|
||||
charset,
|
||||
this,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
0.0d, 0.0d,
|
||||
(Object) null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
false,
|
||||
new Date());
|
||||
// loop through the elements in the tar file and parse every single file inside
|
||||
while (true) {
|
||||
try {
|
||||
File tmp = null;
|
||||
entry = tis.getNextTarEntry();
|
||||
if (entry == null) break;
|
||||
if (entry.isDirectory() || entry.getSize() <= 0) continue;
|
||||
|
@ -96,9 +116,9 @@ public class tarParser extends AbstractParser implements Parser {
|
|||
try {
|
||||
tmp = FileUtils.createTempFile(this.getClass(), name);
|
||||
FileUtils.copy(tis, tmp, entry.getSize());
|
||||
subDocs = TextParser.parseSource(AnchorURL.newAnchor(location, "#" + name), mime, null, scraper, timezoneOffset, 999, tmp);
|
||||
final Document[] subDocs = TextParser.parseSource(AnchorURL.newAnchor(location, "#" + name), mime, null, scraper, timezoneOffset, 999, tmp);
|
||||
if (subDocs == null) continue;
|
||||
for (final Document d: subDocs) docacc.add(d);
|
||||
maindoc.addSubDocuments(subDocs);
|
||||
} catch (final Parser.Failure e) {
|
||||
AbstractParser.log.warn("tar parser entry " + name + ": " + e.getMessage());
|
||||
} finally {
|
||||
|
@ -109,8 +129,7 @@ public class tarParser extends AbstractParser implements Parser {
|
|||
break;
|
||||
}
|
||||
}
|
||||
if (docacc.isEmpty()) return null;
|
||||
return docacc.toArray(new Document[docacc.size()]);
|
||||
return new Document[]{maindoc};
|
||||
}
|
||||
|
||||
public final static boolean isTar(File f) {
|
||||
|
|
|
@ -27,8 +27,7 @@ package net.yacy.document.parser;
|
|||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Date;
|
||||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipInputStream;
|
||||
|
||||
|
@ -43,7 +42,11 @@ import net.yacy.kelondro.util.FileUtils;
|
|||
import net.yacy.kelondro.util.MemoryControl;
|
||||
|
||||
// this is a new implementation of this parser idiom using multiple documents as result set
|
||||
|
||||
/**
|
||||
* Parses Zip archives. Creates a main document for the zip url/file.
|
||||
* Each file in the zip is parsed and the result added to the main document.
|
||||
* parse returns one document with the combined content.
|
||||
*/
|
||||
public class zipParser extends AbstractParser implements Parser {
|
||||
|
||||
public zipParser() {
|
||||
|
@ -74,15 +77,33 @@ public class zipParser extends AbstractParser implements Parser {
|
|||
if (!MemoryControl.request(200 * 1024 * 1024, false))
|
||||
throw new Parser.Failure("Not enough Memory available for zip parser: " + MemoryControl.available(), location);
|
||||
|
||||
Document[] docs = null;
|
||||
final List<Document> docacc = new ArrayList<Document>();
|
||||
ZipEntry entry;
|
||||
final ZipInputStream zis = new ZipInputStream(source);
|
||||
File tmp = null;
|
||||
// create maindoc for this zip container with supplied url and mime
|
||||
Document maindoc = new Document(
|
||||
location,
|
||||
mimeType,
|
||||
charset,
|
||||
this,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
0.0d, 0.0d,
|
||||
(Object)null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
false,
|
||||
new Date());
|
||||
|
||||
// loop through the elements in the zip file and parse every single file inside
|
||||
while (true) {
|
||||
try {
|
||||
File tmp = null;
|
||||
if (zis.available() <= 0) break;
|
||||
entry = zis.getNextEntry();
|
||||
if (entry == null) break;
|
||||
|
@ -95,9 +116,9 @@ public class zipParser extends AbstractParser implements Parser {
|
|||
FileUtils.copy(zis, tmp, entry.getSize());
|
||||
final DigestURL virtualURL = DigestURL.newURL(location, "#" + name);
|
||||
//this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false));
|
||||
docs = TextParser.parseSource(new AnchorURL(virtualURL), mime, null, scraper, timezoneOffset, 999, tmp);
|
||||
final Document[] docs = TextParser.parseSource(new AnchorURL(virtualURL), mime, null, scraper, timezoneOffset, 999, tmp);
|
||||
if (docs == null) continue;
|
||||
for (final Document d: docs) docacc.add(d);
|
||||
maindoc.addSubDocuments(docs);
|
||||
} catch (final Parser.Failure e) {
|
||||
AbstractParser.log.warn("ZIP parser entry " + name + ": " + e.getMessage());
|
||||
} finally {
|
||||
|
@ -108,7 +129,6 @@ public class zipParser extends AbstractParser implements Parser {
|
|||
break;
|
||||
}
|
||||
}
|
||||
if (docacc.isEmpty()) return null;
|
||||
return docacc.toArray(new Document[docacc.size()]);
|
||||
return new Document[]{maindoc};
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user