Merge branch 'master' of https://github.com/yacy/yacy_search_server

2024-09-19 00:01:41 +02:00 · 2015-11-09 20:42:44 +01:00 · 2015-11-09 20:42:44 +01:00 · d481653202
commit d481653202
parent bc610e5382 112ae013f4
5 changed files with 136 additions and 32 deletions
--- a/source/net/yacy/document/Document.java
+++ b/source/net/yacy/document/Document.java
@ -671,6 +671,19 @@ dc_rights
        return v;
    }

+    /**
+     * Adds the main content of subdocuments to this document.
+     * This is useful if the document is a container for other documents (like zip or other archives)
+     * to make the content of the subdocuments searchable,
+     * but has only one url (unlike container-urls as rss).
+     *
+     * This is similar to mergeDocuments but directly joins internal content variables,
+     * uses fewer parsed details and keeps this document's crawl data (like crawldepth, lastmodified).
+     *
+     * @see mergeDocuments()
+     * @param docs to be included
+     * @throws IOException
+     */
    public void addSubDocuments(final Document[] docs) throws IOException {
        for (final Document doc: docs) {
            this.sections.addAll(doc.sections);
--- a/source/net/yacy/document/parser/bzipParser.java
+++ b/source/net/yacy/document/parser/bzipParser.java
@ -30,6 +30,7 @@ package net.yacy.document.parser;
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.InputStream;
+import java.util.Date;

 import net.yacy.cora.document.id.AnchorURL;
 import net.yacy.cora.document.id.DigestURL;
@ -43,7 +44,10 @@ import net.yacy.kelondro.util.FileUtils;
 import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
 import org.apache.commons.compress.compressors.bzip2.BZip2Utils;

-
+/**
+ * Parses a bz2 archive.
+ * Unzips and parses the content and adds it to the created main document
+ */
 public class bzipParser extends AbstractParser implements Parser {

    public bzipParser() {
@ -69,7 +73,7 @@ public class bzipParser extends AbstractParser implements Parser {
            throws Parser.Failure, InterruptedException {

        File tempFile = null;
-        Document[] docs;
+        Document maindoc = null;
        try {
            int read = 0;
            final byte[] data = new byte[1024];
@ -82,18 +86,38 @@ public class bzipParser extends AbstractParser implements Parser {
            // creating a temp file to store the uncompressed data
            final FileOutputStream out = new FileOutputStream(tempFile);

-            // reading gzip file and store it uncompressed
+            // reading bzip file and store it uncompressed
            while((read = zippedContent.read(data, 0, 1024)) != -1) {
                out.write(data, 0, read);
            }
            zippedContent.close();
            out.close();

+            // create maindoc for this bzip container, register with supplied url & mime
+            maindoc = new Document(
+                    location,
+                    mimeType,
+                    charset,
+                    this,
+                    null,
+                    null,
+                    null,
+                    null,
+                    null,
+                    null,
+                    null,
+                    0.0d, 0.0d,
+                    (Object) null,
+                    null,
+                    null,
+                    null,
+                    false,
+                    new Date());
            // creating a new parser class to parse the unzipped content
            final String contentfilename = BZip2Utils.getUncompressedFilename(location.getFileName());
            final String mime = TextParser.mimeOf(DigestURL.getFileExtension(contentfilename));
-            docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, 999, tempFile);
-            // TODO: this could return null from content parsing, even if bz2 successful read (see zipParser for alternative coding)
+            final Document[] docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, 999, tempFile);
+            if (docs != null) maindoc.addSubDocuments(docs);
        } catch (final Exception e) {
            if (e instanceof InterruptedException) throw (InterruptedException) e;
            if (e instanceof Parser.Failure) throw (Parser.Failure) e;
@ -102,6 +126,6 @@ public class bzipParser extends AbstractParser implements Parser {
        } finally {
            if (tempFile != null) FileUtils.deletedelete(tempFile);
        }
-        return docs;
+        return maindoc == null ? null : new Document[]{maindoc};
    }
 }
--- a/source/net/yacy/document/parser/gzipParser.java
+++ b/source/net/yacy/document/parser/gzipParser.java
@ -30,17 +30,23 @@ package net.yacy.document.parser;
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.InputStream;
+import java.util.Date;
 import java.util.zip.GZIPInputStream;

 import net.yacy.cora.document.id.AnchorURL;
+import net.yacy.cora.document.id.DigestURL;
 import net.yacy.document.AbstractParser;
 import net.yacy.document.Document;
 import net.yacy.document.Parser;
 import net.yacy.document.TextParser;
 import net.yacy.document.VocabularyScraper;
 import net.yacy.kelondro.util.FileUtils;
+import org.apache.commons.compress.compressors.gzip.GzipUtils;

-
+/**
+ * Parses a gz archive.
+ * Unzips and parses the content and adds it to the created main document
+ */
 public class gzipParser extends AbstractParser implements Parser {

    public gzipParser() {
@ -65,7 +71,7 @@ public class gzipParser extends AbstractParser implements Parser {
            final InputStream source) throws Parser.Failure, InterruptedException {

        File tempFile = null;
-        Document[] docs = null;
+        Document maindoc = null;
        try {
            int read = 0;
            final byte[] data = new byte[1024];
@ -84,9 +90,31 @@ public class gzipParser extends AbstractParser implements Parser {
            }
            zippedContent.close();
            out.close();
-
+            // create maindoc for this gzip container, register with supplied url & mime
+            maindoc = new Document(
+                    location,
+                    mimeType,
+                    charset,
+                    this,
+                    null,
+                    null,
+                    null,
+                    null,
+                    null,
+                    null,
+                    null,
+                    0.0d, 0.0d,
+                    (Object) null,
+                    null,
+                    null,
+                    null,
+                    false,
+                    new Date());
            // creating a new parser class to parse the unzipped content
-            docs = TextParser.parseSource(location, null, null, scraper, timezoneOffset, 999, tempFile);
+            final String contentfilename = GzipUtils.getUncompressedFilename(location.getFileName());
+            final String mime = TextParser.mimeOf(DigestURL.getFileExtension(contentfilename));
+            Document[] docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, 999, tempFile);
+            if (docs != null) maindoc.addSubDocuments(docs);
        } catch (final Exception e) {
            if (e instanceof InterruptedException) throw (InterruptedException) e;
            if (e instanceof Parser.Failure) throw (Parser.Failure) e;
@ -95,7 +123,7 @@ public class gzipParser extends AbstractParser implements Parser {
        } finally {
            if (tempFile != null) FileUtils.deletedelete(tempFile);
        }
-        return docs;
+        return maindoc == null ? null : new Document[]{maindoc};
    }

 }
--- a/source/net/yacy/document/parser/tarParser.java
+++ b/source/net/yacy/document/parser/tarParser.java
@ -29,8 +29,7 @@ import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.RandomAccessFile;
-import java.util.ArrayList;
-import java.util.List;
+import java.util.Date;
 import java.util.zip.GZIPInputStream;

 import net.yacy.cora.document.encoding.UTF8;
@ -47,7 +46,10 @@ import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
 import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;

 // this is a new implementation of this parser idiom using multiple documents as result set
-
+/**
+ * Parses the tar file and each contained file,
+ * returns one document with combined content.
+ */
 public class tarParser extends AbstractParser implements Parser {

    private final static String MAGIC = "ustar"; // A magic for a tar archive, may appear at #101h-#105
@ -70,8 +72,6 @@ public class tarParser extends AbstractParser implements Parser {
            final int timezoneOffset,
            InputStream source) throws Parser.Failure, InterruptedException {

-        final List<Document> docacc = new ArrayList<Document>();
-        Document[] subDocs = null;
        final String ext = MultiProtocolURL.getFileExtension(location.getFileName());
        if (ext.equals("gz") || ext.equals("tgz")) {
            try {
@ -82,11 +82,31 @@ public class tarParser extends AbstractParser implements Parser {
        }
        TarArchiveEntry entry;
        final TarArchiveInputStream tis = new TarArchiveInputStream(source);
-        File tmp = null;
-
+        
+        // create maindoc for this tar container
+        Document maindoc = new Document(
+                    location,
+                    mimeType,
+                    charset,
+                    this,
+                    null,
+                    null,
+                    null,
+                    null,
+                    null,
+                    null,
+                    null,
+                    0.0d, 0.0d,
+                    (Object) null,
+                    null,
+                    null,
+                    null,
+                    false,
+                    new Date());
        // loop through the elements in the tar file and parse every single file inside
        while (true) {
            try {
+                File tmp = null;
                entry = tis.getNextTarEntry();
                if (entry == null) break;
                if (entry.isDirectory() || entry.getSize() <= 0) continue;
@ -96,9 +116,9 @@ public class tarParser extends AbstractParser implements Parser {
                try {
                    tmp = FileUtils.createTempFile(this.getClass(), name);
                    FileUtils.copy(tis, tmp, entry.getSize());
-                    subDocs = TextParser.parseSource(AnchorURL.newAnchor(location, "#" + name), mime, null, scraper, timezoneOffset, 999, tmp);
+                    final Document[] subDocs = TextParser.parseSource(AnchorURL.newAnchor(location, "#" + name), mime, null, scraper, timezoneOffset, 999, tmp);
                    if (subDocs == null) continue;
-                    for (final Document d: subDocs) docacc.add(d);
+                    maindoc.addSubDocuments(subDocs);
                } catch (final Parser.Failure e) {
                    AbstractParser.log.warn("tar parser entry " + name + ": " + e.getMessage());
                } finally {
@ -109,8 +129,7 @@ public class tarParser extends AbstractParser implements Parser {
                break;
            }
        }
-        if (docacc.isEmpty()) return null;
-        return docacc.toArray(new Document[docacc.size()]);
+        return new Document[]{maindoc};
    }

    public final static boolean isTar(File f) {
--- a/source/net/yacy/document/parser/zipParser.java
+++ b/source/net/yacy/document/parser/zipParser.java
@ -27,8 +27,7 @@ package net.yacy.document.parser;
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.List;
+import java.util.Date;
 import java.util.zip.ZipEntry;
 import java.util.zip.ZipInputStream;

@ -43,7 +42,11 @@ import net.yacy.kelondro.util.FileUtils;
 import net.yacy.kelondro.util.MemoryControl;

 // this is a new implementation of this parser idiom using multiple documents as result set
-
+/**
+ * Parses Zip archives. Creates a main document for the zip url/file.
+ * Each file in the zip is parsed and the result added to the main document.
+ * parse returns one document with the combined content.
+ */
 public class zipParser extends AbstractParser implements Parser {

    public zipParser() {
@ -74,15 +77,33 @@ public class zipParser extends AbstractParser implements Parser {
        if (!MemoryControl.request(200 * 1024 * 1024, false))
            throw new Parser.Failure("Not enough Memory available for zip parser: " + MemoryControl.available(), location);

-         Document[] docs = null;
-        final List<Document> docacc = new ArrayList<Document>();
        ZipEntry entry;
        final ZipInputStream zis = new ZipInputStream(source);
-        File tmp = null;
+        // create maindoc for this zip container with supplied url and mime
+        Document maindoc = new Document(
+                location,
+                mimeType,
+                charset,
+                this,
+                null,
+                null,
+                null,
+                null,
+                null,
+                null,
+                null,
+                0.0d, 0.0d,
+                (Object)null,
+                null,
+                null,
+                null,
+                false,
+                new Date());

        // loop through the elements in the zip file and parse every single file inside
        while (true) {
            try {
+                File tmp = null;
                if (zis.available() <= 0) break;
                entry = zis.getNextEntry();
                if (entry == null) break;
@ -95,9 +116,9 @@ public class zipParser extends AbstractParser implements Parser {
                    FileUtils.copy(zis, tmp, entry.getSize());
                    final DigestURL virtualURL = DigestURL.newURL(location, "#" + name);
                    //this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false));
-                    docs = TextParser.parseSource(new AnchorURL(virtualURL), mime, null, scraper, timezoneOffset, 999, tmp);
+                    final Document[] docs = TextParser.parseSource(new AnchorURL(virtualURL), mime, null, scraper, timezoneOffset, 999, tmp);
                    if (docs == null) continue;
-                    for (final Document d: docs) docacc.add(d);
+                    maindoc.addSubDocuments(docs);
                } catch (final Parser.Failure e) {
                    AbstractParser.log.warn("ZIP parser entry " + name + ": " + e.getMessage());
                } finally {
@ -108,7 +129,6 @@ public class zipParser extends AbstractParser implements Parser {
                break;
            }
        }
-        if (docacc.isEmpty()) return null;
-        return docacc.toArray(new Document[docacc.size()]);
+        return new Document[]{maindoc};
    }
 }