removed 7Zip parser because the old library could not be replaced by a maven repository

This commit is contained in:
Michael Peter Christen 2023-07-27 23:11:27 +02:00
parent 5afcba162b
commit 92dad3ed49
4 changed files with 10 additions and 290 deletions

3
.gitignore vendored
View File

@ -28,4 +28,5 @@ yacy.log
/.settings/ /.settings/
/.classpath /.classpath
/.project /.project
/ivy /ivy
/lib

Binary file not shown.

View File

@ -51,7 +51,6 @@ import net.yacy.document.parser.docParser;
import net.yacy.document.parser.genericParser; import net.yacy.document.parser.genericParser;
import net.yacy.document.parser.gzipParser; import net.yacy.document.parser.gzipParser;
import net.yacy.document.parser.gzipParser.GZIPOpeningStreamException; import net.yacy.document.parser.gzipParser.GZIPOpeningStreamException;
import net.yacy.document.parser.html.TagValency;
import net.yacy.document.parser.htmlParser; import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.linkScraperParser; import net.yacy.document.parser.linkScraperParser;
import net.yacy.document.parser.mmParser; import net.yacy.document.parser.mmParser;
@ -62,7 +61,6 @@ import net.yacy.document.parser.pptParser;
import net.yacy.document.parser.psParser; import net.yacy.document.parser.psParser;
import net.yacy.document.parser.rssParser; import net.yacy.document.parser.rssParser;
import net.yacy.document.parser.rtfParser; import net.yacy.document.parser.rtfParser;
import net.yacy.document.parser.sevenzipParser;
import net.yacy.document.parser.sidAudioParser; import net.yacy.document.parser.sidAudioParser;
import net.yacy.document.parser.tarParser; import net.yacy.document.parser.tarParser;
import net.yacy.document.parser.torrentParser; import net.yacy.document.parser.torrentParser;
@ -70,6 +68,7 @@ import net.yacy.document.parser.vcfParser;
import net.yacy.document.parser.vsdParser; import net.yacy.document.parser.vsdParser;
import net.yacy.document.parser.xlsParser; import net.yacy.document.parser.xlsParser;
import net.yacy.document.parser.zipParser; import net.yacy.document.parser.zipParser;
import net.yacy.document.parser.html.TagValency;
import net.yacy.document.parser.images.genericImageParser; import net.yacy.document.parser.images.genericImageParser;
import net.yacy.document.parser.images.metadataImageParser; import net.yacy.document.parser.images.metadataImageParser;
import net.yacy.document.parser.images.svgParser; import net.yacy.document.parser.images.svgParser;
@ -115,7 +114,6 @@ public final class TextParser {
initParser(new psParser()); initParser(new psParser());
initParser(new rssParser()); initParser(new rssParser());
initParser(new rtfParser()); initParser(new rtfParser());
initParser(new sevenzipParser());
initParser(new sidAudioParser()); initParser(new sidAudioParser());
initParser(new svgParser()); initParser(new svgParser());
initParser(new tarParser()); initParser(new tarParser());
@ -404,7 +402,7 @@ public final class TextParser {
public static Document[] parseSource( public static Document[] parseSource(
final DigestURL location, final DigestURL location,
String mimeType, final String mimeType,
final String charset, final String charset,
final TagValency defaultValency, final TagValency defaultValency,
final Set<String> valencySwitchTagNames, final Set<String> valencySwitchTagNames,
@ -438,7 +436,7 @@ public final class TextParser {
*/ */
public static Document[] parseWithLimits( public static Document[] parseWithLimits(
final DigestURL location, final DigestURL location,
String mimeType, final String mimeType,
final String charset, final String charset,
final TagValency defaultValency, final TagValency defaultValency,
final Set<String> valencySwitchTagNames, final Set<String> valencySwitchTagNames,
@ -446,8 +444,8 @@ public final class TextParser {
final int depth, final int depth,
final long contentLength, final long contentLength,
final InputStream sourceStream, final InputStream sourceStream,
int maxLinks, final int maxLinks,
long maxBytes) throws Parser.Failure{ final long maxBytes) throws Parser.Failure{
return parseSource(location, mimeType, charset, defaultValency, valencySwitchTagNames, new VocabularyScraper(), timezoneOffset, depth, contentLength, return parseSource(location, mimeType, charset, defaultValency, valencySwitchTagNames, new VocabularyScraper(), timezoneOffset, depth, contentLength,
sourceStream, maxLinks, maxBytes); sourceStream, maxLinks, maxBytes);
} }
@ -471,9 +469,9 @@ public final class TextParser {
* @throws Parser.Failure when the parser processing failed * @throws Parser.Failure when the parser processing failed
*/ */
public static Document[] parseWithLimits( public static Document[] parseWithLimits(
final DigestURL location, String mimeType, final String charset, final DigestURL location, final String mimeType, final String charset,
final int timezoneOffset, final int depth, final long contentLength, final InputStream sourceStream, int maxLinks, final int timezoneOffset, final int depth, final long contentLength, final InputStream sourceStream, final int maxLinks,
long maxBytes) throws Parser.Failure{ final long maxBytes) throws Parser.Failure{
return parseSource(location, mimeType, charset, TagValency.EVAL, new HashSet<String>(), new VocabularyScraper(), timezoneOffset, depth, contentLength, return parseSource(location, mimeType, charset, TagValency.EVAL, new HashSet<String>(), new VocabularyScraper(), timezoneOffset, depth, contentLength,
sourceStream, maxLinks, maxBytes); sourceStream, maxLinks, maxBytes);
} }

View File

@ -1,279 +0,0 @@
// sevenzipParser.java
// -------------------------------------
// part of YACY
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
//
// This file is contributed by Franz Brausze
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.document.parser;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Date;
import java.util.Set;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.TagValency;
import net.yacy.kelondro.util.FileUtils;
import SevenZip.ArchiveExtractCallback;
import SevenZip.IInStream;
import SevenZip.Archive.IInArchive;
import SevenZip.Archive.SevenZipEntry;
import SevenZip.Archive.SevenZip.Handler;
public class sevenzipParser extends AbstractParser implements Parser {
/**
 * Creates the 7zip parser and adds the file extension and the MIME type
 * it handles to the sets of supported extensions and MIME types.
 */
public sevenzipParser() {
    super("7zip Archive Parser");
    SUPPORTED_EXTENSIONS.add("7z");
    SUPPORTED_MIME_TYPES.add("application/x-7z-compressed");
}
/**
 * Parses a 7zip archive that is available as an {@link IInStream}.
 * <p>
 * A container {@link Document} is created for the archive itself. Its title is the
 * unescaped file name of {@code location}, or the tokens of the location when the
 * file name is empty. The archive is then extracted through a
 * {@code SZParserExtractCallback}, which parses the extracted entries and adds the
 * results to the container as sub-documents.
 *
 * @param location URL of the archive
 * @param mimeType MIME type stored in the container document
 * @param charset charset stored in the container document
 * @param defaultValency tag valency handed on to the parsing of the archive entries
 * @param valencySwitchTagNames tag names handed on to the parsing of the archive entries
 * @param timezoneOffset time zone offset handed on to the parsing of the archive entries
 * @param source stream providing the archive data
 * @return the container document
 * @throws Parser.Failure when the archive cannot be opened or its extraction fails
 * @throws InterruptedException when the extraction failed because of an interruption
 */
public Document parse(
        final DigestURL location,
        final String mimeType,
        final String charset,
        final TagValency defaultValency,
        final Set<String> valencySwitchTagNames,
        final int timezoneOffset,
        final IInStream source) throws Parser.Failure, InterruptedException {

    final String filename = location.getFileName();
    final Document doc = new Document(
            location,
            mimeType,
            charset,
            this,
            null,
            null,
            AbstractParser.singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title
            null,
            null,
            null,
            null,
            0.0d, 0.0d,
            (Object) null,
            null,
            null,
            null,
            false,
            new Date());

    Handler archive;
    AbstractParser.log.fine("opening 7zip archive...");
    try {
        archive = new Handler(source);
    } catch (final IOException e) {
        throw new Parser.Failure("error opening 7zip archive: " + e.getMessage(), location);
    }

    final SZParserExtractCallback aec = new SZParserExtractCallback(
            AbstractParser.log, archive, doc, location.getFile(), defaultValency, valencySwitchTagNames, timezoneOffset);
    AbstractParser.log.fine("processing archive contents...");
    try {
        archive.Extract(null, -1, 0, aec);
        return doc;
    } catch (final IOException e) {
        // the callback wraps failures of the entry parsing into an IOException;
        // unwrap the ones that callers of this method are expected to handle
        if (e.getCause() instanceof InterruptedException) {
            throw (InterruptedException) e.getCause();
        }
        if (e.getCause() instanceof Parser.Failure) {
            throw (Parser.Failure) e.getCause();
        }
        throw new Parser.Failure(
                "error processing 7zip archive at internal file " + aec.getCurrentFilePath() + ": " + e.getMessage(),
                location);
    } finally {
        try {
            archive.close();
        } catch (final IOException e) {
            // closing is best effort and must not hide the parsing result, but leave a trace in the log
            AbstractParser.log.fine("error closing 7zip archive: " + e.getMessage());
        }
    }
}
/**
 * Parses a 7zip archive whose complete content is held in a byte array.
 * The bytes are wrapped into a {@code ByteArrayIInStream} and handed to the
 * {@link IInStream} based parse method.
 *
 * @param location URL of the archive
 * @param mimeType MIME type handed on unchanged
 * @param charset charset handed on unchanged
 * @param defaultValency tag valency handed on unchanged
 * @param valencySwitchTagNames tag names handed on unchanged
 * @param timezoneOffset time zone offset handed on unchanged
 * @param source content of the archive
 * @return the document produced by the stream based parse method
 * @throws Parser.Failure when the stream based parse method fails
 * @throws InterruptedException when the stream based parse method was interrupted
 */
public Document parse(
        final DigestURL location,
        final String mimeType,
        final String charset,
        final TagValency defaultValency,
        final Set<String> valencySwitchTagNames,
        final int timezoneOffset,
        final byte[] source) throws Parser.Failure, InterruptedException {
    final IInStream seekableSource = new ByteArrayIInStream(source);
    return parse(location, mimeType, charset, defaultValency, valencySwitchTagNames, timezoneOffset, seekableSource);
}
/**
 * Parses a 7zip archive read from a plain input stream.
 * The stream is first copied completely into memory, then the byte array based
 * parse method is used; its result is returned as the only array element.
 *
 * @param location URL of the archive
 * @param mimeType MIME type handed on unchanged
 * @param charset charset handed on unchanged
 * @param defaultValency tag valency handed on unchanged
 * @param valencySwitchTagNames tag names handed on unchanged
 * @param scraper not used by this method
 * @param timezoneOffset time zone offset handed on unchanged
 * @param source stream delivering the archive content
 * @return an array containing the single document created for the archive
 * @throws Parser.Failure when reading the stream or parsing the archive fails
 * @throws InterruptedException when the parsing was interrupted
 */
@Override
public Document[] parse(
        final DigestURL location,
        final String mimeType,
        final String charset,
        final TagValency defaultValency,
        final Set<String> valencySwitchTagNames,
        final VocabularyScraper scraper,
        final int timezoneOffset,
        final InputStream source) throws Parser.Failure, InterruptedException {
    // step 1: buffer the whole archive
    final byte[] archiveBytes;
    try {
        final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        FileUtils.copy(source, buffer);
        archiveBytes = buffer.toByteArray();
    } catch (final IOException e) {
        throw new Parser.Failure("error processing 7zip archive: " + e.getMessage(), location);
    }

    // step 2: parse the buffered archive
    final Document archiveDoc = parse(location, mimeType, charset, defaultValency, valencySwitchTagNames, timezoneOffset, archiveBytes);
    return new Document[] { archiveDoc };
}
// wrapper class to redirect output of standard ArchiveExtractCallback to serverLog
// and parse the extracted content
private static class SZParserExtractCallback extends ArchiveExtractCallback {
private final ConcurrentLog log;
private ByteArrayOutputStream cfos = null;
private final Document doc;
private final String prefix;
private final TagValency defaultValency;
private Set<String> valencySwitchTagNames;
private final int timezoneOffset;
public SZParserExtractCallback(
final ConcurrentLog logger,
final IInArchive handler,
final Document doc,
final String prefix,
final TagValency defaultValency,
final Set<String> valencySwitchTagNames,
final int timezoneOffset) {
super.Init(handler);
this.log = logger;
this.doc = doc;
this.prefix = prefix;
this.defaultValency = defaultValency;
this.valencySwitchTagNames = valencySwitchTagNames;
this.timezoneOffset = timezoneOffset;
}
@Override
public void PrepareOperation(final int arg0) {
this.extractMode = (arg0 == IInArchive.NExtract_NAskMode_kExtract);
switch (arg0) {
case IInArchive.NExtract_NAskMode_kExtract:
this.log.fine("Extracting " + this.filePath);
break;
case IInArchive.NExtract_NAskMode_kTest:
this.log.fine("Testing " + this.filePath);
break;
case IInArchive.NExtract_NAskMode_kSkip:
this.log.fine("Skipping " + this.filePath);
break;
}
}
@Override
public void SetOperationResult(final int arg0) throws IOException {
if (arg0 != IInArchive.NExtract_NOperationResult_kOK) {
this.NumErrors++;
switch(arg0) {
case IInArchive.NExtract_NOperationResult_kUnSupportedMethod:
throw new IOException("Unsupported Method");
case IInArchive.NExtract_NOperationResult_kCRCError:
throw new IOException("CRC Failed");
case IInArchive.NExtract_NOperationResult_kDataError:
throw new IOException("Data Error");
default:
// throw new IOException("Unknown Error");
}
} else try {
if (this.cfos != null) {
// parse the file
Document[] theDocs;
// workaround for relative links in file, normally '#' shall be used behind the location, see
// below for reversion of the effects
final AnchorURL url = AnchorURL.newAnchor(this.doc.dc_source(), this.prefix + "/" + super.filePath);
final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
theDocs = TextParser.parseSource(url, mime, null,this.defaultValency, this.valencySwitchTagNames, new VocabularyScraper(), timezoneOffset, this.doc.getDepth() + 1, this.cfos.toByteArray());
this.doc.addSubDocuments(theDocs);
}
} catch (final Exception e) {
final IOException ex = new IOException("error parsing extracted content of " + super.filePath + ": " + e.getMessage());
ex.initCause(e);
throw ex;
}
}
@Override
public OutputStream GetStream(final int index, final int askExtractMode) throws IOException {
final SevenZipEntry item = super.archiveHandler.getEntry(index);
super.filePath = item.getName();
this.cfos = (item.isDirectory()) ? null : new ByteArrayOutputStream();
return this.cfos;
}
public String getCurrentFilePath() {
return super.filePath;
}
}
/**
 * A {@link ByteArrayInputStream} whose read position can be queried and moved.
 * Positions are indices into the backing byte array.
 */
private static class SeekableByteArrayInputStream extends ByteArrayInputStream {

    public SeekableByteArrayInputStream(final byte[] buf) { super(buf); }

    public SeekableByteArrayInputStream(final byte[] buf, final int off, final int len) { super(buf, off, len); }

    /**
     * @return the index of the next byte to be read
     */
    public int getPosition() { return super.pos; }

    /**
     * Moves the read position relative to the current one.
     *
     * @param offset number of bytes to move; may be negative
     * @throws IndexOutOfBoundsException when the resulting position is negative or behind the end of the data
     */
    public void seekRelative(final int offset) { seekAbsolute(super.pos + offset); }

    /**
     * Sets the read position. The position directly behind the last byte is allowed;
     * reading from there reports the end of the stream.
     *
     * @param offset the new position
     * @throws IndexOutOfBoundsException when the position is negative or behind the end of the data
     */
    public void seekAbsolute(final int offset) {
        // a negative position must be rejected here, otherwise the next read() fails with an array index error
        if (offset < 0 || offset > super.count) {
            throw new IndexOutOfBoundsException(Integer.toString(offset));
        }
        super.pos = offset;
    }
}
/**
 * An {@link IInStream} that reads from a byte array held in memory.
 */
private static class ByteArrayIInStream extends IInStream {

    private final SeekableByteArrayInputStream sbais;

    /**
     * @param buffer the data the stream reads from
     */
    public ByteArrayIInStream(final byte[] buffer) {
        this.sbais = new SeekableByteArrayInputStream(buffer);
    }

    /**
     * Moves the read position.
     *
     * @param offset the new position for {@code STREAM_SEEK_SET}, the distance to the
     *        current position for {@code STREAM_SEEK_CUR}
     * @param origin {@code STREAM_SEEK_SET} or {@code STREAM_SEEK_CUR}; for any other
     *        value the position is left unchanged
     * @return the read position after the call
     * @throws IndexOutOfBoundsException when the requested position is negative or
     *         behind the end of the data
     */
    @Override
    public long Seek(final long offset, final int origin) {
        switch (origin) {
            case STREAM_SEEK_SET:
                this.sbais.seekAbsolute(toPosition(offset));
                break;
            case STREAM_SEEK_CUR:
                // computed as long so that the sum is checked before it is narrowed
                this.sbais.seekAbsolute(toPosition(this.sbais.getPosition() + offset));
                break;
            default:
                // other origins are not handled, the position stays where it is
                break;
        }
        return this.sbais.getPosition();
    }

    /**
     * Narrows a position to int; values that are negative or do not fit are rejected
     * instead of being silently wrapped by the cast.
     */
    private static int toPosition(final long position) {
        if (position < 0 || position > Integer.MAX_VALUE) {
            throw new IndexOutOfBoundsException(Long.toString(position));
        }
        return (int) position;
    }

    @Override
    public int read() throws IOException {
        return this.sbais.read();
    }

    @Override
    public int read(final byte[] b, final int off, final int len) throws IOException {
        return this.sbais.read(b, off, len);
    }
}
}