Added RSS parser support for a maximum content bytes parsing limit

luccioman 2017-07-12 00:18:12 +02:00
parent 452a17a8d5
commit 651fad6da5
4 changed files with 112 additions and 15 deletions

RSSFeed.java View File

@@ -36,9 +36,16 @@ public class RSSFeed implements Iterable<RSSMessage> {
public static final int DEFAULT_MAXSIZE = 10000;
// class variables
private RSSMessage channel = null; // single required element see http://www.rssboard.org/rss-profile#element-channel
private final Map<String, RSSMessage> messages; // a guid:Item map
/** Single required element see http://www.rssboard.org/rss-profile#element-channel */
private RSSMessage channel = null;
/** A guid:Item map */
private final Map<String, RSSMessage> messages;
private final int maxsize;
/** Set to true when the maxsize messages limit has been exceeded and the exceeding messages have been discarded */
private boolean maxSizeExceeded;
@@ -67,6 +74,7 @@ public class RSSFeed implements Iterable<RSSMessage> {
this.messages = Collections.synchronizedMap(new LinkedHashMap<String, RSSMessage>());
this.channel = null;
this.maxsize = maxsize;
this.maxSizeExceeded = false;
}
/**
@@ -115,7 +123,10 @@ public class RSSFeed implements Iterable<RSSMessage> {
final String guid = item.getGuid();
this.messages.put(guid, item);
// in case that the feed is full (size > maxsize) flush the oldest element
while (this.messages.size() > this.maxsize) pollMessage();
while (this.messages.size() > this.maxsize) {
this.maxSizeExceeded = true;
pollMessage();
}
}
public RSSMessage getMessage(final String guid) {
@@ -130,6 +141,13 @@ public class RSSFeed implements Iterable<RSSMessage> {
public int size() {
return this.messages.size();
}
/**
* @return true when the maxsize messages limit has been exceeded and the exceeding messages have been discarded
*/
public boolean isMaxSizeExceeded() {
return this.maxSizeExceeded;
}
@Override
public Iterator<RSSMessage> iterator() {

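For illustration, a minimal caller-side sketch of the new RSSFeed flag; the package import and the addMessage call are assumptions inferred from the hunks above rather than shown in them, and the 100-item limit is arbitrary.

import java.util.List;

import net.yacy.cora.document.feed.RSSFeed;
import net.yacy.cora.document.feed.RSSMessage;

/** Sketch only: fill a bounded feed and detect that older items were dropped. */
public class RSSFeedLimitSketch {
    public static boolean fillFeed(final List<RSSMessage> incoming) {
        // keep at most 100 items; the RSSFeed(int maxsize) constructor matches the hunk above
        final RSSFeed feed = new RSSFeed(100);
        for (final RSSMessage message : incoming) {
            feed.addMessage(message); // assumed adder: once size() > maxsize, the oldest entry is polled away
        }
        // true when the maxsize limit was exceeded and exceeding messages were discarded
        return feed.isMaxSizeExceeded();
    }
}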
RSSReader.java View File

@@ -30,14 +30,16 @@ import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import net.yacy.cora.document.feed.RSSMessage.Token;
import org.xml.sax.Attributes;
import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import net.yacy.cora.document.feed.RSSMessage.Token;
import net.yacy.cora.util.StreamLimitException;
import net.yacy.cora.util.StrictLimitInputStream;
public class RSSReader extends DefaultHandler {
@@ -47,6 +49,9 @@ public class RSSReader extends DefaultHandler {
private boolean parsingChannel, parsingItem;
private final RSSFeed theChannel;
private Type type;
/** Set to true when a parsing limit has been exceeded during instance construction */
private boolean maxBytesExceeded;
public enum Type { rss, atom, rdf, none }
@@ -57,6 +62,7 @@ public class RSSReader extends DefaultHandler {
this.parsingChannel = false;
this.parsingItem = false;
this.type = Type.none;
this.maxBytesExceeded = false;
}
private static final ThreadLocal<SAXParser> tlSax = new ThreadLocal<SAXParser>();
@@ -91,6 +97,33 @@ public class RSSReader extends DefaultHandler {
throw new IOException (e.getMessage());
}
}
public RSSReader(final int maxsize, final long maxBytes, InputStream stream) throws IOException {
this(maxsize);
if (!(stream instanceof ByteArrayInputStream) && !(stream instanceof BufferedInputStream)) {
stream = new BufferedInputStream(stream);
}
StrictLimitInputStream limitedSource = new StrictLimitInputStream(stream, maxBytes);
try {
final SAXParser saxParser = getParser();
// do not look at external dtd - see: http://www.ibm.com/developerworks/xml/library/x-tipcfsx/index.html
saxParser.getXMLReader().setEntityResolver(new EntityResolver() {
@Override
public InputSource resolveEntity(final String arg0, final String arg1)
throws SAXException, IOException {
return new InputSource(new StringReader(""));
}
});
saxParser.parse(limitedSource, this);
} catch (final SAXException e) {
throw new IOException (e.getMessage());
} catch(StreamLimitException e) {
this.maxBytesExceeded = true;
}
}
public Type getType() {
return this.type;
@@ -177,5 +210,12 @@ public class RSSReader extends DefaultHandler {
public RSSFeed getFeed() {
return this.theChannel;
}
/**
* @return true when a parsing limit has been exceeded during instance construction
*/
public boolean isMaxBytesExceeded() {
return this.maxBytesExceeded;
}
}

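For illustration, a minimal sketch of the byte-limited reading path added here; the constructor is the one shown in the hunk above, while the package imports and the 100-item / 1 MB values are assumptions for the example.

import java.io.IOException;
import java.io.InputStream;

import net.yacy.cora.document.feed.RSSFeed;
import net.yacy.cora.document.feed.RSSReader;

/** Sketch only: parse a feed stream but stop reading after a fixed byte budget. */
public class RSSReaderLimitSketch {
    public static RSSFeed readLimited(final InputStream source) throws IOException {
        // at most 100 items and 1 MB of input; both limits are arbitrary for this sketch
        final RSSReader reader = new RSSReader(100, 1024L * 1024L, source);
        if (reader.isMaxBytesExceeded()) {
            System.out.println("Byte limit hit: the returned feed only covers the first megabyte.");
        }
        return reader.getFeed();
    }
}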
Document.java View File

@@ -955,6 +955,7 @@ dc_rights
final Set<String> languages = new HashSet<>();
double lon = 0.0d, lat = 0.0d;
boolean indexingDenied = false;
boolean partiallyParsed = false;
Date date = null;
String charset = null;
@@ -1015,6 +1016,7 @@ dc_rights
if (doc.dc_language() != null) languages.add(doc.dc_language());
indexingDenied |= doc.indexingDenied;
partiallyParsed |= doc.isPartiallyParsed();
}
// clean up parser data
@@ -1050,6 +1052,7 @@ dc_rights
indexingDenied,
date);
newDoc.setDepth(mindepth);
newDoc.setPartiallyParsed(partiallyParsed);
return newDoc;
}

rssParser.java View File

@@ -74,17 +74,27 @@ public class rssParser extends AbstractParser implements Parser {
throw new Parser.Failure("Load error:" + e.getMessage(), location, e);
}
final RSSFeed feed = rssReader.getFeed();
return rssFeedToDocuments(charset, rssReader.getFeed());
}
/**
* Create parsed documents from the given feed.
* @param charset the charset name of the feed, if known
* @param feed the feed instance
* @return an array of documents : a document per feed item
*/
private Document[] rssFeedToDocuments(final String charset, final RSSFeed feed) {
//RSSMessage channel = feed.getChannel();
final List<Document> docs = new ArrayList<Document>();
DigestURL itemuri;
Set<String> languages;
Document doc;
for (final Hit item: feed) try {
itemuri = new DigestURL(item.getLink());
languages = new HashSet<String>();
languages.add(item.getLanguage());
doc = new Document(
for (final Hit item: feed) {
try {
itemuri = new DigestURL(item.getLink());
languages = new HashSet<String>();
languages.add(item.getLanguage());
doc = new Document(
itemuri,
TextParser.mimeOf(itemuri),
charset,
@@ -104,14 +114,40 @@ public class rssParser extends AbstractParser implements Parser {
new LinkedHashMap<DigestURL, ImageEntry>(),
false,
item.getPubDate());
docs.add(doc);
} catch (final MalformedURLException e) {
continue;
}
docs.add(doc);
} catch (final MalformedURLException e) {
continue;
}
}
final Document[] da = new Document[docs.size()];
docs.toArray(da);
return da;
}
@Override
public boolean isParseWithLimitsSupported() {
return true;
}
@Override
public Document[] parseWithLimits(final DigestURL url, final String mimeType, final String charset, final VocabularyScraper scraper,
final int timezoneOffset, final InputStream source, final int maxLinks, final long maxBytes)
throws Failure, InterruptedException, UnsupportedOperationException {
RSSReader rssReader;
try {
rssReader = new RSSReader(maxLinks, maxBytes, source);
} catch (final IOException e) {
throw new Parser.Failure("Load error:" + e.getMessage(), url, e);
}
Document[] documents = rssFeedToDocuments(charset, rssReader.getFeed());
if (documents != null && documents.length > 0
&& (rssReader.isMaxBytesExceeded() || rssReader.getFeed().isMaxSizeExceeded())) {
/* A limit has been exceeded: mark the last document as partially parsed to inform the caller */
documents[documents.length - 1].setPartiallyParsed(true);
}
return documents;
}
}
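
Finally, a hedged caller-side sketch of the new parseWithLimits entry point; the mime type, charset, limit values, and the null VocabularyScraper argument are illustrative assumptions, and the package imports follow the class names used in the diff.

import java.io.InputStream;

import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.parser.rssParser;

/** Sketch only: parse a feed with both an item limit and a byte limit, then check for truncation. */
public class ParseWithLimitsSketch {
    public static Document[] parseFeed(final DigestURL url, final InputStream source)
            throws Parser.Failure, InterruptedException {
        final rssParser parser = new rssParser();
        if (!parser.isParseWithLimitsSupported()) {
            throw new UnsupportedOperationException("limits are not supported by this parser");
        }
        final Document[] docs = parser.parseWithLimits(url, "application/rss+xml", "UTF-8",
                null /* VocabularyScraper, assumed optional here */, 0 /* timezoneOffset */,
                source, 100 /* maxLinks */, 1024L * 1024L /* maxBytes */);
        if (docs != null && docs.length > 0 && docs[docs.length - 1].isPartiallyParsed()) {
            System.out.println("A limit was exceeded: the last document is marked partially parsed.");
        }
        return docs;
    }
}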