Added RSS parser support for a maximum content bytes parsing limit

luccioman 2017-07-12 00:18:12 +02:00
parent 452a17a8d5
commit 651fad6da5
4 changed files with 112 additions and 15 deletions

RSSFeed.java View File

@@ -36,9 +36,16 @@ public class RSSFeed implements Iterable<RSSMessage> {
public static final int DEFAULT_MAXSIZE = 10000;
// class variables
private RSSMessage channel = null; // single required element see http://www.rssboard.org/rss-profile#element-channel
private final Map<String, RSSMessage> messages; // a guid:Item map
/** Single required element see http://www.rssboard.org/rss-profile#element-channel */
private RSSMessage channel = null;
/** A guid:Item map */
private final Map<String, RSSMessage> messages;
private final int maxsize;
/** Set to true when the maxsize messages limit has been exceeded and the exceeding messages have been discarded */
private boolean maxSizeExceeded;
@@ -67,6 +74,7 @@ public class RSSFeed implements Iterable<RSSMessage> {
this.messages = Collections.synchronizedMap(new LinkedHashMap<String, RSSMessage>());
this.channel = null;
this.maxsize = maxsize;
this.maxSizeExceeded = false;
}
/**
@@ -115,7 +123,10 @@ public class RSSFeed implements Iterable<RSSMessage> {
final String guid = item.getGuid();
this.messages.put(guid, item);
// in case that the feed is full (size > maxsize) flush the oldest element
while (this.messages.size() > this.maxsize) pollMessage();
while (this.messages.size() > this.maxsize) {
this.maxSizeExceeded = true;
pollMessage();
}
}
public RSSMessage getMessage(final String guid) {
@@ -130,6 +141,13 @@ public class RSSFeed implements Iterable<RSSMessage> {
public int size() {
return this.messages.size();
}
/**
* @return true when the maxsize messages limit has been exceeded and the exceeding messages have been discarded
*/
public boolean isMaxSizeExceeded() {
return this.maxSizeExceeded;
}
@Override
public Iterator<RSSMessage> iterator() {

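For illustration, a minimal caller-side sketch of the new RSSFeed flag; the package import and the addMessage call are assumptions inferred from the hunks above rather than shown in them, and the 100-item limit is arbitrary.

import java.util.List;

import net.yacy.cora.document.feed.RSSFeed;
import net.yacy.cora.document.feed.RSSMessage;

/** Sketch only: fill a bounded feed and detect that older items were dropped. */
public class RSSFeedLimitSketch {
    public static boolean fillFeed(final List<RSSMessage> incoming) {
        // keep at most 100 items; the RSSFeed(int maxsize) constructor matches the hunk above
        final RSSFeed feed = new RSSFeed(100);
        for (final RSSMessage message : incoming) {
            feed.addMessage(message); // assumed adder: once size() > maxsize, the oldest entry is polled away
        }
        // true when the maxsize limit was exceeded and exceeding messages were discarded
        return feed.isMaxSizeExceeded();
    }
}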
RSSReader.java View File

@@ -30,14 +30,16 @@ import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import net.yacy.cora.document.feed.RSSMessage.Token;
import org.xml.sax.Attributes;
import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import net.yacy.cora.document.feed.RSSMessage.Token;
import net.yacy.cora.util.StreamLimitException;
import net.yacy.cora.util.StrictLimitInputStream;
public class RSSReader extends DefaultHandler {
@@ -47,6 +49,9 @@ public class RSSReader extends DefaultHandler {
private boolean parsingChannel, parsingItem;
private final RSSFeed theChannel;
private Type type;
/** Set to true when a parsing limit has been exceeded during instance construction */
private boolean maxBytesExceeded;
public enum Type { rss, atom, rdf, none }
@@ -57,6 +62,7 @@ public class RSSReader extends DefaultHandler {
this.parsingChannel = false;
this.parsingItem = false;
this.type = Type.none;
this.maxBytesExceeded = false;
}
private static final ThreadLocal<SAXParser> tlSax = new ThreadLocal<SAXParser>();
@@ -91,6 +97,33 @@ public class RSSReader extends DefaultHandler {
throw new IOException (e.getMessage());
}
}
public RSSReader(final int maxsize, final long maxBytes, InputStream stream) throws IOException {
this(maxsize);
if (!(stream instanceof ByteArrayInputStream) && !(stream instanceof BufferedInputStream)) {
stream = new BufferedInputStream(stream);
}
StrictLimitInputStream limitedSource = new StrictLimitInputStream(stream, maxBytes);
try {
final SAXParser saxParser = getParser();
// do not look at external dtd - see: http://www.ibm.com/developerworks/xml/library/x-tipcfsx/index.html
saxParser.getXMLReader().setEntityResolver(new EntityResolver() {
@Override
public InputSource resolveEntity(final String arg0, final String arg1)
throws SAXException, IOException {
return new InputSource(new StringReader(""));
}
});
saxParser.parse(limitedSource, this);
} catch (final SAXException e) {
throw new IOException (e.getMessage());
} catch(StreamLimitException e) {
this.maxBytesExceeded = true;
}
}
public Type getType() {
return this.type;
@@ -177,5 +210,12 @@ public class RSSReader extends DefaultHandler {
public RSSFeed getFeed() {
return this.theChannel;
}
/**
* @return true when a parsing limit has been exceeded during instance construction
*/
public boolean isMaxBytesExceeded() {
return this.maxBytesExceeded;
}
}

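For illustration, a minimal sketch of the byte-limited reading path added here; the constructor is the one shown in the hunk above, while the package imports and the 100-item / 1 MB values are assumptions for the example.

import java.io.IOException;
import java.io.InputStream;

import net.yacy.cora.document.feed.RSSFeed;
import net.yacy.cora.document.feed.RSSReader;

/** Sketch only: parse a feed stream but stop reading after a fixed byte budget. */
public class RSSReaderLimitSketch {
    public static RSSFeed readLimited(final InputStream source) throws IOException {
        // at most 100 items and 1 MB of input; both limits are arbitrary for this sketch
        final RSSReader reader = new RSSReader(100, 1024L * 1024L, source);
        if (reader.isMaxBytesExceeded()) {
            System.out.println("Byte limit hit: the returned feed only covers the first megabyte.");
        }
        return reader.getFeed();
    }
}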
Document.java View File

@@ -955,6 +955,7 @@ dc_rights
final Set<String> languages = new HashSet<>();
double lon = 0.0d, lat = 0.0d;
boolean indexingDenied = false;
boolean partiallyParsed = false;
Date date = null;
String charset = null;
@@ -1015,6 +1016,7 @@ dc_rights
if (doc.dc_language() != null) languages.add(doc.dc_language());
indexingDenied |= doc.indexingDenied;
partiallyParsed |= doc.isPartiallyParsed();
}
// clean up parser data
@@ -1050,6 +1052,7 @@ dc_rights
indexingDenied,
date);
newDoc.setDepth(mindepth);
newDoc.setPartiallyParsed(partiallyParsed);
return newDoc;
}

rssParser.java View File

@@ -74,17 +74,27 @@ public class rssParser extends AbstractParser implements Parser {
throw new Parser.Failure("Load error:" + e.getMessage(), location, e);
}
final RSSFeed feed = rssReader.getFeed();
return rssFeedToDocuments(charset, rssReader.getFeed());
}
/**
* Create parsed documents from the given feed.
* @param charset the charset name of the feed, if known
* @param feed the feed instance
* @return an array of documents : a document per feed item
*/
private Document[] rssFeedToDocuments(final String charset, final RSSFeed feed) {
//RSSMessage channel = feed.getChannel();
final List<Document> docs = new ArrayList<Document>();
DigestURL itemuri;
Set<String> languages;
Document doc;
for (final Hit item: feed) try {
itemuri = new DigestURL(item.getLink());
languages = new HashSet<String>();
languages.add(item.getLanguage());
doc = new Document(
for (final Hit item: feed) {
try {
itemuri = new DigestURL(item.getLink());
languages = new HashSet<String>();
languages.add(item.getLanguage());
doc = new Document(
itemuri,
TextParser.mimeOf(itemuri),
charset,
@@ -104,14 +114,40 @@ public class rssParser extends AbstractParser implements Parser {
new LinkedHashMap<DigestURL, ImageEntry>(),
false,
item.getPubDate());
docs.add(doc);
} catch (final MalformedURLException e) {
continue;
}
docs.add(doc);
} catch (final MalformedURLException e) {
continue;
}
}
final Document[] da = new Document[docs.size()];
docs.toArray(da);
return da;
}
@Override
public boolean isParseWithLimitsSupported() {
return true;
}
@Override
public Document[] parseWithLimits(final DigestURL url, final String mimeType, final String charset, final VocabularyScraper scraper,
final int timezoneOffset, final InputStream source, final int maxLinks, final long maxBytes)
throws Failure, InterruptedException, UnsupportedOperationException {
RSSReader rssReader;
try {
rssReader = new RSSReader(maxLinks, maxBytes, source);
} catch (final IOException e) {
throw new Parser.Failure("Load error:" + e.getMessage(), url, e);
}
Document[] documents = rssFeedToDocuments(charset, rssReader.getFeed());
if (documents != null && documents.length > 0
&& (rssReader.isMaxBytesExceeded() || rssReader.getFeed().isMaxSizeExceeded())) {
/* A limit has been exceeded: mark the last document as partially parsed to inform the caller */
documents[documents.length - 1].setPartiallyParsed(true);
}
return documents;
}
}
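
Finally, a hedged caller-side sketch of the new parseWithLimits entry point; the mime type, charset, limit values, and the null VocabularyScraper argument are illustrative assumptions, and the package imports follow the class names used in the diff.

import java.io.InputStream;

import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.parser.rssParser;

/** Sketch only: parse a feed with both an item limit and a byte limit, then check for truncation. */
public class ParseWithLimitsSketch {
    public static Document[] parseFeed(final DigestURL url, final InputStream source)
            throws Parser.Failure, InterruptedException {
        final rssParser parser = new rssParser();
        if (!parser.isParseWithLimitsSupported()) {
            throw new UnsupportedOperationException("limits are not supported by this parser");
        }
        final Document[] docs = parser.parseWithLimits(url, "application/rss+xml", "UTF-8",
                null /* VocabularyScraper, assumed optional here */, 0 /* timezoneOffset */,
                source, 100 /* maxLinks */, 1024L * 1024L /* maxBytes */);
        if (docs != null && docs.length > 0 && docs[docs.length - 1].isPartiallyParsed()) {
            System.out.println("A limit was exceeded: the last document is marked partially parsed.");
        }
        return docs;
    }
}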