mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
Added RSS parser support for maximum content bytes parsing limit
This commit is contained in:
parent
452a17a8d5
commit
651fad6da5
|
@ -36,9 +36,16 @@ public class RSSFeed implements Iterable<RSSMessage> {
|
|||
public static final int DEFAULT_MAXSIZE = 10000;
|
||||
|
||||
// class variables
|
||||
private RSSMessage channel = null; // single required element see http://www.rssboard.org/rss-profile#element-channel
|
||||
private final Map<String, RSSMessage> messages; // a guid:Item map
|
||||
|
||||
/** Single required element see http://www.rssboard.org/rss-profile#element-channel */
|
||||
private RSSMessage channel = null;
|
||||
|
||||
/** A guid:Item map */
|
||||
private final Map<String, RSSMessage> messages;
|
||||
private final int maxsize;
|
||||
|
||||
/** Set to true when maxsize messages limit has been exceeded and exceeding messages have been discarded */
|
||||
private boolean maxSizeExceeded;
|
||||
|
||||
|
||||
|
||||
|
@ -67,6 +74,7 @@ public class RSSFeed implements Iterable<RSSMessage> {
|
|||
this.messages = Collections.synchronizedMap(new LinkedHashMap<String, RSSMessage>());
|
||||
this.channel = null;
|
||||
this.maxsize = maxsize;
|
||||
this.maxSizeExceeded = false;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -115,7 +123,10 @@ public class RSSFeed implements Iterable<RSSMessage> {
|
|||
final String guid = item.getGuid();
|
||||
this.messages.put(guid, item);
|
||||
// in case that the feed is full (size > maxsize) flush the oldest element
|
||||
while (this.messages.size() > this.maxsize) pollMessage();
|
||||
while (this.messages.size() > this.maxsize) {
|
||||
this.maxSizeExceeded = true;
|
||||
pollMessage();
|
||||
}
|
||||
}
|
||||
|
||||
public RSSMessage getMessage(final String guid) {
|
||||
|
@ -130,6 +141,13 @@ public class RSSFeed implements Iterable<RSSMessage> {
|
|||
public int size() {
|
||||
return this.messages.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return true when maxsize messages limit has been exceeded and exceeding messages have been discarded
|
||||
*/
|
||||
public boolean isMaxSizeExceeded() {
|
||||
return this.maxSizeExceeded;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<RSSMessage> iterator() {
|
||||
|
|
|
@ -30,14 +30,16 @@ import javax.xml.parsers.ParserConfigurationException;
|
|||
import javax.xml.parsers.SAXParser;
|
||||
import javax.xml.parsers.SAXParserFactory;
|
||||
|
||||
import net.yacy.cora.document.feed.RSSMessage.Token;
|
||||
|
||||
import org.xml.sax.Attributes;
|
||||
import org.xml.sax.EntityResolver;
|
||||
import org.xml.sax.InputSource;
|
||||
import org.xml.sax.SAXException;
|
||||
import org.xml.sax.helpers.DefaultHandler;
|
||||
|
||||
import net.yacy.cora.document.feed.RSSMessage.Token;
|
||||
import net.yacy.cora.util.StreamLimitException;
|
||||
import net.yacy.cora.util.StrictLimitInputStream;
|
||||
|
||||
|
||||
public class RSSReader extends DefaultHandler {
|
||||
|
||||
|
@ -47,6 +49,9 @@ public class RSSReader extends DefaultHandler {
|
|||
private boolean parsingChannel, parsingItem;
|
||||
private final RSSFeed theChannel;
|
||||
private Type type;
|
||||
|
||||
/** When a parsing limit on instance construction has been exceeded */
|
||||
private boolean maxBytesExceeded;
|
||||
|
||||
public enum Type { rss, atom, rdf, none }
|
||||
|
||||
|
@ -57,6 +62,7 @@ public class RSSReader extends DefaultHandler {
|
|||
this.parsingChannel = false;
|
||||
this.parsingItem = false;
|
||||
this.type = Type.none;
|
||||
this.maxBytesExceeded = false;
|
||||
}
|
||||
|
||||
private static final ThreadLocal<SAXParser> tlSax = new ThreadLocal<SAXParser>();
|
||||
|
@ -91,6 +97,33 @@ public class RSSReader extends DefaultHandler {
|
|||
throw new IOException (e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Create a reader and parse the given stream, reading it through a
 * {@link StrictLimitInputStream} configured with maxBytes.
 * <p>
 * A {@link StreamLimitException} raised while parsing is not propagated:
 * it is caught, {@link #isMaxBytesExceeded()} then returns true and the
 * construction completes normally.
 * </p>
 *
 * @param maxsize  passed to the maxsize-only constructor of this class
 * @param maxBytes limit handed to the StrictLimitInputStream wrapping the stream
 *                 (presumably the maximum number of bytes that may be read - confirm
 *                 against StrictLimitInputStream)
 * @param stream   the feed content; wrapped in a BufferedInputStream unless it is
 *                 already a ByteArrayInputStream or a BufferedInputStream.
 *                 The stream is not closed here.
 * @throws IOException when the stream can not be read, or when the SAX parser
 *                     reports an error (the SAXException is kept as the cause)
 */
public RSSReader(final int maxsize, final long maxBytes, InputStream stream) throws IOException {
    this(maxsize);

    if (!(stream instanceof ByteArrayInputStream) && !(stream instanceof BufferedInputStream)) {
        stream = new BufferedInputStream(stream);
    }

    final StrictLimitInputStream limitedSource = new StrictLimitInputStream(stream, maxBytes);

    try {
        final SAXParser saxParser = getParser();
        // do not look at external dtd - see: http://www.ibm.com/developerworks/xml/library/x-tipcfsx/index.html
        saxParser.getXMLReader().setEntityResolver(new EntityResolver() {
            @Override
            public InputSource resolveEntity(final String arg0, final String arg1)
                    throws SAXException, IOException {
                // every external entity resolves to an empty document
                return new InputSource(new StringReader(""));
            }
        });
        saxParser.parse(limitedSource, this);
    } catch (final SAXException e) {
        // keep the parser exception as cause so that its stack trace is not lost
        throw new IOException(e.getMessage(), e);
    } catch (final StreamLimitException e) {
        // the limit has been reached : remember it instead of failing
        this.maxBytesExceeded = true;
    }
}
|
||||
|
||||
public Type getType() {
|
||||
return this.type;
|
||||
|
@ -177,5 +210,12 @@ public class RSSReader extends DefaultHandler {
|
|||
public RSSFeed getFeed() {
|
||||
return this.theChannel;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return true when a parsing limit on instance construction has been exceeded
|
||||
*/
|
||||
public boolean isMaxBytesExceeded() {
|
||||
return this.maxBytesExceeded;
|
||||
}
|
||||
|
||||
}
|
|
@ -955,6 +955,7 @@ dc_rights
|
|||
final Set<String> languages = new HashSet<>();
|
||||
double lon = 0.0d, lat = 0.0d;
|
||||
boolean indexingDenied = false;
|
||||
boolean partiallyParsed = false;
|
||||
Date date = null;
|
||||
String charset = null;
|
||||
|
||||
|
@ -1015,6 +1016,7 @@ dc_rights
|
|||
if (doc.dc_language() != null) languages.add(doc.dc_language());
|
||||
|
||||
indexingDenied |= doc.indexingDenied;
|
||||
partiallyParsed |= doc.isPartiallyParsed();
|
||||
}
|
||||
|
||||
// clean up parser data
|
||||
|
@ -1050,6 +1052,7 @@ dc_rights
|
|||
indexingDenied,
|
||||
date);
|
||||
newDoc.setDepth(mindepth);
|
||||
newDoc.setPartiallyParsed(partiallyParsed);
|
||||
return newDoc;
|
||||
}
|
||||
|
||||
|
|
|
@ -74,17 +74,27 @@ public class rssParser extends AbstractParser implements Parser {
|
|||
throw new Parser.Failure("Load error:" + e.getMessage(), location, e);
|
||||
}
|
||||
|
||||
final RSSFeed feed = rssReader.getFeed();
|
||||
return rssFeedToDocuments(charset, rssReader.getFeed());
|
||||
}
|
||||
|
||||
/**
|
||||
* Create parsed documents from the given feed.
|
||||
* @param charset the charset name of the feed, if known
|
||||
* @param feed the feed instance
|
||||
* @return an array of documents : a document per feed item
|
||||
*/
|
||||
private Document[] rssFeedToDocuments(final String charset, final RSSFeed feed) {
|
||||
//RSSMessage channel = feed.getChannel();
|
||||
final List<Document> docs = new ArrayList<Document>();
|
||||
DigestURL itemuri;
|
||||
Set<String> languages;
|
||||
Document doc;
|
||||
for (final Hit item: feed) try {
|
||||
itemuri = new DigestURL(item.getLink());
|
||||
languages = new HashSet<String>();
|
||||
languages.add(item.getLanguage());
|
||||
doc = new Document(
|
||||
for (final Hit item: feed) {
|
||||
try {
|
||||
itemuri = new DigestURL(item.getLink());
|
||||
languages = new HashSet<String>();
|
||||
languages.add(item.getLanguage());
|
||||
doc = new Document(
|
||||
itemuri,
|
||||
TextParser.mimeOf(itemuri),
|
||||
charset,
|
||||
|
@ -104,14 +114,40 @@ public class rssParser extends AbstractParser implements Parser {
|
|||
new LinkedHashMap<DigestURL, ImageEntry>(),
|
||||
false,
|
||||
item.getPubDate());
|
||||
docs.add(doc);
|
||||
} catch (final MalformedURLException e) {
|
||||
continue;
|
||||
}
|
||||
docs.add(doc);
|
||||
} catch (final MalformedURLException e) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
final Document[] da = new Document[docs.size()];
|
||||
docs.toArray(da);
|
||||
return da;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isParseWithLimitsSupported() {
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Document[] parseWithLimits(final DigestURL url, final String mimeType, final String charset, final VocabularyScraper scraper,
|
||||
final int timezoneOffset, final InputStream source, final int maxLinks, final long maxBytes)
|
||||
throws Failure, InterruptedException, UnsupportedOperationException {
|
||||
RSSReader rssReader;
|
||||
try {
|
||||
rssReader = new RSSReader(maxLinks, maxBytes, source);
|
||||
} catch (final IOException e) {
|
||||
throw new Parser.Failure("Load error:" + e.getMessage(), url, e);
|
||||
}
|
||||
|
||||
Document[] documents = rssFeedToDocuments(charset, rssReader.getFeed());
|
||||
if (documents != null && documents.length > 0
|
||||
&& (rssReader.isMaxBytesExceeded() || rssReader.getFeed().isMaxSizeExceeded())) {
|
||||
/* A limit has been exceeded : mark the last document as partially parsed for information of the caller */
|
||||
documents[documents.length - 1].setPartiallyParsed(true);
|
||||
}
|
||||
return documents;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user