Also handle text content when parsing XML within limits.

This commit is contained in:
luccioman 2017-08-14 14:47:01 +02:00
parent f38fb7f02c
commit acab6a6def
2 changed files with 16 additions and 2 deletions

View File

@ -193,11 +193,17 @@ public class GenericXMLParser extends AbstractParser implements Parser {
} catch(StreamLimitException e) {
limitExceeded = true;
}
if (writer.isOverflow()) {
throw new Parser.Failure("Not enough Memory available for generic the XML parser : "
+ Formatter.bytesToString(availableMemory), location);
}
/* create the parsed document with empty text content */
/* Create the parsed document, possibly holding only part of the text and links */
final byte[] contentBytes = UTF8.getBytes(writer.toString());
Document[] docs = new Document[] { new Document(location, mimeType, detectedCharset, this, null, null, null, null, "",
null, null, 0.0d, 0.0d, new byte[0], detectedURLs, null, null, false, new Date()) };
null, null, 0.0d, 0.0d, contentBytes, detectedURLs, null, null, false, new Date()) };
docs[0].setPartiallyParsed(limitExceeded);
return docs;
} catch (final Exception e) {

View File

@ -390,6 +390,8 @@ public class GenericXMLParserTest {
assertEquals(1, documents.length);
assertFalse(documents[0].isPartiallyParsed());
assertTrue(documents[0].getTextString().contains("And this is a relative link"));
Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
assertNotNull(detectedAnchors);
assertEquals(5, detectedAnchors.size());
@ -410,6 +412,9 @@ public class GenericXMLParserTest {
assertEquals(1, documents.length);
assertTrue(documents[0].isPartiallyParsed());
assertTrue(documents[0].getTextString().contains("Home page"));
assertFalse(documents[0].getTextString().contains("And this is a relative link"));
Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
assertNotNull(detectedAnchors);
assertEquals(2, detectedAnchors.size());
@ -447,6 +452,9 @@ public class GenericXMLParserTest {
assertEquals(1, documents.length);
assertTrue(documents[0].isPartiallyParsed());
assertTrue(documents[0].getTextString().contains("and this is a mention to a relative URL"));
assertFalse(documents[0].getTextString().contains("And this is a relative link to another"));
Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
assertNotNull(detectedAnchors);
assertEquals(3, detectedAnchors.size());