fix htmlParser <script> text extraction on code containing expression

recognized as a tag, e.g. 1<a
reported in https://github.com/yacy/yacy_search_server/issues/109

Script content is ignored by default, but the text is still filtered for html
tags. Modified the scraper to skip tag filtering while within a <script>
section (until a closing </script> tag is detected).
Possible side effect: a missing </script> end-tag will truncate the trailing
content text.
This commit is contained in:
reger 2017-02-24 01:25:32 +01:00
parent 2f191e0e1c
commit f254fcfc67
3 changed files with 38 additions and 1 deletions

View File

@ -274,7 +274,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
@Override
public void scrapeText(final char[] newtext0, final String insideTag) {
// System.out.println("SCRAPE: " + UTF8.String(newtext));
if (insideTag != null && ("script".equals(insideTag) || "style".equals(insideTag))) return;
if (insideTag != null && (TagName.script.name().equals(insideTag) || TagName.style.name().equals(insideTag))) return;
int p, pl, q, s = 0;
char[] newtext = CharacterCoding.html2unicode(new String(newtext0)).toCharArray();

View File

@ -39,6 +39,7 @@ import java.nio.charset.Charset;
import java.util.Enumeration;
import java.util.Properties;
import java.util.Stack;
import net.yacy.document.parser.html.ContentScraper.TagName;
import net.yacy.kelondro.io.CharBuffer;
@ -199,6 +200,11 @@ public final class TransformerWriter extends Writer {
return filterTag(text, quotechar, tag, false);
}
// don't add text from within <script> section, here e.g. a "if 1<a" expression could confuse tag detection
if (this.tagStack.size()>0 && this.tagStack.lastElement().name.equals(TagName.script.name())) {
return new char[0];
}
// an opening tag
tagend = tagEnd(in, 1);
tag = new String(in, 1, tagend - 1).toLowerCase();

View File

@ -3,6 +3,7 @@ package net.yacy.document.parser;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
@ -141,4 +142,34 @@ public class htmlParserTest extends TestCase {
System.out.println("ScraperTagTest: [" + textSource + "] = [" + txt + "]");
assertEquals(txt, textSource);
}
/**
 * Test for parseToScraper of class htmlParser: scrapes html containing a
 * {@code <script>} section whose code includes expressions that look like
 * an opening tag (e.g. {@code 50<a}), see
 * https://github.com/yacy/yacy_search_server/issues/109
 * <p>
 * The scraped text is expected to consist of the body text only.
 */
@Test
public void testParteToScraper_ScriptTag() throws MalformedURLException, IOException {
    final AnchorURL url = new AnchorURL("http://localhost/");
    final String charset = StandardCharsets.UTF_8.name();
    final String textSource = "test text";

    // extract from test case provided by https://github.com/yacy/yacy_search_server/issues/109
    final String testhtml = "<!doctype html>"
            + "<html class=\"a-no-js\" data-19ax5a9jf=\"dingo\">"
            + "<head><script>var aPageStart = (new Date()).getTime();</script><meta charset=\"utf-8\"><!-- emit CSM JS -->\n"
            + "<script>\n"
            + "function D(){if(E){var a=f.innerWidth?{w:f.innerWidth,h:f.innerHeight}:{w:k.clientWidth,h:k.clientHeight};5<Math.abs(a.w-\n"
            // the 50<a is a possible error case
            + "P.w)||50<a.h-P.h?(P=a,Q=4,(a=l.mobile||l.tablet?450<a.w&&a.w>a.h:1250==a.w)?C(k,\"a-ws\"):ca(k,\"a-ws\")):Q--&&(ea=setTimeout(D,16))}}function na(a){(E=void 0===a?!E:!!a)&&D()}"
            + "</script>\n"
            + "</head>\n"
            + "<body>" + textSource + "</body>\n"
            + "</html>";

    final ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testhtml, 10);

    // fetch the scraped text once and log it next to the expected value
    final String txt = scraper.getText();
    System.out.println("ScraperTagTest: [" + textSource + "] = [" + txt + "]");

    // expected value first, actual second, so a failure message reports them correctly
    assertEquals(textSource, txt);
}
}