fix htmlParser <script> text extraction on code containing an expression

recognized as a tag, like 1<a, reported in https://github.com/yacy/yacy_search_server/issues/109. Script content is ignored by default, but the text is still filtered for HTML tags. Modified the scraper to skip tag filtering while within a <script> section (until a closing </script> tag is detected). Possible side effect: a missing </script> end tag will truncate the trailing content text.
2024-09-19 00:01:41 +02:00 · 2017-02-24 01:25:32 +01:00 · 2017-02-24 01:25:32 +01:00 · f254fcfc67
commit f254fcfc67
parent 2f191e0e1c
3 changed files with 38 additions and 1 deletions
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@ -274,7 +274,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
    @Override
    public void scrapeText(final char[] newtext0, final String insideTag) {
        // System.out.println("SCRAPE: " + UTF8.String(newtext));
-        if (insideTag != null && ("script".equals(insideTag) || "style".equals(insideTag))) return;
+        if (insideTag != null && (TagName.script.name().equals(insideTag) || TagName.style.name().equals(insideTag))) return;
        int p, pl, q, s = 0;
        char[] newtext = CharacterCoding.html2unicode(new String(newtext0)).toCharArray();
        
--- a/source/net/yacy/document/parser/html/TransformerWriter.java
+++ b/source/net/yacy/document/parser/html/TransformerWriter.java
@ -39,6 +39,7 @@ import java.nio.charset.Charset;
 import java.util.Enumeration;
 import java.util.Properties;
 import java.util.Stack;
+import net.yacy.document.parser.html.ContentScraper.TagName;

 import net.yacy.kelondro.io.CharBuffer;

@ -199,6 +200,11 @@ public final class TransformerWriter extends Writer {
            return filterTag(text, quotechar, tag, false);
        }

+        // don't add text from within <script> section, here e.g. a "if 1<a" expression could confuse tag detection
+        if (this.tagStack.size()>0 && this.tagStack.lastElement().name.equals(TagName.script.name())) {
+            return new char[0];
+        }
+
        // an opening tag
        tagend = tagEnd(in, 1);
        tag = new String(in, 1, tagend - 1).toLowerCase();
--- a/test/java/net/yacy/document/parser/htmlParserTest.java
+++ b/test/java/net/yacy/document/parser/htmlParserTest.java
@ -3,6 +3,7 @@ package net.yacy.document.parser;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileNotFoundException;
+import java.io.IOException;
 import java.net.MalformedURLException;
 import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
@ -141,4 +142,34 @@ public class htmlParserTest extends TestCase {
        System.out.println("ScraperTagTest: [" + textSource + "] = [" + txt + "]");
        assertEquals(txt, textSource);
    }
+
+    /**
+     * Tests htmlParser.parseToScraper on html whose <script> section contains
+     * code that resembles an opening tag (such as "50<a"); the scraped text is
+     * expected to equal the body text only. See https://github.com/yacy/yacy_search_server/issues/109
+     */
+    @Test
+    public void testParteToScraper_ScriptTag() throws MalformedURLException, IOException {
+        final AnchorURL url = new AnchorURL("http://localhost/");
+        final String charset = StandardCharsets.UTF_8.name();
+        final String textSource = "test text";
+        // excerpt of the test case provided in https://github.com/yacy/yacy_search_server/issues/109
+        String testhtml = "<!doctype html>"
+                + "<html class=\"a-no-js\" data-19ax5a9jf=\"dingo\">"
+                + "<head><script>var aPageStart = (new Date()).getTime();</script><meta charset=\"utf-8\"><!--  emit CSM JS -->\n"
+                + "<script>\n"
+                + "function D(){if(E){var a=f.innerWidth?{w:f.innerWidth,h:f.innerHeight}:{w:k.clientWidth,h:k.clientHeight};5<Math.abs(a.w-\n"
+                // the "50<a" in the next line is the kind of expression that can be mistaken for an opening <a> tag
+                + "P.w)||50<a.h-P.h?(P=a,Q=4,(a=l.mobile||l.tablet?450<a.w&&a.w>a.h:1250==a.w)?C(k,\"a-ws\"):ca(k,\"a-ws\")):Q--&&(ea=setTimeout(D,16))}}function na(a){(E=void 0===a?!E:!!a)&&D()}"
+                + "</script>\n"
+                + "</head>\n"
+                + "<body>" + textSource + "</body>\n"
+                + "</html>";
+        ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testhtml, 10);
+
+        System.out.println(scraper.getText());
+        String txt = scraper.getText();
+        System.out.println("ScraperTagTest: [" + textSource + "] = [" + txt + "]");
+        assertEquals(txt, textSource);
+    }
 }