package net.yacy.document.parser;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.util.Locale;

import static junit.framework.Assert.assertTrue;
import junit.framework.TestCase;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import org.junit.Test;

/**
 * Unit tests for {@link htmlParser}: charset-name patching and charset
 * detection on a small set of HTML fixture files.
 */
public class htmlParserTest extends TestCase {

    /**
     * Checks that {@code htmlParser.patchCharsetEncoding} maps each raw charset
     * label in the table below to the expected name. The comparison ignores
     * letter case; a {@code null} input is expected to yield {@code null}.
     */
    @Test
    public void testGetRealCharsetEncoding() {
        // each row: { raw label handed to the parser, expected patched name }
        final String[][] testStrings = new String[][] {
            new String[]{null, null},
            new String[]{"windows1250", "windows-1250"},
            new String[]{"windows_1250", "windows-1250"},
            new String[]{"ISO-8859-1", "ISO-8859-1"},
            new String[]{"ISO8859-1", "ISO-8859-1"},
            new String[]{"ISO-88591", "ISO-8859-1"},
            new String[]{"ISO88591", "ISO-8859-1"},
            new String[]{"iso_8859_1", "ISO-8859-1"},
            new String[]{"cp-1252", "windows-1252"},
            new String[]{"gb_2312", "gb2312"}, // was: x-EUC-CN
            new String[]{"gb_2312-80", "gb2312"}, // was: x-EUC-CN
            new String[]{"UTF-8;", "UTF-8"}
        };

        for (final String[] row : testStrings) {
            final String input = row[0];
            // desired conversion result
            final String shouldBe = lowerCaseOrNull(row[1]);
            // actual conversion result
            final String charset = htmlParser.patchCharsetEncoding(input);

            // compare case-insensitively; the message names the offending input
            assertEquals("patchCharsetEncoding(" + input + ")", shouldBe, lowerCaseOrNull(charset));

            // string concatenation already renders a null reference as "null"
            System.out.println("testGetRealCharsetEncoding: " + input
                    + " -> " + charset
                    + " | Supported: " + (charset != null && Charset.isSupported(charset)));
        }
    }

    /**
     * Test of parse method, of class htmlParser.
     * - test getCharset
     *
     * Parses every fixture file below {@code test/parsertest/} and asserts that
     * the first resulting document reports a non-null charset.
     *
     * @throws IOException if a fixture file cannot be opened or closed
     */
    @Test
    public void testParse() throws MalformedURLException, Parser.Failure, InterruptedException,
            FileNotFoundException, IOException {
        System.out.println("htmlParser.parse");
        final String[] testFiles = {
            "umlaute_html_iso.html",
            "umlaute_html_utf8.html",
            "umlaute_html_namedentities.html"};
        final String mimetype = "text/html";
        // NOTE(review): presumably the text contained in the fixtures; kept for reference, not asserted:
        //final String resulttxt = "In München steht ein Hofbräuhaus. Dort gibt es Bier aus Maßkrügen.";

        final htmlParser p = new htmlParser();
        for (final String testfile : testFiles) {
            final String filename = "test/parsertest/" + testfile;
            final File file = new File(filename);
            final AnchorURL url = new AnchorURL("http://localhost/" + filename);
            System.out.println("parse file: " + filename);

            final Document[] docs;
            // close the fixture stream even when parsing fails
            try (FileInputStream in = new FileInputStream(file)) {
                docs = p.parse(url, mimetype, null, in);
            }

            // fail with a readable message instead of an NPE / index error
            assertNotNull("parser returned no documents for " + filename, docs);
            assertTrue("parser returned an empty document array for " + filename, docs.length > 0);

            final Document doc = docs[0];
            final String txt = doc.getCharset();
            assertTrue("get Charset", txt != null);
            System.out.println("detected charset = " + txt);
        }
    }

    /** Lower-cases {@code s} independently of the default locale; passes {@code null} through. */
    private static String lowerCaseOrNull(final String s) {
        return s == null ? null : s.toLowerCase(Locale.ROOT);
    }
}