yacy_search_server/test/net/yacy/document/parser/htmlParserTest.java

84 lines
3.0 KiB
Java
Raw Normal View History

package net.yacy.document.parser;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.net.MalformedURLException;
import java.nio.charset.Charset;
import static junit.framework.Assert.assertTrue;
import junit.framework.TestCase;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import org.junit.Test;
public class htmlParserTest extends TestCase {
@Test
public void testGetRealCharsetEncoding() {
String[][] testStrings = new String[][] {
new String[]{null,null},
new String[]{"windows1250","windows-1250"},
new String[]{"windows_1250","windows-1250"},
new String[]{"ISO-8859-1","ISO-8859-1"},
new String[]{"ISO8859-1","ISO-8859-1"},
new String[]{"ISO-88591","ISO-8859-1"},
new String[]{"ISO88591","ISO-8859-1"},
new String[]{"iso_8859_1","ISO-8859-1"},
new String[]{"cp-1252","windows-1252"},
new String[]{"gb_2312","gb2312"}, // was: x-EUC-CN
new String[]{"gb_2312-80","gb2312"}, // was: x-EUC-CN
new String[]{"UTF-8;","UTF-8"}
};
for (int i=0; i < testStrings.length; i++) {
// desired conversion result
String shouldBe = testStrings[i][1];
shouldBe = shouldBe!=null ? shouldBe.toLowerCase() : null;
// conversion result
String charset = htmlParser.patchCharsetEncoding(testStrings[i][0]);
// test if equal
assertEquals(shouldBe, charset!=null ? charset.toLowerCase() : null);
System.out.println("testGetRealCharsetEncoding: " + (testStrings[i][0]!=null?testStrings[i][0]:"null") + " -> " + (charset!=null?charset:"null") + " | Supported: " + (charset!=null?Charset.isSupported(charset):false));
}
}
/**
* Test of parse method, of class htmlParser.
* - test getCharset
*/
@Test
public void testParse() throws MalformedURLException, Parser.Failure, InterruptedException, FileNotFoundException {
System.out.println("htmlParser.parse");
String[] testFiles = {
"umlaute_html_iso.html",
"umlaute_html_utf8.html",
"umlaute_html_namedentities.html"};
final String mimetype = "text/html";
//final String resulttxt = "In München steht ein Hofbräuhaus. Dort gibt es Bier aus Maßkrügen.";
for (String testfile : testFiles) {
final String filename = "test/parsertest/" + testfile;
final File file = new File(filename);
final AnchorURL url = new AnchorURL("http://localhost/" + filename);
System.out.println("parse file: " + filename);
htmlParser p = new htmlParser();
final Document[] docs = p.parse(url, mimetype, null, new FileInputStream(file));
Document doc = docs[0];
String txt = doc.getCharset();
assertTrue("get Charset", txt != null);
System.out.println("detected charset = " + txt);
}
}
}