yacy_search_server/test/de/anomic/document/parser/htmlParserTest.java

package de.anomic.document.parser;

import java.nio.charset.Charset;
import java.util.Locale;

import net.yacy.document.parser.htmlParser;

import junit.framework.TestCase;

/**
 * Unit test for the charset-name patching done by
 * {@link htmlParser#patchCharsetEncoding(String)}.
 */
public class htmlParserTest extends TestCase {
	
	/**
	 * Runs a table of raw charset labels through
	 * {@code htmlParser.patchCharsetEncoding} and checks, ignoring case, that
	 * each one is patched to the expected name. A {@code null} label is
	 * expected to be patched to UTF-8.
	 */
	public void testGetRealCharsetEncoding() {
		// each row: { raw charset label (input), expected patched charset name }
		String[][] testStrings = new String[][] {
	       new String[]{null,"UTF-8"},
	       new String[]{"windows1250","windows-1250"},
	       new String[]{"windows_1250","windows-1250"},
	       new String[]{"ISO-8859-1","ISO-8859-1"},
	       new String[]{"ISO8859-1","ISO-8859-1"},
	       new String[]{"ISO-88591","ISO-8859-1"},
	       new String[]{"ISO88591","ISO-8859-1"},
	       new String[]{"iso_8859_1","ISO-8859-1"},
	       new String[]{"cp-1252","windows-1252"},
	       new String[]{"gb_2312","gb2312"},           // was: x-EUC-CN
	       new String[]{"gb_2312-80","gb2312"},           // was: x-EUC-CN
	       new String[]{"UTF-8;","UTF-8"}
		};
		
		for (int i=0; i < testStrings.length; i++) {
			String input = testStrings[i][0];
			
			// desired conversion result; Locale.ROOT keeps the lowercasing
			// independent of the JVM default locale (e.g. Turkish maps 'I'
			// to a dotless i, which is not a legal charset name character)
			String shouldBe = testStrings[i][1].toLowerCase(Locale.ROOT);
			
			// conversion result
			String patched = htmlParser.patchCharsetEncoding(input);
			assertNotNull("patchCharsetEncoding returned null for input: " + input, patched);
			String charset = patched.toLowerCase(Locale.ROOT);
			
			// test if equal
			assertEquals("patched charset for input: " + input, shouldBe, charset);
			System.out.println("testGetRealCharsetEncoding: " + input + " -> " + charset + " | Supported: " + Charset.isSupported(charset));
			
		}
		
	}

}
reactivate unittests * fix old tests * add buildtarget "ant test" git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6228 6c8d7289-2bf4-0310-a012-ef5d649a1542 2009-07-17 22:58:21 +02:00			`package de.anomic.document.parser;`
) Better charset encoding detection ) New testclass for charset encoding detection tests git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2808 6c8d7289-2bf4-0310-a012-ef5d649a1542 2006-10-19 09:02:18 +02:00
			`import java.nio.charset.Charset;`

refactoring of yacy documents and parsers: they depend now only on the kelondro classes git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6426 6c8d7289-2bf4-0310-a012-ef5d649a1542 2009-10-18 02:53:43 +02:00			`import net.yacy.document.parser.htmlParser;`

) Better charset encoding detection ) New testclass for charset encoding detection tests git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2808 6c8d7289-2bf4-0310-a012-ef5d649a1542 2006-10-19 09:02:18 +02:00			`import junit.framework.TestCase;`

reactivate unittests * fix old tests * add buildtarget "ant test" git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6228 6c8d7289-2bf4-0310-a012-ef5d649a1542 2009-07-17 22:58:21 +02:00			`public class htmlParserTest extends TestCase {`
) Better charset encoding detection ) New testclass for charset encoding detection tests git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2808 6c8d7289-2bf4-0310-a012-ef5d649a1542 2006-10-19 09:02:18 +02:00
			`public void testGetRealCharsetEncoding() {`
			`String[][] testStrings = new String[][] {`
reactivate unittests * fix old tests * add buildtarget "ant test" git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6228 6c8d7289-2bf4-0310-a012-ef5d649a1542 2009-07-17 22:58:21 +02:00			`new String[]{null,"UTF-8"},`
) Better charset encoding detection ) New testclass for charset encoding detection tests git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2808 6c8d7289-2bf4-0310-a012-ef5d649a1542 2006-10-19 09:02:18 +02:00			`new String[]{"windows1250","windows-1250"},`
			`new String[]{"windows_1250","windows-1250"},`
			`new String[]{"ISO-8859-1","ISO-8859-1"},`
			`new String[]{"ISO8859-1","ISO-8859-1"},`
			`new String[]{"ISO-88591","ISO-8859-1"},`
			`new String[]{"ISO88591","ISO-8859-1"},`
			`new String[]{"iso_8859_1","ISO-8859-1"},`
			`new String[]{"cp-1252","windows-1252"},`
reactivate unittests * fix old tests * add buildtarget "ant test" git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6228 6c8d7289-2bf4-0310-a012-ef5d649a1542 2009-07-17 22:58:21 +02:00			`new String[]{"gb_2312","gb2312"}, // was: x-EUC-CN`
			`new String[]{"gb_2312-80","gb2312"}, // was: x-EUC-CN`
) Better charset encoding detection ) New testclass for charset encoding detection tests git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2808 6c8d7289-2bf4-0310-a012-ef5d649a1542 2006-10-19 09:02:18 +02:00			`new String[]{"UTF-8;","UTF-8"}`
			`};`

			`for (int i=0; i < testStrings.length; i++) {`
			`// desired conversion result`
			`String shouldBe = testStrings[i][1].toLowerCase();`

			`// conversion result`
reactivate unittests * fix old tests * add buildtarget "ant test" git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6228 6c8d7289-2bf4-0310-a012-ef5d649a1542 2009-07-17 22:58:21 +02:00			`String charset = htmlParser.patchCharsetEncoding(testStrings[i][0]).toLowerCase();`
) Better charset encoding detection ) New testclass for charset encoding detection tests git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2808 6c8d7289-2bf4-0310-a012-ef5d649a1542 2006-10-19 09:02:18 +02:00
			`// test if equal`
reactivate unittests * fix old tests * add buildtarget "ant test" git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6228 6c8d7289-2bf4-0310-a012-ef5d649a1542 2009-07-17 22:58:21 +02:00			`assertEquals(shouldBe, charset);`
) Better charset encoding detection ) New testclass for charset encoding detection tests git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2808 6c8d7289-2bf4-0310-a012-ef5d649a1542 2006-10-19 09:02:18 +02:00			`System.out.println("testGetRealCharsetEncoding: " + testStrings[i][0] + " -> " + charset + " \| Supported: " + Charset.isSupported(charset));`

			`}`

			`}`

			`}`