Improved MultiProtocolURL non-ASCII characters support.

Following Pull Request #108 by @sinkuu: added JUnit tests, updated some JavaDoc
and also improved URL tokenization to support non-ASCII characters.
This commit is contained in:
luccioman 2017-02-23 11:09:43 +01:00
parent 18e8b3a220
commit 2f191e0e1c
2 changed files with 145 additions and 27 deletions

View File

@ -37,6 +37,7 @@ import java.io.UnsupportedEncodingException;
import java.net.InetAddress;
import java.net.MalformedURLException;
import java.net.URLDecoder;
import java.nio.charset.StandardCharsets;
import java.util.BitSet;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
@ -184,7 +185,7 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
// identify protocol
url = url.trim();
if (url.startsWith("//")) {
// patch for urls starting with "//" which can be found in the wild
url = "http:" + url;
@ -680,12 +681,20 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
return sbuf;
}
/**
* Decodes a <code>application/x-www-form-urlencoded</code> string using UTF-8 encoding.
*
* @param s the string to decode
* @return the newly decoded string
*/
public static String unescape(final String s) {
try {
return URLDecoder.decode(s, "UTF-8");
} catch (UnsupportedEncodingException e) {
return null; // unreachable
}
try {
return URLDecoder.decode(s, StandardCharsets.UTF_8.name());
} catch (UnsupportedEncodingException e) {
/* This should not happen */
ConcurrentLog.logException(e);
return s;
}
}
private void identPort(final String inputURL, final int dflt) throws MalformedURLException {
@ -887,15 +896,10 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
return ((this.port >= 0) && (this.host != null)) ? this.host + ":" + this.port : ((this.host != null) ? this.host : "");
}
/**
* @return the host part of this URL, Punycode encoded for Internationalized Domain Names
*/
public String getHost() {
/*
if (this.host == null) return null;
if (this.host.length() > 0 && this.host.charAt(0) == '[') {
int p = this.host.indexOf(']');
if (p < 0) return this.host;
return this.host.substring(1, p);
}
*/
return this.host;
}
@ -975,8 +979,9 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
}
/**
* Tokenized url as string (without the protocol)
* @return example "host com path file ext"
* Tokenizes url as string (without the protocol).
* For example "http://host.com/path/file.txt" returns "host com path file txt"
* @return url tokens as one string
*/
public String toTokens() {
return toTokens(unescape(this.urlstub(true,true)));
@ -993,7 +998,7 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
char c;
for (int i = 0; i < s.length(); i++) {
c = s.charAt(i);
if ((c >= '0' && c <='9') || (c >= 'a' && c <='z') || (c >= 'A' && c <='Z')) sb.append(c); else sb.append(' ');
if (Character.isAlphabetic(c) || Character.isDigit(c)) sb.append(c); else sb.append(' ');
}
// split the string into tokens and add all camel-case splitting
@ -1059,8 +1064,8 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
}
private static CharType charType(final char c) {
if (c >= 'a' && c <= 'z') return CharType.low;
if (c >= '0' && c <= '9') return CharType.number;
if (Character.isLowerCase(c)) return CharType.low;
if (Character.isDigit(c)) return CharType.number;
return CharType.high;
}

View File

@ -3,6 +3,7 @@ package net.yacy.cora.document.id;
import static org.junit.Assert.*;
import java.net.MalformedURLException;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
@ -131,7 +132,14 @@ public class MultiProtocolURLTest {
new String[]{"//www.yacy.net:?query=test", "www.yacy.net"},
new String[]{"http://www.yacy.net?data=1/2/3", "www.yacy.net"},
new String[]{"http://www.yacy.net?url=http://test.com", "www.yacy.net"}
new String[]{"http://www.yacy.net?url=http://test.com", "www.yacy.net"},
/* Punycode encoded internationalized domain name : Algeria TLD */
new String[]{"http://xn--ggbdmbaav3cjl1c9heugfv.xn--lgbbat1ad8j/", "xn--ggbdmbaav3cjl1c9heugfv.xn--lgbbat1ad8j"},
/* Internationalized domain name : Algeria TLD */
new String[]{"http://مركزأسماءالنطاقات.الجزائر/", "xn--ggbdmbaav3cjl1c9heugfv.xn--lgbbat1ad8j"},
/* Internationalized domain name : Chinese Ministry of education */
new String[]{"http://教育部.中国/", "xn--wcvs22dzol.xn--fiqs8s"},
/*http://教育部.中国/ */
};
for (int i = 0; i < testStrings.length; i++) {
@ -182,7 +190,9 @@ public class MultiProtocolURLTest {
new String[]{"http://www.heise.de/newsticker/thema/%23saukontrovers", "http://www.heise.de/newsticker/thema/%23saukontrovers"}, // http://mantis.tokeek.de/view.php?id=519
new String[]{"http://www.heise.de/newsticker/thema/#saukontrovers", "http://www.heise.de/newsticker/thema/"}, // anchor fragment
new String[]{"http://www.liferay.com/community/wiki/-/wiki/Main/Wiki+Portlet", "http://www.liferay.com/community/wiki/-/wiki/Main/Wiki+Portlet"}, // http://mantis.tokeek.de/view.php?id=559
new String[]{"http://de.wikipedia.org/wiki/Philippe_Ariès", "http://de.wikipedia.org/wiki/Philippe_Ari%C3%A8s"} // UTF-8 2 byte char
new String[]{"http://de.wikipedia.org/wiki/Philippe_Ariès", "http://de.wikipedia.org/wiki/Philippe_Ari%C3%A8s"}, // UTF-8 2 byte char
new String[] {"https://zh.wikipedia.org/wiki/Wikipedia:方針與指引", "https://zh.wikipedia.org/wiki/Wikipedia:%E6%96%B9%E9%87%9D%E8%88%87%E6%8C%87%E5%BC%95"}, // UTF-8 3 bytes chars
new String[] {"http://教育部.中国/jyb_xwfb/", "http://xn--wcvs22dzol.xn--fiqs8s/jyb_xwfb/"} // Internationalized Domain Name
};
for (String[] testString : testStrings) {
@ -204,8 +214,23 @@ public class MultiProtocolURLTest {
public void testGetAttribute() throws Exception {
// some test url/uri with problems in the past
String[][] testStrings = new String[][]{
// teststring , expectedresult
new String[]{"http://yacy.net?&test", "test"}
// teststring , expectedresultkey, expectedresultvalue
new String[]{"http://yacy.net?&test", "test", ""},
/* Encoded UTF-8 2 bytes characters parameter value */
new String[]{"https://zh.wikipedia.org/w/index.php?search=encodedlatinchars%C3%A0%C3%A4%C3%A2%C3%A9%C3%A8%C3%AF%C3%AE%C3%B4%C3%B6%C3%B9", "search", "encodedlatincharsàäâéèïîôöù"},
/* Non encoded UTF-8 2 bytes characters parameter value */
new String[]{"http://yacy.net?query=unencodedlatincharsàäâéèïîôöù", "query", "unencodedlatincharsàäâéèïîôöù"},
/* Encoded UTF-8 3 bytes characters parameter value */
new String[]{"https://zh.wikipedia.org/w/index.php?query=%E6%96%B9%E9%87%9D%E8%88%87%E6%8C%87%E5%BC%95", "query", "方針與指引"},
/* Non encoded UTF-8 3 bytes characters parameter value */
new String[]{"https://zh.wikipedia.org/w/index.php?query=方針與指引", "query", "方針與指引"},
/* Non encoded rfc3986 unreserved ascii chars parameter value */
new String[]{"https://example.net?query=-.~_", "query", "-.~_"},
/* Encoded rfc3986 reserved ascii chars parameter value */
new String[]{"https://example.net?query=%3A%2F%3F%23%40%24%26%2B%2C%3B%3D", "query", ":/?#@$&+,;="},
/* Non-Encoded rfc3986 reserved ascii chars parameter value
* (some reserved characters have a meaning here and can not be passed as non-encoded without breaking the parameter value : #, &, +) */
new String[]{"https://example.net?query=:/?[]@!$'()*,;=", "query", ":/?[]@!$'()*,;="},
};
for (String[] testString : testStrings) {
@ -214,10 +239,10 @@ public class MultiProtocolURLTest {
String shouldBe = testString[1];
MultiProtocolURL resultUrl = new MultiProtocolURL(testString[0]);
System.out.println(" -> " + resultUrl.toNormalform(false));
Map<String, String> attr = resultUrl.getAttributes();
assertEquals("", attr.get(shouldBe));
System.out.println(" -> " + resultUrl.toNormalform(false));
assertEquals(testString[2], attr.get(shouldBe));
}
}
@ -242,10 +267,10 @@ public class MultiProtocolURLTest {
}
/**
* Test of toTokens method, of class MultiProtocolURL.
* Test of toTokens static method, of class MultiProtocolURL.
*/
@Test
public void testToTokens() {
public void testStaticToTokens() {
// test string pairs which should generate equal results
String[][] testString = new String[][]{
{"abc", "abc "},
@ -261,6 +286,94 @@ public class MultiProtocolURLTest {
assertEquals("input: "+s[0]+"="+s[1],result1, result2);
}
}
/**
 * Checks {@link MultiProtocolURL#toTokens()} against a table of URLs,
 * each paired with the token string it is expected to produce.
 * @throws MalformedURLException when one of the test URL strings can not be parsed
 */
@Test
public void testToTokens() throws MalformedURLException {
    // each row : { url to tokenize, expected tokens }
    final String[][] cases = {
        {"http://yacy.net?&test", "yacy net test"},
        {"http://example.net/camelCased/subpath1/PATH_EXAMPLE", "example net camelCased subpath1 PATH EXAMPLE camel Cased subpath 1"},
        /* parameter value : percent-encoded UTF-8 2 bytes characters */
        {"https://zh.wikipedia.org/w/index.php?search=encodedlatinchars%C3%A0%C3%A4%C3%A2%C3%A9%C3%A8%C3%AF%C3%AE%C3%B4%C3%B6%C3%B9", "zh wikipedia org w index php search encodedlatincharsàäâéèïîôöù"},
        /* parameter value : raw (non encoded) UTF-8 2 bytes characters */
        {"http://yacy.net?query=unencodedlatincharsàäâéèïîôöù", "yacy net query unencodedlatincharsàäâéèïîôöù"},
        /* parameter value : percent-encoded UTF-8 3 bytes characters */
        {"https://zh.wikipedia.org/w/index.php?query=%E6%96%B9%E9%87%9D%E8%88%87%E6%8C%87%E5%BC%95", "zh wikipedia org w index php query 方針與指引"},
        /* parameter value : raw (non encoded) UTF-8 3 bytes characters */
        {"https://zh.wikipedia.org/w/index.php?query=方針與指引", "zh wikipedia org w index php query 方針與指引"},
        /* parameter value : raw rfc3986 unreserved ascii chars */
        {"https://example.net?query=-.~_", "example net query"},
        /* parameter value : percent-encoded rfc3986 reserved ascii chars */
        {"https://example.net?query=%3A%2F%3F%23%40%24%26%2B%2C%3B%3D", "example net query"},
        /* parameter value : raw rfc3986 reserved ascii chars
         * (#, & and + are left out : they carry a meaning here and would break the parameter value when not encoded) */
        {"https://example.net?query=:/?[]@!$'()*,;=", "example net query"}
    };

    // the row index is only used to identify the failing row in the assertion message
    int index = 0;
    for (final String[] testCase : cases) {
        final String actualTokens = new MultiProtocolURL(testCase[0]).toTokens();
        assertEquals("Test toTokens : " + index, testCase[1], actualTokens);
        index++;
    }
}
/**
 * Round-trip checks for {@link MultiProtocolURL#escape(String)} : every escaped
 * sample must be encodable as US-ASCII, and passing it through
 * {@link MultiProtocolURL#unescape(String)} must give back the original sample.
 */
@Test
public void testEscape() {
    final String[] samples = new String[] {
        "",
        "asciiString",
        "latin chars:àäâéèïîôöù",
        "logograms:正體字/繁體字",
        "with spaces and\ttab",
        "rfc3986 unreserved ascii chars:-.~_",
        "rfc3986 reserved ascii chars::/?#[]@!$&'()*+,;=",
        "http://simpleurl.com/",
        "http://urlwithqueryandanchor.net/path?q=asciiquery&p1=param1&p2=pâräm2&p3=简化字#anchor"
    };

    for (int i = 0; i < samples.length; i++) {
        final String original = samples[i];
        final String escaped = MultiProtocolURL.escape(original).toString();

        // a fresh encoder per sample, as canEncode() works on encoder state
        final boolean asciiOnly = StandardCharsets.US_ASCII.newEncoder().canEncode(escaped);
        assertTrue("Encoded string contains only ascii chars", asciiOnly);

        final String roundTripped = MultiProtocolURL.unescape(escaped);
        assertEquals("escape/unescape consistency", original, roundTripped);
    }
}
/**
 * Checks {@link MultiProtocolURL#unescape(String)} against a table of input
 * strings, each paired with the decoded string it is expected to return.
 */
@Test
public void testUnescape() {
    // each row : { string to unescape, expected unescaped result }
    final String[][] cases = {
        { "", "" },
        { "asciiString", "asciiString" },
        { "encoded latinchars : %C3%A0%C3%A4%C3%A2%C3%A9%C3%A8%C3%AF%C3%AE%C3%B4%C3%B6%C3%B9",
          "encoded latinchars : àäâéèïîôöù" },
        { "unencoded latin chars : àäâéèïîôöù", "unencoded latin chars : àäâéèïîôöù" },
        { "encoded logograms : %E6%96%B9%E9%87%9D%E8%88%87%E6%8C%87%E5%BC%95",
          "encoded logograms : 方針與指引" },
        { "unencoded logograms : 方針與指引", "unencoded logograms : 方針與指引" },
        { "with spaces and\ttab", "with spaces and\ttab" },
        { "unencoded rfc3986 unreserved ascii chars:-.~_",
          "unencoded rfc3986 unreserved ascii chars:-.~_" },
        { "http://simpleurl.com/", "http://simpleurl.com/" },
        { "http://url-with-unencoded-query-and-anchor.net/path?q=asciiquery&p1=param1&p2=pâräm2&p3=简化字#anchor",
          "http://url-with-unencoded-query-and-anchor.net/path?q=asciiquery&p1=param1&p2=pâräm2&p3=简化字#anchor" }
    };

    for (final String[] testCase : cases) {
        final String expected = testCase[1];
        final String actual = MultiProtocolURL.unescape(testCase[0]);
        assertEquals(expected, actual);
    }
}
}