mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
Improved MultiProtocolURL support for non-ASCII characters.
Following @sinkuu's pull request #108: added JUnit tests, updated some Javadoc and improved URL tokenization to support non-ASCII characters.
This commit is contained in:
parent
18e8b3a220
commit
2f191e0e1c
|
@ -37,6 +37,7 @@ import java.io.UnsupportedEncodingException;
|
|||
import java.net.InetAddress;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URLDecoder;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.BitSet;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.LinkedHashSet;
|
||||
|
@ -184,7 +185,7 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
|
|||
|
||||
// identify protocol
|
||||
url = url.trim();
|
||||
|
||||
|
||||
if (url.startsWith("//")) {
|
||||
// patch for urls starting with "//" which can be found in the wild
|
||||
url = "http:" + url;
|
||||
|
@ -680,12 +681,20 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
|
|||
return sbuf;
|
||||
}
|
||||
|
||||
/**
|
||||
* Decodes a <code>application/x-www-form-urlencoded</code> string using UTF-8 encoding.
|
||||
*
|
||||
* @param s the string to decode
|
||||
* @return the newly decoded string
|
||||
*/
|
||||
public static String unescape(final String s) {
|
||||
try {
|
||||
return URLDecoder.decode(s, "UTF-8");
|
||||
} catch (UnsupportedEncodingException e) {
|
||||
return null; // unreachable
|
||||
}
|
||||
try {
|
||||
return URLDecoder.decode(s, StandardCharsets.UTF_8.name());
|
||||
} catch (UnsupportedEncodingException e) {
|
||||
/* This should not happen */
|
||||
ConcurrentLog.logException(e);
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
||||
private void identPort(final String inputURL, final int dflt) throws MalformedURLException {
|
||||
|
@ -887,15 +896,10 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
|
|||
return ((this.port >= 0) && (this.host != null)) ? this.host + ":" + this.port : ((this.host != null) ? this.host : "");
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the host part of this URL, Punycode encoded for Internationalized Domain Names
|
||||
*/
|
||||
public String getHost() {
|
||||
/*
|
||||
if (this.host == null) return null;
|
||||
if (this.host.length() > 0 && this.host.charAt(0) == '[') {
|
||||
int p = this.host.indexOf(']');
|
||||
if (p < 0) return this.host;
|
||||
return this.host.substring(1, p);
|
||||
}
|
||||
*/
|
||||
return this.host;
|
||||
}
|
||||
|
||||
|
@ -975,8 +979,9 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
|
|||
}
|
||||
|
||||
/**
|
||||
* Tokenized url as string (without the protocol)
|
||||
* @return example "host com path file ext"
|
||||
* Tokenizes url as string (without the protocol).
|
||||
* For example "http://host.com/path/file.txt" returns "host com path file txt"
|
||||
* @return url tokens as one string
|
||||
*/
|
||||
public String toTokens() {
|
||||
return toTokens(unescape(this.urlstub(true,true)));
|
||||
|
@ -993,7 +998,7 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
|
|||
char c;
|
||||
for (int i = 0; i < s.length(); i++) {
|
||||
c = s.charAt(i);
|
||||
if ((c >= '0' && c <='9') || (c >= 'a' && c <='z') || (c >= 'A' && c <='Z')) sb.append(c); else sb.append(' ');
|
||||
if (Character.isAlphabetic(c) || Character.isDigit(c)) sb.append(c); else sb.append(' ');
|
||||
}
|
||||
|
||||
// split the string into tokens and add all camel-case splitting
|
||||
|
@ -1059,8 +1064,8 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
|
|||
}
|
||||
|
||||
private static CharType charType(final char c) {
|
||||
if (c >= 'a' && c <= 'z') return CharType.low;
|
||||
if (c >= '0' && c <= '9') return CharType.number;
|
||||
if (Character.isLowerCase(c)) return CharType.low;
|
||||
if (Character.isDigit(c)) return CharType.number;
|
||||
return CharType.high;
|
||||
}
|
||||
|
||||
|
|
|
@ -3,6 +3,7 @@ package net.yacy.cora.document.id;
|
|||
import static org.junit.Assert.*;
|
||||
|
||||
import java.net.MalformedURLException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
|
@ -131,7 +132,14 @@ public class MultiProtocolURLTest {
|
|||
new String[]{"//www.yacy.net:?query=test", "www.yacy.net"},
|
||||
|
||||
new String[]{"http://www.yacy.net?data=1/2/3", "www.yacy.net"},
|
||||
new String[]{"http://www.yacy.net?url=http://test.com", "www.yacy.net"}
|
||||
new String[]{"http://www.yacy.net?url=http://test.com", "www.yacy.net"},
|
||||
/* Punycode encoded internationalized domain name : Algeria TLD */
|
||||
new String[]{"http://xn--ggbdmbaav3cjl1c9heugfv.xn--lgbbat1ad8j/", "xn--ggbdmbaav3cjl1c9heugfv.xn--lgbbat1ad8j"},
|
||||
/* Internationalized domain name : Algeria TLD */
|
||||
new String[]{"http://مركزأسماءالنطاقات.الجزائر/", "xn--ggbdmbaav3cjl1c9heugfv.xn--lgbbat1ad8j"},
|
||||
/* Internationalized domain name : Chinese Ministry of education */
|
||||
new String[]{"http://教育部.中国/", "xn--wcvs22dzol.xn--fiqs8s"},
|
||||
/*http://教育部.中国/ */
|
||||
};
|
||||
|
||||
for (int i = 0; i < testStrings.length; i++) {
|
||||
|
@ -182,7 +190,9 @@ public class MultiProtocolURLTest {
|
|||
new String[]{"http://www.heise.de/newsticker/thema/%23saukontrovers", "http://www.heise.de/newsticker/thema/%23saukontrovers"}, // http://mantis.tokeek.de/view.php?id=519
|
||||
new String[]{"http://www.heise.de/newsticker/thema/#saukontrovers", "http://www.heise.de/newsticker/thema/"}, // anchor fragment
|
||||
new String[]{"http://www.liferay.com/community/wiki/-/wiki/Main/Wiki+Portlet", "http://www.liferay.com/community/wiki/-/wiki/Main/Wiki+Portlet"}, // http://mantis.tokeek.de/view.php?id=559
|
||||
new String[]{"http://de.wikipedia.org/wiki/Philippe_Ariès", "http://de.wikipedia.org/wiki/Philippe_Ari%C3%A8s"} // UTF-8 2 byte char
|
||||
new String[]{"http://de.wikipedia.org/wiki/Philippe_Ariès", "http://de.wikipedia.org/wiki/Philippe_Ari%C3%A8s"}, // UTF-8 2 byte char
|
||||
new String[] {"https://zh.wikipedia.org/wiki/Wikipedia:方針與指引", "https://zh.wikipedia.org/wiki/Wikipedia:%E6%96%B9%E9%87%9D%E8%88%87%E6%8C%87%E5%BC%95"}, // UTF-8 3 bytes chars
|
||||
new String[] {"http://教育部.中国/jyb_xwfb/", "http://xn--wcvs22dzol.xn--fiqs8s/jyb_xwfb/"} // Internationalized Domain Name
|
||||
};
|
||||
|
||||
for (String[] testString : testStrings) {
|
||||
|
@ -204,8 +214,23 @@ public class MultiProtocolURLTest {
|
|||
public void testGetAttribute() throws Exception {
|
||||
// some test url/uri with problems in the past
|
||||
String[][] testStrings = new String[][]{
|
||||
// teststring , expectedresult
|
||||
new String[]{"http://yacy.net?&test", "test"}
|
||||
// teststring , expectedresultkey, expectedresultvalue
|
||||
new String[]{"http://yacy.net?&test", "test", ""},
|
||||
/* Encoded UTF-8 2 bytes characters parameter value */
|
||||
new String[]{"https://zh.wikipedia.org/w/index.php?search=encodedlatinchars%C3%A0%C3%A4%C3%A2%C3%A9%C3%A8%C3%AF%C3%AE%C3%B4%C3%B6%C3%B9", "search", "encodedlatincharsàäâéèïîôöù"},
|
||||
/* Non encoded UTF-8 2 bytes characters parameter value */
|
||||
new String[]{"http://yacy.net?query=unencodedlatincharsàäâéèïîôöù", "query", "unencodedlatincharsàäâéèïîôöù"},
|
||||
/* Encoded UTF-8 3 bytes characters parameter value */
|
||||
new String[]{"https://zh.wikipedia.org/w/index.php?query=%E6%96%B9%E9%87%9D%E8%88%87%E6%8C%87%E5%BC%95", "query", "方針與指引"},
|
||||
/* Non encoded UTF-8 3 bytes characters parameter value */
|
||||
new String[]{"https://zh.wikipedia.org/w/index.php?query=方針與指引", "query", "方針與指引"},
|
||||
/* Non encoded rfc3986 unreserved ascii chars parameter value */
|
||||
new String[]{"https://example.net?query=-.~_", "query", "-.~_"},
|
||||
/* Encoded rfc3986 reserved ascii chars parameter value */
|
||||
new String[]{"https://example.net?query=%3A%2F%3F%23%40%24%26%2B%2C%3B%3D", "query", ":/?#@$&+,;="},
|
||||
/* Non-Encoded rfc3986 reserved ascii chars parameter value
|
||||
* (some reserved characters have a meaning here and can not be passed as non-encoded without breaking the parameter value : #, &, +) */
|
||||
new String[]{"https://example.net?query=:/?[]@!$'()*,;=", "query", ":/?[]@!$'()*,;="},
|
||||
};
|
||||
|
||||
for (String[] testString : testStrings) {
|
||||
|
@ -214,10 +239,10 @@ public class MultiProtocolURLTest {
|
|||
String shouldBe = testString[1];
|
||||
|
||||
MultiProtocolURL resultUrl = new MultiProtocolURL(testString[0]);
|
||||
System.out.println(" -> " + resultUrl.toNormalform(false));
|
||||
Map<String, String> attr = resultUrl.getAttributes();
|
||||
|
||||
assertEquals("", attr.get(shouldBe));
|
||||
System.out.println(" -> " + resultUrl.toNormalform(false));
|
||||
assertEquals(testString[2], attr.get(shouldBe));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -242,10 +267,10 @@ public class MultiProtocolURLTest {
|
|||
}
|
||||
|
||||
/**
|
||||
* Test of toTokens method, of class MultiProtocolURL.
|
||||
* Test of toTokens static method, of class MultiProtocolURL.
|
||||
*/
|
||||
@Test
|
||||
public void testToTokens() {
|
||||
public void testStaticToTokens() {
|
||||
// test string pairs which should generate equal results
|
||||
String[][] testString = new String[][]{
|
||||
{"abc", "abc "},
|
||||
|
@ -261,6 +286,94 @@ public class MultiProtocolURLTest {
|
|||
assertEquals("input: "+s[0]+"="+s[1],result1, result2);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Unit tests for {@link MultiProtocolURL#toTokens()}
|
||||
* @throws MalformedURLException when
|
||||
*/
|
||||
@Test
|
||||
public void testToTokens() throws MalformedURLException {
|
||||
String[][] testStrings = new String[][]{
|
||||
// test string , "expected tokens"
|
||||
new String[]{"http://yacy.net?&test", "yacy net test"},
|
||||
new String[]{"http://example.net/camelCased/subpath1/PATH_EXAMPLE", "example net camelCased subpath1 PATH EXAMPLE camel Cased subpath 1"},
|
||||
/* Encoded UTF-8 2 bytes characters parameter value */
|
||||
new String[]{"https://zh.wikipedia.org/w/index.php?search=encodedlatinchars%C3%A0%C3%A4%C3%A2%C3%A9%C3%A8%C3%AF%C3%AE%C3%B4%C3%B6%C3%B9", "zh wikipedia org w index php search encodedlatincharsàäâéèïîôöù"},
|
||||
/* Non encoded UTF-8 2 bytes characters parameter value */
|
||||
new String[]{"http://yacy.net?query=unencodedlatincharsàäâéèïîôöù", "yacy net query unencodedlatincharsàäâéèïîôöù"},
|
||||
/* Encoded UTF-8 3 bytes characters parameter value */
|
||||
new String[]{"https://zh.wikipedia.org/w/index.php?query=%E6%96%B9%E9%87%9D%E8%88%87%E6%8C%87%E5%BC%95", "zh wikipedia org w index php query 方針與指引"},
|
||||
/* Non encoded UTF-8 3 bytes characters parameter value */
|
||||
new String[]{"https://zh.wikipedia.org/w/index.php?query=方針與指引", "zh wikipedia org w index php query 方針與指引"},
|
||||
/* Non encoded rfc3986 unreserved ascii chars parameter value */
|
||||
new String[]{"https://example.net?query=-.~_", "example net query"},
|
||||
/* Encoded rfc3986 reserved ascii chars parameter value */
|
||||
new String[]{"https://example.net?query=%3A%2F%3F%23%40%24%26%2B%2C%3B%3D", "example net query"},
|
||||
/* Non-Encoded rfc3986 reserved ascii chars parameter value
|
||||
* (some reserved characters have a meaning here and can not be passed as non-encoded without breaking the parameter value : #, &, +) */
|
||||
new String[]{"https://example.net?query=:/?[]@!$'()*,;=", "example net query"}
|
||||
};
|
||||
|
||||
for (int i = 0; i < testStrings.length; i++) {
|
||||
String[] testString = testStrings[i];
|
||||
MultiProtocolURL resultUrl = new MultiProtocolURL(testString[0]);
|
||||
String tokens = resultUrl.toTokens();
|
||||
assertEquals("Test toTokens : " + i, testString[1], tokens);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Unit tests for {@link MultiProtocolURL#escape(String)}
|
||||
*/
|
||||
@Test
|
||||
public void testEscape() {
|
||||
String[] testStrings = {
|
||||
"",
|
||||
"asciiString",
|
||||
"latin chars:àäâéèïîôöù",
|
||||
"logograms:正體字/繁體字",
|
||||
"with spaces and\ttab",
|
||||
"rfc3986 unreserved ascii chars:-.~_",
|
||||
"rfc3986 reserved ascii chars::/?#[]@!$&'()*+,;=",
|
||||
"http://simpleurl.com/",
|
||||
"http://urlwithqueryandanchor.net/path?q=asciiquery&p1=param1&p2=pâräm2&p3=简化字#anchor" };
|
||||
for (String testString : testStrings) {
|
||||
String encoded = MultiProtocolURL.escape(testString).toString();
|
||||
assertTrue("Encoded string contains only ascii chars",
|
||||
StandardCharsets.US_ASCII.newEncoder().canEncode(encoded));
|
||||
assertEquals("escape/unescape consistency", testString,
|
||||
MultiProtocolURL.unescape(encoded));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Unit tests for {@link MultiProtocolURL#unescape(String)}
|
||||
*/
|
||||
@Test
|
||||
public void testUnescape() {
|
||||
String[][] testStrings = new String[][] {
|
||||
// test string , "expected unencoded result"
|
||||
new String[] { "", "" }, new String[] { "asciiString", "asciiString" },
|
||||
new String[] { "encoded latinchars : %C3%A0%C3%A4%C3%A2%C3%A9%C3%A8%C3%AF%C3%AE%C3%B4%C3%B6%C3%B9",
|
||||
"encoded latinchars : àäâéèïîôöù" },
|
||||
new String[] { "unencoded latin chars : àäâéèïîôöù", "unencoded latin chars : àäâéèïîôöù" },
|
||||
new String[] { "encoded logograms : %E6%96%B9%E9%87%9D%E8%88%87%E6%8C%87%E5%BC%95",
|
||||
"encoded logograms : 方針與指引" },
|
||||
new String[] { "unencoded logograms : 方針與指引", "unencoded logograms : 方針與指引" },
|
||||
new String[] { "with spaces and\ttab", "with spaces and\ttab" },
|
||||
new String[] { "unencoded rfc3986 unreserved ascii chars:-.~_",
|
||||
"unencoded rfc3986 unreserved ascii chars:-.~_" },
|
||||
new String[] { "http://simpleurl.com/", "http://simpleurl.com/" },
|
||||
new String[] {
|
||||
"http://url-with-unencoded-query-and-anchor.net/path?q=asciiquery&p1=param1&p2=pâräm2&p3=简化字#anchor",
|
||||
"http://url-with-unencoded-query-and-anchor.net/path?q=asciiquery&p1=param1&p2=pâräm2&p3=简化字#anchor" }, };
|
||||
for (int i = 0; i < testStrings.length; i++) {
|
||||
String[] testString = testStrings[i];
|
||||
String unescaped = MultiProtocolURL.unescape(testString[0]);
|
||||
assertEquals(testString[1], unescaped);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user