final private static Charset UTF8 = Charset.forName("UTF-8");\r
final private static Charset ASCII = Charset.forName("US-ASCII");\r
\r
+ /*
+  * Lookup table indexed by US-ASCII code (0-127). An entry is true when the
+  * character may be copied to the escaped output unchanged, and false when
+  * it has to be percent-escaped.
+  *
+  * Pass-through characters: A-Z, a-z, 0-9 and
+  * ; ? : @ = + $ . , - _ ! ~ * ' ( ) [ ]
+  *
+  * For reference, RFC 3986 section 2.2 Reserved Characters (January 2005):
+  * !*'();:@&=+$,/?#[]
+  * Of that reserved set, '&', '/' and '#' are not marked here, so they are
+  * escaped.
+  */
+ private static final boolean[] UNESCAPED_US_ASCII_CHARS = new boolean[128];
+
+ static {
+     for (char ch = 'A'; ch <= 'Z'; ++ch) {
+         UNESCAPED_US_ASCII_CHARS[ch] = true;
+     }
+     for (char ch = 'a'; ch <= 'z'; ++ch) {
+         UNESCAPED_US_ASCII_CHARS[ch] = true;
+     }
+     for (char ch = '0'; ch <= '9'; ++ch) {
+         UNESCAPED_US_ASCII_CHARS[ch] = true;
+     }
+     // Punctuation that is passed through unescaped.
+     for (char ch : ";?:@=+$.,-_!~*'()[]".toCharArray()) {
+         UNESCAPED_US_ASCII_CHARS[ch] = true;
+     }
+ }
+\r
+ /**
+  * Tells whether the given string contains at least one character that
+  * cannot be emitted as-is: any character outside US-ASCII (code >= 128) or
+  * any US-ASCII character not flagged in UNESCAPED_US_ASCII_CHARS.
+  *
+  * @param unicode the string to inspect
+  * @return true as soon as one such character is found; false if every
+  *         character may pass through unescaped (also for the empty string)
+  */
+ private static boolean needsEscaping(String unicode) {
+     for (int pos = 0, end = unicode.length(); pos < end; ++pos) {
+         char current = unicode.charAt(pos);
+         // The range check comes first so the table is never indexed out of bounds.
+         boolean passThrough = current < 128 && UNESCAPED_US_ASCII_CHARS[current];
+         if (!passThrough) {
+             return true;
+         }
+     }
+     return false;
+ }
+\r
/* Copied and modified from Jena 2.4 com.hp.hpl.jena.util.URIref */\r
private static String encode(String unicode) {\r
boolean needsEscapes = needsEscaping(unicode);\r
int in = 0;\r
int out = 0;\r
while (in < utf8.length) {\r
- switch (utf8[in]) {\r
- case (byte)'a': case (byte)'b': case (byte)'c': case (byte)'d': case (byte)'e': case (byte)'f': case (byte)'g': case (byte)'h': case (byte)'i': case (byte)'j': case (byte)'k': case (byte)'l': case (byte)'m': case (byte)'n': case (byte)'o': case (byte)'p': case (byte)'q': case (byte)'r': case (byte)'s': case (byte)'t': case (byte)'u': case (byte)'v': case (byte)'w': case (byte)'x': case (byte)'y': case (byte)'z':\r
- case (byte)'A': case (byte)'B': case (byte)'C': case (byte)'D': case (byte)'E': case (byte)'F': case (byte)'G': case (byte)'H': case (byte)'I': case (byte)'J': case (byte)'K': case (byte)'L': case (byte)'M': case (byte)'N': case (byte)'O': case (byte)'P': case (byte)'Q': case (byte)'R': case (byte)'S': case (byte)'T': case (byte)'U': case (byte)'V': case (byte)'W': case (byte)'X': case (byte)'Y': case (byte)'Z':\r
- case (byte)'0': case (byte)'1': case (byte)'2': case (byte)'3': case (byte)'4': case (byte)'5': case (byte)'6': case (byte)'7': case (byte)'8': case (byte)'9':\r
- case (byte)';': case (byte)'?': case (byte)':': case (byte)'@': case (byte)'=': case (byte)'+': case (byte)'$': case (byte)',':\r
- case (byte)'-': case (byte)'_': case (byte)'.': case (byte)'!': case (byte)'~': case (byte)'*': case (byte)'\'': case (byte)'(': case (byte)')':\r
- case (byte)'[': case (byte)']':\r
- rsltAscii[out] = utf8[in];\r
- out++;\r
- in++;\r
- break;\r
+ byte inCh = utf8[in];\r
+ if (inCh >= 0 && inCh < 128 && UNESCAPED_US_ASCII_CHARS[inCh]) {\r
+ rsltAscii[out] = inCh;\r
+ out++;\r
+ in++;\r
+ } else {\r
+ switch (inCh) {\r
case (byte)' ':\r
rsltAscii[out++] = (byte) '%';\r
rsltAscii[out++] = '2';\r
case (byte) '%':\r
// [lehtonen] NOTE: all input needs to be escaped, i.e. "%01" should result in "%2501", not "%01".\r
// escape+unescape is a bijection, not an idempotent operation. \r
- // Fall through to to escape '%' as '%25'\r
+ // Fall through to escape '%' as '%25'\r
case (byte) '#':\r
case (byte) '/':\r
// Fall through to escape '/'\r
default:\r
rsltAscii[out++] = (byte) '%';\r
// Get rid of sign ...\r
- int c = (utf8[in]) & 255;\r
+ int c = (inCh) & 255;\r
rsltAscii[out++] = hexEncode(c / 16);\r
rsltAscii[out++] = hexEncode(c % 16);\r
in++;\r
break;\r
+ }\r
}\r
}\r
return new String(rsltAscii, 0, out, ASCII);\r
}\r
\r
- /*\r
- * RFC 3986 section 2.2 Reserved Characters (January 2005)\r
- * !*'();:@&=+$,/?#[]\r
- */\r
- private static boolean needsEscaping(String unicode) {\r
- int len = unicode.length();\r
- for (int i = 0; i < len; ++i) {\r
- switch (unicode.charAt(i)) {\r
- case (byte)'!':\r
- case (byte)'*':\r
- case (byte)'\'':\r
- case (byte)'(':\r
- case (byte)')':\r
- case (byte)';':\r
- case (byte)':':\r
- case (byte)'@':\r
- case (byte)'=': \r
- case (byte)'+':\r
- case (byte)'$':\r
- case (byte)',':\r
- case (byte)'?':\r
- case (byte)'~':\r
- case (byte)'[':\r
- case (byte)']':\r
- break;\r
- case (byte)' ':\r
- case (byte) '#':\r
- case (byte) '%':\r
- case (byte) '/':\r
- case (byte)'&':\r
- return true;\r
- }\r
- }\r
- return false;\r
- }\r
-\r
private static boolean needsUnescaping(String unicode) {\r
return unicode.indexOf('%') > -1;\r
}\r
testEscape("%", "%25");\r
testEscape("%01", "%2501");\r
testEscape("%GG", "%25GG");\r
+ testEscape("säätö venttiili", "s%C3%A4%C3%A4t%C3%B6%20venttiili");\r
+ testEscape("säätö", "s%C3%A4%C3%A4t%C3%B6");\r
}\r
\r
private static void testEscape(String unescaped, String expectedEscaped) {\r