From: Tuukka Lehtonen Date: Mon, 5 Sep 2016 12:37:50 +0000 (+0300) Subject: Fix URIStringUtils.needsEscaping to escape chars outside 7-bit ASCII. X-Git-Tag: v1.25.0~139 X-Git-Url: https://gerrit.simantics.org/r/gitweb?a=commitdiff_plain;h=refs%2Fchanges%2F49%2F49%2F3;p=simantics%2Fplatform.git Fix URIStringUtils.needsEscaping to escape chars outside 7-bit ASCII. refs #6673 Change-Id: I445d800182d3ecb2d2d7248036fb3b8dfb102e81 --- diff --git a/bundles/org.simantics.databoard/src/org/simantics/databoard/util/URIStringUtils.java b/bundles/org.simantics.databoard/src/org/simantics/databoard/util/URIStringUtils.java index dde498a2c..1cd658ba9 100644 --- a/bundles/org.simantics.databoard/src/org/simantics/databoard/util/URIStringUtils.java +++ b/bundles/org.simantics.databoard/src/org/simantics/databoard/util/URIStringUtils.java @@ -335,6 +335,50 @@ public final class URIStringUtils { final private static Charset UTF8 = Charset.forName("UTF-8"); final private static Charset ASCII = Charset.forName("US-ASCII"); + /* + * RFC 3986 section 2.2 Reserved Characters (January 2005) + * !*'();:@&=+$,/?#[] + */ + private static boolean[] UNESCAPED_US_ASCII_CHARS = new boolean[128]; + + static { + for(char ch='A';ch <= 'Z';++ch) + UNESCAPED_US_ASCII_CHARS[ch] = true; + for(char ch='a';ch <= 'z';++ch) + UNESCAPED_US_ASCII_CHARS[ch] = true; + for(char ch='0';ch <= '9';++ch) + UNESCAPED_US_ASCII_CHARS[ch] = true; + UNESCAPED_US_ASCII_CHARS[';'] = true; + UNESCAPED_US_ASCII_CHARS['?'] = true; + UNESCAPED_US_ASCII_CHARS[':'] = true; + UNESCAPED_US_ASCII_CHARS['@'] = true; + UNESCAPED_US_ASCII_CHARS['='] = true; + UNESCAPED_US_ASCII_CHARS['+'] = true; + UNESCAPED_US_ASCII_CHARS['$'] = true; + UNESCAPED_US_ASCII_CHARS['.'] = true; + UNESCAPED_US_ASCII_CHARS[','] = true; + UNESCAPED_US_ASCII_CHARS['-'] = true; + UNESCAPED_US_ASCII_CHARS['_'] = true; + UNESCAPED_US_ASCII_CHARS['!'] = true; + UNESCAPED_US_ASCII_CHARS['~'] = true; + UNESCAPED_US_ASCII_CHARS['*'] = true; + UNESCAPED_US_ASCII_CHARS['\''] = true; + UNESCAPED_US_ASCII_CHARS['('] = true; + UNESCAPED_US_ASCII_CHARS[')'] = true; + UNESCAPED_US_ASCII_CHARS['['] = true; + UNESCAPED_US_ASCII_CHARS[']'] = true; + } + + private static boolean needsEscaping(String unicode) { + int len = unicode.length(); + for (int i = 0; i < len; ++i) { + char ch = unicode.charAt(i); + if (ch >= 128 || !UNESCAPED_US_ASCII_CHARS[ch]) + return true; + } + return false; + } + /* Copied and modified from Jena 2.4 com.hp.hpl.jena.util.URIref */ private static String encode(String unicode) { boolean needsEscapes = needsEscaping(unicode); @@ -346,17 +390,13 @@ public final class URIStringUtils { int in = 0; int out = 0; while (in < utf8.length) { - switch (utf8[in]) { - case (byte)'a': case (byte)'b': case (byte)'c': case (byte)'d': case (byte)'e': case (byte)'f': case (byte)'g': case (byte)'h': case (byte)'i': case (byte)'j': case (byte)'k': case (byte)'l': case (byte)'m': case (byte)'n': case (byte)'o': case (byte)'p': case (byte)'q': case (byte)'r': case (byte)'s': case (byte)'t': case (byte)'u': case (byte)'v': case (byte)'w': case (byte)'x': case (byte)'y': case (byte)'z': - case (byte)'A': case (byte)'B': case (byte)'C': case (byte)'D': case (byte)'E': case (byte)'F': case (byte)'G': case (byte)'H': case (byte)'I': case (byte)'J': case (byte)'K': case (byte)'L': case (byte)'M': case (byte)'N': case (byte)'O': case (byte)'P': case (byte)'Q': case (byte)'R': case (byte)'S': case (byte)'T': case (byte)'U': case (byte)'V': case (byte)'W': case (byte)'X': case (byte)'Y': case (byte)'Z': - case (byte)'0': case (byte)'1': case (byte)'2': case (byte)'3': case (byte)'4': case (byte)'5': case (byte)'6': case (byte)'7': case (byte)'8': case (byte)'9': - case (byte)';': case (byte)'?': case (byte)':': case (byte)'@': case (byte)'=': case (byte)'+': case (byte)'$': case (byte)',': - case (byte)'-': case (byte)'_': case (byte)'.': case (byte)'!': case (byte)'~': case (byte)'*': case (byte)'\'': case (byte)'(': case (byte)')': - case (byte)'[': case (byte)']': - rsltAscii[out] = utf8[in]; - out++; - in++; - break; + byte inCh = utf8[in]; + if (inCh >= 0 && inCh < 128 && UNESCAPED_US_ASCII_CHARS[inCh]) { + rsltAscii[out] = inCh; + out++; + in++; + } else { + switch (inCh) { case (byte)' ': rsltAscii[out++] = (byte) '%'; rsltAscii[out++] = '2'; @@ -366,7 +406,7 @@ public final class URIStringUtils { case (byte) '%': // [lehtonen] NOTE: all input needs to be escaped, i.e. "%01" should result in "%2501", not "%01". // escape+unescape is a bijection, not an idempotent operation. - // Fall through to to escape '%' as '%25' + // Fall through to escape '%' as '%25' case (byte) '#': case (byte) '/': // Fall through to escape '/' @@ -376,52 +416,17 @@ public final class URIStringUtils { default: rsltAscii[out++] = (byte) '%'; // Get rid of sign ... - int c = (utf8[in]) & 255; + int c = (inCh) & 255; rsltAscii[out++] = hexEncode(c / 16); rsltAscii[out++] = hexEncode(c % 16); in++; break; + } } } return new String(rsltAscii, 0, out, ASCII); } - /* - * RFC 3986 section 2.2 Reserved Characters (January 2005) - * !*'();:@&=+$,/?#[] - */ - private static boolean needsEscaping(String unicode) { - int len = unicode.length(); - for (int i = 0; i < len; ++i) { - switch (unicode.charAt(i)) { - case (byte)'!': - case (byte)'*': - case (byte)'\'': - case (byte)'(': - case (byte)')': - case (byte)';': - case (byte)':': - case (byte)'@': - case (byte)'=': - case (byte)'+': - case (byte)'$': - case (byte)',': - case (byte)'?': - case (byte)'~': - case (byte)'[': - case (byte)']': - break; - case (byte)' ': - case (byte) '#': - case (byte) '%': - case (byte) '/': - case (byte)'&': - return true; - } - } - return false; - } - private static boolean needsUnescaping(String unicode) { return unicode.indexOf('%') > -1; } @@ -513,6 +518,8 @@ public final class URIStringUtils { testEscape("%", "%25"); testEscape("%01", "%2501"); testEscape("%GG", "%25GG"); + testEscape("säätö venttiili", "s%C3%A4%C3%A4t%C3%B6%20venttiili"); + testEscape("säätö", "s%C3%A4%C3%A4t%C3%B6"); } private static void testEscape(String unescaped, String expectedEscaped) {