Fix URIStringUtils.needsEscaping to escape chars outside 7-bit ASCII.

author Tuukka Lehtonen <tuukka.lehtonen@semantum.fi>

Mon, 5 Sep 2016 12:37:50 +0000 (15:37 +0300)

committer Tuukka Lehtonen <tuukka.lehtonen@semantum.fi>

Mon, 5 Sep 2016 14:38:17 +0000 (17:38 +0300)
author Tuukka Lehtonen <tuukka.lehtonen@semantum.fi>
Mon, 5 Sep 2016 12:37:50 +0000 (15:37 +0300)
committer Tuukka Lehtonen <tuukka.lehtonen@semantum.fi>
Mon, 5 Sep 2016 14:38:17 +0000 (17:38 +0300)
diff --git a/bundles/org.simantics.databoard/src/org/simantics/databoard/util/URIStringUtils.java b/bundles/org.simantics.databoard/src/org/simantics/databoard/util/URIStringUtils.java

index dde498a2c38bdcadbe3fb025cce664e2e0575c74..1cd658ba90614dd36d2050c8f0d736987eccfc8c 100644 (file)
--- a/bundles/org.simantics.databoard/src/org/simantics/databoard/util/URIStringUtils.java
+++ b/bundles/org.simantics.databoard/src/org/simantics/databoard/util/URIStringUtils.java
@@ -335,6 +335,50 @@ public final class URIStringUtils {
      final private static Charset UTF8 = Charset.forName("UTF-8");\r
      final private static Charset ASCII = Charset.forName("US-ASCII");\r
  \r
+    /*\r
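+     * Lookup table indexed by 7-bit US-ASCII code: true means the char is copied unescaped by encode().\r
+     * Cf. RFC 3986 section 2.2 Reserved Characters (January 2005) !*'();:@&=+$,/?#[] ('&', '/', '#' are not marked here).\r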
+     */\r
+    private static boolean[] UNESCAPED_US_ASCII_CHARS = new boolean[128];\r
+\r
+    static {\r
+        for(char ch='A';ch <= 'Z';++ch)\r
+            UNESCAPED_US_ASCII_CHARS[ch] = true;\r
+        for(char ch='a';ch <= 'z';++ch)\r
+            UNESCAPED_US_ASCII_CHARS[ch] = true;\r
+        for(char ch='0';ch <= '9';++ch)\r
+            UNESCAPED_US_ASCII_CHARS[ch] = true;\r
+        UNESCAPED_US_ASCII_CHARS[';'] = true;\r
+        UNESCAPED_US_ASCII_CHARS['?'] = true;\r
+        UNESCAPED_US_ASCII_CHARS[':'] = true;\r
+        UNESCAPED_US_ASCII_CHARS['@'] = true;\r
+        UNESCAPED_US_ASCII_CHARS['='] = true;\r
+        UNESCAPED_US_ASCII_CHARS['+'] = true;\r
+        UNESCAPED_US_ASCII_CHARS['$'] = true;\r
+        UNESCAPED_US_ASCII_CHARS['.'] = true;\r
+        UNESCAPED_US_ASCII_CHARS[','] = true;\r
+        UNESCAPED_US_ASCII_CHARS['-'] = true;\r
+        UNESCAPED_US_ASCII_CHARS['_'] = true;\r
+        UNESCAPED_US_ASCII_CHARS['!'] = true;\r
+        UNESCAPED_US_ASCII_CHARS['~'] = true;\r
+        UNESCAPED_US_ASCII_CHARS['*'] = true;\r
+        UNESCAPED_US_ASCII_CHARS['\''] = true;\r
+        UNESCAPED_US_ASCII_CHARS['('] = true;\r
+        UNESCAPED_US_ASCII_CHARS[')'] = true;\r
+        UNESCAPED_US_ASCII_CHARS['['] = true;\r
+        UNESCAPED_US_ASCII_CHARS[']'] = true;\r
+    }\r
+\r
+    private static boolean needsEscaping(String unicode) {\r
+        int len = unicode.length();\r
+        for (int i = 0; i < len; ++i) {\r
+            char ch = unicode.charAt(i);\r
+            if (ch >= 128 || !UNESCAPED_US_ASCII_CHARS[ch])\r
+                return true;\r
+        }\r
+        return false;\r
+    }\r
+\r
      /* Copied and modified from Jena 2.4 com.hp.hpl.jena.util.URIref */\r
      private static String encode(String unicode) {\r
          boolean needsEscapes = needsEscaping(unicode);\r
@@ -346,17 +390,13 @@ public final class URIStringUtils {
          int in = 0;\r
          int out = 0;\r
          while (in < utf8.length) {\r
-            switch (utf8[in]) {\r
-                case (byte)'a': case (byte)'b': case (byte)'c': case (byte)'d': case (byte)'e': case (byte)'f': case (byte)'g': case (byte)'h': case (byte)'i': case (byte)'j': case (byte)'k': case (byte)'l': case (byte)'m': case (byte)'n': case (byte)'o': case (byte)'p': case (byte)'q': case (byte)'r': case (byte)'s': case (byte)'t': case (byte)'u': case (byte)'v': case (byte)'w': case (byte)'x': case (byte)'y': case (byte)'z':\r
-                case (byte)'A': case (byte)'B': case (byte)'C': case (byte)'D': case (byte)'E': case (byte)'F': case (byte)'G': case (byte)'H': case (byte)'I': case (byte)'J': case (byte)'K': case (byte)'L': case (byte)'M': case (byte)'N': case (byte)'O': case (byte)'P': case (byte)'Q': case (byte)'R': case (byte)'S': case (byte)'T': case (byte)'U': case (byte)'V': case (byte)'W': case (byte)'X': case (byte)'Y': case (byte)'Z':\r
-                case (byte)'0': case (byte)'1': case (byte)'2': case (byte)'3': case (byte)'4': case (byte)'5': case (byte)'6': case (byte)'7': case (byte)'8': case (byte)'9':\r
-                case (byte)';': case (byte)'?': case (byte)':': case (byte)'@': case (byte)'=': case (byte)'+': case (byte)'$': case (byte)',':\r
-                case (byte)'-': case (byte)'_': case (byte)'.': case (byte)'!': case (byte)'~': case (byte)'*': case (byte)'\'': case (byte)'(': case (byte)')':\r
-                case (byte)'[': case (byte)']':\r
-                    rsltAscii[out] = utf8[in];\r
-                    out++;\r
-                    in++;\r
-                    break;\r
+            byte inCh = utf8[in];\r
+            if (inCh >= 0 && inCh < 128 && UNESCAPED_US_ASCII_CHARS[inCh]) {\r
+                rsltAscii[out] = inCh;\r
+                out++;\r
+                in++;\r
+            } else {\r
+                switch (inCh) {\r
                  case (byte)' ':\r
                      rsltAscii[out++] = (byte) '%';\r
                      rsltAscii[out++] = '2';\r
@@ -366,7 +406,7 @@ public final class URIStringUtils {
                  case (byte) '%':\r
                      // [lehtonen] NOTE: all input needs to be escaped, i.e. "%01" should result in "%2501", not "%01".\r
                      // escape+unescape is a bijection, not an idempotent operation. \r
-                    // Fall through to to escape '%' as '%25'\r
+                    // Fall through to escape '%' as '%25'\r
                  case (byte) '#':\r
                  case (byte) '/':\r
                      // Fall through to escape '/'\r
@@ -376,52 +416,17 @@ public final class URIStringUtils {
                  default:\r
                      rsltAscii[out++] = (byte) '%';\r
                      // Get rid of sign ...\r
-                    int c = (utf8[in]) & 255;\r
+                    int c = (inCh) & 255;\r
                      rsltAscii[out++] = hexEncode(c / 16);\r
                      rsltAscii[out++] = hexEncode(c % 16);\r
                      in++;\r
                      break;\r
+                }\r
              }\r
          }\r
          return new String(rsltAscii, 0, out, ASCII);\r
      }\r
  \r
-    /*\r
-     * RFC 3986 section 2.2 Reserved Characters (January 2005)\r
-     * !*'();:@&=+$,/?#[]\r
-     */\r
-    private static boolean needsEscaping(String unicode) {\r
-        int len = unicode.length();\r
-        for (int i = 0; i < len; ++i) {\r
-            switch (unicode.charAt(i)) {\r
-                case (byte)'!':\r
-                case (byte)'*':\r
-                case (byte)'\'':\r
-                case (byte)'(':\r
-                case (byte)')':\r
-                case (byte)';':\r
-                case (byte)':':\r
-                case (byte)'@':\r
-                case (byte)'=': \r
-                case (byte)'+':\r
-                case (byte)'$':\r
-                case (byte)',':\r
-                case (byte)'?':\r
-                case (byte)'~':\r
-                case (byte)'[':\r
-                case (byte)']':\r
-                    break;\r
-                case (byte)' ':\r
-                case (byte) '#':\r
-                case (byte) '%':\r
-                case (byte) '/':\r
-                case (byte)'&':\r
-                    return true;\r
-            }\r
-        }\r
-        return false;\r
-    }\r
-\r
      private static boolean needsUnescaping(String unicode) {\r
          return unicode.indexOf('%') > -1;\r
      }\r
@@ -513,6 +518,8 @@ public final class URIStringUtils {
          testEscape("%", "%25");\r
          testEscape("%01", "%2501");\r
          testEscape("%GG", "%25GG");\r
+        testEscape("säätö venttiili", "s%C3%A4%C3%A4t%C3%B6%20venttiili");\r
+        testEscape("säätö", "s%C3%A4%C3%A4t%C3%B6");\r
      }\r
  \r
      private static void testEscape(String unescaped, String expectedEscaped) {\r
author	Tuukka Lehtonen <tuukka.lehtonen@semantum.fi>
	Mon, 5 Sep 2016 12:37:50 +0000 (15:37 +0300)
committer	Tuukka Lehtonen <tuukka.lehtonen@semantum.fi>
	Mon, 5 Sep 2016 14:38:17 +0000 (17:38 +0300)