Re-implement URIStringUtils escape and unescape using Unicode

author Tuukka Lehtonen <tuukka.lehtonen@semantum.fi>
Wed, 7 Sep 2016 07:09:22 +0000 (10:09 +0300)

committer Tuukka Lehtonen <tuukka.lehtonen@semantum.fi>
Wed, 7 Sep 2016 07:14:19 +0000 (10:14 +0300)
diff --git a/bundles/org.simantics.databoard/src/org/simantics/databoard/util/URIStringUtils.java b/bundles/org.simantics.databoard/src/org/simantics/databoard/util/URIStringUtils.java

index 1cd658ba90614dd36d2050c8f0d736987eccfc8c..a11579f07e774404394e448daf3a4d4799651512 100644 (file)
--- a/bundles/org.simantics.databoard/src/org/simantics/databoard/util/URIStringUtils.java
+++ b/bundles/org.simantics.databoard/src/org/simantics/databoard/util/URIStringUtils.java
@@ -45,8 +45,7 @@
  \r
  package org.simantics.databoard.util;\r
  \r
-import java.nio.charset.Charset;\r
-import java.util.ArrayList;\r
+import java.util.Arrays;\r
  import java.util.List;\r
  \r
  \r
@@ -194,13 +193,14 @@ public final class URIStringUtils {
              return name;\r
      }\r
  \r
-    final private static int HTTP_POSITION = "http://".length();\r
+    final private static String HTTP_PREFIX = "http://";\r
+    final private static int HTTP_POSITION = HTTP_PREFIX.length();\r
  \r
      public static String[] splitURI(String uri) {\r
          int nextPathSeparator = uri.lastIndexOf(URIStringUtils.NAMESPACE_PATH_SEPARATOR);\r
          if (nextPathSeparator == -1) return null;\r
          if (nextPathSeparator == HTTP_POSITION - 1) {\r
-            if(uri.startsWith("http://")) return new String[] { "http://", uri.substring(HTTP_POSITION, uri.length()) };\r
+            if(uri.startsWith(HTTP_PREFIX)) return new String[] { HTTP_PREFIX, uri.substring(HTTP_POSITION, uri.length()) };\r
              else return null;\r
          }\r
          return new String[] {\r
@@ -208,12 +208,10 @@ public final class URIStringUtils {
                  uri.substring(nextPathSeparator + 1, uri.length())\r
          };\r
      }\r
-    \r
+\r
      public static List<String> splitURISCL(String uri) {\r
          String[] result = splitURI(uri);\r
-        ArrayList<String> list = new ArrayList<String>(result.length);\r
-        for(String s : result) list.add(s);\r
-        return list;\r
+        return Arrays.asList(result);\r
      }\r
  \r
      /**\r
@@ -263,8 +261,7 @@ public final class URIStringUtils {
      public static String escapeURI(String localName) {\r
          if (localName == null)\r
              throw new NullPointerException("null local name");\r
-        String result = encode(localName);\r
-        return result;\r
+        return encode(localName);\r
      }\r
  \r
      /**\r
@@ -276,8 +273,7 @@ public final class URIStringUtils {
       * @return the joined namespace\r
       */\r
      public static String appendURINamespace(String namespace, String suffix) {\r
-        //return namespace + NAMESPACE_PATH_SEPARATOR + suffix;\r
-        return new StringBuffer(namespace.length() + 1 + suffix.length())\r
+        return new StringBuilder(namespace.length() + 1 + suffix.length())\r
          .append(namespace)\r
          .append(NAMESPACE_PATH_SEPARATOR)\r
          .append(suffix)\r
@@ -293,9 +289,8 @@ public final class URIStringUtils {
       * @return the joined URI\r
       */\r
      public static String makeURI(String namespace, String localName) {\r
-        //return namespace + NAMESPACE_LOCAL_SEPARATOR + escapeURI(localName);\r
          String escapedLocalName = escapeURI(localName);\r
-        return new StringBuffer(namespace.length() + 1 + escapedLocalName.length())\r
+        return new StringBuilder(namespace.length() + 1 + escapedLocalName.length())\r
          .append(namespace)\r
          .append(NAMESPACE_LOCAL_SEPARATOR)\r
          .append(escapedLocalName)\r
@@ -332,99 +327,59 @@ public final class URIStringUtils {
      }\r
  \r
  \r
-    final private static Charset UTF8 = Charset.forName("UTF-8");\r
-    final private static Charset ASCII = Charset.forName("US-ASCII");\r
-\r
      /*\r
       * RFC 3986 section 2.2 Reserved Characters (January 2005)\r
       * !*'();:@&=+$,/?#[]\r
       */\r
-    private static boolean[] UNESCAPED_US_ASCII_CHARS = new boolean[128];\r
+    private static boolean[] ESCAPED_US_ASCII_CHARS = new boolean[128];\r
  \r
      static {\r
-        for(char ch='A';ch <= 'Z';++ch)\r
-            UNESCAPED_US_ASCII_CHARS[ch] = true;\r
-        for(char ch='a';ch <= 'z';++ch)\r
-            UNESCAPED_US_ASCII_CHARS[ch] = true;\r
-        for(char ch='0';ch <= '9';++ch)\r
-            UNESCAPED_US_ASCII_CHARS[ch] = true;\r
-        UNESCAPED_US_ASCII_CHARS[';'] = true;\r
-        UNESCAPED_US_ASCII_CHARS['?'] = true;\r
-        UNESCAPED_US_ASCII_CHARS[':'] = true;\r
-        UNESCAPED_US_ASCII_CHARS['@'] = true;\r
-        UNESCAPED_US_ASCII_CHARS['='] = true;\r
-        UNESCAPED_US_ASCII_CHARS['+'] = true;\r
-        UNESCAPED_US_ASCII_CHARS['$'] = true;\r
-        UNESCAPED_US_ASCII_CHARS['.'] = true;\r
-        UNESCAPED_US_ASCII_CHARS[','] = true;\r
-        UNESCAPED_US_ASCII_CHARS['-'] = true;\r
-        UNESCAPED_US_ASCII_CHARS['_'] = true;\r
-        UNESCAPED_US_ASCII_CHARS['!'] = true;\r
-        UNESCAPED_US_ASCII_CHARS['~'] = true;\r
-        UNESCAPED_US_ASCII_CHARS['*'] = true;\r
-        UNESCAPED_US_ASCII_CHARS['\''] = true;\r
-        UNESCAPED_US_ASCII_CHARS['('] = true;\r
-        UNESCAPED_US_ASCII_CHARS[')'] = true;\r
-        UNESCAPED_US_ASCII_CHARS['['] = true;\r
-        UNESCAPED_US_ASCII_CHARS[']'] = true;\r
+        ESCAPED_US_ASCII_CHARS[' '] = true;\r
+        // IMPORTANT NOTE: every time escape is invoked, all input needs to be escaped,\r
+        // i.e. escape("%01") should result in "%2501", not "%01".\r
+        // escape and unescape form a bijection, where neither\r
+        // of them is an idempotent operation. \r
+        ESCAPED_US_ASCII_CHARS['%'] = true;\r
+        // '#' and '/' are URL segment/fragment delimiters, need to be escaped in names.\r
+        ESCAPED_US_ASCII_CHARS['#'] = true;\r
+        ESCAPED_US_ASCII_CHARS['/'] = true;\r
+        // Escape '&' characters to avoid them being interpreted as SGML entities.\r
+        ESCAPED_US_ASCII_CHARS['&'] = true;\r
      }\r
  \r
-    private static boolean needsEscaping(String unicode) {\r
+    private static int needsEscaping(String unicode) {\r
          int len = unicode.length();\r
+        int escapeCount = 0;\r
          for (int i = 0; i < len; ++i) {\r
              char ch = unicode.charAt(i);\r
-            if (ch >= 128 || !UNESCAPED_US_ASCII_CHARS[ch])\r
-                return true;\r
+            if (ch < 128 && ESCAPED_US_ASCII_CHARS[ch])\r
+                ++escapeCount;\r
          }\r
-        return false;\r
+        return escapeCount;\r
      }\r
  \r
-    /* Copied and modified from Jena 2.4 com.hp.hpl.jena.util.URIref */\r
      private static String encode(String unicode) {\r
-        boolean needsEscapes = needsEscaping(unicode);\r
-        if (!needsEscapes)\r
+        int needsEscapes = needsEscaping(unicode);\r
+        if (needsEscapes == 0)\r
              return unicode;\r
  \r
-        byte utf8[] = unicode.getBytes(UTF8);\r
-        byte rsltAscii[] = new byte[utf8.length * 6];\r
+        int len = unicode.length();\r
+        char result[] = new char[(len - needsEscapes) + needsEscapes * 3];\r
          int in = 0;\r
          int out = 0;\r
-        while (in < utf8.length) {\r
-            byte inCh = utf8[in];\r
-            if (inCh >= 0 && inCh < 128 && UNESCAPED_US_ASCII_CHARS[inCh]) {\r
-                rsltAscii[out] = inCh;\r
-                out++;\r
-                in++;\r
+        while (in < len) {\r
+            char inCh = unicode.charAt(in++);\r
+            if (inCh >= 128 || !ESCAPED_US_ASCII_CHARS[inCh]) {\r
+                result[out++] = inCh;\r
              } else {\r
-                switch (inCh) {\r
-                case (byte)' ':\r
-                    rsltAscii[out++] = (byte) '%';\r
-                    rsltAscii[out++] = '2';\r
-                    rsltAscii[out++] = '0';\r
-                    in++;\r
-                    break;\r
-                case (byte) '%':\r
-                    // [lehtonen] NOTE: all input needs to be escaped, i.e. "%01" should result in "%2501", not "%01".\r
-                    // escape+unescape is a bijection, not an idempotent operation. \r
-                    // Fall through to escape '%' as '%25'\r
-                case (byte) '#':\r
-                case (byte) '/':\r
-                    // Fall through to escape '/'\r
-                case (byte)'&':\r
-                    // Fall through to escape '&' characters to avoid them\r
-                    // being interpreted as SGML entities.\r
-                default:\r
-                    rsltAscii[out++] = (byte) '%';\r
-                    // Get rid of sign ...\r
-                    int c = (inCh) & 255;\r
-                    rsltAscii[out++] = hexEncode(c / 16);\r
-                    rsltAscii[out++] = hexEncode(c % 16);\r
-                    in++;\r
-                    break;\r
-                }\r
+                // Only selected 7-bit US-ASCII characters are escaped\r
+                int c = inCh & 255;\r
+                result[out++] = '%';\r
+                result[out++] = (char) hexEncode(c / 16);\r
+                result[out++] = (char) hexEncode(c % 16);\r
              }\r
          }\r
-        return new String(rsltAscii, 0, out, ASCII);\r
+        return new String(result, 0, out);\r
      }\r
  \r
      private static boolean needsUnescaping(String unicode) {\r
@@ -432,13 +387,12 @@ public final class URIStringUtils {
      }\r
  \r
      /**\r
-     * Convert a URI, in US-ASCII, with escaped characters taken from UTF-8, to\r
-     * the corresponding Unicode string. On ill-formed input the results are\r
-     * undefined, specifically if the unescaped version is not a UTF-8 String,\r
-     * some String will be returned.\r
+     * Convert a URI, in UTF-16 with escaped characters taken from US-ASCII, to\r
+     * the corresponding unescaped Unicode string. On ill-formed input the results are\r
+     * undefined.\r
       * \r
       * @param uri the uri, in characters specified by RFC 2396 + '#'.\r
-     * @return the corresponding Unicode String.\r
+     * @return the corresponding unescaped Unicode String.\r
       * @exception IllegalArgumentException if a % hex sequence is ill-formed.\r
       */\r
      public static String unescape(String uri) {\r
@@ -446,26 +400,29 @@ public final class URIStringUtils {
              if (!needsUnescaping(uri))\r
                  return uri;\r
  \r
-            byte ascii[] = uri.getBytes("US-ASCII");\r
-            byte utf8[] = new byte[ascii.length];\r
+            int len = uri.length();\r
+            String unicode = uri;\r
+            char result[] = new char[len];\r
              int in = 0;\r
              int out = 0;\r
-            while ( in < ascii.length ) {\r
-                if (ascii[in] == (byte) '%') {\r
-                    in++;\r
-                    utf8[out++] = (byte) (hexDecode(ascii[in]) * 16 | hexDecode(ascii[in + 1]));\r
+            while (in < len) {\r
+                char inCh = unicode.charAt(in++);\r
+                if (inCh == '%') {\r
+                    char d1 = unicode.charAt(in);\r
+                    char d2 = unicode.charAt(in+1);\r
+                    if (d1 > 127 || d2 > 127)\r
+                        throw new IllegalArgumentException("Invalid hex digit escape sequence in " + uri + " at " + in);\r
+                    result[out++] = (char) (hexDecode((byte) d1) * 16 | hexDecode((byte) d2));\r
                      in += 2;\r
                  } else {\r
-                    utf8[out++] = ascii[in++];\r
+                    result[out++] = inCh;\r
                  }\r
              }\r
-            return new String(utf8, 0, out, "UTF-8");\r
+            return new String(result, 0, out);\r
          } catch (IllegalArgumentException e) {\r
              throw new IllegalArgumentException("Problem while unescaping string: " + uri, e);\r
-        } catch (java.io.UnsupportedEncodingException e) {\r
-            throw new Error("The JVM is required to support UTF-8 and US-ASCII encodings.");\r
-        } catch (ArrayIndexOutOfBoundsException ee) {\r
-            throw new IllegalArgumentException("Incomplete Hex escape sequence in " + uri);\r
+        } catch (IndexOutOfBoundsException ee) {\r
+            throw new IllegalArgumentException("Incomplete hex digit escape sequence in " + uri);\r
          }\r
      }\r
  \r
@@ -496,40 +453,36 @@ public final class URIStringUtils {
       * @param args\r
       */\r
      public static void main(String[] args) {\r
-        String s;\r
-        s = "http://www.vtt.fi%2FSome- %25 Namespace/Jotain";\r
-        System.out.println(String.format("escape+unescape: %s -> %s -> %s", s, escape(s), unescape(escape(s))));\r
-        s = "http://www.vtt.fi%2FPSK";\r
-        System.out.println(String.format("unescape: %s -> %s", s, unescape(s)));\r
-        s = "http://www.vtt.fi%2FSome-Namespace/Jotain / Muuta";\r
-        System.out.println(String.format("escape: %s -> %s", s, escape(s)));\r
-        s = "Jotain / Muuta";\r
-        System.out.println(String.format("escape: %s -> %s", s, escape(s)));\r
-\r
-        System.out.println("escapeURI: " + escapeURI("foo/bar/org%2Fnet"));\r
-        System.out.println("escapeURI('...#...'): " + escapeURI("foo/bar#org%2Fnet"));\r
-        s = makeURI("http://foo.bar.com/foo/bar", "baz/guuk/org%2Fnet");\r
+        String s = makeURI("http://foo.bar.com/foo/bar", "baz/guuk/org%2Fnet");\r
          System.out.println("escapeURI: " + s);\r
          System.out.println("getNamespace: " + getNamespace(s));\r
          System.out.println("getLocalName: " + getLocalName(s));\r
  \r
+        System.out.println("escapeURI: " + escapeURI("foo/bar/org%2Fnet"));\r
+        System.out.println("escapeURI('...#...'): " + escapeURI("foo/bar#org%2Fnet"));\r
+\r
          testEscape("/", "%2F");\r
          testEscape("#", "%23");\r
          testEscape("%", "%25");\r
          testEscape("%01", "%2501");\r
          testEscape("%GG", "%25GG");\r
-        testEscape("säätö venttiili", "s%C3%A4%C3%A4t%C3%B6%20venttiili");\r
-        testEscape("säätö", "s%C3%A4%C3%A4t%C3%B6");\r
+        testEscape("säätö venttiili", "säätö%20venttiili");\r
+        testEscape("säätö", "säätö");\r
+        testEscape("Something / Else", "Something%20%2F%20Else");\r
+        testEscape("http://www.vtt.fi%2FSome- %25 Namespace/Something", "http:%2F%2Fwww.vtt.fi%252FSome-%20%2525%20Namespace%2FSomething");\r
+        testEscape("http://www.vtt.fi/PSK", "http:%2F%2Fwww.vtt.fi%2FPSK");\r
+        testEscape("http://www.vtt.fi%2FSome-Namespace/Something / Else", "http:%2F%2Fwww.vtt.fi%252FSome-Namespace%2FSomething%20%2F%20Else");\r
      }\r
  \r
      private static void testEscape(String unescaped, String expectedEscaped) {\r
          String esc = escape(unescaped);\r
          String unesc = unescape(esc);\r
-        System.out.format("escape('%s')='%s', unescape('%s')='%s'\n", unescaped, esc, esc, unesc);\r
+        System.out.format("escape('%s') -> '%s', unescape('%s') -> '%s'", unescaped, esc, esc, unesc);\r
          if (!esc.equals(expectedEscaped))\r
              throw new AssertionError("escape('" + unescaped + "') was expected to return '" + expectedEscaped + "' but returned '" + esc + "'");\r
          if (!unesc.equals(unescaped))\r
              throw new AssertionError("unescape(escape('" + unescaped + "'))=unescape(" + esc + ") was expected to return '" + unescaped + "' but returned '" + unesc + "'");\r
+        System.out.println(" OK");\r
      }\r
  \r
  }\r
author	Tuukka Lehtonen <tuukka.lehtonen@semantum.fi>
	Wed, 7 Sep 2016 07:09:22 +0000 (10:09 +0300)
committer	Tuukka Lehtonen <tuukka.lehtonen@semantum.fi>
	Wed, 7 Sep 2016 07:14:19 +0000 (10:14 +0300)