From: Tuukka Lehtonen <tuukka.lehtonen@semantum.fi>
Date: Wed, 7 Sep 2016 07:09:22 +0000 (+0300)
Subject: Re-implement URIStringUtils escape and unescape using Unicode
X-Git-Tag: v1.25.0~130^2^2
X-Git-Url: https://gerrit.simantics.org/r/gitweb?a=commitdiff_plain;h=refs%2Fchanges%2F53%2F53%2F2;p=simantics%2Fplatform.git

Re-implement URIStringUtils escape and unescape using Unicode

Escape now performs only the minimum amount of escaping necessary. All
String inputs are treated directly as UTF-16 character sequences, and
only a small set of 7-bit US-ASCII characters is escaped: " ", "%",
"#", "/" and "&". All other characters, including every Unicode
character above 127, are written as is into the escaped output.

The main effect of this change is that Simantics database URIs will
contain far fewer escaped characters than before. Since we always deal
with Java String (Unicode) character sequences in Simantics and these
URIs are only valid within Simantics, this approach doesn't really cause
any problems for us.

This implementation does not conform to RFC 2396 (Uniform Resource
Identifiers (URI): Generic Syntax), but it works for us.

refs #6673

Change-Id: I5279f68993a087b856d58c36d63427562ea1d1e4
---

diff --git a/bundles/org.simantics.databoard/src/org/simantics/databoard/util/URIStringUtils.java b/bundles/org.simantics.databoard/src/org/simantics/databoard/util/URIStringUtils.java
index 1cd658ba9..a11579f07 100644
--- a/bundles/org.simantics.databoard/src/org/simantics/databoard/util/URIStringUtils.java
+++ b/bundles/org.simantics.databoard/src/org/simantics/databoard/util/URIStringUtils.java
@@ -45,8 +45,7 @@
 
 package org.simantics.databoard.util;
 
-import java.nio.charset.Charset;
-import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
 
 
@@ -194,13 +193,14 @@ public final class URIStringUtils {
             return name;
     }
 
-    final private static int HTTP_POSITION = "http://".length();
+    final private static String HTTP_PREFIX = "http://";
+    final private static int HTTP_POSITION = HTTP_PREFIX.length();
 
     public static String[] splitURI(String uri) {
         int nextPathSeparator = uri.lastIndexOf(URIStringUtils.NAMESPACE_PATH_SEPARATOR);
         if (nextPathSeparator == -1) return null;
         if (nextPathSeparator == HTTP_POSITION - 1) {
-            if(uri.startsWith("http://")) return new String[] { "http://", uri.substring(HTTP_POSITION, uri.length()) };
+            if(uri.startsWith(HTTP_PREFIX)) return new String[] { HTTP_PREFIX, uri.substring(HTTP_POSITION, uri.length()) };
             else return null;
         }
         return new String[] {
@@ -208,12 +208,10 @@ public final class URIStringUtils {
                 uri.substring(nextPathSeparator + 1, uri.length())
         };
     }
-    
+
     public static List<String> splitURISCL(String uri) {
         String[] result = splitURI(uri);
-        ArrayList<String> list = new ArrayList<String>(result.length);
-        for(String s : result) list.add(s);
-        return list;
+        return Arrays.asList(result);
     }
 
     /**
@@ -263,8 +261,7 @@ public final class URIStringUtils {
     public static String escapeURI(String localName) {
         if (localName == null)
             throw new NullPointerException("null local name");
-        String result = encode(localName);
-        return result;
+        return encode(localName);
     }
 
     /**
@@ -276,8 +273,7 @@ public final class URIStringUtils {
      * @return the joined namespace
      */
     public static String appendURINamespace(String namespace, String suffix) {
-        //return namespace + NAMESPACE_PATH_SEPARATOR + suffix;
-        return new StringBuffer(namespace.length() + 1 + suffix.length())
+        return new StringBuilder(namespace.length() + 1 + suffix.length())
         .append(namespace)
         .append(NAMESPACE_PATH_SEPARATOR)
         .append(suffix)
@@ -293,9 +289,8 @@ public final class URIStringUtils {
      * @return the joined URI
      */
     public static String makeURI(String namespace, String localName) {
-        //return namespace + NAMESPACE_LOCAL_SEPARATOR + escapeURI(localName);
         String escapedLocalName = escapeURI(localName);
-        return new StringBuffer(namespace.length() + 1 + escapedLocalName.length())
+        return new StringBuilder(namespace.length() + 1 + escapedLocalName.length())
         .append(namespace)
         .append(NAMESPACE_LOCAL_SEPARATOR)
         .append(escapedLocalName)
@@ -332,99 +327,59 @@ public final class URIStringUtils {
     }
 
 
-    final private static Charset UTF8 = Charset.forName("UTF-8");
-    final private static Charset ASCII = Charset.forName("US-ASCII");
-
     /*
      * RFC 3986 section 2.2 Reserved Characters (January 2005)
      * !*'();:@&=+$,/?#[]
      */
-    private static boolean[] UNESCAPED_US_ASCII_CHARS = new boolean[128];
+    private static boolean[] ESCAPED_US_ASCII_CHARS = new boolean[128];
 
     static {
-        for(char ch='A';ch <= 'Z';++ch)
-            UNESCAPED_US_ASCII_CHARS[ch] = true;
-        for(char ch='a';ch <= 'z';++ch)
-            UNESCAPED_US_ASCII_CHARS[ch] = true;
-        for(char ch='0';ch <= '9';++ch)
-            UNESCAPED_US_ASCII_CHARS[ch] = true;
-        UNESCAPED_US_ASCII_CHARS[';'] = true;
-        UNESCAPED_US_ASCII_CHARS['?'] = true;
-        UNESCAPED_US_ASCII_CHARS[':'] = true;
-        UNESCAPED_US_ASCII_CHARS['@'] = true;
-        UNESCAPED_US_ASCII_CHARS['='] = true;
-        UNESCAPED_US_ASCII_CHARS['+'] = true;
-        UNESCAPED_US_ASCII_CHARS['$'] = true;
-        UNESCAPED_US_ASCII_CHARS['.'] = true;
-        UNESCAPED_US_ASCII_CHARS[','] = true;
-        UNESCAPED_US_ASCII_CHARS['-'] = true;
-        UNESCAPED_US_ASCII_CHARS['_'] = true;
-        UNESCAPED_US_ASCII_CHARS['!'] = true;
-        UNESCAPED_US_ASCII_CHARS['~'] = true;
-        UNESCAPED_US_ASCII_CHARS['*'] = true;
-        UNESCAPED_US_ASCII_CHARS['\''] = true;
-        UNESCAPED_US_ASCII_CHARS['('] = true;
-        UNESCAPED_US_ASCII_CHARS[')'] = true;
-        UNESCAPED_US_ASCII_CHARS['['] = true;
-        UNESCAPED_US_ASCII_CHARS[']'] = true;
+        ESCAPED_US_ASCII_CHARS[' '] = true;
+        // IMPORTANT NOTE: every time escape is invoked, all input needs to be escaped,
+        // i.e. escape("%01") should result in "%2501", not "%01".
+        // escape and unescape form a bijection, where neither
+        // of them is an idempotent operation. 
+        ESCAPED_US_ASCII_CHARS['%'] = true;
+        // '#' and '/' are URL segment/fragment delimiters, need to be escaped in names.
+        ESCAPED_US_ASCII_CHARS['#'] = true;
+        ESCAPED_US_ASCII_CHARS['/'] = true;
+        // Escape '&' characters to avoid them being interpreted as SGML entities.
+        ESCAPED_US_ASCII_CHARS['&'] = true;
     }
 
-    private static boolean needsEscaping(String unicode) {
+    private static int needsEscaping(String unicode) {
         int len = unicode.length();
+        int escapeCount = 0;
         for (int i = 0; i < len; ++i) {
             char ch = unicode.charAt(i);
-            if (ch >= 128 || !UNESCAPED_US_ASCII_CHARS[ch])
-                return true;
+            if (ch < 128 && ESCAPED_US_ASCII_CHARS[ch])
+                ++escapeCount;
         }
-        return false;
+        return escapeCount;
     }
 
-    /* Copied and modified from Jena 2.4 com.hp.hpl.jena.util.URIref */
     private static String encode(String unicode) {
-        boolean needsEscapes = needsEscaping(unicode);
-        if (!needsEscapes)
+        int needsEscapes = needsEscaping(unicode);
+        if (needsEscapes == 0)
             return unicode;
 
-        byte utf8[] = unicode.getBytes(UTF8);
-        byte rsltAscii[] = new byte[utf8.length * 6];
+        int len = unicode.length();
+        char result[] = new char[(len - needsEscapes) + needsEscapes * 3];
         int in = 0;
         int out = 0;
-        while (in < utf8.length) {
-            byte inCh = utf8[in];
-            if (inCh >= 0 && inCh < 128 && UNESCAPED_US_ASCII_CHARS[inCh]) {
-                rsltAscii[out] = inCh;
-                out++;
-                in++;
+        while (in < len) {
+            char inCh = unicode.charAt(in++);
+            if (inCh >= 128 || !ESCAPED_US_ASCII_CHARS[inCh]) {
+                result[out++] = inCh;
             } else {
-                switch (inCh) {
-                case (byte)' ':
-                    rsltAscii[out++] = (byte) '%';
-                    rsltAscii[out++] = '2';
-                    rsltAscii[out++] = '0';
-                    in++;
-                    break;
-                case (byte) '%':
-                    // [lehtonen] NOTE: all input needs to be escaped, i.e. "%01" should result in "%2501", not "%01".
-                    // escape+unescape is a bijection, not an idempotent operation. 
-                    // Fall through to escape '%' as '%25'
-                case (byte) '#':
-                case (byte) '/':
-                    // Fall through to escape '/'
-                case (byte)'&':
-                    // Fall through to escape '&' characters to avoid them
-                    // being interpreted as SGML entities.
-                default:
-                    rsltAscii[out++] = (byte) '%';
-                    // Get rid of sign ...
-                    int c = (inCh) & 255;
-                    rsltAscii[out++] = hexEncode(c / 16);
-                    rsltAscii[out++] = hexEncode(c % 16);
-                    in++;
-                    break;
-                }
+                // Only selected 7-bit US-ASCII characters are escaped
+                int c = inCh & 255;
+                result[out++] = '%';
+                result[out++] = (char) hexEncode(c / 16);
+                result[out++] = (char) hexEncode(c % 16);
             }
         }
-        return new String(rsltAscii, 0, out, ASCII);
+        return new String(result, 0, out);
     }
 
     private static boolean needsUnescaping(String unicode) {
@@ -432,13 +387,12 @@ public final class URIStringUtils {
     }
 
     /**
-     * Convert a URI, in US-ASCII, with escaped characters taken from UTF-8, to
-     * the corresponding Unicode string. On ill-formed input the results are
-     * undefined, specifically if the unescaped version is not a UTF-8 String,
-     * some String will be returned.
+     * Convert a URI, in UTF-16 with escaped characters taken from US-ASCII, to
+     * the corresponding unescaped Unicode string. On ill-formed input the results are
+     * undefined.
      * 
      * @param uri the uri, in characters specified by RFC 2396 + '#'.
-     * @return the corresponding Unicode String.
+     * @return the corresponding unescaped Unicode String.
      * @exception IllegalArgumentException if a % hex sequence is ill-formed.
      */
     public static String unescape(String uri) {
@@ -446,26 +400,29 @@ public final class URIStringUtils {
             if (!needsUnescaping(uri))
                 return uri;
 
-            byte ascii[] = uri.getBytes("US-ASCII");
-            byte utf8[] = new byte[ascii.length];
+            int len = uri.length();
+            String unicode = uri;
+            char result[] = new char[len];
             int in = 0;
             int out = 0;
-            while ( in < ascii.length ) {
-                if (ascii[in] == (byte) '%') {
-                    in++;
-                    utf8[out++] = (byte) (hexDecode(ascii[in]) * 16 | hexDecode(ascii[in + 1]));
+            while (in < len) {
+                char inCh = unicode.charAt(in++);
+                if (inCh == '%') {
+                    char d1 = unicode.charAt(in);
+                    char d2 = unicode.charAt(in+1);
+                    if (d1 > 127 || d2 > 127)
+                        throw new IllegalArgumentException("Invalid hex digit escape sequence in " + uri + " at " + in);
+                    result[out++] = (char) (hexDecode((byte) d1) * 16 | hexDecode((byte) d2));
                     in += 2;
                 } else {
-                    utf8[out++] = ascii[in++];
+                    result[out++] = inCh;
                 }
             }
-            return new String(utf8, 0, out, "UTF-8");
+            return new String(result, 0, out);
         } catch (IllegalArgumentException e) {
             throw new IllegalArgumentException("Problem while unescaping string: " + uri, e);
-        } catch (java.io.UnsupportedEncodingException e) {
-            throw new Error("The JVM is required to support UTF-8 and US-ASCII encodings.");
-        } catch (ArrayIndexOutOfBoundsException ee) {
-            throw new IllegalArgumentException("Incomplete Hex escape sequence in " + uri);
+        } catch (IndexOutOfBoundsException ee) {
+            throw new IllegalArgumentException("Incomplete hex digit escape sequence in " + uri);
         }
     }
 
@@ -496,40 +453,36 @@ public final class URIStringUtils {
      * @param args
      */
     public static void main(String[] args) {
-        String s;
-        s = "http://www.vtt.fi%2FSome- %25 Namespace/Jotain";
-        System.out.println(String.format("escape+unescape: %s -> %s -> %s", s, escape(s), unescape(escape(s))));
-        s = "http://www.vtt.fi%2FPSK";
-        System.out.println(String.format("unescape: %s -> %s", s, unescape(s)));
-        s = "http://www.vtt.fi%2FSome-Namespace/Jotain / Muuta";
-        System.out.println(String.format("escape: %s -> %s", s, escape(s)));
-        s = "Jotain / Muuta";
-        System.out.println(String.format("escape: %s -> %s", s, escape(s)));
-
-        System.out.println("escapeURI: " + escapeURI("foo/bar/org%2Fnet"));
-        System.out.println("escapeURI('...#...'): " + escapeURI("foo/bar#org%2Fnet"));
-        s = makeURI("http://foo.bar.com/foo/bar", "baz/guuk/org%2Fnet");
+        String s = makeURI("http://foo.bar.com/foo/bar", "baz/guuk/org%2Fnet");
         System.out.println("escapeURI: " + s);
         System.out.println("getNamespace: " + getNamespace(s));
         System.out.println("getLocalName: " + getLocalName(s));
 
+        System.out.println("escapeURI: " + escapeURI("foo/bar/org%2Fnet"));
+        System.out.println("escapeURI('...#...'): " + escapeURI("foo/bar#org%2Fnet"));
+
         testEscape("/", "%2F");
         testEscape("#", "%23");
         testEscape("%", "%25");
         testEscape("%01", "%2501");
         testEscape("%GG", "%25GG");
-        testEscape("säätö venttiili", "s%C3%A4%C3%A4t%C3%B6%20venttiili");
-        testEscape("säätö", "s%C3%A4%C3%A4t%C3%B6");
+        testEscape("säätö venttiili", "säätö%20venttiili");
+        testEscape("säätö", "säätö");
+        testEscape("Something / Else", "Something%20%2F%20Else");
+        testEscape("http://www.vtt.fi%2FSome- %25 Namespace/Something", "http:%2F%2Fwww.vtt.fi%252FSome-%20%2525%20Namespace%2FSomething");
+        testEscape("http://www.vtt.fi/PSK", "http:%2F%2Fwww.vtt.fi%2FPSK");
+        testEscape("http://www.vtt.fi%2FSome-Namespace/Something / Else", "http:%2F%2Fwww.vtt.fi%252FSome-Namespace%2FSomething%20%2F%20Else");
     }
 
     private static void testEscape(String unescaped, String expectedEscaped) {
         String esc = escape(unescaped);
         String unesc = unescape(esc);
-        System.out.format("escape('%s')='%s', unescape('%s')='%s'\n", unescaped, esc, esc, unesc);
+        System.out.format("escape('%s') -> '%s', unescape('%s') -> '%s'", unescaped, esc, esc, unesc);
         if (!esc.equals(expectedEscaped))
             throw new AssertionError("escape('" + unescaped + "') was expected to return '" + expectedEscaped + "' but returned '" + esc + "'");
         if (!unesc.equals(unescaped))
             throw new AssertionError("unescape(escape('" + unescaped + "'))=unescape(" + esc + ") was expected to return '" + unescaped + "' but returned '" + unesc + "'");
+        System.out.println(" OK");
     }
 
 }