]> gerrit.simantics Code Review - simantics/platform.git/blobdiff - bundles/org.simantics.databoard/src/org/simantics/databoard/util/binary/UTF8.java
Migrated source code from Simantics SVN
[simantics/platform.git] / bundles / org.simantics.databoard / src / org / simantics / databoard / util / binary / UTF8.java
diff --git a/bundles/org.simantics.databoard/src/org/simantics/databoard/util/binary/UTF8.java b/bundles/org.simantics.databoard/src/org/simantics/databoard/util/binary/UTF8.java
new file mode 100644 (file)
index 0000000..6f16677
--- /dev/null
@@ -0,0 +1,236 @@
+package org.simantics.databoard.util.binary;\r
+\r
+import java.io.DataInput;\r
+import java.io.DataOutput;\r
+import java.io.IOException;\r
+import java.io.UTFDataFormatException;\r
+import java.nio.charset.Charset;\r
+\r
+/**\r
+ * Utils for handling Standard-UTF8 and <a href="http://download.oracle.com/javase/6/docs/api/java/io/DataInput.html">Modified-UTF8</a> Strings.<p>\r
+ *\r
+ * The differences between standard UTF8 and Modified are the following:\r
+ * <ul>\r
+ * <li>The null byte <code>'&#92;u0000'</code> is encoded in 2-byte format\r
+ *     rather than 1-byte, so that the encoded strings never have\r
+ *     embedded nulls.\r
+ * <li>Only the 1-byte, 2-byte, and 3-byte formats are used.\r
+ * <li>Supplementary characters are represented in the form of surrogate\r
+ *     pairs, each surrogate being encoded separately in 3-byte format.\r
+ * </ul>\r
+ *\r
+ * None of the methods read or write a length prefix; the caller is\r
+ * responsible for transferring the byte length of the encoded string.\r
+ */\r
+public class UTF8 {\r
+\r
+    public static final Charset CHARSET = Charset.forName("utf-8");\r
+\r
+    /**\r
+     * Get the number of bytes in the UTF-8 encoding of a string, i.e. the\r
+     * same value as <code>string.getBytes(CHARSET).length</code> but without\r
+     * allocating the byte array.\r
+     * <p>\r
+     * A valid surrogate pair (one supplementary code point) takes 4 bytes.\r
+     * An unpaired surrogate cannot be encoded in UTF-8; it is counted as\r
+     * 1 byte because {@link String#getBytes(Charset)}, which\r
+     * {@link #writeUTF(DataOutput, String)} uses, substitutes it with the\r
+     * single-byte replacement <code>'?'</code>.\r
+     *\r
+     * @param string string to measure\r
+     * @return byte length\r
+     */\r
+    public static int getUTF8EncodingByteLength(String string)\r
+    {\r
+        // http://en.wikipedia.org/wiki/UTF-8\r
+        int result = 0;\r
+        int length = string.length();\r
+        for (int i = 0; i < length; i++)\r
+        {\r
+            char c = string.charAt(i);\r
+            if (c <= 0x7f) {\r
+                result += 1;\r
+            } else if (c <= 0x07ff) {\r
+                result += 2;\r
+            } else if (Character.isHighSurrogate(c) && i + 1 < length\r
+                    && Character.isLowSurrogate(string.charAt(i + 1))) {\r
+                // One code point in U+10000..U+10FFFF: 4 bytes for both chars\r
+                result += 4;\r
+                i++;\r
+            } else if (c >= 0xD800 && c <= 0xDFFF) {\r
+                // Unpaired surrogate, replaced with '?' by the encoder\r
+                result += 1;\r
+            } else {\r
+                // Rest of the Basic Multilingual Plane\r
+                result += 3;\r
+            }\r
+        }\r
+        return result;\r
+    }\r
+\r
+    /**\r
+     * Get the number of bytes in the Modified-UTF-8 encoding of a string.\r
+     *\r
+     * @param str string to measure\r
+     * @return byte length\r
+     */\r
+    public static int getModifiedUTF8EncodingByteLength(String str)\r
+    {\r
+        int strlen = str.length();\r
+        int utflen = 0;\r
+\r
+        /* use charAt instead of copying String to char array */\r
+        for (int i = 0; i < strlen; i++) {\r
+            int c = str.charAt(i);\r
+            if ((c >= 0x0001) && (c <= 0x007F)) {\r
+                utflen++;\r
+            } else if (c > 0x07FF) {\r
+                utflen += 3;\r
+            } else {\r
+                // The null char and 0x0080..0x07FF\r
+                utflen += 2;\r
+            }\r
+        }\r
+        return utflen;\r
+    }\r
+\r
+    /**\r
+     * Write Modified-UTF8 to a stream. No length prefix is written; see\r
+     * {@link #getModifiedUTF8EncodingByteLength(String)} for the number of\r
+     * bytes this method produces.\r
+     *\r
+     * @param out output stream\r
+     * @param str string\r
+     * @throws IOException\r
+     */\r
+    public static void writeModifiedUTF(DataOutput out, String str) throws IOException {\r
+        // Adapted from DataOutputStream.writeUTF\r
+        int strlen = str.length();\r
+        for (int i = 0; i < strlen; i++) {\r
+            int c = str.charAt(i);\r
+            if ((c >= 0x0001) && (c <= 0x007F)) {\r
+                out.write(c);\r
+            } else if (c > 0x07FF) {\r
+                out.write(0xE0 | ((c >> 12) & 0x0F));\r
+                out.write(0x80 | ((c >>  6) & 0x3F));\r
+                out.write(0x80 | ((c >>  0) & 0x3F));\r
+            } else {\r
+                out.write(0xC0 | ((c >>  6) & 0x1F));\r
+                out.write(0x80 | ((c >>  0) & 0x3F));\r
+            }\r
+        }\r
+    }\r
+\r
+    /**\r
+     * Read Modified-UTF8 from a stream. Exactly <code>utflen</code> bytes\r
+     * are read from the input before decoding starts.\r
+     *\r
+     * @param in input\r
+     * @param utflen number of bytes\r
+     * @return string\r
+     * @throws UTFDataFormatException if the bytes are not valid Modified-UTF8\r
+     * @throws IOException\r
+     */\r
+    public static String readModifiedUTF(DataInput in, int utflen)\r
+    throws IOException, UTFDataFormatException\r
+    {\r
+        // Adapted from DataInputStream.readUTF\r
+        byte[] bytearr = new byte[utflen];\r
+        // Every char takes at least one byte, so utflen chars always suffice\r
+        char[] chararr = new char[utflen];\r
+\r
+        int c, char2, char3;\r
+        int count = 0;\r
+        int chararr_count = 0;\r
+\r
+        in.readFully(bytearr, 0, utflen);\r
+\r
+        // Fast path for the leading run of single-byte characters\r
+        while (count < utflen) {\r
+            c = (int) bytearr[count] & 0xff;\r
+            if (c > 127) break;\r
+            count++;\r
+            chararr[chararr_count++] = (char) c;\r
+        }\r
+\r
+        // Decode the remainder, which may contain multi-byte sequences\r
+        while (count < utflen) {\r
+            c = (int) bytearr[count] & 0xff;\r
+            switch (c >> 4) {\r
+                case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:\r
+                    /* 0xxxxxxx*/\r
+                    count++;\r
+                    chararr[chararr_count++] = (char) c;\r
+                    break;\r
+                case 12: case 13:\r
+                    /* 110x xxxx   10xx xxxx*/\r
+                    count += 2;\r
+                    if (count > utflen)\r
+                        throw new UTFDataFormatException(\r
+                            "malformed input: partial character at end");\r
+                    char2 = (int) bytearr[count-1];\r
+                    if ((char2 & 0xC0) != 0x80)\r
+                        throw new UTFDataFormatException(\r
+                            "malformed input around byte " + count);\r
+                    chararr[chararr_count++] = (char) (((c & 0x1F) << 6) |\r
+                                                       (char2 & 0x3F));\r
+                    break;\r
+                case 14:\r
+                    /* 1110 xxxx  10xx xxxx  10xx xxxx */\r
+                    count += 3;\r
+                    if (count > utflen)\r
+                        throw new UTFDataFormatException(\r
+                            "malformed input: partial character at end");\r
+                    char2 = (int) bytearr[count-2];\r
+                    char3 = (int) bytearr[count-1];\r
+                    if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80))\r
+                        throw new UTFDataFormatException(\r
+                            "malformed input around byte " + (count-1));\r
+                    chararr[chararr_count++] = (char) (((c     & 0x0F) << 12) |\r
+                                                       ((char2 & 0x3F) << 6)  |\r
+                                                       ((char3 & 0x3F) << 0));\r
+                    break;\r
+                default:\r
+                    /* 10xx xxxx,  1111 xxxx */\r
+                    throw new UTFDataFormatException(\r
+                        "malformed input around byte " + count);\r
+            }\r
+        }\r
+        // The number of chars produced may be less than utflen\r
+        return new String(chararr, 0, chararr_count);\r
+    }\r
+\r
+    /**\r
+     * Write Standard-UTF8 to a stream. No length prefix is written; the\r
+     * number of bytes written is\r
+     * {@link #getUTF8EncodingByteLength(String)}.\r
+     *\r
+     * @param out output stream\r
+     * @param str string\r
+     * @throws IOException\r
+     */\r
+    public static void writeUTF(DataOutput out, String str)\r
+    throws IOException\r
+    {\r
+        byte[] bytes = str.getBytes(CHARSET);\r
+        out.write(bytes);\r
+    }\r
+\r
+    /**\r
+     * Read Standard-UTF8 from a stream. Malformed byte sequences do not\r
+     * raise an error; they are replaced with the charset's default\r
+     * replacement string.\r
+     * @param in input\r
+     * @param len number of bytes\r
+     * @return string\r
+     * @throws IOException\r
+     */\r
+    public static String readUTF(DataInput in, int len)\r
+    throws IOException\r
+    {\r
+        byte[] bytes = new byte[len];\r
+        in.readFully(bytes);\r
+        return new String(bytes, UTF8.CHARSET);\r
+    }\r
+\r
+}\r