--- /dev/null
+package org.simantics.databoard.util.binary;\r
+\r
+import java.io.DataInput;\r
+import java.io.DataOutput;\r
+import java.io.IOException;\r
+import java.io.UTFDataFormatException;\r
+import java.nio.charset.Charset;\r
+\r
/**
 * Utilities for handling Standard-UTF8 and
 * <a href="http://download.oracle.com/javase/6/docs/api/java/io/DataInput.html">Modified-UTF8</a>
 * strings.<p>
 *
 * The differences between Standard UTF-8 and Modified UTF-8 are the following:
 * <ul>
 * <li>The null character <code>'&#92;u0000'</code> is encoded in 2-byte format
 *     rather than 1-byte, so that the encoded strings never have
 *     embedded nulls.
 * <li>Only the 1-byte, 2-byte, and 3-byte formats are used.
 * <li>Supplementary characters are represented in the form of surrogate
 *     pairs, i.e. each surrogate is encoded on its own as a 3-byte sequence.
 * </ul>
 *
 * None of the read/write methods store a length prefix; the caller is
 * responsible for transferring the byte length (see the two
 * <code>get...EncodingByteLength</code> methods).
 */
public class UTF8 {

    /** Charset used by the Standard-UTF8 methods of this class. */
    public static final Charset CHARSET = Charset.forName("utf-8");

    /**
     * Get the number of bytes in the Standard UTF-8 encoding of a string.<p>
     *
     * The result equals <code>string.getBytes(CHARSET).length</code>, which is
     * what {@link #writeUTF(DataOutput, String)} writes, but is computed
     * without allocating the byte array.
     *
     * @param string string to measure, not null
     * @return byte length (the <code>int</code> sum is not overflow-checked)
     */
    public static int getUTF8EncodingByteLength(String string)
    {
        int result = 0;
        int length = string.length();
        for (int i = 0; i < length; i++) {
            char c = string.charAt(i);
            if (c <= 0x007F) {
                result += 1;
            } else if (c <= 0x07FF) {
                result += 2;
            } else if (c < Character.MIN_SURROGATE || c > Character.MAX_SURROGATE) {
                // Remaining Basic Multilingual Plane characters
                result += 3;
            } else if (Character.isHighSurrogate(c)
                    && i + 1 < length
                    && Character.isLowSurrogate(string.charAt(i + 1))) {
                // A well-formed surrogate pair is ONE supplementary code point,
                // which UTF-8 encodes with 4 bytes. Consume the low surrogate too.
                result += 4;
                i++;
            } else {
                // Unpaired surrogate: not encodable in UTF-8. String.getBytes(Charset)
                // substitutes the charset's replacement, a single byte for UTF-8.
                result += 1;
            }
        }
        return result;
    }

    /**
     * Get the number of bytes in the Modified UTF-8 encoding of a string,
     * i.e. the number of bytes {@link #writeModifiedUTF(DataOutput, String)}
     * writes for it.
     *
     * @param str string to measure, not null
     * @return byte length (the <code>int</code> sum is not overflow-checked)
     */
    public static int getModifiedUTF8EncodingByteLength(String str)
    {
        int strlen = str.length();
        int utflen = 0;

        /* use charAt instead of copying String to char array */
        for (int i = 0; i < strlen; i++) {
            int c = str.charAt(i);
            if ((c >= 0x0001) && (c <= 0x007F)) {
                utflen++;
            } else if (c > 0x07FF) {
                utflen += 3;
            } else {
                // 0x0000 and 0x0080..0x07FF use the 2-byte form
                utflen += 2;
            }
        }
        return utflen;
    }

    /**
     * Write a string to a stream in Modified UTF-8. No length prefix is
     * written, only the encoded characters.
     *
     * @param out output stream
     * @param str string, not null
     * @throws IOException if writing to <code>out</code> fails
     */
    public static void writeModifiedUTF(DataOutput out, String str) throws IOException {
        // Encoding scheme copied from java.io.DataOutputStream
        int strlen = str.length();

        for (int i = 0; i < strlen; i++) {
            int c = str.charAt(i);
            if ((c >= 0x0001) && (c <= 0x007F)) {
                out.write(c);
            } else if (c > 0x07FF) {
                out.write(0xE0 | ((c >> 12) & 0x0F));
                out.write(0x80 | ((c >> 6) & 0x3F));
                out.write(0x80 | ((c >> 0) & 0x3F));
            } else {
                // 0x0000 and 0x0080..0x07FF use the 2-byte form
                out.write(0xC0 | ((c >> 6) & 0x1F));
                out.write(0x80 | ((c >> 0) & 0x3F));
            }
        }
    }

    /**
     * Read a Modified UTF-8 string from a stream. Exactly
     * <code>utflen</code> bytes are consumed from the input before decoding.
     *
     * @param in input
     * @param utflen number of bytes to read and decode
     * @return decoded string
     * @throws UTFDataFormatException if the bytes are not valid Modified UTF-8
     * @throws IOException if reading from <code>in</code> fails
     */
    public static String readModifiedUTF(DataInput in, int utflen)
        throws IOException, UTFDataFormatException
    {
        // Decoding scheme copied from java.io.DataInputStream
        byte[] bytearr = new byte[utflen];
        // Every character takes at least one byte, so utflen chars always suffice
        char[] chararr = new char[utflen];

        int c, char2, char3;
        int count = 0;
        int chararr_count = 0;

        in.readFully(bytearr, 0, utflen);

        // Fast path: leading run of single-byte (ASCII) characters
        while (count < utflen) {
            c = (int) bytearr[count] & 0xff;
            if (c > 127) break;
            count++;
            chararr[chararr_count++] = (char) c;
        }

        while (count < utflen) {
            c = (int) bytearr[count] & 0xff;
            // The high nibble of the lead byte selects the sequence length
            switch (c >> 4) {
                case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
                    /* 0xxxxxxx */
                    count++;
                    chararr[chararr_count++] = (char) c;
                    break;
                case 12: case 13:
                    /* 110x xxxx   10xx xxxx */
                    count += 2;
                    if (count > utflen)
                        throw new UTFDataFormatException(
                            "malformed input: partial character at end");
                    char2 = (int) bytearr[count - 1];
                    if (!isContinuationByte(char2))
                        throw new UTFDataFormatException(
                            "malformed input around byte " + count);
                    chararr[chararr_count++] = (char) (((c & 0x1F) << 6) |
                                                       (char2 & 0x3F));
                    break;
                case 14:
                    /* 1110 xxxx  10xx xxxx  10xx xxxx */
                    count += 3;
                    if (count > utflen)
                        throw new UTFDataFormatException(
                            "malformed input: partial character at end");
                    char2 = (int) bytearr[count - 2];
                    char3 = (int) bytearr[count - 1];
                    if (!isContinuationByte(char2) || !isContinuationByte(char3))
                        throw new UTFDataFormatException(
                            "malformed input around byte " + (count - 1));
                    chararr[chararr_count++] = (char) (((c     & 0x0F) << 12) |
                                                       ((char2 & 0x3F) << 6)  |
                                                       ((char3 & 0x3F) << 0));
                    break;
                default:
                    /* 10xx xxxx,  1111 xxxx */
                    throw new UTFDataFormatException(
                        "malformed input around byte " + count);
            }
        }
        // The number of chars produced may be less than utflen
        return new String(chararr, 0, chararr_count);
    }

    /** Tells whether the byte has the <code>10xx xxxx</code> continuation-byte form. */
    private static boolean isContinuationByte(int b)
    {
        return (b & 0xC0) == 0x80;
    }

    /**
     * Write a string to a stream in Standard UTF-8. No length prefix is
     * written, only the encoded characters.
     *
     * @param out output stream
     * @param str string, not null
     * @throws IOException if writing to <code>out</code> fails
     */
    public static void writeUTF(DataOutput out, String str)
        throws IOException
    {
        byte[] bytes = str.getBytes(CHARSET);
        out.write(bytes);
    }

    /**
     * Read a Standard UTF-8 string from a stream. Exactly <code>len</code>
     * bytes are consumed from the input and decoded with {@link #CHARSET}.
     *
     * @param in input
     * @param len number of bytes to read and decode
     * @return decoded string
     * @throws IOException if reading from <code>in</code> fails
     */
    public static String readUTF(DataInput in, int len)
        throws IOException
    {
        byte[] bytes = new byte[len];
        in.readFully(bytes);
        return new String(bytes, UTF8.CHARSET);
    }

}