-package org.simantics.databoard.util.binary;\r
-\r
-import java.io.DataInput;\r
-import java.io.DataOutput;\r
-import java.io.IOException;\r
-import java.io.UTFDataFormatException;\r
-import java.nio.charset.Charset;\r
-\r
-/**\r
- * Utils for handling Standard-UTF8 and <a href="http://download.oracle.com/javase/6/docs/api/java/io/DataInput.html">Modified-UTF8</a> Strings.<p>\r
- * \r
- * The differences between standard UTF8 and Modified are the following:\r
- * <ul>\r
- * <li>The null byte <code>'\u0000'</code> is encoded in 2-byte format\r
- * rather than 1-byte, so that the encoded strings never have\r
- * embedded nulls.\r
- * <li>Only the 1-byte, 2-byte, and 3-byte formats are used.\r
- * <li><a href="../lang/Character.html#unicode">Supplementary characters</a>\r
- * are represented in the form of surrogate pairs.\r
- * </ul>\r
- * \r
- */\r
-public class UTF8 {\r
-\r
- public static final Charset CHARSET = Charset.forName("utf-8");\r
- \r
- /**\r
- * Get the number of bytes in an UTF-8 encoding of a string \r
- * \r
- * @param string\r
- * @return byte length\r
- */\r
- public static int getUTF8EncodingByteLength(String string)\r
- {\r
- // TODO ?May not work properly properly on some characters that are encoded with 2 chars in UTF-16? \r
- \r
- // Correct\r
- //return string.getBytes(UTF8).length;\r
- \r
- // http://en.wikipedia.org/wiki/UTF-8\r
- int result = 0;\r
- int length = string.length();\r
- for (int i=0; i<length; i++)\r
- {\r
- char c = string.charAt(i);\r
- if (c>=0 && c<=0x7f) {\r
- result += 1;\r
- } else if (c>=0x80 && c<=0x07ff) {\r
- result += 2;\r
- } else if (c>=0xD800 && c<=0xDFFF) {\r
- result += 1;\r
- } else if (c>=0x800 && c<=0xffff) {\r
- result += 3;\r
- }\r
- // Not really used as char is 16-bit\r
- else if (c>=0x10000 && c<=0x10ffff) {\r
- result += 4;\r
- } else if (c>=0x110000 && c<=0x1FFFFF) {\r
- result += 4;\r
- } else {\r
- // NOT IN RFC 3629\r
- result += 5;\r
- }\r
- }\r
- return result; \r
- }\r
- \r
-\r
- /**\r
- * Get the number of bytes in an Modified-UTF-8 encoding of a string \r
- * \r
- * @param str\r
- * @return byte length\r
- */\r
- public static int getModifiedUTF8EncodingByteLength(String str)\r
- {\r
- int strlen = str.length();\r
- int utflen = 0;\r
- int c = 0;\r
- \r
- /* use charAt instead of copying String to char array */\r
- for (int i = 0; i < strlen; i++) {\r
- c = str.charAt(i);\r
- if ((c >= 0x0001) && (c <= 0x007F)) {\r
- utflen++;\r
- } else if (c > 0x07FF) {\r
- utflen += 3;\r
- } else {\r
- utflen += 2;\r
- }\r
- }\r
- return utflen;\r
- }\r
- \r
- /**\r
- * Write Modified-UTF8 to a stream.\r
- * \r
- * @param out output stream \r
- * @param str string\r
- * @throws IOException\r
- */\r
- public static void writeModifiedUTF(DataOutput out, String str) throws IOException {\r
- // Copied from DataOutput\r
- int strlen = str.length();\r
- int c = 0;\r
- \r
- int i=0;\r
- for (i=0; i<strlen; i++) {\r
- c = str.charAt(i);\r
- if (!((c >= 0x0001) && (c <= 0x007F))) break;\r
- out.write(c);\r
- }\r
- \r
- for (;i < strlen; i++){\r
- c = str.charAt(i);\r
- if ((c >= 0x0001) && (c <= 0x007F)) {\r
- out.write( c );\r
- } else if (c > 0x07FF) {\r
- out.write(0xE0 | ((c >> 12) & 0x0F));\r
- out.write(0x80 | ((c >> 6) & 0x3F));\r
- out.write(0x80 | ((c >> 0) & 0x3F));\r
- } else {\r
- out.write(0xC0 | ((c >> 6) & 0x1F));\r
- out.write(0x80 | ((c >> 0) & 0x3F));\r
- }\r
- }\r
- }\r
- \r
- /**\r
- * Read Modified-UTF8 from a stream\r
- * @param in input\r
- * @param utflen number of bytes\r
- * @return string\r
- * @throws IOException\r
- */\r
- public static String readModifiedUTF(DataInput in, int utflen)\r
- throws IOException, UTFDataFormatException\r
- {\r
- // Copied from DataInput\r
- byte[] bytearr = null;\r
- char[] chararr = null;\r
-\r
- {\r
- bytearr = new byte[utflen];\r
- chararr = new char[utflen];\r
- }\r
-\r
- int c, char2, char3;\r
- int count = 0;\r
- int chararr_count=0;\r
-\r
- in.readFully(bytearr, 0, utflen);\r
-\r
- while (count < utflen) {\r
- c = (int) bytearr[count] & 0xff; \r
- if (c > 127) break;\r
- count++;\r
- chararr[chararr_count++]=(char)c;\r
- }\r
-\r
- while (count < utflen) {\r
- c = (int) bytearr[count] & 0xff;\r
- switch (c >> 4) {\r
- case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:\r
- /* 0xxxxxxx*/\r
- count++;\r
- chararr[chararr_count++]=(char)c;\r
- break;\r
- case 12: case 13:\r
- /* 110x xxxx 10xx xxxx*/\r
- count += 2;\r
- if (count > utflen)\r
- throw new UTFDataFormatException(\r
- "malformed input: partial character at end");\r
- char2 = (int) bytearr[count-1];\r
- if ((char2 & 0xC0) != 0x80)\r
- throw new UTFDataFormatException(\r
- "malformed input around byte " + count); \r
- chararr[chararr_count++]=(char)(((c & 0x1F) << 6) | \r
- (char2 & 0x3F)); \r
- break;\r
- case 14:\r
- /* 1110 xxxx 10xx xxxx 10xx xxxx */\r
- count += 3;\r
- if (count > utflen)\r
- throw new UTFDataFormatException(\r
- "malformed input: partial character at end");\r
- char2 = (int) bytearr[count-2];\r
- char3 = (int) bytearr[count-1];\r
- if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80))\r
- throw new UTFDataFormatException(\r
- "malformed input around byte " + (count-1));\r
- chararr[chararr_count++]=(char)(((c & 0x0F) << 12) |\r
- ((char2 & 0x3F) << 6) |\r
- ((char3 & 0x3F) << 0));\r
- break;\r
- default:\r
- /* 10xx xxxx, 1111 xxxx */\r
- throw new UTFDataFormatException(\r
- "malformed input around byte " + count);\r
- }\r
- }\r
- // The number of chars produced may be less than utflen\r
- return new String(chararr, 0, chararr_count);\r
- } \r
- \r
- /**\r
- * Write Standard-UTF8 to a stream.\r
- * \r
- * @param str\r
- * @param out\r
- * @throws IOException\r
- */\r
- public static void writeUTF(DataOutput out, String str)\r
- throws IOException\r
- {\r
- byte[] bytes = str.getBytes(CHARSET);\r
- out.write(bytes);\r
- }\r
- \r
- /**\r
- * Read Standard-UTF8 from a stream\r
- * @param in input\r
- * @param len number of bytes\r
- * @return string\r
- * @throws IOException\r
- */\r
- public static String readUTF(DataInput in, int len)\r
- throws IOException\r
- {\r
- byte[] bytes = new byte[len];\r
- in.readFully(bytes);\r
- return new String(bytes, UTF8.CHARSET);\r
- }\r
- \r
-}\r
+package org.simantics.databoard.util.binary;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.io.UTFDataFormatException;
+import java.nio.charset.Charset;
+
+/**
+ * Utils for handling Standard-UTF8 and <a href="http://download.oracle.com/javase/6/docs/api/java/io/DataInput.html">Modified-UTF8</a> Strings.<p>
+ *
+ * The differences between standard UTF8 and Modified are the following:
+ * <ul>
+ * <li>The null byte <code>'\u0000'</code> is encoded in 2-byte format
+ * rather than 1-byte, so that the encoded strings never have
+ * embedded nulls.
+ * <li>Only the 1-byte, 2-byte, and 3-byte formats are used.
+ * <li><a href="../lang/Character.html#unicode">Supplementary characters</a>
+ * are represented in the form of surrogate pairs.
+ * </ul>
+ *
+ */
public class UTF8 {

    /** Charset used by {@link #writeUTF(DataOutput, String)} and {@link #readUTF(DataInput, int)}. */
    public static final Charset CHARSET = Charset.forName("utf-8");

    /**
     * Get the number of bytes in the Standard-UTF8 encoding of a string.
     * <p>
     * The result equals <code>string.getBytes(CHARSET).length</code>, i.e. the
     * number of bytes {@link #writeUTF(DataOutput, String)} emits, but is
     * computed without allocating the encoded byte array.
     *
     * @param string string to measure, must not be <code>null</code>
     * @return byte length
     */
    public static int getUTF8EncodingByteLength(String string)
    {
        int result = 0;
        int length = string.length();
        for (int i = 0; i < length; i++) {
            char c = string.charAt(i);
            if (c <= 0x7f) {
                result += 1;
            } else if (c <= 0x07ff) {
                result += 2;
            } else if (Character.isHighSurrogate(c)
                    && i + 1 < length
                    && Character.isLowSurrogate(string.charAt(i + 1))) {
                // A valid surrogate pair is one supplementary code point,
                // which standard UTF-8 encodes in 4 bytes. Consume both chars.
                result += 4;
                i++;
            } else if (c >= 0xD800 && c <= 0xDFFF) {
                // An unpaired surrogate cannot be encoded; String.getBytes
                // substitutes the charset's one-byte replacement ('?').
                result += 1;
            } else {
                result += 3;
            }
        }
        return result;
    }

    /**
     * Get the number of bytes in the Modified-UTF8 encoding of a string.
     * <p>
     * In Modified-UTF8 the char <code>'\u0000'</code> takes two bytes and
     * every surrogate char is encoded on its own as a three-byte sequence.
     *
     * @param str string to measure, must not be <code>null</code>
     * @return byte length
     */
    public static int getModifiedUTF8EncodingByteLength(String str)
    {
        int strlen = str.length();
        int utflen = 0;

        /* use charAt instead of copying String to char array */
        for (int i = 0; i < strlen; i++) {
            int c = str.charAt(i);
            if ((c >= 0x0001) && (c <= 0x007F)) {
                utflen++;
            } else if (c > 0x07FF) {
                utflen += 3;
            } else {
                // 0x0000 and 0x0080..0x07FF
                utflen += 2;
            }
        }
        return utflen;
    }

    /**
     * Write Modified-UTF8 to a stream.
     * <p>
     * Only the encoded bytes are written; no length prefix is emitted. The
     * number of bytes written equals
     * {@link #getModifiedUTF8EncodingByteLength(String)} for the same string.
     *
     * @param out output stream
     * @param str string
     * @throws IOException if writing to <code>out</code> fails
     */
    public static void writeModifiedUTF(DataOutput out, String str) throws IOException {
        int strlen = str.length();

        for (int i = 0; i < strlen; i++) {
            int c = str.charAt(i);
            if ((c >= 0x0001) && (c <= 0x007F)) {
                out.write(c);
            } else if (c > 0x07FF) {
                out.write(0xE0 | ((c >> 12) & 0x0F));
                out.write(0x80 | ((c >>  6) & 0x3F));
                out.write(0x80 | ((c >>  0) & 0x3F));
            } else {
                // 0x0000 and 0x0080..0x07FF use the two-byte form
                out.write(0xC0 | ((c >>  6) & 0x1F));
                out.write(0x80 | ((c >>  0) & 0x3F));
            }
        }
    }

    /**
     * Read Modified-UTF8 from a stream.
     * <p>
     * Exactly <code>utflen</code> bytes are consumed from the input before
     * decoding starts.
     *
     * @param in input
     * @param utflen number of bytes
     * @return string
     * @throws UTFDataFormatException if the bytes are not valid Modified-UTF8
     * @throws IOException if reading from <code>in</code> fails
     */
    public static String readModifiedUTF(DataInput in, int utflen)
        throws IOException, UTFDataFormatException
    {
        byte[] bytearr = new byte[utflen];
        // Every char takes at least one byte, so utflen chars always suffice.
        char[] chararr = new char[utflen];

        int c, char2, char3;
        int count = 0;
        int chararr_count = 0;

        in.readFully(bytearr, 0, utflen);

        // Fast path: copy the leading run of single-byte characters.
        while (count < utflen) {
            c = (int) bytearr[count] & 0xff;
            if (c > 127) break;
            count++;
            chararr[chararr_count++] = (char) c;
        }

        while (count < utflen) {
            c = (int) bytearr[count] & 0xff;
            // The high nibble of the lead byte selects the sequence length.
            switch (c >> 4) {
                case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
                    /* 0xxxxxxx*/
                    count++;
                    chararr[chararr_count++] = (char) c;
                    break;
                case 12: case 13:
                    /* 110x xxxx   10xx xxxx*/
                    count += 2;
                    if (count > utflen)
                        throw new UTFDataFormatException(
                            "malformed input: partial character at end");
                    char2 = (int) bytearr[count - 1];
                    if ((char2 & 0xC0) != 0x80)
                        throw new UTFDataFormatException(
                            "malformed input around byte " + count);
                    chararr[chararr_count++] = (char) (((c & 0x1F) << 6) |
                                                       (char2 & 0x3F));
                    break;
                case 14:
                    /* 1110 xxxx  10xx xxxx  10xx xxxx */
                    count += 3;
                    if (count > utflen)
                        throw new UTFDataFormatException(
                            "malformed input: partial character at end");
                    char2 = (int) bytearr[count - 2];
                    char3 = (int) bytearr[count - 1];
                    if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80))
                        throw new UTFDataFormatException(
                            "malformed input around byte " + (count - 1));
                    chararr[chararr_count++] = (char) (((c     & 0x0F) << 12) |
                                                       ((char2 & 0x3F) << 6)  |
                                                       ((char3 & 0x3F) << 0));
                    break;
                default:
                    /* 10xx xxxx,  1111 xxxx */
                    throw new UTFDataFormatException(
                        "malformed input around byte " + count);
            }
        }
        // The number of chars produced may be less than utflen
        return new String(chararr, 0, chararr_count);
    }

    /**
     * Write Standard-UTF8 to a stream.
     * <p>
     * Only the encoded bytes are written; no length prefix is emitted. The
     * number of bytes written equals
     * {@link #getUTF8EncodingByteLength(String)} for the same string.
     *
     * @param out output stream
     * @param str string
     * @throws IOException if writing to <code>out</code> fails
     */
    public static void writeUTF(DataOutput out, String str)
        throws IOException
    {
        byte[] bytes = str.getBytes(CHARSET);
        out.write(bytes);
    }

    /**
     * Read Standard-UTF8 from a stream.
     *
     * @param in input
     * @param len number of bytes
     * @return string
     * @throws IOException if reading from <code>in</code> fails
     */
    public static String readUTF(DataInput in, int len)
        throws IOException
    {
        byte[] bytes = new byte[len];
        in.readFully(bytes);
        return new String(bytes, UTF8.CHARSET);
    }

}