bundles/org.simantics.databoard/src/org/simantics/databoard/util/binary/UTF8.java

   1 package org.simantics.databoard.util.binary;\r
   2 \r
   3 import java.io.DataInput;\r
   4 import java.io.DataOutput;\r
   5 import java.io.IOException;\r
   6 import java.io.UTFDataFormatException;\r
   7 import java.nio.charset.Charset;\r
   8 \r
   9 /**\r
  10  * Utils for handling Standard-UTF8 and <a href="http://download.oracle.com/javase/6/docs/api/java/io/DataInput.html">Modified-UTF8</a> Strings.<p>\r
  11  * \r
  12  * The differences between standard UTF8 and Modified are the following:\r
  13  * <ul>\r
  14  * <li>The null byte <code>'&#92;u0000'</code> is encoded in 2-byte format\r
  15  *     rather than 1-byte, so that the encoded strings never have\r
  16  *     embedded nulls.\r
  17  * <li>Only the 1-byte, 2-byte, and 3-byte formats are used.\r
  18  * <li><a href="../lang/Character.html#unicode">Supplementary characters</a>\r
  19  *     are represented in the form of surrogate pairs.\r
  20  * </ul>\r
  21  * \r
  22  */\r
  23 public class UTF8 {\r
  24 \r
  25         public static final Charset CHARSET = Charset.forName("utf-8");\r
  26         \r
  27         /**\r
  28          * Get the number of bytes in an UTF-8 encoding of a string \r
  29          * \r
  30          * @param string\r
  31          * @return byte length\r
  32          */\r
  33         public static int getUTF8EncodingByteLength(String string)\r
  34         {\r
  35                 // TODO ?May not work properly properly on some characters that are encoded with 2 chars in UTF-16? \r
  36                 \r
  37                 // Correct\r
  38                 //return string.getBytes(UTF8).length;\r
  39                 \r
  40                 // http://en.wikipedia.org/wiki/UTF-8\r
  41                 int result = 0;\r
  42                 int length = string.length();\r
  43                 for (int i=0; i<length; i++)\r
  44                 {\r
  45                         char c = string.charAt(i);\r
  46                         if (c>=0 && c<=0x7f) {\r
  47                                 result += 1;\r
  48                         } else if (c>=0x80 && c<=0x07ff) {\r
  49                                 result += 2;\r
  50                         } else if (c>=0xD800 && c<=0xDFFF) {\r
  51                                 result += 1;\r
  52                         } else if (c>=0x800 && c<=0xffff) {\r
  53                                 result += 3;\r
  54                         }\r
  55                         // Not really used as char is 16-bit\r
  56                         else if (c>=0x10000 && c<=0x10ffff) {\r
  57                                 result += 4;\r
  58                         } else if (c>=0x110000 && c<=0x1FFFFF) {\r
  59                                 result += 4;\r
  60                         } else {\r
  61                                 // NOT IN RFC 3629\r
  62                                 result += 5;\r
  63                         }\r
  64                 }\r
  65                 return result;                          \r
  66         }\r
  67         \r
  68 \r
  69         /**\r
  70          * Get the number of bytes in an Modified-UTF-8 encoding of a string \r
  71          * \r
  72          * @param str\r
  73          * @return byte length\r
  74          */\r
  75         public static int getModifiedUTF8EncodingByteLength(String str)\r
  76         {\r
  77         int strlen = str.length();\r
  78         int utflen = 0;\r
  79         int c = 0;\r
  80      \r
  81         /* use charAt instead of copying String to char array */\r
  82         for (int i = 0; i < strlen; i++) {\r
  83                 c = str.charAt(i);\r
  84             if ((c >= 0x0001) && (c <= 0x007F)) {\r
  85                 utflen++;\r
  86             } else if (c > 0x07FF) {\r
  87                 utflen += 3;\r
  88             } else {\r
  89                 utflen += 2;\r
  90             }\r
  91         }\r
  92         return utflen;\r
  93         }\r
  94     \r
  95         /**\r
  96          * Write Modified-UTF8 to a stream.\r
  97          * \r
  98          * @param out output stream \r
  99          * @param str string\r
 100          * @throws IOException\r
 101          */\r
 102     public static void writeModifiedUTF(DataOutput out, String str) throws IOException {\r
 103         // Copied from DataOutput\r
 104         int strlen = str.length();\r
 105         int c = 0;\r
 106         \r
 107         int i=0;\r
 108         for (i=0; i<strlen; i++) {\r
 109            c = str.charAt(i);\r
 110            if (!((c >= 0x0001) && (c <= 0x007F))) break;\r
 111            out.write(c);\r
 112         }\r
 113         \r
 114         for (;i < strlen; i++){\r
 115             c = str.charAt(i);\r
 116             if ((c >= 0x0001) && (c <= 0x007F)) {\r
 117                 out.write( c );\r
 118             } else if (c > 0x07FF) {\r
 119                 out.write(0xE0 | ((c >> 12) & 0x0F));\r
 120                 out.write(0x80 | ((c >>  6) & 0x3F));\r
 121                 out.write(0x80 | ((c >>  0) & 0x3F));\r
 122             } else {\r
 123                 out.write(0xC0 | ((c >>  6) & 0x1F));\r
 124                 out.write(0x80 | ((c >>  0) & 0x3F));\r
 125             }\r
 126         }\r
 127     }\r
 128     \r
 129     /**\r
 130      * Read Modified-UTF8 from a stream\r
 131      * @param in input\r
 132      * @param utflen number of bytes\r
 133      * @return string\r
 134      * @throws IOException\r
 135      */\r
 136     public static String readModifiedUTF(DataInput in, int utflen)\r
 137     throws IOException, UTFDataFormatException\r
 138     {\r
 139         // Copied from DataInput\r
 140         byte[] bytearr = null;\r
 141         char[] chararr = null;\r
 142 \r
 143         {\r
 144             bytearr = new byte[utflen];\r
 145             chararr = new char[utflen];\r
 146         }\r
 147 \r
 148         int c, char2, char3;\r
 149         int count = 0;\r
 150         int chararr_count=0;\r
 151 \r
 152         in.readFully(bytearr, 0, utflen);\r
 153 \r
 154         while (count < utflen) {\r
 155             c = (int) bytearr[count] & 0xff;      \r
 156             if (c > 127) break;\r
 157             count++;\r
 158             chararr[chararr_count++]=(char)c;\r
 159         }\r
 160 \r
 161         while (count < utflen) {\r
 162             c = (int) bytearr[count] & 0xff;\r
 163             switch (c >> 4) {\r
 164                 case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:\r
 165                     /* 0xxxxxxx*/\r
 166                     count++;\r
 167                     chararr[chararr_count++]=(char)c;\r
 168                     break;\r
 169                 case 12: case 13:\r
 170                     /* 110x xxxx   10xx xxxx*/\r
 171                     count += 2;\r
 172                     if (count > utflen)\r
 173                         throw new UTFDataFormatException(\r
 174                             "malformed input: partial character at end");\r
 175                     char2 = (int) bytearr[count-1];\r
 176                     if ((char2 & 0xC0) != 0x80)\r
 177                         throw new UTFDataFormatException(\r
 178                             "malformed input around byte " + count); \r
 179                     chararr[chararr_count++]=(char)(((c & 0x1F) << 6) | \r
 180                                                     (char2 & 0x3F));  \r
 181                     break;\r
 182                 case 14:\r
 183                     /* 1110 xxxx  10xx xxxx  10xx xxxx */\r
 184                     count += 3;\r
 185                     if (count > utflen)\r
 186                         throw new UTFDataFormatException(\r
 187                             "malformed input: partial character at end");\r
 188                     char2 = (int) bytearr[count-2];\r
 189                     char3 = (int) bytearr[count-1];\r
 190                     if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80))\r
 191                         throw new UTFDataFormatException(\r
 192                             "malformed input around byte " + (count-1));\r
 193                     chararr[chararr_count++]=(char)(((c     & 0x0F) << 12) |\r
 194                                                     ((char2 & 0x3F) << 6)  |\r
 195                                                     ((char3 & 0x3F) << 0));\r
 196                     break;\r
 197                 default:\r
 198                     /* 10xx xxxx,  1111 xxxx */\r
 199                     throw new UTFDataFormatException(\r
 200                         "malformed input around byte " + count);\r
 201             }\r
 202         }\r
 203         // The number of chars produced may be less than utflen\r
 204         return new String(chararr, 0, chararr_count);\r
 205     }    \r
 206     \r
 207     /**\r
 208      * Write Standard-UTF8 to a stream.\r
 209      * \r
 210      * @param str\r
 211      * @param out\r
 212      * @throws IOException\r
 213      */\r
 214     public static void writeUTF(DataOutput out, String str)\r
 215     throws IOException\r
 216     {\r
 217                 byte[] bytes = str.getBytes(CHARSET);\r
 218                 out.write(bytes);\r
 219     }\r
 220     \r
 221     /**\r
 222      * Read Standard-UTF8 from a stream\r
 223      * @param in input\r
 224      * @param len number of bytes\r
 225      * @return string\r
 226      * @throws IOException\r
 227      */\r
 228     public static String readUTF(DataInput in, int len)\r
 229     throws IOException\r
 230     {\r
 231                 byte[] bytes = new byte[len];\r
 232                 in.readFully(bytes);\r
 233                 return new String(bytes, UTF8.CHARSET);\r
 234     }\r
 235         \r
 236 }\r