bundles/org.simantics.databoard/src/org/simantics/databoard/util/binary/UTF8.java

   1 package org.simantics.databoard.util.binary;
   2
   3 import java.io.DataInput;
   4 import java.io.DataOutput;
   5 import java.io.IOException;
   6 import java.io.UTFDataFormatException;
   7 import java.nio.charset.Charset;
   8
   9 /**
  10  * Utils for handling Standard-UTF8 and <a href="http://download.oracle.com/javase/6/docs/api/java/io/DataInput.html">Modified-UTF8</a> Strings.<p>
  11  *
  12  * The differences between standard UTF8 and Modified are the following:
  13  * <ul>
  14  * <li>The null byte <code>'&#92;u0000'</code> is encoded in 2-byte format
  15  *     rather than 1-byte, so that the encoded strings never have
  16  *     embedded nulls.
  17  * <li>Only the 1-byte, 2-byte, and 3-byte formats are used.
  18  * <li><a href="../lang/Character.html#unicode">Supplementary characters</a>
  19  *     are represented in the form of surrogate pairs.
  20  * </ul>
  21  *
  22  */
  23 public class UTF8 {
  24
  25         public static final Charset CHARSET = Charset.forName("utf-8");
  26
  27         /**
  28          * Get the number of bytes in an UTF-8 encoding of a string
  29          *
  30          * @param string
  31          * @return byte length
  32          */
  33         public static int getUTF8EncodingByteLength(String string)
  34         {
  35                 // TODO ?May not work properly properly on some characters that are encoded with 2 chars in UTF-16?
  36
  37                 // Correct
  38                 //return string.getBytes(UTF8).length;
  39
  40                 // http://en.wikipedia.org/wiki/UTF-8
  41                 int result = 0;
  42                 int length = string.length();
  43                 for (int i=0; i<length; i++)
  44                 {
  45                         char c = string.charAt(i);
  46                         if (c>=0 && c<=0x7f) {
  47                                 result += 1;
  48                         } else if (c>=0x80 && c<=0x07ff) {
  49                                 result += 2;
  50                         } else if (c>=0xD800 && c<=0xDFFF) {
  51                                 result += 1;
  52                         } else if (c>=0x800 && c<=0xffff) {
  53                                 result += 3;
  54                         }
  55                         // Not really used as char is 16-bit
  56                         else if (c>=0x10000 && c<=0x10ffff) {
  57                                 result += 4;
  58                         } else if (c>=0x110000 && c<=0x1FFFFF) {
  59                                 result += 4;
  60                         } else {
  61                                 // NOT IN RFC 3629
  62                                 result += 5;
  63                         }
  64                 }
  65                 return result;
  66         }
  67
  68
  69         /**
  70          * Get the number of bytes in an Modified-UTF-8 encoding of a string
  71          *
  72          * @param str
  73          * @return byte length
  74          */
  75         public static int getModifiedUTF8EncodingByteLength(String str)
  76         {
  77         int strlen = str.length();
  78         int utflen = 0;
  79         int c = 0;
  80
  81         /* use charAt instead of copying String to char array */
  82         for (int i = 0; i < strlen; i++) {
  83                 c = str.charAt(i);
  84             if ((c >= 0x0001) && (c <= 0x007F)) {
  85                 utflen++;
  86             } else if (c > 0x07FF) {
  87                 utflen += 3;
  88             } else {
  89                 utflen += 2;
  90             }
  91         }
  92         return utflen;
  93         }
  94
  95         /**
  96          * Write Modified-UTF8 to a stream.
  97          *
  98          * @param out output stream
  99          * @param str string
 100          * @throws IOException
 101          */
 102     public static void writeModifiedUTF(DataOutput out, String str) throws IOException {
 103         // Copied from DataOutput
 104         int strlen = str.length();
 105         int c = 0;
 106
 107         int i=0;
 108         for (i=0; i<strlen; i++) {
 109            c = str.charAt(i);
 110            if (!((c >= 0x0001) && (c <= 0x007F))) break;
 111            out.write(c);
 112         }
 113
 114         for (;i < strlen; i++){
 115             c = str.charAt(i);
 116             if ((c >= 0x0001) && (c <= 0x007F)) {
 117                 out.write( c );
 118             } else if (c > 0x07FF) {
 119                 out.write(0xE0 | ((c >> 12) & 0x0F));
 120                 out.write(0x80 | ((c >>  6) & 0x3F));
 121                 out.write(0x80 | ((c >>  0) & 0x3F));
 122             } else {
 123                 out.write(0xC0 | ((c >>  6) & 0x1F));
 124                 out.write(0x80 | ((c >>  0) & 0x3F));
 125             }
 126         }
 127     }
 128
 129     /**
 130      * Read Modified-UTF8 from a stream
 131      * @param in input
 132      * @param utflen number of bytes
 133      * @return string
 134      * @throws IOException
 135      */
 136     public static String readModifiedUTF(DataInput in, int utflen)
 137     throws IOException, UTFDataFormatException
 138     {
 139         // Copied from DataInput
 140         byte[] bytearr = null;
 141         char[] chararr = null;
 142
 143         {
 144             bytearr = new byte[utflen];
 145             chararr = new char[utflen];
 146         }
 147
 148         int c, char2, char3;
 149         int count = 0;
 150         int chararr_count=0;
 151
 152         in.readFully(bytearr, 0, utflen);
 153
 154         while (count < utflen) {
 155             c = (int) bytearr[count] & 0xff;
 156             if (c > 127) break;
 157             count++;
 158             chararr[chararr_count++]=(char)c;
 159         }
 160
 161         while (count < utflen) {
 162             c = (int) bytearr[count] & 0xff;
 163             switch (c >> 4) {
 164                 case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
 165                     /* 0xxxxxxx*/
 166                     count++;
 167                     chararr[chararr_count++]=(char)c;
 168                     break;
 169                 case 12: case 13:
 170                     /* 110x xxxx   10xx xxxx*/
 171                     count += 2;
 172                     if (count > utflen)
 173                         throw new UTFDataFormatException(
 174                             "malformed input: partial character at end");
 175                     char2 = (int) bytearr[count-1];
 176                     if ((char2 & 0xC0) != 0x80)
 177                         throw new UTFDataFormatException(
 178                             "malformed input around byte " + count);
 179                     chararr[chararr_count++]=(char)(((c & 0x1F) << 6) |
 180                                                     (char2 & 0x3F));
 181                     break;
 182                 case 14:
 183                     /* 1110 xxxx  10xx xxxx  10xx xxxx */
 184                     count += 3;
 185                     if (count > utflen)
 186                         throw new UTFDataFormatException(
 187                             "malformed input: partial character at end");
 188                     char2 = (int) bytearr[count-2];
 189                     char3 = (int) bytearr[count-1];
 190                     if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80))
 191                         throw new UTFDataFormatException(
 192                             "malformed input around byte " + (count-1));
 193                     chararr[chararr_count++]=(char)(((c     & 0x0F) << 12) |
 194                                                     ((char2 & 0x3F) << 6)  |
 195                                                     ((char3 & 0x3F) << 0));
 196                     break;
 197                 default:
 198                     /* 10xx xxxx,  1111 xxxx */
 199                     throw new UTFDataFormatException(
 200                         "malformed input around byte " + count);
 201             }
 202         }
 203         // The number of chars produced may be less than utflen
 204         return new String(chararr, 0, chararr_count);
 205     }
 206
 207     /**
 208      * Write Standard-UTF8 to a stream.
 209      *
 210      * @param str
 211      * @param out
 212      * @throws IOException
 213      */
 214     public static void writeUTF(DataOutput out, String str)
 215     throws IOException
 216     {
 217                 byte[] bytes = str.getBytes(CHARSET);
 218                 out.write(bytes);
 219     }
 220
 221     /**
 222      * Read Standard-UTF8 from a stream
 223      * @param in input
 224      * @param len number of bytes
 225      * @return string
 226      * @throws IOException
 227      */
 228     public static String readUTF(DataInput in, int len)
 229     throws IOException
 230     {
 231                 byte[] bytes = new byte[len];
 232                 in.readFully(bytes);
 233                 return new String(bytes, UTF8.CHARSET);
 234     }
 235
 236 }