]> gerrit.simantics Code Review - simantics/platform.git/blob - bundles/org.simantics.databoard/src/org/simantics/databoard/util/binary/UTF8.java
Fixed all line endings of the repository
[simantics/platform.git] / bundles / org.simantics.databoard / src / org / simantics / databoard / util / binary / UTF8.java
1 package org.simantics.databoard.util.binary;
2
3 import java.io.DataInput;
4 import java.io.DataOutput;
5 import java.io.IOException;
6 import java.io.UTFDataFormatException;
7 import java.nio.charset.Charset;
8
9 /**
10  * Utils for handling Standard-UTF8 and <a href="http://download.oracle.com/javase/6/docs/api/java/io/DataInput.html">Modified-UTF8</a> Strings.<p>
11  * 
12  * The differences between standard UTF8 and Modified are the following:
13  * <ul>
14  * <li>The null byte <code>'&#92;u0000'</code> is encoded in 2-byte format
15  *     rather than 1-byte, so that the encoded strings never have
16  *     embedded nulls.
17  * <li>Only the 1-byte, 2-byte, and 3-byte formats are used.
18  * <li><a href="../lang/Character.html#unicode">Supplementary characters</a>
19  *     are represented in the form of surrogate pairs.
20  * </ul>
21  * 
22  */
public class UTF8 {

    /** Charset used by {@link #writeUTF} and {@link #readUTF} for Standard-UTF8. */
    public static final Charset CHARSET = Charset.forName("utf-8");

    /**
     * Get the number of bytes in the Standard-UTF8 encoding of a string.
     * <p>
     * The result equals <code>string.getBytes(CHARSET).length</code> without
     * allocating the byte array:
     * <ul>
     * <li>U+0000..U+007F take 1 byte,
     * <li>U+0080..U+07FF take 2 bytes,
     * <li>other non-surrogate chars take 3 bytes,
     * <li>a high surrogate immediately followed by a low surrogate
     *     (one supplementary code point) takes 4 bytes in total,
     * <li>an unpaired surrogate counts as 1 byte, because
     *     {@link String#getBytes(Charset)} substitutes the single-byte
     *     replacement <code>'?'</code> for it.
     * </ul>
     *
     * @param string string to measure, must not be null
     * @return byte length
     */
    public static int getUTF8EncodingByteLength(String string)
    {
        int result = 0;
        int length = string.length();
        for (int i = 0; i < length; i++) {
            char c = string.charAt(i);
            if (c <= 0x7F) {
                result += 1;
            } else if (c <= 0x7FF) {
                result += 2;
            } else if (c < Character.MIN_SURROGATE || c > Character.MAX_SURROGATE) {
                result += 3;
            } else if (Character.isHighSurrogate(c)
                    && i + 1 < length
                    && Character.isLowSurrogate(string.charAt(i + 1))) {
                // Valid surrogate pair: one code point >= U+10000, 4 bytes.
                result += 4;
                i++; // the low surrogate has been accounted for
            } else {
                // Unpaired surrogate: encoded as the replacement byte '?'.
                result += 1;
            }
        }
        return result;
    }

    /**
     * Get the number of bytes in the Modified-UTF8 encoding of a string.
     * <p>
     * Chars U+0001..U+007F take 1 byte, U+0000 and U+0080..U+07FF take
     * 2 bytes, and every other char (each surrogate included) takes 3 bytes.
     *
     * @param str string to measure, must not be null
     * @return byte length
     */
    public static int getModifiedUTF8EncodingByteLength(String str)
    {
        int strlen = str.length();
        int utflen = 0;
        for (int i = 0; i < strlen; i++) {
            int c = str.charAt(i);
            if (c >= 0x0001 && c <= 0x007F) {
                utflen += 1;
            } else if (c > 0x07FF) {
                utflen += 3;
            } else {
                // U+0000 and U+0080..U+07FF
                utflen += 2;
            }
        }
        return utflen;
    }

    /**
     * Write Modified-UTF8 to a stream.
     * <p>
     * Only the encoded characters are written; no length prefix is emitted.
     * The number of bytes written equals
     * {@link #getModifiedUTF8EncodingByteLength(String)}.
     *
     * @param out output stream
     * @param str string
     * @throws IOException if <code>out</code> fails to write
     */
    public static void writeModifiedUTF(DataOutput out, String str) throws IOException {
        int strlen = str.length();
        for (int i = 0; i < strlen; i++) {
            int c = str.charAt(i);
            if (c >= 0x0001 && c <= 0x007F) {
                // 0xxxxxxx
                out.write(c);
            } else if (c > 0x07FF) {
                // 1110xxxx 10xxxxxx 10xxxxxx
                out.write(0xE0 | ((c >> 12) & 0x0F));
                out.write(0x80 | ((c >>  6) & 0x3F));
                out.write(0x80 | (c & 0x3F));
            } else {
                // 110xxxxx 10xxxxxx (also used for U+0000)
                out.write(0xC0 | ((c >> 6) & 0x1F));
                out.write(0x80 | (c & 0x3F));
            }
        }
    }

    /**
     * Read Modified-UTF8 from a stream.
     * <p>
     * Exactly <code>utflen</code> bytes are consumed from <code>in</code>
     * before decoding starts.
     *
     * @param in input
     * @param utflen number of bytes
     * @return string
     * @throws UTFDataFormatException if the bytes contain a truncated
     *         character, an invalid continuation byte, or a lead byte of the
     *         form 10xxxxxx or 1111xxxx
     * @throws IOException if reading from <code>in</code> fails
     */
    public static String readModifiedUTF(DataInput in, int utflen)
    throws IOException, UTFDataFormatException
    {
        byte[] bytearr = new byte[utflen];
        // Every char needs at least one byte, so utflen chars always suffice.
        char[] chararr = new char[utflen];

        in.readFully(bytearr, 0, utflen);

        int count = 0;
        int chararrCount = 0;
        while (count < utflen) {
            int c = bytearr[count] & 0xff;
            int char2;
            int char3;
            switch (c >> 4) {
                case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
                    /* 0xxxxxxx */
                    count++;
                    chararr[chararrCount++] = (char) c;
                    break;
                case 12: case 13:
                    /* 110x xxxx   10xx xxxx */
                    count += 2;
                    if (count > utflen)
                        throw new UTFDataFormatException(
                            "malformed input: partial character at end");
                    char2 = bytearr[count - 1];
                    if ((char2 & 0xC0) != 0x80)
                        throw new UTFDataFormatException(
                            "malformed input around byte " + count);
                    chararr[chararrCount++] = (char) (((c & 0x1F) << 6)
                                                      | (char2 & 0x3F));
                    break;
                case 14:
                    /* 1110 xxxx  10xx xxxx  10xx xxxx */
                    count += 3;
                    if (count > utflen)
                        throw new UTFDataFormatException(
                            "malformed input: partial character at end");
                    char2 = bytearr[count - 2];
                    char3 = bytearr[count - 1];
                    if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80))
                        throw new UTFDataFormatException(
                            "malformed input around byte " + (count - 1));
                    chararr[chararrCount++] = (char) (((c & 0x0F) << 12)
                                                      | ((char2 & 0x3F) << 6)
                                                      | (char3 & 0x3F));
                    break;
                default:
                    /* 10xx xxxx,  1111 xxxx */
                    throw new UTFDataFormatException(
                        "malformed input around byte " + count);
            }
        }
        // The number of chars produced may be less than utflen
        return new String(chararr, 0, chararrCount);
    }

    /**
     * Write Standard-UTF8 to a stream.
     * <p>
     * Only the encoded bytes are written; no length prefix is emitted.
     *
     * @param out output stream
     * @param str string
     * @throws IOException if <code>out</code> fails to write
     */
    public static void writeUTF(DataOutput out, String str)
    throws IOException
    {
        byte[] bytes = str.getBytes(CHARSET);
        out.write(bytes);
    }

    /**
     * Read Standard-UTF8 from a stream.
     *
     * @param in input
     * @param len number of bytes
     * @return string
     * @throws IOException if reading from <code>in</code> fails
     */
    public static String readUTF(DataInput in, int len)
    throws IOException
    {
        byte[] bytes = new byte[len];
        in.readFully(bytes);
        return new String(bytes, UTF8.CHARSET);
    }

}