]> gerrit.simantics Code Review - simantics/platform.git/blob - bundles/org.simantics.databoard/src/org/simantics/databoard/util/binary/UTF8.java
Migrated source code from Simantics SVN
[simantics/platform.git] / bundles / org.simantics.databoard / src / org / simantics / databoard / util / binary / UTF8.java
1 package org.simantics.databoard.util.binary;\r
2 \r
3 import java.io.DataInput;\r
4 import java.io.DataOutput;\r
5 import java.io.IOException;\r
6 import java.io.UTFDataFormatException;\r
7 import java.nio.charset.Charset;\r
8 \r
/**
 * Utilities for handling Standard-UTF8 and
 * <a href="http://download.oracle.com/javase/6/docs/api/java/io/DataInput.html">Modified-UTF8</a>
 * strings.<p>
 *
 * The differences between standard UTF8 and Modified are the following:
 * <ul>
 * <li>The null byte <code>'&#92;u0000'</code> is encoded in 2-byte format
 *     rather than 1-byte, so that the encoded strings never have
 *     embedded nulls.
 * <li>Only the 1-byte, 2-byte, and 3-byte formats are used.
 * <li>Supplementary characters (see {@link Character}) are represented
 *     in the form of surrogate pairs, i.e. each surrogate is encoded
 *     separately in the 3-byte format.
 * </ul>
 *
 * None of the read/write methods in this class emit or consume a length
 * prefix; the caller is responsible for transferring the byte length
 * (see the <code>get...EncodingByteLength</code> methods).
 */
public class UTF8 {

    /** The standard UTF-8 charset used by {@link #writeUTF} and {@link #readUTF}. */
    public static final Charset CHARSET = Charset.forName("utf-8");

    /**
     * Get the number of bytes in the standard UTF-8 encoding of a string,
     * without actually encoding it.<p>
     *
     * The result equals <code>string.getBytes(CHARSET).length</code>, which is
     * the number of bytes {@link #writeUTF(DataOutput, String)} writes:
     * <ul>
     * <li>U+0000..U+007F take 1 byte,
     * <li>U+0080..U+07FF take 2 bytes,
     * <li>other non-surrogate chars take 3 bytes,
     * <li>a valid surrogate pair (one supplementary code point) takes 4 bytes,
     * <li>an unpaired surrogate takes 1 byte, because the encoder replaces
     *     it with a single <code>'?'</code>.
     * </ul>
     *
     * @param string the string to measure, must not be null
     * @return byte length
     */
    public static int getUTF8EncodingByteLength(String string)
    {
        int result = 0;
        int length = string.length();
        for (int i = 0; i < length; i++)
        {
            char c = string.charAt(i);
            if (c <= 0x7F) {
                result += 1;
            } else if (c <= 0x07FF) {
                result += 2;
            } else if (Character.isHighSurrogate(c)
                    && i + 1 < length
                    && Character.isLowSurrogate(string.charAt(i + 1))) {
                // A well-formed pair is one code point in U+10000..U+10FFFF,
                // which UTF-8 encodes in 4 bytes. Consume both chars.
                result += 4;
                i++;
            } else if (c >= Character.MIN_SURROGATE && c <= Character.MAX_SURROGATE) {
                // Unpaired surrogate: not encodable, String.getBytes emits '?'.
                result += 1;
            } else {
                result += 3;
            }
        }
        return result;
    }

    /**
     * Get the number of bytes in the Modified-UTF-8 encoding of a string,
     * i.e. the number of bytes {@link #writeModifiedUTF(DataOutput, String)}
     * writes for it.
     *
     * @param str the string to measure, must not be null
     * @return byte length
     */
    public static int getModifiedUTF8EncodingByteLength(String str)
    {
        int strlen = str.length();
        int utflen = 0;

        /* use charAt instead of copying String to char array */
        for (int i = 0; i < strlen; i++) {
            int c = str.charAt(i);
            if ((c >= 0x0001) && (c <= 0x007F)) {
                utflen++;
            } else if (c > 0x07FF) {
                // Includes surrogates: each one is encoded on its own.
                utflen += 3;
            } else {
                // U+0000 and U+0080..U+07FF
                utflen += 2;
            }
        }
        return utflen;
    }

    /**
     * Write Modified-UTF8 to a stream. Only the encoded characters are
     * written; no length prefix is emitted.
     *
     * @param out output stream
     * @param str string, must not be null
     * @throws IOException if writing to <code>out</code> fails
     */
    public static void writeModifiedUTF(DataOutput out, String str) throws IOException {
        // Encoding rules follow java.io.DataOutput#writeUTF
        int strlen = str.length();

        for (int i = 0; i < strlen; i++) {
            int c = str.charAt(i);
            if ((c >= 0x0001) && (c <= 0x007F)) {
                /* 0xxxxxxx */
                out.write(c);
            } else if (c > 0x07FF) {
                /* 1110 xxxx  10xx xxxx  10xx xxxx */
                out.write(0xE0 | ((c >> 12) & 0x0F));
                out.write(0x80 | ((c >>  6) & 0x3F));
                out.write(0x80 | ((c >>  0) & 0x3F));
            } else {
                /* 110x xxxx  10xx xxxx (also used for U+0000) */
                out.write(0xC0 | ((c >>  6) & 0x1F));
                out.write(0x80 | ((c >>  0) & 0x3F));
            }
        }
    }

    /**
     * Read Modified-UTF8 from a stream. Exactly <code>utflen</code> bytes are
     * consumed from <code>in</code> before decoding starts.
     *
     * @param in input
     * @param utflen number of bytes
     * @return string
     * @throws UTFDataFormatException if the bytes are not valid Modified-UTF8
     * @throws IOException if reading from <code>in</code> fails
     */
    public static String readModifiedUTF(DataInput in, int utflen)
    throws IOException, UTFDataFormatException
    {
        // Decoding rules follow java.io.DataInputStream#readUTF
        byte[] bytearr = new byte[utflen];
        // Every char takes at least one byte, so utflen chars is an upper bound.
        char[] chararr = new char[utflen];

        int c, char2, char3;
        int count = 0;
        int chararr_count = 0;

        in.readFully(bytearr, 0, utflen);

        // Fast path: leading run of plain ASCII bytes.
        while (count < utflen) {
            c = (int) bytearr[count] & 0xff;
            if (c > 127) break;
            count++;
            chararr[chararr_count++] = (char) c;
        }

        while (count < utflen) {
            c = (int) bytearr[count] & 0xff;
            // The high nibble of the lead byte selects the sequence length.
            switch (c >> 4) {
                case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
                    /* 0xxxxxxx*/
                    count++;
                    chararr[chararr_count++] = (char) c;
                    break;
                case 12: case 13:
                    /* 110x xxxx   10xx xxxx*/
                    count += 2;
                    if (count > utflen)
                        throw new UTFDataFormatException(
                            "malformed input: partial character at end");
                    char2 = (int) bytearr[count - 1];
                    if ((char2 & 0xC0) != 0x80)
                        throw new UTFDataFormatException(
                            "malformed input around byte " + count);
                    chararr[chararr_count++] = (char) (((c & 0x1F) << 6) |
                                                       (char2 & 0x3F));
                    break;
                case 14:
                    /* 1110 xxxx  10xx xxxx  10xx xxxx */
                    count += 3;
                    if (count > utflen)
                        throw new UTFDataFormatException(
                            "malformed input: partial character at end");
                    char2 = (int) bytearr[count - 2];
                    char3 = (int) bytearr[count - 1];
                    if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80))
                        throw new UTFDataFormatException(
                            "malformed input around byte " + (count - 1));
                    chararr[chararr_count++] = (char) (((c     & 0x0F) << 12) |
                                                       ((char2 & 0x3F) << 6)  |
                                                       ((char3 & 0x3F) << 0));
                    break;
                default:
                    /* 10xx xxxx,  1111 xxxx */
                    throw new UTFDataFormatException(
                        "malformed input around byte " + count);
            }
        }
        // The number of chars produced may be less than utflen
        return new String(chararr, 0, chararr_count);
    }

    /**
     * Write Standard-UTF8 to a stream. Only the encoded characters are
     * written; no length prefix is emitted. The number of bytes written
     * equals {@link #getUTF8EncodingByteLength(String)}.
     *
     * @param out output stream
     * @param str string, must not be null
     * @throws IOException if writing to <code>out</code> fails
     */
    public static void writeUTF(DataOutput out, String str)
    throws IOException
    {
        byte[] bytes = str.getBytes(CHARSET);
        out.write(bytes);
    }

    /**
     * Read Standard-UTF8 from a stream. Exactly <code>len</code> bytes are
     * consumed from <code>in</code> and decoded with {@link #CHARSET}.
     *
     * @param in input
     * @param len number of bytes
     * @return string
     * @throws IOException if reading from <code>in</code> fails
     */
    public static String readUTF(DataInput in, int len)
    throws IOException
    {
        byte[] bytes = new byte[len];
        in.readFully(bytes);
        return new String(bytes, UTF8.CHARSET);
    }

}