1 package org.simantics.databoard.util.binary;
\r
3 import java.io.DataInput;
\r
4 import java.io.DataOutput;
\r
5 import java.io.IOException;
\r
6 import java.io.UTFDataFormatException;
\r
7 import java.nio.charset.Charset;
\r
10 * Utils for handling Standard-UTF8 and <a href="http://download.oracle.com/javase/6/docs/api/java/io/DataInput.html">Modified-UTF8</a> Strings.<p>
\r
12 * The differences between Standard UTF-8 and Modified UTF-8 are the following:
\r
14 * <li>The null byte <code>'\u0000'</code> is encoded in 2-byte format
\r
15 * rather than 1-byte, so that the encoded strings never have
\r
17 * <li>Only the 1-byte, 2-byte, and 3-byte formats are used.
\r
18 * <li><a href="../lang/Character.html#unicode">Supplementary characters</a>
\r
19 * are represented in the form of surrogate pairs.
\r
25 public static final Charset CHARSET = Charset.forName("utf-8");
\r
28 * Get the number of bytes in a UTF-8 encoding of a string
\r
31 * @return byte length
\r
33 public static int getUTF8EncodingByteLength(String string)
\r
35 // TODO May not work properly on characters that are encoded with two chars (a surrogate pair) in UTF-16
\r
38 //return string.getBytes(UTF8).length;
\r
40 // http://en.wikipedia.org/wiki/UTF-8
\r
42 int length = string.length();
\r
43 for (int i=0; i<length; i++)
\r
45 char c = string.charAt(i);
\r
46 if (c>=0 && c<=0x7f) {
\r
48 } else if (c>=0x80 && c<=0x07ff) {
\r
50 } else if (c>=0xD800 && c<=0xDFFF) {
\r
52 } else if (c>=0x800 && c<=0xffff) {
\r
55 // The remaining branches are never taken: a Java char is 16 bits wide, so c cannot exceed 0xFFFF
\r
56 else if (c>=0x10000 && c<=0x10ffff) {
\r
58 } else if (c>=0x110000 && c<=0x1FFFFF) {
\r
70 * Get the number of bytes in a Modified-UTF-8 encoding of a string
\r
73 * @return byte length
\r
75 public static int getModifiedUTF8EncodingByteLength(String str)
\r
77 int strlen = str.length();
\r
81 /* use charAt instead of copying String to char array */
\r
82 for (int i = 0; i < strlen; i++) {
\r
84 if ((c >= 0x0001) && (c <= 0x007F)) {
\r
86 } else if (c > 0x07FF) {
\r
96 * Write Modified-UTF8 to a stream.
\r
98 * @param out output stream
\r
100 * @throws IOException
\r
102 public static void writeModifiedUTF(DataOutput out, String str) throws IOException {
\r
103 // Copied from DataOutput
\r
104 int strlen = str.length();
\r
108 for (i=0; i<strlen; i++) {
\r
110 if (!((c >= 0x0001) && (c <= 0x007F))) break;
\r
114 for (;i < strlen; i++){
\r
116 if ((c >= 0x0001) && (c <= 0x007F)) {
\r
118 } else if (c > 0x07FF) {
\r
119 out.write(0xE0 | ((c >> 12) & 0x0F));
\r
120 out.write(0x80 | ((c >> 6) & 0x3F));
\r
121 out.write(0x80 | ((c >> 0) & 0x3F));
\r
123 out.write(0xC0 | ((c >> 6) & 0x1F));
\r
124 out.write(0x80 | ((c >> 0) & 0x3F));
\r
130 * Read Modified-UTF8 from a stream
\r
132 * @param utflen number of bytes
\r
134 * @throws IOException
\r
136 public static String readModifiedUTF(DataInput in, int utflen)
\r
137 throws IOException, UTFDataFormatException
\r
139 // Copied from DataInput
\r
140 byte[] bytearr = null;
\r
141 char[] chararr = null;
\r
144 bytearr = new byte[utflen];
\r
145 chararr = new char[utflen];
\r
148 int c, char2, char3;
\r
150 int chararr_count=0;
\r
152 in.readFully(bytearr, 0, utflen);
\r
154 while (count < utflen) {
\r
155 c = (int) bytearr[count] & 0xff;
\r
156 if (c > 127) break;
\r
158 chararr[chararr_count++]=(char)c;
\r
161 while (count < utflen) {
\r
162 c = (int) bytearr[count] & 0xff;
\r
164 case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
\r
167 chararr[chararr_count++]=(char)c;
\r
170 /* 110x xxxx 10xx xxxx*/
\r
172 if (count > utflen)
\r
173 throw new UTFDataFormatException(
\r
174 "malformed input: partial character at end");
\r
175 char2 = (int) bytearr[count-1];
\r
176 if ((char2 & 0xC0) != 0x80)
\r
177 throw new UTFDataFormatException(
\r
178 "malformed input around byte " + count);
\r
179 chararr[chararr_count++]=(char)(((c & 0x1F) << 6) |
\r
183 /* 1110 xxxx 10xx xxxx 10xx xxxx */
\r
185 if (count > utflen)
\r
186 throw new UTFDataFormatException(
\r
187 "malformed input: partial character at end");
\r
188 char2 = (int) bytearr[count-2];
\r
189 char3 = (int) bytearr[count-1];
\r
190 if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80))
\r
191 throw new UTFDataFormatException(
\r
192 "malformed input around byte " + (count-1));
\r
193 chararr[chararr_count++]=(char)(((c & 0x0F) << 12) |
\r
194 ((char2 & 0x3F) << 6) |
\r
195 ((char3 & 0x3F) << 0));
\r
198 /* 10xx xxxx, 1111 xxxx */
\r
199 throw new UTFDataFormatException(
\r
200 "malformed input around byte " + count);
\r
203 // The number of chars produced may be less than utflen
\r
204 return new String(chararr, 0, chararr_count);
\r
208 * Write Standard-UTF8 to a stream.
\r
212 * @throws IOException
\r
214 public static void writeUTF(DataOutput out, String str)
\r
217 byte[] bytes = str.getBytes(CHARSET);
\r
222 * Read Standard-UTF8 from a stream
\r
224 * @param len number of bytes
\r
226 * @throws IOException
\r
228 public static String readUTF(DataInput in, int len)
\r
231 byte[] bytes = new byte[len];
\r
232 in.readFully(bytes);
\r
233 return new String(bytes, UTF8.CHARSET);
\r