1 package org.simantics.databoard.util.binary;
3 import java.io.DataInput;
4 import java.io.DataOutput;
5 import java.io.IOException;
6 import java.io.UTFDataFormatException;
7 import java.nio.charset.Charset;
 * Utilities for handling Standard-UTF8 and <a href="http://download.oracle.com/javase/6/docs/api/java/io/DataInput.html">Modified-UTF8</a> Strings.<p>
 * The differences between standard UTF-8 and Modified UTF-8 are the following:
 * <li>The null byte <code>'\u0000'</code> is encoded in 2-byte format
 * rather than 1-byte, so that the encoded strings never have
 * embedded nulls.
 * <li>Only the 1-byte, 2-byte, and 3-byte formats are used.
 * <li>Supplementary characters (see {@link Character}) are represented
 * in the form of surrogate pairs.
25 public static final Charset CHARSET = Charset.forName("utf-8");
28 * Get the number of bytes in an UTF-8 encoding of a string
33 public static int getUTF8EncodingByteLength(String string)
35 // TODO ?May not work properly properly on some characters that are encoded with 2 chars in UTF-16?
38 //return string.getBytes(UTF8).length;
40 // http://en.wikipedia.org/wiki/UTF-8
42 int length = string.length();
43 for (int i=0; i<length; i++)
45 char c = string.charAt(i);
46 if (c>=0 && c<=0x7f) {
48 } else if (c>=0x80 && c<=0x07ff) {
50 } else if (c>=0xD800 && c<=0xDFFF) {
52 } else if (c>=0x800 && c<=0xffff) {
55 // Not really used as char is 16-bit
56 else if (c>=0x10000 && c<=0x10ffff) {
58 } else if (c>=0x110000 && c<=0x1FFFFF) {
70 * Get the number of bytes in an Modified-UTF-8 encoding of a string
75 public static int getModifiedUTF8EncodingByteLength(String str)
77 int strlen = str.length();
81 /* use charAt instead of copying String to char array */
82 for (int i = 0; i < strlen; i++) {
84 if ((c >= 0x0001) && (c <= 0x007F)) {
86 } else if (c > 0x07FF) {
96 * Write Modified-UTF8 to a stream.
98 * @param out output stream
100 * @throws IOException
102 public static void writeModifiedUTF(DataOutput out, String str) throws IOException {
103 // Copied from DataOutput
104 int strlen = str.length();
108 for (i=0; i<strlen; i++) {
110 if (!((c >= 0x0001) && (c <= 0x007F))) break;
114 for (;i < strlen; i++){
116 if ((c >= 0x0001) && (c <= 0x007F)) {
118 } else if (c > 0x07FF) {
119 out.write(0xE0 | ((c >> 12) & 0x0F));
120 out.write(0x80 | ((c >> 6) & 0x3F));
121 out.write(0x80 | ((c >> 0) & 0x3F));
123 out.write(0xC0 | ((c >> 6) & 0x1F));
124 out.write(0x80 | ((c >> 0) & 0x3F));
130 * Read Modified-UTF8 from a stream
132 * @param utflen number of bytes
134 * @throws IOException
136 public static String readModifiedUTF(DataInput in, int utflen)
137 throws IOException, UTFDataFormatException
139 // Copied from DataInput
140 byte[] bytearr = null;
141 char[] chararr = null;
144 bytearr = new byte[utflen];
145 chararr = new char[utflen];
152 in.readFully(bytearr, 0, utflen);
154 while (count < utflen) {
155 c = (int) bytearr[count] & 0xff;
158 chararr[chararr_count++]=(char)c;
161 while (count < utflen) {
162 c = (int) bytearr[count] & 0xff;
164 case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
167 chararr[chararr_count++]=(char)c;
170 /* 110x xxxx 10xx xxxx*/
173 throw new UTFDataFormatException(
174 "malformed input: partial character at end");
175 char2 = (int) bytearr[count-1];
176 if ((char2 & 0xC0) != 0x80)
177 throw new UTFDataFormatException(
178 "malformed input around byte " + count);
179 chararr[chararr_count++]=(char)(((c & 0x1F) << 6) |
183 /* 1110 xxxx 10xx xxxx 10xx xxxx */
186 throw new UTFDataFormatException(
187 "malformed input: partial character at end");
188 char2 = (int) bytearr[count-2];
189 char3 = (int) bytearr[count-1];
190 if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80))
191 throw new UTFDataFormatException(
192 "malformed input around byte " + (count-1));
193 chararr[chararr_count++]=(char)(((c & 0x0F) << 12) |
194 ((char2 & 0x3F) << 6) |
195 ((char3 & 0x3F) << 0));
198 /* 10xx xxxx, 1111 xxxx */
199 throw new UTFDataFormatException(
200 "malformed input around byte " + count);
203 // The number of chars produced may be less than utflen
204 return new String(chararr, 0, chararr_count);
208 * Write Standard-UTF8 to a stream.
212 * @throws IOException
214 public static void writeUTF(DataOutput out, String str)
217 byte[] bytes = str.getBytes(CHARSET);
222 * Read Standard-UTF8 from a stream
224 * @param len number of bytes
226 * @throws IOException
228 public static String readUTF(DataInput in, int len)
231 byte[] bytes = new byte[len];
233 return new String(bytes, UTF8.CHARSET);