package org.simantics.databoard.util.binary; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import java.io.UTFDataFormatException; import java.nio.charset.Charset; /** * Utils for handling Standard-UTF8 and Modified-UTF8 Strings.

 * <p>
 * The differences between Standard UTF-8 and Modified UTF-8 are the following:
 * <ul>
 * <li>The null character '\u0000' is encoded in the 2-byte format
 * rather than the 1-byte format, so that the encoded strings never have
 * embedded nulls.</li>
 * <li>Only the 1-byte, 2-byte, and 3-byte formats are used.</li>
 * <li>Supplementary characters
 * are represented in the form of surrogate pairs.</li>
 * </ul>

// */
public class UTF8 {

    /** Charset used by {@link #writeUTF} and {@link #readUTF}. */
    public static final Charset CHARSET = Charset.forName("utf-8");

    /**
     * Get the number of bytes in the Standard UTF-8 encoding of a string.
     * <p>
     * The result equals the length of {@code string.getBytes(CHARSET)}, i.e.
     * the number of bytes that {@link #writeUTF} writes for the same string.
     *
     * @param string string to measure
     * @return byte length
     */
    public static int getUTF8EncodingByteLength(String string) {
        int result = 0;
        int length = string.length();
        for (int i = 0; i < length; i++) {
            char c = string.charAt(i);
            if (c <= 0x7f) {
                result += 1;
            } else if (c <= 0x07ff) {
                result += 2;
            } else if (c < 0xD800 || c > 0xDFFF) {
                // Any other non-surrogate char of the Basic Multilingual Plane
                result += 3;
            } else if (c <= 0xDBFF && i + 1 < length
                    && Character.isLowSurrogate(string.charAt(i + 1))) {
                // High surrogate followed by a low surrogate: together they
                // form one supplementary code point, encoded in 4 bytes.
                result += 4;
                i++; // the low surrogate has been accounted for
            } else {
                // Unpaired surrogate: String.getBytes(Charset) substitutes the
                // charset's replacement bytes, which is a single '?' for UTF-8.
                result += 1;
            }
        }
        return result;
    }

    /**
     * Get the number of bytes in the Modified UTF-8 encoding of a string.
     * <p>
     * The result is the number of bytes {@link #writeModifiedUTF} writes for
     * the same string. Every char is measured on its own, so each half of a
     * surrogate pair counts 3 bytes and U+0000 counts 2 bytes.
     *
     * @param str string to measure
     * @return byte length
     */
    public static int getModifiedUTF8EncodingByteLength(String str) {
        int strlen = str.length();
        int utflen = 0;
        for (int i = 0; i < strlen; i++) {
            int c = str.charAt(i);
            if (c >= 0x0001 && c <= 0x007F) {
                utflen += 1;
            } else if (c > 0x07FF) {
                utflen += 3;
            } else {
                // U+0000 and U+0080..U+07FF
                utflen += 2;
            }
        }
        return utflen;
    }

    /**
     * Write a string to a stream as Modified UTF-8.
     * <p>
     * Only the encoded chars are written; this method writes no length
     * prefix. Use {@link #getModifiedUTF8EncodingByteLength} to obtain the
     * byte count that {@link #readModifiedUTF} needs.
     *
     * @param out output stream
     * @param str string
     * @throws IOException if writing to {@code out} fails
     */
    public static void writeModifiedUTF(DataOutput out, String str) throws IOException {
        int strlen = str.length();
        for (int i = 0; i < strlen; i++) {
            int c = str.charAt(i);
            if (c >= 0x0001 && c <= 0x007F) {
                // 1-byte form: 0xxxxxxx
                out.write(c);
            } else if (c > 0x07FF) {
                // 3-byte form: 1110xxxx 10xxxxxx 10xxxxxx
                out.write(0xE0 | ((c >> 12) & 0x0F));
                out.write(0x80 | ((c >> 6) & 0x3F));
                out.write(0x80 | (c & 0x3F));
            } else {
                // 2-byte form: 110xxxxx 10xxxxxx (also used for U+0000)
                out.write(0xC0 | ((c >> 6) & 0x1F));
                out.write(0x80 | (c & 0x3F));
            }
        }
    }

    /**
     * Read a Modified UTF-8 string from a stream.
     *
     * @param in input
     * @param utflen number of encoded bytes to read
     * @return string
     * @throws UTFDataFormatException if the bytes read are not valid Modified UTF-8
     * @throws IOException if reading {@code utflen} bytes from {@code in} fails
     */
    public static String readModifiedUTF(DataInput in, int utflen) throws IOException, UTFDataFormatException {
        byte[] bytearr = new byte[utflen];
        // Each char takes at least one byte, so utflen chars always suffice.
        char[] chararr = new char[utflen];
        int count = 0;
        int chararrCount = 0;

        in.readFully(bytearr, 0, utflen);

        while (count < utflen) {
            int c = bytearr[count] & 0xff;
            int char2;
            int char3;
            // The high nibble of the lead byte selects the sequence length.
            switch (c >> 4) {
                case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
                    /* 0xxxxxxx */
                    count++;
                    chararr[chararrCount++] = (char) c;
                    break;
                case 12: case 13:
                    /* 110x xxxx   10xx xxxx */
                    count += 2;
                    if (count > utflen) {
                        throw new UTFDataFormatException(
                                "malformed input: partial character at end");
                    }
                    char2 = bytearr[count - 1];
                    if ((char2 & 0xC0) != 0x80) {
                        throw new UTFDataFormatException(
                                "malformed input around byte " + count);
                    }
                    chararr[chararrCount++] = (char) (((c & 0x1F) << 6) | (char2 & 0x3F));
                    break;
                case 14:
                    /* 1110 xxxx   10xx xxxx   10xx xxxx */
                    count += 3;
                    if (count > utflen) {
                        throw new UTFDataFormatException(
                                "malformed input: partial character at end");
                    }
                    char2 = bytearr[count - 2];
                    char3 = bytearr[count - 1];
                    if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80)) {
                        throw new UTFDataFormatException(
                                "malformed input around byte " + (count - 1));
                    }
                    chararr[chararrCount++] = (char) (((c & 0x0F) << 12)
                            | ((char2 & 0x3F) << 6)
                            | (char3 & 0x3F));
                    break;
                default:
                    /* 10xx xxxx,  1111 xxxx */
                    throw new UTFDataFormatException(
                            "malformed input around byte " + count);
            }
        }
        // The number of chars produced may be less than utflen
        return new String(chararr, 0, chararrCount);
    }

    /**
     * Write a string to a stream as Standard UTF-8.
     * <p>
     * Only the encoded bytes are written; this method writes no length
     * prefix. Use {@link #getUTF8EncodingByteLength} to obtain the byte count
     * that {@link #readUTF} needs.
     *
     * @param out output stream
     * @param str string
     * @throws IOException if writing to {@code out} fails
     */
    public static void writeUTF(DataOutput out, String str) throws IOException {
        byte[] bytes = str.getBytes(CHARSET);
        out.write(bytes);
    }

    /**
     * Read a Standard UTF-8 string from a stream.
     *
     * @param in input
     * @param len number of encoded bytes to read
     * @return string
     * @throws IOException if reading {@code len} bytes from {@code in} fails
     */
    public static String readUTF(DataInput in, int len) throws IOException {
        byte[] bytes = new byte[len];
        in.readFully(bytes);
        return new String(bytes, CHARSET);
    }

}