X-Git-Url: https://gerrit.simantics.org/r/gitweb?a=blobdiff_plain;ds=sidebyside;f=bundles%2Forg.simantics.databoard%2Fsrc%2Forg%2Fsimantics%2Fdataboard%2Futil%2Fbinary%2FUTF8.java;h=aba0446aec647957a006aaf2360571be730cc8be;hb=refs%2Fchanges%2F38%2F238%2F2;hp=6f16677158fbeeb5086bd08c0e338b637944fdfd;hpb=24e2b34260f219f0d1644ca7a138894980e25b14;p=simantics%2Fplatform.git diff --git a/bundles/org.simantics.databoard/src/org/simantics/databoard/util/binary/UTF8.java b/bundles/org.simantics.databoard/src/org/simantics/databoard/util/binary/UTF8.java index 6f1667715..aba0446ae 100644 --- a/bundles/org.simantics.databoard/src/org/simantics/databoard/util/binary/UTF8.java +++ b/bundles/org.simantics.databoard/src/org/simantics/databoard/util/binary/UTF8.java @@ -1,236 +1,236 @@ -package org.simantics.databoard.util.binary; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import java.io.UTFDataFormatException; -import java.nio.charset.Charset; - -/** - * Utils for handling Standard-UTF8 and Modified-UTF8 Strings.

- * - * The differences between standard UTF8 and Modified are the following: - *

- * - */ -public class UTF8 { - - public static final Charset CHARSET = Charset.forName("utf-8"); - - /** - * Get the number of bytes in an UTF-8 encoding of a string - * - * @param string - * @return byte length - */ - public static int getUTF8EncodingByteLength(String string) - { - // TODO ?May not work properly properly on some characters that are encoded with 2 chars in UTF-16? - - // Correct - //return string.getBytes(UTF8).length; - - // http://en.wikipedia.org/wiki/UTF-8 - int result = 0; - int length = string.length(); - for (int i=0; i=0 && c<=0x7f) { - result += 1; - } else if (c>=0x80 && c<=0x07ff) { - result += 2; - } else if (c>=0xD800 && c<=0xDFFF) { - result += 1; - } else if (c>=0x800 && c<=0xffff) { - result += 3; - } - // Not really used as char is 16-bit - else if (c>=0x10000 && c<=0x10ffff) { - result += 4; - } else if (c>=0x110000 && c<=0x1FFFFF) { - result += 4; - } else { - // NOT IN RFC 3629 - result += 5; - } - } - return result; - } - - - /** - * Get the number of bytes in an Modified-UTF-8 encoding of a string - * - * @param str - * @return byte length - */ - public static int getModifiedUTF8EncodingByteLength(String str) - { - int strlen = str.length(); - int utflen = 0; - int c = 0; - - /* use charAt instead of copying String to char array */ - for (int i = 0; i < strlen; i++) { - c = str.charAt(i); - if ((c >= 0x0001) && (c <= 0x007F)) { - utflen++; - } else if (c > 0x07FF) { - utflen += 3; - } else { - utflen += 2; - } - } - return utflen; - } - - /** - * Write Modified-UTF8 to a stream. - * - * @param out output stream - * @param str string - * @throws IOException - */ - public static void writeModifiedUTF(DataOutput out, String str) throws IOException { - // Copied from DataOutput - int strlen = str.length(); - int c = 0; - - int i=0; - for (i=0; i= 0x0001) && (c <= 0x007F))) break; - out.write(c); - } - - for (;i < strlen; i++){ - c = str.charAt(i); - if ((c >= 0x0001) && (c <= 0x007F)) { - out.write( c ); - } else if (c > 0x07FF) { - out.write(0xE0 | ((c >> 12) & 0x0F)); - out.write(0x80 | ((c >> 6) & 0x3F)); - out.write(0x80 | ((c >> 0) & 0x3F)); - } else { - out.write(0xC0 | ((c >> 6) & 0x1F)); - out.write(0x80 | ((c >> 0) & 0x3F)); - } - } - } - - /** - * Read Modified-UTF8 from a stream - * @param in input - * @param utflen number of bytes - * @return string - * @throws IOException - */ - public static String readModifiedUTF(DataInput in, int utflen) - throws IOException, UTFDataFormatException - { - // Copied from DataInput - byte[] bytearr = null; - char[] chararr = null; - - { - bytearr = new byte[utflen]; - chararr = new char[utflen]; - } - - int c, char2, char3; - int count = 0; - int chararr_count=0; - - in.readFully(bytearr, 0, utflen); - - while (count < utflen) { - c = (int) bytearr[count] & 0xff; - if (c > 127) break; - count++; - chararr[chararr_count++]=(char)c; - } - - while (count < utflen) { - c = (int) bytearr[count] & 0xff; - switch (c >> 4) { - case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7: - /* 0xxxxxxx*/ - count++; - chararr[chararr_count++]=(char)c; - break; - case 12: case 13: - /* 110x xxxx 10xx xxxx*/ - count += 2; - if (count > utflen) - throw new UTFDataFormatException( - "malformed input: partial character at end"); - char2 = (int) bytearr[count-1]; - if ((char2 & 0xC0) != 0x80) - throw new UTFDataFormatException( - "malformed input around byte " + count); - chararr[chararr_count++]=(char)(((c & 0x1F) << 6) | - (char2 & 0x3F)); - break; - case 14: - /* 1110 xxxx 10xx xxxx 10xx xxxx */ - count += 3; - if (count > utflen) - throw new UTFDataFormatException( - "malformed input: partial character at end"); - char2 = (int) bytearr[count-2]; - char3 = (int) bytearr[count-1]; - if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80)) - throw new UTFDataFormatException( - "malformed input around byte " + (count-1)); - chararr[chararr_count++]=(char)(((c & 0x0F) << 12) | - ((char2 & 0x3F) << 6) | - ((char3 & 0x3F) << 0)); - break; - default: - /* 10xx xxxx, 1111 xxxx */ - throw new UTFDataFormatException( - "malformed input around byte " + count); - } - } - // The number of chars produced may be less than utflen - return new String(chararr, 0, chararr_count); - } - - /** - * Write Standard-UTF8 to a stream. - * - * @param str - * @param out - * @throws IOException - */ - public static void writeUTF(DataOutput out, String str) - throws IOException - { - byte[] bytes = str.getBytes(CHARSET); - out.write(bytes); - } - - /** - * Read Standard-UTF8 from a stream - * @param in input - * @param len number of bytes - * @return string - * @throws IOException - */ - public static String readUTF(DataInput in, int len) - throws IOException - { - byte[] bytes = new byte[len]; - in.readFully(bytes); - return new String(bytes, UTF8.CHARSET); - } - -} +package org.simantics.databoard.util.binary; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.io.UTFDataFormatException; +import java.nio.charset.Charset; + +/** + * Utils for handling Standard-UTF8 and Modified-UTF8 Strings.

+ * + * The differences between standard UTF8 and Modified are the following: + *

+ * + */ +public class UTF8 { + + public static final Charset CHARSET = Charset.forName("utf-8"); + + /** + * Get the number of bytes in an UTF-8 encoding of a string + * + * @param string + * @return byte length + */ + public static int getUTF8EncodingByteLength(String string) + { + // TODO ?May not work properly properly on some characters that are encoded with 2 chars in UTF-16? + + // Correct + //return string.getBytes(UTF8).length; + + // http://en.wikipedia.org/wiki/UTF-8 + int result = 0; + int length = string.length(); + for (int i=0; i=0 && c<=0x7f) { + result += 1; + } else if (c>=0x80 && c<=0x07ff) { + result += 2; + } else if (c>=0xD800 && c<=0xDFFF) { + result += 1; + } else if (c>=0x800 && c<=0xffff) { + result += 3; + } + // Not really used as char is 16-bit + else if (c>=0x10000 && c<=0x10ffff) { + result += 4; + } else if (c>=0x110000 && c<=0x1FFFFF) { + result += 4; + } else { + // NOT IN RFC 3629 + result += 5; + } + } + return result; + } + + + /** + * Get the number of bytes in an Modified-UTF-8 encoding of a string + * + * @param str + * @return byte length + */ + public static int getModifiedUTF8EncodingByteLength(String str) + { + int strlen = str.length(); + int utflen = 0; + int c = 0; + + /* use charAt instead of copying String to char array */ + for (int i = 0; i < strlen; i++) { + c = str.charAt(i); + if ((c >= 0x0001) && (c <= 0x007F)) { + utflen++; + } else if (c > 0x07FF) { + utflen += 3; + } else { + utflen += 2; + } + } + return utflen; + } + + /** + * Write Modified-UTF8 to a stream. + * + * @param out output stream + * @param str string + * @throws IOException + */ + public static void writeModifiedUTF(DataOutput out, String str) throws IOException { + // Copied from DataOutput + int strlen = str.length(); + int c = 0; + + int i=0; + for (i=0; i= 0x0001) && (c <= 0x007F))) break; + out.write(c); + } + + for (;i < strlen; i++){ + c = str.charAt(i); + if ((c >= 0x0001) && (c <= 0x007F)) { + out.write( c ); + } else if (c > 0x07FF) { + out.write(0xE0 | ((c >> 12) & 0x0F)); + out.write(0x80 | ((c >> 6) & 0x3F)); + out.write(0x80 | ((c >> 0) & 0x3F)); + } else { + out.write(0xC0 | ((c >> 6) & 0x1F)); + out.write(0x80 | ((c >> 0) & 0x3F)); + } + } + } + + /** + * Read Modified-UTF8 from a stream + * @param in input + * @param utflen number of bytes + * @return string + * @throws IOException + */ + public static String readModifiedUTF(DataInput in, int utflen) + throws IOException, UTFDataFormatException + { + // Copied from DataInput + byte[] bytearr = null; + char[] chararr = null; + + { + bytearr = new byte[utflen]; + chararr = new char[utflen]; + } + + int c, char2, char3; + int count = 0; + int chararr_count=0; + + in.readFully(bytearr, 0, utflen); + + while (count < utflen) { + c = (int) bytearr[count] & 0xff; + if (c > 127) break; + count++; + chararr[chararr_count++]=(char)c; + } + + while (count < utflen) { + c = (int) bytearr[count] & 0xff; + switch (c >> 4) { + case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7: + /* 0xxxxxxx*/ + count++; + chararr[chararr_count++]=(char)c; + break; + case 12: case 13: + /* 110x xxxx 10xx xxxx*/ + count += 2; + if (count > utflen) + throw new UTFDataFormatException( + "malformed input: partial character at end"); + char2 = (int) bytearr[count-1]; + if ((char2 & 0xC0) != 0x80) + throw new UTFDataFormatException( + "malformed input around byte " + count); + chararr[chararr_count++]=(char)(((c & 0x1F) << 6) | + (char2 & 0x3F)); + break; + case 14: + /* 1110 xxxx 10xx xxxx 10xx xxxx */ + count += 3; + if (count > utflen) + throw new UTFDataFormatException( + "malformed input: partial character at end"); + char2 = (int) bytearr[count-2]; + char3 = (int) bytearr[count-1]; + if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80)) + throw new UTFDataFormatException( + "malformed input around byte " + (count-1)); + chararr[chararr_count++]=(char)(((c & 0x0F) << 12) | + ((char2 & 0x3F) << 6) | + ((char3 & 0x3F) << 0)); + break; + default: + /* 10xx xxxx, 1111 xxxx */ + throw new UTFDataFormatException( + "malformed input around byte " + count); + } + } + // The number of chars produced may be less than utflen + return new String(chararr, 0, chararr_count); + } + + /** + * Write Standard-UTF8 to a stream. + * + * @param str + * @param out + * @throws IOException + */ + public static void writeUTF(DataOutput out, String str) + throws IOException + { + byte[] bytes = str.getBytes(CHARSET); + out.write(bytes); + } + + /** + * Read Standard-UTF8 from a stream + * @param in input + * @param len number of bytes + * @return string + * @throws IOException + */ + public static String readUTF(DataInput in, int len) + throws IOException + { + byte[] bytes = new byte[len]; + in.readFully(bytes); + return new String(bytes, UTF8.CHARSET); + } + +}