X-Git-Url: https://gerrit.simantics.org/r/gitweb?a=blobdiff_plain;ds=sidebyside;f=bundles%2Forg.simantics.databoard%2Fsrc%2Forg%2Fsimantics%2Fdataboard%2Futil%2Fbinary%2FUTF8.java;h=aba0446aec647957a006aaf2360571be730cc8be;hb=refs%2Fchanges%2F25%2F3525%2F1;hp=6f16677158fbeeb5086bd08c0e338b637944fdfd;hpb=969bd23cab98a79ca9101af33334000879fb60c5;p=simantics%2Fplatform.git
diff --git a/bundles/org.simantics.databoard/src/org/simantics/databoard/util/binary/UTF8.java b/bundles/org.simantics.databoard/src/org/simantics/databoard/util/binary/UTF8.java
index 6f1667715..aba0446ae 100644
--- a/bundles/org.simantics.databoard/src/org/simantics/databoard/util/binary/UTF8.java
+++ b/bundles/org.simantics.databoard/src/org/simantics/databoard/util/binary/UTF8.java
@@ -1,236 +1,236 @@
-package org.simantics.databoard.util.binary;
-
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
-import java.io.UTFDataFormatException;
-import java.nio.charset.Charset;
-
-/**
- * Utils for handling Standard-UTF8 and Modified-UTF8 Strings.
- *
- * The differences between standard UTF8 and Modified are the following:
- * <ul>
- * <li>The null byte <code>'&#92;u0000'</code> is encoded in 2-byte format
- * rather than 1-byte, so that the encoded strings never have
- * embedded nulls.
- * <li>Only the 1-byte, 2-byte, and 3-byte formats are used.
- * <li>Supplementary characters
- * are represented in the form of surrogate pairs.
- * </ul>
- *
- */
-public class UTF8 {
-
- public static final Charset CHARSET = Charset.forName("utf-8");
-
- /**
- * Get the number of bytes in an UTF-8 encoding of a string
- *
- * @param string
- * @return byte length
- */
- public static int getUTF8EncodingByteLength(String string)
- {
- // TODO ?May not work properly properly on some characters that are encoded with 2 chars in UTF-16?
-
- // Correct
- //return string.getBytes(UTF8).length;
-
- // http://en.wikipedia.org/wiki/UTF-8
- int result = 0;
- int length = string.length();
- for (int i=0; i<length; i++) {
- char c = string.charAt(i);
- if (c>=0 && c<=0x7f) {
- result += 1;
- } else if (c>=0x80 && c<=0x07ff) {
- result += 2;
- } else if (c>=0xD800 && c<=0xDFFF) {
- result += 1;
- } else if (c>=0x800 && c<=0xffff) {
- result += 3;
- }
- // Not really used as char is 16-bit
- else if (c>=0x10000 && c<=0x10ffff) {
- result += 4;
- } else if (c>=0x110000 && c<=0x1FFFFF) {
- result += 4;
- } else {
- // NOT IN RFC 3629
- result += 5;
- }
- }
- return result;
- }
-
-
- /**
- * Get the number of bytes in an Modified-UTF-8 encoding of a string
- *
- * @param str
- * @return byte length
- */
- public static int getModifiedUTF8EncodingByteLength(String str)
- {
- int strlen = str.length();
- int utflen = 0;
- int c = 0;
-
- /* use charAt instead of copying String to char array */
- for (int i = 0; i < strlen; i++) {
- c = str.charAt(i);
- if ((c >= 0x0001) && (c <= 0x007F)) {
- utflen++;
- } else if (c > 0x07FF) {
- utflen += 3;
- } else {
- utflen += 2;
- }
- }
- return utflen;
- }
-
- /**
- * Write Modified-UTF8 to a stream.
- *
- * @param out output stream
- * @param str string
- * @throws IOException
- */
- public static void writeModifiedUTF(DataOutput out, String str) throws IOException {
- // Copied from DataOutput
- int strlen = str.length();
- int c = 0;
-
- int i=0;
- for (i=0; i<strlen; i++) {
- c = str.charAt(i);
- if (!((c >= 0x0001) && (c <= 0x007F))) break;
- out.write(c);
- }
-
- for (;i < strlen; i++){
- c = str.charAt(i);
- if ((c >= 0x0001) && (c <= 0x007F)) {
- out.write( c );
- } else if (c > 0x07FF) {
- out.write(0xE0 | ((c >> 12) & 0x0F));
- out.write(0x80 | ((c >> 6) & 0x3F));
- out.write(0x80 | ((c >> 0) & 0x3F));
- } else {
- out.write(0xC0 | ((c >> 6) & 0x1F));
- out.write(0x80 | ((c >> 0) & 0x3F));
- }
- }
- }
-
- /**
- * Read Modified-UTF8 from a stream
- * @param in input
- * @param utflen number of bytes
- * @return string
- * @throws IOException
- */
- public static String readModifiedUTF(DataInput in, int utflen)
- throws IOException, UTFDataFormatException
- {
- // Copied from DataInput
- byte[] bytearr = null;
- char[] chararr = null;
-
- {
- bytearr = new byte[utflen];
- chararr = new char[utflen];
- }
-
- int c, char2, char3;
- int count = 0;
- int chararr_count=0;
-
- in.readFully(bytearr, 0, utflen);
-
- while (count < utflen) {
- c = (int) bytearr[count] & 0xff;
- if (c > 127) break;
- count++;
- chararr[chararr_count++]=(char)c;
- }
-
- while (count < utflen) {
- c = (int) bytearr[count] & 0xff;
- switch (c >> 4) {
- case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
- /* 0xxxxxxx*/
- count++;
- chararr[chararr_count++]=(char)c;
- break;
- case 12: case 13:
- /* 110x xxxx 10xx xxxx*/
- count += 2;
- if (count > utflen)
- throw new UTFDataFormatException(
- "malformed input: partial character at end");
- char2 = (int) bytearr[count-1];
- if ((char2 & 0xC0) != 0x80)
- throw new UTFDataFormatException(
- "malformed input around byte " + count);
- chararr[chararr_count++]=(char)(((c & 0x1F) << 6) |
- (char2 & 0x3F));
- break;
- case 14:
- /* 1110 xxxx 10xx xxxx 10xx xxxx */
- count += 3;
- if (count > utflen)
- throw new UTFDataFormatException(
- "malformed input: partial character at end");
- char2 = (int) bytearr[count-2];
- char3 = (int) bytearr[count-1];
- if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80))
- throw new UTFDataFormatException(
- "malformed input around byte " + (count-1));
- chararr[chararr_count++]=(char)(((c & 0x0F) << 12) |
- ((char2 & 0x3F) << 6) |
- ((char3 & 0x3F) << 0));
- break;
- default:
- /* 10xx xxxx, 1111 xxxx */
- throw new UTFDataFormatException(
- "malformed input around byte " + count);
- }
- }
- // The number of chars produced may be less than utflen
- return new String(chararr, 0, chararr_count);
- }
-
- /**
- * Write Standard-UTF8 to a stream.
- *
- * @param str
- * @param out
- * @throws IOException
- */
- public static void writeUTF(DataOutput out, String str)
- throws IOException
- {
- byte[] bytes = str.getBytes(CHARSET);
- out.write(bytes);
- }
-
- /**
- * Read Standard-UTF8 from a stream
- * @param in input
- * @param len number of bytes
- * @return string
- * @throws IOException
- */
- public static String readUTF(DataInput in, int len)
- throws IOException
- {
- byte[] bytes = new byte[len];
- in.readFully(bytes);
- return new String(bytes, UTF8.CHARSET);
- }
-
-}
+package org.simantics.databoard.util.binary;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.io.UTFDataFormatException;
+import java.nio.charset.Charset;
+
+/**
+ * Utils for handling Standard-UTF8 and Modified-UTF8 Strings.
+ *
+ * The differences between standard UTF8 and Modified are the following:
+ * <ul>
+ * <li>The null byte <code>'&#92;u0000'</code> is encoded in 2-byte format
+ *     rather than 1-byte, so that the encoded strings never have
+ *     embedded nulls.
+ * <li>Only the 1-byte, 2-byte, and 3-byte formats are used.
+ * <li>Supplementary characters
+ *     are represented in the form of surrogate pairs.
+ * </ul>
+ *
+ * None of the write methods emits a length prefix; the read methods take
+ * the number of encoded bytes as an argument.
+ */
+public class UTF8 {
+
+    /** Charset used by the Standard-UTF8 methods of this class. */
+    public static final Charset CHARSET = Charset.forName("utf-8");
+
+    /**
+     * Get the number of bytes in the standard UTF-8 encoding of a string,
+     * i.e. the length of <code>string.getBytes(CHARSET)</code>, without
+     * allocating the byte array.
+     *
+     * @param string string to measure
+     * @return byte length
+     */
+    public static int getUTF8EncodingByteLength(String string)
+    {
+        int result = 0;
+        int length = string.length();
+        for (int i = 0; i < length; i++) {
+            char c = string.charAt(i);
+            if (c <= 0x7f) {
+                result += 1;
+            } else if (c <= 0x07ff) {
+                result += 2;
+            } else if (Character.isHighSurrogate(c) && i + 1 < length
+                    && Character.isLowSurrogate(string.charAt(i + 1))) {
+                // A valid surrogate pair is one supplementary code point,
+                // which standard UTF-8 encodes in 4 bytes. Consume both chars.
+                result += 4;
+                i++;
+            } else if (c >= 0xD800 && c <= 0xDFFF) {
+                // Unpaired surrogate: String.getBytes(Charset) substitutes the
+                // charset's replacement byte ('?'), which is a single byte.
+                result += 1;
+            } else {
+                // Remaining BMP characters, U+0800..U+FFFF
+                result += 3;
+            }
+        }
+        return result;
+    }
+
+
+    /**
+     * Get the number of bytes in the Modified-UTF-8 encoding of a string,
+     * i.e. the number of bytes {@link #writeModifiedUTF(DataOutput, String)}
+     * writes for it.
+     *
+     * @param str string to measure
+     * @return byte length
+     */
+    public static int getModifiedUTF8EncodingByteLength(String str)
+    {
+        int strlen = str.length();
+        int utflen = 0;
+
+        /* use charAt instead of copying String to char array */
+        for (int i = 0; i < strlen; i++) {
+            int c = str.charAt(i);
+            if ((c >= 0x0001) && (c <= 0x007F)) {
+                utflen++;
+            } else if (c > 0x07FF) {
+                // Includes surrogates: each half of a pair takes 3 bytes
+                utflen += 3;
+            } else {
+                // U+0000 and U+0080..U+07FF
+                utflen += 2;
+            }
+        }
+        return utflen;
+    }
+
+    /**
+     * Write Modified-UTF8 to a stream. Only the encoded characters are
+     * written; unlike {@link DataOutput#writeUTF(String)} no length prefix
+     * is emitted.
+     *
+     * @param out output stream
+     * @param str string
+     * @throws IOException if the underlying stream fails
+     */
+    public static void writeModifiedUTF(DataOutput out, String str) throws IOException {
+        // Adapted from DataOutputStream.writeUTF
+        int strlen = str.length();
+
+        for (int i = 0; i < strlen; i++) {
+            int c = str.charAt(i);
+            if ((c >= 0x0001) && (c <= 0x007F)) {
+                out.write(c);
+            } else if (c > 0x07FF) {
+                out.write(0xE0 | ((c >> 12) & 0x0F));
+                out.write(0x80 | ((c >>  6) & 0x3F));
+                out.write(0x80 | ((c >>  0) & 0x3F));
+            } else {
+                // U+0000 takes this branch too, so no zero byte is written
+                out.write(0xC0 | ((c >>  6) & 0x1F));
+                out.write(0x80 | ((c >>  0) & 0x3F));
+            }
+        }
+    }
+
+    /**
+     * Read Modified-UTF8 from a stream. Exactly <code>utflen</code> bytes are
+     * consumed from the input before decoding starts.
+     *
+     * @param in input
+     * @param utflen number of bytes
+     * @return string
+     * @throws UTFDataFormatException if the bytes are not valid Modified-UTF8
+     * @throws IOException if the bytes cannot be read from the input
+     */
+    public static String readModifiedUTF(DataInput in, int utflen)
+    throws IOException, UTFDataFormatException
+    {
+        // Adapted from DataInputStream.readUTF
+        byte[] bytearr = new byte[utflen];
+        // Every character takes at least one byte, so utflen chars suffice
+        char[] chararr = new char[utflen];
+
+        int c, char2, char3;
+        int count = 0;
+        int chararr_count = 0;
+
+        in.readFully(bytearr, 0, utflen);
+
+        // Fast path: copy the leading run of single-byte characters
+        while (count < utflen) {
+            c = (int) bytearr[count] & 0xff;
+            if (c > 127) break;
+            count++;
+            chararr[chararr_count++] = (char) c;
+        }
+
+        while (count < utflen) {
+            c = (int) bytearr[count] & 0xff;
+            // The high nibble of the lead byte selects the sequence length
+            switch (c >> 4) {
+                case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
+                    /* 0xxxxxxx*/
+                    count++;
+                    chararr[chararr_count++] = (char) c;
+                    break;
+                case 12: case 13:
+                    /* 110x xxxx   10xx xxxx*/
+                    count += 2;
+                    if (count > utflen)
+                        throw new UTFDataFormatException(
+                            "malformed input: partial character at end");
+                    char2 = (int) bytearr[count-1];
+                    if ((char2 & 0xC0) != 0x80)
+                        throw new UTFDataFormatException(
+                            "malformed input around byte " + count);
+                    chararr[chararr_count++] = (char) (((c & 0x1F) << 6) |
+                                                       (char2 & 0x3F));
+                    break;
+                case 14:
+                    /* 1110 xxxx  10xx xxxx  10xx xxxx */
+                    count += 3;
+                    if (count > utflen)
+                        throw new UTFDataFormatException(
+                            "malformed input: partial character at end");
+                    char2 = (int) bytearr[count-2];
+                    char3 = (int) bytearr[count-1];
+                    if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80))
+                        throw new UTFDataFormatException(
+                            "malformed input around byte " + (count-1));
+                    chararr[chararr_count++] = (char) (((c     & 0x0F) << 12) |
+                                                       ((char2 & 0x3F) << 6)  |
+                                                       ((char3 & 0x3F) << 0));
+                    break;
+                default:
+                    /* 10xx xxxx,  1111 xxxx */
+                    throw new UTFDataFormatException(
+                        "malformed input around byte " + count);
+            }
+        }
+        // The number of chars produced may be less than utflen
+        return new String(chararr, 0, chararr_count);
+    }
+
+    /**
+     * Write Standard-UTF8 to a stream. Only the encoded bytes are written,
+     * without a length prefix.
+     *
+     * @param out output stream
+     * @param str string
+     * @throws IOException if the underlying stream fails
+     */
+    public static void writeUTF(DataOutput out, String str)
+    throws IOException
+    {
+        byte[] bytes = str.getBytes(CHARSET);
+        out.write(bytes);
+    }
+
+    /**
+     * Read Standard-UTF8 from a stream. Exactly <code>len</code> bytes are
+     * consumed from the input.
+     *
+     * @param in input
+     * @param len number of bytes
+     * @return string
+     * @throws IOException if the bytes cannot be read from the input
+     */
+    public static String readUTF(DataInput in, int len)
+    throws IOException
+    {
+        byte[] bytes = new byte[len];
+        in.readFully(bytes);
+        return new String(bytes, UTF8.CHARSET);
+    }
+
+}