gerrit.simantics Code Review - simantics/platform.git/blobdiff - bundles/org.simantics.databoard/src/org/simantics/databoard/util/binary/UTF8.java
Fixed all line endings of the repository
[simantics/platform.git] / bundles / org.simantics.databoard / src / org / simantics / databoard / util / binary / UTF8.java
index 6f16677158fbeeb5086bd08c0e338b637944fdfd..aba0446aec647957a006aaf2360571be730cc8be 100644 (file)
-package org.simantics.databoard.util.binary;\r
-\r
-import java.io.DataInput;\r
-import java.io.DataOutput;\r
-import java.io.IOException;\r
-import java.io.UTFDataFormatException;\r
-import java.nio.charset.Charset;\r
-\r
-/**\r
- * Utils for handling Standard-UTF8 and <a href="http://download.oracle.com/javase/6/docs/api/java/io/DataInput.html">Modified-UTF8</a> Strings.<p>\r
- * \r
- * The differences between standard UTF8 and Modified are the following:\r
- * <ul>\r
- * <li>The null byte <code>'&#92;u0000'</code> is encoded in 2-byte format\r
- *     rather than 1-byte, so that the encoded strings never have\r
- *     embedded nulls.\r
- * <li>Only the 1-byte, 2-byte, and 3-byte formats are used.\r
- * <li><a href="../lang/Character.html#unicode">Supplementary characters</a>\r
- *     are represented in the form of surrogate pairs.\r
- * </ul>\r
- * \r
- */\r
-public class UTF8 {\r
-\r
-       public static final Charset CHARSET = Charset.forName("utf-8");\r
-       \r
-       /**\r
-        * Get the number of bytes in an UTF-8 encoding of a string \r
-        * \r
-        * @param string\r
-        * @return byte length\r
-        */\r
-       public static int getUTF8EncodingByteLength(String string)\r
-       {\r
-               // TODO ?May not work properly properly on some characters that are encoded with 2 chars in UTF-16? \r
-               \r
-               // Correct\r
-               //return string.getBytes(UTF8).length;\r
-               \r
-               // http://en.wikipedia.org/wiki/UTF-8\r
-               int result = 0;\r
-               int length = string.length();\r
-               for (int i=0; i<length; i++)\r
-               {\r
-                       char c = string.charAt(i);\r
-                       if (c>=0 && c<=0x7f) {\r
-                               result += 1;\r
-                       } else if (c>=0x80 && c<=0x07ff) {\r
-                               result += 2;\r
-                       } else if (c>=0xD800 && c<=0xDFFF) {\r
-                               result += 1;\r
-                       } else if (c>=0x800 && c<=0xffff) {\r
-                               result += 3;\r
-                       }\r
-                       // Not really used as char is 16-bit\r
-                       else if (c>=0x10000 && c<=0x10ffff) {\r
-                               result += 4;\r
-                       } else if (c>=0x110000 && c<=0x1FFFFF) {\r
-                               result += 4;\r
-                       } else {\r
-                               // NOT IN RFC 3629\r
-                               result += 5;\r
-                       }\r
-               }\r
-               return result;                          \r
-       }\r
-       \r
-\r
-       /**\r
-        * Get the number of bytes in an Modified-UTF-8 encoding of a string \r
-        * \r
-        * @param str\r
-        * @return byte length\r
-        */\r
-       public static int getModifiedUTF8EncodingByteLength(String str)\r
-       {\r
-        int strlen = str.length();\r
-       int utflen = 0;\r
-       int c = 0;\r
-     \r
-        /* use charAt instead of copying String to char array */\r
-       for (int i = 0; i < strlen; i++) {\r
-                c = str.charAt(i);\r
-           if ((c >= 0x0001) && (c <= 0x007F)) {\r
-               utflen++;\r
-           } else if (c > 0x07FF) {\r
-               utflen += 3;\r
-           } else {\r
-               utflen += 2;\r
-           }\r
-       }\r
-       return utflen;\r
-       }\r
-    \r
-       /**\r
-        * Write Modified-UTF8 to a stream.\r
-        * \r
-        * @param out output stream \r
-        * @param str string\r
-        * @throws IOException\r
-        */\r
-    public static void writeModifiedUTF(DataOutput out, String str) throws IOException {\r
-       // Copied from DataOutput\r
-        int strlen = str.length();\r
-        int c = 0;\r
-        \r
-        int i=0;\r
-        for (i=0; i<strlen; i++) {\r
-           c = str.charAt(i);\r
-           if (!((c >= 0x0001) && (c <= 0x007F))) break;\r
-           out.write(c);\r
-        }\r
-        \r
-        for (;i < strlen; i++){\r
-            c = str.charAt(i);\r
-            if ((c >= 0x0001) && (c <= 0x007F)) {\r
-               out.write( c );\r
-            } else if (c > 0x07FF) {\r
-               out.write(0xE0 | ((c >> 12) & 0x0F));\r
-               out.write(0x80 | ((c >>  6) & 0x3F));\r
-               out.write(0x80 | ((c >>  0) & 0x3F));\r
-            } else {\r
-               out.write(0xC0 | ((c >>  6) & 0x1F));\r
-               out.write(0x80 | ((c >>  0) & 0x3F));\r
-            }\r
-        }\r
-    }\r
-    \r
-    /**\r
-     * Read Modified-UTF8 from a stream\r
-     * @param in input\r
-     * @param utflen number of bytes\r
-     * @return string\r
-     * @throws IOException\r
-     */\r
-    public static String readModifiedUTF(DataInput in, int utflen)\r
-    throws IOException, UTFDataFormatException\r
-    {\r
-       // Copied from DataInput\r
-        byte[] bytearr = null;\r
-        char[] chararr = null;\r
-\r
-        {\r
-            bytearr = new byte[utflen];\r
-            chararr = new char[utflen];\r
-        }\r
-\r
-        int c, char2, char3;\r
-        int count = 0;\r
-        int chararr_count=0;\r
-\r
-        in.readFully(bytearr, 0, utflen);\r
-\r
-        while (count < utflen) {\r
-            c = (int) bytearr[count] & 0xff;      \r
-            if (c > 127) break;\r
-            count++;\r
-            chararr[chararr_count++]=(char)c;\r
-        }\r
-\r
-        while (count < utflen) {\r
-            c = (int) bytearr[count] & 0xff;\r
-            switch (c >> 4) {\r
-                case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:\r
-                    /* 0xxxxxxx*/\r
-                    count++;\r
-                    chararr[chararr_count++]=(char)c;\r
-                    break;\r
-                case 12: case 13:\r
-                    /* 110x xxxx   10xx xxxx*/\r
-                    count += 2;\r
-                    if (count > utflen)\r
-                        throw new UTFDataFormatException(\r
-                            "malformed input: partial character at end");\r
-                    char2 = (int) bytearr[count-1];\r
-                    if ((char2 & 0xC0) != 0x80)\r
-                        throw new UTFDataFormatException(\r
-                            "malformed input around byte " + count); \r
-                    chararr[chararr_count++]=(char)(((c & 0x1F) << 6) | \r
-                                                    (char2 & 0x3F));  \r
-                    break;\r
-                case 14:\r
-                    /* 1110 xxxx  10xx xxxx  10xx xxxx */\r
-                    count += 3;\r
-                    if (count > utflen)\r
-                        throw new UTFDataFormatException(\r
-                            "malformed input: partial character at end");\r
-                    char2 = (int) bytearr[count-2];\r
-                    char3 = (int) bytearr[count-1];\r
-                    if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80))\r
-                        throw new UTFDataFormatException(\r
-                            "malformed input around byte " + (count-1));\r
-                    chararr[chararr_count++]=(char)(((c     & 0x0F) << 12) |\r
-                                                    ((char2 & 0x3F) << 6)  |\r
-                                                    ((char3 & 0x3F) << 0));\r
-                    break;\r
-                default:\r
-                    /* 10xx xxxx,  1111 xxxx */\r
-                    throw new UTFDataFormatException(\r
-                        "malformed input around byte " + count);\r
-            }\r
-        }\r
-        // The number of chars produced may be less than utflen\r
-        return new String(chararr, 0, chararr_count);\r
-    }    \r
-    \r
-    /**\r
-     * Write Standard-UTF8 to a stream.\r
-     * \r
-     * @param str\r
-     * @param out\r
-     * @throws IOException\r
-     */\r
-    public static void writeUTF(DataOutput out, String str)\r
-    throws IOException\r
-    {\r
-               byte[] bytes = str.getBytes(CHARSET);\r
-               out.write(bytes);\r
-    }\r
-    \r
-    /**\r
-     * Read Standard-UTF8 from a stream\r
-     * @param in input\r
-     * @param len number of bytes\r
-     * @return string\r
-     * @throws IOException\r
-     */\r
-    public static String readUTF(DataInput in, int len)\r
-    throws IOException\r
-    {\r
-               byte[] bytes = new byte[len];\r
-               in.readFully(bytes);\r
-               return new String(bytes, UTF8.CHARSET);\r
-    }\r
-       \r
-}\r
+package org.simantics.databoard.util.binary;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.io.UTFDataFormatException;
+import java.nio.charset.Charset;
+
+/**
+ * Utilities for handling standard UTF-8 and
+ * <a href="http://download.oracle.com/javase/6/docs/api/java/io/DataInput.html">Modified UTF-8</a>
+ * strings.<p>
+ *
+ * Modified UTF-8 differs from standard UTF-8 in the following ways:
+ * <ul>
+ * <li>The null character <code>'&#92;u0000'</code> is encoded in 2-byte format
+ *     rather than 1-byte, so that the encoded strings never have
+ *     embedded nulls.
+ * <li>Only the 1-byte, 2-byte, and 3-byte formats are used.
+ * <li>Supplementary characters are represented in the form of surrogate
+ *     pairs; each surrogate is encoded on its own in 3-byte format.
+ * </ul>
+ */
+public class UTF8 {
+
+    /** Charset used by the standard UTF-8 read and write methods of this class. */
+    public static final Charset CHARSET = Charset.forName("utf-8");
+
+    /**
+     * Get the number of bytes in the standard UTF-8 encoding of a string.
+     * The result is the number of bytes that {@link #writeUTF(DataOutput, String)}
+     * writes for the same string.
+     *
+     * @param string string to measure, must not be <code>null</code>
+     * @return byte length
+     */
+    public static int getUTF8EncodingByteLength(String string)
+    {
+        // http://en.wikipedia.org/wiki/UTF-8
+        int result = 0;
+        int length = string.length();
+        for (int i = 0; i < length; i++)
+        {
+            char c = string.charAt(i);
+            if (c <= 0x7f) {
+                result += 1;
+            } else if (c <= 0x07ff) {
+                result += 2;
+            } else if (Character.isHighSurrogate(c)
+                    && i + 1 < length
+                    && Character.isLowSurrogate(string.charAt(i + 1))) {
+                // A valid surrogate pair is a single supplementary code point,
+                // which takes 4 bytes in UTF-8. Consume the low surrogate too.
+                result += 4;
+                i++;
+            } else if (c >= Character.MIN_SURROGATE && c <= Character.MAX_SURROGATE) {
+                // An unpaired surrogate cannot be encoded; String.getBytes(Charset)
+                // substitutes the 1-byte replacement '?' for it.
+                result += 1;
+            } else {
+                result += 3;
+            }
+        }
+        return result;
+    }
+
+    /**
+     * Get the number of bytes in the Modified UTF-8 encoding of a string.
+     * The result is the number of bytes that
+     * {@link #writeModifiedUTF(DataOutput, String)} writes for the same string.
+     *
+     * @param str string to measure, must not be <code>null</code>
+     * @return byte length
+     */
+    public static int getModifiedUTF8EncodingByteLength(String str)
+    {
+        int strlen = str.length();
+        int utflen = 0;
+
+        /* use charAt instead of copying String to char array */
+        for (int i = 0; i < strlen; i++) {
+            int c = str.charAt(i);
+            if ((c >= 0x0001) && (c <= 0x007F)) {
+                utflen++;
+            } else if (c > 0x07FF) {
+                utflen += 3;
+            } else {
+                // 0x0000 and 0x0080..0x07FF take two bytes
+                utflen += 2;
+            }
+        }
+        return utflen;
+    }
+
+    /**
+     * Write a string to a stream in Modified UTF-8. Only the encoded
+     * characters are written; no length prefix is emitted.
+     *
+     * @param out output stream
+     * @param str string
+     * @throws IOException if writing to <code>out</code> fails
+     */
+    public static void writeModifiedUTF(DataOutput out, String str) throws IOException {
+        // Encoding rules follow DataOutput's modified UTF-8
+        int strlen = str.length();
+        for (int i = 0; i < strlen; i++) {
+            int c = str.charAt(i);
+            if ((c >= 0x0001) && (c <= 0x007F)) {
+                out.write(c);
+            } else if (c > 0x07FF) {
+                out.write(0xE0 | ((c >> 12) & 0x0F));
+                out.write(0x80 | ((c >>  6) & 0x3F));
+                out.write(0x80 | ((c >>  0) & 0x3F));
+            } else {
+                // 0x0000 and 0x0080..0x07FF
+                out.write(0xC0 | ((c >>  6) & 0x1F));
+                out.write(0x80 | ((c >>  0) & 0x3F));
+            }
+        }
+    }
+
+    /**
+     * Read a Modified UTF-8 string from a stream.
+     *
+     * @param in input
+     * @param utflen number of bytes to read and decode
+     * @return string
+     * @throws UTFDataFormatException if the bytes are not valid Modified UTF-8
+     * @throws IOException if reading from <code>in</code> fails
+     */
+    public static String readModifiedUTF(DataInput in, int utflen)
+    throws IOException, UTFDataFormatException
+    {
+        // Decoding follows DataInput's modified UTF-8
+        byte[] bytearr = new byte[utflen];
+        // Every character takes at least one byte, so utflen chars always suffice
+        char[] chararr = new char[utflen];
+
+        int c, char2, char3;
+        int count = 0;
+        int chararr_count = 0;
+
+        in.readFully(bytearr, 0, utflen);
+
+        // Fast path: copy the leading run of 1-byte characters
+        while (count < utflen) {
+            c = (int) bytearr[count] & 0xff;
+            if (c > 127) break;
+            count++;
+            chararr[chararr_count++] = (char) c;
+        }
+
+        while (count < utflen) {
+            c = (int) bytearr[count] & 0xff;
+            // The high nibble of the lead byte selects the sequence length
+            switch (c >> 4) {
+                case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
+                    /* 0xxxxxxx*/
+                    count++;
+                    chararr[chararr_count++] = (char) c;
+                    break;
+                case 12: case 13:
+                    /* 110x xxxx   10xx xxxx*/
+                    count += 2;
+                    if (count > utflen)
+                        throw new UTFDataFormatException(
+                            "malformed input: partial character at end");
+                    char2 = (int) bytearr[count-1];
+                    if ((char2 & 0xC0) != 0x80)
+                        throw new UTFDataFormatException(
+                            "malformed input around byte " + count);
+                    chararr[chararr_count++] = (char)(((c & 0x1F) << 6) |
+                                                      (char2 & 0x3F));
+                    break;
+                case 14:
+                    /* 1110 xxxx  10xx xxxx  10xx xxxx */
+                    count += 3;
+                    if (count > utflen)
+                        throw new UTFDataFormatException(
+                            "malformed input: partial character at end");
+                    char2 = (int) bytearr[count-2];
+                    char3 = (int) bytearr[count-1];
+                    if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80))
+                        throw new UTFDataFormatException(
+                            "malformed input around byte " + (count-1));
+                    chararr[chararr_count++] = (char)(((c     & 0x0F) << 12) |
+                                                      ((char2 & 0x3F) << 6)  |
+                                                      ((char3 & 0x3F) << 0));
+                    break;
+                default:
+                    /* 10xx xxxx,  1111 xxxx */
+                    throw new UTFDataFormatException(
+                        "malformed input around byte " + count);
+            }
+        }
+        // The number of chars produced may be less than utflen
+        return new String(chararr, 0, chararr_count);
+    }
+
+    /**
+     * Write a string to a stream in standard UTF-8. Only the encoded
+     * bytes are written; no length prefix is emitted.
+     *
+     * @param out output stream
+     * @param str string
+     * @throws IOException if writing to <code>out</code> fails
+     */
+    public static void writeUTF(DataOutput out, String str)
+    throws IOException
+    {
+        byte[] bytes = str.getBytes(CHARSET);
+        out.write(bytes);
+    }
+
+    /**
+     * Read a standard UTF-8 string from a stream.
+     *
+     * @param in input
+     * @param len number of bytes to read and decode
+     * @return string
+     * @throws IOException if reading from <code>in</code> fails
+     */
+    public static String readUTF(DataInput in, int len)
+    throws IOException
+    {
+        byte[] bytes = new byte[len];
+        in.readFully(bytes);
+        return new String(bytes, CHARSET);
+    }
+
+}