gerrit.simantics Code Review - simantics/platform.git/blobdiff - bundles/org.simantics.graph/src/org/simantics/graph/representation/ByteFileReader.java
Modified UTF-8 decoding fixes for TG reading and indexing
[simantics/platform.git] / bundles / org.simantics.graph / src / org / simantics / graph / representation / ByteFileReader.java
index e930a00fcad35b9c36bc323bd119c165f8f495dc..3a76a911a0d3c269c4c72cf5053b742aa8579432 100644 (file)
@@ -6,6 +6,7 @@ import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.UTFDataFormatException;
 import java.nio.ByteBuffer;
 import java.nio.channels.ReadableByteChannel;
 
@@ -49,23 +50,67 @@ public class ByteFileReader implements Closeable {
                return bytes;
 
        }
-       final protected String utf(byte[] bytes, int index, int target) {
-               int i = 0;
-               while(index < target) {
-                       int c = bytes[index++]&0xff;
-                       if(c <= 0x7F) {
-                               chars[i++] = (char)(c&0x7F);
-                       } else if (c > 0x07FF) {
-                               int c2 = bytes[index++]&0xff;
-                               int c3 = bytes[index++]&0xff;
-                               chars[i++] = (char)(((c&0xf)<<12) + ((c2&0x3f)<<6) + (c3&0x3f)); 
-                       } else {
-                               int c2 = bytes[index++]&0xff;
-                               chars[i++] = (char)(((c&0x1f)<<6) + (c2&0x3f)); 
+
+       final protected String utf(byte[] bytearr, int index, int target) throws UTFDataFormatException {
+               // Decodes the modified UTF-8 bytes in bytearr[index, target) into a String; throws UTFDataFormatException on truncated or malformed sequences. Copied from DataInputStream
+               int utflen = target - index; // encoded length in bytes; also an upper bound on the number of decoded chars
+               char[] chararr = utflen > chars.length ? new char[utflen] : chars; // use the chars buffer (declared outside this view) when it is big enough, else a temporary array
+
+               int c, char2, char3;
+               int count = index; // read position in bytearr
+               int chararr_count=0; // write position in chararr
+
+               while (count < target) { // fast path: copy the leading run of single-byte (<= 127) values
+                       c = (int) bytearr[count] & 0xff;
+                       if (c > 127) break; // first non-ASCII byte: continue in the general loop below
+                       count++;
+                       chararr[chararr_count++]=(char)c;
+               }
+
+               while (count < target) { // general path: decode 1-, 2- and 3-byte sequences
+                       c = (int) bytearr[count] & 0xff;
+                       switch (c >> 4) { // dispatch on the high nibble of the lead byte
+                       case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
+                               /* 0xxxxxxx: single byte */
+                               count++;
+                               chararr[chararr_count++]=(char)c;
+                               break;
+                       case 12: case 13:
+                               /* 110x xxxx   10xx xxxx: two bytes */
+                               count += 2; // advance first so the bounds check below happens before the continuation byte is read
+                               if (count > target)
+                                       throw new UTFDataFormatException(
+                                                       "malformed input: partial character at end (" + (count-index) + " > " + utflen + ")");
+                               char2 = (int) bytearr[count-1]; // not masked with 0xff; only the low 8 bits are examined below
+                               if ((char2 & 0xC0) != 0x80)
+                                       throw new UTFDataFormatException(
+                                                       "malformed input around byte " + count); 
+                               chararr[chararr_count++]=(char)(((c & 0x1F) << 6) | 
+                                               (char2 & 0x3F));  
+                               break;
+                       case 14:
+                               /* 1110 xxxx  10xx xxxx  10xx xxxx: three bytes */
+                               count += 3; // advance first so the bounds check below happens before the continuation bytes are read
+                               if (count > target)
+                                       throw new UTFDataFormatException(
+                                                       "malformed input: partial character at end (" + (count-index) + " > " + utflen + ")");
+                               char2 = (int) bytearr[count-2];
+                               char3 = (int) bytearr[count-1];
+                               if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80))
+                                       throw new UTFDataFormatException(
+                                                       "malformed input around byte " + (count-1));
+                               chararr[chararr_count++]=(char)(((c     & 0x0F) << 12) |
+                                               ((char2 & 0x3F) << 6)  |
+                                               ((char3 & 0x3F) << 0));
+                               break;
+                       default:
+                               /* 10xx xxxx,  1111 xxxx: not a valid lead byte here */
+                               throw new UTFDataFormatException(
+                                               "malformed input around byte " + count);
                        }
-                       
                }
-               return new String(chars, 0, i);
+               // The number of chars produced may be less than utflen (each multi-byte sequence yields a single char)
+               return new String(chararr, 0, chararr_count);
        }
 
        final protected byte[] safeBytes(int amount) throws IOException {