Modified UTF-8 decoding fixes for TG reading and indexing 80/1880/3
author Tuukka Lehtonen <tuukka.lehtonen@semantum.fi>
Tue, 26 Jun 2018 08:28:36 +0000 (11:28 +0300)
committer Tuukka Lehtonen <tuukka.lehtonen@semantum.fi>
Tue, 26 Jun 2018 09:30:11 +0000 (12:30 +0300)
gitlab #33

Change-Id: Ibd4239b76d1d88fe4303bec9d7cc407d15e8505b

bundles/org.simantics.db.procore/src/fi/vtt/simantics/procore/internal/DirectQuerySupportImpl.java
bundles/org.simantics.graph/src/org/simantics/graph/representation/ByteFileReader.java

index 7f27a0261a0dd97f45cca02e79da1e5805fcd2af..0b640861e1d6af93255b25114c429b2f374ce54f 100644 (file)
@@ -7,6 +7,7 @@ import org.simantics.db.RelationInfo;
 import org.simantics.db.Resource;
 import org.simantics.db.common.procedure.wrapper.NoneToAsyncProcedure;
 import org.simantics.db.common.procedure.wrapper.SyncToAsyncProcedure;
+import org.simantics.db.exception.AssumptionException;
 import org.simantics.db.exception.DatabaseException;
 import org.simantics.db.exception.NoSingleResultException;
 import org.simantics.db.impl.ClusterI;
@@ -932,8 +933,6 @@ public class DirectQuerySupportImpl implements DirectQuerySupport {
                
        }
        
-       private final char[] chars = new char[1024];
-       
        private <T> void getDirectValue4(final ReadGraphImpl graph, final ClusterSmall cluster, final int subject, final ForPossibleRelatedValueProcedure<T> procedure) {
 
                ResourceTableSmall rt = cluster.resourceTable;
@@ -947,22 +946,25 @@ public class DirectQuerySupportImpl implements DirectQuerySupport {
                int valueIndex = (int)(ls[index] >>> 24) & 0x3FFFFF + vt.offset;
 
                int size = (int)bs[valueIndex++]-1;
+               char[] chars = new char[size];
                valueIndex++;
                for(int i=0;i<size;i++) {
                        chars[i] = (char)bs[valueIndex++];
                }
 
-               T value = (T)new String(chars, 0, size);
+               T value = (T)new String(chars);
 
                procedure.execute(graph, value);
 //             graph.dec();
                
        }
 
-       final private String utf(byte[] bytes) {
-               
+       final private String utf(byte[] bytes) throws AssumptionException {
+
                if(bytes == null) return null;
-               
+
+               // Read databoard int32 using Length encoding
+               // https://dev.simantics.org/index.php/Databoard_Specification#Length
                int index = 0;
                int length = bytes[index++]&0xff; 
                if(length >= 0x80) {
@@ -996,42 +998,68 @@ public class DirectQuerySupportImpl implements DirectQuerySupport {
                                length += 0x80;
                        }
                }
-               
-               int i = 0;
-               int target = length+index;
-               while(index < target) {
-                       int c = bytes[index++]&0xff;
-                       if(c <= 0x7F) {
-                               chars[i++] = (char)(c&0x7F);
-                       } else if (c > 0x07FF) {
-                               int c2 = bytes[index++]&0xff;
-                               int c3 = bytes[index++]&0xff;
-                               chars[i++] = (char)(((c&0xf)<<12) + ((c2&0x3f)<<6) + (c3&0x3f)); 
-                       } else {
-                               int c2 = bytes[index++]&0xff;
-                               chars[i++] = (char)(((c&0x1f)<<6) + (c2&0x3f)); 
+
+               // Copied from DataInputStream
+               int utflen = length;
+               char[] chararr = new char[utflen];
+
+               int c, char2, char3;
+               int count = index;
+               int target = index + length;
+               int chararr_count=0;
+
+               while (count < target) {
+                       c = (int) bytes[count] & 0xff;
+                       if (c > 127) break;
+                       count++;
+                       chararr[chararr_count++]=(char)c;
+               }
+
+               while (count < target) {
+                       c = (int) bytes[count] & 0xff;
+                       switch (c >> 4) {
+                       case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
+                               /* 0xxxxxxx*/
+                               count++;
+                               chararr[chararr_count++]=(char)c;
+                               break;
+                       case 12: case 13:
+                               /* 110x xxxx   10xx xxxx*/
+                               count += 2;
+                               if (count > target)
+                                       throw new AssumptionException(
+                                                       "malformed input: partial character at end (" + (count-index) + " > " + utflen + ")");
+                               char2 = (int) bytes[count-1];
+                               if ((char2 & 0xC0) != 0x80)
+                                       throw new AssumptionException(
+                                                       "malformed input around byte " + count); 
+                               chararr[chararr_count++]=(char)(((c & 0x1F) << 6) | 
+                                               (char2 & 0x3F));  
+                               break;
+                       case 14:
+                               /* 1110 xxxx  10xx xxxx  10xx xxxx */
+                               count += 3;
+                               if (count > target)
+                                       throw new AssumptionException(
+                                                       "malformed input: partial character at end (" + (count-index) + " > " + utflen + ")");
+                               char2 = (int) bytes[count-2];
+                               char3 = (int) bytes[count-1];
+                               if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80))
+                                       throw new AssumptionException(
+                                                       "malformed input around byte " + (count-1));
+                               chararr[chararr_count++]=(char)(((c     & 0x0F) << 12) |
+                                               ((char2 & 0x3F) << 6)  |
+                                               ((char3 & 0x3F) << 0));
+                               break;
+                       default:
+                               /* 10xx xxxx,  1111 xxxx */
+                               throw new AssumptionException(
+                                               "malformed input around byte " + count);
                        }
-                       
-                       
-//                     if (!((c >= 0x0001) && (c <= 0x007F))) {
-//                     } else {
-//                     }
-//                     
-//                             if ((c >= 0x0001) && (c <= 0x007F)) {
-//                                     bytearr[byteIndex++] = (byte)( c );
-//                             } else if (c > 0x07FF) {
-//                                     bytearr[byteIndex++] = (byte)(0xE0 | ((c >> 12) & 0x0F));
-//                                     bytearr[byteIndex++] = (byte)(0x80 | ((c >>  6) & 0x3F));
-//                                     bytearr[byteIndex++] = (byte)(0x80 | ((c >>  0) & 0x3F));
-//                             } else {
-//                                     bytearr[byteIndex++] = (byte)(0xC0 | ((c >>  6) & 0x1F));
-//                                     bytearr[byteIndex++] = (byte)(0x80 | ((c >>  0) & 0x3F));
-//                             }
-//                     }
-                       
-                       
                }
-               return new String(chars, 0, i);
+
+               // The number of chars produced may be less than utflen
+               return new String(chararr, 0, chararr_count);
        }
-       
+
 }
index e930a00fcad35b9c36bc323bd119c165f8f495dc..3a76a911a0d3c269c4c72cf5053b742aa8579432 100644 (file)
@@ -6,6 +6,7 @@ import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.UTFDataFormatException;
 import java.nio.ByteBuffer;
 import java.nio.channels.ReadableByteChannel;
 
@@ -49,23 +50,67 @@ public class ByteFileReader implements Closeable {
                return bytes;
 
        }
-       final protected String utf(byte[] bytes, int index, int target) {
-               int i = 0;
-               while(index < target) {
-                       int c = bytes[index++]&0xff;
-                       if(c <= 0x7F) {
-                               chars[i++] = (char)(c&0x7F);
-                       } else if (c > 0x07FF) {
-                               int c2 = bytes[index++]&0xff;
-                               int c3 = bytes[index++]&0xff;
-                               chars[i++] = (char)(((c&0xf)<<12) + ((c2&0x3f)<<6) + (c3&0x3f)); 
-                       } else {
-                               int c2 = bytes[index++]&0xff;
-                               chars[i++] = (char)(((c&0x1f)<<6) + (c2&0x3f)); 
+
+       final protected String utf(byte[] bytearr, int index, int target) throws UTFDataFormatException {
+               // Copied from DataInputStream
+               int utflen = target - index;
+               char[] chararr = utflen > chars.length ? new char[utflen] : chars;
+
+               int c, char2, char3;
+               int count = index;
+               int chararr_count=0;
+
+               while (count < target) {
+                       c = (int) bytearr[count] & 0xff;
+                       if (c > 127) break;
+                       count++;
+                       chararr[chararr_count++]=(char)c;
+               }
+
+               while (count < target) {
+                       c = (int) bytearr[count] & 0xff;
+                       switch (c >> 4) {
+                       case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
+                               /* 0xxxxxxx*/
+                               count++;
+                               chararr[chararr_count++]=(char)c;
+                               break;
+                       case 12: case 13:
+                               /* 110x xxxx   10xx xxxx*/
+                               count += 2;
+                               if (count > target)
+                                       throw new UTFDataFormatException(
+                                                       "malformed input: partial character at end (" + (count-index) + " > " + utflen + ")");
+                               char2 = (int) bytearr[count-1];
+                               if ((char2 & 0xC0) != 0x80)
+                                       throw new UTFDataFormatException(
+                                                       "malformed input around byte " + count); 
+                               chararr[chararr_count++]=(char)(((c & 0x1F) << 6) | 
+                                               (char2 & 0x3F));  
+                               break;
+                       case 14:
+                               /* 1110 xxxx  10xx xxxx  10xx xxxx */
+                               count += 3;
+                               if (count > target)
+                                       throw new UTFDataFormatException(
+                                                       "malformed input: partial character at end (" + (count-index) + " > " + utflen + ")");
+                               char2 = (int) bytearr[count-2];
+                               char3 = (int) bytearr[count-1];
+                               if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80))
+                                       throw new UTFDataFormatException(
+                                                       "malformed input around byte " + (count-1));
+                               chararr[chararr_count++]=(char)(((c     & 0x0F) << 12) |
+                                               ((char2 & 0x3F) << 6)  |
+                                               ((char3 & 0x3F) << 0));
+                               break;
+                       default:
+                               /* 10xx xxxx,  1111 xxxx */
+                               throw new UTFDataFormatException(
+                                               "malformed input around byte " + count);
                        }
-                       
                }
-               return new String(chars, 0, i);
+               // The number of chars produced may be less than utflen
+               return new String(chararr, 0, chararr_count);
        }
 
        final protected byte[] safeBytes(int amount) throws IOException {