gerrit.simantics Code Review - simantics/platform.git/blobdiff - bundles/org.simantics.db.procore/src/fi/vtt/simantics/procore/internal/DirectQuerySupportImpl.java
Modified UTF-8 decoding fixes for TG reading and indexing
[simantics/platform.git] / bundles / org.simantics.db.procore / src / fi / vtt / simantics / procore / internal / DirectQuerySupportImpl.java
index 7f27a0261a0dd97f45cca02e79da1e5805fcd2af..0b640861e1d6af93255b25114c429b2f374ce54f 100644 (file)
@@ -7,6 +7,7 @@ import org.simantics.db.RelationInfo;
 import org.simantics.db.Resource;
 import org.simantics.db.common.procedure.wrapper.NoneToAsyncProcedure;
 import org.simantics.db.common.procedure.wrapper.SyncToAsyncProcedure;
+import org.simantics.db.exception.AssumptionException;
 import org.simantics.db.exception.DatabaseException;
 import org.simantics.db.exception.NoSingleResultException;
 import org.simantics.db.impl.ClusterI;
@@ -932,8 +933,6 @@ public class DirectQuerySupportImpl implements DirectQuerySupport {
                
        }
        
-       private final char[] chars = new char[1024];
-       
        private <T> void getDirectValue4(final ReadGraphImpl graph, final ClusterSmall cluster, final int subject, final ForPossibleRelatedValueProcedure<T> procedure) {
 
                ResourceTableSmall rt = cluster.resourceTable;
@@ -947,22 +946,25 @@ public class DirectQuerySupportImpl implements DirectQuerySupport {
                int valueIndex = (int)(ls[index] >>> 24) & 0x3FFFFF + vt.offset;
 
                int size = (int)bs[valueIndex++]-1;
+               char[] chars = new char[size];
                valueIndex++;
                for(int i=0;i<size;i++) {
                        chars[i] = (char)bs[valueIndex++];
                }
 
-               T value = (T)new String(chars, 0, size);
+               T value = (T)new String(chars);
 
                procedure.execute(graph, value);
 //             graph.dec();
                
        }
 
-       final private String utf(byte[] bytes) {
-               
+       final private String utf(byte[] bytes) throws AssumptionException {
+
                if(bytes == null) return null;
-               
+
+               // Read databoard int32 using Length encoding
+               // https://dev.simantics.org/index.php/Databoard_Specification#Length
                int index = 0;
                int length = bytes[index++]&0xff; 
                if(length >= 0x80) {
@@ -996,42 +998,68 @@ public class DirectQuerySupportImpl implements DirectQuerySupport {
                                length += 0x80;
                        }
                }
-               
-               int i = 0;
-               int target = length+index;
-               while(index < target) {
-                       int c = bytes[index++]&0xff;
-                       if(c <= 0x7F) {
-                               chars[i++] = (char)(c&0x7F);
-                       } else if (c > 0x07FF) {
-                               int c2 = bytes[index++]&0xff;
-                               int c3 = bytes[index++]&0xff;
-                               chars[i++] = (char)(((c&0xf)<<12) + ((c2&0x3f)<<6) + (c3&0x3f)); 
-                       } else {
-                               int c2 = bytes[index++]&0xff;
-                               chars[i++] = (char)(((c&0x1f)<<6) + (c2&0x3f)); 
+
+               // Copied from DataInputStream
+               int utflen = length;
+               char[] chararr = new char[utflen];
+
+               int c, char2, char3;
+               int count = index;
+               int target = index + length;
+               int chararr_count=0;
+
+               while (count < target) {
+                       c = (int) bytes[count] & 0xff;
+                       if (c > 127) break;
+                       count++;
+                       chararr[chararr_count++]=(char)c;
+               }
+
+               while (count < target) {
+                       c = (int) bytes[count] & 0xff;
+                       switch (c >> 4) {
+                       case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
+                               /* 0xxxxxxx*/
+                               count++;
+                               chararr[chararr_count++]=(char)c;
+                               break;
+                       case 12: case 13:
+                               /* 110x xxxx   10xx xxxx*/
+                               count += 2;
+                               if (count > target)
+                                       throw new AssumptionException(
+                                                       "malformed input: partial character at end (" + (count-index) + " > " + utflen + ")");
+                               char2 = (int) bytes[count-1];
+                               if ((char2 & 0xC0) != 0x80)
+                                       throw new AssumptionException(
+                                                       "malformed input around byte " + count); 
+                               chararr[chararr_count++]=(char)(((c & 0x1F) << 6) | 
+                                               (char2 & 0x3F));  
+                               break;
+                       case 14:
+                               /* 1110 xxxx  10xx xxxx  10xx xxxx */
+                               count += 3;
+                               if (count > target)
+                                       throw new AssumptionException(
+                                                       "malformed input: partial character at end (" + (count-index) + " > " + utflen + ")");
+                               char2 = (int) bytes[count-2];
+                               char3 = (int) bytes[count-1];
+                               if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80))
+                                       throw new AssumptionException(
+                                                       "malformed input around byte " + (count-1));
+                               chararr[chararr_count++]=(char)(((c     & 0x0F) << 12) |
+                                               ((char2 & 0x3F) << 6)  |
+                                               ((char3 & 0x3F) << 0));
+                               break;
+                       default:
+                               /* 10xx xxxx,  1111 xxxx */
+                               throw new AssumptionException(
+                                               "malformed input around byte " + count);
                        }
-                       
-                       
-//                     if (!((c >= 0x0001) && (c <= 0x007F))) {
-//                     } else {
-//                     }
-//                     
-//                             if ((c >= 0x0001) && (c <= 0x007F)) {
-//                                     bytearr[byteIndex++] = (byte)( c );
-//                             } else if (c > 0x07FF) {
-//                                     bytearr[byteIndex++] = (byte)(0xE0 | ((c >> 12) & 0x0F));
-//                                     bytearr[byteIndex++] = (byte)(0x80 | ((c >>  6) & 0x3F));
-//                                     bytearr[byteIndex++] = (byte)(0x80 | ((c >>  0) & 0x3F));
-//                             } else {
-//                                     bytearr[byteIndex++] = (byte)(0xC0 | ((c >>  6) & 0x1F));
-//                                     bytearr[byteIndex++] = (byte)(0x80 | ((c >>  0) & 0x3F));
-//                             }
-//                     }
-                       
-                       
                }
-               return new String(chars, 0, i);
+
+               // The number of chars produced may be less than utflen
+               return new String(chararr, 0, chararr_count);
        }
-       
+
 }