gerrit.simantics Code Review - simantics/platform.git/commitdiff
Modified UTF-8 decoding fixes for TG reading and indexing 29/2029/1
author Tuukka Lehtonen <tuukka.lehtonen@semantum.fi>
Tue, 26 Jun 2018 08:28:36 +0000 (11:28 +0300)
committer Tuukka Lehtonen <tuukka.lehtonen@semantum.fi>
Sun, 26 Aug 2018 12:11:29 +0000 (12:11 +0000)
gitlab #33

Change-Id: Ibd4239b76d1d88fe4303bec9d7cc407d15e8505b
(cherry picked from commit 2bbecd30ee49d821a0abeaf6d417b5a7fdbe015f)

bundles/org.simantics.db.procore/src/fi/vtt/simantics/procore/internal/DirectQuerySupportImpl.java
bundles/org.simantics.graph/src/org/simantics/graph/representation/ByteFileReader.java

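diff --git a/bundles/org.simantics.db.procore/src/fi/vtt/simantics/procore/internal/DirectQuerySupportImpl.java b/bundles/org.simantics.db.procore/src/fi/vtt/simantics/procore/internal/DirectQuerySupportImpl.java
index a0f6c1d1eb3cd437a65a340083c4d1ed1f6c241b..d455861a3d68b0e2097ac67538318fa95e0c363a 100644
--- a/bundles/org.simantics.db.procore/src/fi/vtt/simantics/procore/internal/DirectQuerySupportImpl.java
+++ b/bundles/org.simantics.db.procore/src/fi/vtt/simantics/procore/internal/DirectQuerySupportImpl.java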
@@ -7,6 +7,7 @@ import org.simantics.db.RelationInfo;
 import org.simantics.db.Resource;
 import org.simantics.db.common.procedure.wrapper.NoneToAsyncProcedure;
 import org.simantics.db.common.procedure.wrapper.SyncToAsyncProcedure;
+import org.simantics.db.exception.AssumptionException;
 import org.simantics.db.exception.DatabaseException;
 import org.simantics.db.impl.ClusterI;
 import org.simantics.db.impl.ClusterI.ClusterTypeEnum;
@@ -927,8 +928,6 @@ public class DirectQuerySupportImpl implements DirectQuerySupport {
                
        }
        
-       private final char[] chars = new char[1024];
-       
        private <T> void getDirectValue4(final ReadGraphImpl graph, final ClusterSmall cluster, final int subject, final ForPossibleRelatedValueProcedure<T> procedure) {
 
                ResourceTableSmall rt = cluster.resourceTable;
@@ -942,22 +941,25 @@ public class DirectQuerySupportImpl implements DirectQuerySupport {
                int valueIndex = (int)(ls[index] >>> 24) & 0x3FFFFF + vt.offset;
 
                int size = (int)bs[valueIndex++]-1;
+               char[] chars = new char[size];
                valueIndex++;
                for(int i=0;i<size;i++) {
                        chars[i] = (char)bs[valueIndex++];
                }
 
-               T value = (T)new String(chars, 0, size);
+               T value = (T)new String(chars);
 
                procedure.execute(graph, value);
 //             graph.dec();
                
        }
 
-       final private String utf(byte[] bytes) {
-               
+       final private String utf(byte[] bytes) throws AssumptionException {
+
                if(bytes == null) return null;
-               
+
+               // Read databoard int32 using Length encoding
+               // https://dev.simantics.org/index.php/Databoard_Specification#Length
                int index = 0;
                int length = bytes[index++]&0xff; 
                if(length >= 0x80) {
@@ -991,42 +993,68 @@ public class DirectQuerySupportImpl implements DirectQuerySupport {
                                length += 0x80;
                        }
                }
-               
-               int i = 0;
-               int target = length+index;
-               while(index < target) {
-                       int c = bytes[index++]&0xff;
-                       if(c <= 0x7F) {
-                               chars[i++] = (char)(c&0x7F);
-                       } else if (c > 0x07FF) {
-                               int c2 = bytes[index++]&0xff;
-                               int c3 = bytes[index++]&0xff;
-                               chars[i++] = (char)(((c&0xf)<<12) + ((c2&0x3f)<<6) + (c3&0x3f)); 
-                       } else {
-                               int c2 = bytes[index++]&0xff;
-                               chars[i++] = (char)(((c&0x1f)<<6) + (c2&0x3f)); 
+
+               // Copied from DataInputStream
+               int utflen = length;
+               char[] chararr = new char[utflen];
+
+               int c, char2, char3;
+               int count = index;
+               int target = index + length;
+               int chararr_count=0;
+
+               while (count < target) {
+                       c = (int) bytes[count] & 0xff;
+                       if (c > 127) break;
+                       count++;
+                       chararr[chararr_count++]=(char)c;
+               }
+
+               while (count < target) {
+                       c = (int) bytes[count] & 0xff;
+                       switch (c >> 4) {
+                       case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
+                               /* 0xxxxxxx*/
+                               count++;
+                               chararr[chararr_count++]=(char)c;
+                               break;
+                       case 12: case 13:
+                               /* 110x xxxx   10xx xxxx*/
+                               count += 2;
+                               if (count > target)
+                                       throw new AssumptionException(
+                                                       "malformed input: partial character at end (" + (count-index) + " > " + utflen + ")");
+                               char2 = (int) bytes[count-1];
+                               if ((char2 & 0xC0) != 0x80)
+                                       throw new AssumptionException(
+                                                       "malformed input around byte " + count); 
+                               chararr[chararr_count++]=(char)(((c & 0x1F) << 6) | 
+                                               (char2 & 0x3F));  
+                               break;
+                       case 14:
+                               /* 1110 xxxx  10xx xxxx  10xx xxxx */
+                               count += 3;
+                               if (count > target)
+                                       throw new AssumptionException(
+                                                       "malformed input: partial character at end (" + (count-index) + " > " + utflen + ")");
+                               char2 = (int) bytes[count-2];
+                               char3 = (int) bytes[count-1];
+                               if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80))
+                                       throw new AssumptionException(
+                                                       "malformed input around byte " + (count-1));
+                               chararr[chararr_count++]=(char)(((c     & 0x0F) << 12) |
+                                               ((char2 & 0x3F) << 6)  |
+                                               ((char3 & 0x3F) << 0));
+                               break;
+                       default:
+                               /* 10xx xxxx,  1111 xxxx */
+                               throw new AssumptionException(
+                                               "malformed input around byte " + count);
                        }
-                       
-                       
-//                     if (!((c >= 0x0001) && (c <= 0x007F))) {
-//                     } else {
-//                     }
-//                     
-//                             if ((c >= 0x0001) && (c <= 0x007F)) {
-//                                     bytearr[byteIndex++] = (byte)( c );
-//                             } else if (c > 0x07FF) {
-//                                     bytearr[byteIndex++] = (byte)(0xE0 | ((c >> 12) & 0x0F));
-//                                     bytearr[byteIndex++] = (byte)(0x80 | ((c >>  6) & 0x3F));
-//                                     bytearr[byteIndex++] = (byte)(0x80 | ((c >>  0) & 0x3F));
-//                             } else {
-//                                     bytearr[byteIndex++] = (byte)(0xC0 | ((c >>  6) & 0x1F));
-//                                     bytearr[byteIndex++] = (byte)(0x80 | ((c >>  0) & 0x3F));
-//                             }
-//                     }
-                       
-                       
                }
-               return new String(chars, 0, i);
+
+               // The number of chars produced may be less than utflen
+               return new String(chararr, 0, chararr_count);
        }
-       
+
 }
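diff --git a/bundles/org.simantics.graph/src/org/simantics/graph/representation/ByteFileReader.java b/bundles/org.simantics.graph/src/org/simantics/graph/representation/ByteFileReader.java
index 4c14bded6639fbf0662e6548f486d38dd27be912..bb07602c8b047ed43405d0768ce09d5ed9a30d19 100644
--- a/bundles/org.simantics.graph/src/org/simantics/graph/representation/ByteFileReader.java
+++ b/bundles/org.simantics.graph/src/org/simantics/graph/representation/ByteFileReader.java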
@@ -6,6 +6,7 @@ import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.UTFDataFormatException;
 import java.nio.ByteBuffer;
 import java.nio.channels.ReadableByteChannel;
 
@@ -49,23 +50,67 @@ public class ByteFileReader implements Closeable {
                return bytes;
 
        }
-       final protected String utf(byte[] bytes, int index, int target) {
-               int i = 0;
-               while(index < target) {
-                       int c = bytes[index++]&0xff;
-                       if(c <= 0x7F) {
-                               chars[i++] = (char)(c&0x7F);
-                       } else if (c > 0x07FF) {
-                               int c2 = bytes[index++]&0xff;
-                               int c3 = bytes[index++]&0xff;
-                               chars[i++] = (char)(((c&0xf)<<12) + ((c2&0x3f)<<6) + (c3&0x3f)); 
-                       } else {
-                               int c2 = bytes[index++]&0xff;
-                               chars[i++] = (char)(((c&0x1f)<<6) + (c2&0x3f)); 
+
+       final protected String utf(byte[] bytearr, int index, int target) throws UTFDataFormatException {
+               // Copied from DataInputStream
+               int utflen = target - index;
+               char[] chararr = utflen > chars.length ? new char[utflen] : chars;
+
+               int c, char2, char3;
+               int count = index;
+               int chararr_count=0;
+
+               while (count < target) {
+                       c = (int) bytearr[count] & 0xff;
+                       if (c > 127) break;
+                       count++;
+                       chararr[chararr_count++]=(char)c;
+               }
+
+               while (count < target) {
+                       c = (int) bytearr[count] & 0xff;
+                       switch (c >> 4) {
+                       case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
+                               /* 0xxxxxxx*/
+                               count++;
+                               chararr[chararr_count++]=(char)c;
+                               break;
+                       case 12: case 13:
+                               /* 110x xxxx   10xx xxxx*/
+                               count += 2;
+                               if (count > target)
+                                       throw new UTFDataFormatException(
+                                                       "malformed input: partial character at end (" + (count-index) + " > " + utflen + ")");
+                               char2 = (int) bytearr[count-1];
+                               if ((char2 & 0xC0) != 0x80)
+                                       throw new UTFDataFormatException(
+                                                       "malformed input around byte " + count); 
+                               chararr[chararr_count++]=(char)(((c & 0x1F) << 6) | 
+                                               (char2 & 0x3F));  
+                               break;
+                       case 14:
+                               /* 1110 xxxx  10xx xxxx  10xx xxxx */
+                               count += 3;
+                               if (count > target)
+                                       throw new UTFDataFormatException(
+                                                       "malformed input: partial character at end (" + (count-index) + " > " + utflen + ")");
+                               char2 = (int) bytearr[count-2];
+                               char3 = (int) bytearr[count-1];
+                               if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80))
+                                       throw new UTFDataFormatException(
+                                                       "malformed input around byte " + (count-1));
+                               chararr[chararr_count++]=(char)(((c     & 0x0F) << 12) |
+                                               ((char2 & 0x3F) << 6)  |
+                                               ((char3 & 0x3F) << 0));
+                               break;
+                       default:
+                               /* 10xx xxxx,  1111 xxxx */
+                               throw new UTFDataFormatException(
+                                               "malformed input around byte " + count);
                        }
-                       
                }
-               return new String(chars, 0, i);
+               // The number of chars produced may be less than utflen
+               return new String(chararr, 0, chararr_count);
        }
 
        final protected byte[] safeBytes(int amount) throws IOException {