From f0ba956951599a4107cdcb7d20cf2dae784804e1 Mon Sep 17 00:00:00 2001 From: Tuukka Lehtonen Date: Tue, 26 Jun 2018 11:28:36 +0300 Subject: [PATCH] Modified UTF-8 decoding fixes for TG reading and indexing gitlab #33 Change-Id: Ibd4239b76d1d88fe4303bec9d7cc407d15e8505b --- .../internal/DirectQuerySupportImpl.java | 110 +++++++++++------- .../graph/representation/ByteFileReader.java | 75 +++++++++--- 2 files changed, 129 insertions(+), 56 deletions(-) diff --git a/bundles/org.simantics.db.procore/src/fi/vtt/simantics/procore/internal/DirectQuerySupportImpl.java b/bundles/org.simantics.db.procore/src/fi/vtt/simantics/procore/internal/DirectQuerySupportImpl.java index 7f27a0261..0b640861e 100644 --- a/bundles/org.simantics.db.procore/src/fi/vtt/simantics/procore/internal/DirectQuerySupportImpl.java +++ b/bundles/org.simantics.db.procore/src/fi/vtt/simantics/procore/internal/DirectQuerySupportImpl.java @@ -7,6 +7,7 @@ import org.simantics.db.RelationInfo; import org.simantics.db.Resource; import org.simantics.db.common.procedure.wrapper.NoneToAsyncProcedure; import org.simantics.db.common.procedure.wrapper.SyncToAsyncProcedure; +import org.simantics.db.exception.AssumptionException; import org.simantics.db.exception.DatabaseException; import org.simantics.db.exception.NoSingleResultException; import org.simantics.db.impl.ClusterI; @@ -932,8 +933,6 @@ public class DirectQuerySupportImpl implements DirectQuerySupport { } - private final char[] chars = new char[1024]; - private void getDirectValue4(final ReadGraphImpl graph, final ClusterSmall cluster, final int subject, final ForPossibleRelatedValueProcedure procedure) { ResourceTableSmall rt = cluster.resourceTable; @@ -947,22 +946,25 @@ public class DirectQuerySupportImpl implements DirectQuerySupport { int valueIndex = (int)(ls[index] >>> 24) & 0x3FFFFF + vt.offset; int size = (int)bs[valueIndex++]-1; + char[] chars = new char[size]; valueIndex++; for(int i=0;i= 0x80) { @@ -996,42 +998,68 @@ public class DirectQuerySupportImpl implements DirectQuerySupport { length += 0x80; } } - - int i = 0; - int target = length+index; - while(index < target) { - int c = bytes[index++]&0xff; - if(c <= 0x7F) { - chars[i++] = (char)(c&0x7F); - } else if (c > 0x07FF) { - int c2 = bytes[index++]&0xff; - int c3 = bytes[index++]&0xff; - chars[i++] = (char)(((c&0xf)<<12) + ((c2&0x3f)<<6) + (c3&0x3f)); - } else { - int c2 = bytes[index++]&0xff; - chars[i++] = (char)(((c&0x1f)<<6) + (c2&0x3f)); + + // Copied from DataInputStream + int utflen = length; + char[] chararr = new char[utflen]; + + int c, char2, char3; + int count = index; + int target = index + length; + int chararr_count=0; + + while (count < target) { + c = (int) bytes[count] & 0xff; + if (c > 127) break; + count++; + chararr[chararr_count++]=(char)c; + } + + while (count < target) { + c = (int) bytes[count] & 0xff; + switch (c >> 4) { + case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7: + /* 0xxxxxxx*/ + count++; + chararr[chararr_count++]=(char)c; + break; + case 12: case 13: + /* 110x xxxx 10xx xxxx*/ + count += 2; + if (count > target) + throw new AssumptionException( + "malformed input: partial character at end (" + (count-index) + " > " + utflen + ")"); + char2 = (int) bytes[count-1]; + if ((char2 & 0xC0) != 0x80) + throw new AssumptionException( + "malformed input around byte " + count); + chararr[chararr_count++]=(char)(((c & 0x1F) << 6) | + (char2 & 0x3F)); + break; + case 14: + /* 1110 xxxx 10xx xxxx 10xx xxxx */ + count += 3; + if (count > target) + throw new AssumptionException( + "malformed input: partial character at end (" + (count-index) + " > " + utflen + ")"); + char2 = (int) bytes[count-2]; + char3 = (int) bytes[count-1]; + if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80)) + throw new AssumptionException( + "malformed input around byte " + (count-1)); + chararr[chararr_count++]=(char)(((c & 0x0F) << 12) | + ((char2 & 0x3F) << 6) | + ((char3 & 0x3F) << 0)); + break; + default: + /* 10xx xxxx, 1111 xxxx */ + throw new AssumptionException( + "malformed input around byte " + count); } - - -// if (!((c >= 0x0001) && (c <= 0x007F))) { -// } else { -// } -// -// if ((c >= 0x0001) && (c <= 0x007F)) { -// bytearr[byteIndex++] = (byte)( c ); -// } else if (c > 0x07FF) { -// bytearr[byteIndex++] = (byte)(0xE0 | ((c >> 12) & 0x0F)); -// bytearr[byteIndex++] = (byte)(0x80 | ((c >> 6) & 0x3F)); -// bytearr[byteIndex++] = (byte)(0x80 | ((c >> 0) & 0x3F)); -// } else { -// bytearr[byteIndex++] = (byte)(0xC0 | ((c >> 6) & 0x1F)); -// bytearr[byteIndex++] = (byte)(0x80 | ((c >> 0) & 0x3F)); -// } -// } - - } - return new String(chars, 0, i); + + // The number of chars produced may be less than utflen + return new String(chararr, 0, chararr_count); } - + } diff --git a/bundles/org.simantics.graph/src/org/simantics/graph/representation/ByteFileReader.java b/bundles/org.simantics.graph/src/org/simantics/graph/representation/ByteFileReader.java index e930a00fc..3a76a911a 100644 --- a/bundles/org.simantics.graph/src/org/simantics/graph/representation/ByteFileReader.java +++ b/bundles/org.simantics.graph/src/org/simantics/graph/representation/ByteFileReader.java @@ -6,6 +6,7 @@ import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; +import java.io.UTFDataFormatException; import java.nio.ByteBuffer; import java.nio.channels.ReadableByteChannel; @@ -49,23 +50,67 @@ public class ByteFileReader implements Closeable { return bytes; } - final protected String utf(byte[] bytes, int index, int target) { - int i = 0; - while(index < target) { - int c = bytes[index++]&0xff; - if(c <= 0x7F) { - chars[i++] = (char)(c&0x7F); - } else if (c > 0x07FF) { - int c2 = bytes[index++]&0xff; - int c3 = bytes[index++]&0xff; - chars[i++] = (char)(((c&0xf)<<12) + ((c2&0x3f)<<6) + (c3&0x3f)); - } else { - int c2 = bytes[index++]&0xff; - chars[i++] = (char)(((c&0x1f)<<6) + (c2&0x3f)); + + final protected String utf(byte[] bytearr, int index, int target) throws UTFDataFormatException { + // Copied from DataInputStream + int utflen = target - index; + char[] chararr = utflen > chars.length ? new char[utflen] : chars; + + int c, char2, char3; + int count = index; + int chararr_count=0; + + while (count < target) { + c = (int) bytearr[count] & 0xff; + if (c > 127) break; + count++; + chararr[chararr_count++]=(char)c; + } + + while (count < target) { + c = (int) bytearr[count] & 0xff; + switch (c >> 4) { + case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7: + /* 0xxxxxxx*/ + count++; + chararr[chararr_count++]=(char)c; + break; + case 12: case 13: + /* 110x xxxx 10xx xxxx*/ + count += 2; + if (count > target) + throw new UTFDataFormatException( + "malformed input: partial character at end (" + (count-index) + " > " + utflen + ")"); + char2 = (int) bytearr[count-1]; + if ((char2 & 0xC0) != 0x80) + throw new UTFDataFormatException( + "malformed input around byte " + count); + chararr[chararr_count++]=(char)(((c & 0x1F) << 6) | + (char2 & 0x3F)); + break; + case 14: + /* 1110 xxxx 10xx xxxx 10xx xxxx */ + count += 3; + if (count > target) + throw new UTFDataFormatException( + "malformed input: partial character at end (" + (count-index) + " > " + utflen + ")"); + char2 = (int) bytearr[count-2]; + char3 = (int) bytearr[count-1]; + if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80)) + throw new UTFDataFormatException( + "malformed input around byte " + (count-1)); + chararr[chararr_count++]=(char)(((c & 0x0F) << 12) | + ((char2 & 0x3F) << 6) | + ((char3 & 0x3F) << 0)); + break; + default: + /* 10xx xxxx, 1111 xxxx */ + throw new UTFDataFormatException( + "malformed input around byte " + count); } - } - return new String(chars, 0, i); + // The number of chars produced may be less than utflen + return new String(chararr, 0, chararr_count); } final protected byte[] safeBytes(int amount) throws IOException { -- 2.47.1