package org.simantics.databoard.util.binary;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.UTFDataFormatException;
import java.nio.charset.Charset;
/**
 * Utilities for handling Standard-UTF8 and Modified-UTF8 strings.
 *
 * The differences between standard UTF-8 and Modified UTF-8 are the following:
 * <ul>
 * <li>The null character (U+0000) is encoded in 2-byte format
 *     rather than 1-byte, so that the encoded strings never have
 *     embedded nulls.</li>
 * <li>Only the 1-byte, 2-byte, and 3-byte formats are used.</li>
 * <li>Supplementary characters
 *     are represented in the form of surrogate pairs.</li>
 * </ul>
 */
public class UTF8 {

    /** Charset used by the Standard-UTF8 read/write methods of this class. */
    public static final Charset CHARSET = Charset.forName("utf-8");

    /**
     * Get the number of bytes in the standard UTF-8 encoding of a string.
     *
     * The result equals {@code string.getBytes(CHARSET).length}, i.e. the
     * number of bytes that {@link #writeUTF(DataOutput, String)} writes,
     * but is computed without allocating the encoded byte array.
     *
     * @param string string to measure
     * @return byte length
     */
    public static int getUTF8EncodingByteLength(String string)
    {
        // http://en.wikipedia.org/wiki/UTF-8
        int result = 0;
        int length = string.length();
        for (int i = 0; i < length; i++) {
            char c = string.charAt(i);
            if (c <= 0x7F) {
                result += 1;
            } else if (c <= 0x07FF) {
                result += 2;
            } else if (Character.isHighSurrogate(c)
                    && i + 1 < length
                    && Character.isLowSurrogate(string.charAt(i + 1))) {
                // A valid surrogate pair is one supplementary code point,
                // which standard UTF-8 encodes in 4 bytes. Consume both chars.
                result += 4;
                i++;
            } else if (c >= Character.MIN_SURROGATE && c <= Character.MAX_SURROGATE) {
                // Unpaired surrogate: String.getBytes(Charset) substitutes the
                // charset's single replacement byte for it.
                result += 1;
            } else {
                result += 3;
            }
        }
        return result;
    }

    /**
     * Get the number of bytes in the Modified-UTF-8 encoding of a string.
     *
     * Every char is counted on its own: 1 byte for 0x0001..0x007F, 3 bytes
     * for chars above 0x07FF, and 2 bytes otherwise (which includes the
     * null character).
     *
     * @param str string to measure
     * @return byte length
     */
    public static int getModifiedUTF8EncodingByteLength(String str)
    {
        int strlen = str.length();
        int utflen = 0;
        /* use charAt instead of copying String to char array */
        for (int i = 0; i < strlen; i++) {
            int c = str.charAt(i);
            if ((c >= 0x0001) && (c <= 0x007F)) {
                utflen++;
            } else if (c > 0x07FF) {
                utflen += 3;
            } else {
                utflen += 2;
            }
        }
        return utflen;
    }

    /**
     * Write Modified-UTF8 to a stream.
     *
     * Only the encoded bytes are written, one {@code out.write(int)} call
     * per byte; no length prefix is emitted.
     *
     * @param out output stream
     * @param str string
     * @throws IOException if writing to {@code out} fails
     */
    public static void writeModifiedUTF(DataOutput out, String str) throws IOException {
        int strlen = str.length();
        for (int i = 0; i < strlen; i++) {
            int c = str.charAt(i);
            if ((c >= 0x0001) && (c <= 0x007F)) {
                out.write(c);
            } else if (c > 0x07FF) {
                out.write(0xE0 | ((c >> 12) & 0x0F));
                out.write(0x80 | ((c >>  6) & 0x3F));
                out.write(0x80 | ((c >>  0) & 0x3F));
            } else {
                // 0x0080..0x07FF and the null character use the 2-byte form
                out.write(0xC0 | ((c >>  6) & 0x1F));
                out.write(0x80 | ((c >>  0) & 0x3F));
            }
        }
    }

    /**
     * Read Modified-UTF8 from a stream.
     *
     * Exactly {@code utflen} bytes are read from the input before decoding.
     *
     * @param in input
     * @param utflen number of bytes
     * @return string
     * @throws UTFDataFormatException if the bytes are not valid Modified-UTF8
     * @throws IOException if reading from {@code in} fails
     */
    public static String readModifiedUTF(DataInput in, int utflen)
    throws IOException, UTFDataFormatException
    {
        byte[] bytearr = new byte[utflen];
        // Each char consumes at least one byte, so utflen chars is an upper bound
        char[] chararr = new char[utflen];

        int c, char2, char3;
        int count = 0;
        int chararr_count = 0;

        in.readFully(bytearr, 0, utflen);

        while (count < utflen) {
            c = bytearr[count] & 0xff;
            switch (c >> 4) {
                case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
                    /* 0xxxxxxx*/
                    count++;
                    chararr[chararr_count++] = (char) c;
                    break;
                case 12: case 13:
                    /* 110x xxxx   10xx xxxx*/
                    count += 2;
                    if (count > utflen)
                        throw new UTFDataFormatException(
                            "malformed input: partial character at end");
                    char2 = bytearr[count - 1];
                    if ((char2 & 0xC0) != 0x80)
                        throw new UTFDataFormatException(
                            "malformed input around byte " + count);
                    chararr[chararr_count++] = (char) (((c & 0x1F) << 6) |
                                                       (char2 & 0x3F));
                    break;
                case 14:
                    /* 1110 xxxx  10xx xxxx  10xx xxxx */
                    count += 3;
                    if (count > utflen)
                        throw new UTFDataFormatException(
                            "malformed input: partial character at end");
                    char2 = bytearr[count - 2];
                    char3 = bytearr[count - 1];
                    if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80))
                        throw new UTFDataFormatException(
                            "malformed input around byte " + (count - 1));
                    chararr[chararr_count++] = (char) (((c     & 0x0F) << 12) |
                                                       ((char2 & 0x3F) << 6)  |
                                                       ((char3 & 0x3F) << 0));
                    break;
                default:
                    /* 10xx xxxx,  1111 xxxx */
                    throw new UTFDataFormatException(
                        "malformed input around byte " + count);
            }
        }
        // The number of chars produced may be less than utflen
        return new String(chararr, 0, chararr_count);
    }

    /**
     * Write Standard-UTF8 to a stream.
     *
     * Only the encoded bytes are written; no length prefix is emitted.
     *
     * @param out output stream
     * @param str string
     * @throws IOException if writing to {@code out} fails
     */
    public static void writeUTF(DataOutput out, String str)
    throws IOException
    {
        byte[] bytes = str.getBytes(CHARSET);
        out.write(bytes);
    }

    /**
     * Read Standard-UTF8 from a stream.
     *
     * @param in input
     * @param len number of bytes
     * @return string
     * @throws IOException if reading from {@code in} fails
     */
    public static String readUTF(DataInput in, int len)
    throws IOException
    {
        byte[] bytes = new byte[len];
        in.readFully(bytes);
        return new String(bytes, UTF8.CHARSET);
    }

}