bundles/org.simantics.databoard/cpp/DataBoardTest/libantlr3c-3.2/src/antlr3convertutf.c

   1 /*
   2  * Copyright 2001-2004 Unicode, Inc.
   3  *
   4  * Disclaimer
   5  *
   6  * This source code is provided as is by Unicode, Inc. No claims are
   7  * made as to fitness for any particular purpose. No warranties of any
   8  * kind are expressed or implied. The recipient agrees to determine
   9  * applicability of information provided. If this file has been
  10  * purchased on magnetic or optical media from Unicode, Inc., the
  11  * sole remedy for any claim will be exchange of defective media
  12  * within 90 days of receipt.
  13  *
  14  * Limitations on Rights to Redistribute This Code
  15  *
  16  * Unicode, Inc. hereby grants the right to freely use the information
  17  * supplied in this file in the creation of products supporting the
  18  * Unicode Standard, and to make copies of this file in any form
  19  * for internal or external distribution as long as this notice
  20  * remains attached.
  21  */
  22
  23 /* ---------------------------------------------------------------------
  24
  25     Conversions between UTF32, UTF-16, and UTF-8. Source code file.
  26     Author: Mark E. Davis, 1994.
  27     Rev History: Rick McGowan, fixes & updates May 2001.
  28     Sept 2001: fixed const & error conditions per
  29         mods suggested by S. Parent & A. Lillich.
  30     June 2002: Tim Dodd added detection and handling of incomplete
  31         source sequences, enhanced error detection, added casts
  32         to eliminate compiler warnings.
  33     July 2003: slight mods to back out aggressive FFFE detection.
  34     Jan 2004: updated switches in from-UTF8 conversions.
  35     Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
  36
  37     See the header file "ConvertUTF.h" for complete documentation.
  38
  39 ------------------------------------------------------------------------ */
  40
  41
  42 #include "antlr3convertutf.h"
  43
  44 #ifdef CVTUTF_DEBUG
  45 #include <stdio.h>
  46 #endif
  47
  48 static const int halfShift  = 10; /* used for shifting by 10 bits */
  49
  50 static const UTF32 halfBase = 0x0010000UL;
  51 static const UTF32 halfMask = 0x3FFUL;
  52
  53 #define UNI_SUR_HIGH_START  (UTF32)0xD800
  54 #define UNI_SUR_HIGH_END    (UTF32)0xDBFF
  55 #define UNI_SUR_LOW_START   (UTF32)0xDC00
  56 #define UNI_SUR_LOW_END     (UTF32)0xDFFF
  57 #define false      0
  58 #define true        1
  59
  60 /* --------------------------------------------------------------------- */
  61
  62 ConversionResult ConvertUTF32toUTF16 (
  63         const UTF32** sourceStart, const UTF32* sourceEnd,
  64         UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
  65     ConversionResult result = conversionOK;
  66     const UTF32* source = *sourceStart;
  67     UTF16* target = *targetStart;
  68     while (source < sourceEnd) {
  69         UTF32 ch;
  70         if (target >= targetEnd) {
  71             result = targetExhausted; break;
  72         }
  73         ch = *source++;
  74         if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
  75             /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
  76             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
  77                 if (flags == strictConversion) {
  78                     --source; /* return to the illegal value itself */
  79                     result = sourceIllegal;
  80                     break;
  81                 } else {
  82                     *target++ = UNI_REPLACEMENT_CHAR;
  83                 }
  84             } else {
  85                 *target++ = (UTF16)ch; /* normal case */
  86             }
  87         } else if (ch > UNI_MAX_LEGAL_UTF32) {
  88             if (flags == strictConversion) {
  89                 result = sourceIllegal;
  90             } else {
  91                 *target++ = UNI_REPLACEMENT_CHAR;
  92             }
  93         } else {
  94             /* target is a character in range 0xFFFF - 0x10FFFF. */
  95             if (target + 1 >= targetEnd) {
  96                 --source; /* Back up source pointer! */
  97                 result = targetExhausted; break;
  98             }
  99             ch -= halfBase;
 100             *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
 101             *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
 102         }
 103     }
 104     *sourceStart = source;
 105     *targetStart = target;
 106     return result;
 107 }
 108
 109 /* --------------------------------------------------------------------- */
 110
 111 ConversionResult ConvertUTF16toUTF32 (
 112         const UTF16** sourceStart, const UTF16* sourceEnd,
 113         UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
 114     ConversionResult result = conversionOK;
 115     const UTF16* source = *sourceStart;
 116     UTF32* target = *targetStart;
 117     UTF32 ch, ch2;
 118     while (source < sourceEnd) {
 119         const UTF16* oldSource = source; /*  In case we have to back up because of target overflow. */
 120         ch = *source++;
 121         /* If we have a surrogate pair, convert to UTF32 first. */
 122         if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
 123             /* If the 16 bits following the high surrogate are in the source buffer... */
 124             if (source < sourceEnd) {
 125                 ch2 = *source;
 126                 /* If it's a low surrogate, convert to UTF32. */
 127                 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
 128                     ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
 129                         + (ch2 - UNI_SUR_LOW_START) + halfBase;
 130                     ++source;
 131                 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
 132                     --source; /* return to the illegal value itself */
 133                     result = sourceIllegal;
 134                     break;
 135                 }
 136             } else { /* We don't have the 16 bits following the high surrogate. */
 137                 --source; /* return to the high surrogate */
 138                 result = sourceExhausted;
 139                 break;
 140             }
 141         } else if (flags == strictConversion) {
 142             /* UTF-16 surrogate values are illegal in UTF-32 */
 143             if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
 144                 --source; /* return to the illegal value itself */
 145                 result = sourceIllegal;
 146                 break;
 147             }
 148         }
 149         if (target >= targetEnd) {
 150             source = oldSource; /* Back up source pointer! */
 151             result = targetExhausted; break;
 152         }
 153         *target++ = ch;
 154     }
 155     *sourceStart = source;
 156     *targetStart = target;
 157 #ifdef CVTUTF_DEBUG
 158 if (result == sourceIllegal) {
 159     ANTLR3_FPRINTF(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
 160     fflush(stderr);
 161 }
 162 #endif
 163     return result;
 164 }
 165
 166 /* --------------------------------------------------------------------- */
 167
 168 /*
 169  * Index into the table below with the first byte of a UTF-8 sequence to
 170  * get the number of trailing bytes that are supposed to follow it.
  171  * Note that *legal* UTF-8 values can't have 4 or 5 trailing bytes. The table is
 172  * left as-is for anyone who may want to do such conversion, which was
 173  * allowed in earlier algorithms.
 174  */
 175 static const char trailingBytesForUTF8[256] = {
 176     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 177     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 178     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 179     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 180     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 181     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 182     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 183     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
 184 };
 185
 186 /*
 187  * Magic values subtracted from a buffer value during UTF8 conversion.
 188  * This table contains as many values as there might be trailing bytes
 189  * in a UTF-8 sequence.
 190  */
 191 static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
 192                      0x03C82080UL, 0xFA082080UL, 0x82082080UL };
 193
 194 /*
 195  * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
 196  * into the first byte, depending on how many bytes follow.  There are
 197  * as many entries in this table as there are UTF-8 sequence types.
  198  * (I.e., one byte sequence, two byte... etc.). Remember that sequences
 199  * for *legal* UTF-8 will be 4 or fewer bytes total.
 200  */
 201 static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
 202
 203 /* --------------------------------------------------------------------- */
 204
 205 /* The interface converts a whole buffer to avoid function-call overhead.
 206  * Constants have been gathered. Loops & conditionals have been removed as
 207  * much as possible for efficiency, in favor of drop-through switches.
 208  * (See "Note A" at the bottom of the file for equivalent code.)
 209  * If your compiler supports it, the "isLegalUTF8" call can be turned
 210  * into an inline function.
 211  */
 212
 213 /* --------------------------------------------------------------------- */
 214
 215 ConversionResult ConvertUTF16toUTF8 (
 216         const UTF16** sourceStart, const UTF16* sourceEnd,
 217         UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
 218     ConversionResult result = conversionOK;
 219     const UTF16* source = *sourceStart;
 220     UTF8* target = *targetStart;
 221     while (source < sourceEnd) {
 222         UTF32 ch;
 223         unsigned short bytesToWrite = 0;
 224         const UTF32 byteMask = 0xBF;
 225         const UTF32 byteMark = 0x80;
 226         const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
 227         ch = *source++;
 228         /* If we have a surrogate pair, convert to UTF32 first. */
 229         if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
 230             /* If the 16 bits following the high surrogate are in the source buffer... */
 231             if (source < sourceEnd) {
 232                 UTF32 ch2 = *source;
 233                 /* If it's a low surrogate, convert to UTF32. */
 234                 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
 235                     ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
 236                         + (ch2 - UNI_SUR_LOW_START) + halfBase;
 237                     ++source;
 238                 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
 239                     --source; /* return to the illegal value itself */
 240                     result = sourceIllegal;
 241                     break;
 242                 }
 243             } else { /* We don't have the 16 bits following the high surrogate. */
 244                 --source; /* return to the high surrogate */
 245                 result = sourceExhausted;
 246                 break;
 247             }
 248         } else if (flags == strictConversion) {
 249             /* UTF-16 surrogate values are illegal in UTF-32 */
 250             if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
 251                 --source; /* return to the illegal value itself */
 252                 result = sourceIllegal;
 253                 break;
 254             }
 255         }
 256         /* Figure out how many bytes the result will require */
 257         if (ch < (UTF32)0x80) {      bytesToWrite = 1;
 258         } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
 259         } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
 260         } else if (ch < (UTF32)0x110000) {  bytesToWrite = 4;
 261         } else {                            bytesToWrite = 3;
 262                                             ch = UNI_REPLACEMENT_CHAR;
 263         }
 264
 265         target += bytesToWrite;
 266         if (target > targetEnd) {
 267             source = oldSource; /* Back up source pointer! */
 268             target -= bytesToWrite; result = targetExhausted; break;
 269         }
 270         switch (bytesToWrite) { /* note: everything falls through. */
 271             case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
 272             case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
 273             case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
 274             case 1: *--target =  (UTF8)(ch | firstByteMark[bytesToWrite]);
 275         }
 276         target += bytesToWrite;
 277     }
 278     *sourceStart = source;
 279     *targetStart = target;
 280     return result;
 281 }
 282
 283 /* --------------------------------------------------------------------- */
 284
 285 /*
 286  * Utility routine to tell whether a sequence of bytes is legal UTF-8.
 287  * This must be called with the length pre-determined by the first byte.
 288  * If not calling this from ConvertUTF8to*, then the length can be set by:
 289  *  length = trailingBytesForUTF8[*source]+1;
 290  * and the sequence is illegal right away if there aren't that many bytes
 291  * available.
 292  * If presented with a length > 4, this returns false.  The Unicode
 293  * definition of UTF-8 goes up to 4-byte sequences.
 294  */
 295
 296 static ANTLR3_BOOLEAN
 297 isLegalUTF8(const UTF8 *source, int length) {
 298     UTF8 a;
 299     const UTF8 *srcptr = source+length;
 300     switch (length) {
 301     default: return false;
 302         /* Everything else falls through when "true"... */
 303     case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
 304     case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
 305     case 2: if ((a = (*--srcptr)) > 0xBF) return false;
 306
 307         switch (*source) {
 308             /* no fall-through in this inner switch */
 309             case 0xE0: if (a < 0xA0) return false; break;
 310             case 0xED: if (a > 0x9F) return false; break;
 311             case 0xF0: if (a < 0x90) return false; break;
 312             case 0xF4: if (a > 0x8F) return false; break;
 313             default:   if (a < 0x80) return false;
 314         }
 315
 316     case 1: if (*source >= 0x80 && *source < 0xC2) return false;
 317     }
 318     if (*source > 0xF4) return false;
 319     return true;
 320 }
 321
 322 /* --------------------------------------------------------------------- */
 323
 324 /*
 325  * Exported function to return whether a UTF-8 sequence is legal or not.
 326  * This is not used here; it's just exported.
 327  */
 328 ANTLR3_BOOLEAN
 329 isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
 330     int length = trailingBytesForUTF8[*source]+1;
 331     if (source+length > sourceEnd) {
 332         return false;
 333     }
 334     return isLegalUTF8(source, length);
 335 }
 336
 337 /* --------------------------------------------------------------------- */
 338
 339 ConversionResult ConvertUTF8toUTF16 (
 340         const UTF8** sourceStart, const UTF8* sourceEnd,
 341         UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
 342     ConversionResult result = conversionOK;
 343     const UTF8* source = *sourceStart;
 344     UTF16* target = *targetStart;
 345     while (source < sourceEnd) {
 346         UTF32 ch = 0;
 347         unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
 348         if (source + extraBytesToRead >= sourceEnd) {
 349             result = sourceExhausted; break;
 350         }
 351         /* Do this check whether lenient or strict */
 352         if (! isLegalUTF8(source, extraBytesToRead+1)) {
 353             result = sourceIllegal;
 354             break;
 355         }
 356         /*
 357          * The cases all fall through. See "Note A" below.
 358          */
 359         switch (extraBytesToRead) {
 360             case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
 361             case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
 362             case 3: ch += *source++; ch <<= 6;
 363             case 2: ch += *source++; ch <<= 6;
 364             case 1: ch += *source++; ch <<= 6;
 365             case 0: ch += *source++;
 366         }
 367         ch -= offsetsFromUTF8[extraBytesToRead];
 368
 369         if (target >= targetEnd) {
 370             source -= (extraBytesToRead+1); /* Back up source pointer! */
 371             result = targetExhausted; break;
 372         }
 373         if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
 374             /* UTF-16 surrogate values are illegal in UTF-32 */
 375             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
 376                 if (flags == strictConversion) {
 377                     source -= (extraBytesToRead+1); /* return to the illegal value itself */
 378                     result = sourceIllegal;
 379                     break;
 380                 } else {
 381                     *target++ = UNI_REPLACEMENT_CHAR;
 382                 }
 383             } else {
 384                 *target++ = (UTF16)ch; /* normal case */
 385             }
 386         } else if (ch > UNI_MAX_UTF16) {
 387             if (flags == strictConversion) {
 388                 result = sourceIllegal;
 389                 source -= (extraBytesToRead+1); /* return to the start */
 390                 break; /* Bail out; shouldn't continue */
 391             } else {
 392                 *target++ = UNI_REPLACEMENT_CHAR;
 393             }
 394         } else {
 395             /* target is a character in range 0xFFFF - 0x10FFFF. */
 396             if (target + 1 >= targetEnd) {
 397                 source -= (extraBytesToRead+1); /* Back up source pointer! */
 398                 result = targetExhausted; break;
 399             }
 400             ch -= halfBase;
 401             *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
 402             *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
 403         }
 404     }
 405     *sourceStart = source;
 406     *targetStart = target;
 407     return result;
 408 }
 409
 410 /* --------------------------------------------------------------------- */
 411
 412 ConversionResult ConvertUTF32toUTF8 (
 413         const UTF32** sourceStart, const UTF32* sourceEnd,
 414         UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
 415     ConversionResult result = conversionOK;
 416     const UTF32* source = *sourceStart;
 417     UTF8* target = *targetStart;
 418     while (source < sourceEnd) {
 419         UTF32 ch;
 420         unsigned short bytesToWrite = 0;
 421         const UTF32 byteMask = 0xBF;
 422         const UTF32 byteMark = 0x80;
 423         ch = *source++;
 424         if (flags == strictConversion ) {
 425             /* UTF-16 surrogate values are illegal in UTF-32 */
 426             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
 427                 --source; /* return to the illegal value itself */
 428                 result = sourceIllegal;
 429                 break;
 430             }
 431         }
 432         /*
 433          * Figure out how many bytes the result will require. Turn any
 434          * illegally large UTF32 things (> Plane 17) into replacement chars.
 435          */
 436         if (ch < (UTF32)0x80) {      bytesToWrite = 1;
 437         } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
 438         } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
 439         } else if (ch <= UNI_MAX_LEGAL_UTF32) {  bytesToWrite = 4;
 440         } else {                            bytesToWrite = 3;
 441                                             ch = UNI_REPLACEMENT_CHAR;
 442                                             result = sourceIllegal;
 443         }
 444
 445         target += bytesToWrite;
 446         if (target > targetEnd) {
 447             --source; /* Back up source pointer! */
 448             target -= bytesToWrite; result = targetExhausted; break;
 449         }
 450         switch (bytesToWrite) { /* note: everything falls through. */
 451             case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
 452             case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
 453             case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
 454             case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
 455         }
 456         target += bytesToWrite;
 457     }
 458     *sourceStart = source;
 459     *targetStart = target;
 460     return result;
 461 }
 462
 463 /* --------------------------------------------------------------------- */
 464
 465 ConversionResult ConvertUTF8toUTF32 (
 466         const UTF8** sourceStart, const UTF8* sourceEnd,
 467         UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
 468     ConversionResult result = conversionOK;
 469     const UTF8* source = *sourceStart;
 470     UTF32* target = *targetStart;
 471     while (source < sourceEnd) {
 472         UTF32 ch = 0;
 473         unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
 474         if (source + extraBytesToRead >= sourceEnd) {
 475             result = sourceExhausted; break;
 476         }
 477         /* Do this check whether lenient or strict */
 478         if (! isLegalUTF8(source, extraBytesToRead+1)) {
 479             result = sourceIllegal;
 480             break;
 481         }
 482         /*
 483          * The cases all fall through. See "Note A" below.
 484          */
 485         switch (extraBytesToRead) {
 486             case 5: ch += *source++; ch <<= 6;
 487             case 4: ch += *source++; ch <<= 6;
 488             case 3: ch += *source++; ch <<= 6;
 489             case 2: ch += *source++; ch <<= 6;
 490             case 1: ch += *source++; ch <<= 6;
 491             case 0: ch += *source++;
 492         }
 493         ch -= offsetsFromUTF8[extraBytesToRead];
 494
 495         if (target >= targetEnd) {
 496             source -= (extraBytesToRead+1); /* Back up the source pointer! */
 497             result = targetExhausted; break;
 498         }
 499         if (ch <= UNI_MAX_LEGAL_UTF32) {
 500             /*
 501              * UTF-16 surrogate values are illegal in UTF-32, and anything
 502              * over Plane 17 (> 0x10FFFF) is illegal.
 503              */
 504             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
 505                 if (flags == strictConversion) {
 506                     source -= (extraBytesToRead+1); /* return to the illegal value itself */
 507                     result = sourceIllegal;
 508                     break;
 509                 } else {
 510                     *target++ = UNI_REPLACEMENT_CHAR;
 511                 }
 512             } else {
 513                 *target++ = ch;
 514             }
 515         } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
 516             result = sourceIllegal;
 517             *target++ = UNI_REPLACEMENT_CHAR;
 518         }
 519     }
 520     *sourceStart = source;
 521     *targetStart = target;
 522     return result;
 523 }
 524
 525 /* ---------------------------------------------------------------------
 526
 527     Note A.
 528     The fall-through switches in UTF-8 reading code save a
 529     temp variable, some decrements & conditionals.  The switches
 530     are equivalent to the following loop:
 531         {
 532             int tmpBytesToRead = extraBytesToRead+1;
 533             do {
 534                 ch += *source++;
 535                 --tmpBytesToRead;
 536                 if (tmpBytesToRead) ch <<= 6;
 537             } while (tmpBytesToRead > 0);
 538         }
 539     In UTF-8 writing code, the switches on "bytesToWrite" are
 540     similarly unrolled loops.
 541
 542    --------------------------------------------------------------------- */