bundles/org.simantics.databoard/cpp/DataBoardTest/libantlr3c-3.2/src/antlr3convertutf.c

   1 /*\r
   2  * Copyright 2001-2004 Unicode, Inc.\r
   3  * \r
   4  * Disclaimer\r
   5  * \r
   6  * This source code is provided as is by Unicode, Inc. No claims are\r
   7  * made as to fitness for any particular purpose. No warranties of any\r
   8  * kind are expressed or implied. The recipient agrees to determine\r
   9  * applicability of information provided. If this file has been\r
  10  * purchased on magnetic or optical media from Unicode, Inc., the\r
  11  * sole remedy for any claim will be exchange of defective media\r
  12  * within 90 days of receipt.\r
  13  * \r
  14  * Limitations on Rights to Redistribute This Code\r
  15  * \r
  16  * Unicode, Inc. hereby grants the right to freely use the information\r
  17  * supplied in this file in the creation of products supporting the\r
  18  * Unicode Standard, and to make copies of this file in any form\r
  19  * for internal or external distribution as long as this notice\r
  20  * remains attached.\r
  21  */\r
  22 \r
  23 /* ---------------------------------------------------------------------\r
  24 \r
  25     Conversions between UTF32, UTF-16, and UTF-8. Source code file.\r
  26     Author: Mark E. Davis, 1994.\r
  27     Rev History: Rick McGowan, fixes & updates May 2001.\r
  28     Sept 2001: fixed const & error conditions per\r
  29         mods suggested by S. Parent & A. Lillich.\r
  30     June 2002: Tim Dodd added detection and handling of incomplete\r
  31         source sequences, enhanced error detection, added casts\r
  32         to eliminate compiler warnings.\r
  33     July 2003: slight mods to back out aggressive FFFE detection.\r
  34     Jan 2004: updated switches in from-UTF8 conversions.\r
  35     Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.\r
  36 \r
  37     See the header file "ConvertUTF.h" for complete documentation.\r
  38 \r
  39 ------------------------------------------------------------------------ */\r
  40 \r
  41 \r
  42 #include "antlr3convertutf.h"\r
  43 \r
  44 #ifdef CVTUTF_DEBUG\r
  45 #include <stdio.h>\r
  46 #endif\r
  47 \r
  48 static const int halfShift  = 10; /* used for shifting by 10 bits */\r
  49 \r
  50 static const UTF32 halfBase = 0x0010000UL;\r
  51 static const UTF32 halfMask = 0x3FFUL;\r
  52 \r
  53 #define UNI_SUR_HIGH_START  (UTF32)0xD800\r
  54 #define UNI_SUR_HIGH_END    (UTF32)0xDBFF\r
  55 #define UNI_SUR_LOW_START   (UTF32)0xDC00\r
  56 #define UNI_SUR_LOW_END     (UTF32)0xDFFF\r
  57 #define false      0\r
  58 #define true        1\r
  59 \r
  60 /* --------------------------------------------------------------------- */\r
  61 \r
  62 ConversionResult ConvertUTF32toUTF16 (\r
  63         const UTF32** sourceStart, const UTF32* sourceEnd, \r
  64         UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {\r
  65     ConversionResult result = conversionOK;\r
  66     const UTF32* source = *sourceStart;\r
  67     UTF16* target = *targetStart;\r
  68     while (source < sourceEnd) {\r
  69         UTF32 ch;\r
  70         if (target >= targetEnd) {\r
  71             result = targetExhausted; break;\r
  72         }\r
  73         ch = *source++;\r
  74         if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */\r
  75             /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */\r
  76             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {\r
  77                 if (flags == strictConversion) {\r
  78                     --source; /* return to the illegal value itself */\r
  79                     result = sourceIllegal;\r
  80                     break;\r
  81                 } else {\r
  82                     *target++ = UNI_REPLACEMENT_CHAR;\r
  83                 }\r
  84             } else {\r
  85                 *target++ = (UTF16)ch; /* normal case */\r
  86             }\r
  87         } else if (ch > UNI_MAX_LEGAL_UTF32) {\r
  88             if (flags == strictConversion) {\r
  89                 result = sourceIllegal;\r
  90             } else {\r
  91                 *target++ = UNI_REPLACEMENT_CHAR;\r
  92             }\r
  93         } else {\r
  94             /* target is a character in range 0xFFFF - 0x10FFFF. */\r
  95             if (target + 1 >= targetEnd) {\r
  96                 --source; /* Back up source pointer! */\r
  97                 result = targetExhausted; break;\r
  98             }\r
  99             ch -= halfBase;\r
 100             *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);\r
 101             *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);\r
 102         }\r
 103     }\r
 104     *sourceStart = source;\r
 105     *targetStart = target;\r
 106     return result;\r
 107 }\r
 108 \r
 109 /* --------------------------------------------------------------------- */\r
 110 \r
 111 ConversionResult ConvertUTF16toUTF32 (\r
 112         const UTF16** sourceStart, const UTF16* sourceEnd, \r
 113         UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {\r
 114     ConversionResult result = conversionOK;\r
 115     const UTF16* source = *sourceStart;\r
 116     UTF32* target = *targetStart;\r
 117     UTF32 ch, ch2;\r
 118     while (source < sourceEnd) {\r
 119         const UTF16* oldSource = source; /*  In case we have to back up because of target overflow. */\r
 120         ch = *source++;\r
 121         /* If we have a surrogate pair, convert to UTF32 first. */\r
 122         if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {\r
 123             /* If the 16 bits following the high surrogate are in the source buffer... */\r
 124             if (source < sourceEnd) {\r
 125                 ch2 = *source;\r
 126                 /* If it's a low surrogate, convert to UTF32. */\r
 127                 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {\r
 128                     ch = ((ch - UNI_SUR_HIGH_START) << halfShift)\r
 129                         + (ch2 - UNI_SUR_LOW_START) + halfBase;\r
 130                     ++source;\r
 131                 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */\r
 132                     --source; /* return to the illegal value itself */\r
 133                     result = sourceIllegal;\r
 134                     break;\r
 135                 }\r
 136             } else { /* We don't have the 16 bits following the high surrogate. */\r
 137                 --source; /* return to the high surrogate */\r
 138                 result = sourceExhausted;\r
 139                 break;\r
 140             }\r
 141         } else if (flags == strictConversion) {\r
 142             /* UTF-16 surrogate values are illegal in UTF-32 */\r
 143             if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {\r
 144                 --source; /* return to the illegal value itself */\r
 145                 result = sourceIllegal;\r
 146                 break;\r
 147             }\r
 148         }\r
 149         if (target >= targetEnd) {\r
 150             source = oldSource; /* Back up source pointer! */\r
 151             result = targetExhausted; break;\r
 152         }\r
 153         *target++ = ch;\r
 154     }\r
 155     *sourceStart = source;\r
 156     *targetStart = target;\r
 157 #ifdef CVTUTF_DEBUG\r
 158 if (result == sourceIllegal) {\r
 159     ANTLR3_FPRINTF(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);\r
 160     fflush(stderr);\r
 161 }\r
 162 #endif\r
 163     return result;\r
 164 }\r
 165 \r
 166 /* --------------------------------------------------------------------- */\r
 167 \r
 168 /*\r
 169  * Index into the table below with the first byte of a UTF-8 sequence to\r
 170  * get the number of trailing bytes that are supposed to follow it.\r
 171  * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is\r
 172  * left as-is for anyone who may want to do such conversion, which was\r
 173  * allowed in earlier algorithms.\r
 174  */\r
 175 static const char trailingBytesForUTF8[256] = {\r
 176     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,\r
 177     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,\r
 178     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,\r
 179     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,\r
 180     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,\r
 181     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,\r
 182     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,\r
 183     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5\r
 184 };\r
 185 \r
 186 /*\r
 187  * Magic values subtracted from a buffer value during UTF8 conversion.\r
 188  * This table contains as many values as there might be trailing bytes\r
 189  * in a UTF-8 sequence.\r
 190  */\r
 191 static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, \r
 192                      0x03C82080UL, 0xFA082080UL, 0x82082080UL };\r
 193 \r
 194 /*\r
 195  * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed\r
 196  * into the first byte, depending on how many bytes follow.  There are\r
 197  * as many entries in this table as there are UTF-8 sequence types.\r
 198  * (I.e., one byte sequence, two byte... etc.). Remember that sequencs\r
 199  * for *legal* UTF-8 will be 4 or fewer bytes total.\r
 200  */\r
 201 static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };\r
 202 \r
 203 /* --------------------------------------------------------------------- */\r
 204 \r
 205 /* The interface converts a whole buffer to avoid function-call overhead.\r
 206  * Constants have been gathered. Loops & conditionals have been removed as\r
 207  * much as possible for efficiency, in favor of drop-through switches.\r
 208  * (See "Note A" at the bottom of the file for equivalent code.)\r
 209  * If your compiler supports it, the "isLegalUTF8" call can be turned\r
 210  * into an inline function.\r
 211  */\r
 212 \r
 213 /* --------------------------------------------------------------------- */\r
 214 \r
 215 ConversionResult ConvertUTF16toUTF8 (\r
 216         const UTF16** sourceStart, const UTF16* sourceEnd, \r
 217         UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {\r
 218     ConversionResult result = conversionOK;\r
 219     const UTF16* source = *sourceStart;\r
 220     UTF8* target = *targetStart;\r
 221     while (source < sourceEnd) {\r
 222         UTF32 ch;\r
 223         unsigned short bytesToWrite = 0;\r
 224         const UTF32 byteMask = 0xBF;\r
 225         const UTF32 byteMark = 0x80; \r
 226         const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */\r
 227         ch = *source++;\r
 228         /* If we have a surrogate pair, convert to UTF32 first. */\r
 229         if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {\r
 230             /* If the 16 bits following the high surrogate are in the source buffer... */\r
 231             if (source < sourceEnd) {\r
 232                 UTF32 ch2 = *source;\r
 233                 /* If it's a low surrogate, convert to UTF32. */\r
 234                 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {\r
 235                     ch = ((ch - UNI_SUR_HIGH_START) << halfShift)\r
 236                         + (ch2 - UNI_SUR_LOW_START) + halfBase;\r
 237                     ++source;\r
 238                 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */\r
 239                     --source; /* return to the illegal value itself */\r
 240                     result = sourceIllegal;\r
 241                     break;\r
 242                 }\r
 243             } else { /* We don't have the 16 bits following the high surrogate. */\r
 244                 --source; /* return to the high surrogate */\r
 245                 result = sourceExhausted;\r
 246                 break;\r
 247             }\r
 248         } else if (flags == strictConversion) {\r
 249             /* UTF-16 surrogate values are illegal in UTF-32 */\r
 250             if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {\r
 251                 --source; /* return to the illegal value itself */\r
 252                 result = sourceIllegal;\r
 253                 break;\r
 254             }\r
 255         }\r
 256         /* Figure out how many bytes the result will require */\r
 257         if (ch < (UTF32)0x80) {      bytesToWrite = 1;\r
 258         } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;\r
 259         } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;\r
 260         } else if (ch < (UTF32)0x110000) {  bytesToWrite = 4;\r
 261         } else {                            bytesToWrite = 3;\r
 262                                             ch = UNI_REPLACEMENT_CHAR;\r
 263         }\r
 264 \r
 265         target += bytesToWrite;\r
 266         if (target > targetEnd) {\r
 267             source = oldSource; /* Back up source pointer! */\r
 268             target -= bytesToWrite; result = targetExhausted; break;\r
 269         }\r
 270         switch (bytesToWrite) { /* note: everything falls through. */\r
 271             case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;\r
 272             case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;\r
 273             case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;\r
 274             case 1: *--target =  (UTF8)(ch | firstByteMark[bytesToWrite]);\r
 275         }\r
 276         target += bytesToWrite;\r
 277     }\r
 278     *sourceStart = source;\r
 279     *targetStart = target;\r
 280     return result;\r
 281 }\r
 282 \r
 283 /* --------------------------------------------------------------------- */\r
 284 \r
 285 /*\r
 286  * Utility routine to tell whether a sequence of bytes is legal UTF-8.\r
 287  * This must be called with the length pre-determined by the first byte.\r
 288  * If not calling this from ConvertUTF8to*, then the length can be set by:\r
 289  *  length = trailingBytesForUTF8[*source]+1;\r
 290  * and the sequence is illegal right away if there aren't that many bytes\r
 291  * available.\r
 292  * If presented with a length > 4, this returns false.  The Unicode\r
 293  * definition of UTF-8 goes up to 4-byte sequences.\r
 294  */\r
 295 \r
 296 static ANTLR3_BOOLEAN\r
 297 isLegalUTF8(const UTF8 *source, int length) {\r
 298     UTF8 a;\r
 299     const UTF8 *srcptr = source+length;\r
 300     switch (length) {\r
 301     default: return false;\r
 302         /* Everything else falls through when "true"... */\r
 303     case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;\r
 304     case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;\r
 305     case 2: if ((a = (*--srcptr)) > 0xBF) return false;\r
 306 \r
 307         switch (*source) {\r
 308             /* no fall-through in this inner switch */\r
 309             case 0xE0: if (a < 0xA0) return false; break;\r
 310             case 0xED: if (a > 0x9F) return false; break;\r
 311             case 0xF0: if (a < 0x90) return false; break;\r
 312             case 0xF4: if (a > 0x8F) return false; break;\r
 313             default:   if (a < 0x80) return false;\r
 314         }\r
 315 \r
 316     case 1: if (*source >= 0x80 && *source < 0xC2) return false;\r
 317     }\r
 318     if (*source > 0xF4) return false;\r
 319     return true;\r
 320 }\r
 321 \r
 322 /* --------------------------------------------------------------------- */\r
 323 \r
 324 /*\r
 325  * Exported function to return whether a UTF-8 sequence is legal or not.\r
 326  * This is not used here; it's just exported.\r
 327  */\r
 328 ANTLR3_BOOLEAN\r
 329 isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {\r
 330     int length = trailingBytesForUTF8[*source]+1;\r
 331     if (source+length > sourceEnd) {\r
 332         return false;\r
 333     }\r
 334     return isLegalUTF8(source, length);\r
 335 }\r
 336 \r
 337 /* --------------------------------------------------------------------- */\r
 338 \r
 339 ConversionResult ConvertUTF8toUTF16 (\r
 340         const UTF8** sourceStart, const UTF8* sourceEnd, \r
 341         UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {\r
 342     ConversionResult result = conversionOK;\r
 343     const UTF8* source = *sourceStart;\r
 344     UTF16* target = *targetStart;\r
 345     while (source < sourceEnd) {\r
 346         UTF32 ch = 0;\r
 347         unsigned short extraBytesToRead = trailingBytesForUTF8[*source];\r
 348         if (source + extraBytesToRead >= sourceEnd) {\r
 349             result = sourceExhausted; break;\r
 350         }\r
 351         /* Do this check whether lenient or strict */\r
 352         if (! isLegalUTF8(source, extraBytesToRead+1)) {\r
 353             result = sourceIllegal;\r
 354             break;\r
 355         }\r
 356         /*\r
 357          * The cases all fall through. See "Note A" below.\r
 358          */\r
 359         switch (extraBytesToRead) {\r
 360             case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */\r
 361             case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */\r
 362             case 3: ch += *source++; ch <<= 6;\r
 363             case 2: ch += *source++; ch <<= 6;\r
 364             case 1: ch += *source++; ch <<= 6;\r
 365             case 0: ch += *source++;\r
 366         }\r
 367         ch -= offsetsFromUTF8[extraBytesToRead];\r
 368 \r
 369         if (target >= targetEnd) {\r
 370             source -= (extraBytesToRead+1); /* Back up source pointer! */\r
 371             result = targetExhausted; break;\r
 372         }\r
 373         if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */\r
 374             /* UTF-16 surrogate values are illegal in UTF-32 */\r
 375             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {\r
 376                 if (flags == strictConversion) {\r
 377                     source -= (extraBytesToRead+1); /* return to the illegal value itself */\r
 378                     result = sourceIllegal;\r
 379                     break;\r
 380                 } else {\r
 381                     *target++ = UNI_REPLACEMENT_CHAR;\r
 382                 }\r
 383             } else {\r
 384                 *target++ = (UTF16)ch; /* normal case */\r
 385             }\r
 386         } else if (ch > UNI_MAX_UTF16) {\r
 387             if (flags == strictConversion) {\r
 388                 result = sourceIllegal;\r
 389                 source -= (extraBytesToRead+1); /* return to the start */\r
 390                 break; /* Bail out; shouldn't continue */\r
 391             } else {\r
 392                 *target++ = UNI_REPLACEMENT_CHAR;\r
 393             }\r
 394         } else {\r
 395             /* target is a character in range 0xFFFF - 0x10FFFF. */\r
 396             if (target + 1 >= targetEnd) {\r
 397                 source -= (extraBytesToRead+1); /* Back up source pointer! */\r
 398                 result = targetExhausted; break;\r
 399             }\r
 400             ch -= halfBase;\r
 401             *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);\r
 402             *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);\r
 403         }\r
 404     }\r
 405     *sourceStart = source;\r
 406     *targetStart = target;\r
 407     return result;\r
 408 }\r
 409 \r
 410 /* --------------------------------------------------------------------- */\r
 411 \r
 412 ConversionResult ConvertUTF32toUTF8 (\r
 413         const UTF32** sourceStart, const UTF32* sourceEnd, \r
 414         UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {\r
 415     ConversionResult result = conversionOK;\r
 416     const UTF32* source = *sourceStart;\r
 417     UTF8* target = *targetStart;\r
 418     while (source < sourceEnd) {\r
 419         UTF32 ch;\r
 420         unsigned short bytesToWrite = 0;\r
 421         const UTF32 byteMask = 0xBF;\r
 422         const UTF32 byteMark = 0x80; \r
 423         ch = *source++;\r
 424         if (flags == strictConversion ) {\r
 425             /* UTF-16 surrogate values are illegal in UTF-32 */\r
 426             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {\r
 427                 --source; /* return to the illegal value itself */\r
 428                 result = sourceIllegal;\r
 429                 break;\r
 430             }\r
 431         }\r
 432         /*\r
 433          * Figure out how many bytes the result will require. Turn any\r
 434          * illegally large UTF32 things (> Plane 17) into replacement chars.\r
 435          */\r
 436         if (ch < (UTF32)0x80) {      bytesToWrite = 1;\r
 437         } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;\r
 438         } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;\r
 439         } else if (ch <= UNI_MAX_LEGAL_UTF32) {  bytesToWrite = 4;\r
 440         } else {                            bytesToWrite = 3;\r
 441                                             ch = UNI_REPLACEMENT_CHAR;\r
 442                                             result = sourceIllegal;\r
 443         }\r
 444         \r
 445         target += bytesToWrite;\r
 446         if (target > targetEnd) {\r
 447             --source; /* Back up source pointer! */\r
 448             target -= bytesToWrite; result = targetExhausted; break;\r
 449         }\r
 450         switch (bytesToWrite) { /* note: everything falls through. */\r
 451             case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;\r
 452             case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;\r
 453             case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;\r
 454             case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);\r
 455         }\r
 456         target += bytesToWrite;\r
 457     }\r
 458     *sourceStart = source;\r
 459     *targetStart = target;\r
 460     return result;\r
 461 }\r
 462 \r
 463 /* --------------------------------------------------------------------- */\r
 464 \r
 465 ConversionResult ConvertUTF8toUTF32 (\r
 466         const UTF8** sourceStart, const UTF8* sourceEnd, \r
 467         UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {\r
 468     ConversionResult result = conversionOK;\r
 469     const UTF8* source = *sourceStart;\r
 470     UTF32* target = *targetStart;\r
 471     while (source < sourceEnd) {\r
 472         UTF32 ch = 0;\r
 473         unsigned short extraBytesToRead = trailingBytesForUTF8[*source];\r
 474         if (source + extraBytesToRead >= sourceEnd) {\r
 475             result = sourceExhausted; break;\r
 476         }\r
 477         /* Do this check whether lenient or strict */\r
 478         if (! isLegalUTF8(source, extraBytesToRead+1)) {\r
 479             result = sourceIllegal;\r
 480             break;\r
 481         }\r
 482         /*\r
 483          * The cases all fall through. See "Note A" below.\r
 484          */\r
 485         switch (extraBytesToRead) {\r
 486             case 5: ch += *source++; ch <<= 6;\r
 487             case 4: ch += *source++; ch <<= 6;\r
 488             case 3: ch += *source++; ch <<= 6;\r
 489             case 2: ch += *source++; ch <<= 6;\r
 490             case 1: ch += *source++; ch <<= 6;\r
 491             case 0: ch += *source++;\r
 492         }\r
 493         ch -= offsetsFromUTF8[extraBytesToRead];\r
 494 \r
 495         if (target >= targetEnd) {\r
 496             source -= (extraBytesToRead+1); /* Back up the source pointer! */\r
 497             result = targetExhausted; break;\r
 498         }\r
 499         if (ch <= UNI_MAX_LEGAL_UTF32) {\r
 500             /*\r
 501              * UTF-16 surrogate values are illegal in UTF-32, and anything\r
 502              * over Plane 17 (> 0x10FFFF) is illegal.\r
 503              */\r
 504             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {\r
 505                 if (flags == strictConversion) {\r
 506                     source -= (extraBytesToRead+1); /* return to the illegal value itself */\r
 507                     result = sourceIllegal;\r
 508                     break;\r
 509                 } else {\r
 510                     *target++ = UNI_REPLACEMENT_CHAR;\r
 511                 }\r
 512             } else {\r
 513                 *target++ = ch;\r
 514             }\r
 515         } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */\r
 516             result = sourceIllegal;\r
 517             *target++ = UNI_REPLACEMENT_CHAR;\r
 518         }\r
 519     }\r
 520     *sourceStart = source;\r
 521     *targetStart = target;\r
 522     return result;\r
 523 }\r
 524 \r
 525 /* ---------------------------------------------------------------------\r
 526 \r
 527     Note A.\r
 528     The fall-through switches in UTF-8 reading code save a\r
 529     temp variable, some decrements & conditionals.  The switches\r
 530     are equivalent to the following loop:\r
 531         {\r
 532             int tmpBytesToRead = extraBytesToRead+1;\r
 533             do {\r
 534                 ch += *source++;\r
 535                 --tmpBytesToRead;\r
 536                 if (tmpBytesToRead) ch <<= 6;\r
 537             } while (tmpBytesToRead > 0);\r
 538         }\r
 539     In UTF-8 writing code, the switches on "bytesToWrite" are\r
 540     similarly unrolled loops.\r
 541 \r
 542    --------------------------------------------------------------------- */\r