Home | History | Annotate | Line # | Download | only in Support
      1 /*===--- ConvertUTF.c - Universal Character Names conversions ---------------===
      2  *
      3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4  * See https://llvm.org/LICENSE.txt for license information.
      5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6  *
      7  *===------------------------------------------------------------------------=*/
      8 /*
      9  * Copyright 2001-2004 Unicode, Inc.
     10  *
     11  * Disclaimer
     12  *
     13  * This source code is provided as is by Unicode, Inc. No claims are
     14  * made as to fitness for any particular purpose. No warranties of any
     15  * kind are expressed or implied. The recipient agrees to determine
     16  * applicability of information provided. If this file has been
     17  * purchased on magnetic or optical media from Unicode, Inc., the
     18  * sole remedy for any claim will be exchange of defective media
     19  * within 90 days of receipt.
     20  *
     21  * Limitations on Rights to Redistribute This Code
     22  *
     23  * Unicode, Inc. hereby grants the right to freely use the information
     24  * supplied in this file in the creation of products supporting the
     25  * Unicode Standard, and to make copies of this file in any form
     26  * for internal or external distribution as long as this notice
     27  * remains attached.
     28  */
     29 
     30 /* ---------------------------------------------------------------------
     31 
     32     Conversions between UTF32, UTF-16, and UTF-8. Source code file.
     33     Author: Mark E. Davis, 1994.
     34     Rev History: Rick McGowan, fixes & updates May 2001.
     35     Sept 2001: fixed const & error conditions per
     36         mods suggested by S. Parent & A. Lillich.
     37     June 2002: Tim Dodd added detection and handling of incomplete
     38         source sequences, enhanced error detection, added casts
     39         to eliminate compiler warnings.
     40     July 2003: slight mods to back out aggressive FFFE detection.
     41     Jan 2004: updated switches in from-UTF8 conversions.
     42     Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
     43 
     44     See the header file "ConvertUTF.h" for complete documentation.
     45 
     46 ------------------------------------------------------------------------ */
     47 
     48 #include "llvm/Support/ConvertUTF.h"
     49 #ifdef CVTUTF_DEBUG
     50 #include <stdio.h>
     51 #endif
     52 #include <assert.h>
     53 
     54 /*
     55  * This code extensively uses fall-through switches.
     56  * Keep the compiler from warning about that.
     57  */
     58 #if defined(__clang__) && defined(__has_warning)
     59 # if __has_warning("-Wimplicit-fallthrough")
     60 #  define ConvertUTF_DISABLE_WARNINGS \
     61     _Pragma("clang diagnostic push")  \
     62     _Pragma("clang diagnostic ignored \"-Wimplicit-fallthrough\"")
     63 #  define ConvertUTF_RESTORE_WARNINGS \
     64     _Pragma("clang diagnostic pop")
     65 # endif
     66 #elif defined(__GNUC__) && __GNUC__ > 6
     67 # define ConvertUTF_DISABLE_WARNINGS \
     68    _Pragma("GCC diagnostic push")    \
     69    _Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"")
     70 # define ConvertUTF_RESTORE_WARNINGS \
     71    _Pragma("GCC diagnostic pop")
     72 #endif
     73 #ifndef ConvertUTF_DISABLE_WARNINGS
     74 # define ConvertUTF_DISABLE_WARNINGS
     75 #endif
     76 #ifndef ConvertUTF_RESTORE_WARNINGS
     77 # define ConvertUTF_RESTORE_WARNINGS
     78 #endif
     79 
     80 ConvertUTF_DISABLE_WARNINGS
     81 
     82 namespace llvm {
     83 
     84 static const int halfShift  = 10; /* used for shifting by 10 bits */
     85 
     86 static const UTF32 halfBase = 0x0010000UL;
     87 static const UTF32 halfMask = 0x3FFUL;
     88 
     89 #define UNI_SUR_HIGH_START  (UTF32)0xD800
     90 #define UNI_SUR_HIGH_END    (UTF32)0xDBFF
     91 #define UNI_SUR_LOW_START   (UTF32)0xDC00
     92 #define UNI_SUR_LOW_END     (UTF32)0xDFFF
     93 
     94 /* --------------------------------------------------------------------- */
     95 
     96 /*
     97  * Index into the table below with the first byte of a UTF-8 sequence to
     98  * get the number of trailing bytes that are supposed to follow it.
     99  * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
    100  * left as-is for anyone who may want to do such conversion, which was
    101  * allowed in earlier algorithms.
    102  */
    103 static const char trailingBytesForUTF8[256] = {
    104     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    105     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    106     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    107     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    108     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    109     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    110     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    111     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
    112 };
    113 
    114 /*
    115  * Magic values subtracted from a buffer value during UTF8 conversion.
    116  * This table contains as many values as there might be trailing bytes
    117  * in a UTF-8 sequence.
    118  */
    119 static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
    120                      0x03C82080UL, 0xFA082080UL, 0x82082080UL };
    121 
    122 /*
    123  * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
    124  * into the first byte, depending on how many bytes follow.  There are
    125  * as many entries in this table as there are UTF-8 sequence types.
    126  * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
    127  * for *legal* UTF-8 will be 4 or fewer bytes total.
    128  */
    129 static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
    130 
    131 /* --------------------------------------------------------------------- */
    132 
    133 /* The interface converts a whole buffer to avoid function-call overhead.
    134  * Constants have been gathered. Loops & conditionals have been removed as
    135  * much as possible for efficiency, in favor of drop-through switches.
    136  * (See "Note A" at the bottom of the file for equivalent code.)
    137  * If your compiler supports it, the "isLegalUTF8" call can be turned
    138  * into an inline function.
    139  */
    140 
    141 
    142 /* --------------------------------------------------------------------- */
    143 
    144 ConversionResult ConvertUTF32toUTF16 (
    145         const UTF32** sourceStart, const UTF32* sourceEnd,
    146         UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
    147     ConversionResult result = conversionOK;
    148     const UTF32* source = *sourceStart;
    149     UTF16* target = *targetStart;
    150     while (source < sourceEnd) {
    151         UTF32 ch;
    152         if (target >= targetEnd) {
    153             result = targetExhausted; break;
    154         }
    155         ch = *source++;
    156         if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
    157             /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
    158             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
    159                 if (flags == strictConversion) {
    160                     --source; /* return to the illegal value itself */
    161                     result = sourceIllegal;
    162                     break;
    163                 } else {
    164                     *target++ = UNI_REPLACEMENT_CHAR;
    165                 }
    166             } else {
    167                 *target++ = (UTF16)ch; /* normal case */
    168             }
    169         } else if (ch > UNI_MAX_LEGAL_UTF32) {
    170             if (flags == strictConversion) {
    171                 result = sourceIllegal;
    172             } else {
    173                 *target++ = UNI_REPLACEMENT_CHAR;
    174             }
    175         } else {
    176             /* target is a character in range 0xFFFF - 0x10FFFF. */
    177             if (target + 1 >= targetEnd) {
    178                 --source; /* Back up source pointer! */
    179                 result = targetExhausted; break;
    180             }
    181             ch -= halfBase;
    182             *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
    183             *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
    184         }
    185     }
    186     *sourceStart = source;
    187     *targetStart = target;
    188     return result;
    189 }
    190 
    191 /* --------------------------------------------------------------------- */
    192 
    193 ConversionResult ConvertUTF16toUTF32 (
    194         const UTF16** sourceStart, const UTF16* sourceEnd,
    195         UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
    196     ConversionResult result = conversionOK;
    197     const UTF16* source = *sourceStart;
    198     UTF32* target = *targetStart;
    199     UTF32 ch, ch2;
    200     while (source < sourceEnd) {
    201         const UTF16* oldSource = source; /*  In case we have to back up because of target overflow. */
    202         ch = *source++;
    203         /* If we have a surrogate pair, convert to UTF32 first. */
    204         if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
    205             /* If the 16 bits following the high surrogate are in the source buffer... */
    206             if (source < sourceEnd) {
    207                 ch2 = *source;
    208                 /* If it's a low surrogate, convert to UTF32. */
    209                 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
    210                     ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
    211                         + (ch2 - UNI_SUR_LOW_START) + halfBase;
    212                     ++source;
    213                 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
    214                     --source; /* return to the illegal value itself */
    215                     result = sourceIllegal;
    216                     break;
    217                 }
    218             } else { /* We don't have the 16 bits following the high surrogate. */
    219                 --source; /* return to the high surrogate */
    220                 result = sourceExhausted;
    221                 break;
    222             }
    223         } else if (flags == strictConversion) {
    224             /* UTF-16 surrogate values are illegal in UTF-32 */
    225             if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
    226                 --source; /* return to the illegal value itself */
    227                 result = sourceIllegal;
    228                 break;
    229             }
    230         }
    231         if (target >= targetEnd) {
    232             source = oldSource; /* Back up source pointer! */
    233             result = targetExhausted; break;
    234         }
    235         *target++ = ch;
    236     }
    237     *sourceStart = source;
    238     *targetStart = target;
    239 #ifdef CVTUTF_DEBUG
    240 if (result == sourceIllegal) {
    241     fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
    242     fflush(stderr);
    243 }
    244 #endif
    245     return result;
    246 }
    247 ConversionResult ConvertUTF16toUTF8 (
    248         const UTF16** sourceStart, const UTF16* sourceEnd,
    249         UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
    250     ConversionResult result = conversionOK;
    251     const UTF16* source = *sourceStart;
    252     UTF8* target = *targetStart;
    253     while (source < sourceEnd) {
    254         UTF32 ch;
    255         unsigned short bytesToWrite = 0;
    256         const UTF32 byteMask = 0xBF;
    257         const UTF32 byteMark = 0x80;
    258         const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
    259         ch = *source++;
    260         /* If we have a surrogate pair, convert to UTF32 first. */
    261         if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
    262             /* If the 16 bits following the high surrogate are in the source buffer... */
    263             if (source < sourceEnd) {
    264                 UTF32 ch2 = *source;
    265                 /* If it's a low surrogate, convert to UTF32. */
    266                 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
    267                     ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
    268                         + (ch2 - UNI_SUR_LOW_START) + halfBase;
    269                     ++source;
    270                 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
    271                     --source; /* return to the illegal value itself */
    272                     result = sourceIllegal;
    273                     break;
    274                 }
    275             } else { /* We don't have the 16 bits following the high surrogate. */
    276                 --source; /* return to the high surrogate */
    277                 result = sourceExhausted;
    278                 break;
    279             }
    280         } else if (flags == strictConversion) {
    281             /* UTF-16 surrogate values are illegal in UTF-32 */
    282             if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
    283                 --source; /* return to the illegal value itself */
    284                 result = sourceIllegal;
    285                 break;
    286             }
    287         }
    288         /* Figure out how many bytes the result will require */
    289         if (ch < (UTF32)0x80) {      bytesToWrite = 1;
    290         } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
    291         } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
    292         } else if (ch < (UTF32)0x110000) {  bytesToWrite = 4;
    293         } else {                            bytesToWrite = 3;
    294                                             ch = UNI_REPLACEMENT_CHAR;
    295         }
    296 
    297         target += bytesToWrite;
    298         if (target > targetEnd) {
    299             source = oldSource; /* Back up source pointer! */
    300             target -= bytesToWrite; result = targetExhausted; break;
    301         }
    302         switch (bytesToWrite) { /* note: everything falls through. */
    303             case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
    304             case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
    305             case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
    306             case 1: *--target =  (UTF8)(ch | firstByteMark[bytesToWrite]);
    307         }
    308         target += bytesToWrite;
    309     }
    310     *sourceStart = source;
    311     *targetStart = target;
    312     return result;
    313 }
    314 
    315 /* --------------------------------------------------------------------- */
    316 
    317 ConversionResult ConvertUTF32toUTF8 (
    318         const UTF32** sourceStart, const UTF32* sourceEnd,
    319         UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
    320     ConversionResult result = conversionOK;
    321     const UTF32* source = *sourceStart;
    322     UTF8* target = *targetStart;
    323     while (source < sourceEnd) {
    324         UTF32 ch;
    325         unsigned short bytesToWrite = 0;
    326         const UTF32 byteMask = 0xBF;
    327         const UTF32 byteMark = 0x80;
    328         ch = *source++;
    329         if (flags == strictConversion ) {
    330             /* UTF-16 surrogate values are illegal in UTF-32 */
    331             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
    332                 --source; /* return to the illegal value itself */
    333                 result = sourceIllegal;
    334                 break;
    335             }
    336         }
    337         /*
    338          * Figure out how many bytes the result will require. Turn any
    339          * illegally large UTF32 things (> Plane 17) into replacement chars.
    340          */
    341         if (ch < (UTF32)0x80) {      bytesToWrite = 1;
    342         } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
    343         } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
    344         } else if (ch <= UNI_MAX_LEGAL_UTF32) {  bytesToWrite = 4;
    345         } else {                            bytesToWrite = 3;
    346                                             ch = UNI_REPLACEMENT_CHAR;
    347                                             result = sourceIllegal;
    348         }
    349 
    350         target += bytesToWrite;
    351         if (target > targetEnd) {
    352             --source; /* Back up source pointer! */
    353             target -= bytesToWrite; result = targetExhausted; break;
    354         }
    355         switch (bytesToWrite) { /* note: everything falls through. */
    356             case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
    357             case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
    358             case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
    359             case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
    360         }
    361         target += bytesToWrite;
    362     }
    363     *sourceStart = source;
    364     *targetStart = target;
    365     return result;
    366 }
    367 
    368 /* --------------------------------------------------------------------- */
    369 
    370 /*
    371  * Utility routine to tell whether a sequence of bytes is legal UTF-8.
    372  * This must be called with the length pre-determined by the first byte.
    373  * If not calling this from ConvertUTF8to*, then the length can be set by:
    374  *  length = trailingBytesForUTF8[*source]+1;
    375  * and the sequence is illegal right away if there aren't that many bytes
    376  * available.
    377  * If presented with a length > 4, this returns false.  The Unicode
    378  * definition of UTF-8 goes up to 4-byte sequences.
    379  */
    380 
    381 static Boolean isLegalUTF8(const UTF8 *source, int length) {
    382     UTF8 a;
    383     const UTF8 *srcptr = source+length;
    384     switch (length) {
    385     default: return false;
    386         /* Everything else falls through when "true"... */
    387     case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
    388     case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
    389     case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
    390 
    391         switch (*source) {
    392             /* no fall-through in this inner switch */
    393             case 0xE0: if (a < 0xA0) return false; break;
    394             case 0xED: if (a > 0x9F) return false; break;
    395             case 0xF0: if (a < 0x90) return false; break;
    396             case 0xF4: if (a > 0x8F) return false; break;
    397             default:   if (a < 0x80) return false;
    398         }
    399 
    400     case 1: if (*source >= 0x80 && *source < 0xC2) return false;
    401     }
    402     if (*source > 0xF4) return false;
    403     return true;
    404 }
    405 
    406 /* --------------------------------------------------------------------- */
    407 
    408 /*
    409  * Exported function to return whether a UTF-8 sequence is legal or not.
    410  * This is not used here; it's just exported.
    411  */
    412 Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
    413     int length = trailingBytesForUTF8[*source]+1;
    414     if (length > sourceEnd - source) {
    415         return false;
    416     }
    417     return isLegalUTF8(source, length);
    418 }
    419 
    420 /* --------------------------------------------------------------------- */
    421 
    422 static unsigned
    423 findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source,
    424                                           const UTF8 *sourceEnd) {
    425   UTF8 b1, b2, b3;
    426 
    427   assert(!isLegalUTF8Sequence(source, sourceEnd));
    428 
    429   /*
    430    * Unicode 6.3.0, D93b:
    431    *
    432    *   Maximal subpart of an ill-formed subsequence: The longest code unit
    433    *   subsequence starting at an unconvertible offset that is either:
    434    *   a. the initial subsequence of a well-formed code unit sequence, or
    435    *   b. a subsequence of length one.
    436    */
    437 
    438   if (source == sourceEnd)
    439     return 0;
    440 
    441   /*
    442    * Perform case analysis.  See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8
    443    * Byte Sequences.
    444    */
    445 
    446   b1 = *source;
    447   ++source;
    448   if (b1 >= 0xC2 && b1 <= 0xDF) {
    449     /*
    450      * First byte is valid, but we know that this code unit sequence is
    451      * invalid, so the maximal subpart has to end after the first byte.
    452      */
    453     return 1;
    454   }
    455 
    456   if (source == sourceEnd)
    457     return 1;
    458 
    459   b2 = *source;
    460   ++source;
    461 
    462   if (b1 == 0xE0) {
    463     return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1;
    464   }
    465   if (b1 >= 0xE1 && b1 <= 0xEC) {
    466     return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
    467   }
    468   if (b1 == 0xED) {
    469     return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1;
    470   }
    471   if (b1 >= 0xEE && b1 <= 0xEF) {
    472     return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
    473   }
    474   if (b1 == 0xF0) {
    475     if (b2 >= 0x90 && b2 <= 0xBF) {
    476       if (source == sourceEnd)
    477         return 2;
    478 
    479       b3 = *source;
    480       return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
    481     }
    482     return 1;
    483   }
    484   if (b1 >= 0xF1 && b1 <= 0xF3) {
    485     if (b2 >= 0x80 && b2 <= 0xBF) {
    486       if (source == sourceEnd)
    487         return 2;
    488 
    489       b3 = *source;
    490       return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
    491     }
    492     return 1;
    493   }
    494   if (b1 == 0xF4) {
    495     if (b2 >= 0x80 && b2 <= 0x8F) {
    496       if (source == sourceEnd)
    497         return 2;
    498 
    499       b3 = *source;
    500       return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
    501     }
    502     return 1;
    503   }
    504 
    505   assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5);
    506   /*
    507    * There are no valid sequences that start with these bytes.  Maximal subpart
    508    * is defined to have length 1 in these cases.
    509    */
    510   return 1;
    511 }
    512 
    513 /* --------------------------------------------------------------------- */
    514 
    515 /*
    516  * Exported function to return the total number of bytes in a codepoint
    517  * represented in UTF-8, given the value of the first byte.
    518  */
    519 unsigned getNumBytesForUTF8(UTF8 first) {
    520   return trailingBytesForUTF8[first] + 1;
    521 }
    522 
    523 /* --------------------------------------------------------------------- */
    524 
    525 /*
    526  * Exported function to return whether a UTF-8 string is legal or not.
    527  * This is not used here; it's just exported.
    528  */
    529 Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) {
    530     while (*source != sourceEnd) {
    531         int length = trailingBytesForUTF8[**source] + 1;
    532         if (length > sourceEnd - *source || !isLegalUTF8(*source, length))
    533             return false;
    534         *source += length;
    535     }
    536     return true;
    537 }
    538 
    539 /* --------------------------------------------------------------------- */
    540 
    541 ConversionResult ConvertUTF8toUTF16 (
    542         const UTF8** sourceStart, const UTF8* sourceEnd,
    543         UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
    544     ConversionResult result = conversionOK;
    545     const UTF8* source = *sourceStart;
    546     UTF16* target = *targetStart;
    547     while (source < sourceEnd) {
    548         UTF32 ch = 0;
    549         unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
    550         if (extraBytesToRead >= sourceEnd - source) {
    551             result = sourceExhausted; break;
    552         }
    553         /* Do this check whether lenient or strict */
    554         if (!isLegalUTF8(source, extraBytesToRead+1)) {
    555             result = sourceIllegal;
    556             break;
    557         }
    558         /*
    559          * The cases all fall through. See "Note A" below.
    560          */
    561         switch (extraBytesToRead) {
    562             case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
    563             case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
    564             case 3: ch += *source++; ch <<= 6;
    565             case 2: ch += *source++; ch <<= 6;
    566             case 1: ch += *source++; ch <<= 6;
    567             case 0: ch += *source++;
    568         }
    569         ch -= offsetsFromUTF8[extraBytesToRead];
    570 
    571         if (target >= targetEnd) {
    572             source -= (extraBytesToRead+1); /* Back up source pointer! */
    573             result = targetExhausted; break;
    574         }
    575         if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
    576             /* UTF-16 surrogate values are illegal in UTF-32 */
    577             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
    578                 if (flags == strictConversion) {
    579                     source -= (extraBytesToRead+1); /* return to the illegal value itself */
    580                     result = sourceIllegal;
    581                     break;
    582                 } else {
    583                     *target++ = UNI_REPLACEMENT_CHAR;
    584                 }
    585             } else {
    586                 *target++ = (UTF16)ch; /* normal case */
    587             }
    588         } else if (ch > UNI_MAX_UTF16) {
    589             if (flags == strictConversion) {
    590                 result = sourceIllegal;
    591                 source -= (extraBytesToRead+1); /* return to the start */
    592                 break; /* Bail out; shouldn't continue */
    593             } else {
    594                 *target++ = UNI_REPLACEMENT_CHAR;
    595             }
    596         } else {
    597             /* target is a character in range 0xFFFF - 0x10FFFF. */
    598             if (target + 1 >= targetEnd) {
    599                 source -= (extraBytesToRead+1); /* Back up source pointer! */
    600                 result = targetExhausted; break;
    601             }
    602             ch -= halfBase;
    603             *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
    604             *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
    605         }
    606     }
    607     *sourceStart = source;
    608     *targetStart = target;
    609     return result;
    610 }
    611 
    612 /* --------------------------------------------------------------------- */
    613 
    614 static ConversionResult ConvertUTF8toUTF32Impl(
    615         const UTF8** sourceStart, const UTF8* sourceEnd,
    616         UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags,
    617         Boolean InputIsPartial) {
    618     ConversionResult result = conversionOK;
    619     const UTF8* source = *sourceStart;
    620     UTF32* target = *targetStart;
    621     while (source < sourceEnd) {
    622         UTF32 ch = 0;
    623         unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
    624         if (extraBytesToRead >= sourceEnd - source) {
    625             if (flags == strictConversion || InputIsPartial) {
    626                 result = sourceExhausted;
    627                 break;
    628             } else {
    629                 result = sourceIllegal;
    630 
    631                 /*
    632                  * Replace the maximal subpart of ill-formed sequence with
    633                  * replacement character.
    634                  */
    635                 source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
    636                                                                     sourceEnd);
    637                 *target++ = UNI_REPLACEMENT_CHAR;
    638                 continue;
    639             }
    640         }
    641         if (target >= targetEnd) {
    642             result = targetExhausted; break;
    643         }
    644 
    645         /* Do this check whether lenient or strict */
    646         if (!isLegalUTF8(source, extraBytesToRead+1)) {
    647             result = sourceIllegal;
    648             if (flags == strictConversion) {
    649                 /* Abort conversion. */
    650                 break;
    651             } else {
    652                 /*
    653                  * Replace the maximal subpart of ill-formed sequence with
    654                  * replacement character.
    655                  */
    656                 source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
    657                                                                     sourceEnd);
    658                 *target++ = UNI_REPLACEMENT_CHAR;
    659                 continue;
    660             }
    661         }
    662         /*
    663          * The cases all fall through. See "Note A" below.
    664          */
    665         switch (extraBytesToRead) {
    666             case 5: ch += *source++; ch <<= 6;
    667             case 4: ch += *source++; ch <<= 6;
    668             case 3: ch += *source++; ch <<= 6;
    669             case 2: ch += *source++; ch <<= 6;
    670             case 1: ch += *source++; ch <<= 6;
    671             case 0: ch += *source++;
    672         }
    673         ch -= offsetsFromUTF8[extraBytesToRead];
    674 
    675         if (ch <= UNI_MAX_LEGAL_UTF32) {
    676             /*
    677              * UTF-16 surrogate values are illegal in UTF-32, and anything
    678              * over Plane 17 (> 0x10FFFF) is illegal.
    679              */
    680             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
    681                 if (flags == strictConversion) {
    682                     source -= (extraBytesToRead+1); /* return to the illegal value itself */
    683                     result = sourceIllegal;
    684                     break;
    685                 } else {
    686                     *target++ = UNI_REPLACEMENT_CHAR;
    687                 }
    688             } else {
    689                 *target++ = ch;
    690             }
    691         } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
    692             result = sourceIllegal;
    693             *target++ = UNI_REPLACEMENT_CHAR;
    694         }
    695     }
    696     *sourceStart = source;
    697     *targetStart = target;
    698     return result;
    699 }
    700 
    701 ConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart,
    702                                            const UTF8 *sourceEnd,
    703                                            UTF32 **targetStart,
    704                                            UTF32 *targetEnd,
    705                                            ConversionFlags flags) {
    706   return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
    707                                 flags, /*InputIsPartial=*/true);
    708 }
    709 
    710 ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart,
    711                                     const UTF8 *sourceEnd, UTF32 **targetStart,
    712                                     UTF32 *targetEnd, ConversionFlags flags) {
    713   return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
    714                                 flags, /*InputIsPartial=*/false);
    715 }
    716 
    717 /* ---------------------------------------------------------------------
    718 
    719     Note A.
    720     The fall-through switches in UTF-8 reading code save a
    721     temp variable, some decrements & conditionals.  The switches
    722     are equivalent to the following loop:
    723         {
    724             int tmpBytesToRead = extraBytesToRead+1;
    725             do {
    726                 ch += *source++;
    727                 --tmpBytesToRead;
    728                 if (tmpBytesToRead) ch <<= 6;
    729             } while (tmpBytesToRead > 0);
    730         }
    731     In UTF-8 writing code, the switches on "bytesToWrite" are
    732     similarly unrolled loops.
    733 
    734    --------------------------------------------------------------------- */
    735 
    736 } // namespace llvm
    737 
    738 ConvertUTF_RESTORE_WARNINGS
    739