tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

utf_impl.cpp (11607B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 ******************************************************************************
      5 *
      6 *   Copyright (C) 1999-2012, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 ******************************************************************************
     10 *   file name:  utf_impl.cpp
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 1999sep13
     16 *   created by: Markus W. Scherer
     17 *
     18 *   This file provides implementation functions for macros in the utfXX.h
     19 *   that would otherwise be too long as macros.
     20 */
     21 
     22 /* set import/export definitions */
     23 #ifndef U_UTF8_IMPL
     24 #   define U_UTF8_IMPL
     25 #endif
     26 
     27 #include "unicode/utypes.h"
     28 #include "unicode/utf.h"
     29 #include "unicode/utf8.h"
     30 #include "uassert.h"
     31 
     32 /*
     33 * Table of the number of utf8 trail bytes, indexed by the lead byte.
     34 * Used by the deprecated macro UTF8_COUNT_TRAIL_BYTES, defined in utf_old.h
     35 *
     36 * The current macro, U8_COUNT_TRAIL_BYTES, does _not_ use this table.
     37 *
     38 * Note that this table cannot be removed, even if UTF8_COUNT_TRAIL_BYTES were
     39 * changed to no longer use it. References to the table from expansions of UTF8_COUNT_TRAIL_BYTES
     40 * may exist in old client code that must continue to run with newer icu library versions.
     41 *
     42 * This table could be replaced on many machines by
     43 * a few lines of assembler code using an
     44 * "index of first 0-bit from msb" instruction and
     45 * one or two more integer instructions.
     46 *
     47 * For example, on an i386, do something like
     48 * - MOV AL, leadByte
     49 * - NOT AL         (8-bit, leave b15..b8==0..0, reverse only b7..b0)
     50 * - MOV AH, 0
     51 * - BSR BX, AX     (16-bit)
     52 * - MOV AX, 6      (result)
     53 * - JZ finish      (ZF==1 if leadByte==0xff)
     54 * - SUB AX, BX (result)
     55 * -finish:
     56 * (BSR: Bit Scan Reverse, scans for a 1-bit, starting from the MSB)
     57 */
     58 U_CAPI const uint8_t
     59 utf8_countTrailBytes[256]={
     60    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     61    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     62    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     63    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     64 
     65    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     66    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     67    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     68    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     69 
     70    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     71    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     72    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     73    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     74 
     75    // illegal C0 & C1
     76    // 2-byte lead bytes C2..DF
     77    0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     78    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     79 
     80    // 3-byte lead bytes E0..EF
     81    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     82    // 4-byte lead bytes F0..F4
     83    // illegal F5..FF
     84    3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
     85 };
     86 
     87 static const UChar32
     88 utf8_errorValue[6]={
     89    // Same values as UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE,
     90    // but without relying on the obsolete unicode/utf_old.h.
     91    0x15, 0x9f, 0xffff,
     92    0x10ffff
     93 };
     94 
     95 static UChar32
     96 errorValue(int32_t count, int8_t strict) {
     97    if(strict>=0) {
     98        return utf8_errorValue[count];
     99    } else if(strict==-3) {
    100        return 0xfffd;
    101    } else {
    102        return U_SENTINEL;
    103    }
    104 }
    105 
    106 /*
    107 * Handle the non-inline part of the U8_NEXT() and U8_NEXT_FFFD() macros
    108 * and their obsolete sibling UTF8_NEXT_CHAR_SAFE().
    109 *
    110 * U8_NEXT() supports NUL-terminated strings indicated via length<0.
    111 *
    112 * The "strict" parameter controls the error behavior:
    113 * <0  "Safe" behavior of U8_NEXT():
    114 *     -1: All illegal byte sequences yield U_SENTINEL=-1.
    115 *     -2: Same as -1, except for lenient treatment of surrogate code points as legal.
    116 *         Some implementations use this for roundtripping of
    117 *         Unicode 16-bit strings that are not well-formed UTF-16, that is, they
    118 *         contain unpaired surrogates.
    119 *     -3: All illegal byte sequences yield U+FFFD.
    120 *  0  Obsolete "safe" behavior of UTF8_NEXT_CHAR_SAFE(..., false):
    121 *     All illegal byte sequences yield a positive code point such that this
    122 *     result code point would be encoded with the same number of bytes as
    123 *     the illegal sequence.
    124 * >0  Obsolete "strict" behavior of UTF8_NEXT_CHAR_SAFE(..., true):
    125 *     Same as the obsolete "safe" behavior, but non-characters are also treated
    126 *     like illegal sequences.
    127 */
    128 U_CAPI UChar32 U_EXPORT2
    129 utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, int8_t strict) {
    130    // *pi is one after byte c.
    131    int32_t i=*pi;
    132    // length can be negative for NUL-terminated strings: Read and validate one byte at a time.
    133    if(i==length || c>0xf4) {
    134        // end of string, or not a lead byte
    135    } else if(c>=0xf0) {
    136        // Test for 4-byte sequences first because
    137        // U8_NEXT() handles shorter valid sequences inline.
    138        uint8_t t1=s[i], t2, t3;
    139        c&=7;
    140        if(U8_IS_VALID_LEAD4_AND_T1(c, t1) &&
    141                ++i!=length && (t2=s[i]-0x80)<=0x3f &&
    142                ++i!=length && (t3=s[i]-0x80)<=0x3f) {
    143            ++i;
    144            c=(c<<18)|((t1&0x3f)<<12)|(t2<<6)|t3;
    145            // strict: forbid non-characters like U+fffe
    146            if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
    147                *pi=i;
    148                return c;
    149            }
    150        }
    151    } else if(c>=0xe0) {
    152        c&=0xf;
    153        if(strict!=-2) {
    154            uint8_t t1=s[i], t2;
    155            if(U8_IS_VALID_LEAD3_AND_T1(c, t1) &&
    156                    ++i!=length && (t2=s[i]-0x80)<=0x3f) {
    157                ++i;
    158                c=(c<<12)|((t1&0x3f)<<6)|t2;
    159                // strict: forbid non-characters like U+fffe
    160                if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
    161                    *pi=i;
    162                    return c;
    163                }
    164            }
    165        } else {
    166            // strict=-2 -> lenient: allow surrogates
    167            uint8_t t1=s[i]-0x80, t2;
    168            if(t1<=0x3f && (c>0 || t1>=0x20) &&
    169                    ++i!=length && (t2=s[i]-0x80)<=0x3f) {
    170                *pi=i+1;
    171                return (c<<12)|(t1<<6)|t2;
    172            }
    173        }
    174    } else if(c>=0xc2) {
    175        uint8_t t1=s[i]-0x80;
    176        if(t1<=0x3f) {
    177            *pi=i+1;
    178            return ((c-0xc0)<<6)|t1;
    179        }
    180    }  // else 0x80<=c<0xc2 is not a lead byte
    181 
    182    /* error handling */
    183    c=errorValue(i-*pi, strict);
    184    *pi=i;
    185    return c;
    186 }
    187 
    188 U_CAPI int32_t U_EXPORT2
    189 utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError) {
    190    if((uint32_t)(c)<=0x7ff) {
    191        if((i)+1<(length)) {
    192            (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0);
    193            (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
    194            return i;
    195        }
    196    } else if((uint32_t)(c)<=0xffff) {
    197        /* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8. */
    198        if((i)+2<(length) && !U_IS_SURROGATE(c)) {
    199            (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0);
    200            (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80);
    201            (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
    202            return i;
    203        }
    204    } else if((uint32_t)(c)<=0x10ffff) {
    205        if((i)+3<(length)) {
    206            (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0);
    207            (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80);
    208            (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80);
    209            (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
    210            return i;
    211        }
    212    }
    213    /* c>0x10ffff or not enough space, write an error value */
    214    if(pIsError!=nullptr) {
    215        *pIsError=true;
    216    } else {
    217        length-=i;
    218        if(length>0) {
    219            int32_t offset;
    220            if(length>3) {
    221                length=3;
    222            }
    223            s+=i;
    224            offset=0;
    225            c=utf8_errorValue[length-1];
    226            U8_APPEND_UNSAFE(s, offset, c);
    227            i=i+offset;
    228        }
    229    }
    230    return i;
    231 }
    232 
    233 U_CAPI UChar32 U_EXPORT2
    234 utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, int8_t strict) {
    235    // *pi is the index of byte c.
    236    int32_t i=*pi;
    237    if(U8_IS_TRAIL(c) && i>start) {
    238        uint8_t b1=s[--i];
    239        if(U8_IS_LEAD(b1)) {
    240            if(b1<0xe0) {
    241                *pi=i;
    242                return ((b1-0xc0)<<6)|(c&0x3f);
    243            } else if(b1<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b1, c) : U8_IS_VALID_LEAD4_AND_T1(b1, c)) {
    244                // Truncated 3- or 4-byte sequence.
    245                *pi=i;
    246                return errorValue(1, strict);
    247            }
    248        } else if(U8_IS_TRAIL(b1) && i>start) {
    249            // Extract the value bits from the last trail byte.
    250            c&=0x3f;
    251            uint8_t b2=s[--i];
    252            if(0xe0<=b2 && b2<=0xf4) {
    253                if(b2<0xf0) {
    254                    b2&=0xf;
    255                    if(strict!=-2) {
    256                        if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
    257                            *pi=i;
    258                            c=(b2<<12)|((b1&0x3f)<<6)|c;
    259                            if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
    260                                return c;
    261                            } else {
    262                                // strict: forbid non-characters like U+fffe
    263                                return errorValue(2, strict);
    264                            }
    265                        }
    266                    } else {
    267                        // strict=-2 -> lenient: allow surrogates
    268                        b1-=0x80;
    269                        if((b2>0 || b1>=0x20)) {
    270                            *pi=i;
    271                            return (b2<<12)|(b1<<6)|c;
    272                        }
    273                    }
    274                } else if(U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
    275                    // Truncated 4-byte sequence.
    276                    *pi=i;
    277                    return errorValue(2, strict);
    278                }
    279            } else if(U8_IS_TRAIL(b2) && i>start) {
    280                uint8_t b3=s[--i];
    281                if(0xf0<=b3 && b3<=0xf4) {
    282                    b3&=7;
    283                    if(U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
    284                        *pi=i;
    285                        c=(b3<<18)|((b2&0x3f)<<12)|((b1&0x3f)<<6)|c;
    286                        if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
    287                            return c;
    288                        } else {
    289                            // strict: forbid non-characters like U+fffe
    290                            return errorValue(3, strict);
    291                        }
    292                    }
    293                }
    294            }
    295        }
    296    }
    297    return errorValue(0, strict);
    298 }
    299 
    300 U_CAPI int32_t U_EXPORT2
    301 utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i) {
    302    // Same as utf8_prevCharSafeBody(..., strict=-1) minus assembling code points.
    303    int32_t orig_i=i;
    304    uint8_t c=s[i];
    305    if(U8_IS_TRAIL(c) && i>start) {
    306        uint8_t b1=s[--i];
    307        if(U8_IS_LEAD(b1)) {
    308            if(b1<0xe0 ||
    309                    (b1<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b1, c) : U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
    310                return i;
    311            }
    312        } else if(U8_IS_TRAIL(b1) && i>start) {
    313            uint8_t b2=s[--i];
    314            if(0xe0<=b2 && b2<=0xf4) {
    315                if(b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b2, b1) : U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
    316                    return i;
    317                }
    318            } else if(U8_IS_TRAIL(b2) && i>start) {
    319                uint8_t b3=s[--i];
    320                if(0xf0<=b3 && b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
    321                    return i;
    322                }
    323            }
    324        }
    325    }
    326    return orig_i;
    327 }