[ tor-browser ].git.dasho

ucnvbocu.cpp (47492B)
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 ******************************************************************************
      5 *
      6 *   Copyright (C) 2002-2016, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 ******************************************************************************
     10 *   file name:  ucnvbocu.cpp
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2002mar27
     16 *   created by: Markus W. Scherer
     17 *
     18 *   This is an implementation of the Binary Ordered Compression for Unicode,
     19 *   in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/
     20 */
     21 
     22 #include "unicode/utypes.h"
     23 
     24 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
     25 
     26 #include "unicode/ucnv.h"
     27 #include "unicode/ucnv_cb.h"
     28 #include "unicode/utf16.h"
     29 #include "putilimp.h"
     30 #include "ucnv_bld.h"
     31 #include "ucnv_cnv.h"
     32 #include "uassert.h"
     33 
     34 /* BOCU-1 constants and macros ---------------------------------------------- */
     35 
     36 /*
     37 * BOCU-1 encodes the code points of a Unicode string as
     38 * a sequence of byte-encoded differences (slope detection),
     39 * preserving lexical order.
     40 *
     41 * Optimize the difference-taking for runs of Unicode text within
     42 * small scripts:
     43 *
     44 * Most small scripts are allocated within aligned 128-blocks of Unicode
     45 * code points. Lexical order is preserved if the "previous code point" state
     46 * is always moved into the middle of such a block.
     47 *
     48 * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
     49 * areas into the middle of those areas.
     50 *
     51 * C0 control codes and space are encoded with their US-ASCII bytes.
     52 * "prev" is reset for C0 controls but not for space.
     53 */
     54 
     55 /* initial value for "prev": middle of the ASCII range */
     56 #define BOCU1_ASCII_PREV        0x40
     57 
     58 /* bounding byte values for differences */
     59 #define BOCU1_MIN               0x21    /* lowest lead byte value; also the size of bocu1ByteToTrail[] */
     60 #define BOCU1_MIDDLE            0x90    /* byte that encodes diff==0, see PACK_SINGLE_DIFF() */
     61 #define BOCU1_MAX_LEAD          0xfe
     62 #define BOCU1_MAX_TRAIL         0xff
     63 #define BOCU1_RESET             0xff
     64 
     65 /* number of lead bytes */
     66 #define BOCU1_COUNT             (BOCU1_MAX_LEAD-BOCU1_MIN+1)    /* ==0xfe-0x21+1==222 */
     67 
     68 /* adjust trail byte counts for the use of some C0 control byte values */
     69 #define BOCU1_TRAIL_CONTROLS_COUNT  20    /* number of entries in bocu1TrailToByte[] */
     70 #define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)    /* ==0x21-20==13 */
     71 
     72 /* number of trail bytes */
     73 #define BOCU1_TRAIL_COUNT       ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)    /* ==223+20==243 */
     74 
     75 /*
     76 * number of positive and negative single-byte codes
     77 * (counting 0==BOCU1_MIDDLE among the positive ones)
     78 */
     79 #define BOCU1_SINGLE            64
     80 
     81 /* number of lead bytes for positive and negative 2/3/4-byte sequences */
        /* lead byte accounting: 2*BOCU1_SINGLE + 2*(43+3+1) == 128+94 == 222 == BOCU1_COUNT */
     82 #define BOCU1_LEAD_2            43
     83 #define BOCU1_LEAD_3            3
     84 #define BOCU1_LEAD_4            1
     85 
     86 /* The difference value range for single-byters. */
     87 #define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1)    /* ==63 */
     88 #define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE)    /* ==-64 */
     89 
     90 /* The difference value range for double-byters. */
     91 #define BOCU1_REACH_POS_2   (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)    /* ==63+43*243==10512 */
     92 #define BOCU1_REACH_NEG_2   (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)    /* ==-64-43*243==-10513 */
     93 
     94 /* The difference value range for 3-byters. */
     95 #define BOCU1_REACH_POS_3   \
     96    (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)    /* ==10512+3*243*243==187659 */
     97 
     98 #define BOCU1_REACH_NEG_3   (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)    /* ==-10513-3*243*243==-187660 */
     99 
    100 /* The lead byte start values. */
    101 #define BOCU1_START_POS_2   (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)    /* ==0xd0 */
    102 #define BOCU1_START_POS_3   (BOCU1_START_POS_2+BOCU1_LEAD_2)    /* ==0xfb */
    103 #define BOCU1_START_POS_4   (BOCU1_START_POS_3+BOCU1_LEAD_3)
    104     /* ==BOCU1_MAX_LEAD (0xfe) */
    105 
    106 #define BOCU1_START_NEG_2   (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)    /* ==0x50 */
    107 #define BOCU1_START_NEG_3   (BOCU1_START_NEG_2-BOCU1_LEAD_2)    /* ==0x25 */
    108 #define BOCU1_START_NEG_4   (BOCU1_START_NEG_3-BOCU1_LEAD_3)
    109     /* ==BOCU1_MIN+1 (0x22) */
    110 
    111 /*
        * The length of a byte sequence, according to the lead byte (!=BOCU1_RESET).
        * The lead byte ranges are nested around BOCU1_MIDDLE, so each test only
        * needs to widen the previous interval; anything outside all three is a
        * four-byte lead. The argument is evaluated several times.
        */
    112 #define BOCU1_LENGTH_FROM_LEAD(lead) \
    113    ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
    114     (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
    115     (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)
    116 
    117 /*
        * The length of a byte sequence, according to its packed form.
        * packDiff() stores the length 1..3 in the top byte of the packed value;
        * for a four-byte sequence the top byte is the lead byte itself (>0x03),
        * which the unsigned comparison below detects.
        */
    118 #define BOCU1_LENGTH_FROM_PACKED(packed) \
    119    ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
    120 
    121 /*
    122 * 12 commonly used C0 control codes (and space) are only used to encode
    123 * themselves directly,
    124 * which makes BOCU-1 MIME-usable and reasonably safe for
    125 * ASCII-oriented software.
    126 *
    127 * These controls are
    128 *  0   NUL
    129 *
    130 *  7   BEL
    131 *  8   BS
    132 *
    133 *  9   TAB
    134 *  a   LF
    135 *  b   VT
    136 *  c   FF
    137 *  d   CR
    138 *
    139 *  e   SO
    140 *  f   SI
    141 *
    142 * 1a   SUB
    143 * 1b   ESC
    144 *
    145 * The other 20 C0 controls are also encoded directly (to preserve order)
    146 * but are also used as trail bytes in difference encoding
    147 * (for better compression).
    148 */
        /*
        * Map a trail value t (as used in the difference calculation) to its external byte:
        * values below BOCU1_TRAIL_CONTROLS_COUNT are looked up in bocu1TrailToByte[],
        * all others are shifted up by BOCU1_TRAIL_BYTE_OFFSET.
        * t is evaluated more than once.
        */
    149 #define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])
    150 
    151 /*
    152 * Byte value map for control codes,
    153 * from external byte values 0x00..0x20
    154 * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
    155 * External byte values that are illegal as trail bytes are mapped to -1.
    156 */
    157 static const int8_t
    158 bocu1ByteToTrail[BOCU1_MIN]={
    159 /*  0     1     2     3     4     5     6     7    */
    160    -1,   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
    161 
    162 /*  8     9     a     b     c     d     e     f    */
    163    -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
    164 
    165 /*  10    11    12    13    14    15    16    17   */
    166    0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
    167 
    168 /*  18    19    1a    1b    1c    1d    1e    1f   */
    169    0x0e, 0x0f, -1,   -1,   0x10, 0x11, 0x12, 0x13,
    170 
    171 /*  20   */
    172    -1
    173 };
    174 
    175 /*
    176 * Byte value map for control codes,
    177 * from trail byte values 0..19 (0..0x13) as used in the difference calculation
    178 * to external byte values 0x00..0x20.
    179 */
    180 static const int8_t
    181 bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
    182 /*  0     1     2     3     4     5     6     7    */
    183    0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
    184 
    185 /*  8     9     a     b     c     d     e     f    */
    186    0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
    187 
    188 /*  10    11    12    13   */
    189    0x1c, 0x1d, 0x1e, 0x1f
    190 };
    191 
    192 /**
    193 * Integer division and modulo with negative numerators
    194 * yields negative modulo results and quotients that are one more than
    195 * what we need here.
    196 * This macro adjusts the results so that the modulo-value m is always >=0
        * (and the quotient is decremented to match whenever m had to be adjusted).
    197 *
    198 * For positive n, the if() condition is always false.
    199 *
        * n, d and m are each evaluated more than once; pass plain variables/constants.
        *
    200 * @param n Number to be split into quotient and rest.
    201 *          Will be modified to contain the quotient.
    202 * @param d Divisor.
    203 * @param m Output variable for the rest (modulo result).
    204 */
    205 #define NEGDIVMOD(n, d, m) UPRV_BLOCK_MACRO_BEGIN { \
    206    (m)=(n)%(d); \
    207    (n)/=(d); \
    208    if((m)<0) { \
    209        --(n); \
    210        (m)+=(d); \
    211    } \
    212 } UPRV_BLOCK_MACRO_END
    213 
    214 /* Faster versions of packDiff() for single-byte-encoded diff values. */
    215 
    216 /** Is a diff value encodable in a single byte? (BOCU1_REACH_NEG_1..BOCU1_REACH_POS_1, inclusive) */
    217 #define DIFF_IS_SINGLE(diff) (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1)
    218 
    219 /** Encode a diff value in a single byte. Only meaningful when DIFF_IS_SINGLE(diff). */
    220 #define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff))
    221 
    222 /**
        * Is a diff value encodable in two bytes?
        * The range test also accepts single-byte diffs; the conversion loops
        * test DIFF_IS_SINGLE() first.
        */
    223 #define DIFF_IS_DOUBLE(diff) (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2)
    224 
    225 /* BOCU-1 implementation functions ------------------------------------------ */
    226 
        /** "prev" for small scripts: start of the aligned 128-block containing c, plus BOCU1_ASCII_PREV (0x40). */
    227 #define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV)
    228 
    229 /**
    230 * Compute the next "previous" value for differencing
    231 * from the current code point.
    232 *
    233 * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)
    234 * @return "previous code point" state value
    235 */
    236 static inline int32_t
    237 bocu1Prev(int32_t c) {
    238    /* compute new prev */
    239    if(/* 0x3040<=c && */ c<=0x309f) {
    240        /* Hiragana is not 128-aligned */
    241        return 0x3070;
    242    } else if(0x4e00<=c && c<=0x9fa5) {
    243        /* CJK Unihan */
    244        return 0x4e00-BOCU1_REACH_NEG_2;
    245    } else if(0xac00<=c /* && c<=0xd7a3 */) {
    246        /* Korean Hangul */
    247        return (0xd7a3+0xac00)/2;
    248    } else {
    249        /* mostly small scripts */
    250        return BOCU1_SIMPLE_PREV(c);
    251    }
    252 }
    253 
    254 /**
        * Fast version of bocu1Prev() for most scripts:
        * code points outside 0x3040..0xd7a3 use BOCU1_SIMPLE_PREV() inline,
        * only the range in between calls bocu1Prev(). c is evaluated more than once.
        */
    255 #define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c))
    256 
    257 /*
    258 * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
    259 * The UConverter fields are used as follows:
    260 *
    261 * fromUnicodeStatus    encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
    262 *
    263 * toUnicodeStatus      decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
    264 * mode                 decoder's incomplete (diff<<2)|count (ignored when toULength==0)
    265 */
    266 
    267 /* BOCU-1-from-Unicode conversion functions --------------------------------- */
    268 
    269 /**
    270 * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
    271 * and return a packed integer with them.
    272 *
    273 * The encoding favors small absolute differences with short encodings
    274 * to compress runs of same-script characters.
    275 *
    276 * Optimized version with unrolled loops and fewer floating-point operations
    277 * than the standard packDiff().
    278 *
    279 * @param diff difference value -0x10ffff..0x10ffff
    280 * @return
    281 *      0x010000zz for 1-byte sequence zz
    282 *      0x0200yyzz for 2-byte sequence yy zz
    283 *      0x03xxyyzz for 3-byte sequence xx yy zz
    284 *      0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
    285 */
    286 static int32_t
    287 packDiff(int32_t diff) {
    288    int32_t result, m;
    289 
    290    U_ASSERT(!DIFF_IS_SINGLE(diff)); /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */
    291    if(diff>=BOCU1_REACH_NEG_1) {
    292        /* mostly positive differences, and single-byte negative ones */
    293 #if 0   /* single-byte case handled in macros, see below */
    294        if(diff<=BOCU1_REACH_POS_1) {
    295            /* single byte */
    296            return 0x01000000|(BOCU1_MIDDLE+diff);
    297        } else
    298 #endif
    299        if(diff<=BOCU1_REACH_POS_2) {
    300            /* two bytes */
    301            diff-=BOCU1_REACH_POS_1+1;
    302            result=0x02000000;
    303 
    304            m=diff%BOCU1_TRAIL_COUNT;
    305            diff/=BOCU1_TRAIL_COUNT;
    306            result|=BOCU1_TRAIL_TO_BYTE(m);
    307 
    308            result|=(BOCU1_START_POS_2+diff)<<8;
    309        } else if(diff<=BOCU1_REACH_POS_3) {
    310            /* three bytes */
    311            diff-=BOCU1_REACH_POS_2+1;
    312            result=0x03000000;
    313 
    314            m=diff%BOCU1_TRAIL_COUNT;
    315            diff/=BOCU1_TRAIL_COUNT;
    316            result|=BOCU1_TRAIL_TO_BYTE(m);
    317 
    318            m=diff%BOCU1_TRAIL_COUNT;
    319            diff/=BOCU1_TRAIL_COUNT;
    320            result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
    321 
    322            result|=(BOCU1_START_POS_3+diff)<<16;
    323        } else {
    324            /* four bytes */
    325            diff-=BOCU1_REACH_POS_3+1;
    326 
    327            m=diff%BOCU1_TRAIL_COUNT;
    328            diff/=BOCU1_TRAIL_COUNT;
    329            result=BOCU1_TRAIL_TO_BYTE(m);
    330 
    331            m=diff%BOCU1_TRAIL_COUNT;
    332            diff/=BOCU1_TRAIL_COUNT;
    333            result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
    334 
    335            /*
    336             * We know that / and % would deliver quotient 0 and rest=diff.
    337             * Avoid division and modulo for performance.
    338             */
    339            result|=BOCU1_TRAIL_TO_BYTE(diff)<<16;
    340 
    341            result |= static_cast<uint32_t>(BOCU1_START_POS_4) << 24;
    342        }
    343    } else {
    344        /* two- to four-byte negative differences */
    345        if(diff>=BOCU1_REACH_NEG_2) {
    346            /* two bytes */
    347            diff-=BOCU1_REACH_NEG_1;
    348            result=0x02000000;
    349 
    350            NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
    351            result|=BOCU1_TRAIL_TO_BYTE(m);
    352 
    353            result|=(BOCU1_START_NEG_2+diff)<<8;
    354        } else if(diff>=BOCU1_REACH_NEG_3) {
    355            /* three bytes */
    356            diff-=BOCU1_REACH_NEG_2;
    357            result=0x03000000;
    358 
    359            NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
    360            result|=BOCU1_TRAIL_TO_BYTE(m);
    361 
    362            NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
    363            result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
    364 
    365            result|=(BOCU1_START_NEG_3+diff)<<16;
    366        } else {
    367            /* four bytes */
    368            diff-=BOCU1_REACH_NEG_3;
    369 
    370            NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
    371            result=BOCU1_TRAIL_TO_BYTE(m);
    372 
    373            NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
    374            result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
    375 
    376            /*
    377             * We know that NEGDIVMOD would deliver
    378             * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
    379             * Avoid division and modulo for performance.
    380             */
    381            m=diff+BOCU1_TRAIL_COUNT;
    382            result|=BOCU1_TRAIL_TO_BYTE(m)<<16;
    383 
    384            result|=BOCU1_MIN<<24;
    385        }
    386    }
    387    return result;
    388 }
    389 
    390 
        /**
        * Convert UTF-16 text to BOCU-1 bytes and record, for every output byte,
        * the source index of the character that produced it.
        *
        * Reads pArgs->source up to pArgs->sourceLimit, writes bytes to pArgs->target
        * (up to pArgs->targetLimit) and indexes to pArgs->offsets, and stores the
        * advanced source/target/offsets pointers back into pArgs before returning.
        *
        * Converter state used:
        *   cnv->fromUnicodeStatus  "prev" for differencing (0 is read as BOCU1_ASCII_PREV)
        *   cnv->fromUChar32        lead surrogate left pending when the input ended after it
        *   cnv->charErrorBuffer    tail bytes of a sequence that did not fit into the target
        *
        * pArgs->offsets is written without a null check throughout this function.
        * NOTE(review): presumably this variant is only selected when offsets are
        * requested - confirm against the caller.
        *
        * @param pArgs      conversion arguments, see above
        * @param pErrorCode set to U_BUFFER_OVERFLOW_ERROR when the target fills up
        *                   while source text remains or a sequence is split
        */
    391 static void U_CALLCONV
    392 _Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
    393                             UErrorCode *pErrorCode) {
    394    UConverter *cnv;
    395    const char16_t *source, *sourceLimit;
    396    uint8_t *target;
    397    int32_t targetCapacity;
    398    int32_t *offsets;
    399 
    400    int32_t prev, c, diff;
    401 
    402    int32_t sourceIndex, nextSourceIndex;
    403 
    404    /* set up the local pointers */
    405    cnv=pArgs->converter;
    406    source=pArgs->source;
    407    sourceLimit=pArgs->sourceLimit;
    408    target = reinterpret_cast<uint8_t*>(pArgs->target);
    409    targetCapacity = static_cast<int32_t>(pArgs->targetLimit - pArgs->target);
    410    offsets=pArgs->offsets;
    411 
    412    /* get the converter state from UConverter */
    413    c=cnv->fromUChar32;
    414    prev = static_cast<int32_t>(cnv->fromUnicodeStatus);
    415    if(prev==0) {
    416        prev=BOCU1_ASCII_PREV;
    417    }
    418 
    419    /* sourceIndex=-1 if the current character began in the previous buffer */
    420    sourceIndex= c==0 ? 0 : -1;
    421    nextSourceIndex=0;
    422 
    423    /* conversion loop */
           /* a lead surrogate is pending from the previous call: resume at the trail check inside the regular loop */
    424    if(c!=0 && targetCapacity>0) {
    425        goto getTrail;
    426    }
    427 
    428 fastSingle:
    429    /* fast loop for single-byte differences */
    430    /* use only one loop counter variable, targetCapacity, not also source */
           /* diff is borrowed here as the number of remaining source code units */
    431    diff = static_cast<int32_t>(sourceLimit - source);
    432    if(targetCapacity>diff) {
    433        targetCapacity=diff;
    434    }
           /* one source unit yields exactly one byte in this loop; stop at U+3000 and above or at a multi-byte diff */
    435    while(targetCapacity>0 && (c=*source)<0x3000) {
    436        if(c<=0x20) {
    437            if(c!=0x20) {
    438                prev=BOCU1_ASCII_PREV;
    439            }
    440            *target++ = static_cast<uint8_t>(c);
    441            *offsets++=nextSourceIndex++;
    442            ++source;
    443            --targetCapacity;
    444        } else {
    445            diff=c-prev;
    446            if(DIFF_IS_SINGLE(diff)) {
    447                prev=BOCU1_SIMPLE_PREV(c);
    448                *target++ = static_cast<uint8_t>(PACK_SINGLE_DIFF(diff));
    449                *offsets++=nextSourceIndex++;
    450                ++source;
    451                --targetCapacity;
    452            } else {
                       /* needs more than one byte: leave *source unconsumed for the regular loop */
    453                break;
    454            }
    455        }
    456    }
    457    /* restore real values */
    458    targetCapacity = static_cast<int32_t>(reinterpret_cast<const uint8_t*>(pArgs->targetLimit) - target);
    459    sourceIndex=nextSourceIndex; /* wrong if offsets==nullptr but does not matter */
    460 
    461    /* regular loop for all cases */
    462    while(source<sourceLimit) {
    463        if(targetCapacity>0) {
    464            c=*source++;
    465            ++nextSourceIndex;
    466 
    467            if(c<=0x20) {
    468                /*
    469                 * ISO C0 control & space:
    470                 * Encode directly for MIME compatibility,
    471                 * and reset state except for space, to not disrupt compression.
    472                 */
    473                if(c!=0x20) {
    474                    prev=BOCU1_ASCII_PREV;
    475                }
    476                *target++ = static_cast<uint8_t>(c);
    477                *offsets++=sourceIndex;
    478                --targetCapacity;
    479 
    480                sourceIndex=nextSourceIndex;
    481                continue;
    482            }
    483 
    484            if(U16_IS_LEAD(c)) {
    485 getTrail:
    486                if(source<sourceLimit) {
    487                    /* test the following code unit */
    488                    char16_t trail=*source;
    489                    if(U16_IS_TRAIL(trail)) {
    490                        ++source;
    491                        ++nextSourceIndex;
    492                        c=U16_GET_SUPPLEMENTARY(c, trail);
    493                    }
                           /* otherwise c stays the unpaired lead surrogate and is encoded like any other code point */
    494                } else {
    495                    /* no more input */
    496                    c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
    497                    break;
    498                }
    499            }
    500 
    501            /*
    502             * all other Unicode code points c==U+0021..U+10ffff
    503             * are encoded with the difference c-prev
    504             *
    505             * a new prev is computed from c,
    506             * placed in the middle of a 0x80-block (for most small scripts) or
    507             * in the middle of the Unihan and Hangul blocks
    508             * to statistically minimize the following difference
    509             */
    510            diff=c-prev;
    511            prev=BOCU1_PREV(c);
    512            if(DIFF_IS_SINGLE(diff)) {
    513                *target++ = static_cast<uint8_t>(PACK_SINGLE_DIFF(diff));
    514                *offsets++=sourceIndex;
    515                --targetCapacity;
    516                sourceIndex=nextSourceIndex;
                       /* back in the range the fast loop handles: re-enter it */
    517                if(c<0x3000) {
    518                    goto fastSingle;
    519                }
    520            } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
    521                /* optimize 2-byte case */
    522                int32_t m;
    523 
                       /* after this if/else, diff holds the lead byte and m the trail value */
    524                if(diff>=0) {
    525                    diff-=BOCU1_REACH_POS_1+1;
    526                    m=diff%BOCU1_TRAIL_COUNT;
    527                    diff/=BOCU1_TRAIL_COUNT;
    528                    diff+=BOCU1_START_POS_2;
    529                } else {
    530                    diff-=BOCU1_REACH_NEG_1;
    531                    NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
    532                    diff+=BOCU1_START_NEG_2;
    533                }
    534                *target++ = static_cast<uint8_t>(diff);
    535                *target++ = static_cast<uint8_t>(BOCU1_TRAIL_TO_BYTE(m));
    536                *offsets++=sourceIndex;
    537                *offsets++=sourceIndex;
    538                targetCapacity-=2;
    539                sourceIndex=nextSourceIndex;
    540            } else {
    541                int32_t length; /* will be 2..4 */
    542 
                       /* from here on diff holds the packed byte sequence, not a difference */
    543                diff=packDiff(diff);
    544                length=BOCU1_LENGTH_FROM_PACKED(diff);
    545 
    546                /* write the output character bytes from diff and length */
    547                /* from the first if in the loop we know that targetCapacity>0 */
    548                if(length<=targetCapacity) {
    549                    switch(length) {
    550                        /* each branch falls through to the next one */
    551                    case 4:
    552                        *target++ = static_cast<uint8_t>(diff >> 24);
    553                        *offsets++=sourceIndex;
    554                        U_FALLTHROUGH;
    555                    case 3:
    556                        *target++ = static_cast<uint8_t>(diff >> 16);
    557                        *offsets++=sourceIndex;
    558                        U_FALLTHROUGH;
    559                    case 2:
    560                        *target++ = static_cast<uint8_t>(diff >> 8);
    561                        *offsets++=sourceIndex;
    562                    /* case 1: handled above */
    563                        *target++ = static_cast<uint8_t>(diff);
    564                        *offsets++=sourceIndex;
    565                        U_FALLTHROUGH;
    566                    default:
    567                        /* will never occur */
    568                        break;
    569                    }
    570                    targetCapacity-=length;
    571                    sourceIndex=nextSourceIndex;
    572                } else {
    573                    uint8_t *charErrorBuffer;
    574 
    575                    /*
    576                     * We actually do this backwards here:
    577                     * In order to save an intermediate variable, we output
    578                     * first to the overflow buffer what does not fit into the
    579                     * regular target.
    580                     */
    581                    /* we know that 1<=targetCapacity<length<=4 */
                           /* length now counts only the bytes that go to the overflow buffer (1..3) */
    582                    length-=targetCapacity;
    583                    charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
    584                    switch(length) {
    585                        /* each branch falls through to the next one */
    586                    case 3:
    587                        *charErrorBuffer++ = static_cast<uint8_t>(diff >> 16);
    588                        U_FALLTHROUGH;
    589                    case 2:
    590                        *charErrorBuffer++ = static_cast<uint8_t>(diff >> 8);
    591                        U_FALLTHROUGH;
    592                    case 1:
    593                        *charErrorBuffer = static_cast<uint8_t>(diff);
    594                        U_FALLTHROUGH;
    595                    default:
    596                        /* will never occur */
    597                        break;
    598                    }
    599                    cnv->charErrorBufferLength = static_cast<int8_t>(length);
    600 
    601                    /* now output what fits into the regular target */
    602                    diff>>=8*length; /* length was reduced by targetCapacity */
    603                    switch(targetCapacity) {
    604                        /* each branch falls through to the next one */
    605                    case 3:
    606                        *target++ = static_cast<uint8_t>(diff >> 16);
    607                        *offsets++=sourceIndex;
    608                        U_FALLTHROUGH;
    609                    case 2:
    610                        *target++ = static_cast<uint8_t>(diff >> 8);
    611                        *offsets++=sourceIndex;
    612                        U_FALLTHROUGH;
    613                    case 1:
    614                        *target++ = static_cast<uint8_t>(diff);
    615                        *offsets++=sourceIndex;
    616                        U_FALLTHROUGH;
    617                    default:
    618                        /* will never occur */
    619                        break;
    620                    }
    621 
    622                    /* target overflow */
    623                    targetCapacity=0;
    624                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    625                    break;
    626                }
    627            }
    628        } else {
    629            /* target is full */
    630            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    631            break;
    632        }
    633    }
    634 
    635    /* set the converter state back into UConverter */
           /* c<0 only if the loop stopped on a lead surrogate at the end of the input (c=-c above) */
    636    cnv->fromUChar32= c<0 ? -c : 0;
    637    cnv->fromUnicodeStatus = static_cast<uint32_t>(prev);
    638 
    639    /* write back the updated pointers */
    640    pArgs->source=source;
    641    pArgs->target = reinterpret_cast<char*>(target);
    642    pArgs->offsets=offsets;
    643 }
    644 
    645 /*
    646 * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.
    647 * If a change is made in the original function, then either
    648 * change this function the same way or
    649 * re-copy the original function and remove the variables
    650 * offsets, sourceIndex, and nextSourceIndex.
    651 */
    652 static void U_CALLCONV
    653 _Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,
    654                  UErrorCode *pErrorCode) {
    655    UConverter *cnv;
    656    const char16_t *source, *sourceLimit;
    657    uint8_t *target;
    658    int32_t targetCapacity;
    659 
    660    int32_t prev, c, diff;
    661 
    662    /* set up the local pointers */
    663    cnv=pArgs->converter;
    664    source=pArgs->source;
    665    sourceLimit=pArgs->sourceLimit;
    666    target = reinterpret_cast<uint8_t*>(pArgs->target);
    667    targetCapacity = static_cast<int32_t>(pArgs->targetLimit - pArgs->target);
    668 
    669    /* get the converter state from UConverter */
    670    c=cnv->fromUChar32;
    671    prev = static_cast<int32_t>(cnv->fromUnicodeStatus);
    672    if(prev==0) {
    673        prev=BOCU1_ASCII_PREV;
    674    }
    675 
    676    /* conversion loop */
    677    if(c!=0 && targetCapacity>0) {
    678        goto getTrail;
    679    }
    680 
    681 fastSingle:
    682    /* fast loop for single-byte differences */
    683    /* use only one loop counter variable, targetCapacity, not also source */
    684    diff = static_cast<int32_t>(sourceLimit - source);
    685    if(targetCapacity>diff) {
    686        targetCapacity=diff;
    687    }
    688    while(targetCapacity>0 && (c=*source)<0x3000) {
    689        if(c<=0x20) {
    690            if(c!=0x20) {
    691                prev=BOCU1_ASCII_PREV;
    692            }
    693            *target++ = static_cast<uint8_t>(c);
    694        } else {
    695            diff=c-prev;
    696            if(DIFF_IS_SINGLE(diff)) {
    697                prev=BOCU1_SIMPLE_PREV(c);
    698                *target++ = static_cast<uint8_t>(PACK_SINGLE_DIFF(diff));
    699            } else {
    700                break;
    701            }
    702        }
    703        ++source;
    704        --targetCapacity;
    705    }
    706    /* restore real values */
    707    targetCapacity = static_cast<int32_t>(reinterpret_cast<const uint8_t*>(pArgs->targetLimit) - target);
    708 
    709    /* regular loop for all cases */
    710    while(source<sourceLimit) {
    711        if(targetCapacity>0) {
    712            c=*source++;
    713 
    714            if(c<=0x20) {
    715                /*
    716                 * ISO C0 control & space:
    717                 * Encode directly for MIME compatibility,
    718                 * and reset state except for space, to not disrupt compression.
    719                 */
    720                if(c!=0x20) {
    721                    prev=BOCU1_ASCII_PREV;
    722                }
    723                *target++ = static_cast<uint8_t>(c);
    724                --targetCapacity;
    725                continue;
    726            }
    727 
    728            if(U16_IS_LEAD(c)) {
    729 getTrail:
    730                if(source<sourceLimit) {
    731                    /* test the following code unit */
    732                    char16_t trail=*source;
    733                    if(U16_IS_TRAIL(trail)) {
    734                        ++source;
    735                        c=U16_GET_SUPPLEMENTARY(c, trail);
    736                    }
    737                } else {
    738                    /* no more input */
    739                    c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
    740                    break;
    741                }
    742            }
    743 
    744            /*
    745             * all other Unicode code points c==U+0021..U+10ffff
    746             * are encoded with the difference c-prev
    747             *
    748             * a new prev is computed from c,
    749             * placed in the middle of a 0x80-block (for most small scripts) or
    750             * in the middle of the Unihan and Hangul blocks
    751             * to statistically minimize the following difference
    752             */
    753            diff=c-prev;
    754            prev=BOCU1_PREV(c);
    755            if(DIFF_IS_SINGLE(diff)) {
    756                *target++ = static_cast<uint8_t>(PACK_SINGLE_DIFF(diff));
    757                --targetCapacity;
    758                if(c<0x3000) {
    759                    goto fastSingle;
    760                }
    761            } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
    762                /* optimize 2-byte case */
    763                int32_t m;
    764 
    765                if(diff>=0) {
    766                    diff-=BOCU1_REACH_POS_1+1;
    767                    m=diff%BOCU1_TRAIL_COUNT;
    768                    diff/=BOCU1_TRAIL_COUNT;
    769                    diff+=BOCU1_START_POS_2;
    770                } else {
    771                    diff-=BOCU1_REACH_NEG_1;
    772                    NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
    773                    diff+=BOCU1_START_NEG_2;
    774                }
    775                *target++ = static_cast<uint8_t>(diff);
    776                *target++ = static_cast<uint8_t>(BOCU1_TRAIL_TO_BYTE(m));
    777                targetCapacity-=2;
    778            } else {
    779                int32_t length; /* will be 2..4 */
    780 
    781                diff=packDiff(diff);
    782                length=BOCU1_LENGTH_FROM_PACKED(diff);
    783 
    784                /* write the output character bytes from diff and length */
    785                /* from the first if in the loop we know that targetCapacity>0 */
    786                if(length<=targetCapacity) {
    787                    switch(length) {
    788                        /* each branch falls through to the next one */
    789                    case 4:
    790                        *target++ = static_cast<uint8_t>(diff >> 24);
    791                        U_FALLTHROUGH;
    792                    case 3:
    793                        *target++ = static_cast<uint8_t>(diff >> 16);
    794                    /* case 2: handled above */
    795                        *target++ = static_cast<uint8_t>(diff >> 8);
    796                    /* case 1: handled above */
    797                        *target++ = static_cast<uint8_t>(diff);
    798                        U_FALLTHROUGH;
    799                    default:
    800                        /* will never occur */
    801                        break;
    802                    }
    803                    targetCapacity-=length;
    804                } else {
    805                    uint8_t *charErrorBuffer;
    806 
    807                    /*
    808                     * We actually do this backwards here:
    809                     * In order to save an intermediate variable, we output
    810                     * first to the overflow buffer what does not fit into the
    811                     * regular target.
    812                     */
    813                    /* we know that 1<=targetCapacity<length<=4 */
    814                    length-=targetCapacity;
    815                    charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
    816                    switch(length) {
    817                        /* each branch falls through to the next one */
    818                    case 3:
    819                        *charErrorBuffer++ = static_cast<uint8_t>(diff >> 16);
    820                        U_FALLTHROUGH;
    821                    case 2:
    822                        *charErrorBuffer++ = static_cast<uint8_t>(diff >> 8);
    823                        U_FALLTHROUGH;
    824                    case 1:
    825                        *charErrorBuffer = static_cast<uint8_t>(diff);
    826                        U_FALLTHROUGH;
    827                    default:
    828                        /* will never occur */
    829                        break;
    830                    }
    831                    cnv->charErrorBufferLength = static_cast<int8_t>(length);
    832 
    833                    /* now output what fits into the regular target */
    834                    diff>>=8*length; /* length was reduced by targetCapacity */
    835                    switch(targetCapacity) {
    836                        /* each branch falls through to the next one */
    837                    case 3:
    838                        *target++ = static_cast<uint8_t>(diff >> 16);
    839                        U_FALLTHROUGH;
    840                    case 2:
    841                        *target++ = static_cast<uint8_t>(diff >> 8);
    842                        U_FALLTHROUGH;
    843                    case 1:
    844                        *target++ = static_cast<uint8_t>(diff);
    845                        U_FALLTHROUGH;
    846                    default:
    847                        /* will never occur */
    848                        break;
    849                    }
    850 
    851                    /* target overflow */
    852                    targetCapacity=0;
    853                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    854                    break;
    855                }
    856            }
    857        } else {
    858            /* target is full */
    859            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    860            break;
    861        }
    862    }
    863 
    864    /* set the converter state back into UConverter */
    865    cnv->fromUChar32= c<0 ? -c : 0;
    866    cnv->fromUnicodeStatus = static_cast<uint32_t>(prev);
    867 
    868    /* write back the updated pointers */
    869    pArgs->source=source;
    870    pArgs->target = reinterpret_cast<char*>(target);
    871 }
    872 
    873 /* BOCU-1-to-Unicode conversion functions ----------------------------------- */
    874 
    875 /**
    876 * Function for BOCU-1 decoder; handles multi-byte lead bytes.
    877 *
        * Compares b with the BOCU1_START_* thresholds to select the two-, three-
        * or four-byte form, and computes the part of the difference that is
        * determined by the lead byte alone. The trail bytes add the rest.
        *
    878 * @param b lead byte;
    879 *          BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD
    880 * @return (diff<<2)|count
        *         where count (1..3, in the low two bits) is the number of trail
        *         bytes and diff is the partial difference; the callers in this
        *         file unpack it with count=result&3 and diff=result>>2
    881 */
    882 static inline int32_t
    883 decodeBocu1LeadByte(int32_t b) {
    884    int32_t diff, count;
    885 
           /*
            * Per the @param contract b is never a single-byte lead, so this test
            * only has to separate the positive from the negative multi-byte leads.
            */
    886    if(b>=BOCU1_START_NEG_2) {
    887        /* positive difference */
    888        if(b<BOCU1_START_POS_3) {
    889            /* two bytes */
    890            diff = (b - BOCU1_START_POS_2) * BOCU1_TRAIL_COUNT + BOCU1_REACH_POS_1 + 1;
    891            count=1;
    892        } else if(b<BOCU1_START_POS_4) {
    893            /* three bytes */
    894            diff = (b - BOCU1_START_POS_3) * BOCU1_TRAIL_COUNT * BOCU1_TRAIL_COUNT + BOCU1_REACH_POS_2 + 1;
    895            count=2;
    896        } else {
    897            /* four bytes */
                   /* the partial difference does not depend on b in this form */
    898            diff=BOCU1_REACH_POS_3+1;
    899            count=3;
    900        }
    901    } else {
    902        /* negative difference */
    903        if(b>=BOCU1_START_NEG_3) {
    904            /* two bytes */
    905            diff = (b - BOCU1_START_NEG_2) * BOCU1_TRAIL_COUNT + BOCU1_REACH_NEG_1;
    906            count=1;
    907        } else if(b>BOCU1_MIN) {
    908            /* three bytes */
    909            diff = (b - BOCU1_START_NEG_3) * BOCU1_TRAIL_COUNT * BOCU1_TRAIL_COUNT + BOCU1_REACH_NEG_2;
    910            count=2;
    911        } else {
    912            /* four bytes */
                   /* b==BOCU1_MIN; again the partial difference does not depend on b */
    913            diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
    914            count=3;
    915        }
    916    }
    917 
    918    /* return the state for decoding the trail byte(s) */
           /* diff is converted to unsigned before the shift; it may be negative here */
    919    return (static_cast<uint32_t>(diff) << 2) | count;
    920 }
    921 
    922 /**
    923 * Function for BOCU-1 decoder; handles multi-byte trail bytes.
    924 *
        * Maps the trail byte to its trail value and scales that value by
        * BOCU1_TRAIL_COUNT^(count-1). The function does not modify any decoder
        * state; the caller adds the result to diff and decrements count.
        *
    925 * @param count number of remaining trail bytes including this one
        *              (1..3; any value other than 1 or 2 is treated as 3)
    926 * @param b trail byte
        *          (both call sites in this file pass an unsigned byte value;
        *          values <=0x20 are used directly as an index into bocu1ByteToTrail)
    927 * @return new delta for diff including b - <0 indicates an error
    928 *
    929 * @see decodeBocu1
    930 */
    931 static inline int32_t
    932 decodeBocu1TrailByte(int32_t count, int32_t b) {
    933    if(b<=0x20) {
    934        /* skip some C0 controls and make the trail byte range contiguous */
    935        b=bocu1ByteToTrail[b];
    936        /* b<0 for an illegal trail byte value will result in return<0 below */
    937 #if BOCU1_MAX_TRAIL<0xff
    938    } else if(b>BOCU1_MAX_TRAIL) {
               /* byte above the trail byte range: report an error right away */
    939        return -99;
    940 #endif
    941    } else {
    942        b-=BOCU1_TRAIL_BYTE_OFFSET;
    943    }
    944 
    945    /* scale the trail value by its position; the caller adds it to diff and decrements count */
    946    if(count==1) {
    947        return b;
    948    } else if(count==2) {
    949        return b*BOCU1_TRAIL_COUNT;
    950    } else /* count==3 */ {
    951        return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
    952    }
    953 }
    954 
        /*
        * Converts BOCU-1 bytes to UTF-16 and writes one offsets[] entry for every
        * char16_t that it outputs.
        *
        * Input is pArgs->source..sourceLimit; output goes to pArgs->target..targetLimit
        * and pArgs->offsets. The advanced source, target and offsets pointers are
        * written back to pArgs before returning. Each offsets[] entry is the index,
        * counted from the start of this source buffer, of the first byte of the
        * sequence that produced the char16_t, or -1 if that sequence began in an
        * earlier buffer. offsets is dereferenced unconditionally and therefore must
        * not be nullptr.
        *
        * Decoder state kept in pArgs->converter between calls:
        *   toUnicodeStatus     prev; 0 is read as BOCU1_ASCII_PREV
        *   mode                (diff<<2)|count of an unfinished multi-byte sequence,
        *                       only meaningful while toULength>0
        *   toUBytes/toULength  the bytes of that unfinished sequence
        *
        * *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR when input remains but the
        * target is full, and to U_ILLEGAL_CHAR_FOUND for an illegal trail byte or a
        * result outside 0..0x10ffff; in the latter case prev and mode are reset.
        */
    955 static void U_CALLCONV
    956 _Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
    957                           UErrorCode *pErrorCode) {
    958    UConverter *cnv;
    959    const uint8_t *source, *sourceLimit;
    960    char16_t *target;
    961    const char16_t *targetLimit;
    962    int32_t *offsets;
    963 
    964    int32_t prev, count, diff, c;
    965 
    966    int8_t byteIndex;
    967    uint8_t *bytes;
    968 
    969    int32_t sourceIndex, nextSourceIndex;
    970 
    971    /* set up the local pointers */
    972    cnv=pArgs->converter;
    973    source = reinterpret_cast<const uint8_t*>(pArgs->source);
    974    sourceLimit = reinterpret_cast<const uint8_t*>(pArgs->sourceLimit);
    975    target=pArgs->target;
    976    targetLimit=pArgs->targetLimit;
    977    offsets=pArgs->offsets;
    978 
    979    /* get the converter state from UConverter */
    980    prev = static_cast<int32_t>(cnv->toUnicodeStatus);
    981    if(prev==0) {
    982        prev=BOCU1_ASCII_PREV;
    983    }
    984    diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
    985    count=diff&3;
    986    diff>>=2;
    987 
    988    byteIndex=cnv->toULength;
    989    bytes=cnv->toUBytes;
    990 
    991    /* sourceIndex=-1 if the current character began in the previous buffer */
    992    sourceIndex=byteIndex==0 ? 0 : -1;
    993    nextSourceIndex=0;
    994 
    995    /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
           /* resume a multi-byte sequence that was cut off by the end of the previous buffer */
    996    if(count>0 && byteIndex>0 && target<targetLimit) {
    997        goto getTrail;
    998    }
           if(count>0 && byteIndex>0) {
               /*
                * A sequence is pending but the target is already full.
                * Do not fall into fastSingle: it reuses diff and count as scratch,
                * and endloop would then store that scratch into cnv->mode and
                * lose the pending sequence. Leave the state untouched instead and
                * report the overflow if there is input left to convert.
                */
               if(source<sourceLimit) {
                   *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
               }
               goto endloop;
           }
    999 
   1000 fastSingle:
   1001    /* fast loop for single-byte differences */
   1002    /* use count as the only loop counter variable */
           /*
            * diff and count are free to be used as scratch here: no multi-byte
            * sequence is pending whenever control reaches this label.
            * count becomes min(input bytes left, output units left).
            */
   1003    diff = static_cast<int32_t>(sourceLimit - source);
   1004    count = static_cast<int32_t>(pArgs->targetLimit - target);
   1005    if(count>diff) {
   1006        count=diff;
   1007    }
   1008    while(count>0) {
   1009        if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
   1010            c=prev+(c-BOCU1_MIDDLE);
   1011            if(c<0x3000) {
   1012                *target++ = static_cast<char16_t>(c);
   1013                *offsets++=nextSourceIndex++;
   1014                prev=BOCU1_SIMPLE_PREV(c);
   1015            } else {
                       /* leave the byte unconsumed; the general loop below handles it */
   1016                break;
   1017            }
   1018        } else if(c<=0x20) {
   1019            if(c!=0x20) {
   1020                prev=BOCU1_ASCII_PREV;
   1021            }
   1022            *target++ = static_cast<char16_t>(c);
   1023            *offsets++=nextSourceIndex++;
   1024        } else {
                   /* lead byte of a multi-byte sequence or reset byte: general loop */
   1025            break;
   1026        }
   1027        ++source;
   1028        --count;
   1029    }
   1030    sourceIndex=nextSourceIndex; /* wrong if offsets==nullptr but does not matter */
   1031 
   1032    /* decode a sequence of single and lead bytes */
   1033    while(source<sourceLimit) {
   1034        if(target>=targetLimit) {
   1035            /* target is full */
   1036            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1037            break;
   1038        }
   1039 
   1040        ++nextSourceIndex;
   1041        c=*source++;
   1042        if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
   1043            /* Write a code point directly from a single-byte difference. */
   1044            c=prev+(c-BOCU1_MIDDLE);
   1045            if(c<0x3000) {
   1046                *target++ = static_cast<char16_t>(c);
   1047                *offsets++=sourceIndex;
   1048                prev=BOCU1_SIMPLE_PREV(c);
   1049                sourceIndex=nextSourceIndex;
   1050                goto fastSingle;
   1051            }
                   /* c>=0x3000: fall out of the if-chain to the common output code, which uses BOCU1_PREV() */
   1052        } else if(c<=0x20) {
   1053            /*
   1054             * Direct-encoded C0 control code or space.
   1055             * Reset prev for C0 control codes but not for space.
   1056             */
   1057            if(c!=0x20) {
   1058                prev=BOCU1_ASCII_PREV;
   1059            }
   1060            *target++ = static_cast<char16_t>(c);
   1061            *offsets++=sourceIndex;
   1062            sourceIndex=nextSourceIndex;
   1063            continue;
   1064        } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
   1065            /* Optimize two-byte case. */
                   /* only taken when the trail byte is already available in this buffer */
   1066            if(c>=BOCU1_MIDDLE) {
   1067                diff = (c - BOCU1_START_POS_2) * BOCU1_TRAIL_COUNT + BOCU1_REACH_POS_1 + 1;
   1068            } else {
   1069                diff = (c - BOCU1_START_NEG_2) * BOCU1_TRAIL_COUNT + BOCU1_REACH_NEG_1;
   1070            }
   1071 
   1072            /* trail byte */
   1073            ++nextSourceIndex;
   1074            c=decodeBocu1TrailByte(1, *source++);
   1075            if (c < 0 || static_cast<uint32_t>(c = prev + diff + c) > 0x10ffff) {
                       /* hand both bytes of the illegal sequence back via toUBytes */
   1076                bytes[0]=source[-2];
   1077                bytes[1]=source[-1];
   1078                byteIndex=2;
   1079                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1080                break;
   1081            }
   1082        } else if(c==BOCU1_RESET) {
   1083            /* only reset the state, no code point */
   1084            prev=BOCU1_ASCII_PREV;
   1085            sourceIndex=nextSourceIndex;
   1086            continue;
   1087        } else {
   1088            /*
   1089             * For multi-byte difference lead bytes, set the decoder state
   1090             * with the partial difference value from the lead byte and
   1091             * with the number of trail bytes.
   1092             */
   1093            bytes[0] = static_cast<uint8_t>(c);
   1094            byteIndex=1;
   1095 
   1096            diff=decodeBocu1LeadByte(c);
   1097            count=diff&3;
   1098            diff>>=2;
   1099 getTrail:
   1100            for(;;) {
   1101                if(source>=sourceLimit) {
                           /* input ends inside the sequence: diff, count and bytes are saved at endloop */
   1102                    goto endloop;
   1103                }
   1104                ++nextSourceIndex;
   1105                c=bytes[byteIndex++]=*source++;
   1106 
   1107                /* trail byte in any position */
   1108                c=decodeBocu1TrailByte(count, c);
   1109                if(c<0) {
   1110                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1111                    goto endloop;
   1112                }
   1113 
   1114                diff+=c;
   1115                if(--count==0) {
   1116                    /* final trail byte, deliver a code point */
                           /*
                            * NOTE(review): byteIndex is cleared before the range check,
                            * so toULength is 0 if the check below reports
                            * U_ILLEGAL_CHAR_FOUND - confirm that this is intended.
                            */
   1117                    byteIndex=0;
   1118                    c=prev+diff;
   1119                    if (static_cast<uint32_t>(c) > 0x10ffff) {
   1120                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1121                        goto endloop;
   1122                    }
   1123                    break;
   1124                }
   1125            }
   1126        }
   1127 
   1128        /* calculate the next prev and output c */
   1129        prev=BOCU1_PREV(c);
   1130        if(c<=0xffff) {
   1131            *target++ = static_cast<char16_t>(c);
   1132            *offsets++=sourceIndex;
   1133        } else {
   1134            /* output surrogate pair */
   1135            *target++=U16_LEAD(c);
   1136            if(target<targetLimit) {
   1137                *target++=U16_TRAIL(c);
   1138                *offsets++=sourceIndex;
   1139                *offsets++=sourceIndex;
   1140            } else {
   1141                /* target overflow */
                       /* only the lead surrogate fit; the trail surrogate goes to the overflow buffer */
   1142                *offsets++=sourceIndex;
   1143                cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
   1144                cnv->UCharErrorBufferLength=1;
   1145                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1146                break;
   1147            }
   1148        }
   1149        sourceIndex=nextSourceIndex;
   1150    }
   1151 endloop:
   1152 
   1153    if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
   1154        /* set the converter state in UConverter to deal with the next character */
   1155        cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
   1156        cnv->mode=0;
   1157    } else {
   1158        /* set the converter state back into UConverter */
   1159        cnv->toUnicodeStatus = static_cast<uint32_t>(prev);
   1160        cnv->mode = static_cast<int32_t>(static_cast<uint32_t>(diff) << 2) | count;
   1161    }
   1162    cnv->toULength=byteIndex;
   1163 
   1164    /* write back the updated pointers */
   1165    pArgs->source = reinterpret_cast<const char*>(source);
   1166    pArgs->target=target;
   1167    pArgs->offsets=offsets;
   1168 }
   1169 
   1170 /*
   1171 * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling.
   1172 * If a change is made in the original function, then either
   1173 * change this function the same way or
   1174 * re-copy the original function and remove the variables
   1175 * offsets, sourceIndex, and nextSourceIndex.
        *
        * Converts BOCU-1 bytes from pArgs->source..sourceLimit to UTF-16 in
        * pArgs->target..targetLimit and writes the advanced source and target
        * pointers back to pArgs before returning.
        *
        * Decoder state kept in pArgs->converter between calls:
        *   toUnicodeStatus     prev; 0 is read as BOCU1_ASCII_PREV
        *   mode                (diff<<2)|count of an unfinished multi-byte sequence,
        *                       only meaningful while toULength>0
        *   toUBytes/toULength  the bytes of that unfinished sequence
        *
        * *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR when input remains but the
        * target is full, and to U_ILLEGAL_CHAR_FOUND for an illegal trail byte or a
        * result outside 0..0x10ffff; in the latter case prev and mode are reset.
   1176 */
   1177 static void U_CALLCONV
   1178 _Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs,
   1179                UErrorCode *pErrorCode) {
   1180    UConverter *cnv;
   1181    const uint8_t *source, *sourceLimit;
   1182    char16_t *target;
   1183    const char16_t *targetLimit;
   1184 
   1185    int32_t prev, count, diff, c;
   1186 
   1187    int8_t byteIndex;
   1188    uint8_t *bytes;
   1189 
   1190    /* set up the local pointers */
   1191    cnv=pArgs->converter;
   1192    source = reinterpret_cast<const uint8_t*>(pArgs->source);
   1193    sourceLimit = reinterpret_cast<const uint8_t*>(pArgs->sourceLimit);
   1194    target=pArgs->target;
   1195    targetLimit=pArgs->targetLimit;
   1196 
   1197    /* get the converter state from UConverter */
   1198    prev = static_cast<int32_t>(cnv->toUnicodeStatus);
   1199    if(prev==0) {
   1200        prev=BOCU1_ASCII_PREV;
   1201    }
   1202    diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
   1203    count=diff&3;
   1204    diff>>=2;
   1205 
   1206    byteIndex=cnv->toULength;
   1207    bytes=cnv->toUBytes;
   1208 
   1209    /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
           /* resume a multi-byte sequence that was cut off by the end of the previous buffer */
   1210    if(count>0 && byteIndex>0 && target<targetLimit) {
   1211        goto getTrail;
   1212    }
           if(count>0 && byteIndex>0) {
               /*
                * A sequence is pending but the target is already full.
                * Do not fall into fastSingle: it reuses diff and count as scratch,
                * and endloop would then store that scratch into cnv->mode and
                * lose the pending sequence. Leave the state untouched instead and
                * report the overflow if there is input left to convert.
                */
               if(source<sourceLimit) {
                   *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
               }
               goto endloop;
           }
   1213 
   1214 fastSingle:
   1215    /* fast loop for single-byte differences */
   1216    /* use count as the only loop counter variable */
           /*
            * diff and count are free to be used as scratch here: no multi-byte
            * sequence is pending whenever control reaches this label.
            * count becomes min(input bytes left, output units left).
            */
   1217    diff = static_cast<int32_t>(sourceLimit - source);
   1218    count = static_cast<int32_t>(pArgs->targetLimit - target);
   1219    if(count>diff) {
   1220        count=diff;
   1221    }
   1222    while(count>0) {
   1223        if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
   1224            c=prev+(c-BOCU1_MIDDLE);
   1225            if(c<0x3000) {
   1226                *target++ = static_cast<char16_t>(c);
   1227                prev=BOCU1_SIMPLE_PREV(c);
   1228            } else {
                       /* leave the byte unconsumed; the general loop below handles it */
   1229                break;
   1230            }
   1231        } else if(c<=0x20) {
   1232            if(c!=0x20) {
   1233                prev=BOCU1_ASCII_PREV;
   1234            }
   1235            *target++ = static_cast<char16_t>(c);
   1236        } else {
                   /* lead byte of a multi-byte sequence or reset byte: general loop */
   1237            break;
   1238        }
   1239        ++source;
   1240        --count;
   1241    }
   1242 
   1243    /* decode a sequence of single and lead bytes */
   1244    while(source<sourceLimit) {
   1245        if(target>=targetLimit) {
   1246            /* target is full */
   1247            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1248            break;
   1249        }
   1250 
   1251        c=*source++;
   1252        if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
   1253            /* Write a code point directly from a single-byte difference. */
   1254            c=prev+(c-BOCU1_MIDDLE);
   1255            if(c<0x3000) {
   1256                *target++ = static_cast<char16_t>(c);
   1257                prev=BOCU1_SIMPLE_PREV(c);
   1258                goto fastSingle;
   1259            }
                   /* c>=0x3000: fall out of the if-chain to the common output code, which uses BOCU1_PREV() */
   1260        } else if(c<=0x20) {
   1261            /*
   1262             * Direct-encoded C0 control code or space.
   1263             * Reset prev for C0 control codes but not for space.
   1264             */
   1265            if(c!=0x20) {
   1266                prev=BOCU1_ASCII_PREV;
   1267            }
   1268            *target++ = static_cast<char16_t>(c);
   1269            continue;
   1270        } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
   1271            /* Optimize two-byte case. */
                   /* only taken when the trail byte is already available in this buffer */
   1272            if(c>=BOCU1_MIDDLE) {
   1273                diff = (c - BOCU1_START_POS_2) * BOCU1_TRAIL_COUNT + BOCU1_REACH_POS_1 + 1;
   1274            } else {
   1275                diff = (c - BOCU1_START_NEG_2) * BOCU1_TRAIL_COUNT + BOCU1_REACH_NEG_1;
   1276            }
   1277 
   1278            /* trail byte */
   1279            c=decodeBocu1TrailByte(1, *source++);
   1280            if (c < 0 || static_cast<uint32_t>(c = prev + diff + c) > 0x10ffff) {
                       /* hand both bytes of the illegal sequence back via toUBytes */
   1281                bytes[0]=source[-2];
   1282                bytes[1]=source[-1];
   1283                byteIndex=2;
   1284                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1285                break;
   1286            }
   1287        } else if(c==BOCU1_RESET) {
   1288            /* only reset the state, no code point */
   1289            prev=BOCU1_ASCII_PREV;
   1290            continue;
   1291        } else {
   1292            /*
   1293             * For multi-byte difference lead bytes, set the decoder state
   1294             * with the partial difference value from the lead byte and
   1295             * with the number of trail bytes.
   1296             */
   1297            bytes[0] = static_cast<uint8_t>(c);
   1298            byteIndex=1;
   1299 
   1300            diff=decodeBocu1LeadByte(c);
   1301            count=diff&3;
   1302            diff>>=2;
   1303 getTrail:
   1304            for(;;) {
   1305                if(source>=sourceLimit) {
                           /* input ends inside the sequence: diff, count and bytes are saved at endloop */
   1306                    goto endloop;
   1307                }
   1308                c=bytes[byteIndex++]=*source++;
   1309 
   1310                /* trail byte in any position */
   1311                c=decodeBocu1TrailByte(count, c);
   1312                if(c<0) {
   1313                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1314                    goto endloop;
   1315                }
   1316 
   1317                diff+=c;
   1318                if(--count==0) {
   1319                    /* final trail byte, deliver a code point */
                           /*
                            * NOTE(review): byteIndex is cleared before the range check,
                            * so toULength is 0 if the check below reports
                            * U_ILLEGAL_CHAR_FOUND - confirm that this is intended.
                            */
   1320                    byteIndex=0;
   1321                    c=prev+diff;
   1322                    if (static_cast<uint32_t>(c) > 0x10ffff) {
   1323                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1324                        goto endloop;
   1325                    }
   1326                    break;
   1327                }
   1328            }
   1329        }
   1330 
   1331        /* calculate the next prev and output c */
   1332        prev=BOCU1_PREV(c);
   1333        if(c<=0xffff) {
   1334            *target++ = static_cast<char16_t>(c);
   1335        } else {
   1336            /* output surrogate pair */
   1337            *target++=U16_LEAD(c);
   1338            if(target<targetLimit) {
   1339                *target++=U16_TRAIL(c);
   1340            } else {
   1341                /* target overflow */
                       /* only the lead surrogate fit; the trail surrogate goes to the overflow buffer */
   1342                cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
   1343                cnv->UCharErrorBufferLength=1;
   1344                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1345                break;
   1346            }
   1347        }
   1348    }
   1349 endloop:
   1350 
   1351    if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
   1352        /* set the converter state in UConverter to deal with the next character */
   1353        cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
   1354        cnv->mode=0;
   1355    } else {
   1356        /* set the converter state back into UConverter */
   1357        cnv->toUnicodeStatus = static_cast<uint32_t>(prev);
   1358        cnv->mode = (static_cast<uint32_t>(diff) << 2) | count;
   1359    }
   1360    cnv->toULength=byteIndex;
   1361 
   1362    /* write back the updated pointers */
   1363    pArgs->source = reinterpret_cast<const char*>(source);
   1364    pArgs->target=target;
   1365 }
   1366 
   1367 /* miscellaneous ------------------------------------------------------------ */
   1368 
        /*
        * Implementation table for the BOCU-1 converter.
        * The initializer is positional, so the meaning of each entry is fixed by
        * the member order of UConverterImpl, which is declared outside this file.
        * Only the converter type, the four conversion functions and
        * ucnv_getCompleteUnicodeSet are supplied; every other slot is nullptr.
        */
   1369 static const UConverterImpl _Bocu1Impl={
   1370    UCNV_BOCU1,
   1371 
   1372    nullptr,
   1373    nullptr,
   1374 
   1375    nullptr,
   1376    nullptr,
   1377    nullptr,
   1378 
           /* the to-Unicode and from-Unicode functions, each without and with offsets */
   1379    _Bocu1ToUnicode,
   1380    _Bocu1ToUnicodeWithOffsets,
   1381    _Bocu1FromUnicode,
   1382    _Bocu1FromUnicodeWithOffsets,
   1383    nullptr,
   1384 
   1385    nullptr,
   1386    nullptr,
   1387    nullptr,
   1388    nullptr,
   1389    ucnv_getCompleteUnicodeSet,
   1390 
   1391    nullptr,
   1392    nullptr
   1393 };
   1394 
        /*
        * Static description of the BOCU-1 converter (name, CCSID, byte limits).
        * The initializer is positional; the meaning of the entries that carry no
        * comment below is fixed by the member order of UConverterStaticData,
        * which is declared outside this file.
        */
   1395 static const UConverterStaticData _Bocu1StaticData={
   1396    sizeof(UConverterStaticData),
   1397    "BOCU-1",
   1398    1214, /* CCSID for BOCU-1 */
   1399    UCNV_IBM, UCNV_BOCU1,
   1400    1, 4, /* one char16_t generates at least 1 byte and at most 4 bytes */
   1401    { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */
   1402    false, false,
   1403    0,
   1404    0,
   1405    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
   1406 };
   1407 
        /*
        * Shared data object for BOCU-1, built from the static description and the
        * implementation table above. Unlike those two it is not declared static;
        * presumably it is referenced from outside this file - confirm against the
        * declaration in the converter headers.
        */
   1408 const UConverterSharedData _Bocu1Data=
   1409        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_Bocu1StaticData, &_Bocu1Impl);
   1410 
   1411 #endif
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE