[ tor-browser ].git.dasho

ucnv_u7.cpp (55992B)
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*  
      4 **********************************************************************
      5 *   Copyright (C) 2002-2016, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 *   file name:  ucnv_u7.c
      9 *   encoding:   UTF-8
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2002jul01
     14 *   created by: Markus W. Scherer
     15 *
     16 *   UTF-7 converter implementation. Used to be in ucnv_utf.c.
     17 */
     18 
     19 #include "unicode/utypes.h"
     20 
     21 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
     22 
     23 #include "cmemory.h"
     24 #include "unicode/ucnv.h"
     25 #include "ucnv_bld.h"
     26 #include "ucnv_cnv.h"
     27 #include "uassert.h"
     28 
     29 /* UTF-7 -------------------------------------------------------------------- */
     30 
     31 /*
     32 * UTF-7 is a stateful encoding of Unicode.
     33 * It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt)
     34 * It was intended for use in Internet email systems, using in its bytewise
     35 * encoding only a subset of 7-bit US-ASCII.
     36 * UTF-7 is deprecated in favor of UTF-8/16/32 and SCSU, but still
     37 * occasionally used.
     38 *
     39 * When converting Unicode to UTF-7, the RFC allows some US-ASCII
     40 * characters to be encoded either directly or in base64. In particular,
     41 * the characters in set O as defined in the RFC (see below) may be encoded
     42 * directly but are not allowed in, e.g., email headers.
     43 * By default, the ICU UTF-7 converter encodes set O directly.
     44 * By choosing the option "version=1", set O will be escaped instead.
     45 * For example:
     46 *     utf7Converter=ucnv_open("UTF-7,version=1");
     47 *
     48 * For details about email headers see RFC 2047.
     49 */
     50 
     51 /*
     52 * Tests for US-ASCII characters belonging to character classes
     53 * defined in UTF-7.
     54 *
     55 * Set D (directly encoded characters) consists of the following
     56 * characters: the upper and lower case letters A through Z
     57 * and a through z, the 10 digits 0-9, and the following nine special
     58 * characters (note that "+" and "=" are omitted):
     59 *     '(),-./:?
     60 *
     61 * Set O (optional direct characters) consists of the following
     62 * characters (note that "\" and "~" are omitted):
     63 *     !"#$%&*;<=>@[]^_`{|}
     64 *
     65 * According to the rules in RFC 2152, the byte values for the following
     66 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
     67 * - all C0 control codes except for CR LF TAB
     68 * - BACKSLASH
     69 * - TILDE
     70 * - DEL
     71 * - all codes beyond US-ASCII, i.e. all >127
     72 */
     73 #define inSetD(c) \
     74    ((uint8_t)((c)-97)<26 || (uint8_t)((c)-65)<26 || /* letters */ \
     75     (uint8_t)((c)-48)<10 ||    /* digits */ \
     76     (uint8_t)((c)-39)<3 ||     /* '() */ \
     77     (uint8_t)((c)-44)<4 ||     /* ,-./ */ \
     78     (c)==58 || (c)==63         /* :? */ \
     79    )
     80 
     81 #define inSetO(c) \
     82    ((uint8_t)((c)-33)<6 ||         /* !"#$%& */ \
     83     (uint8_t)((c)-59)<4 ||         /* ;<=> */ \
     84     (uint8_t)((c)-93)<4 ||         /* ]^_` */ \
     85     (uint8_t)((c)-123)<3 ||        /* {|} */ \
     86     (c)==42 || (c)==64 || (c)==91  /* *@[ */ \
     87    )
     88 
     89 #define isCRLFTAB(c) ((c)==13 || (c)==10 || (c)==9)
     90 #define isCRLFSPTAB(c) ((c)==32 || (c)==13 || (c)==10 || (c)==9)
     91 
     92 #define PLUS  43
     93 #define MINUS 45
     94 #define BACKSLASH 92
     95 #define TILDE 126
     96 
     97 /* legal byte values: all US-ASCII graphic characters from space to before tilde, and CR LF TAB */
     98 #define isLegalUTF7(c) (((uint8_t)((c)-32)<94 && (c)!=BACKSLASH) || isCRLFTAB(c))
     99 
    100 /* encode directly sets D and O and CR LF SP TAB */
    101 static const UBool encodeDirectlyMaximum[128]={
    102 /* 0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    103    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
    104    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    105 
    106    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
    107    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    108 
    109    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    110    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
    111 
    112    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    113    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0
    114 };
    115 
    116 /* encode directly set D and CR LF SP TAB but not set O */
    117 static const UBool encodeDirectlyRestricted[128]={
    118 /* 0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    119    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
    120    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    121 
    122    1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
    123    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
    124 
    125    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    126    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
    127 
    128    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    129    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0
    130 };
    131 
    132 static const uint8_t
    133 toBase64[64]={
    134    /* A-Z */
    135    65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
    136    78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
    137    /* a-z */
    138    97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
    139    110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122,
    140    /* 0-9 */
    141    48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
    142    /* +/ */
    143    43, 47
    144 };
    145 
    146 static const int8_t
    147 fromBase64[128]={
    148    /* C0 controls, -1 for legal ones (CR LF TAB), -3 for illegal ones */
    149    -3, -3, -3, -3, -3, -3, -3, -3, -3, -1, -1, -3, -3, -1, -3, -3,
    150    -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,
    151 
    152    /* general punctuation with + and / and a special value (-2) for - */
    153    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -2, -1, 63,
    154    /* digits */
    155    52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
    156 
    157    /* A-Z */
    158    -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    159    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -3, -1, -1, -1,
    160 
    161    /* a-z */
    162    -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
    163    41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -3, -3
    164 };
    165 
    166 /*
    167 * converter status values:
    168 *
    169 * toUnicodeStatus:
    170 *     24 inDirectMode (boolean)
    171 * 23..16 base64Counter (-1..7)
    172 * 15..0  bits (up to 14 bits incoming base64)
    173 *
    174 * fromUnicodeStatus:
    175 * 31..28 version (0: set O direct  1: set O escaped)
    176 *     24 inDirectMode (boolean)
    177 * 23..16 base64Counter (0..2)
    178 *  7..0  bits (6 bits outgoing base64)
    179 *
    180 */
    181 
    182 U_CDECL_BEGIN
    183 static void U_CALLCONV
    184 _UTF7Reset(UConverter *cnv, UConverterResetChoice choice) {
    185    if(choice<=UCNV_RESET_TO_UNICODE) {
    186        /* reset toUnicode */
    187        cnv->toUnicodeStatus=0x1000000; /* inDirectMode=true */
    188        cnv->toULength=0;
    189    }
    190    if(choice!=UCNV_RESET_TO_UNICODE) {
    191        /* reset fromUnicode */
    192        cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=true */
    193    }
    194 }
    195 
    196 static void U_CALLCONV
    197 _UTF7Open(UConverter *cnv,
    198          UConverterLoadArgs *pArgs,
    199          UErrorCode *pErrorCode) {
    200    (void)pArgs;
    201    if(UCNV_GET_VERSION(cnv)<=1) {
    202        /* TODO(markus): Should just use cnv->options rather than copying the version number. */
    203        cnv->fromUnicodeStatus=UCNV_GET_VERSION(cnv)<<28;
    204        _UTF7Reset(cnv, UCNV_RESET_BOTH);
    205    } else {
    206        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    207    }
    208 }
    209 
    210 static void U_CALLCONV
    211 _UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
    212                          UErrorCode *pErrorCode) {
           /*
            * Converts UTF-7 bytes in [pArgs->source, pArgs->sourceLimit) to UTF-16
            * code units written at pArgs->target (up to pArgs->targetLimit).
            * If pArgs->offsets is not nullptr, one source index is written per
            * output code unit.
            *
            * The decoder is a two-state machine (Direct Mode / Unicode Mode) whose
            * state is loaded from and stored back to cnv->toUnicodeStatus, with the
            * bytes of an unfinished or erroneous sequence kept in cnv->toUBytes
            * (length in cnv->toULength).
            *
            * Error codes set here:
            *   U_ILLEGAL_CHAR_FOUND     illegal byte, or malformed base64 sequence
            *   U_BUFFER_OVERFLOW_ERROR  input remains but the target is full
            *
            * On return, pArgs->source, pArgs->target and pArgs->offsets are advanced
            * past what has been consumed/produced.
            */
    213    UConverter *cnv;
    214    const uint8_t *source, *sourceLimit;
    215    char16_t *target;
    216    const char16_t *targetLimit;
    217    int32_t *offsets;
    218 
    219    uint8_t *bytes;
    220    uint8_t byteIndex;
    221 
    222    int32_t length, targetCapacity;
    223 
    224    /* UTF-7 state */
    225    uint16_t bits;
    226    int8_t base64Counter;
    227    UBool inDirectMode;
    228 
    229    int8_t base64Value;
    230 
    231    int32_t sourceIndex, nextSourceIndex;
    232 
    233    uint8_t b;
    234    /* set up the local pointers */
    235    cnv=pArgs->converter;
    236 
    237    source=(const uint8_t *)pArgs->source;
    238    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
    239    target=pArgs->target;
    240    targetLimit=pArgs->targetLimit;
    241    offsets=pArgs->offsets;
    242    /* get the state machine state */
           /* unpack: bit 24 = inDirectMode, low 8 bits of (status>>16) = base64Counter, bits 15..0 = bits */
    243    {
    244        uint32_t status=cnv->toUnicodeStatus;
    245        inDirectMode=(UBool)((status>>24)&1);
    246        base64Counter=(int8_t)(status>>16);
    247        bits=(uint16_t)status;
    248    }
    249    bytes=cnv->toUBytes;
    250    byteIndex=cnv->toULength;
    251 
    252    /* sourceIndex=-1 if the current character began in the previous buffer */
    253    sourceIndex=byteIndex==0 ? 0 : -1;
    254    nextSourceIndex=0;
    255 
    256    if(inDirectMode) {
    257 directMode:
    258        /*
    259         * In Direct Mode, most US-ASCII characters are encoded directly, i.e.,
    260         * with their US-ASCII byte values.
    261         * Backslash and Tilde and most control characters are not allowed in UTF-7.
    262         * A plus sign starts Unicode (or "escape") Mode.
    263         *
    264         * In Direct Mode, only the sourceIndex is used.
    265         */
    266        byteIndex=0;
               /*
                * Every directly encoded byte produces exactly one char16_t, so the
                * loop count can be bounded up front by the smaller of the remaining
                * input and the remaining output capacity.
                */
    267        length=(int32_t)(sourceLimit-source);
    268        targetCapacity=(int32_t)(targetLimit-target);
    269        if(length>targetCapacity) {
    270            length=targetCapacity;
    271        }
    272        while(length>0) {
    273            b=*source++;
    274            if(!isLegalUTF7(b)) {
    275                /* illegal */
    276                bytes[0]=b;
    277                byteIndex=1;
    278                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    279                break;
    280            } else if(b!=PLUS) {
    281                /* write directly encoded character */
    282                *target++=b;
    283                if(offsets!=nullptr) {
    284                    *offsets++=sourceIndex++;
    285                }
    286            } else /* PLUS */ {
    287                /* switch to Unicode mode */
    288                nextSourceIndex=++sourceIndex;
    289                inDirectMode=false;
    290                byteIndex=0;
    291                bits=0;
    292                base64Counter=-1;
    293                goto unicodeMode;
    294            }
    295            --length;
    296        }
    297        if(source<sourceLimit && target>=targetLimit) {
    298            /* target is full */
    299            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    300        }
    301    } else {
    302 unicodeMode:
    303        /*
    304         * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
    305         * The base64 sequence ends with any character that is not in the base64 alphabet.
    306         * A terminating minus sign is consumed.
    307         *
    308         * In Unicode Mode, the sourceIndex has the index to the start of the current
    309         * base64 bytes, while nextSourceIndex is precisely parallel to source,
    310         * keeping the index to the following byte.
    311         * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
    312         */
    313        while(source<sourceLimit) {
    314            if(target<targetLimit) {
    315                bytes[byteIndex++]=b=*source++;
    316                ++nextSourceIndex;
    317                base64Value = -3; /* initialize as illegal */
                       /*
                        * Bytes >=126 skip the table lookup (fromBase64 has only 128
                        * entries) and keep the preset -3; the short-circuit || ensures
                        * fromBase64[b] is only read for b<126.
                        */
    318                if(b>=126 || (base64Value=fromBase64[b])==-3 || base64Value==-1) {
    319                    /* either
    320                     * base64Value==-1 for any legal character except base64 and minus sign, or
    321                     * base64Value==-3 for illegal characters:
    322                     * 1. In either case, leave Unicode mode.
    323                     * 2.1. If we ended with an incomplete char16_t or none after the +, then
    324                     *      generate an error for the preceding erroneous sequence and deal with
    325                     *      the current (possibly illegal) character next time through.
    326                     * 2.2. Else the current char comes after a complete char16_t, which was already
    327                     *      pushed to the output buf, so:
    328                     * 2.2.1. If the current char is legal, just save it for processing next time.
    329                     *        It may be for example, a plus which we need to deal with in direct mode.
    330                     * 2.2.2. Else if the current char is illegal, we might as well deal with it here.
    331                     */
    332                    inDirectMode=true;
    333                    if(base64Counter==-1) {
    334                        /* illegal: + immediately followed by something other than base64 or minus sign */
    335                        /* include the plus sign in the reported sequence, but not the subsequent char */
    336                        --source;
    337                        bytes[0]=PLUS;
    338                        byteIndex=1;
    339                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    340                        break;
    341                    } else if(bits!=0) {
    342                        /* bits are illegally left over, a char16_t is incomplete */
    343                        /* don't include current char (legal or illegal) in error seq */
    344                        --source;
    345                        --byteIndex;
    346                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    347                        break;
    348                    } else {
    349                        /* previous char16_t was complete */
    350                        if(base64Value==-3) {
    351                            /* current character is illegal, deal with it here */
    352                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    353                            break;
    354                        } else {
    355                            /* un-read the current character in case it is a plus sign */
    356                            --source;
    357                            sourceIndex=nextSourceIndex-1;
    358                            goto directMode;
    359                        }
    360                    }
    361                } else if(base64Value>=0) {
    362                    /* collect base64 bytes into UChars */
                           /*
                            * base64Counter cycles through 8 base64 digits (48 bits), which
                            * carry exactly three char16_t. A char16_t is completed and written
                            * on the 3rd, 6th and 8th digit (cases 2, 5 and 7); in cases 2 and 5
                            * the low bits of the digit are carried over in bits.
                            */
    363                    switch(base64Counter) {
    364                    case -1: /* -1 is immediately after the + */
    365                    case 0:
    366                        bits=base64Value;
    367                        base64Counter=1;
    368                        break;
    369                    case 1:
    370                    case 3:
    371                    case 4:
    372                    case 6:
    373                        bits=(uint16_t)((bits<<6)|base64Value);
    374                        ++base64Counter;
    375                        break;
    376                    case 2:
    377                        *target++=(char16_t)((bits<<4)|(base64Value>>2));
    378                        if(offsets!=nullptr) {
    379                            *offsets++=sourceIndex;
    380                            sourceIndex=nextSourceIndex-1;
    381                        }
    382                        bytes[0]=b; /* keep this byte in case an error occurs */
    383                        byteIndex=1;
    384                        bits=(uint16_t)(base64Value&3);
    385                        base64Counter=3;
    386                        break;
    387                    case 5:
    388                        *target++=(char16_t)((bits<<2)|(base64Value>>4));
    389                        if(offsets!=nullptr) {
    390                            *offsets++=sourceIndex;
    391                            sourceIndex=nextSourceIndex-1;
    392                        }
    393                        bytes[0]=b; /* keep this byte in case an error occurs */
    394                        byteIndex=1;
    395                        bits=(uint16_t)(base64Value&15);
    396                        base64Counter=6;
    397                        break;
    398                    case 7:
    399                        *target++=(char16_t)((bits<<6)|base64Value);
    400                        if(offsets!=nullptr) {
    401                            *offsets++=sourceIndex;
    402                            sourceIndex=nextSourceIndex;
    403                        }
    404                        byteIndex=0;
    405                        bits=0;
    406                        base64Counter=0;
    407                        break;
    408                    default:
    409                        /* will never occur */
    410                        break;
    411                    }
    412                } else /*base64Value==-2*/ {
    413                    /* minus sign terminates the base64 sequence */
    414                    inDirectMode=true;
    415                    if(base64Counter==-1) {
    416                        /* +- i.e. a minus immediately following a plus */
    417                        *target++=PLUS;
    418                        if(offsets!=nullptr) {
    419                            *offsets++=sourceIndex-1;
    420                        }
    421                    } else {
    422                        /* absorb the minus and leave the Unicode Mode */
    423                        if(bits!=0) {
    424                            /* bits are illegally left over, a char16_t is incomplete */
    425                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    426                            break;
    427                        }
    428                    }
    429                    sourceIndex=nextSourceIndex;
    430                    goto directMode;
    431                }
    432            } else {
    433                /* target is full */
    434                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    435                break;
    436            }
    437        }
    438    }
    439 
    440    if(U_SUCCESS(*pErrorCode) && pArgs->flush && source==sourceLimit && bits==0) {
    441        /*
    442         * if we are in Unicode mode, then the byteIndex might not be 0,
    443         * but that is ok if bits==0
    444         * -> we set byteIndex=0 at the end of the stream to avoid a truncated error
    445         * (not true for IMAP-mailbox-name where we must end in direct mode)
    446         */
    447        byteIndex=0;
    448    }
    449 
    450    /* set the converter state back into UConverter */
           /* base64Counter goes through uint8_t so that -1 becomes 0xff and does not sign-extend into bit 24 and above */
    451    cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
    452    cnv->toULength=byteIndex;
    453 
    454    /* write back the updated pointers */
    455    pArgs->source=(const char *)source;
    456    pArgs->target=target;
    457    pArgs->offsets=offsets;
    458 }
    459 
    460 static void U_CALLCONV
    461 _UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
    462                            UErrorCode *pErrorCode) {
    463    UConverter *cnv;
    464    const char16_t *source, *sourceLimit;
    465    uint8_t *target, *targetLimit;
    466    int32_t *offsets;
    467 
    468    int32_t length, targetCapacity, sourceIndex;
    469    char16_t c;
    470 
    471    /* UTF-7 state */
    472    const UBool *encodeDirectly;
    473    uint8_t bits;
    474    int8_t base64Counter;
    475    UBool inDirectMode;
    476 
    477    /* set up the local pointers */
    478    cnv=pArgs->converter;
    479 
    480    /* set up the local pointers */
    481    source=pArgs->source;
    482    sourceLimit=pArgs->sourceLimit;
    483    target=(uint8_t *)pArgs->target;
    484    targetLimit=(uint8_t *)pArgs->targetLimit;
    485    offsets=pArgs->offsets;
    486 
    487    /* get the state machine state */
    488    {
    489        uint32_t status=cnv->fromUnicodeStatus;
    490        encodeDirectly= status<0x10000000 ? encodeDirectlyMaximum : encodeDirectlyRestricted;
    491        inDirectMode=(UBool)((status>>24)&1);
    492        base64Counter=(int8_t)(status>>16);
    493        bits=(uint8_t)status;
    494        U_ASSERT(bits<=UPRV_LENGTHOF(toBase64));
    495    }
    496 
    497    /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
    498    sourceIndex=0;
    499 
    500    if(inDirectMode) {
    501 directMode:
    502        length=(int32_t)(sourceLimit-source);
    503        targetCapacity=(int32_t)(targetLimit-target);
    504        if(length>targetCapacity) {
    505            length=targetCapacity;
    506        }
    507        while(length>0) {
    508            c=*source++;
    509            /* currently always encode CR LF SP TAB directly */
    510            if(c<=127 && encodeDirectly[c]) {
    511                /* encode directly */
    512                *target++=(uint8_t)c;
    513                if(offsets!=nullptr) {
    514                    *offsets++=sourceIndex++;
    515                }
    516            } else if(c==PLUS) {
    517                /* output +- for + */
    518                *target++=PLUS;
    519                if(target<targetLimit) {
    520                    *target++=MINUS;
    521                    if(offsets!=nullptr) {
    522                        *offsets++=sourceIndex;
    523                        *offsets++=sourceIndex++;
    524                    }
    525                    /* realign length and targetCapacity */
    526                    goto directMode;
    527                } else {
    528                    if(offsets!=nullptr) {
    529                        *offsets++=sourceIndex++;
    530                    }
    531                    cnv->charErrorBuffer[0]=MINUS;
    532                    cnv->charErrorBufferLength=1;
    533                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    534                    break;
    535                }
    536            } else {
    537                /* un-read this character and switch to Unicode Mode */
    538                --source;
    539                *target++=PLUS;
    540                if(offsets!=nullptr) {
    541                    *offsets++=sourceIndex;
    542                }
    543                inDirectMode=false;
    544                base64Counter=0;
    545                goto unicodeMode;
    546            }
    547            --length;
    548        }
    549        if(source<sourceLimit && target>=targetLimit) {
    550            /* target is full */
    551            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    552        }
    553    } else {
    554 unicodeMode:
    555        while(source<sourceLimit) {
    556            if(target<targetLimit) {
    557                c=*source++;
    558                if(c<=127 && encodeDirectly[c]) {
    559                    /* encode directly */
    560                    inDirectMode=true;
    561 
    562                    /* trick: back out this character to make this easier */
    563                    --source;
    564 
    565                    /* terminate the base64 sequence */
    566                    if(base64Counter!=0) {
    567                        /* write remaining bits for the previous character */
    568                        *target++=toBase64[bits];
    569                        if(offsets!=nullptr) {
    570                            *offsets++=sourceIndex-1;
    571                        }
    572                    }
    573                    if(fromBase64[c]!=-1) {
    574                        /* need to terminate with a minus */
    575                        if(target<targetLimit) {
    576                            *target++=MINUS;
    577                            if(offsets!=nullptr) {
    578                                *offsets++=sourceIndex-1;
    579                            }
    580                        } else {
    581                            cnv->charErrorBuffer[0]=MINUS;
    582                            cnv->charErrorBufferLength=1;
    583                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    584                            break;
    585                        }
    586                    }
    587                    goto directMode;
    588                } else {
    589                    /*
    590                     * base64 this character:
    591                     * Output 2 or 3 base64 bytes for the remaining bits of the previous character
    592                     * and the bits of this character, each implicitly in UTF-16BE.
    593                     *
    594                     * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
    595                     * character to the next. The actual 2 or 4 bits are shifted to the left edge
    596                     * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
    597                     */
    598                    switch(base64Counter) {
    599                    case 0:
    600                        *target++=toBase64[c>>10];
    601                        if(target<targetLimit) {
    602                            *target++=toBase64[(c>>4)&0x3f];
    603                            if(offsets!=nullptr) {
    604                                *offsets++=sourceIndex;
    605                                *offsets++=sourceIndex++;
    606                            }
    607                        } else {
    608                            if(offsets!=nullptr) {
    609                                *offsets++=sourceIndex++;
    610                            }
    611                            cnv->charErrorBuffer[0]=toBase64[(c>>4)&0x3f];
    612                            cnv->charErrorBufferLength=1;
    613                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    614                        }
    615                        bits=(uint8_t)((c&15)<<2);
    616                        base64Counter=1;
    617                        break;
    618                    case 1:
    619                        *target++=toBase64[bits|(c>>14)];
    620                        if(target<targetLimit) {
    621                            *target++=toBase64[(c>>8)&0x3f];
    622                            if(target<targetLimit) {
    623                                *target++=toBase64[(c>>2)&0x3f];
    624                                if(offsets!=nullptr) {
    625                                    *offsets++=sourceIndex;
    626                                    *offsets++=sourceIndex;
    627                                    *offsets++=sourceIndex++;
    628                                }
    629                            } else {
    630                                if(offsets!=nullptr) {
    631                                    *offsets++=sourceIndex;
    632                                    *offsets++=sourceIndex++;
    633                                }
    634                                cnv->charErrorBuffer[0]=toBase64[(c>>2)&0x3f];
    635                                cnv->charErrorBufferLength=1;
    636                                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    637                            }
    638                        } else {
    639                            if(offsets!=nullptr) {
    640                                *offsets++=sourceIndex++;
    641                            }
    642                            cnv->charErrorBuffer[0]=toBase64[(c>>8)&0x3f];
    643                            cnv->charErrorBuffer[1]=toBase64[(c>>2)&0x3f];
    644                            cnv->charErrorBufferLength=2;
    645                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    646                        }
    647                        bits=(uint8_t)((c&3)<<4);
    648                        base64Counter=2;
    649                        break;
    650                    case 2:
    651                        *target++=toBase64[bits|(c>>12)];
    652                        if(target<targetLimit) {
    653                            *target++=toBase64[(c>>6)&0x3f];
    654                            if(target<targetLimit) {
    655                                *target++=toBase64[c&0x3f];
    656                                if(offsets!=nullptr) {
    657                                    *offsets++=sourceIndex;
    658                                    *offsets++=sourceIndex;
    659                                    *offsets++=sourceIndex++;
    660                                }
    661                            } else {
    662                                if(offsets!=nullptr) {
    663                                    *offsets++=sourceIndex;
    664                                    *offsets++=sourceIndex++;
    665                                }
    666                                cnv->charErrorBuffer[0]=toBase64[c&0x3f];
    667                                cnv->charErrorBufferLength=1;
    668                                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    669                            }
    670                        } else {
    671                            if(offsets!=nullptr) {
    672                                *offsets++=sourceIndex++;
    673                            }
    674                            cnv->charErrorBuffer[0]=toBase64[(c>>6)&0x3f];
    675                            cnv->charErrorBuffer[1]=toBase64[c&0x3f];
    676                            cnv->charErrorBufferLength=2;
    677                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    678                        }
    679                        bits=0;
    680                        base64Counter=0;
    681                        break;
    682                    default:
    683                        /* will never occur */
    684                        break;
    685                    }
    686                }
    687            } else {
    688                /* target is full */
    689                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    690                break;
    691            }
    692        }
    693    }
    694 
    695    if(pArgs->flush && source>=sourceLimit) {
    696        /* flush remaining bits to the target */
    697        if(!inDirectMode) {
    698            if (base64Counter!=0) {
    699                if(target<targetLimit) {
    700                    *target++=toBase64[bits];
    701                    if(offsets!=nullptr) {
    702                        *offsets++=sourceIndex-1;
    703                    }
    704                } else {
    705                    cnv->charErrorBuffer[cnv->charErrorBufferLength++]=toBase64[bits];
    706                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    707                }
    708            }
    709            /* Add final MINUS to terminate unicodeMode */
    710            if(target<targetLimit) {
    711                *target++=MINUS;
    712                if(offsets!=nullptr) {
    713                    *offsets++=sourceIndex-1;
    714                }
    715            } else {
    716                cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
    717                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    718            }
    719        }
    720        /* reset the state for the next conversion */
    721        cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=true */
    722    } else {
    723        /* set the converter state back into UConverter */
    724        cnv->fromUnicodeStatus=
    725            (cnv->fromUnicodeStatus&0xf0000000)|    /* keep version*/
    726            ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
    727    }
    728 
    729    /* write back the updated pointers */
    730    pArgs->source=source;
    731    pArgs->target=(char *)target;
    732    pArgs->offsets=offsets;
    733 }
    734 
    735 static const char * U_CALLCONV
    736 _UTF7GetName(const UConverter *cnv) {
    737    switch(cnv->fromUnicodeStatus>>28) {
    738    case 1:
    739        return "UTF-7,version=1";
    740    default:
    741        return "UTF-7";
    742    }
    743 }
    744 U_CDECL_END
    745 
     746 static const UConverterImpl _UTF7Impl={
            /*
             * Function table for the UTF-7 converter.
             * The entries are positional: which slot means what is fixed by the
             * UConverterImpl declaration, which is not part of this file.
             * A nullptr entry leaves that slot without a UTF-7-specific function.
             */
     747    UCNV_UTF7,
     748 
     749    nullptr,
     750    nullptr,
     751 
     752    _UTF7Open,
     753    nullptr,
     754    _UTF7Reset,
     755 
            /*
             * The same routine fills both to-Unicode slots and the same routine fills
             * both from-Unicode slots.
             * NOTE(review): presumably the plain and the with-offsets variants -- confirm
             * against the UConverterImpl declaration.
             */
     756    _UTF7ToUnicodeWithOffsets,
     757    _UTF7ToUnicodeWithOffsets,
     758    _UTF7FromUnicodeWithOffsets,
     759    _UTF7FromUnicodeWithOffsets,
     760    nullptr,
     761 
     762    nullptr,
     763    _UTF7GetName,
     764    nullptr, /* we don't need writeSub() because we never call a callback at fromUnicode() */
     765    nullptr,
     766    ucnv_getCompleteUnicodeSet,
     767 
     768    nullptr,
     769    nullptr
     770 };
    771 
     772 static const UConverterStaticData _UTF7StaticData={
            /*
             * Constant description of the UTF-7 converter.
             * The initializer is positional; the field names come from the
             * UConverterStaticData declaration, which is not part of this file.
             */
     773    sizeof(UConverterStaticData), /* size of this structure */
     774    "UTF-7", /* converter name string */
     775    0, /* TODO CCSID for UTF-7 */
     776    UCNV_IBM, UCNV_UTF7,
     777    1, 4, /* NOTE(review): presumably min/max bytes per character -- confirm against UConverterStaticData */
     778    { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
     779    false, false,
     780    0,
     781    0,
     782    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
     783 };
    784 
     785 const UConverterSharedData _UTF7Data=
            /*
             * Shared-data object for UTF-7: hands the addresses of _UTF7StaticData and
             * _UTF7Impl to UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER.
             * Unlike the two tables it refers to, this object is not declared static.
             */
     786        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF7StaticData, &_UTF7Impl);
    787 
    788 /* IMAP mailbox name encoding ----------------------------------------------- */
    789 
    790 /*
    791 * RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
    792 * http://www.ietf.org/rfc/rfc2060.txt
    793 *
    794 * 5.1.3.  Mailbox International Naming Convention
    795 *
    796 * By convention, international mailbox names are specified using a
    797 * modified version of the UTF-7 encoding described in [UTF-7].  The
    798 * purpose of these modifications is to correct the following problems
    799 * with UTF-7:
    800 *
    801 *    1) UTF-7 uses the "+" character for shifting; this conflicts with
    802 *       the common use of "+" in mailbox names, in particular USENET
    803 *       newsgroup names.
    804 *
    805 *    2) UTF-7's encoding is BASE64 which uses the "/" character; this
    806 *       conflicts with the use of "/" as a popular hierarchy delimiter.
    807 *
    808 *    3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with
    809 *       the use of "\" as a popular hierarchy delimiter.
    810 *
    811 *    4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with
    812 *       the use of "~" in some servers as a home directory indicator.
    813 *
    814 *    5) UTF-7 permits multiple alternate forms to represent the same
    815 *       string; in particular, printable US-ASCII characters can be
    816 *       represented in encoded form.
    817 *
    818 * In modified UTF-7, printable US-ASCII characters except for "&"
    819 * represent themselves; that is, characters with octet values 0x20-0x25
    820 * and 0x27-0x7e.  The character "&" (0x26) is represented by the two-
    821 * octet sequence "&-".
    822 *
    823 * All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all
    824 * Unicode 16-bit octets) are represented in modified BASE64, with a
    825 * further modification from [UTF-7] that "," is used instead of "/".
    826 * Modified BASE64 MUST NOT be used to represent any printing US-ASCII
    827 * character which can represent itself.
    828 *
    829 * "&" is used to shift to modified BASE64 and "-" to shift back to US-
    830 * ASCII.  All names start in US-ASCII, and MUST end in US-ASCII (that
    831 * is, a name that ends with a Unicode 16-bit octet MUST end with a "-
    832 * ").
    833 *
    834 * For example, here is a mailbox name which mixes English, Japanese,
    835 * and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw-
    836 */
    837 
    838 /*
    839 * Tests for US-ASCII characters belonging to character classes
    840 * defined in UTF-7.
    841 *
    842 * Set D (directly encoded characters) consists of the following
    843 * characters: the upper and lower case letters A through Z
    844 * and a through z, the 10 digits 0-9, and the following nine special
    845 * characters (note that "+" and "=" are omitted):
    846 *     '(),-./:?
    847 *
    848 * Set O (optional direct characters) consists of the following
    849 * characters (note that "\" and "~" are omitted):
    850 *     !"#$%&*;<=>@[]^_`{|}
    851 *
    852 * According to the rules in RFC 2152, the byte values for the following
    853 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
    854 * - all C0 control codes except for CR LF TAB
    855 * - BACKSLASH
    856 * - TILDE
    857 * - DEL
    858 * - all codes beyond US-ASCII, i.e. all >127
    859 */
    860 
    861 /* uses '&' not '+' to start a base64 sequence */
    862 #define AMPERSAND 0x26
    863 #define COMMA 0x2c
    864 #define SLASH 0x2f
    865 
    866 /* legal byte values: all US-ASCII graphic characters 0x20..0x7e */
    867 #define isLegalIMAP(c) (0x20<=(c) && (c)<=0x7e)
    868 
    869 /* direct-encode all of printable ASCII 0x20..0x7e except '&' 0x26 */
    870 #define inSetDIMAP(c) (isLegalIMAP(c) && c!=AMPERSAND)
    871 
    872 #define TO_BASE64_IMAP(n) ((n)<63 ? toBase64[n] : COMMA)
    873 #define FROM_BASE64_IMAP(c) ((c)==COMMA ? 63 : (c)==SLASH ? -1 : fromBase64[c])
    874 
    875 /*
    876 * converter status values:
    877 *
    878 * toUnicodeStatus:
    879 *     24 inDirectMode (boolean)
    880 * 23..16 base64Counter (-1..7)
    881 * 15..0  bits (up to 14 bits incoming base64)
    882 *
    883 * fromUnicodeStatus:
    884 *     24 inDirectMode (boolean)
    885 * 23..16 base64Counter (0..2)
    886 *  7..0  bits (6 bits outgoing base64)
    887 *
     888 * bits 31..25 are not part of this state; fromUnicodeStatus bits 31..28 are carried over unchanged
    889 */
    890 
    891 U_CDECL_BEGIN
     892 static void U_CALLCONV
     893 _IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
     894                          UErrorCode *pErrorCode) {
            /*
             * Decodes IMAP modified UTF-7 bytes from pArgs->source into char16_t units
             * at pArgs->target. When pArgs->offsets is not nullptr, one source index is
             * written for every output unit.
             *
             * The decoder state (inDirectMode, base64Counter, bits) is loaded from
             * cnv->toUnicodeStatus and stored back before returning, together with the
             * pending input bytes in cnv->toUBytes / cnv->toULength, so that a later
             * call can continue where this one stopped.
             *
             * Problems are reported through *pErrorCode:
             *   U_ILLEGAL_CHAR_FOUND    - illegal byte or illegal base64 sequence
             *   U_BUFFER_OVERFLOW_ERROR - target is full while input remains
             *   U_TRUNCATED_CHAR_FOUND  - flush requested and all input consumed while
             *                             still in Unicode mode with no pending bytes
             * The updated source, target and offsets pointers are written back to pArgs.
             */
     895    UConverter *cnv;
     896    const uint8_t *source, *sourceLimit;
     897    char16_t *target;
     898    const char16_t *targetLimit;
     899    int32_t *offsets;
     900 
     901    uint8_t *bytes;
     902    uint8_t byteIndex;
     903 
     904    int32_t length, targetCapacity;
     905 
     906    /* UTF-7 state */
     907    uint16_t bits;
     908    int8_t base64Counter;
     909    UBool inDirectMode;
     910 
     911    int8_t base64Value;
     912 
     913    int32_t sourceIndex, nextSourceIndex;
     914 
     915    char16_t c;
     916    uint8_t b;
     917 
     918    /* set up the local pointers */
     919    cnv=pArgs->converter;
     920 
     921    source=(const uint8_t *)pArgs->source;
     922    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
     923    target=pArgs->target;
     924    targetLimit=pArgs->targetLimit;
     925    offsets=pArgs->offsets;
     926    /* get the state machine state */
     927    {
     928        uint32_t status=cnv->toUnicodeStatus;
     929        inDirectMode=(UBool)((status>>24)&1);
     930        base64Counter=(int8_t)(status>>16);
     931        bits=(uint16_t)status;
     932    }
     933    bytes=cnv->toUBytes;
     934    byteIndex=cnv->toULength;
     935 
     936    /* sourceIndex=-1 if the current character began in the previous buffer */
     937    sourceIndex=byteIndex==0 ? 0 : -1;
     938    nextSourceIndex=0;
     939 
     940    if(inDirectMode) {
     941 directMode:
     942        /*
     943         * In Direct Mode, US-ASCII characters are encoded directly, i.e.,
     944         * with their US-ASCII byte values.
     945         * An ampersand starts Unicode (or "escape") Mode.
     946         *
     947         * In Direct Mode, only the sourceIndex is used.
     948         */
     949        byteIndex=0;
                /*
                 * Each byte handled in this loop produces at most one char16_t, so the
                 * loop runs over the smaller of the remaining input and output sizes.
                 */
     950        length=(int32_t)(sourceLimit-source);
     951        targetCapacity=(int32_t)(targetLimit-target);
     952        if(length>targetCapacity) {
     953            length=targetCapacity;
     954        }
     955        while(length>0) {
     956            b=*source++;
     957            if(!isLegalIMAP(b)) {
     958                /* illegal */
     959                bytes[0]=b;
     960                byteIndex=1;
     961                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
     962                break;
     963            } else if(b!=AMPERSAND) {
     964                /* write directly encoded character */
     965                *target++=b;
     966                if(offsets!=nullptr) {
     967                    *offsets++=sourceIndex++;
     968                }
     969            } else /* AMPERSAND */ {
     970                /* switch to Unicode mode */
     971                nextSourceIndex=++sourceIndex;
     972                inDirectMode=false;
     973                byteIndex=0;
     974                bits=0;
     975                base64Counter=-1;
     976                goto unicodeMode;
     977            }
     978            --length;
     979        }
     980        if(source<sourceLimit && target>=targetLimit) {
     981            /* target is full */
     982            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
     983        }
     984    } else {
     985 unicodeMode:
     986        /*
     987         * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
     988         * The base64 sequence ends with any character that is not in the base64 alphabet.
     989         * A terminating minus sign is consumed.
     990         * US-ASCII must not be base64-ed.
     991         *
     992         * In Unicode Mode, the sourceIndex has the index to the start of the current
     993         * base64 bytes, while nextSourceIndex is precisely parallel to source,
     994         * keeping the index to the following byte.
     995         * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
     996         */
     997        while(source<sourceLimit) {
     998            if(target<targetLimit) {
                        /* remember every consumed byte in bytes[] so it can be reported if an error occurs */
     999                bytes[byteIndex++]=b=*source++;
    1000                ++nextSourceIndex;
    1001                if(b>0x7e) {
    1002                    /* illegal - test other illegal US-ASCII values by base64Value==-3 */
    1003                    inDirectMode=true;
    1004                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    1005                    break;
    1006                } else if((base64Value=FROM_BASE64_IMAP(b))>=0) {
    1007                    /* collect base64 bytes into UChars */
    1008                    switch(base64Counter) {
    1009                    case -1: /* -1 is immediately after the & */
    1010                    case 0:
    1011                        bits=base64Value;
    1012                        base64Counter=1;
    1013                        break;
    1014                    case 1:
    1015                    case 3:
    1016                    case 4:
    1017                    case 6:
    1018                        bits=(uint16_t)((bits<<6)|base64Value);
    1019                        ++base64Counter;
    1020                        break;
    1021                    case 2:
                                /* third base64 byte: 12 collected bits plus the top 4 of this byte; 2 bits stay pending */
    1022                        c=(char16_t)((bits<<4)|(base64Value>>2));
    1023                        if(isLegalIMAP(c)) {
    1024                            /* illegal: printable US-ASCII must not be base64-encoded */
    1025                            inDirectMode=true;
    1026                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    1027                            goto endloop;
    1028                        }
    1029                        *target++=c;
    1030                        if(offsets!=nullptr) {
    1031                            *offsets++=sourceIndex;
    1032                            sourceIndex=nextSourceIndex-1;
    1033                        }
    1034                        bytes[0]=b; /* keep this byte in case an error occurs */
    1035                        byteIndex=1;
    1036                        bits=(uint16_t)(base64Value&3);
    1037                        base64Counter=3;
    1038                        break;
    1039                    case 5:
                                /* sixth base64 byte: 14 collected bits plus the top 2 of this byte; 4 bits stay pending */
    1040                        c=(char16_t)((bits<<2)|(base64Value>>4));
    1041                        if(isLegalIMAP(c)) {
    1042                            /* illegal: printable US-ASCII must not be base64-encoded */
    1043                            inDirectMode=true;
    1044                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    1045                            goto endloop;
    1046                        }
    1047                        *target++=c;
    1048                        if(offsets!=nullptr) {
    1049                            *offsets++=sourceIndex;
    1050                            sourceIndex=nextSourceIndex-1;
    1051                        }
    1052                        bytes[0]=b; /* keep this byte in case an error occurs */
    1053                        byteIndex=1;
    1054                        bits=(uint16_t)(base64Value&15);
    1055                        base64Counter=6;
    1056                        break;
    1057                    case 7:
                                /* eighth base64 byte: completes a unit exactly, nothing stays pending */
    1058                        c=(char16_t)((bits<<6)|base64Value);
    1059                        if(isLegalIMAP(c)) {
    1060                            /* illegal: printable US-ASCII must not be base64-encoded */
    1061                            inDirectMode=true;
    1062                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    1063                            goto endloop;
    1064                        }
    1065                        *target++=c;
    1066                        if(offsets!=nullptr) {
    1067                            *offsets++=sourceIndex;
    1068                            sourceIndex=nextSourceIndex;
    1069                        }
    1070                        byteIndex=0;
    1071                        bits=0;
    1072                        base64Counter=0;
    1073                        break;
    1074                    default:
    1075                        /* will never occur */
    1076                        break;
    1077                    }
    1078                } else if(base64Value==-2) {
    1079                    /* minus sign terminates the base64 sequence */
    1080                    inDirectMode=true;
    1081                    if(base64Counter==-1) {
    1082                        /* &- i.e. a minus immediately following an ampersand */
    1083                        *target++=AMPERSAND;
    1084                        if(offsets!=nullptr) {
    1085                            *offsets++=sourceIndex-1;
    1086                        }
    1087                    } else {
    1088                        /* absorb the minus and leave the Unicode Mode */
    1089                        if(bits!=0 || (base64Counter!=0 && base64Counter!=3 && base64Counter!=6)) {
    1090                            /* bits are illegally left over, a char16_t is incomplete */
    1091                            /* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */
    1092                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    1093                            break;
    1094                        }
    1095                    }
    1096                    sourceIndex=nextSourceIndex;
    1097                    goto directMode;
    1098                } else {
    1099                    if(base64Counter==-1) {
    1100                        /* illegal: & immediately followed by something other than base64 or minus sign */
    1101                        /* include the ampersand in the reported sequence */
    1102                        --sourceIndex;
    1103                        bytes[0]=AMPERSAND;
    1104                        bytes[1]=b;
    1105                        byteIndex=2;
    1106                    }
    1107                    /* base64Value==-1 for characters that are illegal only in Unicode mode */
    1108                    /* base64Value==-3 for illegal characters */
    1109                    /* illegal */
    1110                    inDirectMode=true;
    1111                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    1112                    break;
    1113                }
    1114            } else {
    1115                /* target is full */
    1116                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    1117                break;
    1118            }
    1119        }
    1120    }
    1121 endloop:
    1122 
    1123    /*
    1124     * the end of the input stream and detection of truncated input
    1125     * are handled by the framework, but here we must check if we are in Unicode
    1126     * mode and byteIndex==0 because we must end in direct mode
    1127     *
    1128     * conditions:
    1129     *   successful
    1130     *   in Unicode mode and byteIndex==0
    1131     *   end of input and no truncated input
    1132     */
    1133    if( U_SUCCESS(*pErrorCode) &&
    1134        !inDirectMode && byteIndex==0 &&
    1135        pArgs->flush && source>=sourceLimit
    1136    ) {
    1137        if(base64Counter==-1) {
    1138            /* & at the very end of the input */
    1139            /* make the ampersand the reported sequence */
    1140            bytes[0]=AMPERSAND;
    1141            byteIndex=1;
    1142        }
    1143        /* else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */
    1144 
    1145        inDirectMode=true; /* avoid looping */
    1146        *pErrorCode=U_TRUNCATED_CHAR_FOUND;
    1147    }
    1148 
    1149    /* set the converter state back into UConverter */
            /* packing: bit 24 = inDirectMode, bits 23..16 = base64Counter (as uint8_t), bits 15..0 = bits */
    1150    cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
    1151    cnv->toULength=byteIndex;
    1152 
    1153    /* write back the updated pointers */
    1154    pArgs->source=(const char *)source;
    1155    pArgs->target=target;
    1156    pArgs->offsets=offsets;
    1157 }
   1158 
    1159 static void U_CALLCONV
    1160 _IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
    1161                            UErrorCode *pErrorCode) {
            /*
             * Encodes char16_t units from pArgs->source as IMAP modified UTF-7 bytes at
             * pArgs->target. When pArgs->offsets is not nullptr, one source index is
             * written for every byte stored in the target.
             *
             * Units accepted by inSetDIMAP() are copied as single bytes, '&' becomes
             * "&" followed by MINUS, and every other unit is written as base64 between
             * '&' and MINUS.
             *
             * Bytes that belong to an already consumed unit but no longer fit into the
             * target are stored in cnv->charErrorBuffer and *pErrorCode is set to
             * U_BUFFER_OVERFLOW_ERROR.
             *
             * The encoder state (inDirectMode, base64Counter, bits) is loaded from
             * cnv->fromUnicodeStatus. If pArgs->flush is set and all input was consumed,
             * a pending base64 sequence is terminated and the state is reset to direct
             * mode; otherwise the current state is stored back. Bits 31..28 of
             * fromUnicodeStatus are preserved in both cases.
             * The updated source, target and offsets pointers are written back to pArgs.
             */
    1162    UConverter *cnv;
    1163    const char16_t *source, *sourceLimit;
    1164    uint8_t *target, *targetLimit;
    1165    int32_t *offsets;
    1166 
    1167    int32_t length, targetCapacity, sourceIndex;
    1168    char16_t c;
    1169    uint8_t b;
    1170 
    1171    /* UTF-7 state */
    1172    uint8_t bits;
    1173    int8_t base64Counter;
    1174    UBool inDirectMode;
    1175 
    1176    /* get the converter object */
    1177    cnv=pArgs->converter;
    1178 
    1179    /* set up the local pointers */
    1180    source=pArgs->source;
    1181    sourceLimit=pArgs->sourceLimit;
    1182    target=(uint8_t *)pArgs->target;
    1183    targetLimit=(uint8_t *)pArgs->targetLimit;
    1184    offsets=pArgs->offsets;
    1185 
    1186    /* get the state machine state */
    1187    {
    1188        uint32_t status=cnv->fromUnicodeStatus;
    1189        inDirectMode=(UBool)((status>>24)&1);
    1190        base64Counter=(int8_t)(status>>16);
    1191        bits=(uint8_t)status;
    1192    }
    1193 
    1194    /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
    1195    sourceIndex=0;
    1196 
    1197    if(inDirectMode) {
    1198 directMode:
                /*
                 * Every unit handled in this loop stores at least one byte in the target,
                 * so the loop runs over the smaller of the remaining input and output sizes;
                 * length>0 therefore guarantees room for the first byte of each unit.
                 */
    1199        length=(int32_t)(sourceLimit-source);
    1200        targetCapacity=(int32_t)(targetLimit-target);
    1201        if(length>targetCapacity) {
    1202            length=targetCapacity;
    1203        }
    1204        while(length>0) {
    1205            c=*source++;
    1206            /* encode 0x20..0x7e except '&' directly */
    1207            if(inSetDIMAP(c)) {
    1208                /* encode directly */
    1209                *target++=(uint8_t)c;
    1210                if(offsets!=nullptr) {
    1211                    *offsets++=sourceIndex++;
    1212                }
    1213            } else if(c==AMPERSAND) {
    1214                /* output &- for & */
    1215                *target++=AMPERSAND;
    1216                if(target<targetLimit) {
    1217                    *target++=MINUS;
    1218                    if(offsets!=nullptr) {
    1219                        *offsets++=sourceIndex;
    1220                        *offsets++=sourceIndex++;
    1221                    }
    1222                    /* realign length and targetCapacity */
    1223                    goto directMode;
    1224                } else {
                            /* no room for the MINUS: queue it in charErrorBuffer and report the overflow */
    1225                    if(offsets!=nullptr) {
    1226                        *offsets++=sourceIndex++;
    1227                    }
    1228                    cnv->charErrorBuffer[0]=MINUS;
    1229                    cnv->charErrorBufferLength=1;
    1230                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    1231                    break;
    1232                }
    1233            } else {
    1234                /* un-read this character and switch to Unicode Mode */
    1235                --source;
    1236                *target++=AMPERSAND;
    1237                if(offsets!=nullptr) {
    1238                    *offsets++=sourceIndex;
    1239                }
    1240                inDirectMode=false;
    1241                base64Counter=0;
    1242                goto unicodeMode;
    1243            }
    1244            --length;
    1245        }
    1246        if(source<sourceLimit && target>=targetLimit) {
    1247            /* target is full */
    1248            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    1249        }
    1250    } else {
    1251 unicodeMode:
    1252        while(source<sourceLimit) {
    1253            if(target<targetLimit) {
    1254                c=*source++;
    1255                if(isLegalIMAP(c)) {
    1256                    /* encode directly */
    1257                    inDirectMode=true;
    1258 
    1259                    /* trick: back out this character to make this easier */
    1260                    --source;
    1261 
    1262                    /* terminate the base64 sequence */
    1263                    if(base64Counter!=0) {
    1264                        /* write remaining bits for the previous character */
                                /* room for this byte is guaranteed by the target<targetLimit test at the top of the loop */
    1265                        *target++=TO_BASE64_IMAP(bits);
    1266                        if(offsets!=nullptr) {
    1267                            *offsets++=sourceIndex-1;
    1268                        }
    1269                    }
    1270                    /* need to terminate with a minus */
    1271                    if(target<targetLimit) {
    1272                        *target++=MINUS;
    1273                        if(offsets!=nullptr) {
    1274                            *offsets++=sourceIndex-1;
    1275                        }
    1276                    } else {
    1277                        cnv->charErrorBuffer[0]=MINUS;
    1278                        cnv->charErrorBufferLength=1;
    1279                        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    1280                        break;
    1281                    }
    1282                    goto directMode;
    1283                } else {
    1284                    /*
    1285                     * base64 this character:
    1286                     * Output 2 or 3 base64 bytes for the remaining bits of the previous character
    1287                     * and the bits of this character, each implicitly in UTF-16BE.
    1288                     *
    1289                     * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
    1290                     * character to the next. The actual 2 or 4 bits are shifted to the left edge
    1291                     * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
    1292                     */
    1293                    switch(base64Counter) {
    1294                    case 0:
                                /* nothing pending: write bits 15..10 and 9..4 of c, keep bits 3..0 */
    1295                        b=(uint8_t)(c>>10);
    1296                        *target++=TO_BASE64_IMAP(b);
    1297                        if(target<targetLimit) {
    1298                            b=(uint8_t)((c>>4)&0x3f);
    1299                            *target++=TO_BASE64_IMAP(b);
    1300                            if(offsets!=nullptr) {
    1301                                *offsets++=sourceIndex;
    1302                                *offsets++=sourceIndex++;
    1303                            }
    1304                        } else {
    1305                            if(offsets!=nullptr) {
    1306                                *offsets++=sourceIndex++;
    1307                            }
    1308                            b=(uint8_t)((c>>4)&0x3f);
    1309                            cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
    1310                            cnv->charErrorBufferLength=1;
    1311                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    1312                        }
    1313                        bits=(uint8_t)((c&15)<<2);
    1314                        base64Counter=1;
    1315                        break;
    1316                    case 1:
                                /* 4 bits pending: combine them with bits 15..14 of c, write bits 13..8 and 7..2, keep bits 1..0 */
    1317                        b=(uint8_t)(bits|(c>>14));
    1318                        *target++=TO_BASE64_IMAP(b);
    1319                        if(target<targetLimit) {
    1320                            b=(uint8_t)((c>>8)&0x3f);
    1321                            *target++=TO_BASE64_IMAP(b);
    1322                            if(target<targetLimit) {
    1323                                b=(uint8_t)((c>>2)&0x3f);
    1324                                *target++=TO_BASE64_IMAP(b);
    1325                                if(offsets!=nullptr) {
    1326                                    *offsets++=sourceIndex;
    1327                                    *offsets++=sourceIndex;
    1328                                    *offsets++=sourceIndex++;
    1329                                }
    1330                            } else {
    1331                                if(offsets!=nullptr) {
    1332                                    *offsets++=sourceIndex;
    1333                                    *offsets++=sourceIndex++;
    1334                                }
    1335                                b=(uint8_t)((c>>2)&0x3f);
    1336                                cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
    1337                                cnv->charErrorBufferLength=1;
    1338                                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    1339                            }
    1340                        } else {
    1341                            if(offsets!=nullptr) {
    1342                                *offsets++=sourceIndex++;
    1343                            }
    1344                            b=(uint8_t)((c>>8)&0x3f);
    1345                            cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
    1346                            b=(uint8_t)((c>>2)&0x3f);
    1347                            cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
    1348                            cnv->charErrorBufferLength=2;
    1349                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    1350                        }
    1351                        bits=(uint8_t)((c&3)<<4);
    1352                        base64Counter=2;
    1353                        break;
    1354                    case 2:
                                /* 2 bits pending: combine them with bits 15..12 of c, write bits 11..6 and 5..0, nothing is kept */
    1355                        b=(uint8_t)(bits|(c>>12));
    1356                        *target++=TO_BASE64_IMAP(b);
    1357                        if(target<targetLimit) {
    1358                            b=(uint8_t)((c>>6)&0x3f);
    1359                            *target++=TO_BASE64_IMAP(b);
    1360                            if(target<targetLimit) {
    1361                                b=(uint8_t)(c&0x3f);
    1362                                *target++=TO_BASE64_IMAP(b);
    1363                                if(offsets!=nullptr) {
    1364                                    *offsets++=sourceIndex;
    1365                                    *offsets++=sourceIndex;
    1366                                    *offsets++=sourceIndex++;
    1367                                }
    1368                            } else {
    1369                                if(offsets!=nullptr) {
    1370                                    *offsets++=sourceIndex;
    1371                                    *offsets++=sourceIndex++;
    1372                                }
    1373                                b=(uint8_t)(c&0x3f);
    1374                                cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
    1375                                cnv->charErrorBufferLength=1;
    1376                                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    1377                            }
    1378                        } else {
    1379                            if(offsets!=nullptr) {
    1380                                *offsets++=sourceIndex++;
    1381                            }
    1382                            b=(uint8_t)((c>>6)&0x3f);
    1383                            cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
    1384                            b=(uint8_t)(c&0x3f);
    1385                            cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
    1386                            cnv->charErrorBufferLength=2;
    1387                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    1388                        }
    1389                        bits=0;
    1390                        base64Counter=0;
    1391                        break;
    1392                    default:
    1393                        /* will never occur */
    1394                        break;
    1395                    }
    1396                }
    1397            } else {
    1398                /* target is full */
    1399                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    1400                break;
    1401            }
    1402        }
    1403    }
    1404 
    1405    if(pArgs->flush && source>=sourceLimit) {
    1406        /* flush remaining bits to the target */
    1407        if(!inDirectMode) {
    1408            if(base64Counter!=0) {
    1409                if(target<targetLimit) {
    1410                    *target++=TO_BASE64_IMAP(bits);
    1411                    if(offsets!=nullptr) {
    1412                        *offsets++=sourceIndex-1;
    1413                    }
    1414                } else {
                            /* append at the current charErrorBufferLength instead of overwriting index 0 */
    1415                    cnv->charErrorBuffer[cnv->charErrorBufferLength++]=TO_BASE64_IMAP(bits);
    1416                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    1417                }
    1418            }
    1419            /* need to terminate with a minus */
    1420            if(target<targetLimit) {
    1421                *target++=MINUS;
    1422                if(offsets!=nullptr) {
    1423                    *offsets++=sourceIndex-1;
    1424                }
    1425            } else {
    1426                cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
    1427                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    1428            }
    1429        }
    1430        /* reset the state for the next conversion */
    1431        cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=true */
    1432    } else {
    1433        /* set the converter state back into UConverter */
                /* packing: bit 24 = inDirectMode, bits 23..16 = base64Counter, low bits = bits */
    1434        cnv->fromUnicodeStatus=
    1435            (cnv->fromUnicodeStatus&0xf0000000)|    /* keep version*/
    1436            ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
    1437    }
    1438 
    1439    /* write back the updated pointers */
    1440    pArgs->source=source;
    1441    pArgs->target=(char *)target;
    1442    pArgs->offsets=offsets;
    1443 }
   1444 U_CDECL_END
   1445 
   1446 static const UConverterImpl _IMAPImpl={ /* function table for the IMAP-mailbox-name converter; every entry is positional */
   1447    UCNV_IMAP_MAILBOX, /* converter type tag; the same constant appears in _IMAPStaticData */
   1448    /* NOTE(review): the slot names used in the comments below are presumed from the UConverterImpl declaration, which is not visible in this part of the file - confirm against the header */
   1449    nullptr, /* presumably load: not provided */
   1450    nullptr, /* presumably unload: not provided */
   1451    /* lifecycle entries: the two non-null ones reuse the _UTF7* functions */
   1452    _UTF7Open,
   1453    nullptr, /* presumably close: not provided */
   1454    _UTF7Reset,
   1455    /* conversion entries: each direction passes the same WithOffsets routine twice */
   1456    _IMAPToUnicodeWithOffsets, /* presumably toUnicode */
   1457    _IMAPToUnicodeWithOffsets, /* presumably toUnicodeWithOffsets */
   1458    _IMAPFromUnicodeWithOffsets, /* presumably fromUnicode */
   1459    _IMAPFromUnicodeWithOffsets, /* presumably fromUnicodeWithOffsets */
   1460    nullptr, /* presumably getNextUChar: not provided */
   1461    /* remaining optional entries: all null except the Unicode-set getter */
   1462    nullptr,
   1463    nullptr,
   1464    nullptr, /* we don't need writeSub() because we never call a callback at fromUnicode() */
   1465    nullptr,
   1466    ucnv_getCompleteUnicodeSet, /* Unicode-set entry uses the shared ucnv_getCompleteUnicodeSet function */
   1467    nullptr,
   1468    nullptr
   1469 };
   1470 
   1471 static const UConverterStaticData _IMAPStaticData={ /* static descriptor for the IMAP-mailbox-name converter; every entry is positional */
   1472    sizeof(UConverterStaticData), /* size of this structure type */
   1473    "IMAP-mailbox-name", /* converter name string */
   1474    0, /* TODO CCSID for IMAP-mailbox-name */
   1475    UCNV_IBM, UCNV_IMAP_MAILBOX, /* presumably platform, then conversion type (same tag as in _IMAPImpl) - confirm against UConverterStaticData */
   1476    1, 4, /* presumably minimum and maximum bytes per character - confirm against UConverterStaticData */
   1477    { 0x3f, 0, 0, 0 }, 1, /* substitution bytes and their length (0x3f is ASCII '?'); the subchar is not used */
   1478    false, false, /* presumably the to-Unicode and from-Unicode fallback flags - confirm */
   1479    0, /* NOTE(review): meaning of this field is not visible here - confirm against the struct declaration */
   1480    0, /* NOTE(review): meaning of this field is not visible here - confirm against the struct declaration */
   1481    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
   1482 };
   1483 
   1484 const UConverterSharedData _IMAPData= /* shared-data object for the IMAP-mailbox-name converter (not declared static here) */
   1485        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_IMAPStaticData, &_IMAPImpl); /* built from the static descriptor and the function table defined above */
   1486 
   1487 #endif
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE