tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

utf8.c (13228B)


      1 /* This Source Code Form is subject to the terms of the Mozilla Public
      2 * License, v. 2.0. If a copy of the MPL was not distributed with this
      3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      4 
      5 #include "seccomon.h"
      6 #include "secport.h"
      7 
      8 /*
      9 * From RFC 2044:
     10 *
     11 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
     12 * 0000 0000-0000 007F   0xxxxxxx
     13 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
     14 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
     15 * 0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     16 * 0020 0000-03FF FFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     17 * 0400 0000-7FFF FFFF   1111110x 10xxxxxx ... 10xxxxxx
     18 */
     19 
     20 /*
     21 * From http://www.imc.org/draft-hoffman-utf16
     22 *
     23 * For U on [0x00010000,0x0010FFFF]:  Let U' = U - 0x00010000
     24 *
     25 * U' = yyyyyyyyyyxxxxxxxxxx
     26 * W1 = 110110yyyyyyyyyy
     27 * W2 = 110111xxxxxxxxxx
     28 */
     29 
     30 /*
     31 * This code is assuming NETWORK BYTE ORDER for the 16- and 32-bit
     32 * character values.  If you wish to use this code for working with
     33 * host byte order values, define the following:
     34 *
     35 * #if IS_BIG_ENDIAN
     36 * #define L_0 0
     37 * #define L_1 1
     38 * #define L_2 2
     39 * #define L_3 3
     40 * #define H_0 0
     41 * #define H_1 1
     42 * #else / * not everyone has elif * /
     43 * #if IS_LITTLE_ENDIAN
     44 * #define L_0 3
     45 * #define L_1 2
     46 * #define L_2 1
     47 * #define L_3 0
     48 * #define H_0 1
     49 * #define H_1 0
     50 * #else
     51 * #error "PDP and NUXI support deferred"
     52 * #endif / * IS_LITTLE_ENDIAN * /
     53 * #endif / * IS_BIG_ENDIAN * /
     54 */
     55 
     56 #define L_0 0 /* UCS-4 unit, byte 0: most significant byte (must be 0x00 for valid input) */
     57 #define L_1 1 /* UCS-4 unit, byte 1: bits 16..23 of the code point */
     58 #define L_2 2 /* UCS-4 unit, byte 2: bits 8..15 of the code point */
     59 #define L_3 3 /* UCS-4 unit, byte 3: least significant byte */
     60 #define H_0 0 /* UCS-2 unit, byte 0: high byte */
     61 #define H_1 1 /* UCS-2 unit, byte 1: low byte */
     62 
     63 #define BAD_UTF8 ((PRUint32)-1) /* failure sentinel from sec_port_read_utf8; all-ones, so above the 0x10FFFF maximum of any valid result */
     64 
     65 /*
     66 * Parse a single UTF-8 character per the spec. in section 3.9 (D36)
     67 * of Unicode 4.0.0.
     68 *
     69 * Parameters:
     70 * index - Points to the byte offset in inBuf of character to read.  On success,
     71 *         updated to the offset of the following character.
     72 * inBuf - Input buffer, UTF-8 encoded
     73 * inbufLen - Length of input buffer, in bytes.
     74 *
     75 * Returns:
     76 * Success - The UCS4 encoded character
     77 * Failure - BAD_UTF8
     78 */
     79 static PRUint32
     80 sec_port_read_utf8(unsigned int *index, unsigned char *inBuf, unsigned int inBufLen)
     81 {
     82    PRUint32 result;
     83    unsigned int i = *index;
     84    int bytes_left;
     85    PRUint32 min_value;
     86 
     87    PORT_Assert(i < inBufLen);
     88 
     89    if ((inBuf[i] & 0x80) == 0x00) {
     90        result = inBuf[i++];
     91        bytes_left = 0;
     92        min_value = 0;
     93    } else if ((inBuf[i] & 0xE0) == 0xC0) {
     94        result = inBuf[i++] & 0x1F;
     95        bytes_left = 1;
     96        min_value = 0x80;
     97    } else if ((inBuf[i] & 0xF0) == 0xE0) {
     98        result = inBuf[i++] & 0x0F;
     99        bytes_left = 2;
    100        min_value = 0x800;
    101    } else if ((inBuf[i] & 0xF8) == 0xF0) {
    102        result = inBuf[i++] & 0x07;
    103        bytes_left = 3;
    104        min_value = 0x10000;
    105    } else {
    106        return BAD_UTF8;
    107    }
    108 
    109    while (bytes_left--) {
    110        if (i >= inBufLen || (inBuf[i] & 0xC0) != 0x80)
    111            return BAD_UTF8;
    112        result = (result << 6) | (inBuf[i++] & 0x3F);
    113    }
    114 
    115    /* Check for overlong sequences, surrogates, and outside unicode range */
    116    if (result < min_value || (result & 0xFFFFF800) == 0xD800 || result > 0x10FFFF) {
    117        return BAD_UTF8;
    118    }
    119 
    120    *index = i;
    121    return result;
    122 }
    123 
    124 PRBool
    125 sec_port_ucs4_utf8_conversion_function(
    126    PRBool toUnicode,
    127    unsigned char *inBuf,
    128    unsigned int inBufLen,
    129    unsigned char *outBuf,
    130    unsigned int maxOutBufLen,
    131    unsigned int *outBufLen)
    132 {
    133    PORT_Assert((unsigned int *)NULL != outBufLen);
    134 
    135    if (toUnicode) {
    136        unsigned int i, len = 0;
    137 
    138        for (i = 0; i < inBufLen;) {
    139            if ((inBuf[i] & 0x80) == 0x00)
    140                i += 1;
    141            else if ((inBuf[i] & 0xE0) == 0xC0)
    142                i += 2;
    143            else if ((inBuf[i] & 0xF0) == 0xE0)
    144                i += 3;
    145            else if ((inBuf[i] & 0xF8) == 0xF0)
    146                i += 4;
    147            else
    148                return PR_FALSE;
    149 
    150            len += 4;
    151        }
    152 
    153        if (len > maxOutBufLen) {
    154            *outBufLen = len;
    155            return PR_FALSE;
    156        }
    157 
    158        len = 0;
    159 
    160        for (i = 0; i < inBufLen;) {
    161            PRUint32 ucs4 = sec_port_read_utf8(&i, inBuf, inBufLen);
    162 
    163            if (ucs4 == BAD_UTF8)
    164                return PR_FALSE;
    165 
    166            outBuf[len + L_0] = 0x00;
    167            outBuf[len + L_1] = (unsigned char)(ucs4 >> 16);
    168            outBuf[len + L_2] = (unsigned char)(ucs4 >> 8);
    169            outBuf[len + L_3] = (unsigned char)ucs4;
    170 
    171            len += 4;
    172        }
    173 
    174        *outBufLen = len;
    175        return PR_TRUE;
    176    } else {
    177        unsigned int i, len = 0;
    178        PORT_Assert((inBufLen % 4) == 0);
    179        if ((inBufLen % 4) != 0) {
    180            *outBufLen = 0;
    181            return PR_FALSE;
    182        }
    183 
    184        for (i = 0; i < inBufLen; i += 4) {
    185            if ((inBuf[i + L_0] > 0x00) || (inBuf[i + L_1] > 0x10)) {
    186                *outBufLen = 0;
    187                return PR_FALSE;
    188            } else if (inBuf[i + L_1] >= 0x01)
    189                len += 4;
    190            else if (inBuf[i + L_2] >= 0x08)
    191                len += 3;
    192            else if ((inBuf[i + L_2] > 0x00) || (inBuf[i + L_3] >= 0x80))
    193                len += 2;
    194            else
    195                len += 1;
    196        }
    197 
    198        if (len > maxOutBufLen) {
    199            *outBufLen = len;
    200            return PR_FALSE;
    201        }
    202 
    203        len = 0;
    204 
    205        for (i = 0; i < inBufLen; i += 4) {
    206            if (inBuf[i + L_1] >= 0x01) {
    207                /* 0001 0000-001F FFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    208                /* 00000000 000abcde fghijklm nopqrstu ->
    209                   11110abc 10defghi 10jklmno 10pqrstu */
    210 
    211                outBuf[len + 0] = 0xF0 | ((inBuf[i + L_1] & 0x1C) >> 2);
    212                outBuf[len + 1] = 0x80 | ((inBuf[i + L_1] & 0x03) << 4) | ((inBuf[i + L_2] & 0xF0) >> 4);
    213                outBuf[len + 2] = 0x80 | ((inBuf[i + L_2] & 0x0F) << 2) | ((inBuf[i + L_3] & 0xC0) >> 6);
    214                outBuf[len + 3] = 0x80 | ((inBuf[i + L_3] & 0x3F) >> 0);
    215 
    216                len += 4;
    217            } else if (inBuf[i + L_2] >= 0x08) {
    218                /* 0000 0800-0000 FFFF -> 1110xxxx 10xxxxxx 10xxxxxx */
    219                /* 00000000 00000000 abcdefgh ijklmnop ->
    220                   1110abcd 10efghij 10klmnop */
    221 
    222                outBuf[len + 0] = 0xE0 | ((inBuf[i + L_2] & 0xF0) >> 4);
    223                outBuf[len + 1] = 0x80 | ((inBuf[i + L_2] & 0x0F) << 2) | ((inBuf[i + L_3] & 0xC0) >> 6);
    224                outBuf[len + 2] = 0x80 | ((inBuf[i + L_3] & 0x3F) >> 0);
    225 
    226                len += 3;
    227            } else if ((inBuf[i + L_2] > 0x00) || (inBuf[i + L_3] >= 0x80)) {
    228                /* 0000 0080-0000 07FF -> 110xxxxx 10xxxxxx */
    229                /* 00000000 00000000 00000abc defghijk ->
    230                   110abcde 10fghijk */
    231 
    232                outBuf[len + 0] = 0xC0 | ((inBuf[i + L_2] & 0x07) << 2) | ((inBuf[i + L_3] & 0xC0) >> 6);
    233                outBuf[len + 1] = 0x80 | ((inBuf[i + L_3] & 0x3F) >> 0);
    234 
    235                len += 2;
    236            } else {
    237                /* 0000 0000-0000 007F -> 0xxxxxx */
    238                /* 00000000 00000000 00000000 0abcdefg ->
    239                   0abcdefg */
    240 
    241                outBuf[len + 0] = (inBuf[i + L_3] & 0x7F);
    242 
    243                len += 1;
    244            }
    245        }
    246 
    247        *outBufLen = len;
    248        return PR_TRUE;
    249    }
    250 }
    251 
    252 PRBool
    253 sec_port_ucs2_utf8_conversion_function(
    254    PRBool toUnicode,
    255    unsigned char *inBuf,
    256    unsigned int inBufLen,
    257    unsigned char *outBuf,
    258    unsigned int maxOutBufLen,
    259    unsigned int *outBufLen)
    260 {
    261    PORT_Assert((unsigned int *)NULL != outBufLen);
    262 
    263    if (toUnicode) {
    264        unsigned int i, len = 0;
    265 
    266        for (i = 0; i < inBufLen;) {
    267            if ((inBuf[i] & 0x80) == 0x00) {
    268                i += 1;
    269                len += 2;
    270            } else if ((inBuf[i] & 0xE0) == 0xC0) {
    271                i += 2;
    272                len += 2;
    273            } else if ((inBuf[i] & 0xF0) == 0xE0) {
    274                i += 3;
    275                len += 2;
    276            } else if ((inBuf[i] & 0xF8) == 0xF0) {
    277                i += 4;
    278                len += 4;
    279            } else
    280                return PR_FALSE;
    281        }
    282 
    283        if (len > maxOutBufLen) {
    284            *outBufLen = len;
    285            return PR_FALSE;
    286        }
    287 
    288        len = 0;
    289 
    290        for (i = 0; i < inBufLen;) {
    291            PRUint32 ucs4 = sec_port_read_utf8(&i, inBuf, inBufLen);
    292 
    293            if (ucs4 == BAD_UTF8)
    294                return PR_FALSE;
    295 
    296            if (ucs4 < 0x10000) {
    297                outBuf[len + H_0] = (unsigned char)(ucs4 >> 8);
    298                outBuf[len + H_1] = (unsigned char)ucs4;
    299                len += 2;
    300            } else {
    301                ucs4 -= 0x10000;
    302                outBuf[len + 0 + H_0] = (unsigned char)(0xD8 | ((ucs4 >> 18) & 0x3));
    303                outBuf[len + 0 + H_1] = (unsigned char)(ucs4 >> 10);
    304                outBuf[len + 2 + H_0] = (unsigned char)(0xDC | ((ucs4 >> 8) & 0x3));
    305                outBuf[len + 2 + H_1] = (unsigned char)ucs4;
    306                len += 4;
    307            }
    308        }
    309 
    310        *outBufLen = len;
    311        return PR_TRUE;
    312    } else {
    313        unsigned int i, len = 0;
    314        PORT_Assert((inBufLen % 2) == 0);
    315        if ((inBufLen % 2) != 0) {
    316            *outBufLen = 0;
    317            return PR_FALSE;
    318        }
    319 
    320        for (i = 0; i < inBufLen; i += 2) {
    321            if ((inBuf[i + H_0] == 0x00) && ((inBuf[i + H_1] & 0x80) == 0x00))
    322                len += 1;
    323            else if (inBuf[i + H_0] < 0x08)
    324                len += 2;
    325            else if (((inBuf[i + H_0] & 0xFC) == 0xD8)) {
    326                if (((inBufLen - i) > 2) && ((inBuf[i + 2 + H_0] & 0xFC) == 0xDC)) {
    327                    i += 2;
    328                    len += 4;
    329                } else {
    330                    return PR_FALSE;
    331                }
    332            } else if ((inBuf[i + H_0] & 0xFC) == 0xDC) {
    333                return PR_FALSE;
    334            } else {
    335                len += 3;
    336            }
    337        }
    338 
    339        if (len > maxOutBufLen) {
    340            *outBufLen = len;
    341            return PR_FALSE;
    342        }
    343 
    344        len = 0;
    345 
    346        for (i = 0; i < inBufLen; i += 2) {
    347            if ((inBuf[i + H_0] == 0x00) && ((inBuf[i + H_1] & 0x80) == 0x00)) {
    348                /* 0000-007F -> 0xxxxxx */
    349                /* 00000000 0abcdefg -> 0abcdefg */
    350 
    351                outBuf[len] = inBuf[i + H_1] & 0x7F;
    352 
    353                len += 1;
    354            } else if (inBuf[i + H_0] < 0x08) {
    355                /* 0080-07FF -> 110xxxxx 10xxxxxx */
    356                /* 00000abc defghijk -> 110abcde 10fghijk */
    357 
    358                outBuf[len + 0] = 0xC0 | ((inBuf[i + H_0] & 0x07) << 2) | ((inBuf[i + H_1] & 0xC0) >> 6);
    359                outBuf[len + 1] = 0x80 | ((inBuf[i + H_1] & 0x3F) >> 0);
    360 
    361                len += 2;
    362            } else if ((inBuf[i + H_0] & 0xFC) == 0xD8) {
    363                int abcde, BCDE;
    364 
    365                PORT_Assert(((inBufLen - i) > 2) && ((inBuf[i + 2 + H_0] & 0xFC) == 0xDC));
    366 
    367                /* D800-DBFF DC00-DFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    368                /* 110110BC DEfghijk 110111lm nopqrstu ->
    369                   { Let abcde = BCDE + 1 }
    370                   11110abc 10defghi 10jklmno 10pqrstu */
    371 
    372                BCDE = ((inBuf[i + H_0] & 0x03) << 2) | ((inBuf[i + H_1] & 0xC0) >> 6);
    373                abcde = BCDE + 1;
    374 
    375                outBuf[len + 0] = 0xF0 | ((abcde & 0x1C) >> 2);
    376                outBuf[len + 1] = 0x80 | ((abcde & 0x03) << 4) | ((inBuf[i + 0 + H_1] & 0x3C) >> 2);
    377                outBuf[len + 2] = 0x80 | ((inBuf[i + 0 + H_1] & 0x03) << 4) | ((inBuf[i + 2 + H_0] & 0x03) << 2) | ((inBuf[i + 2 + H_1] & 0xC0) >> 6);
    378                outBuf[len + 3] = 0x80 | ((inBuf[i + 2 + H_1] & 0x3F) >> 0);
    379 
    380                i += 2;
    381                len += 4;
    382            } else {
    383                /* 0800-FFFF -> 1110xxxx 10xxxxxx 10xxxxxx */
    384                /* abcdefgh ijklmnop -> 1110abcd 10efghij 10klmnop */
    385 
    386                outBuf[len + 0] = 0xE0 | ((inBuf[i + H_0] & 0xF0) >> 4);
    387                outBuf[len + 1] = 0x80 | ((inBuf[i + H_0] & 0x0F) << 2) | ((inBuf[i + H_1] & 0xC0) >> 6);
    388                outBuf[len + 2] = 0x80 | ((inBuf[i + H_1] & 0x3F) >> 0);
    389 
    390                len += 3;
    391            }
    392        }
    393 
    394        *outBufLen = len;
    395        return PR_TRUE;
    396    }
    397 }
    398 
    399 PRBool
    400 sec_port_iso88591_utf8_conversion_function(
    401    const unsigned char *inBuf,
    402    unsigned int inBufLen,
    403    unsigned char *outBuf,
    404    unsigned int maxOutBufLen,
    405    unsigned int *outBufLen)
    406 {
    407    unsigned int i, len = 0;
    408 
    409    PORT_Assert((unsigned int *)NULL != outBufLen);
    410 
    411    for (i = 0; i < inBufLen; i++) {
    412        if ((inBuf[i] & 0x80) == 0x00)
    413            len += 1;
    414        else
    415            len += 2;
    416    }
    417 
    418    if (len > maxOutBufLen) {
    419        *outBufLen = len;
    420        return PR_FALSE;
    421    }
    422 
    423    len = 0;
    424 
    425    for (i = 0; i < inBufLen; i++) {
    426        if ((inBuf[i] & 0x80) == 0x00) {
    427            /* 00-7F -> 0xxxxxxx */
    428            /* 0abcdefg -> 0abcdefg */
    429 
    430            outBuf[len] = inBuf[i];
    431            len += 1;
    432        } else {
    433            /* 80-FF <- 110xxxxx 10xxxxxx */
    434            /* 00000000 abcdefgh -> 110000ab 10cdefgh */
    435 
    436            outBuf[len + 0] = 0xC0 | ((inBuf[i] & 0xC0) >> 6);
    437            outBuf[len + 1] = 0x80 | ((inBuf[i] & 0x3F) >> 0);
    438 
    439            len += 2;
    440        }
    441    }
    442 
    443    *outBufLen = len;
    444    return PR_TRUE;
    445 }