tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

punycode.cpp (18119B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 2002-2011, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *   file name:  punycode.cpp
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2002jan31
     16 *   created by: Markus W. Scherer
     17 */
     18 
     19 
     20 /* This ICU code derived from: */
     21 /*
     22 punycode.c 0.4.0 (2001-Nov-17-Sat)
     23 http://www.cs.berkeley.edu/~amc/idn/
     24 Adam M. Costello
     25 http://www.nicemice.net/amc/
     26 
     27 Disclaimer and license
     28 
     29    Regarding this entire document or any portion of it (including
     30    the pseudocode and C code), the author makes no guarantees and
     31    is not responsible for any damage resulting from its use.  The
     32    author grants irrevocable permission to anyone to use, modify,
     33    and distribute it in any way that does not diminish the rights
     34    of anyone else to use, modify, and distribute it, provided that
     35    redistributed derivative works do not contain misleading author or
     36    version information.  Derivative works need not be licensed under
     37    similar terms.
     38 */
     39 /*
     40 * ICU modifications:
     41 * - ICU data types and coding conventions
     42 * - ICU string buffer handling with implicit source lengths
     43 *   and destination preflighting
     44 * - UTF-16 handling
     45 */
     46 
     47 #include "unicode/utypes.h"
     48 
     49 #if !UCONFIG_NO_IDNA
     50 
     51 #include "unicode/ustring.h"
     52 #include "unicode/utf.h"
     53 #include "unicode/utf16.h"
     54 #include "ustr_imp.h"
     55 #include "cstring.h"
     56 #include "cmemory.h"
     57 #include "punycode.h"
     58 #include "uassert.h"
     59 
     60 
     61 /* Punycode ----------------------------------------------------------------- */
     62 
     63 /* Punycode parameters for Bootstring */
     64 #define BASE            36
     65 #define TMIN            1
     66 #define TMAX            26
     67 #define SKEW            38
     68 #define DAMP            700
     69 #define INITIAL_BIAS    72
     70 #define INITIAL_N       0x80
     71 
     72 /* "Basic" Unicode/ASCII code points */
     73 #define _HYPHEN         0X2d
     74 #define DELIMITER       _HYPHEN
     75 
     76 #define _ZERO_          0X30
     77 #define _NINE           0x39
     78 
     79 #define _SMALL_A        0X61
     80 #define _SMALL_Z        0X7a
     81 
     82 #define _CAPITAL_A      0X41
     83 #define _CAPITAL_Z      0X5a
     84 
     85 #define IS_BASIC(c) ((c)<0x80)
     86 #define IS_BASIC_UPPERCASE(c) (_CAPITAL_A<=(c) && (c)<=_CAPITAL_Z)
     87 
     88 /**
     89 * digitToBasic() returns the basic code point whose value
     90 * (when used for representing integers) is d, which must be in the
     91 * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
     92 * nonzero, in which case the uppercase form is used.
     93 */
     94 static inline char
     95 digitToBasic(int32_t digit, UBool uppercase) {
     96    /*  0..25 map to ASCII a..z or A..Z */
     97    /* 26..35 map to ASCII 0..9         */
     98    if(digit<26) {
     99        if(uppercase) {
    100            return static_cast<char>(_CAPITAL_A + digit);
    101        } else {
    102            return static_cast<char>(_SMALL_A + digit);
    103        }
    104    } else {
    105        return static_cast<char>((_ZERO_ - 26) + digit);
    106    }
    107 }
    108 
    109 /**
    110 * @return the numeric value of a basic code point (for use in representing integers)
    111 *         in the range 0 to BASE-1, or a negative value if cp is invalid.
    112 */
    113 static int32_t decodeDigit(int32_t cp) {
    114    if(cp<=u'Z') {
    115        if(cp<=u'9') {
    116            if(cp<u'0') {
    117                return -1;
    118            } else {
    119                return cp-u'0'+26;  // 0..9 -> 26..35
    120            }
    121        } else {
    122            return cp-u'A';  // A-Z -> 0..25
    123        }
    124    } else if(cp<=u'z') {
    125        return cp-'a';  // a..z -> 0..25
    126    } else {
    127        return -1;
    128    }
    129 }
    130 
    131 static inline char
    132 asciiCaseMap(char b, UBool uppercase) {
    133    if(uppercase) {
    134        if(_SMALL_A<=b && b<=_SMALL_Z) {
    135            b-=(_SMALL_A-_CAPITAL_A);
    136        }
    137    } else {
    138        if(_CAPITAL_A<=b && b<=_CAPITAL_Z) {
    139            b+=(_SMALL_A-_CAPITAL_A);
    140        }
    141    }
    142    return b;
    143 }
    144 
    145 /* Punycode-specific Bootstring code ---------------------------------------- */
    146 
    147 /*
    148 * The following code omits the {parts} of the pseudo-algorithm in the spec
    149 * that are not used with the Punycode parameter set.
    150 */
    151 
    152 /* Bias adaptation function. */
    153 static int32_t
    154 adaptBias(int32_t delta, int32_t length, UBool firstTime) {
    155    int32_t count;
    156 
    157    if(firstTime) {
    158        delta/=DAMP;
    159    } else {
    160        delta/=2;
    161    }
    162 
    163    delta+=delta/length;
    164    for(count=0; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {
    165        delta/=(BASE-TMIN);
    166    }
    167 
    168    return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
    169 }
    170 
    171 namespace {
    172 
    173 // ICU-13727: Limit input length for n^2 algorithm
    174 // where well-formed strings are at most 59 characters long.
    175 constexpr int32_t ENCODE_MAX_CODE_UNITS=1000;
    176 constexpr int32_t DECODE_MAX_CHARS=2000;
    177 
    178 }  // namespace
    179 
    180 // encode
    181 U_CAPI int32_t
    182 u_strToPunycode(const char16_t *src, int32_t srcLength,
    183                char16_t *dest, int32_t destCapacity,
    184                const UBool *caseFlags,
    185                UErrorCode *pErrorCode) {
    186 
    187    int32_t cpBuffer[ENCODE_MAX_CODE_UNITS];
    188    int32_t n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
    189    char16_t c, c2;
    190 
    191    /* argument checking */
    192    if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
    193        return 0;
    194    }
    195 
    196    if(src==nullptr || srcLength<-1 || destCapacity<0 || (dest==nullptr && destCapacity!=0)) {
    197        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    198        return 0;
    199    }
    200    if (srcLength>ENCODE_MAX_CODE_UNITS) {
    201        *pErrorCode=U_INPUT_TOO_LONG_ERROR;
    202        return 0;
    203    }
    204 
    205    /*
    206     * Handle the basic code points and
    207     * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
    208     */
    209    srcCPCount=destLength=0;
    210    if(srcLength==-1) {
    211        /* NUL-terminated input */
    212        for(j=0; /* no condition */; ++j) {
    213            if((c=src[j])==0) {
    214                break;
    215            }
    216            if(j>=ENCODE_MAX_CODE_UNITS) {
    217                *pErrorCode=U_INPUT_TOO_LONG_ERROR;
    218                return 0;
    219            }
    220            if(IS_BASIC(c)) {
    221                cpBuffer[srcCPCount++]=0;
    222                if(destLength<destCapacity) {
    223                    dest[destLength]=
    224                        caseFlags!=nullptr ?
    225                            asciiCaseMap((char)c, caseFlags[j]) :
    226                            (char)c;
    227                }
    228                ++destLength;
    229            } else {
    230                n=(caseFlags!=nullptr && caseFlags[j])<<31L;
    231                if(U16_IS_SINGLE(c)) {
    232                    n|=c;
    233                } else if(U16_IS_LEAD(c) && U16_IS_TRAIL(c2=src[j+1])) {
    234                    ++j;
    235                    n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
    236                } else {
    237                    /* error: unmatched surrogate */
    238                    *pErrorCode=U_INVALID_CHAR_FOUND;
    239                    return 0;
    240                }
    241                cpBuffer[srcCPCount++]=n;
    242            }
    243        }
    244    } else {
    245        /* length-specified input */
    246        for(j=0; j<srcLength; ++j) {
    247            c=src[j];
    248            if(IS_BASIC(c)) {
    249                cpBuffer[srcCPCount++]=0;
    250                if(destLength<destCapacity) {
    251                    dest[destLength]=
    252                        caseFlags!=nullptr ?
    253                            asciiCaseMap((char)c, caseFlags[j]) :
    254                            (char)c;
    255                }
    256                ++destLength;
    257            } else {
    258                n=(caseFlags!=nullptr && caseFlags[j])<<31L;
    259                if(U16_IS_SINGLE(c)) {
    260                    n|=c;
    261                } else if(U16_IS_LEAD(c) && (j+1)<srcLength && U16_IS_TRAIL(c2=src[j+1])) {
    262                    ++j;
    263                    n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
    264                } else {
    265                    /* error: unmatched surrogate */
    266                    *pErrorCode=U_INVALID_CHAR_FOUND;
    267                    return 0;
    268                }
    269                cpBuffer[srcCPCount++]=n;
    270            }
    271        }
    272    }
    273 
    274    /* Finish the basic string - if it is not empty - with a delimiter. */
    275    basicLength=destLength;
    276    if(basicLength>0) {
    277        if(destLength<destCapacity) {
    278            dest[destLength]=DELIMITER;
    279        }
    280        ++destLength;
    281    }
    282 
    283    /*
    284     * handledCPCount is the number of code points that have been handled
    285     * basicLength is the number of basic code points
    286     * destLength is the number of chars that have been output
    287     */
    288 
    289    /* Initialize the state: */
    290    n=INITIAL_N;
    291    delta=0;
    292    bias=INITIAL_BIAS;
    293 
    294    /* Main encoding loop: */
    295    for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
    296        /*
    297         * All non-basic code points < n have been handled already.
    298         * Find the next larger one:
    299         */
    300        for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
    301            q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
    302            if(n<=q && q<m) {
    303                m=q;
    304            }
    305        }
    306 
    307        /*
    308         * Increase delta enough to advance the decoder's
    309         * <n,i> state to <m,0>, but guard against overflow:
    310         */
    311        if(m-n>(0x7fffffff-handledCPCount-delta)/(handledCPCount+1)) {
    312            *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
    313            return 0;
    314        }
    315        delta+=(m-n)*(handledCPCount+1);
    316        n=m;
    317 
    318        /* Encode a sequence of same code points n */
    319        for(j=0; j<srcCPCount; ++j) {
    320            q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
    321            if(q<n) {
    322                ++delta;
    323            } else if(q==n) {
    324                /* Represent delta as a generalized variable-length integer: */
    325                for(q=delta, k=BASE; /* no condition */; k+=BASE) {
    326 
    327                    /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
    328 
    329                    t=k-bias;
    330                    if(t<TMIN) {
    331                        t=TMIN;
    332                    } else if(t>TMAX) {
    333                        t=TMAX;
    334                    }
    335                    */
    336 
    337                    t=k-bias;
    338                    if(t<TMIN) {
    339                        t=TMIN;
    340                    } else if(k>=(bias+TMAX)) {
    341                        t=TMAX;
    342                    }
    343 
    344                    if(q<t) {
    345                        break;
    346                    }
    347 
    348                    if(destLength<destCapacity) {
    349                        dest[destLength]=digitToBasic(t+(q-t)%(BASE-t), 0);
    350                    }
    351                    ++destLength;
    352                    q=(q-t)/(BASE-t);
    353                }
    354 
    355                if(destLength<destCapacity) {
    356                    dest[destLength] = digitToBasic(q, cpBuffer[j] < 0);
    357                }
    358                ++destLength;
    359                bias = adaptBias(delta, handledCPCount + 1, handledCPCount == basicLength);
    360                delta=0;
    361                ++handledCPCount;
    362            }
    363        }
    364 
    365        ++delta;
    366        ++n;
    367    }
    368 
    369    return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
    370 }
    371 
    372 // decode
    373 U_CAPI int32_t
    374 u_strFromPunycode(const char16_t *src, int32_t srcLength,
    375                  char16_t *dest, int32_t destCapacity,
    376                  UBool *caseFlags,
    377                  UErrorCode *pErrorCode) {
    378    int32_t n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
    379            destCPCount, firstSupplementaryIndex, cpLength;
    380    char16_t b;
    381 
    382    /* argument checking */
    383    if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
    384        return 0;
    385    }
    386 
    387    if(src==nullptr || srcLength<-1 || (dest==nullptr && destCapacity!=0)) {
    388        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    389        return 0;
    390    }
    391 
    392    if(srcLength==-1) {
    393        srcLength=u_strlen(src);
    394    }
    395    if (srcLength>DECODE_MAX_CHARS) {
    396        *pErrorCode=U_INPUT_TOO_LONG_ERROR;
    397        return 0;
    398    }
    399 
    400    /*
    401     * Handle the basic code points:
    402     * Let basicLength be the number of input code points
    403     * before the last delimiter, or 0 if there is none,
    404     * then copy the first basicLength code points to the output.
    405     *
    406     * The two following loops iterate backward.
    407     */
    408    for(j=srcLength; j>0;) {
    409        if(src[--j]==DELIMITER) {
    410            break;
    411        }
    412    }
    413    destLength=basicLength=destCPCount=j;
    414    U_ASSERT(destLength>=0);
    415 
    416    while(j>0) {
    417        b=src[--j];
    418        if(!IS_BASIC(b)) {
    419            *pErrorCode=U_INVALID_CHAR_FOUND;
    420            return 0;
    421        }
    422 
    423        if(j<destCapacity) {
    424            dest[j] = b;
    425 
    426            if(caseFlags!=nullptr) {
    427                caseFlags[j]=IS_BASIC_UPPERCASE(b);
    428            }
    429        }
    430    }
    431 
    432    /* Initialize the state: */
    433    n=INITIAL_N;
    434    i=0;
    435    bias=INITIAL_BIAS;
    436    firstSupplementaryIndex=1000000000;
    437 
    438    /*
    439     * Main decoding loop:
    440     * Start just after the last delimiter if any
    441     * basic code points were copied; start at the beginning otherwise.
    442     */
    443    for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
    444        /*
    445         * in is the index of the next character to be consumed, and
    446         * destCPCount is the number of code points in the output array.
    447         *
    448         * Decode a generalized variable-length integer into delta,
    449         * which gets added to i.  The overflow checking is easier
    450         * if we increase i as we go, then subtract off its starting
    451         * value at the end to obtain delta.
    452         */
    453        for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
    454            if(in>=srcLength) {
    455                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    456                return 0;
    457            }
    458 
    459            digit=decodeDigit(src[in++]);
    460            if(digit<0) {
    461                *pErrorCode=U_INVALID_CHAR_FOUND;
    462                return 0;
    463            }
    464            if(digit>(0x7fffffff-i)/w) {
    465                /* integer overflow */
    466                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    467                return 0;
    468            }
    469 
    470            i+=digit*w;
    471            /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt  
    472            t=k-bias;
    473            if(t<TMIN) {
    474                t=TMIN;
    475            } else if(t>TMAX) {
    476                t=TMAX;
    477            }
    478            */
    479            t=k-bias;
    480            if(t<TMIN) {
    481                t=TMIN;
    482            } else if(k>=(bias+TMAX)) {
    483                t=TMAX;
    484            }
    485            if(digit<t) {
    486                break;
    487            }
    488 
    489            if(w>0x7fffffff/(BASE-t)) {
    490                /* integer overflow */
    491                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    492                return 0;
    493            }
    494            w*=BASE-t;
    495        }
    496 
    497        /*
    498         * Modification from sample code:
    499         * Increments destCPCount here,
    500         * where needed instead of in for() loop tail.
    501         */
    502        ++destCPCount;
    503        bias = adaptBias(i - oldi, destCPCount, oldi == 0);
    504 
    505        /*
    506         * i was supposed to wrap around from (incremented) destCPCount to 0,
    507         * incrementing n each time, so we'll fix that now:
    508         */
    509        if(i/destCPCount>(0x7fffffff-n)) {
    510            /* integer overflow */
    511            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    512            return 0;
    513        }
    514 
    515        n+=i/destCPCount;
    516        i%=destCPCount;
    517        /* not needed for Punycode: */
    518        /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
    519 
    520        if(n>0x10ffff || U_IS_SURROGATE(n)) {
    521            /* Unicode code point overflow */
    522            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    523            return 0;
    524        }
    525 
    526        /* Insert n at position i of the output: */
    527        cpLength=U16_LENGTH(n);
    528        if(dest!=nullptr && ((destLength+cpLength)<=destCapacity)) {
    529            int32_t codeUnitIndex;
    530 
    531            /*
    532             * Handle indexes when supplementary code points are present.
    533             *
    534             * In almost all cases, there will be only BMP code points before i
    535             * and even in the entire string.
    536             * This is handled with the same efficiency as with UTF-32.
    537             *
    538             * Only the rare cases with supplementary code points are handled
    539             * more slowly - but not too bad since this is an insertion anyway.
    540             */
    541            if(i<=firstSupplementaryIndex) {
    542                codeUnitIndex=i;
    543                if(cpLength>1) {
    544                    firstSupplementaryIndex=codeUnitIndex;
    545                } else {
    546                    ++firstSupplementaryIndex;
    547                }
    548            } else {
    549                codeUnitIndex=firstSupplementaryIndex;
    550                U16_FWD_N(dest, codeUnitIndex, destLength, i-codeUnitIndex);
    551            }
    552 
    553            /* use the char16_t index codeUnitIndex instead of the code point index i */
    554            if(codeUnitIndex<destLength) {
    555                uprv_memmove(dest+codeUnitIndex+cpLength,
    556                             dest+codeUnitIndex,
    557                             (destLength-codeUnitIndex)*U_SIZEOF_UCHAR);
    558                if(caseFlags!=nullptr) {
    559                    uprv_memmove(caseFlags+codeUnitIndex+cpLength,
    560                                 caseFlags+codeUnitIndex,
    561                                 destLength-codeUnitIndex);
    562                }
    563            }
    564            if(cpLength==1) {
    565                /* BMP, insert one code unit */
    566                dest[codeUnitIndex]=(char16_t)n;
    567            } else {
    568                /* supplementary character, insert two code units */
    569                dest[codeUnitIndex]=U16_LEAD(n);
    570                dest[codeUnitIndex+1]=U16_TRAIL(n);
    571            }
    572            if(caseFlags!=nullptr) {
    573                /* Case of last character determines uppercase flag: */
    574                caseFlags[codeUnitIndex]=IS_BASIC_UPPERCASE(src[in-1]);
    575                if(cpLength==2) {
    576                    caseFlags[codeUnitIndex+1]=false;
    577                }
    578            }
    579        }
    580        destLength+=cpLength;
    581        U_ASSERT(destLength>=0);
    582        ++i;
    583    }
    584 
    585    return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
    586 }
    587 
    588 /* ### check notes on overflow handling - only necessary if not IDNA? are these Punycode functions to be public? */
    589 
    590 #endif /* #if !UCONFIG_NO_IDNA */