[ tor-browser ].git.dasho

uts46.cpp (57847B)
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *   Copyright (C) 2010-2015, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 *   file name:  uts46.cpp
      9 *   encoding:   UTF-8
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2010mar09
     14 *   created by: Markus W. Scherer
     15 */
     16 
     17 #include "unicode/utypes.h"
     18 
     19 #if !UCONFIG_NO_IDNA
     20 
     21 #include "unicode/bytestream.h"
     22 #include "unicode/idna.h"
     23 #include "unicode/normalizer2.h"
     24 #include "unicode/uscript.h"
     25 #include "unicode/ustring.h"
     26 #include "unicode/utf16.h"
     27 #include "bytesinkutil.h"
     28 #include "cmemory.h"
     29 #include "cstring.h"
     30 #include "punycode.h"
     31 #include "ubidi_props.h"
     32 
     33 // Note about tests for UIDNA_ERROR_DOMAIN_NAME_TOO_LONG:
     34 //
     35 // The domain name length limit is 255 octets in an internal DNS representation
     36 // where the last ("root") label is the empty label
     37 // represented by length byte 0 alone.
     38 // In a conventional string, this translates to 253 characters, or 254
     39 // if there is a trailing dot for the root label.
     40 
     41 U_NAMESPACE_BEGIN
     42 
     43 // Severe errors which usually result in a U+FFFD replacement character in the result string.
     44 const uint32_t severeErrors=
     45    UIDNA_ERROR_LEADING_COMBINING_MARK|
     46    UIDNA_ERROR_DISALLOWED|
     47    UIDNA_ERROR_PUNYCODE|
     48    UIDNA_ERROR_LABEL_HAS_DOT|
     49    UIDNA_ERROR_INVALID_ACE_LABEL;
     50 
     51 static inline UBool
     52 isASCIIString(const UnicodeString &dest) {
     53    const char16_t *s=dest.getBuffer();
     54    const char16_t *limit=s+dest.length();
     55    while(s<limit) {
     56        if(*s++>0x7f) {
     57            return false;
     58        }
     59    }
     60    return true;
     61 }
     62 
     63 static UBool
     64 isASCIIOkBiDi(const char16_t *s, int32_t length);
     65 
     66 static UBool
     67 isASCIIOkBiDi(const char *s, int32_t length);
     68 
     69 // IDNA class default implementations -------------------------------------- ***
     70 
     71 IDNA::~IDNA() {}
     72 
     73 void
     74 IDNA::labelToASCII_UTF8(StringPiece label, ByteSink &dest,
     75                        IDNAInfo &info, UErrorCode &errorCode) const {
     76    if(U_SUCCESS(errorCode)) {
     77        UnicodeString destString;
     78        labelToASCII(UnicodeString::fromUTF8(label), destString,
     79                     info, errorCode).toUTF8(dest);
     80    }
     81 }
     82 
     83 void
     84 IDNA::labelToUnicodeUTF8(StringPiece label, ByteSink &dest,
     85                         IDNAInfo &info, UErrorCode &errorCode) const {
     86    if(U_SUCCESS(errorCode)) {
     87        UnicodeString destString;
     88        labelToUnicode(UnicodeString::fromUTF8(label), destString,
     89                       info, errorCode).toUTF8(dest);
     90    }
     91 }
     92 
     93 void
     94 IDNA::nameToASCII_UTF8(StringPiece name, ByteSink &dest,
     95                       IDNAInfo &info, UErrorCode &errorCode) const {
     96    if(U_SUCCESS(errorCode)) {
     97        UnicodeString destString;
     98        nameToASCII(UnicodeString::fromUTF8(name), destString,
     99                    info, errorCode).toUTF8(dest);
    100    }
    101 }
    102 
    103 void
    104 IDNA::nameToUnicodeUTF8(StringPiece name, ByteSink &dest,
    105                        IDNAInfo &info, UErrorCode &errorCode) const {
    106    if(U_SUCCESS(errorCode)) {
    107        UnicodeString destString;
    108        nameToUnicode(UnicodeString::fromUTF8(name), destString,
    109                      info, errorCode).toUTF8(dest);
    110    }
    111 }
    112 
    113 // UTS46 class declaration ------------------------------------------------- ***
    114 
    115 class UTS46 : public IDNA {
    116 public:
    117    UTS46(uint32_t options, UErrorCode &errorCode);
    118    virtual ~UTS46();
    119 
    120    virtual UnicodeString &
    121    labelToASCII(const UnicodeString &label, UnicodeString &dest,
    122                 IDNAInfo &info, UErrorCode &errorCode) const override;
    123 
    124    virtual UnicodeString &
    125    labelToUnicode(const UnicodeString &label, UnicodeString &dest,
    126                   IDNAInfo &info, UErrorCode &errorCode) const override;
    127 
    128    virtual UnicodeString &
    129    nameToASCII(const UnicodeString &name, UnicodeString &dest,
    130                IDNAInfo &info, UErrorCode &errorCode) const override;
    131 
    132    virtual UnicodeString &
    133    nameToUnicode(const UnicodeString &name, UnicodeString &dest,
    134                  IDNAInfo &info, UErrorCode &errorCode) const override;
    135 
    136    virtual void
    137    labelToASCII_UTF8(StringPiece label, ByteSink &dest,
    138                      IDNAInfo &info, UErrorCode &errorCode) const override;
    139 
    140    virtual void
    141    labelToUnicodeUTF8(StringPiece label, ByteSink &dest,
    142                       IDNAInfo &info, UErrorCode &errorCode) const override;
    143 
    144    virtual void
    145    nameToASCII_UTF8(StringPiece name, ByteSink &dest,
    146                     IDNAInfo &info, UErrorCode &errorCode) const override;
    147 
    148    virtual void
    149    nameToUnicodeUTF8(StringPiece name, ByteSink &dest,
    150                      IDNAInfo &info, UErrorCode &errorCode) const override;
    151 
    152 private:
    153    UnicodeString &
    154    process(const UnicodeString &src,
    155            UBool isLabel, UBool toASCII,
    156            UnicodeString &dest,
    157            IDNAInfo &info, UErrorCode &errorCode) const;
    158 
    159    void
    160    processUTF8(StringPiece src,
    161                UBool isLabel, UBool toASCII,
    162                ByteSink &dest,
    163                IDNAInfo &info, UErrorCode &errorCode) const;
    164 
    165    UnicodeString &
    166    processUnicode(const UnicodeString &src,
    167                   int32_t labelStart, int32_t mappingStart,
    168                   UBool isLabel, UBool toASCII,
    169                   UnicodeString &dest,
    170                   IDNAInfo &info, UErrorCode &errorCode) const;
    171 
    172    // returns the new dest.length()
    173    int32_t
    174    mapDevChars(UnicodeString &dest, int32_t labelStart, int32_t mappingStart,
    175                UErrorCode &errorCode) const;
    176 
    177    // returns the new label length
    178    int32_t
    179    processLabel(UnicodeString &dest,
    180                 int32_t labelStart, int32_t labelLength,
    181                 UBool toASCII,
    182                 IDNAInfo &info, UErrorCode &errorCode) const;
    183    int32_t
    184    markBadACELabel(UnicodeString &dest,
    185                    int32_t labelStart, int32_t labelLength,
    186                    UBool toASCII, IDNAInfo &info, UErrorCode &errorCode) const;
    187 
    188    void
    189    checkLabelBiDi(const char16_t *label, int32_t labelLength, IDNAInfo &info) const;
    190 
    191    UBool
    192    isLabelOkContextJ(const char16_t *label, int32_t labelLength) const;
    193 
    194    void
    195    checkLabelContextO(const char16_t *label, int32_t labelLength, IDNAInfo &info) const;
    196 
    197    const Normalizer2 &uts46Norm2;  // uts46.nrm
    198    uint32_t options;
    199 };
    200 
    201 IDNA *
    202 IDNA::createUTS46Instance(uint32_t options, UErrorCode &errorCode) {
    203    if(U_SUCCESS(errorCode)) {
    204        IDNA *idna=new UTS46(options, errorCode);
    205        if(idna==nullptr) {
    206            errorCode=U_MEMORY_ALLOCATION_ERROR;
    207        } else if(U_FAILURE(errorCode)) {
    208            delete idna;
    209            idna=nullptr;
    210        }
    211        return idna;
    212    } else {
    213        return nullptr;
    214    }
    215 }
    216 
    217 // UTS46 implementation ---------------------------------------------------- ***
    218 
    219 UTS46::UTS46(uint32_t opt, UErrorCode &errorCode)
    220        : uts46Norm2(*Normalizer2::getInstance(nullptr, "uts46", UNORM2_COMPOSE, errorCode)),
    221          options(opt) {}
    222 
    223 UTS46::~UTS46() {}
    224 
    225 UnicodeString &
    226 UTS46::labelToASCII(const UnicodeString &label, UnicodeString &dest,
    227                    IDNAInfo &info, UErrorCode &errorCode) const {
    228    return process(label, true, true, dest, info, errorCode);
    229 }
    230 
    231 UnicodeString &
    232 UTS46::labelToUnicode(const UnicodeString &label, UnicodeString &dest,
    233                      IDNAInfo &info, UErrorCode &errorCode) const {
    234    return process(label, true, false, dest, info, errorCode);
    235 }
    236 
    237 UnicodeString &
    238 UTS46::nameToASCII(const UnicodeString &name, UnicodeString &dest,
    239                   IDNAInfo &info, UErrorCode &errorCode) const {
    240    process(name, false, true, dest, info, errorCode);
    241    if( dest.length()>=254 && (info.errors&UIDNA_ERROR_DOMAIN_NAME_TOO_LONG)==0 &&
    242        isASCIIString(dest) &&
    243        (dest.length()>254 || dest[253]!=0x2e)
    244    ) {
    245        info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
    246    }
    247    return dest;
    248 }
    249 
    250 UnicodeString &
    251 UTS46::nameToUnicode(const UnicodeString &name, UnicodeString &dest,
    252                     IDNAInfo &info, UErrorCode &errorCode) const {
    253    return process(name, false, false, dest, info, errorCode);
    254 }
    255 
    256 void
    257 UTS46::labelToASCII_UTF8(StringPiece label, ByteSink &dest,
    258                         IDNAInfo &info, UErrorCode &errorCode) const {
    259    processUTF8(label, true, true, dest, info, errorCode);
    260 }
    261 
    262 void
    263 UTS46::labelToUnicodeUTF8(StringPiece label, ByteSink &dest,
    264                          IDNAInfo &info, UErrorCode &errorCode) const {
    265    processUTF8(label, true, false, dest, info, errorCode);
    266 }
    267 
    268 void
    269 UTS46::nameToASCII_UTF8(StringPiece name, ByteSink &dest,
    270                        IDNAInfo &info, UErrorCode &errorCode) const {
    271    processUTF8(name, false, true, dest, info, errorCode);
    272 }
    273 
    274 void
    275 UTS46::nameToUnicodeUTF8(StringPiece name, ByteSink &dest,
    276                         IDNAInfo &info, UErrorCode &errorCode) const {
    277    processUTF8(name, false, false, dest, info, errorCode);
    278 }
    279 
    // UTS #46 data for the 128 ASCII characters, indexed by code point.
    // The normalizer (using uts46.nrm) maps uppercase ASCII letters to lowercase
    // and passes through all other ASCII characters.
    // If UIDNA_USE_STD3_RULES is set, then non-LDH characters are disallowed
    // using this data. The ASCII fastpath also uses this data.
    // Values: -1 = disallowed, 0 = valid, 1 = mapped (to lowercase).
    static const int8_t asciiData[128]={
        // 0000..001F: control characters, all disallowed
        -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1,
        // 0020..002F: only 002D HYPHEN-MINUS and 002E FULL STOP are valid
        -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1,  0,  0, -1,
        // 0030..003F: 0030..0039 DIGIT ZERO..DIGIT NINE are valid
         0,  0,  0,  0,  0,  0,  0,  0,
         0,  0, -1, -1, -1, -1, -1, -1,
        // 0040..005F: 0041..005A LATIN CAPITAL LETTER A..Z are mapped
        -1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1, -1, -1, -1, -1, -1,
        // 0060..007F: 0061..007A LATIN SMALL LETTER A..Z are valid
        -1,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0, -1, -1, -1, -1, -1
    };
    301 
    // Shared UTF-16 implementation behind labelToASCII(), labelToUnicode(), nameToASCII()
    // and nameToUnicode(). Returns dest.
    //   src      input label (isLabel) or domain name (!isLabel)
    //   isLabel  when true, a dot in src is not treated as a label separator
    //   toASCII  when true, the label (>63) and name (>253/254) length checks are applied
    //   dest     output string; must be a different object than src
    //   info     reset here, then filled with error bits
    // Runs an ASCII fastpath that copies/lowercases src directly into dest's buffer and
    // hands over to processUnicode() at the first code unit the fastpath does not handle.
    // Sets U_ILLEGAL_ARGUMENT_ERROR (and makes dest bogus) if &dest==&src or src has no
    // buffer; sets U_MEMORY_ALLOCATION_ERROR if dest's buffer cannot be obtained.
    302 UnicodeString &
    303 UTS46::process(const UnicodeString &src,
    304               UBool isLabel, UBool toASCII,
    305               UnicodeString &dest,
    306               IDNAInfo &info, UErrorCode &errorCode) const {
    307    // uts46Norm2.normalize() would do all of this error checking and setup,
    308    // but with the ASCII fastpath we do not always call it, and do not
    309    // call it first.
    310    if(U_FAILURE(errorCode)) {
    311        dest.setToBogus();
    312        return dest;
    313    }
    314    const char16_t *srcArray=src.getBuffer();
    315    if(&dest==&src || srcArray==nullptr) {
    316        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    317        dest.setToBogus();
    318        return dest;
    319    }
    320    // Arguments are fine, reset output values.
    321    dest.remove();
    322    info.reset();
    323    int32_t srcLength=src.length();
    324    if(srcLength==0) {
    325        info.errors|=UIDNA_ERROR_EMPTY_LABEL;
    326        return dest;
    327    }
               // The fastpath writes at most one output unit per input unit, so srcLength suffices.
    328    char16_t *destArray=dest.getBuffer(srcLength);
    329    if(destArray==nullptr) {
    330        errorCode=U_MEMORY_ALLOCATION_ERROR;
    331        return dest;
    332    }
    333    // ASCII fastpath
    334    UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
               // labelStart is the index of the first code unit of the current label;
               // i indexes srcArray and destArray in lockstep.
    335    int32_t labelStart=0;
    336    int32_t i;
    337    for(i=0;; ++i) {
    338        if(i==srcLength) {
                       // All of src was handled by the fastpath: finish here without
                       // calling processUnicode().
    339            if(toASCII) {
    340                if((i-labelStart)>63) {
    341                    info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
    342                }
    343                // There is a trailing dot if labelStart==i.
    344                if(!isLabel && i>=254 && (i>254 || labelStart<i)) {
    345                    info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
    346                }
    347            }
    348            info.errors|=info.labelErrors;
    349            dest.releaseBuffer(i);
    350            return dest;
    351        }
    352        char16_t c=srcArray[i];
    353        if(c>0x7f) {
                       // Non-ASCII: leave the fastpath.
    354            break;
    355        }
    356        int cData=asciiData[c];
    357        if(cData>0) {
    358            destArray[i]=c+0x20;  // Lowercase an uppercase ASCII letter.
    359        } else if(cData<0 && disallowNonLDHDot) {
    360            break;  // Replacing with U+FFFD can be complicated for toASCII.
    361        } else {
    362            destArray[i]=c;
    363            if(c==0x2d) {  // hyphen
    364                if(i==(labelStart+3) && srcArray[i-1]==0x2d) {
    365                    // "??--..." is Punycode or forbidden.
    366                    ++i;  // '-' was copied to dest already
    367                    break;
    368                }
    369                if(i==labelStart) {
    370                    // label starts with "-"
    371                    info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN;
    372                }
    373                if((i+1)==srcLength || srcArray[i+1]==0x2e) {
    374                    // label ends with "-"
    375                    info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN;
    376                }
    377            } else if(c==0x2e) {  // dot
    378                if(isLabel) {
    379                    // Replacing with U+FFFD can be complicated for toASCII.
    380                    ++i;  // '.' was copied to dest already
    381                    break;
    382                }
    383                if(i==labelStart) {
    384                    info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL;
    385                }
    386                if(toASCII && (i-labelStart)>63) {
    387                    info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
    388                }
                           // A label is complete: fold its errors into the overall result
                           // and start the next label after the dot.
    389                info.errors|=info.labelErrors;
    390                info.labelErrors=0;
    391                labelStart=i+1;
    392            }
    393        }
    394    }
               // The fastpath stopped early: dest holds the first i code units.
               // processUnicode() maps src from index i and walks the labels from labelStart.
    395    info.errors|=info.labelErrors;
    396    dest.releaseBuffer(i);
    397    processUnicode(src, labelStart, i, isLabel, toASCII, dest, info, errorCode);
               // labelStart still has the fastpath's value here (it was passed by value), so
               // dest[0..labelStart) is the prefix of labels completed by the fastpath; that
               // prefix is checked with isASCIIOkBiDi().
    398    if( info.isBiDi && U_SUCCESS(errorCode) && (info.errors&severeErrors)==0 &&
    399        (!info.isOkBiDi || (labelStart>0 && !isASCIIOkBiDi(dest.getBuffer(), labelStart)))
    400    ) {
    401        info.errors|=UIDNA_ERROR_BIDI;
    402    }
    403    return dest;
    404 }
    405 
    // Shared UTF-8 implementation behind labelToASCII_UTF8(), labelToUnicodeUTF8(),
    // nameToASCII_UTF8() and nameToUnicodeUTF8(); the UTF-8 counterpart of process().
    //   src      input label (isLabel) or domain name (!isLabel) as UTF-8 bytes
    //   toASCII  when true, the label and name length checks are applied
    //   dest     sink that receives the result; Flush() is called on every non-error path
    //   info     reset here, then filled with error bits
    // Input of at most 256 bytes goes through an ASCII fastpath; whatever the fastpath
    // does not handle (or any longer input) is converted to UTF-16 and run through
    // processUnicode(), and the result is written to dest as UTF-8.
    // Sets U_ILLEGAL_ARGUMENT_ERROR if src.data() is nullptr while src.length() is nonzero.
    406 void
    407 UTS46::processUTF8(StringPiece src,
    408                   UBool isLabel, UBool toASCII,
    409                   ByteSink &dest,
    410                   IDNAInfo &info, UErrorCode &errorCode) const {
    411    if(U_FAILURE(errorCode)) {
    412        return;
    413    }
    414    const char *srcArray=src.data();
    415    int32_t srcLength=src.length();
    416    if(srcArray==nullptr && srcLength!=0) {
    417        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    418        return;
    419    }
    420    // Arguments are fine, reset output values.
    421    info.reset();
    422    if(srcLength==0) {
    423        info.errors|=UIDNA_ERROR_EMPTY_LABEL;
    424        dest.Flush();
    425        return;
    426    }
               // destString receives the UTF-16 result for everything from labelStart on.
    427    UnicodeString destString;
    428    int32_t labelStart=0;
    429    if(srcLength<=256) {  // length of stackArray[]
    430        // ASCII fastpath
    431        char stackArray[256];
    432        int32_t destCapacity;
    433        char *destArray=dest.GetAppendBuffer(srcLength, srcLength+20,
    434                                             stackArray, UPRV_LENGTHOF(stackArray), &destCapacity);
    435        UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
    436        int32_t i;
    437        for(i=0;; ++i) {
    438            if(i==srcLength) {
                           // All of src was handled by the fastpath: output it and finish.
    439                if(toASCII) {
    440                    if((i-labelStart)>63) {
    441                        info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
    442                    }
    443                    // There is a trailing dot if labelStart==i.
    444                    if(!isLabel && i>=254 && (i>254 || labelStart<i)) {
    445                        info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
    446                    }
    447                }
    448                info.errors|=info.labelErrors;
    449                dest.Append(destArray, i);
    450                dest.Flush();
    451                return;
    452            }
    453            char c=srcArray[i];
    454            if (static_cast<int8_t>(c) < 0) { // (uint8_t)c>0x7f
    455                break;
    456            }
    457            int cData = asciiData[static_cast<int>(c)]; // Cast: gcc warns about indexing with a char.
    458            if(cData>0) {
    459                destArray[i]=c+0x20;  // Lowercase an uppercase ASCII letter.
    460            } else if(cData<0 && disallowNonLDHDot) {
    461                break;  // Replacing with U+FFFD can be complicated for toASCII.
    462            } else {
    463                destArray[i]=c;
    464                if(c==0x2d) {  // hyphen
    465                    if(i==(labelStart+3) && srcArray[i-1]==0x2d) {
    466                        // "??--..." is Punycode or forbidden.
                                   // Unlike process(), i is not advanced before the break, so the
                                   // byte at i is mapped again by processUnicode() below.
    467                        break;
    468                    }
    469                    if(i==labelStart) {
    470                        // label starts with "-"
    471                        info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN;
    472                    }
    473                    if((i+1)==srcLength || srcArray[i+1]==0x2e) {
    474                        // label ends with "-"
    475                        info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN;
    476                    }
    477                } else if(c==0x2e) {  // dot
    478                    if(isLabel) {
    479                        break;  // Replacing with U+FFFD can be complicated for toASCII.
    480                    }
    481                    if(i==labelStart) {
    482                        info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL;
    483                    }
    484                    if(toASCII && (i-labelStart)>63) {
    485                        info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
    486                    }
                               // A label is complete: fold its errors into the overall result
                               // and start the next label after the dot.
    487                    info.errors|=info.labelErrors;
    488                    info.labelErrors=0;
    489                    labelStart=i+1;
    490                }
    491            }
    492        }
    493        info.errors|=info.labelErrors;
    494        // Convert the processed ASCII prefix of the current label to UTF-16.
                   // mappingStart is relative to labelStart because processUnicode() below only
                   // sees the part of src that starts at labelStart.
    495        int32_t mappingStart=i-labelStart;
    496        destString=UnicodeString::fromUTF8(StringPiece(destArray+labelStart, mappingStart));
    497        // Output the previous ASCII labels and process the rest of src in UTF-16.
    498        dest.Append(destArray, labelStart);
    499        processUnicode(UnicodeString::fromUTF8(StringPiece(src, labelStart)), 0, mappingStart,
    500                       isLabel, toASCII,
    501                       destString, info, errorCode);
    502    } else {
    503        // src is too long for the ASCII fastpath implementation.
    504        processUnicode(UnicodeString::fromUTF8(src), 0, 0,
    505                       isLabel, toASCII,
    506                       destString, info, errorCode);
    507    }
    508    destString.toUTF8(dest);  // calls dest.Flush()
    509    if(toASCII && !isLabel) {
    510        // length==labelStart==254 means that there is a trailing dot (ok) and
    511        // destString is empty (do not index at 253-labelStart).
    512        int32_t length=labelStart+destString.length();
    513        if( length>=254 && isASCIIString(destString) &&
    514            (length>254 ||
    515             (labelStart<254 && destString[253-labelStart]!=0x2e))
    516        ) {
    517            info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
    518        }
    519    }
               // The labels completed by the fastpath are checked in the original input bytes
               // (srcArray[0..labelStart)).
    520    if( info.isBiDi && U_SUCCESS(errorCode) && (info.errors&severeErrors)==0 &&
    521        (!info.isOkBiDi || (labelStart>0 && !isASCIIOkBiDi(srcArray, labelStart)))
    522    ) {
    523        info.errors|=UIDNA_ERROR_BIDI;
    524    }
    525 }
    526 
    // Maps src with the UTS #46 normalizer into dest and then processes the labels in dest.
    //   src           input text
    //   labelStart    index in dest where the first label to process starts
    //   mappingStart  0: dest is replaced by the normalized src; otherwise src from this
    //                 index on is normalized and appended to what dest already holds
    //   isLabel       when true, a dot does not separate labels
    //   toASCII       selects which NONTRANSITIONAL option applies; passed on to processLabel()
    // Each dot-terminated label is run through processLabel(); deviation characters are
    // mapped (once, via mapDevChars()) if the options call for it; unpaired surrogates are
    // replaced with U+FFFD. Returns dest; returns early if errorCode indicates a failure.
    527 UnicodeString &
    528 UTS46::processUnicode(const UnicodeString &src,
    529                      int32_t labelStart, int32_t mappingStart,
    530                      UBool isLabel, UBool toASCII,
    531                      UnicodeString &dest,
    532                      IDNAInfo &info, UErrorCode &errorCode) const {
    533    if(mappingStart==0) {
    534        uts46Norm2.normalize(src, dest, errorCode);
    535    } else {
    536        uts46Norm2.normalizeSecondAndAppend(dest, src.tempSubString(mappingStart), errorCode);
    537    }
    538    if(U_FAILURE(errorCode)) {
    539        return dest;
    540    }
               // Deviation characters are mapped only when the matching NONTRANSITIONAL
               // option bit is not set.
    541    UBool doMapDevChars=
    542        toASCII ? (options&UIDNA_NONTRANSITIONAL_TO_ASCII)==0 :
    543                  (options&UIDNA_NONTRANSITIONAL_TO_UNICODE)==0;
               // destArray and destLength are refreshed after every call that can modify dest.
    544    const char16_t *destArray=dest.getBuffer();
    545    int32_t destLength=dest.length();
    546    int32_t labelLimit=labelStart;
    547    while(labelLimit<destLength) {
    548        char16_t c=destArray[labelLimit];
    549        if(c==0x2e && !isLabel) {
                       // End of a label: process dest[labelStart..labelLimit).
    550            int32_t labelLength=labelLimit-labelStart;
    551            int32_t newLength=processLabel(dest, labelStart, labelLength,
    552                                            toASCII, info, errorCode);
    553            info.errors|=info.labelErrors;
    554            info.labelErrors=0;
    555            if(U_FAILURE(errorCode)) {
    556                return dest;
    557            }
    558            destArray=dest.getBuffer();
                       // processLabel() may have changed the label's length.
    559            destLength+=newLength-labelLength;
                       // Continue right after the processed label and its dot.
    560            labelLimit=labelStart+=newLength+1;
    561            continue;
    562        } else if(c<0xdf) {
    563            // pass
    564        } else if(c<=0x200d && (c==0xdf || c==0x3c2 || c>=0x200c)) {
                       // c is one of U+00DF, U+03C2, U+200C, U+200D (the characters that
                       // mapDevChars() handles).
    565            info.isTransDiff=true;
    566            if(doMapDevChars) {
    567                destLength=mapDevChars(dest, labelStart, labelLimit, errorCode);
    568                if(U_FAILURE(errorCode)) {
    569                    return dest;
    570                }
    571                destArray=dest.getBuffer();
    572                // All deviation characters have been mapped, no need to check for them again.
    573                doMapDevChars=false;
    574                // Do not increment labelLimit in case c was removed.
    575                continue;
    576            }
    577        } else if(U16_IS_SURROGATE(c)) {
                       // Unpaired means: a lead not followed by a trail, or a trail that is
                       // at the label start or not preceded by a lead.
    578            if(U16_IS_SURROGATE_LEAD(c) ?
    579                    (labelLimit+1)==destLength || !U16_IS_TRAIL(destArray[labelLimit+1]) :
    580                    labelLimit==labelStart || !U16_IS_LEAD(destArray[labelLimit-1])) {
    581                // Map an unpaired surrogate to U+FFFD before normalization so that when
    582                // that removes characters we do not turn two unpaired ones into a pair.
    583                info.labelErrors|=UIDNA_ERROR_DISALLOWED;
    584                dest.setCharAt(labelLimit, 0xfffd);
    585                destArray=dest.getBuffer();
    586            }
    587        }
    588        ++labelLimit;
    589    }
    590    // Permit an empty label at the end (0<labelStart==labelLimit==destLength is ok)
    591    // but not an empty label elsewhere nor a completely empty domain name.
    592    // processLabel() sets UIDNA_ERROR_EMPTY_LABEL when labelLength==0.
    593    if(0==labelStart || labelStart<labelLimit) {
    594        processLabel(dest, labelStart, labelLimit-labelStart,
    595                      toASCII, info, errorCode);
    596        info.errors|=info.labelErrors;
    597    }
    598    return dest;
    599 }
    600 
    // Maps the deviation characters in dest, in place, from index mappingStart to the end:
    // U+00DF becomes "ss", U+03C2 becomes U+03C3, and U+200C/U+200D are removed.
    // If anything was mapped, dest from labelStart on is normalized again with uts46Norm2.
    // Returns the new dest.length(); returns 0 if errorCode already indicated a failure.
    // Sets U_MEMORY_ALLOCATION_ERROR if a buffer cannot be obtained or dest becomes bogus.
    601 int32_t
    602 UTS46::mapDevChars(UnicodeString &dest, int32_t labelStart, int32_t mappingStart,
    603                   UErrorCode &errorCode) const {
    604    if(U_FAILURE(errorCode)) {
    605        return 0;
    606    }
    607    int32_t length=dest.length();
               // Ask for one extra unit up front when the first character to map is a sharp s,
               // because that mapping makes the string one unit longer.
    608    char16_t *s=dest.getBuffer(dest[mappingStart]==0xdf ? length+1 : length);
    609    if(s==nullptr) {
    610        errorCode=U_MEMORY_ALLOCATION_ERROR;
    611        return length;
    612    }
    613    int32_t capacity=dest.getCapacity();
    614    UBool didMapDevChars=false;
               // readIndex runs ahead of (or level with) writeIndex; length is kept equal to
               // the length the string will have once all pending units have been written.
    615    int32_t readIndex=mappingStart, writeIndex=mappingStart;
    616    do {
    617        char16_t c=s[readIndex++];
    618        switch(c) {
    619        case 0xdf:
    620            // Map sharp s to ss.
    621            didMapDevChars=true;
    622            s[writeIndex++]=0x73;  // Replace sharp s with first s.
    623            // Insert second s and account for possible buffer reallocation.
    624            if(writeIndex==readIndex) {
                           // No gap between write and read positions: grow the buffer if it
                           // is full, then shift the tail right by one unit to make room.
    625                if(length==capacity) {
    626                    dest.releaseBuffer(length);
    627                    s=dest.getBuffer(length+1);
    628                    if(s==nullptr) {
    629                        errorCode=U_MEMORY_ALLOCATION_ERROR;
    630                        return length;
    631                    }
    632                    capacity=dest.getCapacity();
    633                }
    634                u_memmove(s+writeIndex+1, s+writeIndex, length-writeIndex);
    635                ++readIndex;
    636            }
    637            s[writeIndex++]=0x73;
    638            ++length;
    639            break;
    640        case 0x3c2:  // Map final sigma to nonfinal sigma.
    641            didMapDevChars=true;
    642            s[writeIndex++]=0x3c3;
    643            break;
    644        case 0x200c:  // Ignore/remove ZWNJ.
    645        case 0x200d:  // Ignore/remove ZWJ.
                       // Nothing is written, so the result gets one unit shorter.
    646            didMapDevChars=true;
    647            --length;
    648            break;
    649        default:
    650            // Only really necessary if writeIndex was different from readIndex.
    651            s[writeIndex++]=c;
    652            break;
    653        }
    654    } while(writeIndex<length);
    655    dest.releaseBuffer(length);
    656    if(didMapDevChars) {
    657        // Mapping deviation characters might have resulted in an un-NFC string.
    658        // We could use either the NFC or the UTS #46 normalizer.
    659        // By using the UTS #46 normalizer again, we avoid having to load a second .nrm data file.
    660        UnicodeString normalized;
    661        uts46Norm2.normalize(dest.tempSubString(labelStart), normalized, errorCode);
    662        if(U_SUCCESS(errorCode)) {
                       // 0x7fffffff as the length replaces everything from labelStart to the end.
    663            dest.replace(labelStart, 0x7fffffff, normalized);
    664            if(dest.isBogus()) {
    665                errorCode=U_MEMORY_ALLOCATION_ERROR;
    666            }
    667            return dest.length();
    668        }
    669    }
    670    return length;
    671 }
    672 
    673 // Replace the label in dest with the label string, if the label was modified.
    674 // If &label==&dest then the label was modified in-place and labelLength
    675 // is the new label length, different from label.length().
    676 // If &label!=&dest then labelLength==label.length().
    677 // Returns labelLength (= the new label length).
    678 static int32_t
    679 replaceLabel(UnicodeString &dest, int32_t destLabelStart, int32_t destLabelLength,
    680             const UnicodeString &label, int32_t labelLength, UErrorCode &errorCode) {
    681    if(U_FAILURE(errorCode)) {
    682        return 0;
    683    }
    684    if(&label!=&dest) {
    685        dest.replace(destLabelStart, destLabelLength, label);
    686        if(dest.isBogus()) {
    687            errorCode=U_MEMORY_ALLOCATION_ERROR;
    688            return 0;
    689        }
    690    }
    691    return labelLength;
    692 }
    693 
    694 int32_t
    695 UTS46::processLabel(UnicodeString &dest,
    696                    int32_t labelStart, int32_t labelLength,
    697                    UBool toASCII,
    698                    IDNAInfo &info, UErrorCode &errorCode) const {
    699    if(U_FAILURE(errorCode)) {
    700        return 0;
    701    }
    702    UnicodeString fromPunycode;
    703    UnicodeString *labelString;
    704    const char16_t *label=dest.getBuffer()+labelStart;
    705    int32_t destLabelStart=labelStart;
    706    int32_t destLabelLength=labelLength;
    707    UBool wasPunycode;
    708    if(labelLength>=4 && label[0]==0x78 && label[1]==0x6e && label[2]==0x2d && label[3]==0x2d) {
    709        // Label starts with "xn--", try to un-Punycode it.
    710        // In IDNA2008, labels like "xn--" (decodes to an empty string) and
    711        // "xn--ASCII-" (decodes to just "ASCII") fail the round-trip validation from
    712        // comparing the ToUnicode input with the back-to-ToASCII output.
    713        // They are alternate encodings of the respective ASCII labels.
    714        // Ignore "xn---" here: It will fail Punycode.decode() which logically comes before
    715        // the round-trip verification.
    716        if(labelLength==4 || (labelLength>5 && label[labelLength-1]==u'-')) {
    717            info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL;
    718            return markBadACELabel(dest, labelStart, labelLength, toASCII, info, errorCode);
    719        }
    720        wasPunycode=true;
    721        char16_t *unicodeBuffer=fromPunycode.getBuffer(-1);  // capacity==-1: most labels should fit
    722        if(unicodeBuffer==nullptr) {
    723            // Should never occur if we used capacity==-1 which uses the internal buffer.
    724            errorCode=U_MEMORY_ALLOCATION_ERROR;
    725            return labelLength;
    726        }
    727        UErrorCode punycodeErrorCode=U_ZERO_ERROR;
    728        int32_t unicodeLength=u_strFromPunycode(label+4, labelLength-4,
    729                                                unicodeBuffer, fromPunycode.getCapacity(),
    730                                                nullptr, &punycodeErrorCode);
    731        if(punycodeErrorCode==U_BUFFER_OVERFLOW_ERROR) {
    732            fromPunycode.releaseBuffer(0);
    733            unicodeBuffer=fromPunycode.getBuffer(unicodeLength);
    734            if(unicodeBuffer==nullptr) {
    735                errorCode=U_MEMORY_ALLOCATION_ERROR;
    736                return labelLength;
    737            }
    738            punycodeErrorCode=U_ZERO_ERROR;
    739            unicodeLength=u_strFromPunycode(label+4, labelLength-4,
    740                                            unicodeBuffer, fromPunycode.getCapacity(),
    741                                            nullptr, &punycodeErrorCode);
    742        }
    743        fromPunycode.releaseBuffer(unicodeLength);
    744        if(U_FAILURE(punycodeErrorCode)) {
    745            info.labelErrors|=UIDNA_ERROR_PUNYCODE;
    746            return markBadACELabel(dest, labelStart, labelLength, toASCII, info, errorCode);
    747        }
    748        // Check for NFC, and for characters that are not
    749        // valid or deviation characters according to the normalizer.
    750        // If there is something wrong, then the string will change.
    751        // Note that the normalizer passes through non-LDH ASCII and deviation characters.
    752        // Deviation characters are ok in Punycode even in transitional processing.
    753        // In the code further below, if we find non-LDH ASCII and we have UIDNA_USE_STD3_RULES
    754        // then we will set UIDNA_ERROR_INVALID_ACE_LABEL there too.
    755        UBool isValid=uts46Norm2.isNormalized(fromPunycode, errorCode);
    756        if(U_FAILURE(errorCode)) {
    757            return labelLength;
    758        }
    759        // Unicode 15.1 UTS #46:
    760        // Added an additional condition in 4.1 Validity Criteria to
    761        // disallow labels such as xn--xn---epa., which do not round-trip.
    762        // --> Validity Criteria new criterion 4:
    763        // If not CheckHyphens, the label must not begin with “xn--”.
    764        if(!isValid || fromPunycode.startsWith(UnicodeString::readOnlyAlias(u"xn--"))) {
    765            info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL;
    766            return markBadACELabel(dest, labelStart, labelLength, toASCII, info, errorCode);
    767        }
    768        labelString=&fromPunycode;
    769        label=fromPunycode.getBuffer();
    770        labelStart=0;
    771        labelLength=fromPunycode.length();
    772    } else {
    773        wasPunycode=false;
    774        labelString=&dest;
    775    }
    776    // Validity check
    777    if(labelLength==0) {
    778        info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL;
    779        return replaceLabel(dest, destLabelStart, destLabelLength,
    780                            *labelString, labelLength, errorCode);
    781    }
    782    // labelLength>0
    783    if(labelLength>=4 && label[2]==0x2d && label[3]==0x2d) {
    784        // label starts with "??--"
    785        info.labelErrors|=UIDNA_ERROR_HYPHEN_3_4;
    786    }
    787    if(label[0]==0x2d) {
    788        // label starts with "-"
    789        info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN;
    790    }
    791    if(label[labelLength-1]==0x2d) {
    792        // label ends with "-"
    793        info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN;
    794    }
    795    // If the label was not a Punycode label, then it was the result of
    796    // mapping, normalization and label segmentation.
    797    // If the label was in Punycode, then we mapped it again above
    798    // and checked its validity.
    799    // Now we handle the STD3 restriction to LDH characters (if set)
    800    // and we look for U+FFFD which indicates disallowed characters
    801    // in a non-Punycode label or U+FFFD itself in a Punycode label.
    802    // We also check for dots which can come from the input to a single-label function.
    803    // Ok to cast away const because we own the UnicodeString.
    804    char16_t* s = const_cast<char16_t*>(label);
    805    const char16_t *limit=label+labelLength;
    806    char16_t oredChars=0;
    807    // If we enforce STD3 rules, then ASCII characters other than LDH and dot are disallowed.
    808    UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
    809    do {
    810        char16_t c=*s;
    811        if(c<=0x7f) {
    812            if(c==0x2e) {
    813                info.labelErrors|=UIDNA_ERROR_LABEL_HAS_DOT;
    814                *s=0xfffd;
    815            } else if(disallowNonLDHDot && asciiData[c]<0) {
    816                info.labelErrors|=UIDNA_ERROR_DISALLOWED;
    817                *s=0xfffd;
    818            }
    819        } else {
    820            oredChars|=c;
    821            if(c==0xfffd) {
    822                info.labelErrors|=UIDNA_ERROR_DISALLOWED;
    823            }
    824        }
    825        ++s;
    826    } while(s<limit);
    827    // Check for a leading combining mark after other validity checks
    828    // so that we don't report UIDNA_ERROR_DISALLOWED for the U+FFFD from here.
    829    UChar32 c;
    830    int32_t cpLength=0;
    831    // "Unsafe" is ok because unpaired surrogates were mapped to U+FFFD.
    832    U16_NEXT_UNSAFE(label, cpLength, c);
    833    if((U_GET_GC_MASK(c)&U_GC_M_MASK)!=0) {
    834        info.labelErrors|=UIDNA_ERROR_LEADING_COMBINING_MARK;
    835        labelString->replace(labelStart, cpLength, static_cast<char16_t>(0xfffd));
    836        label=labelString->getBuffer()+labelStart;
    837        labelLength+=1-cpLength;
    838        if(labelString==&dest) {
    839            destLabelLength=labelLength;
    840        }
    841    }
    842    if((info.labelErrors&severeErrors)==0) {
    843        // Do contextual checks only if we do not have U+FFFD from a severe error
    844        // because U+FFFD can make these checks fail.
    845        if((options&UIDNA_CHECK_BIDI)!=0 && (!info.isBiDi || info.isOkBiDi)) {
    846            checkLabelBiDi(label, labelLength, info);
    847        }
    848        if( (options&UIDNA_CHECK_CONTEXTJ)!=0 && (oredChars&0x200c)==0x200c &&
    849            !isLabelOkContextJ(label, labelLength)
    850        ) {
    851            info.labelErrors|=UIDNA_ERROR_CONTEXTJ;
    852        }
    853        if((options&UIDNA_CHECK_CONTEXTO)!=0 && oredChars>=0xb7) {
    854            checkLabelContextO(label, labelLength, info);
    855        }
    856        if(toASCII) {
    857            if(wasPunycode) {
    858                // Leave a Punycode label unchanged if it has no severe errors.
    859                if(destLabelLength>63) {
    860                    info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
    861                }
    862                return destLabelLength;
    863            } else if(oredChars>=0x80) {
    864                // Contains non-ASCII characters.
    865                UnicodeString punycode;
    866                char16_t *buffer=punycode.getBuffer(63);  // 63==maximum DNS label length
    867                if(buffer==nullptr) {
    868                    errorCode=U_MEMORY_ALLOCATION_ERROR;
    869                    return destLabelLength;
    870                }
    871                buffer[0]=0x78;  // Write "xn--".
    872                buffer[1]=0x6e;
    873                buffer[2]=0x2d;
    874                buffer[3]=0x2d;
    875                UErrorCode punycodeErrorCode=U_ZERO_ERROR;
    876                int32_t punycodeLength=u_strToPunycode(label, labelLength,
    877                                                      buffer+4, punycode.getCapacity()-4,
    878                                                      nullptr, &punycodeErrorCode);
    879                if(punycodeErrorCode==U_BUFFER_OVERFLOW_ERROR) {
    880                    punycodeErrorCode=U_ZERO_ERROR;
    881                    punycode.releaseBuffer(4);
    882                    buffer=punycode.getBuffer(4+punycodeLength);
    883                    if(buffer==nullptr) {
    884                        errorCode=U_MEMORY_ALLOCATION_ERROR;
    885                        return destLabelLength;
    886                    }
    887                    punycodeLength=u_strToPunycode(label, labelLength,
    888                                                  buffer+4, punycode.getCapacity()-4,
    889                                                  nullptr, &punycodeErrorCode);
    890                }
    891                punycodeLength+=4;
    892                punycode.releaseBuffer(punycodeLength);
    893                if(U_FAILURE(punycodeErrorCode)) {
    894                    errorCode = punycodeErrorCode;
    895                    return destLabelLength;
    896                }
    897                if(punycodeLength>63) {
    898                    info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
    899                }
    900                return replaceLabel(dest, destLabelStart, destLabelLength,
    901                                    punycode, punycodeLength, errorCode);
    902            } else {
    903                // all-ASCII label
    904                if(labelLength>63) {
    905                    info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
    906                }
    907            }
    908        }
    909    } else {
    910        // If a Punycode label has severe errors,
    911        // then leave it but make sure it does not look valid.
    912        if(wasPunycode) {
    913            info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL;
    914            return markBadACELabel(dest, destLabelStart, destLabelLength, toASCII, info, errorCode);
    915        }
    916    }
    917    return replaceLabel(dest, destLabelStart, destLabelLength,
    918                        *labelString, labelLength, errorCode);
    919 }
    920 
    921 // Make sure an ACE label does not look valid.
    922 // Append U+FFFD if the label has only LDH characters.
    923 // If UIDNA_USE_STD3_RULES, also replace disallowed ASCII characters with U+FFFD.
    924 int32_t
    925 UTS46::markBadACELabel(UnicodeString &dest,
    926                       int32_t labelStart, int32_t labelLength,
    927                       UBool toASCII, IDNAInfo &info, UErrorCode &errorCode) const {
    928    if(U_FAILURE(errorCode)) {
    929        return 0;
    930    }
    931    UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
    932    UBool isASCII=true;
    933    UBool onlyLDH=true;
    934    const char16_t *label=dest.getBuffer()+labelStart;
    935    const char16_t *limit=label+labelLength;
    936    // Start after the initial "xn--".
    937    // Ok to cast away const because we own the UnicodeString.
    938    for(char16_t *s=const_cast<char16_t *>(label+4); s<limit; ++s) {
    939        char16_t c=*s;
    940        if(c<=0x7f) {
    941            if(c==0x2e) {
    942                info.labelErrors|=UIDNA_ERROR_LABEL_HAS_DOT;
    943                *s=0xfffd;
    944                isASCII=onlyLDH=false;
    945            } else if(asciiData[c]<0) {
    946                onlyLDH=false;
    947                if(disallowNonLDHDot) {
    948                    *s=0xfffd;
    949                    isASCII=false;
    950                }
    951            }
    952        } else {
    953            isASCII=onlyLDH=false;
    954        }
    955    }
    956    if(onlyLDH) {
    957        dest.insert(labelStart + labelLength, static_cast<char16_t>(0xfffd));
    958        if(dest.isBogus()) {
    959            errorCode=U_MEMORY_ALLOCATION_ERROR;
    960            return 0;
    961        }
    962        ++labelLength;
    963    } else {
    964        if(toASCII && isASCII && labelLength>63) {
    965            info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
    966        }
    967    }
    968    return labelLength;
    969 }
    970 
    971 const uint32_t L_MASK=U_MASK(U_LEFT_TO_RIGHT);
    972 const uint32_t R_AL_MASK=U_MASK(U_RIGHT_TO_LEFT)|U_MASK(U_RIGHT_TO_LEFT_ARABIC);
    973 const uint32_t L_R_AL_MASK=L_MASK|R_AL_MASK;
    974 
    975 const uint32_t R_AL_AN_MASK=R_AL_MASK|U_MASK(U_ARABIC_NUMBER);
    976 
    977 const uint32_t EN_AN_MASK=U_MASK(U_EUROPEAN_NUMBER)|U_MASK(U_ARABIC_NUMBER);
    978 const uint32_t R_AL_EN_AN_MASK=R_AL_MASK|EN_AN_MASK;
    979 const uint32_t L_EN_MASK=L_MASK|U_MASK(U_EUROPEAN_NUMBER);
    980 
    981 const uint32_t ES_CS_ET_ON_BN_NSM_MASK=
    982    U_MASK(U_EUROPEAN_NUMBER_SEPARATOR)|
    983    U_MASK(U_COMMON_NUMBER_SEPARATOR)|
    984    U_MASK(U_EUROPEAN_NUMBER_TERMINATOR)|
    985    U_MASK(U_OTHER_NEUTRAL)|
    986    U_MASK(U_BOUNDARY_NEUTRAL)|
    987    U_MASK(U_DIR_NON_SPACING_MARK);
    988 const uint32_t L_EN_ES_CS_ET_ON_BN_NSM_MASK=L_EN_MASK|ES_CS_ET_ON_BN_NSM_MASK;
    989 const uint32_t R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK=R_AL_MASK|EN_AN_MASK|ES_CS_ET_ON_BN_NSM_MASK;
    990 
    991 // We scan the whole label and check both for whether it contains RTL characters
    992 // and whether it passes the BiDi Rule.
    993 // In a BiDi domain name, all labels must pass the BiDi Rule, but we might find
    994 // that a domain name is a BiDi domain name (has an RTL label) only after
    995 // processing several earlier labels.
    996 void
    997 UTS46::checkLabelBiDi(const char16_t *label, int32_t labelLength, IDNAInfo &info) const {
    998    // IDNA2008 BiDi rule
    999    // Get the directionality of the first character.
   1000    UChar32 c;
   1001    int32_t i=0;
   1002    U16_NEXT_UNSAFE(label, i, c);
   1003    uint32_t firstMask=U_MASK(u_charDirection(c));
   1004    // 1. The first character must be a character with BIDI property L, R
   1005    // or AL.  If it has the R or AL property, it is an RTL label; if it
   1006    // has the L property, it is an LTR label.
   1007    if((firstMask&~L_R_AL_MASK)!=0) {
   1008        info.isOkBiDi=false;
   1009    }
   1010    // Get the directionality of the last non-NSM character.
   1011    uint32_t lastMask;
   1012    for(;;) {
   1013        if(i>=labelLength) {
   1014            lastMask=firstMask;
   1015            break;
   1016        }
   1017        U16_PREV_UNSAFE(label, labelLength, c);
   1018        UCharDirection dir=u_charDirection(c);
   1019        if(dir!=U_DIR_NON_SPACING_MARK) {
   1020            lastMask=U_MASK(dir);
   1021            break;
   1022        }
   1023    }
   1024    // 3. In an RTL label, the end of the label must be a character with
   1025    // BIDI property R, AL, EN or AN, followed by zero or more
   1026    // characters with BIDI property NSM.
   1027    // 6. In an LTR label, the end of the label must be a character with
   1028    // BIDI property L or EN, followed by zero or more characters with
   1029    // BIDI property NSM.
   1030    if( (firstMask&L_MASK)!=0 ?
   1031            (lastMask&~L_EN_MASK)!=0 :
   1032            (lastMask&~R_AL_EN_AN_MASK)!=0
   1033    ) {
   1034        info.isOkBiDi=false;
   1035    }
   1036    // Add the directionalities of the intervening characters.
   1037    uint32_t mask=firstMask|lastMask;
   1038    while(i<labelLength) {
   1039        U16_NEXT_UNSAFE(label, i, c);
   1040        mask|=U_MASK(u_charDirection(c));
   1041    }
   1042    if(firstMask&L_MASK) {
   1043        // 5. In an LTR label, only characters with the BIDI properties L, EN,
   1044        // ES, CS, ET, ON, BN and NSM are allowed.
   1045        if((mask&~L_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) {
   1046            info.isOkBiDi=false;
   1047        }
   1048    } else {
   1049        // 2. In an RTL label, only characters with the BIDI properties R, AL,
   1050        // AN, EN, ES, CS, ET, ON, BN and NSM are allowed.
   1051        if((mask&~R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) {
   1052            info.isOkBiDi=false;
   1053        }
   1054        // 4. In an RTL label, if an EN is present, no AN may be present, and
   1055        // vice versa.
   1056        if((mask&EN_AN_MASK)==EN_AN_MASK) {
   1057            info.isOkBiDi=false;
   1058        }
   1059    }
   1060    // An RTL label is a label that contains at least one character of type
   1061    // R, AL or AN. [...]
   1062    // A "BIDI domain name" is a domain name that contains at least one RTL
   1063    // label. [...]
   1064    // The following rule, consisting of six conditions, applies to labels
   1065    // in BIDI domain names.
   1066    if((mask&R_AL_AN_MASK)!=0) {
   1067        info.isBiDi=true;
   1068    }
   1069 }
   1070 
   1071 // Special code for the ASCII prefix of a BiDi domain name.
   1072 // The ASCII prefix is all-LTR.
   1073 
   1074 // IDNA2008 BiDi rule, parts relevant to ASCII labels:
   1075 // 1. The first character must be a character with BIDI property L [...]
   1076 // 5. In an LTR label, only characters with the BIDI properties L, EN,
   1077 // ES, CS, ET, ON, BN and NSM are allowed.
   1078 // 6. In an LTR label, the end of the label must be a character with
   1079 // BIDI property L or EN [...]
   1080 
   1081 // UTF-16 version, called for mapped ASCII prefix.
   1082 // Cannot contain uppercase A-Z.
   1083 // s[length-1] must be the trailing dot.
   1084 static UBool
   1085 isASCIIOkBiDi(const char16_t *s, int32_t length) {
   1086    int32_t labelStart=0;
   1087    for(int32_t i=0; i<length; ++i) {
   1088        char16_t c=s[i];
   1089        if(c==0x2e) {  // dot
   1090            if(i>labelStart) {
   1091                c=s[i-1];
   1092                if(!(0x61<=c && c<=0x7a) && !(0x30<=c && c<=0x39)) {
   1093                    // Last character in the label is not an L or EN.
   1094                    return false;
   1095                }
   1096            }
   1097            labelStart=i+1;
   1098        } else if(i==labelStart) {
   1099            if(!(0x61<=c && c<=0x7a)) {
   1100                // First character in the label is not an L.
   1101                return false;
   1102            }
   1103        } else {
   1104            if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) {
   1105                // Intermediate character in the label is a B, S or WS.
   1106                return false;
   1107            }
   1108        }
   1109    }
   1110    return true;
   1111 }
   1112 
   1113 // UTF-8 version, called for source ASCII prefix.
   1114 // Can contain uppercase A-Z.
   1115 // s[length-1] must be the trailing dot.
   1116 static UBool
   1117 isASCIIOkBiDi(const char *s, int32_t length) {
   1118    int32_t labelStart=0;
   1119    for(int32_t i=0; i<length; ++i) {
   1120        char c=s[i];
   1121        if(c==0x2e) {  // dot
   1122            if(i>labelStart) {
   1123                c=s[i-1];
   1124                if(!(0x61<=c && c<=0x7a) && !(0x41<=c && c<=0x5a) && !(0x30<=c && c<=0x39)) {
   1125                    // Last character in the label is not an L or EN.
   1126                    return false;
   1127                }
   1128            }
   1129            labelStart=i+1;
   1130        } else if(i==labelStart) {
   1131            if(!(0x61<=c && c<=0x7a) && !(0x41<=c && c<=0x5a)) {
   1132                // First character in the label is not an L.
   1133                return false;
   1134            }
   1135        } else {
   1136            if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) {
   1137                // Intermediate character in the label is a B, S or WS.
   1138                return false;
   1139            }
   1140        }
   1141    }
   1142    return true;
   1143 }
   1144 
   1145 UBool
   1146 UTS46::isLabelOkContextJ(const char16_t *label, int32_t labelLength) const {
   1147    // [IDNA2008-Tables]
   1148    // 200C..200D  ; CONTEXTJ    # ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER
   1149    for(int32_t i=0; i<labelLength; ++i) {
   1150        if(label[i]==0x200c) {
   1151            // Appendix A.1. ZERO WIDTH NON-JOINER
   1152            // Rule Set:
   1153            //  False;
   1154            //  If Canonical_Combining_Class(Before(cp)) .eq.  Virama Then True;
   1155            //  If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C
   1156            //     (Joining_Type:T)*(Joining_Type:{R,D})) Then True;
   1157            if(i==0) {
   1158                return false;
   1159            }
   1160            UChar32 c;
   1161            int32_t j=i;
   1162            U16_PREV_UNSAFE(label, j, c);
   1163            if(uts46Norm2.getCombiningClass(c)==9) {
   1164                continue;
   1165            }
   1166            // check precontext (Joining_Type:{L,D})(Joining_Type:T)*
   1167            for(;;) {
   1168                UJoiningType type=ubidi_getJoiningType(c);
   1169                if(type==U_JT_TRANSPARENT) {
   1170                    if(j==0) {
   1171                        return false;
   1172                    }
   1173                    U16_PREV_UNSAFE(label, j, c);
   1174                } else if(type==U_JT_LEFT_JOINING || type==U_JT_DUAL_JOINING) {
   1175                    break;  // precontext fulfilled
   1176                } else {
   1177                    return false;
   1178                }
   1179            }
   1180            // check postcontext (Joining_Type:T)*(Joining_Type:{R,D})
   1181            for(j=i+1;;) {
   1182                if(j==labelLength) {
   1183                    return false;
   1184                }
   1185                U16_NEXT_UNSAFE(label, j, c);
   1186                UJoiningType type=ubidi_getJoiningType(c);
   1187                if(type==U_JT_TRANSPARENT) {
   1188                    // just skip this character
   1189                } else if(type==U_JT_RIGHT_JOINING || type==U_JT_DUAL_JOINING) {
   1190                    break;  // postcontext fulfilled
   1191                } else {
   1192                    return false;
   1193                }
   1194            }
   1195        } else if(label[i]==0x200d) {
   1196            // Appendix A.2. ZERO WIDTH JOINER (U+200D)
   1197            // Rule Set:
   1198            //  False;
   1199            //  If Canonical_Combining_Class(Before(cp)) .eq.  Virama Then True;
   1200            if(i==0) {
   1201                return false;
   1202            }
   1203            UChar32 c;
   1204            int32_t j=i;
   1205            U16_PREV_UNSAFE(label, j, c);
   1206            if(uts46Norm2.getCombiningClass(c)!=9) {
   1207                return false;
   1208            }
   1209        }
   1210    }
   1211    return true;
   1212 }
   1213 
   1214 void
   1215 UTS46::checkLabelContextO(const char16_t *label, int32_t labelLength, IDNAInfo &info) const {
   1216    int32_t labelEnd=labelLength-1;  // inclusive
   1217    int32_t arabicDigits=0;  // -1 for 066x, +1 for 06Fx
   1218    for(int32_t i=0; i<=labelEnd; ++i) {
   1219        UChar32 c=label[i];
   1220        if(c<0xb7) {
   1221            // ASCII fastpath
   1222        } else if(c<=0x6f9) {
   1223            if(c==0xb7) {
   1224                // Appendix A.3. MIDDLE DOT (U+00B7)
   1225                // Rule Set:
   1226                //  False;
   1227                //  If Before(cp) .eq.  U+006C And
   1228                //     After(cp) .eq.  U+006C Then True;
   1229                if(!(0<i && label[i-1]==0x6c &&
   1230                     i<labelEnd && label[i+1]==0x6c)) {
   1231                    info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION;
   1232                }
   1233            } else if(c==0x375) {
   1234                // Appendix A.4. GREEK LOWER NUMERAL SIGN (KERAIA) (U+0375)
   1235                // Rule Set:
   1236                //  False;
   1237                //  If Script(After(cp)) .eq.  Greek Then True;
   1238                UScriptCode script=USCRIPT_INVALID_CODE;
   1239                if(i<labelEnd) {
   1240                    UErrorCode errorCode=U_ZERO_ERROR;
   1241                    int32_t j=i+1;
   1242                    U16_NEXT(label, j, labelLength, c);
   1243                    script=uscript_getScript(c, &errorCode);
   1244                }
   1245                if(script!=USCRIPT_GREEK) {
   1246                    info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION;
   1247                }
   1248            } else if(c==0x5f3 || c==0x5f4) {
   1249                // Appendix A.5. HEBREW PUNCTUATION GERESH (U+05F3)
   1250                // Rule Set:
   1251                //  False;
   1252                //  If Script(Before(cp)) .eq.  Hebrew Then True;
   1253                //
   1254                // Appendix A.6. HEBREW PUNCTUATION GERSHAYIM (U+05F4)
   1255                // Rule Set:
   1256                //  False;
   1257                //  If Script(Before(cp)) .eq.  Hebrew Then True;
   1258                UScriptCode script=USCRIPT_INVALID_CODE;
   1259                if(0<i) {
   1260                    UErrorCode errorCode=U_ZERO_ERROR;
   1261                    int32_t j=i;
   1262                    U16_PREV(label, 0, j, c);
   1263                    script=uscript_getScript(c, &errorCode);
   1264                }
   1265                if(script!=USCRIPT_HEBREW) {
   1266                    info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION;
   1267                }
   1268            } else if(0x660<=c /* && c<=0x6f9 */) {
   1269                // Appendix A.8. ARABIC-INDIC DIGITS (0660..0669)
   1270                // Rule Set:
   1271                //  True;
   1272                //  For All Characters:
   1273                //    If cp .in. 06F0..06F9 Then False;
   1274                //  End For;
   1275                //
   1276                // Appendix A.9. EXTENDED ARABIC-INDIC DIGITS (06F0..06F9)
   1277                // Rule Set:
   1278                //  True;
   1279                //  For All Characters:
   1280                //    If cp .in. 0660..0669 Then False;
   1281                //  End For;
   1282                if(c<=0x669) {
   1283                    if(arabicDigits>0) {
   1284                        info.labelErrors|=UIDNA_ERROR_CONTEXTO_DIGITS;
   1285                    }
   1286                    arabicDigits=-1;
   1287                } else if(0x6f0<=c) {
   1288                    if(arabicDigits<0) {
   1289                        info.labelErrors|=UIDNA_ERROR_CONTEXTO_DIGITS;
   1290                    }
   1291                    arabicDigits=1;
   1292                }
   1293            }
   1294        } else if(c==0x30fb) {
   1295            // Appendix A.7. KATAKANA MIDDLE DOT (U+30FB)
   1296            // Rule Set:
   1297            //  False;
   1298            //  For All Characters:
   1299            //    If Script(cp) .in. {Hiragana, Katakana, Han} Then True;
   1300            //  End For;
   1301            UErrorCode errorCode=U_ZERO_ERROR;
   1302            for(int j=0;;) {
   1303                if(j>labelEnd) {
   1304                    info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION;
   1305                    break;
   1306                }
   1307                U16_NEXT(label, j, labelLength, c);
   1308                UScriptCode script=uscript_getScript(c, &errorCode);
   1309                if(script==USCRIPT_HIRAGANA || script==USCRIPT_KATAKANA || script==USCRIPT_HAN) {
   1310                    break;
   1311                }
   1312            }
   1313        }
   1314    }
   1315 }
   1316 
   1317 U_NAMESPACE_END
   1318 
   1319 // C API ------------------------------------------------------------------- ***
   1320 
   1321 U_NAMESPACE_USE
   1322 
   1323 U_CAPI UIDNA * U_EXPORT2
   1324 uidna_openUTS46(uint32_t options, UErrorCode *pErrorCode) {
   1325    return reinterpret_cast<UIDNA *>(IDNA::createUTS46Instance(options, *pErrorCode));
   1326 }
   1327 
   1328 U_CAPI void U_EXPORT2
   1329 uidna_close(UIDNA *idna) {
   1330    delete reinterpret_cast<IDNA *>(idna);
   1331 }
   1332 
   1333 static UBool
   1334 checkArgs(const void *label, int32_t length,
   1335          void *dest, int32_t capacity,
   1336          UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
   1337    if(U_FAILURE(*pErrorCode)) {
   1338        return false;
   1339    }
   1340    // sizeof(UIDNAInfo)=16 in the first API version.
   1341    if(pInfo==nullptr || pInfo->size<16) {
   1342        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1343        return false;
   1344    }
   1345    if( (label==nullptr ? length!=0 : length<-1) ||
   1346        (dest==nullptr ? capacity!=0 : capacity<0) ||
   1347        (dest==label && label!=nullptr)
   1348    ) {
   1349        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1350        return false;
   1351    }
   1352    // Set all *pInfo bytes to 0 except for the size field itself.
   1353    uprv_memset(&pInfo->size+1, 0, pInfo->size-sizeof(pInfo->size));
   1354    return true;
   1355 }
   1356 
   1357 static void
   1358 idnaInfoToStruct(IDNAInfo &info, UIDNAInfo *pInfo) {
   1359    pInfo->isTransitionalDifferent=info.isTransitionalDifferent();
   1360    pInfo->errors=info.getErrors();
   1361 }
   1362 
   1363 U_CAPI int32_t U_EXPORT2
   1364 uidna_labelToASCII(const UIDNA *idna,
   1365                   const char16_t *label, int32_t length,
   1366                   char16_t *dest, int32_t capacity,
   1367                   UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
   1368    if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) {
   1369        return 0;
   1370    }
   1371    UnicodeString src(length < 0, label, length);
   1372    UnicodeString destString(dest, 0, capacity);
   1373    IDNAInfo info;
   1374    reinterpret_cast<const IDNA *>(idna)->labelToASCII(src, destString, info, *pErrorCode);
   1375    idnaInfoToStruct(info, pInfo);
   1376    return destString.extract(dest, capacity, *pErrorCode);
   1377 }
   1378 
   1379 U_CAPI int32_t U_EXPORT2
   1380 uidna_labelToUnicode(const UIDNA *idna,
   1381                     const char16_t *label, int32_t length,
   1382                     char16_t *dest, int32_t capacity,
   1383                     UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
   1384    if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) {
   1385        return 0;
   1386    }
   1387    UnicodeString src(length < 0, label, length);
   1388    UnicodeString destString(dest, 0, capacity);
   1389    IDNAInfo info;
   1390    reinterpret_cast<const IDNA *>(idna)->labelToUnicode(src, destString, info, *pErrorCode);
   1391    idnaInfoToStruct(info, pInfo);
   1392    return destString.extract(dest, capacity, *pErrorCode);
   1393 }
   1394 
   1395 U_CAPI int32_t U_EXPORT2
   1396 uidna_nameToASCII(const UIDNA *idna,
   1397                  const char16_t *name, int32_t length,
   1398                  char16_t *dest, int32_t capacity,
   1399                  UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
   1400    if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) {
   1401        return 0;
   1402    }
   1403    UnicodeString src(length < 0, name, length);
   1404    UnicodeString destString(dest, 0, capacity);
   1405    IDNAInfo info;
   1406    reinterpret_cast<const IDNA *>(idna)->nameToASCII(src, destString, info, *pErrorCode);
   1407    idnaInfoToStruct(info, pInfo);
   1408    return destString.extract(dest, capacity, *pErrorCode);
   1409 }
   1410 
   1411 U_CAPI int32_t U_EXPORT2
   1412 uidna_nameToUnicode(const UIDNA *idna,
   1413                    const char16_t *name, int32_t length,
   1414                    char16_t *dest, int32_t capacity,
   1415                    UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
   1416    if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) {
   1417        return 0;
   1418    }
   1419    UnicodeString src(length < 0, name, length);
   1420    UnicodeString destString(dest, 0, capacity);
   1421    IDNAInfo info;
   1422    reinterpret_cast<const IDNA *>(idna)->nameToUnicode(src, destString, info, *pErrorCode);
   1423    idnaInfoToStruct(info, pInfo);
   1424    return destString.extract(dest, capacity, *pErrorCode);
   1425 }
   1426 
   1427 U_CAPI int32_t U_EXPORT2
   1428 uidna_labelToASCII_UTF8(const UIDNA *idna,
   1429                        const char *label, int32_t length,
   1430                        char *dest, int32_t capacity,
   1431                        UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
   1432    if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) {
   1433        return 0;
   1434    }
   1435    StringPiece src(label, length<0 ? static_cast<int32_t>(uprv_strlen(label)) : length);
   1436    return ByteSinkUtil::viaByteSinkToTerminatedChars(
   1437        dest, capacity,
   1438        [&](ByteSink& sink, UErrorCode& status) {
   1439            IDNAInfo info;
   1440            reinterpret_cast<const IDNA *>(idna)->labelToASCII_UTF8(src, sink, info, status);
   1441            idnaInfoToStruct(info, pInfo);
   1442        },
   1443        *pErrorCode);
   1444 }
   1445 
   1446 U_CAPI int32_t U_EXPORT2
   1447 uidna_labelToUnicodeUTF8(const UIDNA *idna,
   1448                         const char *label, int32_t length,
   1449                         char *dest, int32_t capacity,
   1450                         UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
   1451    if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) {
   1452        return 0;
   1453    }
   1454    StringPiece src(label, length<0 ? static_cast<int32_t>(uprv_strlen(label)) : length);
   1455    return ByteSinkUtil::viaByteSinkToTerminatedChars(
   1456        dest, capacity,
   1457        [&](ByteSink& sink, UErrorCode& status) {
   1458            IDNAInfo info;
   1459            reinterpret_cast<const IDNA *>(idna)->labelToUnicodeUTF8(src, sink, info, status);
   1460            idnaInfoToStruct(info, pInfo);
   1461        },
   1462        *pErrorCode);
   1463 }
   1464 
   1465 U_CAPI int32_t U_EXPORT2
   1466 uidna_nameToASCII_UTF8(const UIDNA *idna,
   1467                       const char *name, int32_t length,
   1468                       char *dest, int32_t capacity,
   1469                       UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
   1470    if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) {
   1471        return 0;
   1472    }
   1473    StringPiece src(name, length<0 ? static_cast<int32_t>(uprv_strlen(name)) : length);
   1474    return ByteSinkUtil::viaByteSinkToTerminatedChars(
   1475        dest, capacity,
   1476        [&](ByteSink& sink, UErrorCode& status) {
   1477            IDNAInfo info;
   1478            reinterpret_cast<const IDNA *>(idna)->nameToASCII_UTF8(src, sink, info, status);
   1479            idnaInfoToStruct(info, pInfo);
   1480        },
   1481        *pErrorCode);
   1482 }
   1483 
   1484 U_CAPI int32_t U_EXPORT2
   1485 uidna_nameToUnicodeUTF8(const UIDNA *idna,
   1486                        const char *name, int32_t length,
   1487                        char *dest, int32_t capacity,
   1488                        UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
   1489    if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) {
   1490        return 0;
   1491    }
   1492    StringPiece src(name, length<0 ? static_cast<int32_t>(uprv_strlen(name)) : length);
   1493    return ByteSinkUtil::viaByteSinkToTerminatedChars(
   1494        dest, capacity,
   1495        [&](ByteSink& sink, UErrorCode& status) {
   1496            IDNAInfo info;
   1497            reinterpret_cast<const IDNA *>(idna)->nameToUnicodeUTF8(src, sink, info, status);
   1498            idnaInfoToStruct(info, pInfo);
   1499        },
   1500        *pErrorCode);
   1501 }
   1502 
   1503 #endif  // UCONFIG_NO_IDNA
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE