tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

uspoof_impl.cpp (31721B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (C) 2008-2016, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 */
      9 
     10 #include "unicode/utypes.h"
     11 #include "unicode/uspoof.h"
     12 #include "unicode/uchar.h"
     13 #include "unicode/uniset.h"
     14 #include "unicode/utf16.h"
     15 #include "utrie2.h"
     16 #include "cmemory.h"
     17 #include "cstring.h"
     18 #include "scriptset.h"
     19 #include "umutex.h"
     20 #include "udataswp.h"
     21 #include "uassert.h"
     22 #include "ucln_in.h"
     23 #include "uspoof_impl.h"
     24 
     25 #if !UCONFIG_NO_NORMALIZATION
     26 
     27 
     28 U_NAMESPACE_BEGIN
     29 
     30 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl)
     31 
     32 SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode& status) {
     33    construct(status);
     34    fSpoofData = data;
     35 }
     36 
     37 SpoofImpl::SpoofImpl(UErrorCode& status) {
     38    construct(status);
     39 
     40    // TODO: Call this method where it is actually needed, instead of in the
     41    // constructor, to allow for lazy data loading.  See #12696.
     42    fSpoofData = SpoofData::getDefault(status);
     43 }
     44 
     45 SpoofImpl::SpoofImpl() {
     46    UErrorCode status = U_ZERO_ERROR;
     47    construct(status);
     48 
     49    // TODO: Call this method where it is actually needed, instead of in the
     50    // constructor, to allow for lazy data loading.  See #12696.
     51    fSpoofData = SpoofData::getDefault(status);
     52 }
     53 
     54 void SpoofImpl::construct(UErrorCode& status) {
     55    fChecks = USPOOF_ALL_CHECKS;
     56    fSpoofData = nullptr;
     57    fAllowedCharsSet = nullptr;
     58    fAllowedLocales = nullptr;
     59    fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
     60 
     61    if (U_FAILURE(status)) { return; }
     62 
     63    UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
     64    fAllowedCharsSet = allowedCharsSet;
     65    fAllowedLocales  = uprv_strdup("");
     66    if (fAllowedCharsSet == nullptr || fAllowedLocales == nullptr) {
     67        status = U_MEMORY_ALLOCATION_ERROR;
     68        return;
     69    }
     70    allowedCharsSet->freeze();
     71 }
     72 
     73 
     74 // Copy Constructor, used by the user level clone() function.
     75 SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status)  :
     76        fChecks(USPOOF_ALL_CHECKS), fSpoofData(nullptr), fAllowedCharsSet(nullptr) , 
     77        fAllowedLocales(nullptr) {
     78    if (U_FAILURE(status)) {
     79        return;
     80    }
     81    fChecks = src.fChecks;
     82    if (src.fSpoofData != nullptr) {
     83        fSpoofData = src.fSpoofData->addReference();
     84    }
     85    fAllowedCharsSet = src.fAllowedCharsSet->clone();
     86    fAllowedLocales = uprv_strdup(src.fAllowedLocales);
     87    if (fAllowedCharsSet == nullptr || fAllowedLocales == nullptr) {
     88        status = U_MEMORY_ALLOCATION_ERROR;
     89    }
     90    fRestrictionLevel = src.fRestrictionLevel;
     91 }
     92 
     93 SpoofImpl::~SpoofImpl() {
     94    if (fSpoofData != nullptr) {
     95        fSpoofData->removeReference();   // Will delete if refCount goes to zero.
     96    }
     97    delete fAllowedCharsSet;
     98    uprv_free((void *)fAllowedLocales);
     99 }
    100 
    101 //  Cast this instance as a USpoofChecker for the C API.
    102 USpoofChecker *SpoofImpl::asUSpoofChecker() {
    103    return exportForC();
    104 }
    105 
    106 //
    107 //  Incoming parameter check on Status and the SpoofChecker object
    108 //    received from the C API.
    109 //
    110 const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &status) {
    111    const auto* This = validate(sc, status);
    112    if (U_FAILURE(status)) {
    113        return nullptr;
    114    }
    115    if (This->fSpoofData != nullptr && !This->fSpoofData->validateDataVersion(status)) {
    116        return nullptr;
    117    }
    118    return This;
    119 }
    120 
    121 SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) {
    122    return const_cast<SpoofImpl *>
    123        (SpoofImpl::validateThis(const_cast<const USpoofChecker *>(sc), status));
    124 }
    125 
    126 
    127 void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) {
    128    UnicodeSet    allowedChars;
    129    UnicodeSet    *tmpSet = nullptr;
    130    const char    *locStart = localesList;
    131    const char    *locEnd = nullptr;
    132    const char    *localesListEnd = localesList + uprv_strlen(localesList);
    133    int32_t        localeListCount = 0;   // Number of locales provided by caller.
    134 
    135    // Loop runs once per locale from the localesList, a comma separated list of locales.
    136    do {
    137        locEnd = uprv_strchr(locStart, ',');
    138        if (locEnd == nullptr) {
    139            locEnd = localesListEnd;
    140        }
    141        while (*locStart == ' ') {
    142            locStart++;
    143        }
    144        const char *trimmedEnd = locEnd-1;
    145        while (trimmedEnd > locStart && *trimmedEnd == ' ') {
    146            trimmedEnd--;
    147        }
    148        if (trimmedEnd <= locStart) {
    149            break;
    150        }
    151        const char* locale = uprv_strndup(locStart, static_cast<int32_t>(trimmedEnd + 1 - locStart));
    152        localeListCount++;
    153 
    154        // We have one locale from the locales list.
    155        // Add the script chars for this locale to the accumulating set of allowed chars.
    156        // If the locale is no good, we will be notified back via status.
    157        addScriptChars(locale, &allowedChars, status);
    158        uprv_free((void *)locale);
    159        if (U_FAILURE(status)) {
    160            break;
    161        }
    162        locStart = locEnd + 1;
    163    } while (locStart < localesListEnd);
    164 
    165    // If our caller provided an empty list of locales, we disable the allowed characters checking
    166    if (localeListCount == 0) {
    167        uprv_free((void *)fAllowedLocales);
    168        fAllowedLocales = uprv_strdup("");
    169        tmpSet = new UnicodeSet(0, 0x10ffff);
    170        if (fAllowedLocales == nullptr || tmpSet == nullptr) {
    171            status = U_MEMORY_ALLOCATION_ERROR;
    172            return;
    173        } 
    174        tmpSet->freeze();
    175        delete fAllowedCharsSet;
    176        fAllowedCharsSet = tmpSet;
    177        fChecks &= ~USPOOF_CHAR_LIMIT;
    178        return;
    179    }
    180 
    181        
    182    // Add all common and inherited characters to the set of allowed chars.
    183    UnicodeSet tempSet;
    184    tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
    185    allowedChars.addAll(tempSet);
    186    tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
    187    allowedChars.addAll(tempSet);
    188    
    189    // If anything went wrong, we bail out without changing
    190    // the state of the spoof checker.
    191    if (U_FAILURE(status)) {
    192        return;
    193    }
    194 
    195    // Store the updated spoof checker state.
    196    tmpSet = allowedChars.clone();
    197    const char *tmpLocalesList = uprv_strdup(localesList);
    198    if (tmpSet == nullptr || tmpLocalesList == nullptr) {
    199        status = U_MEMORY_ALLOCATION_ERROR;
    200        return;
    201    }
    202    uprv_free((void *)fAllowedLocales);
    203    fAllowedLocales = tmpLocalesList;
    204    tmpSet->freeze();
    205    delete fAllowedCharsSet;
    206    fAllowedCharsSet = tmpSet;
    207    fChecks |= USPOOF_CHAR_LIMIT;
    208 }
    209 
    210 
    211 const char * SpoofImpl::getAllowedLocales(UErrorCode &/*status*/) {
    212    return fAllowedLocales;
    213 }
    214 
    215 
    216 // Given a locale (a language), add all the characters from all of the scripts used with that language
    217 // to the allowedChars UnicodeSet
    218 
    219 void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status) {
    220    UScriptCode scripts[30];
    221 
    222    int32_t numScripts = uscript_getCode(locale, scripts, UPRV_LENGTHOF(scripts), &status);
    223    if (U_FAILURE(status)) {
    224        return;
    225    }
    226    if (status == U_USING_DEFAULT_WARNING) {
    227        status = U_ILLEGAL_ARGUMENT_ERROR;
    228        return;
    229    }
    230    UnicodeSet tmpSet;
    231    int32_t    i;
    232    for (i=0; i<numScripts; i++) {
    233        tmpSet.applyIntPropertyValue(UCHAR_SCRIPT, scripts[i], status);
    234        allowedChars->addAll(tmpSet);
    235    }
    236 }
    237 
    238 // Computes the augmented script set for a code point, according to UTS 39 section 5.1.
    239 void SpoofImpl::getAugmentedScriptSet(UChar32 codePoint, ScriptSet& result, UErrorCode& status) {
    240    result.resetAll();
    241    result.setScriptExtensions(codePoint, status);
    242    if (U_FAILURE(status)) { return; }
    243 
    244    // Section 5.1 step 1
    245    if (result.test(USCRIPT_HAN, status)) {
    246        result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);
    247        result.set(USCRIPT_JAPANESE, status);
    248        result.set(USCRIPT_KOREAN, status);
    249    }
    250    if (result.test(USCRIPT_HIRAGANA, status)) {
    251        result.set(USCRIPT_JAPANESE, status);
    252    }
    253    if (result.test(USCRIPT_KATAKANA, status)) {
    254        result.set(USCRIPT_JAPANESE, status);
    255    }
    256    if (result.test(USCRIPT_HANGUL, status)) {
    257        result.set(USCRIPT_KOREAN, status);
    258    }
    259    if (result.test(USCRIPT_BOPOMOFO, status)) {
    260        result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);
    261    }
    262 
    263    // Section 5.1 step 2
    264    if (result.test(USCRIPT_COMMON, status) || result.test(USCRIPT_INHERITED, status)) {
    265        result.setAll();
    266    }
    267 }
    268 
    269 // Computes the resolved script set for a string, according to UTS 39 section 5.1.
    270 void SpoofImpl::getResolvedScriptSet(const UnicodeString& input, ScriptSet& result, UErrorCode& status) const {
    271    getResolvedScriptSetWithout(input, USCRIPT_CODE_LIMIT, result, status);
    272 }
    273 
    274 // Computes the resolved script set for a string, omitting characters having the specified script.
    275 // If USCRIPT_CODE_LIMIT is passed as the second argument, all characters are included.
    276 void SpoofImpl::getResolvedScriptSetWithout(const UnicodeString& input, UScriptCode script, ScriptSet& result, UErrorCode& status) const {
    277    result.setAll();
    278 
    279    ScriptSet temp;
    280    UChar32 codePoint;
    281    for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {
    282        codePoint = input.char32At(i);
    283 
    284        // Compute the augmented script set for the character
    285        getAugmentedScriptSet(codePoint, temp, status);
    286        if (U_FAILURE(status)) { return; }
    287 
    288        // Intersect the augmented script set with the resolved script set, but only if the character doesn't
    289        // have the script specified in the function call
    290        if (script == USCRIPT_CODE_LIMIT || !temp.test(script, status)) {
    291            result.intersect(temp);
    292        }
    293    }
    294 }
    295 
    296 // Computes the set of numerics for a string, according to UTS 39 section 5.3.
    297 void SpoofImpl::getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& /*status*/) const {
    298    result.clear();
    299 
    300    UChar32 codePoint;
    301    for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {
    302        codePoint = input.char32At(i);
    303 
    304        // Store a representative character for each kind of decimal digit
    305        if (u_charType(codePoint) == U_DECIMAL_DIGIT_NUMBER) {
    306            // Store the zero character as a representative for comparison.
    307            // Unicode guarantees it is codePoint - value
    308            result.add(codePoint - static_cast<UChar32>(u_getNumericValue(codePoint)));
    309        }
    310    }
    311 }
    312 
    313 // Computes the restriction level of a string, according to UTS 39 section 5.2.
    314 URestrictionLevel SpoofImpl::getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const {
    315    // Section 5.2 step 1:
    316    if (!fAllowedCharsSet->containsAll(input)) {
    317        return USPOOF_UNRESTRICTIVE;
    318    }
    319 
    320    // Section 5.2 step 2
    321    // Java use a static UnicodeSet for this test.  In C++, avoid the static variable
    322    // and just do a simple for loop.
    323    UBool allASCII = true;
    324    for (int32_t i=0, length=input.length(); i<length; i++) {
    325        if (input.charAt(i) > 0x7f) {
    326            allASCII = false;
    327            break;
    328        }
    329    }
    330    if (allASCII) {
    331        return USPOOF_ASCII;
    332    }
    333 
    334    // Section 5.2 steps 3:
    335    ScriptSet resolvedScriptSet;
    336    getResolvedScriptSet(input, resolvedScriptSet, status);
    337    if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; }
    338 
    339    // Section 5.2 step 4:
    340    if (!resolvedScriptSet.isEmpty()) {
    341        return USPOOF_SINGLE_SCRIPT_RESTRICTIVE;
    342    }
    343 
    344    // Section 5.2 step 5:
    345    ScriptSet resolvedNoLatn;
    346    getResolvedScriptSetWithout(input, USCRIPT_LATIN, resolvedNoLatn, status);
    347    if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; }
    348 
    349    // Section 5.2 step 6:
    350    if (resolvedNoLatn.test(USCRIPT_HAN_WITH_BOPOMOFO, status)
    351            || resolvedNoLatn.test(USCRIPT_JAPANESE, status)
    352            || resolvedNoLatn.test(USCRIPT_KOREAN, status)) {
    353        return USPOOF_HIGHLY_RESTRICTIVE;
    354    }
    355 
    356    // Section 5.2 step 7:
    357    if (!resolvedNoLatn.isEmpty()
    358            && !resolvedNoLatn.test(USCRIPT_CYRILLIC, status)
    359            && !resolvedNoLatn.test(USCRIPT_GREEK, status)
    360            && !resolvedNoLatn.test(USCRIPT_CHEROKEE, status)) {
    361        return USPOOF_MODERATELY_RESTRICTIVE;
    362    }
    363 
    364    // Section 5.2 step 8:
    365    return USPOOF_MINIMALLY_RESTRICTIVE;
    366 }
    367 
    368 int32_t SpoofImpl::findHiddenOverlay(const UnicodeString& input, UErrorCode&) const {
    369    bool sawLeadCharacter = false;
    370    for (int32_t i=0; i<input.length();) {
    371        UChar32 cp = input.char32At(i);
    372        if (sawLeadCharacter && cp == 0x0307) {
    373            return i;
    374        }
    375        uint8_t combiningClass = u_getCombiningClass(cp);
    376        // Skip over characters except for those with combining class 0 (non-combining characters) or with
    377        // combining class 230 (same class as U+0307)
    378        U_ASSERT(u_getCombiningClass(0x0307) == 230);
    379        if (combiningClass == 0 || combiningClass == 230) {
    380            sawLeadCharacter = isIllegalCombiningDotLeadCharacter(cp);
    381        }
    382        i += U16_LENGTH(cp);
    383    }
    384    return -1;
    385 }
    386 
    387 static inline bool isIllegalCombiningDotLeadCharacterNoLookup(UChar32 cp) {
    388    return cp == u'i' || cp == u'j' || cp == u'ı' || cp == u'ȷ' || cp == u'l' ||
    389           u_hasBinaryProperty(cp, UCHAR_SOFT_DOTTED);
    390 }
    391 
    392 bool SpoofImpl::isIllegalCombiningDotLeadCharacter(UChar32 cp) const {
    393    if (isIllegalCombiningDotLeadCharacterNoLookup(cp)) {
    394        return true;
    395    }
    396    UnicodeString skelStr;
    397    fSpoofData->confusableLookup(cp, skelStr);
    398    UChar32 finalCp = skelStr.char32At(skelStr.moveIndex32(skelStr.length(), -1));
    399    if (finalCp != cp && isIllegalCombiningDotLeadCharacterNoLookup(finalCp)) {
    400        return true;
    401    }
    402    return false;
    403 }
    404 
    405 
    406 
    407 // Convert a text format hex number.  Utility function used by builder code.  Static.
    408 // Input: char16_t *string text.  Output: a UChar32
    409 // Input has been pre-checked, and will have no non-hex chars.
    410 // The number must fall in the code point range of 0..0x10ffff
    411 // Static Function.
    412 UChar32 SpoofImpl::ScanHex(const char16_t *s, int32_t start, int32_t limit, UErrorCode &status) {
    413    if (U_FAILURE(status)) {
    414        return 0;
    415    }
    416    U_ASSERT(limit-start > 0);
    417    uint32_t val = 0;
    418    int i;
    419    for (i=start; i<limit; i++) {
    420        int digitVal = s[i] - 0x30;
    421        if (digitVal>9) {
    422            digitVal = 0xa + (s[i] - 0x41);  // Upper Case 'A'
    423        }
    424        if (digitVal>15) {
    425            digitVal = 0xa + (s[i] - 0x61);  // Lower Case 'a'
    426        }
    427        U_ASSERT(digitVal <= 0xf);
    428        val <<= 4;
    429        val += digitVal;
    430    }
    431    if (val > 0x10ffff) {
    432        status = U_PARSE_ERROR;
    433        val = 0;
    434    }
    435    return static_cast<UChar32>(val);
    436 }
    437 
    438 
    439 //-----------------------------------------
    440 //
    441 //   class CheckResult Implementation
    442 //
    443 //-----------------------------------------
    444 
    445 CheckResult::CheckResult() {
    446    clear();
    447 }
    448 
    449 USpoofCheckResult* CheckResult::asUSpoofCheckResult() {
    450    return exportForC();
    451 }
    452 
    453 //
    454 //  Incoming parameter check on Status and the CheckResult object
    455 //    received from the C API.
    456 //
    457 const CheckResult* CheckResult::validateThis(const USpoofCheckResult *ptr, UErrorCode &status) {
    458    return validate(ptr, status);
    459 }
    460 
    461 CheckResult* CheckResult::validateThis(USpoofCheckResult *ptr, UErrorCode &status) {
    462    return validate(ptr, status);
    463 }
    464 
    465 void CheckResult::clear() {
    466    fChecks = 0;
    467    fNumerics.clear();
    468    fRestrictionLevel = USPOOF_UNDEFINED_RESTRICTIVE;
    469 }
    470 
    471 int32_t CheckResult::toCombinedBitmask(int32_t enabledChecks) {
    472    if ((enabledChecks & USPOOF_AUX_INFO) != 0 && fRestrictionLevel != USPOOF_UNDEFINED_RESTRICTIVE) {
    473        return fChecks | fRestrictionLevel;
    474    } else {
    475        return fChecks;
    476    }
    477 }
    478 
    479 CheckResult::~CheckResult() {
    480 }
    481 
    482 //----------------------------------------------------------------------------------------------
    483 //
    484 //   class SpoofData Implementation
    485 //
    486 //----------------------------------------------------------------------------------------------
    487 
    488 
    489 UBool SpoofData::validateDataVersion(UErrorCode &status) const {
    490    if (U_FAILURE(status) ||
    491        fRawData == nullptr ||
    492        fRawData->fMagic != USPOOF_MAGIC ||
    493        fRawData->fFormatVersion[0] != USPOOF_CONFUSABLE_DATA_FORMAT_VERSION ||
    494        fRawData->fFormatVersion[1] != 0 ||
    495        fRawData->fFormatVersion[2] != 0 ||
    496        fRawData->fFormatVersion[3] != 0) {
    497            status = U_INVALID_FORMAT_ERROR;
    498            return false;
    499    }
    500    return true;
    501 }
    502 
    503 static UBool U_CALLCONV
    504 spoofDataIsAcceptable(void *context,
    505                        const char * /* type */, const char * /*name*/,
    506                        const UDataInfo *pInfo) {
    507    if(
    508        pInfo->size >= 20 &&
    509        pInfo->isBigEndian == U_IS_BIG_ENDIAN &&
    510        pInfo->charsetFamily == U_CHARSET_FAMILY &&
    511        pInfo->dataFormat[0] == 0x43 &&  // dataFormat="Cfu "
    512        pInfo->dataFormat[1] == 0x66 &&
    513        pInfo->dataFormat[2] == 0x75 &&
    514        pInfo->dataFormat[3] == 0x20 &&
    515        pInfo->formatVersion[0] == USPOOF_CONFUSABLE_DATA_FORMAT_VERSION
    516    ) {
    517        UVersionInfo *version = static_cast<UVersionInfo *>(context);
    518        if(version != nullptr) {
    519            uprv_memcpy(version, pInfo->dataVersion, 4);
    520        }
    521        return true;
    522    } else {
    523        return false;
    524    }
    525 }
    526 
    527 //  Methods for the loading of the default confusables data file.  The confusable
    528 //  data is loaded only when it is needed.
    529 //
    530 //  SpoofData::getDefault() - Return the default confusables data, and call the
    531 //                            initOnce() if it is not available.  Adds a reference
    532 //                            to the SpoofData that the caller is responsible for
    533 //                            decrementing when they are done with the data.
    534 //
    535 //  uspoof_loadDefaultData - Called once, from initOnce().  The resulting SpoofData
    536 //                           is shared by all spoof checkers using the default data.
    537 //
    538 //  uspoof_cleanupDefaultData - Called during cleanup.
    539 //
    540 
    541 static UInitOnce gSpoofInitDefaultOnce {};
    542 static SpoofData* gDefaultSpoofData;
    543 
    544 static UBool U_CALLCONV
    545 uspoof_cleanupDefaultData() {
    546    if (gDefaultSpoofData) {
    547        // Will delete, assuming all user-level spoof checkers were closed.
    548        gDefaultSpoofData->removeReference();
    549        gDefaultSpoofData = nullptr;
    550        gSpoofInitDefaultOnce.reset();
    551    }
    552    return true;
    553 }
    554 
    555 static void U_CALLCONV uspoof_loadDefaultData(UErrorCode& status) {
    556    UDataMemory *udm = udata_openChoice(nullptr, "cfu", "confusables",
    557                                        spoofDataIsAcceptable, 
    558                                        nullptr,       // context, would receive dataVersion if supplied.
    559                                        &status);
    560    if (U_FAILURE(status)) { return; }
    561    gDefaultSpoofData = new SpoofData(udm, status);
    562    if (U_FAILURE(status)) {
    563        delete gDefaultSpoofData;
    564        gDefaultSpoofData = nullptr;
    565        return;
    566    }
    567    if (gDefaultSpoofData == nullptr) {
    568        status = U_MEMORY_ALLOCATION_ERROR;
    569        return;
    570    }
    571    ucln_i18n_registerCleanup(UCLN_I18N_SPOOFDATA, uspoof_cleanupDefaultData);
    572 }
    573 
    574 SpoofData* SpoofData::getDefault(UErrorCode& status) {
    575    umtx_initOnce(gSpoofInitDefaultOnce, &uspoof_loadDefaultData, status);
    576    if (U_FAILURE(status)) { return nullptr; }
    577    gDefaultSpoofData->addReference();
    578    return gDefaultSpoofData;
    579 }
    580 
    581 
    582 
    583 SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status)
    584 {
    585    reset();
    586    if (U_FAILURE(status)) {
    587        return;
    588    }
    589    fUDM = udm;
    590    // fRawData is non-const because it may be constructed by the data builder.
    591    fRawData = reinterpret_cast<SpoofDataHeader *>(
    592            const_cast<void *>(udata_getMemory(udm)));
    593    validateDataVersion(status);
    594    initPtrs(status);
    595 }
    596 
    597 
    598 SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status)
    599 {
    600    reset();
    601    if (U_FAILURE(status)) {
    602        return;
    603    }
    604    if (static_cast<size_t>(length) < sizeof(SpoofDataHeader)) {
    605        status = U_INVALID_FORMAT_ERROR;
    606        return;
    607    }
    608    if (data == nullptr) {
    609        status = U_ILLEGAL_ARGUMENT_ERROR;
    610        return;
    611    }
    612    void *ncData = const_cast<void *>(data);
    613    fRawData = static_cast<SpoofDataHeader *>(ncData);
    614    if (length < fRawData->fLength) {
    615        status = U_INVALID_FORMAT_ERROR;
    616        return;
    617    }
    618    validateDataVersion(status);
    619    initPtrs(status);
    620 }
    621 
    622 
    623 // Spoof Data constructor for use from data builder.
    624 //   Initializes a new, empty data area that will be populated later.
    625 SpoofData::SpoofData(UErrorCode &status) {
    626    reset();
    627    if (U_FAILURE(status)) {
    628        return;
    629    }
    630    fDataOwned = true;
    631 
    632    // The spoof header should already be sized to be a multiple of 16 bytes.
    633    // Just in case it's not, round it up.
    634    uint32_t initialSize = (sizeof(SpoofDataHeader) + 15) & ~15;
    635    U_ASSERT(initialSize == sizeof(SpoofDataHeader));
    636    
    637    fRawData = static_cast<SpoofDataHeader *>(uprv_malloc(initialSize));
    638    fMemLimit = initialSize;
    639    if (fRawData == nullptr) {
    640        status = U_MEMORY_ALLOCATION_ERROR;
    641        return;
    642    }
    643    uprv_memset(fRawData, 0, initialSize);
    644 
    645    fRawData->fMagic = USPOOF_MAGIC;
    646    fRawData->fFormatVersion[0] = USPOOF_CONFUSABLE_DATA_FORMAT_VERSION;
    647    fRawData->fFormatVersion[1] = 0;
    648    fRawData->fFormatVersion[2] = 0;
    649    fRawData->fFormatVersion[3] = 0;
    650    initPtrs(status);
    651 }
    652 
    653 // reset() - initialize all fields.
    654 //           Should be updated if any new fields are added.
    655 //           Called by constructors to put things in a known initial state.
    656 void SpoofData::reset() {
    657   fRawData = nullptr;
    658   fDataOwned = false;
    659   fUDM      = nullptr;
    660   fMemLimit = 0;
    661   fRefCount = 1;
    662   fCFUKeys = nullptr;
    663   fCFUValues = nullptr;
    664   fCFUStrings = nullptr;
    665 }
    666 
    667 
    668 //  SpoofData::initPtrs()
    669 //            Initialize the pointers to the various sections of the raw data.
    670 //
    671 //            This function is used both during the Trie building process (multiple
    672 //            times, as the individual data sections are added), and
    673 //            during the opening of a Spoof Checker from prebuilt data.
    674 //
    675 //            The pointers for non-existent data sections (identified by an offset of 0)
    676 //            are set to nullptr.
    677 //
    678 //            Note:  During building the data, adding each new data section
    679 //            reallocs the raw data area, which likely relocates it, which
    680 //            in turn requires reinitializing all of the pointers into it, hence
    681 //            multiple calls to this function during building.
    682 //
    683 void SpoofData::initPtrs(UErrorCode &status) {
    684    fCFUKeys = nullptr;
    685    fCFUValues = nullptr;
    686    fCFUStrings = nullptr;
    687    if (U_FAILURE(status)) {
    688        return;
    689    }
    690    if (fRawData->fCFUKeys != 0) {
    691        fCFUKeys = reinterpret_cast<int32_t*>(reinterpret_cast<char*>(fRawData) + fRawData->fCFUKeys);
    692    }
    693    if (fRawData->fCFUStringIndex != 0) {
    694        fCFUValues = reinterpret_cast<uint16_t*>(reinterpret_cast<char*>(fRawData) + fRawData->fCFUStringIndex);
    695    }
    696    if (fRawData->fCFUStringTable != 0) {
    697        fCFUStrings = reinterpret_cast<char16_t*>(reinterpret_cast<char*>(fRawData) + fRawData->fCFUStringTable);
    698    }
    699 }
    700 
    701 
    702 SpoofData::~SpoofData() {
    703    if (fDataOwned) {
    704        uprv_free(fRawData);
    705    }
    706    fRawData = nullptr;
    707    if (fUDM != nullptr) {
    708        udata_close(fUDM);
    709    }
    710    fUDM = nullptr;
    711 }
    712 
    713 
    714 void SpoofData::removeReference() {
    715    if (umtx_atomic_dec(&fRefCount) == 0) {
    716        delete this;
    717    }
    718 }
    719 
    720 
    721 SpoofData *SpoofData::addReference() {
    722    umtx_atomic_inc(&fRefCount);
    723    return this;
    724 }
    725 
    726 
    727 void *SpoofData::reserveSpace(int32_t numBytes,  UErrorCode &status) {
    728    if (U_FAILURE(status)) {
    729        return nullptr;
    730    }
    731    if (!fDataOwned) {
    732        UPRV_UNREACHABLE_EXIT;
    733    }
    734 
    735    numBytes = (numBytes + 15) & ~15;   // Round up to a multiple of 16
    736    uint32_t returnOffset = fMemLimit;
    737    fMemLimit += numBytes;
    738    fRawData = static_cast<SpoofDataHeader *>(uprv_realloc(fRawData, fMemLimit));
    739    fRawData->fLength = fMemLimit;
    740    uprv_memset((char *)fRawData + returnOffset, 0, numBytes);
    741    initPtrs(status);
    742    return reinterpret_cast<char*>(fRawData) + returnOffset;
    743 }
    744 
    745 int32_t SpoofData::serialize(void *buf, int32_t capacity, UErrorCode &status) const {
    746    int32_t dataSize = fRawData->fLength;
    747    if (capacity < dataSize) {
    748        status = U_BUFFER_OVERFLOW_ERROR;
    749        return dataSize;
    750    }
    751    uprv_memcpy(buf, fRawData, dataSize);
    752    return dataSize;
    753 }
    754 
    755 int32_t SpoofData::size() const {
    756    return fRawData->fLength;
    757 }
    758 
    759 //-------------------------------
    760 //
    761 // Front-end APIs for SpoofData
    762 //
    763 //-------------------------------
    764 
    765 int32_t SpoofData::confusableLookup(UChar32 inChar, UnicodeString &dest) const {
    766    // Perform a binary search.
    767    // [lo, hi), i.e lo is inclusive, hi is exclusive.
    768    // The result after the loop will be in lo.
    769    int32_t lo = 0;
    770    int32_t hi = length();
    771    do {
    772        int32_t mid = (lo + hi) / 2;
    773        if (codePointAt(mid) > inChar) {
    774            hi = mid;
    775        } else if (codePointAt(mid) < inChar) {
    776            lo = mid;
    777        } else {
    778            // Found result.  Break early.
    779            lo = mid;
    780            break;
    781        }
    782    } while (hi - lo > 1);
    783 
    784    // Did we find an entry?  If not, the char maps to itself.
    785    if (codePointAt(lo) != inChar) {
    786        dest.append(inChar);
    787        return 1;
    788    }
    789 
    790    // Add the element to the string builder and return.
    791    return appendValueTo(lo, dest);
    792 }
    793 
    794 int32_t SpoofData::length() const {
    795    return fRawData->fCFUKeysSize;
    796 }
    797 
    798 UChar32 SpoofData::codePointAt(int32_t index) const {
    799    return ConfusableDataUtils::keyToCodePoint(fCFUKeys[index]);
    800 }
    801 
    802 int32_t SpoofData::appendValueTo(int32_t index, UnicodeString& dest) const {
    803    int32_t stringLength = ConfusableDataUtils::keyToLength(fCFUKeys[index]);
    804 
    805    // Value is either a char (for strings of length 1) or
    806    // an index into the string table (for longer strings)
    807    uint16_t value = fCFUValues[index];
    808    if (stringLength == 1) {
    809        dest.append(static_cast<char16_t>(value));
    810    } else {
    811        dest.append(fCFUStrings + value, stringLength);
    812    }
    813 
    814    return stringLength;
    815 }
    816 
    817 
    818 U_NAMESPACE_END
    819 
    820 U_NAMESPACE_USE
    821 
    822 //-----------------------------------------------------------------------------
    823 //
    824 //  uspoof_swap   -  byte swap and char encoding swap of spoof data
    825 //
    826 //-----------------------------------------------------------------------------
    827 U_CAPI int32_t U_EXPORT2
    828 uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
    829           UErrorCode *status) {
    830 
    831    if (status == nullptr || U_FAILURE(*status)) {
    832        return 0;
    833    }
    834    if(ds==nullptr || inData==nullptr || length<-1 || (length>0 && outData==nullptr)) {
    835        *status=U_ILLEGAL_ARGUMENT_ERROR;
    836        return 0;
    837    }
    838 
    839    //
    840    //  Check that the data header is for spoof data.
    841    //    (Header contents are defined in gencfu.cpp)
    842    //
    843    const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4);
    844    if(!(  pInfo->dataFormat[0]==0x43 &&   /* dataFormat="Cfu " */
    845           pInfo->dataFormat[1]==0x66 &&
    846           pInfo->dataFormat[2]==0x75 &&
    847           pInfo->dataFormat[3]==0x20 &&
    848           pInfo->formatVersion[0]==USPOOF_CONFUSABLE_DATA_FORMAT_VERSION &&
    849           pInfo->formatVersion[1]==0 &&
    850           pInfo->formatVersion[2]==0 &&
    851           pInfo->formatVersion[3]==0  )) {
    852        udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x "
    853                             "(format version %02x %02x %02x %02x) is not recognized\n",
    854                         pInfo->dataFormat[0], pInfo->dataFormat[1],
    855                         pInfo->dataFormat[2], pInfo->dataFormat[3],
    856                         pInfo->formatVersion[0], pInfo->formatVersion[1],
    857                         pInfo->formatVersion[2], pInfo->formatVersion[3]);
    858        *status=U_UNSUPPORTED_ERROR;
    859        return 0;
    860    }
    861 
    862    //
    863    // Swap the data header.  (This is the generic ICU Data Header, not the uspoof Specific
    864    //                         header).  This swap also conveniently gets us
    865    //                         the size of the ICU d.h., which lets us locate the start
    866    //                         of the uspoof specific data.
    867    //
    868    int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
    869 
    870 
    871    //
    872    // Get the Spoof Data Header, and check that it appears to be OK.
    873    //
    874    //
    875    const uint8_t   *inBytes =(const uint8_t *)inData+headerSize;
    876    SpoofDataHeader *spoofDH = (SpoofDataHeader *)inBytes;
    877    if (ds->readUInt32(spoofDH->fMagic)   != USPOOF_MAGIC ||
    878        ds->readUInt32(spoofDH->fLength)  <  sizeof(SpoofDataHeader)) 
    879    {
    880        udata_printError(ds, "uspoof_swap(): Spoof Data header is invalid.\n");
    881        *status=U_UNSUPPORTED_ERROR;
    882        return 0;
    883    }
    884 
    885    //
    886    // Prefight operation?  Just return the size
    887    //
    888    int32_t spoofDataLength = ds->readUInt32(spoofDH->fLength);
    889    int32_t totalSize = headerSize + spoofDataLength;
    890    if (length < 0) {
    891        return totalSize;
    892    }
    893 
    894    //
    895    // Check that length passed in is consistent with length from Spoof data header.
    896    //
    897    if (length < totalSize) {
    898        udata_printError(ds, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n",
    899                            spoofDataLength);
    900        *status=U_INDEX_OUTOFBOUNDS_ERROR;
    901        return 0;
    902        }
    903 
    904 
    905    //
    906    // Swap the Data.  Do the data itself first, then the Spoof Data Header, because
    907    //                 we need to reference the header to locate the data, and an
    908    //                 inplace swap of the header leaves it unusable.
    909    //
    910    uint8_t          *outBytes = (uint8_t *)outData + headerSize;
    911    SpoofDataHeader  *outputDH = (SpoofDataHeader *)outBytes;
    912 
    913    int32_t   sectionStart;
    914    int32_t   sectionLength;
    915 
    916    //
    917    // If not swapping in place, zero out the output buffer before starting.
    918    //    Gaps may exist between the individual sections, and these must be zeroed in
    919    //    the output buffer.  The simplest way to do that is to just zero the whole thing.
    920    //
    921    if (inBytes != outBytes) {
    922        uprv_memset(outBytes, 0, spoofDataLength);
    923    }
    924 
    925    // Confusables Keys Section   (fCFUKeys)
    926    sectionStart  = ds->readUInt32(spoofDH->fCFUKeys);
    927    sectionLength = ds->readUInt32(spoofDH->fCFUKeysSize) * 4;
    928    ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
    929 
    930    // String Index Section
    931    sectionStart  = ds->readUInt32(spoofDH->fCFUStringIndex);
    932    sectionLength = ds->readUInt32(spoofDH->fCFUStringIndexSize) * 2;
    933    ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
    934 
    935    // String Table Section
    936    sectionStart  = ds->readUInt32(spoofDH->fCFUStringTable);
    937    sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2;
    938    ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
    939 
    940    // And, last, swap the header itself.
    941    //   int32_t   fMagic             // swap this
    942    //   uint8_t   fFormatVersion[4]  // Do not swap this, just copy
    943    //   int32_t   fLength and all the rest       // Swap the rest, all is 32 bit stuff.
    944    //
    945    uint32_t magic = ds->readUInt32(spoofDH->fMagic);
    946    ds->writeUInt32((uint32_t *)&outputDH->fMagic, magic);
    947 
    948    if (inBytes != outBytes) {
    949        uprv_memcpy(outputDH->fFormatVersion, spoofDH->fFormatVersion, sizeof(spoofDH->fFormatVersion));
    950    }
    951    // swap starting at fLength
    952    ds->swapArray32(ds, &spoofDH->fLength, sizeof(SpoofDataHeader)-8 /* minus magic and fFormatVersion[4] */, &outputDH->fLength, status);
    953 
    954    return totalSize;
    955 }
    956 
    957 #endif