uspoof_impl.cpp (31721B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (C) 2008-2016, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 */ 9 10 #include "unicode/utypes.h" 11 #include "unicode/uspoof.h" 12 #include "unicode/uchar.h" 13 #include "unicode/uniset.h" 14 #include "unicode/utf16.h" 15 #include "utrie2.h" 16 #include "cmemory.h" 17 #include "cstring.h" 18 #include "scriptset.h" 19 #include "umutex.h" 20 #include "udataswp.h" 21 #include "uassert.h" 22 #include "ucln_in.h" 23 #include "uspoof_impl.h" 24 25 #if !UCONFIG_NO_NORMALIZATION 26 27 28 U_NAMESPACE_BEGIN 29 30 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl) 31 32 SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode& status) { 33 construct(status); 34 fSpoofData = data; 35 } 36 37 SpoofImpl::SpoofImpl(UErrorCode& status) { 38 construct(status); 39 40 // TODO: Call this method where it is actually needed, instead of in the 41 // constructor, to allow for lazy data loading. See #12696. 42 fSpoofData = SpoofData::getDefault(status); 43 } 44 45 SpoofImpl::SpoofImpl() { 46 UErrorCode status = U_ZERO_ERROR; 47 construct(status); 48 49 // TODO: Call this method where it is actually needed, instead of in the 50 // constructor, to allow for lazy data loading. See #12696. 51 fSpoofData = SpoofData::getDefault(status); 52 } 53 54 void SpoofImpl::construct(UErrorCode& status) { 55 fChecks = USPOOF_ALL_CHECKS; 56 fSpoofData = nullptr; 57 fAllowedCharsSet = nullptr; 58 fAllowedLocales = nullptr; 59 fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE; 60 61 if (U_FAILURE(status)) { return; } 62 63 UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff); 64 fAllowedCharsSet = allowedCharsSet; 65 fAllowedLocales = uprv_strdup(""); 66 if (fAllowedCharsSet == nullptr || fAllowedLocales == nullptr) { 67 status = U_MEMORY_ALLOCATION_ERROR; 68 return; 69 } 70 allowedCharsSet->freeze(); 71 } 72 73 74 // Copy Constructor, used by the user level clone() function. 75 SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) : 76 fChecks(USPOOF_ALL_CHECKS), fSpoofData(nullptr), fAllowedCharsSet(nullptr) , 77 fAllowedLocales(nullptr) { 78 if (U_FAILURE(status)) { 79 return; 80 } 81 fChecks = src.fChecks; 82 if (src.fSpoofData != nullptr) { 83 fSpoofData = src.fSpoofData->addReference(); 84 } 85 fAllowedCharsSet = src.fAllowedCharsSet->clone(); 86 fAllowedLocales = uprv_strdup(src.fAllowedLocales); 87 if (fAllowedCharsSet == nullptr || fAllowedLocales == nullptr) { 88 status = U_MEMORY_ALLOCATION_ERROR; 89 } 90 fRestrictionLevel = src.fRestrictionLevel; 91 } 92 93 SpoofImpl::~SpoofImpl() { 94 if (fSpoofData != nullptr) { 95 fSpoofData->removeReference(); // Will delete if refCount goes to zero. 96 } 97 delete fAllowedCharsSet; 98 uprv_free((void *)fAllowedLocales); 99 } 100 101 // Cast this instance as a USpoofChecker for the C API. 102 USpoofChecker *SpoofImpl::asUSpoofChecker() { 103 return exportForC(); 104 } 105 106 // 107 // Incoming parameter check on Status and the SpoofChecker object 108 // received from the C API. 109 // 110 const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &status) { 111 const auto* This = validate(sc, status); 112 if (U_FAILURE(status)) { 113 return nullptr; 114 } 115 if (This->fSpoofData != nullptr && !This->fSpoofData->validateDataVersion(status)) { 116 return nullptr; 117 } 118 return This; 119 } 120 121 SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) { 122 return const_cast<SpoofImpl *> 123 (SpoofImpl::validateThis(const_cast<const USpoofChecker *>(sc), status)); 124 } 125 126 127 void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) { 128 UnicodeSet allowedChars; 129 UnicodeSet *tmpSet = nullptr; 130 const char *locStart = localesList; 131 const char *locEnd = nullptr; 132 const char *localesListEnd = localesList + uprv_strlen(localesList); 133 int32_t localeListCount = 0; // Number of locales provided by caller. 134 135 // Loop runs once per locale from the localesList, a comma separated list of locales. 136 do { 137 locEnd = uprv_strchr(locStart, ','); 138 if (locEnd == nullptr) { 139 locEnd = localesListEnd; 140 } 141 while (*locStart == ' ') { 142 locStart++; 143 } 144 const char *trimmedEnd = locEnd-1; 145 while (trimmedEnd > locStart && *trimmedEnd == ' ') { 146 trimmedEnd--; 147 } 148 if (trimmedEnd <= locStart) { 149 break; 150 } 151 const char* locale = uprv_strndup(locStart, static_cast<int32_t>(trimmedEnd + 1 - locStart)); 152 localeListCount++; 153 154 // We have one locale from the locales list. 155 // Add the script chars for this locale to the accumulating set of allowed chars. 156 // If the locale is no good, we will be notified back via status. 157 addScriptChars(locale, &allowedChars, status); 158 uprv_free((void *)locale); 159 if (U_FAILURE(status)) { 160 break; 161 } 162 locStart = locEnd + 1; 163 } while (locStart < localesListEnd); 164 165 // If our caller provided an empty list of locales, we disable the allowed characters checking 166 if (localeListCount == 0) { 167 uprv_free((void *)fAllowedLocales); 168 fAllowedLocales = uprv_strdup(""); 169 tmpSet = new UnicodeSet(0, 0x10ffff); 170 if (fAllowedLocales == nullptr || tmpSet == nullptr) { 171 status = U_MEMORY_ALLOCATION_ERROR; 172 return; 173 } 174 tmpSet->freeze(); 175 delete fAllowedCharsSet; 176 fAllowedCharsSet = tmpSet; 177 fChecks &= ~USPOOF_CHAR_LIMIT; 178 return; 179 } 180 181 182 // Add all common and inherited characters to the set of allowed chars. 183 UnicodeSet tempSet; 184 tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status); 185 allowedChars.addAll(tempSet); 186 tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status); 187 allowedChars.addAll(tempSet); 188 189 // If anything went wrong, we bail out without changing 190 // the state of the spoof checker. 191 if (U_FAILURE(status)) { 192 return; 193 } 194 195 // Store the updated spoof checker state. 196 tmpSet = allowedChars.clone(); 197 const char *tmpLocalesList = uprv_strdup(localesList); 198 if (tmpSet == nullptr || tmpLocalesList == nullptr) { 199 status = U_MEMORY_ALLOCATION_ERROR; 200 return; 201 } 202 uprv_free((void *)fAllowedLocales); 203 fAllowedLocales = tmpLocalesList; 204 tmpSet->freeze(); 205 delete fAllowedCharsSet; 206 fAllowedCharsSet = tmpSet; 207 fChecks |= USPOOF_CHAR_LIMIT; 208 } 209 210 211 const char * SpoofImpl::getAllowedLocales(UErrorCode &/*status*/) { 212 return fAllowedLocales; 213 } 214 215 216 // Given a locale (a language), add all the characters from all of the scripts used with that language 217 // to the allowedChars UnicodeSet 218 219 void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status) { 220 UScriptCode scripts[30]; 221 222 int32_t numScripts = uscript_getCode(locale, scripts, UPRV_LENGTHOF(scripts), &status); 223 if (U_FAILURE(status)) { 224 return; 225 } 226 if (status == U_USING_DEFAULT_WARNING) { 227 status = U_ILLEGAL_ARGUMENT_ERROR; 228 return; 229 } 230 UnicodeSet tmpSet; 231 int32_t i; 232 for (i=0; i<numScripts; i++) { 233 tmpSet.applyIntPropertyValue(UCHAR_SCRIPT, scripts[i], status); 234 allowedChars->addAll(tmpSet); 235 } 236 } 237 238 // Computes the augmented script set for a code point, according to UTS 39 section 5.1. 239 void SpoofImpl::getAugmentedScriptSet(UChar32 codePoint, ScriptSet& result, UErrorCode& status) { 240 result.resetAll(); 241 result.setScriptExtensions(codePoint, status); 242 if (U_FAILURE(status)) { return; } 243 244 // Section 5.1 step 1 245 if (result.test(USCRIPT_HAN, status)) { 246 result.set(USCRIPT_HAN_WITH_BOPOMOFO, status); 247 result.set(USCRIPT_JAPANESE, status); 248 result.set(USCRIPT_KOREAN, status); 249 } 250 if (result.test(USCRIPT_HIRAGANA, status)) { 251 result.set(USCRIPT_JAPANESE, status); 252 } 253 if (result.test(USCRIPT_KATAKANA, status)) { 254 result.set(USCRIPT_JAPANESE, status); 255 } 256 if (result.test(USCRIPT_HANGUL, status)) { 257 result.set(USCRIPT_KOREAN, status); 258 } 259 if (result.test(USCRIPT_BOPOMOFO, status)) { 260 result.set(USCRIPT_HAN_WITH_BOPOMOFO, status); 261 } 262 263 // Section 5.1 step 2 264 if (result.test(USCRIPT_COMMON, status) || result.test(USCRIPT_INHERITED, status)) { 265 result.setAll(); 266 } 267 } 268 269 // Computes the resolved script set for a string, according to UTS 39 section 5.1. 270 void SpoofImpl::getResolvedScriptSet(const UnicodeString& input, ScriptSet& result, UErrorCode& status) const { 271 getResolvedScriptSetWithout(input, USCRIPT_CODE_LIMIT, result, status); 272 } 273 274 // Computes the resolved script set for a string, omitting characters having the specified script. 275 // If USCRIPT_CODE_LIMIT is passed as the second argument, all characters are included. 276 void SpoofImpl::getResolvedScriptSetWithout(const UnicodeString& input, UScriptCode script, ScriptSet& result, UErrorCode& status) const { 277 result.setAll(); 278 279 ScriptSet temp; 280 UChar32 codePoint; 281 for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) { 282 codePoint = input.char32At(i); 283 284 // Compute the augmented script set for the character 285 getAugmentedScriptSet(codePoint, temp, status); 286 if (U_FAILURE(status)) { return; } 287 288 // Intersect the augmented script set with the resolved script set, but only if the character doesn't 289 // have the script specified in the function call 290 if (script == USCRIPT_CODE_LIMIT || !temp.test(script, status)) { 291 result.intersect(temp); 292 } 293 } 294 } 295 296 // Computes the set of numerics for a string, according to UTS 39 section 5.3. 297 void SpoofImpl::getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& /*status*/) const { 298 result.clear(); 299 300 UChar32 codePoint; 301 for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) { 302 codePoint = input.char32At(i); 303 304 // Store a representative character for each kind of decimal digit 305 if (u_charType(codePoint) == U_DECIMAL_DIGIT_NUMBER) { 306 // Store the zero character as a representative for comparison. 307 // Unicode guarantees it is codePoint - value 308 result.add(codePoint - static_cast<UChar32>(u_getNumericValue(codePoint))); 309 } 310 } 311 } 312 313 // Computes the restriction level of a string, according to UTS 39 section 5.2. 314 URestrictionLevel SpoofImpl::getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const { 315 // Section 5.2 step 1: 316 if (!fAllowedCharsSet->containsAll(input)) { 317 return USPOOF_UNRESTRICTIVE; 318 } 319 320 // Section 5.2 step 2 321 // Java use a static UnicodeSet for this test. In C++, avoid the static variable 322 // and just do a simple for loop. 323 UBool allASCII = true; 324 for (int32_t i=0, length=input.length(); i<length; i++) { 325 if (input.charAt(i) > 0x7f) { 326 allASCII = false; 327 break; 328 } 329 } 330 if (allASCII) { 331 return USPOOF_ASCII; 332 } 333 334 // Section 5.2 steps 3: 335 ScriptSet resolvedScriptSet; 336 getResolvedScriptSet(input, resolvedScriptSet, status); 337 if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; } 338 339 // Section 5.2 step 4: 340 if (!resolvedScriptSet.isEmpty()) { 341 return USPOOF_SINGLE_SCRIPT_RESTRICTIVE; 342 } 343 344 // Section 5.2 step 5: 345 ScriptSet resolvedNoLatn; 346 getResolvedScriptSetWithout(input, USCRIPT_LATIN, resolvedNoLatn, status); 347 if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; } 348 349 // Section 5.2 step 6: 350 if (resolvedNoLatn.test(USCRIPT_HAN_WITH_BOPOMOFO, status) 351 || resolvedNoLatn.test(USCRIPT_JAPANESE, status) 352 || resolvedNoLatn.test(USCRIPT_KOREAN, status)) { 353 return USPOOF_HIGHLY_RESTRICTIVE; 354 } 355 356 // Section 5.2 step 7: 357 if (!resolvedNoLatn.isEmpty() 358 && !resolvedNoLatn.test(USCRIPT_CYRILLIC, status) 359 && !resolvedNoLatn.test(USCRIPT_GREEK, status) 360 && !resolvedNoLatn.test(USCRIPT_CHEROKEE, status)) { 361 return USPOOF_MODERATELY_RESTRICTIVE; 362 } 363 364 // Section 5.2 step 8: 365 return USPOOF_MINIMALLY_RESTRICTIVE; 366 } 367 368 int32_t SpoofImpl::findHiddenOverlay(const UnicodeString& input, UErrorCode&) const { 369 bool sawLeadCharacter = false; 370 for (int32_t i=0; i<input.length();) { 371 UChar32 cp = input.char32At(i); 372 if (sawLeadCharacter && cp == 0x0307) { 373 return i; 374 } 375 uint8_t combiningClass = u_getCombiningClass(cp); 376 // Skip over characters except for those with combining class 0 (non-combining characters) or with 377 // combining class 230 (same class as U+0307) 378 U_ASSERT(u_getCombiningClass(0x0307) == 230); 379 if (combiningClass == 0 || combiningClass == 230) { 380 sawLeadCharacter = isIllegalCombiningDotLeadCharacter(cp); 381 } 382 i += U16_LENGTH(cp); 383 } 384 return -1; 385 } 386 387 static inline bool isIllegalCombiningDotLeadCharacterNoLookup(UChar32 cp) { 388 return cp == u'i' || cp == u'j' || cp == u'ı' || cp == u'ȷ' || cp == u'l' || 389 u_hasBinaryProperty(cp, UCHAR_SOFT_DOTTED); 390 } 391 392 bool SpoofImpl::isIllegalCombiningDotLeadCharacter(UChar32 cp) const { 393 if (isIllegalCombiningDotLeadCharacterNoLookup(cp)) { 394 return true; 395 } 396 UnicodeString skelStr; 397 fSpoofData->confusableLookup(cp, skelStr); 398 UChar32 finalCp = skelStr.char32At(skelStr.moveIndex32(skelStr.length(), -1)); 399 if (finalCp != cp && isIllegalCombiningDotLeadCharacterNoLookup(finalCp)) { 400 return true; 401 } 402 return false; 403 } 404 405 406 407 // Convert a text format hex number. Utility function used by builder code. Static. 408 // Input: char16_t *string text. Output: a UChar32 409 // Input has been pre-checked, and will have no non-hex chars. 410 // The number must fall in the code point range of 0..0x10ffff 411 // Static Function. 412 UChar32 SpoofImpl::ScanHex(const char16_t *s, int32_t start, int32_t limit, UErrorCode &status) { 413 if (U_FAILURE(status)) { 414 return 0; 415 } 416 U_ASSERT(limit-start > 0); 417 uint32_t val = 0; 418 int i; 419 for (i=start; i<limit; i++) { 420 int digitVal = s[i] - 0x30; 421 if (digitVal>9) { 422 digitVal = 0xa + (s[i] - 0x41); // Upper Case 'A' 423 } 424 if (digitVal>15) { 425 digitVal = 0xa + (s[i] - 0x61); // Lower Case 'a' 426 } 427 U_ASSERT(digitVal <= 0xf); 428 val <<= 4; 429 val += digitVal; 430 } 431 if (val > 0x10ffff) { 432 status = U_PARSE_ERROR; 433 val = 0; 434 } 435 return static_cast<UChar32>(val); 436 } 437 438 439 //----------------------------------------- 440 // 441 // class CheckResult Implementation 442 // 443 //----------------------------------------- 444 445 CheckResult::CheckResult() { 446 clear(); 447 } 448 449 USpoofCheckResult* CheckResult::asUSpoofCheckResult() { 450 return exportForC(); 451 } 452 453 // 454 // Incoming parameter check on Status and the CheckResult object 455 // received from the C API. 456 // 457 const CheckResult* CheckResult::validateThis(const USpoofCheckResult *ptr, UErrorCode &status) { 458 return validate(ptr, status); 459 } 460 461 CheckResult* CheckResult::validateThis(USpoofCheckResult *ptr, UErrorCode &status) { 462 return validate(ptr, status); 463 } 464 465 void CheckResult::clear() { 466 fChecks = 0; 467 fNumerics.clear(); 468 fRestrictionLevel = USPOOF_UNDEFINED_RESTRICTIVE; 469 } 470 471 int32_t CheckResult::toCombinedBitmask(int32_t enabledChecks) { 472 if ((enabledChecks & USPOOF_AUX_INFO) != 0 && fRestrictionLevel != USPOOF_UNDEFINED_RESTRICTIVE) { 473 return fChecks | fRestrictionLevel; 474 } else { 475 return fChecks; 476 } 477 } 478 479 CheckResult::~CheckResult() { 480 } 481 482 //---------------------------------------------------------------------------------------------- 483 // 484 // class SpoofData Implementation 485 // 486 //---------------------------------------------------------------------------------------------- 487 488 489 UBool SpoofData::validateDataVersion(UErrorCode &status) const { 490 if (U_FAILURE(status) || 491 fRawData == nullptr || 492 fRawData->fMagic != USPOOF_MAGIC || 493 fRawData->fFormatVersion[0] != USPOOF_CONFUSABLE_DATA_FORMAT_VERSION || 494 fRawData->fFormatVersion[1] != 0 || 495 fRawData->fFormatVersion[2] != 0 || 496 fRawData->fFormatVersion[3] != 0) { 497 status = U_INVALID_FORMAT_ERROR; 498 return false; 499 } 500 return true; 501 } 502 503 static UBool U_CALLCONV 504 spoofDataIsAcceptable(void *context, 505 const char * /* type */, const char * /*name*/, 506 const UDataInfo *pInfo) { 507 if( 508 pInfo->size >= 20 && 509 pInfo->isBigEndian == U_IS_BIG_ENDIAN && 510 pInfo->charsetFamily == U_CHARSET_FAMILY && 511 pInfo->dataFormat[0] == 0x43 && // dataFormat="Cfu " 512 pInfo->dataFormat[1] == 0x66 && 513 pInfo->dataFormat[2] == 0x75 && 514 pInfo->dataFormat[3] == 0x20 && 515 pInfo->formatVersion[0] == USPOOF_CONFUSABLE_DATA_FORMAT_VERSION 516 ) { 517 UVersionInfo *version = static_cast<UVersionInfo *>(context); 518 if(version != nullptr) { 519 uprv_memcpy(version, pInfo->dataVersion, 4); 520 } 521 return true; 522 } else { 523 return false; 524 } 525 } 526 527 // Methods for the loading of the default confusables data file. The confusable 528 // data is loaded only when it is needed. 529 // 530 // SpoofData::getDefault() - Return the default confusables data, and call the 531 // initOnce() if it is not available. Adds a reference 532 // to the SpoofData that the caller is responsible for 533 // decrementing when they are done with the data. 534 // 535 // uspoof_loadDefaultData - Called once, from initOnce(). The resulting SpoofData 536 // is shared by all spoof checkers using the default data. 537 // 538 // uspoof_cleanupDefaultData - Called during cleanup. 539 // 540 541 static UInitOnce gSpoofInitDefaultOnce {}; 542 static SpoofData* gDefaultSpoofData; 543 544 static UBool U_CALLCONV 545 uspoof_cleanupDefaultData() { 546 if (gDefaultSpoofData) { 547 // Will delete, assuming all user-level spoof checkers were closed. 548 gDefaultSpoofData->removeReference(); 549 gDefaultSpoofData = nullptr; 550 gSpoofInitDefaultOnce.reset(); 551 } 552 return true; 553 } 554 555 static void U_CALLCONV uspoof_loadDefaultData(UErrorCode& status) { 556 UDataMemory *udm = udata_openChoice(nullptr, "cfu", "confusables", 557 spoofDataIsAcceptable, 558 nullptr, // context, would receive dataVersion if supplied. 559 &status); 560 if (U_FAILURE(status)) { return; } 561 gDefaultSpoofData = new SpoofData(udm, status); 562 if (U_FAILURE(status)) { 563 delete gDefaultSpoofData; 564 gDefaultSpoofData = nullptr; 565 return; 566 } 567 if (gDefaultSpoofData == nullptr) { 568 status = U_MEMORY_ALLOCATION_ERROR; 569 return; 570 } 571 ucln_i18n_registerCleanup(UCLN_I18N_SPOOFDATA, uspoof_cleanupDefaultData); 572 } 573 574 SpoofData* SpoofData::getDefault(UErrorCode& status) { 575 umtx_initOnce(gSpoofInitDefaultOnce, &uspoof_loadDefaultData, status); 576 if (U_FAILURE(status)) { return nullptr; } 577 gDefaultSpoofData->addReference(); 578 return gDefaultSpoofData; 579 } 580 581 582 583 SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status) 584 { 585 reset(); 586 if (U_FAILURE(status)) { 587 return; 588 } 589 fUDM = udm; 590 // fRawData is non-const because it may be constructed by the data builder. 591 fRawData = reinterpret_cast<SpoofDataHeader *>( 592 const_cast<void *>(udata_getMemory(udm))); 593 validateDataVersion(status); 594 initPtrs(status); 595 } 596 597 598 SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status) 599 { 600 reset(); 601 if (U_FAILURE(status)) { 602 return; 603 } 604 if (static_cast<size_t>(length) < sizeof(SpoofDataHeader)) { 605 status = U_INVALID_FORMAT_ERROR; 606 return; 607 } 608 if (data == nullptr) { 609 status = U_ILLEGAL_ARGUMENT_ERROR; 610 return; 611 } 612 void *ncData = const_cast<void *>(data); 613 fRawData = static_cast<SpoofDataHeader *>(ncData); 614 if (length < fRawData->fLength) { 615 status = U_INVALID_FORMAT_ERROR; 616 return; 617 } 618 validateDataVersion(status); 619 initPtrs(status); 620 } 621 622 623 // Spoof Data constructor for use from data builder. 624 // Initializes a new, empty data area that will be populated later. 625 SpoofData::SpoofData(UErrorCode &status) { 626 reset(); 627 if (U_FAILURE(status)) { 628 return; 629 } 630 fDataOwned = true; 631 632 // The spoof header should already be sized to be a multiple of 16 bytes. 633 // Just in case it's not, round it up. 634 uint32_t initialSize = (sizeof(SpoofDataHeader) + 15) & ~15; 635 U_ASSERT(initialSize == sizeof(SpoofDataHeader)); 636 637 fRawData = static_cast<SpoofDataHeader *>(uprv_malloc(initialSize)); 638 fMemLimit = initialSize; 639 if (fRawData == nullptr) { 640 status = U_MEMORY_ALLOCATION_ERROR; 641 return; 642 } 643 uprv_memset(fRawData, 0, initialSize); 644 645 fRawData->fMagic = USPOOF_MAGIC; 646 fRawData->fFormatVersion[0] = USPOOF_CONFUSABLE_DATA_FORMAT_VERSION; 647 fRawData->fFormatVersion[1] = 0; 648 fRawData->fFormatVersion[2] = 0; 649 fRawData->fFormatVersion[3] = 0; 650 initPtrs(status); 651 } 652 653 // reset() - initialize all fields. 654 // Should be updated if any new fields are added. 655 // Called by constructors to put things in a known initial state. 656 void SpoofData::reset() { 657 fRawData = nullptr; 658 fDataOwned = false; 659 fUDM = nullptr; 660 fMemLimit = 0; 661 fRefCount = 1; 662 fCFUKeys = nullptr; 663 fCFUValues = nullptr; 664 fCFUStrings = nullptr; 665 } 666 667 668 // SpoofData::initPtrs() 669 // Initialize the pointers to the various sections of the raw data. 670 // 671 // This function is used both during the Trie building process (multiple 672 // times, as the individual data sections are added), and 673 // during the opening of a Spoof Checker from prebuilt data. 674 // 675 // The pointers for non-existent data sections (identified by an offset of 0) 676 // are set to nullptr. 677 // 678 // Note: During building the data, adding each new data section 679 // reallocs the raw data area, which likely relocates it, which 680 // in turn requires reinitializing all of the pointers into it, hence 681 // multiple calls to this function during building. 682 // 683 void SpoofData::initPtrs(UErrorCode &status) { 684 fCFUKeys = nullptr; 685 fCFUValues = nullptr; 686 fCFUStrings = nullptr; 687 if (U_FAILURE(status)) { 688 return; 689 } 690 if (fRawData->fCFUKeys != 0) { 691 fCFUKeys = reinterpret_cast<int32_t*>(reinterpret_cast<char*>(fRawData) + fRawData->fCFUKeys); 692 } 693 if (fRawData->fCFUStringIndex != 0) { 694 fCFUValues = reinterpret_cast<uint16_t*>(reinterpret_cast<char*>(fRawData) + fRawData->fCFUStringIndex); 695 } 696 if (fRawData->fCFUStringTable != 0) { 697 fCFUStrings = reinterpret_cast<char16_t*>(reinterpret_cast<char*>(fRawData) + fRawData->fCFUStringTable); 698 } 699 } 700 701 702 SpoofData::~SpoofData() { 703 if (fDataOwned) { 704 uprv_free(fRawData); 705 } 706 fRawData = nullptr; 707 if (fUDM != nullptr) { 708 udata_close(fUDM); 709 } 710 fUDM = nullptr; 711 } 712 713 714 void SpoofData::removeReference() { 715 if (umtx_atomic_dec(&fRefCount) == 0) { 716 delete this; 717 } 718 } 719 720 721 SpoofData *SpoofData::addReference() { 722 umtx_atomic_inc(&fRefCount); 723 return this; 724 } 725 726 727 void *SpoofData::reserveSpace(int32_t numBytes, UErrorCode &status) { 728 if (U_FAILURE(status)) { 729 return nullptr; 730 } 731 if (!fDataOwned) { 732 UPRV_UNREACHABLE_EXIT; 733 } 734 735 numBytes = (numBytes + 15) & ~15; // Round up to a multiple of 16 736 uint32_t returnOffset = fMemLimit; 737 fMemLimit += numBytes; 738 fRawData = static_cast<SpoofDataHeader *>(uprv_realloc(fRawData, fMemLimit)); 739 fRawData->fLength = fMemLimit; 740 uprv_memset((char *)fRawData + returnOffset, 0, numBytes); 741 initPtrs(status); 742 return reinterpret_cast<char*>(fRawData) + returnOffset; 743 } 744 745 int32_t SpoofData::serialize(void *buf, int32_t capacity, UErrorCode &status) const { 746 int32_t dataSize = fRawData->fLength; 747 if (capacity < dataSize) { 748 status = U_BUFFER_OVERFLOW_ERROR; 749 return dataSize; 750 } 751 uprv_memcpy(buf, fRawData, dataSize); 752 return dataSize; 753 } 754 755 int32_t SpoofData::size() const { 756 return fRawData->fLength; 757 } 758 759 //------------------------------- 760 // 761 // Front-end APIs for SpoofData 762 // 763 //------------------------------- 764 765 int32_t SpoofData::confusableLookup(UChar32 inChar, UnicodeString &dest) const { 766 // Perform a binary search. 767 // [lo, hi), i.e lo is inclusive, hi is exclusive. 768 // The result after the loop will be in lo. 769 int32_t lo = 0; 770 int32_t hi = length(); 771 do { 772 int32_t mid = (lo + hi) / 2; 773 if (codePointAt(mid) > inChar) { 774 hi = mid; 775 } else if (codePointAt(mid) < inChar) { 776 lo = mid; 777 } else { 778 // Found result. Break early. 779 lo = mid; 780 break; 781 } 782 } while (hi - lo > 1); 783 784 // Did we find an entry? If not, the char maps to itself. 785 if (codePointAt(lo) != inChar) { 786 dest.append(inChar); 787 return 1; 788 } 789 790 // Add the element to the string builder and return. 791 return appendValueTo(lo, dest); 792 } 793 794 int32_t SpoofData::length() const { 795 return fRawData->fCFUKeysSize; 796 } 797 798 UChar32 SpoofData::codePointAt(int32_t index) const { 799 return ConfusableDataUtils::keyToCodePoint(fCFUKeys[index]); 800 } 801 802 int32_t SpoofData::appendValueTo(int32_t index, UnicodeString& dest) const { 803 int32_t stringLength = ConfusableDataUtils::keyToLength(fCFUKeys[index]); 804 805 // Value is either a char (for strings of length 1) or 806 // an index into the string table (for longer strings) 807 uint16_t value = fCFUValues[index]; 808 if (stringLength == 1) { 809 dest.append(static_cast<char16_t>(value)); 810 } else { 811 dest.append(fCFUStrings + value, stringLength); 812 } 813 814 return stringLength; 815 } 816 817 818 U_NAMESPACE_END 819 820 U_NAMESPACE_USE 821 822 //----------------------------------------------------------------------------- 823 // 824 // uspoof_swap - byte swap and char encoding swap of spoof data 825 // 826 //----------------------------------------------------------------------------- 827 U_CAPI int32_t U_EXPORT2 828 uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, 829 UErrorCode *status) { 830 831 if (status == nullptr || U_FAILURE(*status)) { 832 return 0; 833 } 834 if(ds==nullptr || inData==nullptr || length<-1 || (length>0 && outData==nullptr)) { 835 *status=U_ILLEGAL_ARGUMENT_ERROR; 836 return 0; 837 } 838 839 // 840 // Check that the data header is for spoof data. 841 // (Header contents are defined in gencfu.cpp) 842 // 843 const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4); 844 if(!( pInfo->dataFormat[0]==0x43 && /* dataFormat="Cfu " */ 845 pInfo->dataFormat[1]==0x66 && 846 pInfo->dataFormat[2]==0x75 && 847 pInfo->dataFormat[3]==0x20 && 848 pInfo->formatVersion[0]==USPOOF_CONFUSABLE_DATA_FORMAT_VERSION && 849 pInfo->formatVersion[1]==0 && 850 pInfo->formatVersion[2]==0 && 851 pInfo->formatVersion[3]==0 )) { 852 udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x " 853 "(format version %02x %02x %02x %02x) is not recognized\n", 854 pInfo->dataFormat[0], pInfo->dataFormat[1], 855 pInfo->dataFormat[2], pInfo->dataFormat[3], 856 pInfo->formatVersion[0], pInfo->formatVersion[1], 857 pInfo->formatVersion[2], pInfo->formatVersion[3]); 858 *status=U_UNSUPPORTED_ERROR; 859 return 0; 860 } 861 862 // 863 // Swap the data header. (This is the generic ICU Data Header, not the uspoof Specific 864 // header). This swap also conveniently gets us 865 // the size of the ICU d.h., which lets us locate the start 866 // of the uspoof specific data. 867 // 868 int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status); 869 870 871 // 872 // Get the Spoof Data Header, and check that it appears to be OK. 873 // 874 // 875 const uint8_t *inBytes =(const uint8_t *)inData+headerSize; 876 SpoofDataHeader *spoofDH = (SpoofDataHeader *)inBytes; 877 if (ds->readUInt32(spoofDH->fMagic) != USPOOF_MAGIC || 878 ds->readUInt32(spoofDH->fLength) < sizeof(SpoofDataHeader)) 879 { 880 udata_printError(ds, "uspoof_swap(): Spoof Data header is invalid.\n"); 881 *status=U_UNSUPPORTED_ERROR; 882 return 0; 883 } 884 885 // 886 // Prefight operation? Just return the size 887 // 888 int32_t spoofDataLength = ds->readUInt32(spoofDH->fLength); 889 int32_t totalSize = headerSize + spoofDataLength; 890 if (length < 0) { 891 return totalSize; 892 } 893 894 // 895 // Check that length passed in is consistent with length from Spoof data header. 896 // 897 if (length < totalSize) { 898 udata_printError(ds, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n", 899 spoofDataLength); 900 *status=U_INDEX_OUTOFBOUNDS_ERROR; 901 return 0; 902 } 903 904 905 // 906 // Swap the Data. Do the data itself first, then the Spoof Data Header, because 907 // we need to reference the header to locate the data, and an 908 // inplace swap of the header leaves it unusable. 909 // 910 uint8_t *outBytes = (uint8_t *)outData + headerSize; 911 SpoofDataHeader *outputDH = (SpoofDataHeader *)outBytes; 912 913 int32_t sectionStart; 914 int32_t sectionLength; 915 916 // 917 // If not swapping in place, zero out the output buffer before starting. 918 // Gaps may exist between the individual sections, and these must be zeroed in 919 // the output buffer. The simplest way to do that is to just zero the whole thing. 920 // 921 if (inBytes != outBytes) { 922 uprv_memset(outBytes, 0, spoofDataLength); 923 } 924 925 // Confusables Keys Section (fCFUKeys) 926 sectionStart = ds->readUInt32(spoofDH->fCFUKeys); 927 sectionLength = ds->readUInt32(spoofDH->fCFUKeysSize) * 4; 928 ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); 929 930 // String Index Section 931 sectionStart = ds->readUInt32(spoofDH->fCFUStringIndex); 932 sectionLength = ds->readUInt32(spoofDH->fCFUStringIndexSize) * 2; 933 ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); 934 935 // String Table Section 936 sectionStart = ds->readUInt32(spoofDH->fCFUStringTable); 937 sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2; 938 ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); 939 940 // And, last, swap the header itself. 941 // int32_t fMagic // swap this 942 // uint8_t fFormatVersion[4] // Do not swap this, just copy 943 // int32_t fLength and all the rest // Swap the rest, all is 32 bit stuff. 944 // 945 uint32_t magic = ds->readUInt32(spoofDH->fMagic); 946 ds->writeUInt32((uint32_t *)&outputDH->fMagic, magic); 947 948 if (inBytes != outBytes) { 949 uprv_memcpy(outputDH->fFormatVersion, spoofDH->fFormatVersion, sizeof(spoofDH->fFormatVersion)); 950 } 951 // swap starting at fLength 952 ds->swapArray32(ds, &spoofDH->fLength, sizeof(SpoofDataHeader)-8 /* minus magic and fFormatVersion[4] */, &outputDH->fLength, status); 953 954 return totalSize; 955 } 956 957 #endif