uspoof_conf.cpp (17856B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ****************************************************************************** 5 * 6 * Copyright (C) 2008-2015, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ****************************************************************************** 10 * file name: uspoof_conf.cpp 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2009Jan05 (refactoring earlier files) 16 * created by: Andy Heninger 17 * 18 * Internal classes for compiling confusable data into its binary (runtime) form. 19 */ 20 21 #include "unicode/utypes.h" 22 #include "unicode/uspoof.h" 23 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 24 #if !UCONFIG_NO_NORMALIZATION 25 26 #include "unicode/unorm.h" 27 #include "unicode/uregex.h" 28 #include "unicode/ustring.h" 29 #include "cmemory.h" 30 #include "uspoof_impl.h" 31 #include "uhash.h" 32 #include "uvector.h" 33 #include "uassert.h" 34 #include "uarrsort.h" 35 #include "uspoof_conf.h" 36 37 U_NAMESPACE_USE 38 39 40 //--------------------------------------------------------------------- 41 // 42 // buildConfusableData Compile the source confusable data, as defined by 43 // the Unicode data file confusables.txt, into the binary 44 // structures used by the confusable detector. 45 // 46 // The binary structures are described in uspoof_impl.h 47 // 48 // 1. Parse the data, making a hash table mapping from a UChar32 to a String. 49 // 50 // 2. Sort all of the strings encountered by length, since they will need to 51 // be stored in that order in the final string table. 52 // TODO: Sorting these strings by length is no longer needed since the removal of 53 // the string lengths table. This logic can be removed to save processing time 54 // when building confusables data. 55 // 56 // 3. Build a list of keys (UChar32s) from the four mapping tables. Sort the 57 // list because that will be the ordering of our runtime table. 58 // 59 // 4. Generate the run time string table. This is generated before the key & value 60 // tables because we need the string indexes when building those tables. 61 // 62 // 5. Build the run-time key and value tables. These are parallel tables, and are built 63 // at the same time 64 // 65 66 SPUString::SPUString(LocalPointer<UnicodeString> s) { 67 fStr = std::move(s); 68 fCharOrStrTableIndex = 0; 69 } 70 71 72 SPUString::~SPUString() { 73 } 74 75 76 SPUStringPool::SPUStringPool(UErrorCode &status) : fVec(nullptr), fHash(nullptr) { 77 LocalPointer<UVector> vec(new UVector(status), status); 78 if (U_FAILURE(status)) { 79 return; 80 } 81 vec->setDeleter( 82 [](void *obj) {delete static_cast<SPUString*>(obj);}); 83 fVec = vec.orphan(); 84 fHash = uhash_open(uhash_hashUnicodeString, // key hash function 85 uhash_compareUnicodeString, // Key Comparator 86 nullptr, // Value Comparator 87 &status); 88 } 89 90 91 SPUStringPool::~SPUStringPool() { 92 delete fVec; 93 uhash_close(fHash); 94 } 95 96 97 int32_t SPUStringPool::size() { 98 return fVec->size(); 99 } 100 101 SPUString *SPUStringPool::getByIndex(int32_t index) { 102 SPUString* retString = static_cast<SPUString*>(fVec->elementAt(index)); 103 return retString; 104 } 105 106 107 // Comparison function for ordering strings in the string pool. 108 // Compare by length first, then, within a group of the same length, 109 // by code point order. 110 // Conforms to the type signature for a USortComparator in uvector.h 111 112 static int32_t U_CALLCONV SPUStringCompare(UHashTok left, UHashTok right) { 113 const SPUString *sL = const_cast<const SPUString *>( 114 static_cast<SPUString *>(left.pointer)); 115 const SPUString *sR = const_cast<const SPUString *>( 116 static_cast<SPUString *>(right.pointer)); 117 int32_t lenL = sL->fStr->length(); 118 int32_t lenR = sR->fStr->length(); 119 if (lenL < lenR) { 120 return -1; 121 } else if (lenL > lenR) { 122 return 1; 123 } else { 124 return sL->fStr->compare(*(sR->fStr)); 125 } 126 } 127 128 void SPUStringPool::sort(UErrorCode &status) { 129 fVec->sort(SPUStringCompare, status); 130 } 131 132 133 SPUString *SPUStringPool::addString(UnicodeString *src, UErrorCode &status) { 134 LocalPointer<UnicodeString> lpSrc(src); 135 if (U_FAILURE(status)) { 136 return nullptr; 137 } 138 SPUString *hashedString = static_cast<SPUString *>(uhash_get(fHash, src)); 139 if (hashedString != nullptr) { 140 return hashedString; 141 } 142 LocalPointer<SPUString> spuStr(new SPUString(std::move(lpSrc)), status); 143 hashedString = spuStr.getAlias(); 144 fVec->adoptElement(spuStr.orphan(), status); 145 if (U_FAILURE(status)) { 146 return nullptr; 147 } 148 uhash_put(fHash, src, hashedString, &status); 149 return hashedString; 150 } 151 152 153 154 ConfusabledataBuilder::ConfusabledataBuilder(SpoofImpl *spImpl, UErrorCode &status) : 155 fSpoofImpl(spImpl), 156 fInput(nullptr), 157 fTable(nullptr), 158 fKeySet(nullptr), 159 fKeyVec(nullptr), 160 fValueVec(nullptr), 161 fStringTable(nullptr), 162 stringPool(nullptr), 163 fParseLine(nullptr), 164 fParseHexNum(nullptr), 165 fLineNum(0) 166 { 167 if (U_FAILURE(status)) { 168 return; 169 } 170 171 fTable = uhash_open(uhash_hashLong, uhash_compareLong, nullptr, &status); 172 173 fKeySet = new UnicodeSet(); 174 if (fKeySet == nullptr) { 175 status = U_MEMORY_ALLOCATION_ERROR; 176 return; 177 } 178 179 fKeyVec = new UVector(status); 180 if (fKeyVec == nullptr) { 181 status = U_MEMORY_ALLOCATION_ERROR; 182 return; 183 } 184 185 fValueVec = new UVector(status); 186 if (fValueVec == nullptr) { 187 status = U_MEMORY_ALLOCATION_ERROR; 188 return; 189 } 190 191 stringPool = new SPUStringPool(status); 192 if (stringPool == nullptr) { 193 status = U_MEMORY_ALLOCATION_ERROR; 194 return; 195 } 196 } 197 198 199 ConfusabledataBuilder::~ConfusabledataBuilder() { 200 uprv_free(fInput); 201 uregex_close(fParseLine); 202 uregex_close(fParseHexNum); 203 uhash_close(fTable); 204 delete fKeySet; 205 delete fKeyVec; 206 delete fStringTable; 207 delete fValueVec; 208 delete stringPool; 209 } 210 211 212 void ConfusabledataBuilder::buildConfusableData(SpoofImpl * spImpl, const char * confusables, 213 int32_t confusablesLen, int32_t *errorType, UParseError *pe, UErrorCode &status) { 214 215 if (U_FAILURE(status)) { 216 return; 217 } 218 ConfusabledataBuilder builder(spImpl, status); 219 builder.build(confusables, confusablesLen, status); 220 if (U_FAILURE(status) && errorType != nullptr) { 221 *errorType = USPOOF_SINGLE_SCRIPT_CONFUSABLE; 222 pe->line = builder.fLineNum; 223 } 224 } 225 226 227 void ConfusabledataBuilder::build(const char * confusables, int32_t confusablesLen, 228 UErrorCode &status) { 229 230 // Convert the user input data from UTF-8 to char16_t (UTF-16) 231 int32_t inputLen = 0; 232 if (U_FAILURE(status)) { 233 return; 234 } 235 UErrorCode bufferStatus = U_ZERO_ERROR; 236 u_strFromUTF8(nullptr, 0, &inputLen, confusables, confusablesLen, &bufferStatus); 237 if (bufferStatus != U_BUFFER_OVERFLOW_ERROR) { 238 if (U_FAILURE(bufferStatus)) { 239 status = bufferStatus; 240 } 241 return; 242 } 243 fInput = static_cast<char16_t *>(uprv_malloc((inputLen+1) * sizeof(char16_t))); 244 if (fInput == nullptr) { 245 status = U_MEMORY_ALLOCATION_ERROR; 246 return; 247 } 248 u_strFromUTF8(fInput, inputLen+1, nullptr, confusables, confusablesLen, &status); 249 250 251 // Regular Expression to parse a line from Confusables.txt. The expression will match 252 // any line. What was matched is determined by examining which capture groups have a match. 253 // Capture Group 1: the source char 254 // Capture Group 2: the replacement chars 255 // Capture Group 3-6 the table type, SL, SA, ML, or MA (deprecated) 256 // Capture Group 7: A blank or comment only line. 257 // Capture Group 8: A syntactically invalid line. Anything that didn't match before. 258 // Example Line from the confusables.txt source file: 259 // "1D702 ; 006E 0329 ; SL # MATHEMATICAL ITALIC SMALL ETA ... " 260 UnicodeString pattern( 261 "(?m)^[ \\t]*([0-9A-Fa-f]+)[ \\t]+;" // Match the source char 262 "[ \\t]*([0-9A-Fa-f]+" // Match the replacement char(s) 263 "(?:[ \\t]+[0-9A-Fa-f]+)*)[ \\t]*;" // (continued) 264 "\\s*(?:(SL)|(SA)|(ML)|(MA))" // Match the table type 265 "[ \\t]*(?:#.*?)?$" // Match any trailing #comment 266 "|^([ \\t]*(?:#.*?)?)$" // OR match empty lines or lines with only a #comment 267 "|^(.*?)$", -1, US_INV); // OR match any line, which catches illegal lines. 268 // TODO: Why are we using the regex C API here? C++ would just take UnicodeString... 269 fParseLine = uregex_open(pattern.getBuffer(), pattern.length(), 0, nullptr, &status); 270 271 // Regular expression for parsing a hex number out of a space-separated list of them. 272 // Capture group 1 gets the number, with spaces removed. 273 pattern = UNICODE_STRING_SIMPLE("\\s*([0-9A-F]+)"); 274 fParseHexNum = uregex_open(pattern.getBuffer(), pattern.length(), 0, nullptr, &status); 275 276 // Zap any Byte Order Mark at the start of input. Changing it to a space is benign 277 // given the syntax of the input. 278 if (*fInput == 0xfeff) { 279 *fInput = 0x20; 280 } 281 282 // Parse the input, one line per iteration of this loop. 283 uregex_setText(fParseLine, fInput, inputLen, &status); 284 while (uregex_findNext(fParseLine, &status)) { 285 fLineNum++; 286 if (uregex_start(fParseLine, 7, &status) >= 0) { 287 // this was a blank or comment line. 288 continue; 289 } 290 if (uregex_start(fParseLine, 8, &status) >= 0) { 291 // input file syntax error. 292 status = U_PARSE_ERROR; 293 return; 294 } 295 296 // We have a good input line. Extract the key character and mapping string, and 297 // put them into the appropriate mapping table. 298 UChar32 keyChar = SpoofImpl::ScanHex(fInput, uregex_start(fParseLine, 1, &status), 299 uregex_end(fParseLine, 1, &status), status); 300 301 int32_t mapStringStart = uregex_start(fParseLine, 2, &status); 302 int32_t mapStringLength = uregex_end(fParseLine, 2, &status) - mapStringStart; 303 uregex_setText(fParseHexNum, &fInput[mapStringStart], mapStringLength, &status); 304 305 UnicodeString *mapString = new UnicodeString(); 306 if (mapString == nullptr) { 307 status = U_MEMORY_ALLOCATION_ERROR; 308 return; 309 } 310 while (uregex_findNext(fParseHexNum, &status)) { 311 UChar32 c = SpoofImpl::ScanHex(&fInput[mapStringStart], uregex_start(fParseHexNum, 1, &status), 312 uregex_end(fParseHexNum, 1, &status), status); 313 mapString->append(c); 314 } 315 U_ASSERT(mapString->length() >= 1); 316 317 // Put the map (value) string into the string pool 318 // This a little like a Java intern() - any duplicates will be eliminated. 319 SPUString *smapString = stringPool->addString(mapString, status); 320 321 // Add the UChar32 -> string mapping to the table. 322 // For Unicode 8, the SL, SA and ML tables have been discontinued. 323 // All input data from confusables.txt is tagged MA. 324 uhash_iput(fTable, keyChar, smapString, &status); 325 if (U_FAILURE(status)) { return; } 326 fKeySet->add(keyChar); 327 } 328 329 // Input data is now all parsed and collected. 330 // Now create the run-time binary form of the data. 331 // 332 // This is done in two steps. First the data is assembled into vectors and strings, 333 // for ease of construction, then the contents of these collections are dumped 334 // into the actual raw-bytes data storage. 335 336 // Build up the string array, and record the index of each string therein 337 // in the (build time only) string pool. 338 // Strings of length one are not entered into the strings array. 339 // (Strings in the table are sorted by length) 340 stringPool->sort(status); 341 fStringTable = new UnicodeString(); 342 int32_t poolSize = stringPool->size(); 343 int32_t i; 344 for (i=0; i<poolSize; i++) { 345 SPUString *s = stringPool->getByIndex(i); 346 int32_t strLen = s->fStr->length(); 347 int32_t strIndex = fStringTable->length(); 348 if (strLen == 1) { 349 // strings of length one do not get an entry in the string table. 350 // Keep the single string character itself here, which is the same 351 // convention that is used in the final run-time string table index. 352 s->fCharOrStrTableIndex = s->fStr->charAt(0); 353 } else { 354 s->fCharOrStrTableIndex = strIndex; 355 fStringTable->append(*(s->fStr)); 356 } 357 } 358 359 // Construct the compile-time Key and Value tables 360 // 361 // For each key code point, check which mapping tables it applies to, 362 // and create the final data for the key & value structures. 363 // 364 // The four logical mapping tables are conflated into one combined table. 365 // If multiple logical tables have the same mapping for some key, they 366 // share a single entry in the combined table. 367 // If more than one mapping exists for the same key code point, multiple 368 // entries will be created in the table 369 370 for (int32_t range=0; range<fKeySet->getRangeCount(); range++) { 371 // It is an oddity of the UnicodeSet API that simply enumerating the contained 372 // code points requires a nested loop. 373 for (UChar32 keyChar=fKeySet->getRangeStart(range); 374 keyChar <= fKeySet->getRangeEnd(range); keyChar++) { 375 SPUString *targetMapping = static_cast<SPUString *>(uhash_iget(fTable, keyChar)); 376 U_ASSERT(targetMapping != nullptr); 377 378 // Set an error code if trying to consume a long string. Otherwise, 379 // codePointAndLengthToKey will abort on a U_ASSERT. 380 if (targetMapping->fStr->length() > 256) { 381 status = U_ILLEGAL_ARGUMENT_ERROR; 382 return; 383 } 384 385 int32_t key = ConfusableDataUtils::codePointAndLengthToKey(keyChar, 386 targetMapping->fStr->length()); 387 int32_t value = targetMapping->fCharOrStrTableIndex; 388 389 fKeyVec->addElement(key, status); 390 fValueVec->addElement(value, status); 391 } 392 } 393 394 // Put the assembled data into the flat runtime array 395 outputData(status); 396 397 // All of the intermediate allocated data belongs to the ConfusabledataBuilder 398 // object (this), and is deleted in the destructor. 399 } 400 401 // 402 // outputData The confusable data has been compiled and stored in intermediate 403 // collections and strings. Copy it from there to the final flat 404 // binary array. 405 // 406 // Note that as each section is added to the output data, the 407 // expand (reserveSpace() function will likely relocate it in memory. 408 // Be careful with pointers. 409 // 410 void ConfusabledataBuilder::outputData(UErrorCode &status) { 411 412 U_ASSERT(fSpoofImpl->fSpoofData->fDataOwned); 413 414 // The Key Table 415 // While copying the keys to the runtime array, 416 // also sanity check that they are sorted. 417 418 int32_t numKeys = fKeyVec->size(); 419 int32_t *keys = 420 static_cast<int32_t *>(fSpoofImpl->fSpoofData->reserveSpace(numKeys*sizeof(int32_t), status)); 421 if (U_FAILURE(status)) { 422 return; 423 } 424 int i; 425 UChar32 previousCodePoint = 0; 426 for (i=0; i<numKeys; i++) { 427 int32_t key = fKeyVec->elementAti(i); 428 UChar32 codePoint = ConfusableDataUtils::keyToCodePoint(key); 429 (void)previousCodePoint; // Suppress unused variable warning. 430 // strictly greater because there can be only one entry per code point 431 U_ASSERT(codePoint > previousCodePoint); 432 keys[i] = key; 433 previousCodePoint = codePoint; 434 } 435 SpoofDataHeader *rawData = fSpoofImpl->fSpoofData->fRawData; 436 rawData->fCFUKeys = static_cast<int32_t>(reinterpret_cast<char*>(keys) - reinterpret_cast<char*>(rawData)); 437 rawData->fCFUKeysSize = numKeys; 438 fSpoofImpl->fSpoofData->fCFUKeys = keys; 439 440 441 // The Value Table, parallels the key table 442 int32_t numValues = fValueVec->size(); 443 U_ASSERT(numKeys == numValues); 444 uint16_t *values = 445 static_cast<uint16_t *>(fSpoofImpl->fSpoofData->reserveSpace(numKeys*sizeof(uint16_t), status)); 446 if (U_FAILURE(status)) { 447 return; 448 } 449 for (i=0; i<numValues; i++) { 450 uint32_t value = static_cast<uint32_t>(fValueVec->elementAti(i)); 451 U_ASSERT(value < 0xffff); 452 values[i] = static_cast<uint16_t>(value); 453 } 454 rawData = fSpoofImpl->fSpoofData->fRawData; 455 rawData->fCFUStringIndex = static_cast<int32_t>(reinterpret_cast<char*>(values) - reinterpret_cast<char*>(rawData)); 456 rawData->fCFUStringIndexSize = numValues; 457 fSpoofImpl->fSpoofData->fCFUValues = values; 458 459 // The Strings Table. 460 461 uint32_t stringsLength = fStringTable->length(); 462 // Reserve an extra space so the string will be nul-terminated. This is 463 // only a convenience, for when debugging; it is not needed otherwise. 464 char16_t *strings = 465 static_cast<char16_t *>(fSpoofImpl->fSpoofData->reserveSpace(stringsLength*sizeof(char16_t)+2, status)); 466 if (U_FAILURE(status)) { 467 return; 468 } 469 fStringTable->extract(strings, stringsLength+1, status); 470 rawData = fSpoofImpl->fSpoofData->fRawData; 471 U_ASSERT(rawData->fCFUStringTable == 0); 472 rawData->fCFUStringTable = static_cast<int32_t>(reinterpret_cast<char*>(strings) - reinterpret_cast<char*>(rawData)); 473 rawData->fCFUStringTableLen = stringsLength; 474 fSpoofImpl->fSpoofData->fCFUStrings = strings; 475 } 476 477 #endif 478 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS