swapimpl.cpp (38620B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2005-2014, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: swapimpl.cpp 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2005may05 16 * created by: Markus W. Scherer 17 * 18 * Data file swapping functions moved here from the common library 19 * because some data is hardcoded in ICU4C and needs not be swapped any more. 20 * Moving the functions here simplifies testing (for code coverage) because 21 * we need not jump through hoops (like adding snapshots of these files 22 * to testdata). 23 * 24 * The declarations for these functions remain in the internal header files 25 * in icu/source/common/ 26 */ 27 28 #include "unicode/utypes.h" 29 #include "unicode/putil.h" 30 #include "unicode/udata.h" 31 32 /* Explicit include statement for std_string.h is needed 33 * for compilation on certain platforms. (e.g. AIX/VACPP) 34 */ 35 #include "unicode/std_string.h" 36 37 #include "cmemory.h" 38 #include "cstring.h" 39 #include "uinvchar.h" 40 #include "uassert.h" 41 #include "uarrsort.h" 42 #include "ucmndata.h" 43 #include "udataswp.h" 44 #include "ulayout_props.h" 45 46 /* swapping implementations in common */ 47 48 #include "emojiprops.h" 49 #include "uresdata.h" 50 #include "ucnv_io.h" 51 #include "uprops.h" 52 #include "ucase.h" 53 #include "ubidi_props.h" 54 #include "ucol_swp.h" 55 #include "ucnv_bld.h" 56 #include "unormimp.h" 57 #include "normalizer2impl.h" 58 #include "sprpimpl.h" 59 #include "propname.h" 60 #include "rbbidata.h" 61 #include "utrie.h" 62 #include "utrie2.h" 63 #include "dictionarydata.h" 64 65 /* swapping implementations in i18n */ 66 67 #if !UCONFIG_NO_NORMALIZATION 68 #include "uspoof_impl.h" 69 #endif 70 71 U_NAMESPACE_USE 72 73 /* definitions */ 74 75 /* Unicode property (value) aliases data swapping --------------------------- */ 76 77 static int32_t U_CALLCONV 78 upname_swap(const UDataSwapper *ds, 79 const void *inData, int32_t length, void *outData, 80 UErrorCode *pErrorCode) { 81 /* udata_swapDataHeader checks the arguments */ 82 int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); 83 if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { 84 return 0; 85 } 86 87 /* check data format and format version */ 88 const UDataInfo *pInfo= 89 reinterpret_cast<const UDataInfo *>( 90 static_cast<const char *>(inData)+4); 91 if(!( 92 pInfo->dataFormat[0]==0x70 && /* dataFormat="pnam" */ 93 pInfo->dataFormat[1]==0x6e && 94 pInfo->dataFormat[2]==0x61 && 95 pInfo->dataFormat[3]==0x6d && 96 pInfo->formatVersion[0]==2 97 )) { 98 udata_printError(ds, "upname_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as pnames.icu\n", 99 pInfo->dataFormat[0], pInfo->dataFormat[1], 100 pInfo->dataFormat[2], pInfo->dataFormat[3], 101 pInfo->formatVersion[0]); 102 *pErrorCode=U_UNSUPPORTED_ERROR; 103 return 0; 104 } 105 106 const uint8_t *inBytes=static_cast<const uint8_t *>(inData)+headerSize; 107 uint8_t *outBytes=static_cast<uint8_t *>(outData)+headerSize; 108 109 if(length>=0) { 110 length-=headerSize; 111 // formatVersion 2 initially has indexes[8], 32 bytes. 112 if(length<32) { 113 udata_printError(ds, "upname_swap(): too few bytes (%d after header) for pnames.icu\n", 114 static_cast<int>(length)); 115 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 116 return 0; 117 } 118 } 119 120 const int32_t *inIndexes=reinterpret_cast<const int32_t *>(inBytes); 121 int32_t totalSize=udata_readInt32(ds, inIndexes[PropNameData::IX_TOTAL_SIZE]); 122 if(length>=0) { 123 if(length<totalSize) { 124 udata_printError(ds, "upname_swap(): too few bytes (%d after header, should be %d) " 125 "for pnames.icu\n", 126 static_cast<int>(length), static_cast<int>(totalSize)); 127 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 128 return 0; 129 } 130 131 int32_t numBytesIndexesAndValueMaps= 132 udata_readInt32(ds, inIndexes[PropNameData::IX_BYTE_TRIES_OFFSET]); 133 134 // Swap the indexes[] and the valueMaps[]. 135 ds->swapArray32(ds, inBytes, numBytesIndexesAndValueMaps, outBytes, pErrorCode); 136 137 // Copy the rest of the data. 138 if(inBytes!=outBytes) { 139 uprv_memcpy(outBytes+numBytesIndexesAndValueMaps, 140 inBytes+numBytesIndexesAndValueMaps, 141 totalSize-numBytesIndexesAndValueMaps); 142 } 143 144 // We need not swap anything else: 145 // 146 // The ByteTries are already byte-serialized, and are fixed on ASCII. 147 // (On an EBCDIC machine, the input string is converted to lowercase ASCII 148 // while matching.) 149 // 150 // The name groups are mostly invariant characters, but since we only 151 // generate, and keep in subversion, ASCII versions of pnames.icu, 152 // and since only ICU4J uses the pnames.icu data file 153 // (the data is hardcoded in ICU4C) and ICU4J uses ASCII data files, 154 // we just copy those bytes too. 155 } 156 157 return headerSize+totalSize; 158 } 159 160 /* Unicode properties data swapping ----------------------------------------- */ 161 162 static int32_t U_CALLCONV 163 uprops_swap(const UDataSwapper *ds, 164 const void *inData, int32_t length, void *outData, 165 UErrorCode *pErrorCode) { 166 const UDataInfo *pInfo; 167 int32_t headerSize, i; 168 169 int32_t dataIndexes[UPROPS_INDEX_COUNT]; 170 const int32_t *inData32; 171 172 /* udata_swapDataHeader checks the arguments */ 173 headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); 174 if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { 175 return 0; 176 } 177 178 /* check data format and format version */ 179 pInfo = reinterpret_cast<const UDataInfo*>(static_cast<const char*>(inData) + 4); 180 if(!( 181 pInfo->dataFormat[0]==0x55 && /* dataFormat="UPro" */ 182 pInfo->dataFormat[1]==0x50 && 183 pInfo->dataFormat[2]==0x72 && 184 pInfo->dataFormat[3]==0x6f && 185 (3<=pInfo->formatVersion[0] && pInfo->formatVersion[0]<=9) && 186 (pInfo->formatVersion[0]>=7 || 187 (pInfo->formatVersion[2]==UTRIE_SHIFT && 188 pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT)) 189 )) { 190 udata_printError(ds, "uprops_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not a Unicode properties file\n", 191 pInfo->dataFormat[0], pInfo->dataFormat[1], 192 pInfo->dataFormat[2], pInfo->dataFormat[3], 193 pInfo->formatVersion[0]); 194 *pErrorCode=U_UNSUPPORTED_ERROR; 195 return 0; 196 } 197 198 /* the properties file must contain at least the indexes array */ 199 if (length >= 0 && (length - headerSize) < static_cast<int32_t>(sizeof(dataIndexes))) { 200 udata_printError(ds, "uprops_swap(): too few bytes (%d after header) for a Unicode properties file\n", 201 length-headerSize); 202 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 203 return 0; 204 } 205 206 /* read the indexes */ 207 inData32 = reinterpret_cast<const int32_t*>(static_cast<const char*>(inData) + headerSize); 208 for(i=0; i<UPROPS_INDEX_COUNT; ++i) { 209 dataIndexes[i]=udata_readInt32(ds, inData32[i]); 210 } 211 212 /* 213 * comments are copied from the data format description in genprops/store.c 214 * indexes[] constants are in uprops.h 215 */ 216 int32_t dataTop; 217 if(length>=0) { 218 int32_t *outData32; 219 220 /* 221 * In formatVersion 7, UPROPS_DATA_TOP_INDEX has the post-header data size. 222 * In earlier formatVersions, it is 0 and a lower dataIndexes entry 223 * has the top of the last item. 224 */ 225 for(i=UPROPS_DATA_TOP_INDEX; i>0 && (dataTop=dataIndexes[i])==0; --i) {} 226 227 if((length-headerSize)<(4*dataTop)) { 228 udata_printError(ds, "uprops_swap(): too few bytes (%d after header) for a Unicode properties file\n", 229 length-headerSize); 230 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 231 return 0; 232 } 233 234 outData32 = reinterpret_cast<int32_t*>(static_cast<char*>(outData) + headerSize); 235 236 /* copy everything for inaccessible data (padding) */ 237 if(inData32!=outData32) { 238 uprv_memcpy(outData32, inData32, 4*(size_t)dataTop); 239 } 240 241 /* swap the indexes[16] */ 242 ds->swapArray32(ds, inData32, 4*UPROPS_INDEX_COUNT, outData32, pErrorCode); 243 244 /* 245 * swap the main properties UTrie 246 * PT serialized properties trie, see utrie.h (byte size: 4*(i0-16)) 247 */ 248 utrie_swapAnyVersion(ds, 249 inData32+UPROPS_INDEX_COUNT, 250 4*(dataIndexes[UPROPS_PROPS32_INDEX]-UPROPS_INDEX_COUNT), 251 outData32+UPROPS_INDEX_COUNT, 252 pErrorCode); 253 254 /* 255 * swap the properties and exceptions words 256 * P const uint32_t props32[i1-i0]; 257 * E const uint32_t exceptions[i2-i1]; 258 */ 259 ds->swapArray32(ds, 260 inData32+dataIndexes[UPROPS_PROPS32_INDEX], 261 4*(dataIndexes[UPROPS_EXCEPTIONS_TOP_INDEX]-dataIndexes[UPROPS_PROPS32_INDEX]), 262 outData32+dataIndexes[UPROPS_PROPS32_INDEX], 263 pErrorCode); 264 265 /* 266 * swap the UChars 267 * U const char16_t uchars[2*(i3-i2)]; 268 */ 269 ds->swapArray16(ds, 270 inData32+dataIndexes[UPROPS_EXCEPTIONS_TOP_INDEX], 271 4*(dataIndexes[UPROPS_ADDITIONAL_TRIE_INDEX]-dataIndexes[UPROPS_EXCEPTIONS_TOP_INDEX]), 272 outData32+dataIndexes[UPROPS_EXCEPTIONS_TOP_INDEX], 273 pErrorCode); 274 275 /* 276 * swap the additional UTrie 277 * i3 additionalTrieIndex; -- 32-bit unit index to the additional trie for more properties 278 */ 279 utrie_swapAnyVersion(ds, 280 inData32+dataIndexes[UPROPS_ADDITIONAL_TRIE_INDEX], 281 4*(dataIndexes[UPROPS_ADDITIONAL_VECTORS_INDEX]-dataIndexes[UPROPS_ADDITIONAL_TRIE_INDEX]), 282 outData32+dataIndexes[UPROPS_ADDITIONAL_TRIE_INDEX], 283 pErrorCode); 284 285 /* 286 * swap the properties vectors 287 * PV const uint32_t propsVectors[(i6-i4)/i5][i5]==uint32_t propsVectors[i6-i4]; 288 */ 289 ds->swapArray32(ds, 290 inData32+dataIndexes[UPROPS_ADDITIONAL_VECTORS_INDEX], 291 4*(dataIndexes[UPROPS_SCRIPT_EXTENSIONS_INDEX]-dataIndexes[UPROPS_ADDITIONAL_VECTORS_INDEX]), 292 outData32+dataIndexes[UPROPS_ADDITIONAL_VECTORS_INDEX], 293 pErrorCode); 294 295 // swap the Script_Extensions data 296 // SCX const uint16_t scriptExtensions[2*(i7-i6)]; 297 ds->swapArray16(ds, 298 inData32+dataIndexes[UPROPS_SCRIPT_EXTENSIONS_INDEX], 299 4*(dataIndexes[UPROPS_BLOCK_TRIE_INDEX]-dataIndexes[UPROPS_SCRIPT_EXTENSIONS_INDEX]), 300 outData32+dataIndexes[UPROPS_SCRIPT_EXTENSIONS_INDEX], 301 pErrorCode); 302 303 // Swap the Block UCPTrie=CodePointTrie. 304 int32_t partOffset = dataIndexes[UPROPS_BLOCK_TRIE_INDEX]; 305 int32_t nextOffset = dataIndexes[UPROPS_RESERVED_INDEX_8]; 306 int32_t partLength = 4 * (nextOffset - partOffset); 307 if (partLength >= 0) { 308 utrie_swapAnyVersion(ds, inData32 + partOffset, partLength, 309 outData32 + partOffset, pErrorCode); 310 } 311 } 312 313 return headerSize+4*dataIndexes[UPROPS_RESERVED_INDEX_8]; 314 } 315 316 /* Unicode case mapping data swapping --------------------------------------- */ 317 318 static int32_t U_CALLCONV 319 ucase_swap(const UDataSwapper *ds, 320 const void *inData, int32_t length, void *outData, 321 UErrorCode *pErrorCode) { 322 const UDataInfo *pInfo; 323 int32_t headerSize; 324 325 const uint8_t *inBytes; 326 uint8_t *outBytes; 327 328 const int32_t *inIndexes; 329 int32_t indexes[16]; 330 331 int32_t i, offset, count, size; 332 333 /* udata_swapDataHeader checks the arguments */ 334 headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); 335 if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { 336 return 0; 337 } 338 339 /* check data format and format version */ 340 pInfo = reinterpret_cast<const UDataInfo*>(static_cast<const char*>(inData) + 4); 341 if(!( 342 pInfo->dataFormat[0]==UCASE_FMT_0 && /* dataFormat="cAsE" */ 343 pInfo->dataFormat[1]==UCASE_FMT_1 && 344 pInfo->dataFormat[2]==UCASE_FMT_2 && 345 pInfo->dataFormat[3]==UCASE_FMT_3 && 346 ((pInfo->formatVersion[0]==1 && 347 pInfo->formatVersion[2]==UTRIE_SHIFT && 348 pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT) || 349 (2<=pInfo->formatVersion[0] && pInfo->formatVersion[0]<=4)) 350 )) { 351 udata_printError(ds, "ucase_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as case mapping data\n", 352 pInfo->dataFormat[0], pInfo->dataFormat[1], 353 pInfo->dataFormat[2], pInfo->dataFormat[3], 354 pInfo->formatVersion[0]); 355 *pErrorCode=U_UNSUPPORTED_ERROR; 356 return 0; 357 } 358 359 inBytes = static_cast<const uint8_t*>(inData) + headerSize; 360 outBytes = static_cast<uint8_t*>(outData) + headerSize; 361 362 inIndexes = reinterpret_cast<const int32_t*>(inBytes); 363 364 if(length>=0) { 365 length-=headerSize; 366 if(length<16*4) { 367 udata_printError(ds, "ucase_swap(): too few bytes (%d after header) for case mapping data\n", 368 length); 369 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 370 return 0; 371 } 372 } 373 374 /* read the first 16 indexes (ICU 3.2/format version 1: UCASE_IX_TOP==16, might grow) */ 375 for(i=0; i<16; ++i) { 376 indexes[i]=udata_readInt32(ds, inIndexes[i]); 377 } 378 379 /* get the total length of the data */ 380 size=indexes[UCASE_IX_LENGTH]; 381 382 if(length>=0) { 383 if(length<size) { 384 udata_printError(ds, "ucase_swap(): too few bytes (%d after header) for all of case mapping data\n", 385 length); 386 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 387 return 0; 388 } 389 390 /* copy the data for inaccessible bytes */ 391 if(inBytes!=outBytes) { 392 uprv_memcpy(outBytes, inBytes, size); 393 } 394 395 offset=0; 396 397 /* swap the int32_t indexes[] */ 398 count=indexes[UCASE_IX_INDEX_TOP]*4; 399 ds->swapArray32(ds, inBytes, count, outBytes, pErrorCode); 400 offset+=count; 401 402 /* swap the UTrie */ 403 count=indexes[UCASE_IX_TRIE_SIZE]; 404 utrie_swapAnyVersion(ds, inBytes+offset, count, outBytes+offset, pErrorCode); 405 offset+=count; 406 407 /* swap the uint16_t exceptions[] and unfold[] */ 408 count=(indexes[UCASE_IX_EXC_LENGTH]+indexes[UCASE_IX_UNFOLD_LENGTH])*2; 409 ds->swapArray16(ds, inBytes+offset, count, outBytes+offset, pErrorCode); 410 offset+=count; 411 412 U_ASSERT(offset==size); 413 } 414 415 return headerSize+size; 416 } 417 418 /* Unicode bidi/shaping data swapping --------------------------------------- */ 419 420 static int32_t U_CALLCONV 421 ubidi_swap(const UDataSwapper *ds, 422 const void *inData, int32_t length, void *outData, 423 UErrorCode *pErrorCode) { 424 const UDataInfo *pInfo; 425 int32_t headerSize; 426 427 const uint8_t *inBytes; 428 uint8_t *outBytes; 429 430 const int32_t *inIndexes; 431 int32_t indexes[16]; 432 433 int32_t i, offset, count, size; 434 435 /* udata_swapDataHeader checks the arguments */ 436 headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); 437 if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { 438 return 0; 439 } 440 441 /* check data format and format version */ 442 pInfo = reinterpret_cast<const UDataInfo*>(static_cast<const char*>(inData) + 4); 443 if(!( 444 pInfo->dataFormat[0]==UBIDI_FMT_0 && /* dataFormat="BiDi" */ 445 pInfo->dataFormat[1]==UBIDI_FMT_1 && 446 pInfo->dataFormat[2]==UBIDI_FMT_2 && 447 pInfo->dataFormat[3]==UBIDI_FMT_3 && 448 ((pInfo->formatVersion[0]==1 && 449 pInfo->formatVersion[2]==UTRIE_SHIFT && 450 pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT) || 451 pInfo->formatVersion[0]==2) 452 )) { 453 udata_printError(ds, "ubidi_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as bidi/shaping data\n", 454 pInfo->dataFormat[0], pInfo->dataFormat[1], 455 pInfo->dataFormat[2], pInfo->dataFormat[3], 456 pInfo->formatVersion[0]); 457 *pErrorCode=U_UNSUPPORTED_ERROR; 458 return 0; 459 } 460 461 inBytes = static_cast<const uint8_t*>(inData) + headerSize; 462 outBytes = static_cast<uint8_t*>(outData) + headerSize; 463 464 inIndexes = reinterpret_cast<const int32_t*>(inBytes); 465 466 if(length>=0) { 467 length-=headerSize; 468 if(length<16*4) { 469 udata_printError(ds, "ubidi_swap(): too few bytes (%d after header) for bidi/shaping data\n", 470 length); 471 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 472 return 0; 473 } 474 } 475 476 /* read the first 16 indexes (ICU 3.4/format version 1: UBIDI_IX_TOP==16, might grow) */ 477 for(i=0; i<16; ++i) { 478 indexes[i]=udata_readInt32(ds, inIndexes[i]); 479 } 480 481 /* get the total length of the data */ 482 size=indexes[UBIDI_IX_LENGTH]; 483 484 if(length>=0) { 485 if(length<size) { 486 udata_printError(ds, "ubidi_swap(): too few bytes (%d after header) for all of bidi/shaping data\n", 487 length); 488 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 489 return 0; 490 } 491 492 /* copy the data for inaccessible bytes */ 493 if(inBytes!=outBytes) { 494 uprv_memcpy(outBytes, inBytes, size); 495 } 496 497 offset=0; 498 499 /* swap the int32_t indexes[] */ 500 count=indexes[UBIDI_IX_INDEX_TOP]*4; 501 ds->swapArray32(ds, inBytes, count, outBytes, pErrorCode); 502 offset+=count; 503 504 /* swap the UTrie */ 505 count=indexes[UBIDI_IX_TRIE_SIZE]; 506 utrie_swapAnyVersion(ds, inBytes+offset, count, outBytes+offset, pErrorCode); 507 offset+=count; 508 509 /* swap the uint32_t mirrors[] */ 510 count=indexes[UBIDI_IX_MIRROR_LENGTH]*4; 511 ds->swapArray32(ds, inBytes+offset, count, outBytes+offset, pErrorCode); 512 offset+=count; 513 514 /* just skip the uint8_t jgArray[] and jgArray2[] */ 515 count=indexes[UBIDI_IX_JG_LIMIT]-indexes[UBIDI_IX_JG_START]; 516 offset+=count; 517 count=indexes[UBIDI_IX_JG_LIMIT2]-indexes[UBIDI_IX_JG_START2]; 518 offset+=count; 519 520 U_ASSERT(offset==size); 521 } 522 523 return headerSize+size; 524 } 525 526 /* Unicode normalization data swapping -------------------------------------- */ 527 528 #if !UCONFIG_NO_NORMALIZATION 529 530 static int32_t U_CALLCONV 531 unorm_swap(const UDataSwapper *ds, 532 const void *inData, int32_t length, void *outData, 533 UErrorCode *pErrorCode) { 534 const UDataInfo *pInfo; 535 int32_t headerSize; 536 537 const uint8_t *inBytes; 538 uint8_t *outBytes; 539 540 const int32_t *inIndexes; 541 int32_t indexes[32]; 542 543 int32_t i, offset, count, size; 544 545 /* udata_swapDataHeader checks the arguments */ 546 headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); 547 if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { 548 return 0; 549 } 550 551 /* check data format and format version */ 552 pInfo = reinterpret_cast<const UDataInfo*>(static_cast<const char*>(inData) + 4); 553 if(!( 554 pInfo->dataFormat[0]==0x4e && /* dataFormat="Norm" */ 555 pInfo->dataFormat[1]==0x6f && 556 pInfo->dataFormat[2]==0x72 && 557 pInfo->dataFormat[3]==0x6d && 558 pInfo->formatVersion[0]==2 559 )) { 560 udata_printError(ds, "unorm_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unorm.icu\n", 561 pInfo->dataFormat[0], pInfo->dataFormat[1], 562 pInfo->dataFormat[2], pInfo->dataFormat[3], 563 pInfo->formatVersion[0]); 564 *pErrorCode=U_UNSUPPORTED_ERROR; 565 return 0; 566 } 567 568 inBytes = static_cast<const uint8_t*>(inData) + headerSize; 569 outBytes = static_cast<uint8_t*>(outData) + headerSize; 570 571 inIndexes = reinterpret_cast<const int32_t*>(inBytes); 572 573 if(length>=0) { 574 length-=headerSize; 575 if(length<32*4) { 576 udata_printError(ds, "unorm_swap(): too few bytes (%d after header) for unorm.icu\n", 577 length); 578 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 579 return 0; 580 } 581 } 582 583 /* read the first 32 indexes (ICU 2.8/format version 2.2: _NORM_INDEX_TOP==32, might grow) */ 584 for(i=0; i<32; ++i) { 585 indexes[i]=udata_readInt32(ds, inIndexes[i]); 586 } 587 588 /* calculate the total length of the data */ 589 size= 590 32*4+ /* size of indexes[] */ 591 indexes[_NORM_INDEX_TRIE_SIZE]+ 592 indexes[_NORM_INDEX_UCHAR_COUNT]*2+ 593 indexes[_NORM_INDEX_COMBINE_DATA_COUNT]*2+ 594 indexes[_NORM_INDEX_FCD_TRIE_SIZE]+ 595 indexes[_NORM_INDEX_AUX_TRIE_SIZE]+ 596 indexes[_NORM_INDEX_CANON_SET_COUNT]*2; 597 598 if(length>=0) { 599 if(length<size) { 600 udata_printError(ds, "unorm_swap(): too few bytes (%d after header) for all of unorm.icu\n", 601 length); 602 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 603 return 0; 604 } 605 606 /* copy the data for inaccessible bytes */ 607 if(inBytes!=outBytes) { 608 uprv_memcpy(outBytes, inBytes, size); 609 } 610 611 offset=0; 612 613 /* swap the indexes[] */ 614 count=32*4; 615 ds->swapArray32(ds, inBytes, count, outBytes, pErrorCode); 616 offset+=count; 617 618 /* swap the main UTrie */ 619 count=indexes[_NORM_INDEX_TRIE_SIZE]; 620 utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode); 621 offset+=count; 622 623 /* swap the uint16_t extraData[] and the uint16_t combiningTable[] */ 624 count=(indexes[_NORM_INDEX_UCHAR_COUNT]+indexes[_NORM_INDEX_COMBINE_DATA_COUNT])*2; 625 ds->swapArray16(ds, inBytes+offset, count, outBytes+offset, pErrorCode); 626 offset+=count; 627 628 /* swap the FCD UTrie */ 629 count=indexes[_NORM_INDEX_FCD_TRIE_SIZE]; 630 if(count!=0) { 631 utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode); 632 offset+=count; 633 } 634 635 /* swap the aux UTrie */ 636 count=indexes[_NORM_INDEX_AUX_TRIE_SIZE]; 637 if(count!=0) { 638 utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode); 639 offset+=count; 640 } 641 642 /* swap the uint16_t combiningTable[] */ 643 count=indexes[_NORM_INDEX_CANON_SET_COUNT]*2; 644 ds->swapArray16(ds, inBytes+offset, count, outBytes+offset, pErrorCode); 645 offset+=count; 646 } 647 648 return headerSize+size; 649 } 650 651 #endif 652 653 // Unicode text layout properties data swapping -------------------------------- 654 655 static int32_t U_CALLCONV 656 ulayout_swap(const UDataSwapper *ds, 657 const void *inData, int32_t length, void *outData, 658 UErrorCode *pErrorCode) { 659 // udata_swapDataHeader checks the arguments. 660 int32_t headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode); 661 if (pErrorCode == nullptr || U_FAILURE(*pErrorCode)) { 662 return 0; 663 } 664 665 // Check data format and format version. 666 const UDataInfo* pInfo = reinterpret_cast<const UDataInfo*>(static_cast<const char*>(inData) + 4); 667 if (!( 668 pInfo->dataFormat[0] == ULAYOUT_FMT_0 && // dataFormat="Layo" 669 pInfo->dataFormat[1] == ULAYOUT_FMT_1 && 670 pInfo->dataFormat[2] == ULAYOUT_FMT_2 && 671 pInfo->dataFormat[3] == ULAYOUT_FMT_3 && 672 pInfo->formatVersion[0] == 1)) { 673 udata_printError(ds, 674 "ulayout_swap(): data format %02x.%02x.%02x.%02x (format version %02x) " 675 "is not recognized as text layout properties data\n", 676 pInfo->dataFormat[0], pInfo->dataFormat[1], 677 pInfo->dataFormat[2], pInfo->dataFormat[3], 678 pInfo->formatVersion[0]); 679 *pErrorCode = U_UNSUPPORTED_ERROR; 680 return 0; 681 } 682 683 const uint8_t* inBytes = static_cast<const uint8_t*>(inData) + headerSize; 684 uint8_t* outBytes = static_cast<uint8_t*>(outData) + headerSize; 685 686 const int32_t* inIndexes = reinterpret_cast<const int32_t*>(inBytes); 687 688 if (length >= 0) { 689 length -= headerSize; 690 if (length < 12 * 4) { 691 udata_printError(ds, 692 "ulayout_swap(): too few bytes (%d after header) for text layout properties data\n", 693 length); 694 *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; 695 return 0; 696 } 697 } 698 699 int32_t indexesLength = udata_readInt32(ds, inIndexes[ULAYOUT_IX_INDEXES_LENGTH]); 700 if (indexesLength < 12) { 701 udata_printError(ds, 702 "ulayout_swap(): too few indexes (%d) for text layout properties data\n", 703 indexesLength); 704 *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; 705 return 0; 706 } 707 708 // Read the data offsets before swapping anything. 709 int32_t indexes[ULAYOUT_IX_TRIES_TOP + 1]; 710 for (int32_t i = ULAYOUT_IX_INPC_TRIE_TOP; i <= ULAYOUT_IX_TRIES_TOP; ++i) { 711 indexes[i] = udata_readInt32(ds, inIndexes[i]); 712 } 713 int32_t size = indexes[ULAYOUT_IX_TRIES_TOP]; 714 715 if (length >= 0) { 716 if (length < size) { 717 udata_printError(ds, 718 "ulayout_swap(): too few bytes (%d after header) " 719 "for all of text layout properties data\n", 720 length); 721 *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; 722 return 0; 723 } 724 725 // Copy the data for inaccessible bytes. 726 if (inBytes != outBytes) { 727 uprv_memcpy(outBytes, inBytes, size); 728 } 729 730 // Swap the int32_t indexes[]. 731 int32_t offset = 0; 732 int32_t count = indexesLength * 4; 733 ds->swapArray32(ds, inBytes, count, outBytes, pErrorCode); 734 offset += count; 735 736 // Swap each trie. 737 for (int32_t i = ULAYOUT_IX_INPC_TRIE_TOP; i <= ULAYOUT_IX_TRIES_TOP; ++i) { 738 int32_t top = indexes[i]; 739 count = top - offset; 740 U_ASSERT(count >= 0); 741 if (count >= 16) { 742 utrie_swapAnyVersion(ds, inBytes + offset, count, outBytes + offset, pErrorCode); 743 } 744 offset = top; 745 } 746 747 U_ASSERT(offset == size); 748 } 749 750 return headerSize + size; 751 } 752 753 // Unicode emoji properties data swapping -------------------------------------- 754 755 static int32_t U_CALLCONV 756 uemoji_swap(const UDataSwapper *ds, 757 const void *inData, int32_t length, void *outData, 758 UErrorCode *pErrorCode) { 759 // udata_swapDataHeader checks the arguments. 760 int32_t headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode); 761 if (pErrorCode == nullptr || U_FAILURE(*pErrorCode)) { 762 return 0; 763 } 764 765 // Check data format and format version. 766 const UDataInfo* pInfo = reinterpret_cast<const UDataInfo*>(static_cast<const char*>(inData) + 4); 767 if (!( 768 pInfo->dataFormat[0] == u'E' && 769 pInfo->dataFormat[1] == u'm' && 770 pInfo->dataFormat[2] == u'o' && 771 pInfo->dataFormat[3] == u'j' && 772 pInfo->formatVersion[0] == 1)) { 773 udata_printError(ds, 774 "uemoji_swap(): data format %02x.%02x.%02x.%02x (format version %02x) " 775 "is not recognized as emoji properties data\n", 776 pInfo->dataFormat[0], pInfo->dataFormat[1], 777 pInfo->dataFormat[2], pInfo->dataFormat[3], 778 pInfo->formatVersion[0]); 779 *pErrorCode = U_UNSUPPORTED_ERROR; 780 return 0; 781 } 782 783 const uint8_t* inBytes = static_cast<const uint8_t*>(inData) + headerSize; 784 uint8_t* outBytes = static_cast<uint8_t*>(outData) + headerSize; 785 786 const int32_t* inIndexes = reinterpret_cast<const int32_t*>(inBytes); 787 788 if (length >= 0) { 789 length -= headerSize; 790 // We expect to read at least EmojiProps::IX_TOTAL_SIZE. 791 if (length < 14 * 4) { 792 udata_printError(ds, 793 "uemoji_swap(): too few bytes (%d after header) for emoji properties data\n", 794 length); 795 *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; 796 return 0; 797 } 798 } 799 800 // First offset after indexes[]. 801 int32_t cpTrieOffset = udata_readInt32(ds, inIndexes[EmojiProps::IX_CPTRIE_OFFSET]); 802 int32_t indexesLength = cpTrieOffset / 4; 803 if (indexesLength < 14) { 804 udata_printError(ds, 805 "uemoji_swap(): too few indexes (%d) for emoji properties data\n", 806 indexesLength); 807 *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; 808 return 0; 809 } 810 811 // Read the data offsets before swapping anything. 812 int32_t indexes[EmojiProps::IX_TOTAL_SIZE + 1]; 813 indexes[0] = cpTrieOffset; 814 for (int32_t i = 1; i <= EmojiProps::IX_TOTAL_SIZE; ++i) { 815 indexes[i] = udata_readInt32(ds, inIndexes[i]); 816 } 817 int32_t size = indexes[EmojiProps::IX_TOTAL_SIZE]; 818 819 if (length >= 0) { 820 if (length < size) { 821 udata_printError(ds, 822 "uemoji_swap(): too few bytes (%d after header) " 823 "for all of emoji properties data\n", 824 length); 825 *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; 826 return 0; 827 } 828 829 // Copy the data for inaccessible bytes. 830 if (inBytes != outBytes) { 831 uprv_memcpy(outBytes, inBytes, size); 832 } 833 834 // Swap the int32_t indexes[]. 835 int32_t offset = 0; 836 int32_t top = cpTrieOffset; 837 ds->swapArray32(ds, inBytes, top - offset, outBytes, pErrorCode); 838 offset = top; 839 840 // Swap the code point trie. 841 top = indexes[EmojiProps::IX_CPTRIE_OFFSET + 1]; 842 int32_t count = top - offset; 843 U_ASSERT(count >= 0); 844 if (count >= 16) { 845 utrie_swapAnyVersion(ds, inBytes + offset, count, outBytes + offset, pErrorCode); 846 } 847 offset = top; 848 849 // Swap all of the string tries. 850 // They are all serialized as arrays of 16-bit units. 851 offset = indexes[EmojiProps::IX_BASIC_EMOJI_TRIE_OFFSET]; 852 top = indexes[EmojiProps::IX_RGI_EMOJI_ZWJ_SEQUENCE_TRIE_OFFSET + 1]; 853 ds->swapArray16(ds, inBytes + offset, top - offset, outBytes + offset, pErrorCode); 854 offset = top; 855 856 U_ASSERT(offset == size); 857 } 858 859 return headerSize + size; 860 } 861 862 /* Swap 'Test' data from gentest */ 863 static int32_t U_CALLCONV 864 test_swap(const UDataSwapper *ds, 865 const void *inData, int32_t length, void *outData, 866 UErrorCode *pErrorCode) { 867 const UDataInfo *pInfo; 868 int32_t headerSize; 869 870 const uint8_t *inBytes; 871 uint8_t *outBytes; 872 873 int32_t offset; 874 875 /* udata_swapDataHeader checks the arguments */ 876 headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); 877 if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { 878 udata_printError(ds, "test_swap(): data header swap failed %s\n", pErrorCode != nullptr ? u_errorName(*pErrorCode) : "pErrorCode is nullptr"); 879 return 0; 880 } 881 882 /* check data format and format version */ 883 pInfo = reinterpret_cast<const UDataInfo*>(static_cast<const char*>(inData) + 4); 884 if(!( 885 pInfo->dataFormat[0]==0x54 && /* dataFormat="Norm" */ 886 pInfo->dataFormat[1]==0x65 && 887 pInfo->dataFormat[2]==0x73 && 888 pInfo->dataFormat[3]==0x74 && 889 pInfo->formatVersion[0]==1 890 )) { 891 udata_printError(ds, "test_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as testdata\n", 892 pInfo->dataFormat[0], pInfo->dataFormat[1], 893 pInfo->dataFormat[2], pInfo->dataFormat[3], 894 pInfo->formatVersion[0]); 895 *pErrorCode=U_UNSUPPORTED_ERROR; 896 return 0; 897 } 898 899 inBytes = static_cast<const uint8_t*>(inData) + headerSize; 900 outBytes = static_cast<uint8_t*>(outData) + headerSize; 901 902 int32_t size16 = 2; // 16bit plus padding 903 int32_t sizeStr = 5; // 4 char inv-str plus null 904 int32_t size = size16 + sizeStr; 905 906 if(length>=0) { 907 if(length<size) { 908 udata_printError(ds, "test_swap(): too few bytes (%d after header, wanted %d) for all of testdata\n", 909 length, size); 910 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 911 return 0; 912 } 913 914 offset =0; 915 /* swap a 1 entry array */ 916 ds->swapArray16(ds, inBytes+offset, size16, outBytes+offset, pErrorCode); 917 offset+=size16; 918 ds->swapInvChars(ds, inBytes+offset, sizeStr, outBytes+offset, pErrorCode); 919 } 920 921 return headerSize+size; 922 } 923 924 /* swap any data (except a .dat package) ------------------------------------ */ 925 926 static const struct { 927 uint8_t dataFormat[4]; 928 UDataSwapFn *swapFn; 929 } swapFns[]={ 930 { { 0x52, 0x65, 0x73, 0x42 }, ures_swap }, /* dataFormat="ResB" */ 931 #if !UCONFIG_NO_LEGACY_CONVERSION 932 { { 0x63, 0x6e, 0x76, 0x74 }, ucnv_swap }, /* dataFormat="cnvt" */ 933 #endif 934 #if !UCONFIG_NO_CONVERSION 935 { { 0x43, 0x76, 0x41, 0x6c }, ucnv_swapAliases }, /* dataFormat="CvAl" */ 936 #endif 937 #if !UCONFIG_NO_IDNA 938 { { 0x53, 0x50, 0x52, 0x50 }, usprep_swap }, /* dataFormat="SPRP" */ 939 #endif 940 /* insert data formats here, descending by expected frequency of occurrence */ 941 { { 0x55, 0x50, 0x72, 0x6f }, uprops_swap }, /* dataFormat="UPro" */ 942 943 { { UCASE_FMT_0, UCASE_FMT_1, UCASE_FMT_2, UCASE_FMT_3 }, 944 ucase_swap }, /* dataFormat="cAsE" */ 945 946 { { UBIDI_FMT_0, UBIDI_FMT_1, UBIDI_FMT_2, UBIDI_FMT_3 }, 947 ubidi_swap }, /* dataFormat="BiDi" */ 948 949 #if !UCONFIG_NO_NORMALIZATION 950 { { 0x4e, 0x6f, 0x72, 0x6d }, unorm_swap }, /* dataFormat="Norm" */ 951 { { 0x4e, 0x72, 0x6d, 0x32 }, unorm2_swap }, /* dataFormat="Nrm2" */ 952 #endif 953 954 { { ULAYOUT_FMT_0, ULAYOUT_FMT_1, ULAYOUT_FMT_2, ULAYOUT_FMT_3 }, 955 ulayout_swap }, // dataFormat="Layo" 956 957 { { u'E', u'm', u'o', u'j' }, uemoji_swap }, 958 959 #if !UCONFIG_NO_COLLATION 960 { { 0x55, 0x43, 0x6f, 0x6c }, ucol_swap }, /* dataFormat="UCol" */ 961 { { 0x49, 0x6e, 0x76, 0x43 }, ucol_swapInverseUCA },/* dataFormat="InvC" */ 962 #endif 963 #if !UCONFIG_NO_BREAK_ITERATION 964 { { 0x42, 0x72, 0x6b, 0x20 }, ubrk_swap }, /* dataFormat="Brk " */ 965 { { 0x44, 0x69, 0x63, 0x74 }, udict_swap }, /* dataFormat="Dict" */ 966 #endif 967 { { 0x70, 0x6e, 0x61, 0x6d }, upname_swap }, /* dataFormat="pnam" */ 968 { { 0x75, 0x6e, 0x61, 0x6d }, uchar_swapNames }, /* dataFormat="unam" */ 969 #if !UCONFIG_NO_NORMALIZATION 970 { { 0x43, 0x66, 0x75, 0x20 }, uspoof_swap }, /* dataFormat="Cfu " */ 971 #endif 972 { { 0x54, 0x65, 0x73, 0x74 }, test_swap } /* dataFormat="Test" */ 973 }; 974 975 U_CAPI int32_t U_EXPORT2 976 udata_swap(const UDataSwapper *ds, 977 const void *inData, int32_t length, void *outData, 978 UErrorCode *pErrorCode) { 979 char dataFormatChars[4]; 980 const UDataInfo *pInfo; 981 int32_t i, swappedLength; 982 983 if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { 984 return 0; 985 } 986 987 /* 988 * Preflight the header first; checks for illegal arguments, too. 989 * Do not swap the header right away because the format-specific swapper 990 * will swap it, get the headerSize again, and also use the header 991 * information. Otherwise we would have to pass some of the information 992 * and not be able to use the UDataSwapFn signature. 993 */ 994 udata_swapDataHeader(ds, inData, -1, nullptr, pErrorCode); 995 996 /* 997 * If we wanted udata_swap() to also handle non-loadable data like a UTrie, 998 * then we could check here for further known magic values and structures. 999 */ 1000 if(U_FAILURE(*pErrorCode)) { 1001 return 0; /* the data format was not recognized */ 1002 } 1003 1004 pInfo=(const UDataInfo *)((const char *)inData+4); 1005 1006 { 1007 /* convert the data format from ASCII to Unicode to the system charset */ 1008 char16_t u[4]={ 1009 pInfo->dataFormat[0], pInfo->dataFormat[1], 1010 pInfo->dataFormat[2], pInfo->dataFormat[3] 1011 }; 1012 1013 if(uprv_isInvariantUString(u, 4)) { 1014 u_UCharsToChars(u, dataFormatChars, 4); 1015 } else { 1016 dataFormatChars[0]=dataFormatChars[1]=dataFormatChars[2]=dataFormatChars[3]='?'; 1017 } 1018 } 1019 1020 /* dispatch to the swap function for the dataFormat */ 1021 for(i=0; i<UPRV_LENGTHOF(swapFns); ++i) { 1022 if(0==memcmp(swapFns[i].dataFormat, pInfo->dataFormat, 4)) { 1023 swappedLength=swapFns[i].swapFn(ds, inData, length, outData, pErrorCode); 1024 1025 if(U_FAILURE(*pErrorCode)) { 1026 udata_printError(ds, "udata_swap(): failure swapping data format %02x.%02x.%02x.%02x (\"%c%c%c%c\") - %s\n", 1027 pInfo->dataFormat[0], pInfo->dataFormat[1], 1028 pInfo->dataFormat[2], pInfo->dataFormat[3], 1029 dataFormatChars[0], dataFormatChars[1], 1030 dataFormatChars[2], dataFormatChars[3], 1031 u_errorName(*pErrorCode)); 1032 } else if(swappedLength<(length-15)) { 1033 /* swapped less than expected */ 1034 udata_printError(ds, "udata_swap() warning: swapped only %d out of %d bytes - data format %02x.%02x.%02x.%02x (\"%c%c%c%c\")\n", 1035 swappedLength, length, 1036 pInfo->dataFormat[0], pInfo->dataFormat[1], 1037 pInfo->dataFormat[2], pInfo->dataFormat[3], 1038 dataFormatChars[0], dataFormatChars[1], 1039 dataFormatChars[2], dataFormatChars[3], 1040 u_errorName(*pErrorCode)); 1041 } 1042 1043 return swappedLength; 1044 } 1045 } 1046 1047 /* the dataFormat was not recognized */ 1048 udata_printError(ds, "udata_swap(): unknown data format %02x.%02x.%02x.%02x (\"%c%c%c%c\")\n", 1049 pInfo->dataFormat[0], pInfo->dataFormat[1], 1050 pInfo->dataFormat[2], pInfo->dataFormat[3], 1051 dataFormatChars[0], dataFormatChars[1], 1052 dataFormatChars[2], dataFormatChars[3]); 1053 1054 *pErrorCode=U_UNSUPPORTED_ERROR; 1055 return 0; 1056 }