utrie2_builder.cpp (48178B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ****************************************************************************** 5 * 6 * Copyright (C) 2001-2014, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ****************************************************************************** 10 * file name: utrie2_builder.cpp 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2008sep26 (split off from utrie2.c) 16 * created by: Markus W. Scherer 17 * 18 * This is a common implementation of a Unicode trie. 19 * It is a kind of compressed, serializable table of 16- or 32-bit values associated with 20 * Unicode code points (0..0x10ffff). 21 * This is the second common version of a Unicode trie (hence the name UTrie2). 22 * See utrie2.h for a comparison. 23 * 24 * This file contains only the builder code. 25 * See utrie2.c for the runtime and enumeration code. 26 */ 27 // #define UTRIE2_DEBUG 28 #ifdef UTRIE2_DEBUG 29 # include <stdio.h> 30 #endif 31 // #define UCPTRIE_DEBUG 32 33 #include "unicode/utypes.h" 34 #ifdef UCPTRIE_DEBUG 35 #include "unicode/ucptrie.h" 36 #include "unicode/umutablecptrie.h" 37 #include "ucptrie_impl.h" 38 #endif 39 #include "cmemory.h" 40 #include "utrie2.h" 41 #include "utrie2_impl.h" 42 43 #include "utrie.h" // for utrie2_fromUTrie() 44 45 /* Implementation notes ----------------------------------------------------- */ 46 47 /* 48 * The UTRIE2_SHIFT_1, UTRIE2_SHIFT_2, UTRIE2_INDEX_SHIFT and other values 49 * have been chosen to minimize trie sizes overall. 50 * Most of the code is flexible enough to work with a range of values, 51 * within certain limits. 52 * 53 * Exception: Support for separate values for lead surrogate code _units_ 54 * vs. code _points_ was added after the constants were fixed, 55 * and has not been tested nor particularly designed for different constant values. 56 * (Especially the utrie2_enum() code that jumps to the special LSCP index-2 57 * part and back.) 58 * 59 * Requires UTRIE2_SHIFT_2<=6. Otherwise 0xc0 which is the top of the ASCII-linear data 60 * including the bad-UTF-8-data block is not a multiple of UTRIE2_DATA_BLOCK_LENGTH 61 * and map[block>>UTRIE2_SHIFT_2] (used in reference counting and compaction 62 * remapping) stops working. 63 * 64 * Requires UTRIE2_SHIFT_1>=10 because utrie2_enumForLeadSurrogate() 65 * assumes that a single index-2 block is used for 0x400 code points 66 * corresponding to one lead surrogate. 67 * 68 * Requires UTRIE2_SHIFT_1<=16. Otherwise one single index-2 block contains 69 * more than one Unicode plane, and the split of the index-2 table into a BMP 70 * part and a supplementary part, with a gap in between, would not work. 71 * 72 * Requires UTRIE2_INDEX_SHIFT>=1 not because of the code but because 73 * there is data with more than 64k distinct values, 74 * for example for Unihan collation with a separate collation weight per 75 * Han character. 76 */ 77 78 /* Building a trie ----------------------------------------------------------*/ 79 80 enum { 81 /** The null index-2 block, following the gap in the index-2 table. */ 82 UNEWTRIE2_INDEX_2_NULL_OFFSET=UNEWTRIE2_INDEX_GAP_OFFSET+UNEWTRIE2_INDEX_GAP_LENGTH, 83 84 /** The start of allocated index-2 blocks. */ 85 UNEWTRIE2_INDEX_2_START_OFFSET=UNEWTRIE2_INDEX_2_NULL_OFFSET+UTRIE2_INDEX_2_BLOCK_LENGTH, 86 87 /** 88 * The null data block. 89 * Length 64=0x40 even if UTRIE2_DATA_BLOCK_LENGTH is smaller, 90 * to work with 6-bit trail bytes from 2-byte UTF-8. 91 */ 92 UNEWTRIE2_DATA_NULL_OFFSET=UTRIE2_DATA_START_OFFSET, 93 94 /** The start of allocated data blocks. */ 95 UNEWTRIE2_DATA_START_OFFSET=UNEWTRIE2_DATA_NULL_OFFSET+0x40, 96 97 /** 98 * The start of data blocks for U+0800 and above. 99 * Below, compaction uses a block length of 64 for 2-byte UTF-8. 100 * From here on, compaction uses UTRIE2_DATA_BLOCK_LENGTH. 101 * Data values for 0x780 code points beyond ASCII. 102 */ 103 UNEWTRIE2_DATA_0800_OFFSET=UNEWTRIE2_DATA_START_OFFSET+0x780 104 }; 105 106 /* Start with allocation of 16k data entries. */ 107 #define UNEWTRIE2_INITIAL_DATA_LENGTH ((int32_t)1<<14) 108 109 /* Grow about 8x each time. */ 110 #define UNEWTRIE2_MEDIUM_DATA_LENGTH ((int32_t)1<<17) 111 112 static int32_t 113 allocIndex2Block(UNewTrie2 *trie); 114 115 U_CAPI UTrie2 * U_EXPORT2 116 utrie2_open(uint32_t initialValue, uint32_t errorValue, UErrorCode *pErrorCode) { 117 UTrie2 *trie; 118 UNewTrie2 *newTrie; 119 uint32_t *data; 120 int32_t i, j; 121 122 if(U_FAILURE(*pErrorCode)) { 123 return nullptr; 124 } 125 126 trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2)); 127 newTrie=(UNewTrie2 *)uprv_malloc(sizeof(UNewTrie2)); 128 data=(uint32_t *)uprv_malloc(UNEWTRIE2_INITIAL_DATA_LENGTH*4); 129 if(trie==nullptr || newTrie==nullptr || data==nullptr) { 130 uprv_free(trie); 131 uprv_free(newTrie); 132 uprv_free(data); 133 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 134 return nullptr; 135 } 136 137 uprv_memset(trie, 0, sizeof(UTrie2)); 138 trie->initialValue=initialValue; 139 trie->errorValue=errorValue; 140 trie->highStart=0x110000; 141 trie->newTrie=newTrie; 142 #ifdef UTRIE2_DEBUG 143 trie->name="open"; 144 #endif 145 146 newTrie->data=data; 147 #ifdef UCPTRIE_DEBUG 148 newTrie->t3=umutablecptrie_open(initialValue, errorValue, pErrorCode); 149 #endif 150 newTrie->dataCapacity=UNEWTRIE2_INITIAL_DATA_LENGTH; 151 newTrie->initialValue=initialValue; 152 newTrie->errorValue=errorValue; 153 newTrie->highStart=0x110000; 154 newTrie->firstFreeBlock=0; /* no free block in the list */ 155 newTrie->isCompacted=false; 156 157 /* 158 * preallocate and reset 159 * - ASCII 160 * - the bad-UTF-8-data block 161 * - the null data block 162 */ 163 for(i=0; i<0x80; ++i) { 164 newTrie->data[i]=initialValue; 165 } 166 for(; i<0xc0; ++i) { 167 newTrie->data[i]=errorValue; 168 } 169 for(i=UNEWTRIE2_DATA_NULL_OFFSET; i<UNEWTRIE2_DATA_START_OFFSET; ++i) { 170 newTrie->data[i]=initialValue; 171 } 172 newTrie->dataNullOffset=UNEWTRIE2_DATA_NULL_OFFSET; 173 newTrie->dataLength=UNEWTRIE2_DATA_START_OFFSET; 174 175 /* set the index-2 indexes for the 2=0x80>>UTRIE2_SHIFT_2 ASCII data blocks */ 176 for(i=0, j=0; j<0x80; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) { 177 newTrie->index2[i]=j; 178 newTrie->map[i]=1; 179 } 180 /* reference counts for the bad-UTF-8-data block */ 181 for(; j<0xc0; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) { 182 newTrie->map[i]=0; 183 } 184 /* 185 * Reference counts for the null data block: all blocks except for the ASCII blocks. 186 * Plus 1 so that we don't drop this block during compaction. 187 * Plus as many as needed for lead surrogate code points. 188 */ 189 /* i==newTrie->dataNullOffset */ 190 newTrie->map[i++]= 191 (0x110000>>UTRIE2_SHIFT_2)- 192 (0x80>>UTRIE2_SHIFT_2)+ 193 1+ 194 UTRIE2_LSCP_INDEX_2_LENGTH; 195 j+=UTRIE2_DATA_BLOCK_LENGTH; 196 for(; j<UNEWTRIE2_DATA_START_OFFSET; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) { 197 newTrie->map[i]=0; 198 } 199 200 /* 201 * set the remaining indexes in the BMP index-2 block 202 * to the null data block 203 */ 204 for(i=0x80>>UTRIE2_SHIFT_2; i<UTRIE2_INDEX_2_BMP_LENGTH; ++i) { 205 newTrie->index2[i]=UNEWTRIE2_DATA_NULL_OFFSET; 206 } 207 208 /* 209 * Fill the index gap with impossible values so that compaction 210 * does not overlap other index-2 blocks with the gap. 211 */ 212 for(i=0; i<UNEWTRIE2_INDEX_GAP_LENGTH; ++i) { 213 newTrie->index2[UNEWTRIE2_INDEX_GAP_OFFSET+i]=-1; 214 } 215 216 /* set the indexes in the null index-2 block */ 217 for(i=0; i<UTRIE2_INDEX_2_BLOCK_LENGTH; ++i) { 218 newTrie->index2[UNEWTRIE2_INDEX_2_NULL_OFFSET+i]=UNEWTRIE2_DATA_NULL_OFFSET; 219 } 220 newTrie->index2NullOffset=UNEWTRIE2_INDEX_2_NULL_OFFSET; 221 newTrie->index2Length=UNEWTRIE2_INDEX_2_START_OFFSET; 222 223 /* set the index-1 indexes for the linear index-2 block */ 224 for(i=0, j=0; 225 i<UTRIE2_OMITTED_BMP_INDEX_1_LENGTH; 226 ++i, j+=UTRIE2_INDEX_2_BLOCK_LENGTH 227 ) { 228 newTrie->index1[i]=j; 229 } 230 231 /* set the remaining index-1 indexes to the null index-2 block */ 232 for(; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) { 233 newTrie->index1[i]=UNEWTRIE2_INDEX_2_NULL_OFFSET; 234 } 235 236 /* 237 * Preallocate and reset data for U+0080..U+07ff, 238 * for 2-byte UTF-8 which will be compacted in 64-blocks 239 * even if UTRIE2_DATA_BLOCK_LENGTH is smaller. 240 */ 241 for(i=0x80; i<0x800; i+=UTRIE2_DATA_BLOCK_LENGTH) { 242 utrie2_set32(trie, i, initialValue, pErrorCode); 243 } 244 245 return trie; 246 } 247 248 static UNewTrie2 * 249 cloneBuilder(const UNewTrie2 *other) { 250 UNewTrie2 *trie; 251 252 trie = static_cast<UNewTrie2*>(uprv_malloc(sizeof(UNewTrie2))); 253 if(trie==nullptr) { 254 return nullptr; 255 } 256 257 trie->data = static_cast<uint32_t*>(uprv_malloc(other->dataCapacity * 4)); 258 if(trie->data==nullptr) { 259 uprv_free(trie); 260 return nullptr; 261 } 262 #ifdef UCPTRIE_DEBUG 263 if(other->t3==nullptr) { 264 trie->t3=nullptr; 265 } else { 266 UErrorCode errorCode=U_ZERO_ERROR; 267 trie->t3=umutablecptrie_clone(other->t3, &errorCode); 268 } 269 #endif 270 trie->dataCapacity=other->dataCapacity; 271 272 /* clone data */ 273 uprv_memcpy(trie->index1, other->index1, sizeof(trie->index1)); 274 uprv_memcpy(trie->index2, other->index2, (size_t)other->index2Length*4); 275 trie->index2NullOffset=other->index2NullOffset; 276 trie->index2Length=other->index2Length; 277 278 uprv_memcpy(trie->data, other->data, (size_t)other->dataLength*4); 279 trie->dataNullOffset=other->dataNullOffset; 280 trie->dataLength=other->dataLength; 281 282 /* reference counters */ 283 if(other->isCompacted) { 284 trie->firstFreeBlock=0; 285 } else { 286 uprv_memcpy(trie->map, other->map, ((size_t)other->dataLength>>UTRIE2_SHIFT_2)*4); 287 trie->firstFreeBlock=other->firstFreeBlock; 288 } 289 290 trie->initialValue=other->initialValue; 291 trie->errorValue=other->errorValue; 292 trie->highStart=other->highStart; 293 trie->isCompacted=other->isCompacted; 294 295 return trie; 296 } 297 298 U_CAPI UTrie2 * U_EXPORT2 299 utrie2_clone(const UTrie2 *other, UErrorCode *pErrorCode) { 300 UTrie2 *trie; 301 302 if(U_FAILURE(*pErrorCode)) { 303 return nullptr; 304 } 305 if(other==nullptr || (other->memory==nullptr && other->newTrie==nullptr)) { 306 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 307 return nullptr; 308 } 309 310 trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2)); 311 if(trie==nullptr) { 312 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 313 return nullptr; 314 } 315 uprv_memcpy(trie, other, sizeof(UTrie2)); 316 317 if(other->memory!=nullptr) { 318 trie->memory=uprv_malloc(other->length); 319 if(trie->memory!=nullptr) { 320 trie->isMemoryOwned=true; 321 uprv_memcpy(trie->memory, other->memory, other->length); 322 323 /* make the clone's pointers point to its own memory */ 324 trie->index=(uint16_t *)trie->memory+(other->index-(uint16_t *)other->memory); 325 if(other->data16!=nullptr) { 326 trie->data16=(uint16_t *)trie->memory+(other->data16-(uint16_t *)other->memory); 327 } 328 if(other->data32!=nullptr) { 329 trie->data32=(uint32_t *)trie->memory+(other->data32-(uint32_t *)other->memory); 330 } 331 } 332 } else /* other->newTrie!=nullptr */ { 333 trie->newTrie=cloneBuilder(other->newTrie); 334 } 335 336 if(trie->memory==nullptr && trie->newTrie==nullptr) { 337 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 338 uprv_free(trie); 339 trie=nullptr; 340 } 341 return trie; 342 } 343 344 typedef struct NewTrieAndStatus { 345 UTrie2 *trie; 346 UErrorCode errorCode; 347 UBool exclusiveLimit; /* rather than inclusive range end */ 348 } NewTrieAndStatus; 349 350 static UBool U_CALLCONV 351 copyEnumRange(const void *context, UChar32 start, UChar32 end, uint32_t value) { 352 NewTrieAndStatus *nt=(NewTrieAndStatus *)context; 353 if(value!=nt->trie->initialValue) { 354 if(nt->exclusiveLimit) { 355 --end; 356 } 357 if(start==end) { 358 utrie2_set32(nt->trie, start, value, &nt->errorCode); 359 } else { 360 utrie2_setRange32(nt->trie, start, end, value, true, &nt->errorCode); 361 } 362 return U_SUCCESS(nt->errorCode); 363 } else { 364 return true; 365 } 366 } 367 368 #ifdef UTRIE2_DEBUG 369 static long countInitial(const UTrie2 *trie) { 370 uint32_t initialValue=trie->initialValue; 371 int32_t length=trie->dataLength; 372 long count=0; 373 if(trie->data16!=nullptr) { 374 for(int32_t i=0; i<length; ++i) { 375 if(trie->data16[i]==initialValue) { ++count; } 376 } 377 } else { 378 for(int32_t i=0; i<length; ++i) { 379 if(trie->data32[i]==initialValue) { ++count; } 380 } 381 } 382 return count; 383 } 384 385 static void 386 utrie_printLengths(const UTrie *trie) { 387 long indexLength=trie->indexLength; 388 long dataLength=(long)trie->dataLength; 389 long totalLength=(long)sizeof(UTrieHeader)+indexLength*2+dataLength*(trie->data32!=nullptr ? 4 : 2); 390 printf("**UTrieLengths** index:%6ld data:%6ld serialized:%6ld\n", 391 indexLength, dataLength, totalLength); 392 } 393 394 static void 395 utrie2_printLengths(const UTrie2 *trie, const char *which) { 396 long indexLength=trie->indexLength; 397 long dataLength=(long)trie->dataLength; 398 long totalLength=(long)sizeof(UTrie2Header)+indexLength*2+dataLength*(trie->data32!=nullptr ? 4 : 2); 399 printf("**UTrie2Lengths(%s %s)** index:%6ld data:%6ld countInitial:%6ld serialized:%6ld\n", 400 which, trie->name, indexLength, dataLength, countInitial(trie), totalLength); 401 } 402 #endif 403 404 U_CAPI UTrie2 * U_EXPORT2 405 utrie2_cloneAsThawed(const UTrie2 *other, UErrorCode *pErrorCode) { 406 NewTrieAndStatus context; 407 char16_t lead; 408 409 if(U_FAILURE(*pErrorCode)) { 410 return nullptr; 411 } 412 if(other==nullptr || (other->memory==nullptr && other->newTrie==nullptr)) { 413 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 414 return nullptr; 415 } 416 if(other->newTrie!=nullptr && !other->newTrie->isCompacted) { 417 return utrie2_clone(other, pErrorCode); /* clone an unfrozen trie */ 418 } 419 420 /* Clone the frozen trie by enumerating it and building a new one. */ 421 context.trie=utrie2_open(other->initialValue, other->errorValue, pErrorCode); 422 if(U_FAILURE(*pErrorCode)) { 423 return nullptr; 424 } 425 context.exclusiveLimit=false; 426 context.errorCode=*pErrorCode; 427 utrie2_enum(other, nullptr, copyEnumRange, &context); 428 *pErrorCode=context.errorCode; 429 for(lead=0xd800; lead<0xdc00; ++lead) { 430 uint32_t value; 431 if(other->data32==nullptr) { 432 value=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(other, lead); 433 } else { 434 value=UTRIE2_GET32_FROM_U16_SINGLE_LEAD(other, lead); 435 } 436 if(value!=other->initialValue) { 437 utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErrorCode); 438 } 439 } 440 if(U_FAILURE(*pErrorCode)) { 441 utrie2_close(context.trie); 442 context.trie=nullptr; 443 } 444 return context.trie; 445 } 446 447 /* Almost the same as utrie2_cloneAsThawed() but copies a UTrie and freezes the clone. */ 448 U_CAPI UTrie2 * U_EXPORT2 449 utrie2_fromUTrie(const UTrie *trie1, uint32_t errorValue, UErrorCode *pErrorCode) { 450 NewTrieAndStatus context; 451 char16_t lead; 452 453 if(U_FAILURE(*pErrorCode)) { 454 return nullptr; 455 } 456 if(trie1==nullptr) { 457 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 458 return nullptr; 459 } 460 context.trie=utrie2_open(trie1->initialValue, errorValue, pErrorCode); 461 if(U_FAILURE(*pErrorCode)) { 462 return nullptr; 463 } 464 context.exclusiveLimit=true; 465 context.errorCode=*pErrorCode; 466 utrie_enum(trie1, nullptr, copyEnumRange, &context); 467 *pErrorCode=context.errorCode; 468 for(lead=0xd800; lead<0xdc00; ++lead) { 469 uint32_t value; 470 if(trie1->data32==nullptr) { 471 value=UTRIE_GET16_FROM_LEAD(trie1, lead); 472 } else { 473 value=UTRIE_GET32_FROM_LEAD(trie1, lead); 474 } 475 if(value!=trie1->initialValue) { 476 utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErrorCode); 477 } 478 } 479 if(U_SUCCESS(*pErrorCode)) { 480 utrie2_freeze(context.trie, 481 trie1->data32!=nullptr ? UTRIE2_32_VALUE_BITS : UTRIE2_16_VALUE_BITS, 482 pErrorCode); 483 } 484 #ifdef UTRIE2_DEBUG 485 if(U_SUCCESS(*pErrorCode)) { 486 utrie_printLengths(trie1); 487 utrie2_printLengths(context.trie, "fromUTrie"); 488 } 489 #endif 490 if(U_FAILURE(*pErrorCode)) { 491 utrie2_close(context.trie); 492 context.trie=nullptr; 493 } 494 return context.trie; 495 } 496 497 static inline UBool 498 isInNullBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) { 499 int32_t i2, block; 500 501 if(U_IS_LEAD(c) && forLSCP) { 502 i2=(UTRIE2_LSCP_INDEX_2_OFFSET-(0xd800>>UTRIE2_SHIFT_2))+ 503 (c>>UTRIE2_SHIFT_2); 504 } else { 505 i2=trie->index1[c>>UTRIE2_SHIFT_1]+ 506 ((c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK); 507 } 508 block=trie->index2[i2]; 509 return block == trie->dataNullOffset; 510 } 511 512 static int32_t 513 allocIndex2Block(UNewTrie2 *trie) { 514 int32_t newBlock, newTop; 515 516 newBlock=trie->index2Length; 517 newTop=newBlock+UTRIE2_INDEX_2_BLOCK_LENGTH; 518 if(newTop>UPRV_LENGTHOF(trie->index2)) { 519 /* 520 * Should never occur. 521 * Either UTRIE2_MAX_BUILD_TIME_INDEX_LENGTH is incorrect, 522 * or the code writes more values than should be possible. 523 */ 524 return -1; 525 } 526 trie->index2Length=newTop; 527 uprv_memcpy(trie->index2+newBlock, trie->index2+trie->index2NullOffset, UTRIE2_INDEX_2_BLOCK_LENGTH*4); 528 return newBlock; 529 } 530 531 static int32_t 532 getIndex2Block(UNewTrie2 *trie, UChar32 c, UBool forLSCP) { 533 int32_t i1, i2; 534 535 if(U_IS_LEAD(c) && forLSCP) { 536 return UTRIE2_LSCP_INDEX_2_OFFSET; 537 } 538 539 i1=c>>UTRIE2_SHIFT_1; 540 i2=trie->index1[i1]; 541 if(i2==trie->index2NullOffset) { 542 i2=allocIndex2Block(trie); 543 if(i2<0) { 544 return -1; /* program error */ 545 } 546 trie->index1[i1]=i2; 547 } 548 return i2; 549 } 550 551 static int32_t 552 allocDataBlock(UNewTrie2 *trie, int32_t copyBlock) { 553 int32_t newBlock, newTop; 554 555 if(trie->firstFreeBlock!=0) { 556 /* get the first free block */ 557 newBlock=trie->firstFreeBlock; 558 trie->firstFreeBlock=-trie->map[newBlock>>UTRIE2_SHIFT_2]; 559 } else { 560 /* get a new block from the high end */ 561 newBlock=trie->dataLength; 562 newTop=newBlock+UTRIE2_DATA_BLOCK_LENGTH; 563 if(newTop>trie->dataCapacity) { 564 /* out of memory in the data array */ 565 int32_t capacity; 566 uint32_t *data; 567 568 if(trie->dataCapacity<UNEWTRIE2_MEDIUM_DATA_LENGTH) { 569 capacity=UNEWTRIE2_MEDIUM_DATA_LENGTH; 570 } else if(trie->dataCapacity<UNEWTRIE2_MAX_DATA_LENGTH) { 571 capacity=UNEWTRIE2_MAX_DATA_LENGTH; 572 } else { 573 /* 574 * Should never occur. 575 * Either UNEWTRIE2_MAX_DATA_LENGTH is incorrect, 576 * or the code writes more values than should be possible. 577 */ 578 return -1; 579 } 580 data = static_cast<uint32_t*>(uprv_malloc(capacity * 4)); 581 if(data==nullptr) { 582 return -1; 583 } 584 uprv_memcpy(data, trie->data, (size_t)trie->dataLength*4); 585 uprv_free(trie->data); 586 trie->data=data; 587 trie->dataCapacity=capacity; 588 } 589 trie->dataLength=newTop; 590 } 591 uprv_memcpy(trie->data+newBlock, trie->data+copyBlock, UTRIE2_DATA_BLOCK_LENGTH*4); 592 trie->map[newBlock>>UTRIE2_SHIFT_2]=0; 593 return newBlock; 594 } 595 596 /* call when the block's reference counter reaches 0 */ 597 static void 598 releaseDataBlock(UNewTrie2 *trie, int32_t block) { 599 /* put this block at the front of the free-block chain */ 600 trie->map[block>>UTRIE2_SHIFT_2]=-trie->firstFreeBlock; 601 trie->firstFreeBlock=block; 602 } 603 604 static inline UBool 605 isWritableBlock(UNewTrie2 *trie, int32_t block) { 606 return block != trie->dataNullOffset && 1 == trie->map[block >> UTRIE2_SHIFT_2]; 607 } 608 609 static inline void 610 setIndex2Entry(UNewTrie2 *trie, int32_t i2, int32_t block) { 611 int32_t oldBlock; 612 ++trie->map[block>>UTRIE2_SHIFT_2]; /* increment first, in case block==oldBlock! */ 613 oldBlock=trie->index2[i2]; 614 if(0 == --trie->map[oldBlock>>UTRIE2_SHIFT_2]) { 615 releaseDataBlock(trie, oldBlock); 616 } 617 trie->index2[i2]=block; 618 } 619 620 /** 621 * No error checking for illegal arguments. 622 * 623 * @return -1 if no new data block available (out of memory in data array) 624 * @internal 625 */ 626 static int32_t 627 getDataBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) { 628 int32_t i2, oldBlock, newBlock; 629 630 i2=getIndex2Block(trie, c, forLSCP); 631 if(i2<0) { 632 return -1; /* program error */ 633 } 634 635 i2+=(c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK; 636 oldBlock=trie->index2[i2]; 637 if(isWritableBlock(trie, oldBlock)) { 638 return oldBlock; 639 } 640 641 /* allocate a new data block */ 642 newBlock=allocDataBlock(trie, oldBlock); 643 if(newBlock<0) { 644 /* out of memory in the data array */ 645 return -1; 646 } 647 setIndex2Entry(trie, i2, newBlock); 648 return newBlock; 649 } 650 651 /** 652 * @return true if the value was successfully set 653 */ 654 static void 655 set32(UNewTrie2 *trie, 656 UChar32 c, UBool forLSCP, uint32_t value, 657 UErrorCode *pErrorCode) { 658 int32_t block; 659 660 if(trie==nullptr || trie->isCompacted) { 661 *pErrorCode=U_NO_WRITE_PERMISSION; 662 return; 663 } 664 #ifdef UCPTRIE_DEBUG 665 umutablecptrie_set(trie->t3, c, value, pErrorCode); 666 #endif 667 668 block=getDataBlock(trie, c, forLSCP); 669 if(block<0) { 670 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 671 return; 672 } 673 674 trie->data[block+(c&UTRIE2_DATA_MASK)]=value; 675 } 676 677 U_CAPI void U_EXPORT2 678 utrie2_set32(UTrie2 *trie, UChar32 c, uint32_t value, UErrorCode *pErrorCode) { 679 if(U_FAILURE(*pErrorCode)) { 680 return; 681 } 682 if((uint32_t)c>0x10ffff) { 683 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 684 return; 685 } 686 set32(trie->newTrie, c, true, value, pErrorCode); 687 } 688 689 U_CAPI void U_EXPORT2 690 utrie2_set32ForLeadSurrogateCodeUnit(UTrie2 *trie, 691 UChar32 c, uint32_t value, 692 UErrorCode *pErrorCode) { 693 if(U_FAILURE(*pErrorCode)) { 694 return; 695 } 696 if(!U_IS_LEAD(c)) { 697 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 698 return; 699 } 700 set32(trie->newTrie, c, false, value, pErrorCode); 701 } 702 703 static void 704 writeBlock(uint32_t *block, uint32_t value) { 705 uint32_t *limit=block+UTRIE2_DATA_BLOCK_LENGTH; 706 while(block<limit) { 707 *block++=value; 708 } 709 } 710 711 /** 712 * initialValue is ignored if overwrite=true 713 * @internal 714 */ 715 static void 716 fillBlock(uint32_t *block, UChar32 start, UChar32 limit, 717 uint32_t value, uint32_t initialValue, UBool overwrite) { 718 uint32_t *pLimit; 719 720 pLimit=block+limit; 721 block+=start; 722 if(overwrite) { 723 while(block<pLimit) { 724 *block++=value; 725 } 726 } else { 727 while(block<pLimit) { 728 if(*block==initialValue) { 729 *block=value; 730 } 731 ++block; 732 } 733 } 734 } 735 736 U_CAPI void U_EXPORT2 737 utrie2_setRange32(UTrie2 *trie, 738 UChar32 start, UChar32 end, 739 uint32_t value, UBool overwrite, 740 UErrorCode *pErrorCode) { 741 /* 742 * repeat value in [start..end] 743 * mark index values for repeat-data blocks by setting bit 31 of the index values 744 * fill around existing values if any, if(overwrite) 745 */ 746 UNewTrie2 *newTrie; 747 int32_t block, rest, repeatBlock; 748 UChar32 limit; 749 750 if(U_FAILURE(*pErrorCode)) { 751 return; 752 } 753 if((uint32_t)start>0x10ffff || (uint32_t)end>0x10ffff || start>end) { 754 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 755 return; 756 } 757 newTrie=trie->newTrie; 758 if(newTrie==nullptr || newTrie->isCompacted) { 759 *pErrorCode=U_NO_WRITE_PERMISSION; 760 return; 761 } 762 #ifdef UCPTRIE_DEBUG 763 umutablecptrie_setRange(newTrie->t3, start, end, value, pErrorCode); 764 #endif 765 if(!overwrite && value==newTrie->initialValue) { 766 return; /* nothing to do */ 767 } 768 769 limit=end+1; 770 if(start&UTRIE2_DATA_MASK) { 771 UChar32 nextStart; 772 773 /* set partial block at [start..following block boundary[ */ 774 block=getDataBlock(newTrie, start, true); 775 if(block<0) { 776 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 777 return; 778 } 779 780 nextStart=(start+UTRIE2_DATA_MASK)&~UTRIE2_DATA_MASK; 781 if(nextStart<=limit) { 782 fillBlock(newTrie->data+block, start&UTRIE2_DATA_MASK, UTRIE2_DATA_BLOCK_LENGTH, 783 value, newTrie->initialValue, overwrite); 784 start=nextStart; 785 } else { 786 fillBlock(newTrie->data+block, start&UTRIE2_DATA_MASK, limit&UTRIE2_DATA_MASK, 787 value, newTrie->initialValue, overwrite); 788 return; 789 } 790 } 791 792 /* number of positions in the last, partial block */ 793 rest=limit&UTRIE2_DATA_MASK; 794 795 /* round down limit to a block boundary */ 796 limit&=~UTRIE2_DATA_MASK; 797 798 /* iterate over all-value blocks */ 799 if(value==newTrie->initialValue) { 800 repeatBlock=newTrie->dataNullOffset; 801 } else { 802 repeatBlock=-1; 803 } 804 805 while(start<limit) { 806 int32_t i2; 807 UBool setRepeatBlock=false; 808 809 if(value==newTrie->initialValue && isInNullBlock(newTrie, start, true)) { 810 start+=UTRIE2_DATA_BLOCK_LENGTH; /* nothing to do */ 811 continue; 812 } 813 814 /* get index value */ 815 i2=getIndex2Block(newTrie, start, true); 816 if(i2<0) { 817 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 818 return; 819 } 820 i2+=(start>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK; 821 block=newTrie->index2[i2]; 822 if(isWritableBlock(newTrie, block)) { 823 /* already allocated */ 824 if(overwrite && block>=UNEWTRIE2_DATA_0800_OFFSET) { 825 /* 826 * We overwrite all values, and it's not a 827 * protected (ASCII-linear or 2-byte UTF-8) block: 828 * replace with the repeatBlock. 829 */ 830 setRepeatBlock=true; 831 } else { 832 /* !overwrite, or protected block: just write the values into this block */ 833 fillBlock(newTrie->data+block, 834 0, UTRIE2_DATA_BLOCK_LENGTH, 835 value, newTrie->initialValue, overwrite); 836 } 837 } else if(newTrie->data[block]!=value && (overwrite || block==newTrie->dataNullOffset)) { 838 /* 839 * Set the repeatBlock instead of the null block or previous repeat block: 840 * 841 * If !isWritableBlock() then all entries in the block have the same value 842 * because it's the null block or a range block (the repeatBlock from a previous 843 * call to utrie2_setRange32()). 844 * No other blocks are used multiple times before compacting. 845 * 846 * The null block is the only non-writable block with the initialValue because 847 * of the repeatBlock initialization above. (If value==initialValue, then 848 * the repeatBlock will be the null data block.) 849 * 850 * We set our repeatBlock if the desired value differs from the block's value, 851 * and if we overwrite any data or if the data is all initial values 852 * (which is the same as the block being the null block, see above). 853 */ 854 setRepeatBlock=true; 855 } 856 if(setRepeatBlock) { 857 if(repeatBlock>=0) { 858 setIndex2Entry(newTrie, i2, repeatBlock); 859 } else { 860 /* create and set and fill the repeatBlock */ 861 repeatBlock=getDataBlock(newTrie, start, true); 862 if(repeatBlock<0) { 863 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 864 return; 865 } 866 writeBlock(newTrie->data+repeatBlock, value); 867 } 868 } 869 870 start+=UTRIE2_DATA_BLOCK_LENGTH; 871 } 872 873 if(rest>0) { 874 /* set partial block at [last block boundary..limit[ */ 875 block=getDataBlock(newTrie, start, true); 876 if(block<0) { 877 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 878 return; 879 } 880 881 fillBlock(newTrie->data+block, 0, rest, value, newTrie->initialValue, overwrite); 882 } 883 } 884 885 /* compaction --------------------------------------------------------------- */ 886 887 static inline UBool 888 equal_int32(const int32_t *s, const int32_t *t, int32_t length) { 889 while(length>0 && *s==*t) { 890 ++s; 891 ++t; 892 --length; 893 } 894 return length == 0; 895 } 896 897 static inline UBool 898 equal_uint32(const uint32_t *s, const uint32_t *t, int32_t length) { 899 while(length>0 && *s==*t) { 900 ++s; 901 ++t; 902 --length; 903 } 904 return length == 0; 905 } 906 907 static int32_t 908 findSameIndex2Block(const int32_t *idx, int32_t index2Length, int32_t otherBlock) { 909 int32_t block; 910 911 /* ensure that we do not even partially get past index2Length */ 912 index2Length-=UTRIE2_INDEX_2_BLOCK_LENGTH; 913 914 for(block=0; block<=index2Length; ++block) { 915 if(equal_int32(idx+block, idx+otherBlock, UTRIE2_INDEX_2_BLOCK_LENGTH)) { 916 return block; 917 } 918 } 919 return -1; 920 } 921 922 static int32_t 923 findSameDataBlock(const uint32_t *data, int32_t dataLength, int32_t otherBlock, int32_t blockLength) { 924 int32_t block; 925 926 /* ensure that we do not even partially get past dataLength */ 927 dataLength-=blockLength; 928 929 for(block=0; block<=dataLength; block+=UTRIE2_DATA_GRANULARITY) { 930 if(equal_uint32(data+block, data+otherBlock, blockLength)) { 931 return block; 932 } 933 } 934 return -1; 935 } 936 937 /* 938 * Find the start of the last range in the trie by enumerating backward. 939 * Indexes for supplementary code points higher than this will be omitted. 940 */ 941 static UChar32 942 findHighStart(UNewTrie2 *trie, uint32_t highValue) { 943 const uint32_t *data32; 944 945 uint32_t value, initialValue; 946 UChar32 c, prev; 947 int32_t i1, i2, j, i2Block, prevI2Block, index2NullOffset, block, prevBlock, nullBlock; 948 949 data32=trie->data; 950 initialValue=trie->initialValue; 951 952 index2NullOffset=trie->index2NullOffset; 953 nullBlock=trie->dataNullOffset; 954 955 /* set variables for previous range */ 956 if(highValue==initialValue) { 957 prevI2Block=index2NullOffset; 958 prevBlock=nullBlock; 959 } else { 960 prevI2Block=-1; 961 prevBlock=-1; 962 } 963 prev=0x110000; 964 965 /* enumerate index-2 blocks */ 966 i1=UNEWTRIE2_INDEX_1_LENGTH; 967 c=prev; 968 while(c>0) { 969 i2Block=trie->index1[--i1]; 970 if(i2Block==prevI2Block) { 971 /* the index-2 block is the same as the previous one, and filled with highValue */ 972 c-=UTRIE2_CP_PER_INDEX_1_ENTRY; 973 continue; 974 } 975 prevI2Block=i2Block; 976 if(i2Block==index2NullOffset) { 977 /* this is the null index-2 block */ 978 if(highValue!=initialValue) { 979 return c; 980 } 981 c-=UTRIE2_CP_PER_INDEX_1_ENTRY; 982 } else { 983 /* enumerate data blocks for one index-2 block */ 984 for(i2=UTRIE2_INDEX_2_BLOCK_LENGTH; i2>0;) { 985 block=trie->index2[i2Block+ --i2]; 986 if(block==prevBlock) { 987 /* the block is the same as the previous one, and filled with highValue */ 988 c-=UTRIE2_DATA_BLOCK_LENGTH; 989 continue; 990 } 991 prevBlock=block; 992 if(block==nullBlock) { 993 /* this is the null data block */ 994 if(highValue!=initialValue) { 995 return c; 996 } 997 c-=UTRIE2_DATA_BLOCK_LENGTH; 998 } else { 999 for(j=UTRIE2_DATA_BLOCK_LENGTH; j>0;) { 1000 value=data32[block+ --j]; 1001 if(value!=highValue) { 1002 return c; 1003 } 1004 --c; 1005 } 1006 } 1007 } 1008 } 1009 } 1010 1011 /* deliver last range */ 1012 return 0; 1013 } 1014 1015 /* 1016 * Compact a build-time trie. 1017 * 1018 * The compaction 1019 * - removes blocks that are identical with earlier ones 1020 * - overlaps adjacent blocks as much as possible (if overlap==true) 1021 * - moves blocks in steps of the data granularity 1022 * - moves and overlaps blocks that overlap with multiple values in the overlap region 1023 * 1024 * It does not 1025 * - try to move and overlap blocks that are not already adjacent 1026 */ 1027 static void 1028 compactData(UNewTrie2 *trie) { 1029 #ifdef UTRIE2_DEBUG 1030 int32_t countSame=0, sumOverlaps=0; 1031 #endif 1032 1033 int32_t start, newStart, movedStart; 1034 int32_t blockLength, overlap; 1035 int32_t i, mapIndex, blockCount; 1036 1037 /* do not compact linear-ASCII data */ 1038 newStart=UTRIE2_DATA_START_OFFSET; 1039 for(start=0, i=0; start<newStart; start+=UTRIE2_DATA_BLOCK_LENGTH, ++i) { 1040 trie->map[i]=start; 1041 } 1042 1043 /* 1044 * Start with a block length of 64 for 2-byte UTF-8, 1045 * then switch to UTRIE2_DATA_BLOCK_LENGTH. 1046 */ 1047 blockLength=64; 1048 blockCount=blockLength>>UTRIE2_SHIFT_2; 1049 for(start=newStart; start<trie->dataLength;) { 1050 /* 1051 * start: index of first entry of current block 1052 * newStart: index where the current block is to be moved 1053 * (right after current end of already-compacted data) 1054 */ 1055 if(start==UNEWTRIE2_DATA_0800_OFFSET) { 1056 blockLength=UTRIE2_DATA_BLOCK_LENGTH; 1057 blockCount=1; 1058 } 1059 1060 /* skip blocks that are not used */ 1061 if(trie->map[start>>UTRIE2_SHIFT_2]<=0) { 1062 /* advance start to the next block */ 1063 start+=blockLength; 1064 1065 /* leave newStart with the previous block! */ 1066 continue; 1067 } 1068 1069 /* search for an identical block */ 1070 if( (movedStart=findSameDataBlock(trie->data, newStart, start, blockLength)) 1071 >=0 1072 ) { 1073 #ifdef UTRIE2_DEBUG 1074 ++countSame; 1075 #endif 1076 /* found an identical block, set the other block's index value for the current block */ 1077 for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) { 1078 trie->map[mapIndex++]=movedStart; 1079 movedStart+=UTRIE2_DATA_BLOCK_LENGTH; 1080 } 1081 1082 /* advance start to the next block */ 1083 start+=blockLength; 1084 1085 /* leave newStart with the previous block! */ 1086 continue; 1087 } 1088 1089 /* see if the beginning of this block can be overlapped with the end of the previous block */ 1090 /* look for maximum overlap (modulo granularity) with the previous, adjacent block */ 1091 for(overlap=blockLength-UTRIE2_DATA_GRANULARITY; 1092 overlap>0 && !equal_uint32(trie->data+(newStart-overlap), trie->data+start, overlap); 1093 overlap-=UTRIE2_DATA_GRANULARITY) {} 1094 1095 #ifdef UTRIE2_DEBUG 1096 sumOverlaps+=overlap; 1097 #endif 1098 if(overlap>0 || newStart<start) { 1099 /* some overlap, or just move the whole block */ 1100 movedStart=newStart-overlap; 1101 for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) { 1102 trie->map[mapIndex++]=movedStart; 1103 movedStart+=UTRIE2_DATA_BLOCK_LENGTH; 1104 } 1105 1106 /* move the non-overlapping indexes to their new positions */ 1107 start+=overlap; 1108 for(i=blockLength-overlap; i>0; --i) { 1109 trie->data[newStart++]=trie->data[start++]; 1110 } 1111 } else /* no overlap && newStart==start */ { 1112 for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) { 1113 trie->map[mapIndex++]=start; 1114 start+=UTRIE2_DATA_BLOCK_LENGTH; 1115 } 1116 newStart=start; 1117 } 1118 } 1119 1120 /* now adjust the index-2 table */ 1121 for(i=0; i<trie->index2Length; ++i) { 1122 if(i==UNEWTRIE2_INDEX_GAP_OFFSET) { 1123 /* Gap indexes are invalid (-1). Skip over the gap. */ 1124 i+=UNEWTRIE2_INDEX_GAP_LENGTH; 1125 } 1126 trie->index2[i]=trie->map[trie->index2[i]>>UTRIE2_SHIFT_2]; 1127 } 1128 trie->dataNullOffset=trie->map[trie->dataNullOffset>>UTRIE2_SHIFT_2]; 1129 1130 /* ensure dataLength alignment */ 1131 while((newStart&(UTRIE2_DATA_GRANULARITY-1))!=0) { 1132 trie->data[newStart++]=trie->initialValue; 1133 } 1134 1135 #ifdef UTRIE2_DEBUG 1136 /* we saved some space */ 1137 printf("compacting UTrie2: count of 32-bit data words %lu->%lu countSame=%ld sumOverlaps=%ld\n", 1138 (long)trie->dataLength, (long)newStart, (long)countSame, (long)sumOverlaps); 1139 #endif 1140 1141 trie->dataLength=newStart; 1142 } 1143 1144 static void 1145 compactIndex2(UNewTrie2 *trie) { 1146 int32_t i, start, newStart, movedStart, overlap; 1147 1148 /* do not compact linear-BMP index-2 blocks */ 1149 newStart=UTRIE2_INDEX_2_BMP_LENGTH; 1150 for(start=0, i=0; start<newStart; start+=UTRIE2_INDEX_2_BLOCK_LENGTH, ++i) { 1151 trie->map[i]=start; 1152 } 1153 1154 /* Reduce the index table gap to what will be needed at runtime. */ 1155 newStart+=UTRIE2_UTF8_2B_INDEX_2_LENGTH+((trie->highStart-0x10000)>>UTRIE2_SHIFT_1); 1156 1157 for(start=UNEWTRIE2_INDEX_2_NULL_OFFSET; start<trie->index2Length;) { 1158 /* 1159 * start: index of first entry of current block 1160 * newStart: index where the current block is to be moved 1161 * (right after current end of already-compacted data) 1162 */ 1163 1164 /* search for an identical block */ 1165 if( (movedStart=findSameIndex2Block(trie->index2, newStart, start)) 1166 >=0 1167 ) { 1168 /* found an identical block, set the other block's index value for the current block */ 1169 trie->map[start>>UTRIE2_SHIFT_1_2]=movedStart; 1170 1171 /* advance start to the next block */ 1172 start+=UTRIE2_INDEX_2_BLOCK_LENGTH; 1173 1174 /* leave newStart with the previous block! */ 1175 continue; 1176 } 1177 1178 /* see if the beginning of this block can be overlapped with the end of the previous block */ 1179 /* look for maximum overlap with the previous, adjacent block */ 1180 for(overlap=UTRIE2_INDEX_2_BLOCK_LENGTH-1; 1181 overlap>0 && !equal_int32(trie->index2+(newStart-overlap), trie->index2+start, overlap); 1182 --overlap) {} 1183 1184 if(overlap>0 || newStart<start) { 1185 /* some overlap, or just move the whole block */ 1186 trie->map[start>>UTRIE2_SHIFT_1_2]=newStart-overlap; 1187 1188 /* move the non-overlapping indexes to their new positions */ 1189 start+=overlap; 1190 for(i=UTRIE2_INDEX_2_BLOCK_LENGTH-overlap; i>0; --i) { 1191 trie->index2[newStart++]=trie->index2[start++]; 1192 } 1193 } else /* no overlap && newStart==start */ { 1194 trie->map[start>>UTRIE2_SHIFT_1_2]=start; 1195 start+=UTRIE2_INDEX_2_BLOCK_LENGTH; 1196 newStart=start; 1197 } 1198 } 1199 1200 /* now adjust the index-1 table */ 1201 for(i=0; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) { 1202 trie->index1[i]=trie->map[trie->index1[i]>>UTRIE2_SHIFT_1_2]; 1203 } 1204 trie->index2NullOffset=trie->map[trie->index2NullOffset>>UTRIE2_SHIFT_1_2]; 1205 1206 /* 1207 * Ensure data table alignment: 1208 * Needs to be granularity-aligned for 16-bit trie 1209 * (so that dataMove will be down-shiftable), 1210 * and 2-aligned for uint32_t data. 1211 */ 1212 while((newStart&((UTRIE2_DATA_GRANULARITY-1)|1))!=0) { 1213 /* Arbitrary value: 0x3fffc not possible for real data. */ 1214 trie->index2[newStart++] = static_cast<int32_t>(0xffff) << UTRIE2_INDEX_SHIFT; 1215 } 1216 1217 #ifdef UTRIE2_DEBUG 1218 /* we saved some space */ 1219 printf("compacting UTrie2: count of 16-bit index words %lu->%lu\n", 1220 (long)trie->index2Length, (long)newStart); 1221 #endif 1222 1223 trie->index2Length=newStart; 1224 } 1225 1226 static void 1227 compactTrie(UTrie2 *trie, UErrorCode *pErrorCode) { 1228 UNewTrie2 *newTrie; 1229 UChar32 highStart, suppHighStart; 1230 uint32_t highValue; 1231 1232 newTrie=trie->newTrie; 1233 1234 /* find highStart and round it up */ 1235 highValue=utrie2_get32(trie, 0x10ffff); 1236 highStart=findHighStart(newTrie, highValue); 1237 highStart=(highStart+(UTRIE2_CP_PER_INDEX_1_ENTRY-1))&~(UTRIE2_CP_PER_INDEX_1_ENTRY-1); 1238 if(highStart==0x110000) { 1239 highValue=trie->errorValue; 1240 } 1241 1242 /* 1243 * Set trie->highStart only after utrie2_get32(trie, highStart). 1244 * Otherwise utrie2_get32(trie, highStart) would try to read the highValue. 1245 */ 1246 trie->highStart=newTrie->highStart=highStart; 1247 1248 #ifdef UTRIE2_DEBUG 1249 printf("UTrie2: highStart U+%06lx highValue 0x%lx initialValue 0x%lx\n", 1250 (long)highStart, (long)highValue, (long)trie->initialValue); 1251 #endif 1252 1253 if(highStart<0x110000) { 1254 /* Blank out [highStart..10ffff] to release associated data blocks. */ 1255 suppHighStart= highStart<=0x10000 ? 0x10000 : highStart; 1256 utrie2_setRange32(trie, suppHighStart, 0x10ffff, trie->initialValue, true, pErrorCode); 1257 if(U_FAILURE(*pErrorCode)) { 1258 return; 1259 } 1260 } 1261 1262 compactData(newTrie); 1263 if(highStart>0x10000) { 1264 compactIndex2(newTrie); 1265 #ifdef UTRIE2_DEBUG 1266 } else { 1267 printf("UTrie2: highStart U+%04lx count of 16-bit index words %lu->%lu\n", 1268 (long)highStart, (long)trie->newTrie->index2Length, (long)UTRIE2_INDEX_1_OFFSET); 1269 #endif 1270 } 1271 1272 /* 1273 * Store the highValue in the data array and round up the dataLength. 1274 * Must be done after compactData() because that assumes that dataLength 1275 * is a multiple of UTRIE2_DATA_BLOCK_LENGTH. 1276 */ 1277 newTrie->data[newTrie->dataLength++]=highValue; 1278 while((newTrie->dataLength&(UTRIE2_DATA_GRANULARITY-1))!=0) { 1279 newTrie->data[newTrie->dataLength++]=trie->initialValue; 1280 } 1281 1282 newTrie->isCompacted=true; 1283 } 1284 1285 /* serialization ------------------------------------------------------------ */ 1286 1287 /** 1288 * Maximum length of the runtime index array. 1289 * Limited by its own 16-bit index values, and by uint16_t UTrie2Header.indexLength. 1290 * (The actual maximum length is lower, 1291 * (0x110000>>UTRIE2_SHIFT_2)+UTRIE2_UTF8_2B_INDEX_2_LENGTH+UTRIE2_MAX_INDEX_1_LENGTH.) 1292 */ 1293 #define UTRIE2_MAX_INDEX_LENGTH 0xffff 1294 1295 /** 1296 * Maximum length of the runtime data array. 1297 * Limited by 16-bit index values that are left-shifted by UTRIE2_INDEX_SHIFT, 1298 * and by uint16_t UTrie2Header.shiftedDataLength. 1299 */ 1300 #define UTRIE2_MAX_DATA_LENGTH (0xffff<<UTRIE2_INDEX_SHIFT) 1301 1302 /* Compact and internally serialize the trie. */ 1303 U_CAPI void U_EXPORT2 1304 utrie2_freeze(UTrie2 *trie, UTrie2ValueBits valueBits, UErrorCode *pErrorCode) { 1305 UNewTrie2 *newTrie; 1306 UTrie2Header *header; 1307 uint32_t *p; 1308 uint16_t *dest16; 1309 int32_t i, length; 1310 int32_t allIndexesLength; 1311 int32_t dataMove; /* >0 if the data is moved to the end of the index array */ 1312 UChar32 highStart; 1313 1314 /* argument check */ 1315 if(U_FAILURE(*pErrorCode)) { 1316 return; 1317 } 1318 if( trie==nullptr || 1319 valueBits<0 || UTRIE2_COUNT_VALUE_BITS<=valueBits 1320 ) { 1321 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1322 return; 1323 } 1324 newTrie=trie->newTrie; 1325 if(newTrie==nullptr) { 1326 /* already frozen */ 1327 UTrie2ValueBits frozenValueBits= 1328 trie->data16!=nullptr ? UTRIE2_16_VALUE_BITS : UTRIE2_32_VALUE_BITS; 1329 if(valueBits!=frozenValueBits) { 1330 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1331 } 1332 return; 1333 } 1334 1335 /* compact if necessary */ 1336 if(!newTrie->isCompacted) { 1337 compactTrie(trie, pErrorCode); 1338 if(U_FAILURE(*pErrorCode)) { 1339 return; 1340 } 1341 } 1342 highStart=trie->highStart; 1343 1344 if(highStart<=0x10000) { 1345 allIndexesLength=UTRIE2_INDEX_1_OFFSET; 1346 } else { 1347 allIndexesLength=newTrie->index2Length; 1348 } 1349 if(valueBits==UTRIE2_16_VALUE_BITS) { 1350 dataMove=allIndexesLength; 1351 } else { 1352 dataMove=0; 1353 } 1354 1355 /* are indexLength and dataLength within limits? */ 1356 if( /* for unshifted indexLength */ 1357 allIndexesLength>UTRIE2_MAX_INDEX_LENGTH || 1358 /* for unshifted dataNullOffset */ 1359 (dataMove+newTrie->dataNullOffset)>0xffff || 1360 /* for unshifted 2-byte UTF-8 index-2 values */ 1361 (dataMove+UNEWTRIE2_DATA_0800_OFFSET)>0xffff || 1362 /* for shiftedDataLength */ 1363 (dataMove+newTrie->dataLength)>UTRIE2_MAX_DATA_LENGTH 1364 ) { 1365 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 1366 return; 1367 } 1368 1369 /* calculate the total serialized length */ 1370 length=sizeof(UTrie2Header)+allIndexesLength*2; 1371 if(valueBits==UTRIE2_16_VALUE_BITS) { 1372 length+=newTrie->dataLength*2; 1373 } else { 1374 length+=newTrie->dataLength*4; 1375 } 1376 1377 trie->memory=uprv_malloc(length); 1378 if(trie->memory==nullptr) { 1379 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 1380 return; 1381 } 1382 trie->length=length; 1383 trie->isMemoryOwned=true; 1384 1385 trie->indexLength=allIndexesLength; 1386 trie->dataLength=newTrie->dataLength; 1387 if(highStart<=0x10000) { 1388 trie->index2NullOffset=0xffff; 1389 } else { 1390 trie->index2NullOffset=static_cast<uint16_t>(UTRIE2_INDEX_2_OFFSET+newTrie->index2NullOffset); 1391 } 1392 trie->dataNullOffset=(uint16_t)(dataMove+newTrie->dataNullOffset); 1393 trie->highValueIndex=dataMove+trie->dataLength-UTRIE2_DATA_GRANULARITY; 1394 1395 /* set the header fields */ 1396 header=(UTrie2Header *)trie->memory; 1397 1398 header->signature=UTRIE2_SIG; /* "Tri2" */ 1399 header->options=(uint16_t)valueBits; 1400 1401 header->indexLength=(uint16_t)trie->indexLength; 1402 header->shiftedDataLength=(uint16_t)(trie->dataLength>>UTRIE2_INDEX_SHIFT); 1403 header->index2NullOffset=trie->index2NullOffset; 1404 header->dataNullOffset=trie->dataNullOffset; 1405 header->shiftedHighStart=(uint16_t)(highStart>>UTRIE2_SHIFT_1); 1406 1407 /* fill the index and data arrays */ 1408 dest16=(uint16_t *)(header+1); 1409 trie->index=dest16; 1410 1411 /* write the index-2 array values shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove */ 1412 p=(uint32_t *)newTrie->index2; 1413 for(i=UTRIE2_INDEX_2_BMP_LENGTH; i>0; --i) { 1414 *dest16++=(uint16_t)((dataMove + *p++)>>UTRIE2_INDEX_SHIFT); 1415 } 1416 1417 /* write UTF-8 2-byte index-2 values, not right-shifted */ 1418 for(i=0; i<(0xc2-0xc0); ++i) { /* C0..C1 */ 1419 *dest16++=(uint16_t)(dataMove+UTRIE2_BAD_UTF8_DATA_OFFSET); 1420 } 1421 for(; i<(0xe0-0xc0); ++i) { /* C2..DF */ 1422 *dest16++=(uint16_t)(dataMove+newTrie->index2[i<<(6-UTRIE2_SHIFT_2)]); 1423 } 1424 1425 if(highStart>0x10000) { 1426 int32_t index1Length=(highStart-0x10000)>>UTRIE2_SHIFT_1; 1427 int32_t index2Offset=UTRIE2_INDEX_2_BMP_LENGTH+UTRIE2_UTF8_2B_INDEX_2_LENGTH+index1Length; 1428 1429 /* write 16-bit index-1 values for supplementary code points */ 1430 p=(uint32_t *)newTrie->index1+UTRIE2_OMITTED_BMP_INDEX_1_LENGTH; 1431 for(i=index1Length; i>0; --i) { 1432 *dest16++=(uint16_t)(UTRIE2_INDEX_2_OFFSET + *p++); 1433 } 1434 1435 /* 1436 * write the index-2 array values for supplementary code points, 1437 * shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove 1438 */ 1439 p=(uint32_t *)newTrie->index2+index2Offset; 1440 for(i=newTrie->index2Length-index2Offset; i>0; --i) { 1441 *dest16++=(uint16_t)((dataMove + *p++)>>UTRIE2_INDEX_SHIFT); 1442 } 1443 } 1444 1445 /* write the 16/32-bit data array */ 1446 switch(valueBits) { 1447 case UTRIE2_16_VALUE_BITS: 1448 /* write 16-bit data values */ 1449 trie->data16=dest16; 1450 trie->data32=nullptr; 1451 p=newTrie->data; 1452 for(i=newTrie->dataLength; i>0; --i) { 1453 *dest16++=(uint16_t)*p++; 1454 } 1455 break; 1456 case UTRIE2_32_VALUE_BITS: 1457 /* write 32-bit data values */ 1458 trie->data16=nullptr; 1459 trie->data32=(uint32_t *)dest16; 1460 uprv_memcpy(dest16, newTrie->data, (size_t)newTrie->dataLength*4); 1461 break; 1462 default: 1463 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1464 return; 1465 } 1466 1467 #ifdef UTRIE2_DEBUG 1468 utrie2_printLengths(trie, ""); 1469 #endif 1470 1471 #ifdef UCPTRIE_DEBUG 1472 umutablecptrie_setName(newTrie->t3, trie->name); 1473 ucptrie_close( 1474 umutablecptrie_buildImmutable( 1475 newTrie->t3, UCPTRIE_TYPE_FAST, (UCPTrieValueWidth)valueBits, pErrorCode)); 1476 #endif 1477 /* Delete the UNewTrie2. */ 1478 uprv_free(newTrie->data); 1479 uprv_free(newTrie); 1480 trie->newTrie=nullptr; 1481 }