n2builder.cpp (43661B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2009-2016, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: n2builder.cpp 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2009nov25 16 * created by: Markus W. Scherer 17 * 18 * Builds Normalizer2 data and writes a binary .nrm file. 19 * For the file format see source/common/normalizer2impl.h. 20 */ 21 22 #include "unicode/utypes.h" 23 #include "n2builder.h" 24 25 #include <stdio.h> 26 #include <stdlib.h> 27 #include <string.h> 28 #include <vector> 29 #include "unicode/errorcode.h" 30 #include "unicode/localpointer.h" 31 #include "unicode/putil.h" 32 #include "unicode/ucptrie.h" 33 #include "unicode/udata.h" 34 #include "unicode/umutablecptrie.h" 35 #include "unicode/uniset.h" 36 #include "unicode/unistr.h" 37 #include "unicode/usetiter.h" 38 #include "unicode/ustring.h" 39 #include "charstr.h" 40 #include "extradata.h" 41 #include "hash.h" 42 #include "normalizer2impl.h" 43 #include "norms.h" 44 #include "toolutil.h" 45 #include "unewdata.h" 46 #include "uvectr32.h" 47 #include "writesrc.h" 48 49 #if !UCONFIG_NO_NORMALIZATION 50 51 /* UDataInfo cf. udata.h */ 52 static UDataInfo dataInfo={ 53 sizeof(UDataInfo), 54 0, 55 56 U_IS_BIG_ENDIAN, 57 U_CHARSET_FAMILY, 58 U_SIZEOF_UCHAR, 59 0, 60 61 { 0x4e, 0x72, 0x6d, 0x32 }, /* dataFormat="Nrm2" */ 62 { 5, 0, 0, 0 }, /* formatVersion */ 63 { 16, 0, 0, 0 } /* dataVersion (Unicode version) */ 64 }; 65 66 U_NAMESPACE_BEGIN 67 68 class HangulIterator { 69 public: 70 struct Range { 71 UChar32 start, end; 72 }; 73 74 HangulIterator() : rangeIndex(0) {} 75 const Range *nextRange() { 76 if(rangeIndex<UPRV_LENGTHOF(ranges)) { 77 return ranges+rangeIndex++; 78 } else { 79 return nullptr; 80 } 81 } 82 private: 83 static const Range ranges[4]; 84 int32_t rangeIndex; 85 }; 86 87 const HangulIterator::Range HangulIterator::ranges[4]={ 88 { Hangul::JAMO_L_BASE, Hangul::JAMO_L_END }, 89 { Hangul::JAMO_V_BASE, Hangul::JAMO_V_END }, 90 // JAMO_T_BASE+1: not U+11A7 91 { Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_END }, 92 { Hangul::HANGUL_BASE, Hangul::HANGUL_END }, 93 }; 94 95 Normalizer2DataBuilder::Normalizer2DataBuilder(UErrorCode &errorCode) : 96 norms(errorCode), 97 phase(0), overrideHandling(OVERRIDE_PREVIOUS), optimization(OPTIMIZE_NORMAL), 98 norm16TrieBytes(nullptr), norm16TrieLength(0) { 99 memset(unicodeVersion, 0, sizeof(unicodeVersion)); 100 memset(indexes, 0, sizeof(indexes)); 101 memset(smallFCD, 0, sizeof(smallFCD)); 102 } 103 104 Normalizer2DataBuilder::~Normalizer2DataBuilder() { 105 delete[] norm16TrieBytes; 106 } 107 108 void 109 Normalizer2DataBuilder::setUnicodeVersion(const char *v) { 110 UVersionInfo nullVersion={ 0, 0, 0, 0 }; 111 UVersionInfo version; 112 u_versionFromString(version, v); 113 if( 0!=memcmp(version, unicodeVersion, U_MAX_VERSION_LENGTH) && 114 0!=memcmp(nullVersion, unicodeVersion, U_MAX_VERSION_LENGTH) 115 ) { 116 char buffer[U_MAX_VERSION_STRING_LENGTH]; 117 u_versionToString(unicodeVersion, buffer); 118 fprintf(stderr, "gennorm2 error: multiple inconsistent Unicode version numbers %s vs. %s\n", 119 buffer, v); 120 exit(U_ILLEGAL_ARGUMENT_ERROR); 121 } 122 memcpy(unicodeVersion, version, U_MAX_VERSION_LENGTH); 123 } 124 125 Norm *Normalizer2DataBuilder::checkNormForMapping(Norm *p, UChar32 c) { 126 if(p!=nullptr) { 127 if(p->mappingType!=Norm::NONE) { 128 if( overrideHandling==OVERRIDE_NONE || 129 (overrideHandling==OVERRIDE_PREVIOUS && p->mappingPhase==phase) 130 ) { 131 fprintf(stderr, 132 "error in gennorm2 phase %d: " 133 "not permitted to override mapping for U+%04lX from phase %d\n", 134 static_cast<int>(phase), static_cast<long>(c), static_cast<int>(p->mappingPhase)); 135 exit(U_INVALID_FORMAT_ERROR); 136 } 137 delete p->mapping; 138 p->mapping=nullptr; 139 } 140 p->mappingPhase=phase; 141 } 142 return p; 143 } 144 145 void Normalizer2DataBuilder::setOverrideHandling(OverrideHandling oh) { 146 overrideHandling=oh; 147 ++phase; 148 } 149 150 void Normalizer2DataBuilder::setCC(UChar32 c, uint8_t cc) { 151 norms.createNorm(c)->cc=cc; 152 norms.ccSet.add(c); 153 } 154 155 static UBool isWellFormed(const UnicodeString &s) { 156 UErrorCode errorCode=U_ZERO_ERROR; 157 u_strToUTF8(nullptr, 0, nullptr, toUCharPtr(s.getBuffer()), s.length(), &errorCode); 158 return U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR; 159 } 160 161 void Normalizer2DataBuilder::setOneWayMapping(UChar32 c, const UnicodeString &m) { 162 if(!isWellFormed(m)) { 163 fprintf(stderr, 164 "error in gennorm2 phase %d: " 165 "illegal one-way mapping from U+%04lX to malformed string\n", 166 static_cast<int>(phase), static_cast<long>(c)); 167 exit(U_INVALID_FORMAT_ERROR); 168 } 169 Norm *p=checkNormForMapping(norms.createNorm(c), c); 170 p->mapping=new UnicodeString(m); 171 p->mappingType=Norm::ONE_WAY; 172 p->setMappingCP(); 173 norms.mappingSet.add(c); 174 } 175 176 void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString &m) { 177 if(U_IS_SURROGATE(c)) { 178 fprintf(stderr, 179 "error in gennorm2 phase %d: " 180 "illegal round-trip mapping from surrogate code point U+%04lX\n", 181 static_cast<int>(phase), static_cast<long>(c)); 182 exit(U_INVALID_FORMAT_ERROR); 183 } 184 if(!isWellFormed(m)) { 185 fprintf(stderr, 186 "error in gennorm2 phase %d: " 187 "illegal round-trip mapping from U+%04lX to malformed string\n", 188 static_cast<int>(phase), static_cast<long>(c)); 189 exit(U_INVALID_FORMAT_ERROR); 190 } 191 int32_t numCP=u_countChar32(toUCharPtr(m.getBuffer()), m.length()); 192 if(numCP!=2) { 193 fprintf(stderr, 194 "error in gennorm2 phase %d: " 195 "illegal round-trip mapping from U+%04lX to %d!=2 code points\n", 196 static_cast<int>(phase), static_cast<long>(c), static_cast<int>(numCP)); 197 exit(U_INVALID_FORMAT_ERROR); 198 } 199 Norm *p=checkNormForMapping(norms.createNorm(c), c); 200 p->mapping=new UnicodeString(m); 201 p->mappingType=Norm::ROUND_TRIP; 202 p->mappingCP=U_SENTINEL; 203 norms.mappingSet.add(c); 204 } 205 206 void Normalizer2DataBuilder::removeMapping(UChar32 c) { 207 // createNorm(c), not getNorm(c), to record a non-mapping and detect conflicting data. 208 Norm *p=checkNormForMapping(norms.createNorm(c), c); 209 p->mappingType=Norm::REMOVED; 210 norms.mappingSet.add(c); 211 } 212 213 UBool Normalizer2DataBuilder::mappingHasCompBoundaryAfter(const BuilderReorderingBuffer &buffer, 214 Norm::MappingType mappingType) const { 215 if(buffer.isEmpty()) { 216 return false; // Maps-to-empty-string is no boundary of any kind. 217 } 218 int32_t lastStarterIndex=buffer.lastStarterIndex(); 219 if(lastStarterIndex<0) { 220 return false; // no starter 221 } 222 const int32_t lastIndex=buffer.length()-1; 223 if(mappingType==Norm::ONE_WAY && lastStarterIndex<lastIndex && buffer.ccAt(lastIndex)>1) { 224 // One-way mapping where after the last starter is at least one combining mark 225 // with a combining class greater than 1, 226 // which means that another combining mark can reorder before it. 227 // By contrast, in a round-trip mapping this does not prevent a boundary as long as 228 // the starter or composite does not combine-forward with a following combining mark. 229 return false; 230 } 231 UChar32 starter=buffer.charAt(lastStarterIndex); 232 if(lastStarterIndex==0 && norms.combinesBack(starter)) { 233 // The last starter is at the beginning of the mapping and combines backward. 234 return false; 235 } 236 if(Hangul::isJamoL(starter) || 237 (Hangul::isJamoV(starter) && 238 0<lastStarterIndex && Hangul::isJamoL(buffer.charAt(lastStarterIndex-1)))) { 239 // A Jamo leading consonant or an LV pair combines-forward if it is at the end, 240 // otherwise it is blocked. 241 return lastStarterIndex!=lastIndex; 242 } 243 // Note: There can be no Hangul syllable in the fully decomposed mapping. 244 245 // Multiple starters can combine into one. 246 // Look for the first of the last sequence of starters, excluding Jamos. 247 int32_t i=lastStarterIndex; 248 UChar32 c; 249 while(0<i && buffer.ccAt(i-1)==0 && !Hangul::isJamo(c=buffer.charAt(i-1))) { 250 starter=c; 251 --i; 252 } 253 // Compose as far as possible, and see if further compositions with 254 // characters following this mapping are possible. 255 const Norm *starterNorm=norms.getNorm(starter); 256 if(i==lastStarterIndex && 257 (starterNorm==nullptr || !starterNorm->combinesFwd())) { 258 return true; // The last starter does not combine forward. 259 } 260 uint8_t prevCC=0; 261 while(++i<buffer.length()) { 262 uint8_t cc=buffer.ccAt(i); // !=0 if after last starter 263 if(i>lastStarterIndex && norms.combinesWithCCBetween(*starterNorm, prevCC, cc)) { 264 // The starter combines with a mark that reorders before the current one. 265 return false; 266 } 267 UChar32 c=buffer.charAt(i); 268 if(starterNorm!=nullptr && (prevCC<cc || prevCC==0) && 269 norms.getNormRef(c).combinesBack && (starter=starterNorm->combine(c))>=0) { 270 // The starter combines with c into a composite replacement starter. 271 starterNorm=norms.getNorm(starter); 272 if(i>=lastStarterIndex && 273 (starterNorm==nullptr || !starterNorm->combinesFwd())) { 274 return true; // The composite does not combine further. 275 } 276 // Keep prevCC because we "removed" the combining mark. 277 } else if(cc==0) { 278 starterNorm=norms.getNorm(c); 279 if(i==lastStarterIndex && 280 (starterNorm==nullptr || !starterNorm->combinesFwd())) { 281 return true; // The new starter does not combine forward. 282 } 283 prevCC=0; 284 } else { 285 prevCC=cc; 286 } 287 } 288 if(prevCC==0) { 289 return false; // forward-combining starter at the very end 290 } 291 if(norms.combinesWithCCBetween(*starterNorm, prevCC, 256)) { 292 // The starter combines with another mark. 293 return false; 294 } 295 return true; 296 } 297 298 UBool Normalizer2DataBuilder::mappingRecomposes(const BuilderReorderingBuffer &buffer) const { 299 if(buffer.lastStarterIndex()<0) { 300 return false; // no starter 301 } 302 const Norm *starterNorm=nullptr; 303 uint8_t prevCC=0; 304 for(int32_t i=0; i<buffer.length(); ++i) { 305 UChar32 c=buffer.charAt(i); 306 uint8_t cc=buffer.ccAt(i); 307 if(starterNorm!=nullptr && (prevCC<cc || prevCC==0) && 308 norms.getNormRef(c).combinesBack && starterNorm->combine(c)>=0) { 309 return true; // normal composite 310 } else if(cc==0) { 311 if(Hangul::isJamoL(c)) { 312 if((i+1)<buffer.length() && Hangul::isJamoV(buffer.charAt(i+1))) { 313 return true; // Hangul syllable 314 } 315 starterNorm=nullptr; 316 } else { 317 starterNorm=norms.getNorm(c); 318 } 319 } 320 prevCC=cc; 321 } 322 return false; 323 } 324 325 void Normalizer2DataBuilder::postProcess(Norm &norm) { 326 // Prerequisites: Compositions are built, mappings are recursively decomposed. 327 // Mappings are not yet in canonical order. 328 // 329 // This function works on a Norm struct. We do not know which code point(s) map(s) to it. 330 // Therefore, we cannot compute algorithmic mapping deltas here. 331 // Error conditions are checked, but printed later when we do know the offending code point. 332 if(norm.hasMapping()) { 333 if(norm.mapping->length()>Normalizer2Impl::MAPPING_LENGTH_MASK) { 334 norm.error="mapping longer than maximum of 31"; 335 return; 336 } 337 // Ensure canonical order. 338 BuilderReorderingBuffer buffer; 339 if(norm.rawMapping!=nullptr) { 340 norms.reorder(*norm.rawMapping, buffer); 341 buffer.reset(); 342 } 343 norms.reorder(*norm.mapping, buffer); 344 if(buffer.isEmpty()) { 345 // A character that is deleted (maps to an empty string) must 346 // get the worst-case lccc and tccc values because arbitrary 347 // characters on both sides will become adjacent. 348 norm.leadCC=1; 349 norm.trailCC=0xff; 350 } else { 351 norm.leadCC=buffer.ccAt(0); 352 norm.trailCC=buffer.ccAt(buffer.length()-1); 353 } 354 355 norm.hasCompBoundaryBefore= 356 !buffer.isEmpty() && norm.leadCC==0 && !norms.combinesBack(buffer.charAt(0)); 357 // No comp-boundary-after when norm.combinesBack: 358 // MaybeNo character whose first mapping character may combine-back, 359 // in which case we would not recompose to this character, 360 // and may need more context. 361 norm.hasCompBoundaryAfter= 362 !norm.combinesBack && !norm.combinesFwd() && 363 mappingHasCompBoundaryAfter(buffer, norm.mappingType); 364 365 if(norm.combinesBack) { 366 if(norm.mappingType!=Norm::ROUND_TRIP) { 367 // One-way mappings don't get NFC_QC=Maybe, and 368 // should not have gotten combinesBack set. 369 norm.error="combines-back and has a one-way mapping, " 370 "not possible in Unicode normalization"; 371 } else if(norm.combinesFwd()) { 372 // Earlier code checked ccc=0. 373 norm.type=Norm::MAYBE_NO_COMBINES_FWD; 374 } else if(norm.cc==0) { 375 norm.type=Norm::MAYBE_NO_MAPPING_ONLY; 376 } else { 377 norm.error="combines-back and decomposes with ccc!=0, " 378 "not possible in Unicode normalization"; 379 // ... because we don't reorder again after composition. 380 } 381 } else if(norm.mappingType==Norm::ROUND_TRIP) { 382 if(norm.combinesFwd()) { 383 norm.type=Norm::YES_NO_COMBINES_FWD; 384 } else { 385 norm.type=Norm::YES_NO_MAPPING_ONLY; 386 } 387 } else { // one-way mapping 388 if(norm.combinesFwd()) { 389 norm.error="combines-forward and has a one-way mapping, " 390 "not possible in Unicode normalization"; 391 } else if(buffer.isEmpty()) { 392 norm.type=Norm::NO_NO_EMPTY; 393 } else if(!norm.hasCompBoundaryBefore) { 394 norm.type=Norm::NO_NO_COMP_NO_MAYBE_CC; 395 } else if(mappingRecomposes(buffer)) { 396 norm.type=Norm::NO_NO_COMP_BOUNDARY_BEFORE; 397 } else { 398 // The mapping is comp-normalized. 399 norm.type=Norm::NO_NO_COMP_YES; 400 } 401 } 402 } else { // no mapping 403 norm.leadCC=norm.trailCC=norm.cc; 404 405 norm.hasCompBoundaryBefore= 406 norm.cc==0 && !norm.combinesBack; 407 norm.hasCompBoundaryAfter= 408 norm.cc==0 && !norm.combinesBack && !norm.combinesFwd(); 409 410 if(norm.combinesBack) { 411 if(norm.combinesFwd()) { 412 // Earlier code checked ccc=0. 413 norm.type=Norm::MAYBE_YES_COMBINES_FWD; 414 } else { 415 norm.type=Norm::MAYBE_YES_SIMPLE; // any ccc 416 } 417 } else if(norm.combinesFwd()) { 418 // Earlier code checked ccc=0. 419 norm.type=Norm::YES_YES_COMBINES_FWD; 420 } else if(norm.cc!=0) { 421 norm.type=Norm::YES_YES_WITH_CC; 422 } else { 423 norm.type=Norm::INERT; 424 } 425 } 426 } 427 428 class Norm16Writer : public Norms::Enumerator { 429 public: 430 Norm16Writer(UMutableCPTrie *trie, Norms &n, Normalizer2DataBuilder &b) : 431 Norms::Enumerator(n), builder(b), norm16Trie(trie) {} 432 void rangeHandler(UChar32 start, UChar32 end, Norm &norm) override { 433 builder.writeNorm16(norm16Trie, start, end, norm); 434 } 435 Normalizer2DataBuilder &builder; 436 UMutableCPTrie *norm16Trie; 437 }; 438 439 void Normalizer2DataBuilder::setSmallFCD(UChar32 c) { 440 UChar32 lead= c<=0xffff ? c : U16_LEAD(c); 441 smallFCD[lead >> 8] |= static_cast<uint8_t>(1) << ((lead >> 5) & 7); 442 } 443 444 void Normalizer2DataBuilder::writeNorm16(UMutableCPTrie *norm16Trie, UChar32 start, UChar32 end, Norm &norm) { 445 if((norm.leadCC|norm.trailCC)!=0) { 446 for(UChar32 c=start; c<=end; ++c) { 447 setSmallFCD(c); 448 } 449 } 450 451 int32_t norm16; 452 switch(norm.type) { 453 case Norm::INERT: 454 norm16=Normalizer2Impl::INERT; 455 break; 456 case Norm::YES_YES_COMBINES_FWD: 457 norm16=norm.offset*2; 458 break; 459 case Norm::YES_NO_COMBINES_FWD: 460 norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+norm.offset*2; 461 break; 462 case Norm::YES_NO_MAPPING_ONLY: 463 norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+norm.offset*2; 464 break; 465 case Norm::NO_NO_COMP_YES: 466 norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+norm.offset*2; 467 break; 468 case Norm::NO_NO_COMP_BOUNDARY_BEFORE: 469 norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]+norm.offset*2; 470 break; 471 case Norm::NO_NO_COMP_NO_MAYBE_CC: 472 norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_NO_MAYBE_CC]+norm.offset*2; 473 break; 474 case Norm::NO_NO_EMPTY: 475 norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO_EMPTY]+norm.offset*2; 476 break; 477 case Norm::NO_NO_DELTA: 478 { 479 // Positive offset from minNoNoDelta, shifted left for additional bits. 480 int32_t offset=(norm.offset+Normalizer2Impl::MAX_DELTA)<<Normalizer2Impl::DELTA_SHIFT; 481 if(norm.trailCC==0) { 482 // DELTA_TCCC_0==0 483 } else if(norm.trailCC==1) { 484 offset|=Normalizer2Impl::DELTA_TCCC_1; 485 } else { 486 offset|=Normalizer2Impl::DELTA_TCCC_GT_1; 487 } 488 norm16=getMinNoNoDelta()+offset; 489 break; 490 } 491 case Norm::MAYBE_NO_MAPPING_ONLY: 492 norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_NO]+norm.offset*2; 493 break; 494 case Norm::MAYBE_NO_COMBINES_FWD: 495 norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_NO_COMBINES_FWD]+norm.offset*2; 496 break; 497 case Norm::MAYBE_YES_COMBINES_FWD: 498 norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]+norm.offset*2; 499 break; 500 case Norm::MAYBE_YES_SIMPLE: 501 norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+norm.cc*2; // ccc=0..255 502 break; 503 case Norm::YES_YES_WITH_CC: 504 U_ASSERT(norm.cc!=0); 505 norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-2+norm.cc*2; // ccc=1..255 506 break; 507 default: // Should not occur. 508 exit(U_INTERNAL_PROGRAM_ERROR); 509 } 510 U_ASSERT((norm16&1)==0); 511 if(norm.hasCompBoundaryAfter) { 512 norm16|=Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER; 513 } 514 IcuToolErrorCode errorCode("gennorm2/writeNorm16()"); 515 umutablecptrie_setRange(norm16Trie, start, end, static_cast<uint32_t>(norm16), errorCode); 516 517 // Set the minimum code points for real data lookups in the quick check loops. 518 UBool isDecompNo= 519 (Norm::YES_NO_COMBINES_FWD<=norm.type && norm.type<=Norm::NO_NO_DELTA) || 520 norm.cc!=0; 521 if(isDecompNo && start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) { 522 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=start; 523 } 524 UBool isCompNoMaybe= norm.type>=Norm::NO_NO_COMP_YES; 525 if(isCompNoMaybe && start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) { 526 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=start; 527 } 528 if(norm.leadCC!=0 && start<indexes[Normalizer2Impl::IX_MIN_LCCC_CP]) { 529 indexes[Normalizer2Impl::IX_MIN_LCCC_CP]=start; 530 } 531 } 532 533 void Normalizer2DataBuilder::setHangulData(UMutableCPTrie *norm16Trie) { 534 HangulIterator hi; 535 const HangulIterator::Range *range; 536 // Check that none of the Hangul/Jamo code points have data. 537 while((range=hi.nextRange())!=nullptr) { 538 for(UChar32 c=range->start; c<=range->end; ++c) { 539 if(umutablecptrie_get(norm16Trie, c)>Normalizer2Impl::INERT) { 540 fprintf(stderr, 541 "gennorm2 error: " 542 "illegal mapping/composition/ccc data for Hangul or Jamo U+%04lX\n", 543 static_cast<long>(c)); 544 exit(U_INVALID_FORMAT_ERROR); 545 } 546 } 547 } 548 // Set data for algorithmic runtime handling. 549 IcuToolErrorCode errorCode("gennorm2/setHangulData()"); 550 551 // Jamo V/T are maybeYes 552 if(Hangul::JAMO_V_BASE<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) { 553 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=Hangul::JAMO_V_BASE; 554 } 555 umutablecptrie_setRange(norm16Trie, Hangul::JAMO_L_BASE, Hangul::JAMO_L_END, 556 Normalizer2Impl::JAMO_L, errorCode); 557 umutablecptrie_setRange(norm16Trie, Hangul::JAMO_V_BASE, Hangul::JAMO_V_END, 558 Normalizer2Impl::JAMO_VT, errorCode); 559 // JAMO_T_BASE+1: not U+11A7 560 umutablecptrie_setRange(norm16Trie, Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_END, 561 Normalizer2Impl::JAMO_VT, errorCode); 562 563 // Hangul LV encoded as minYesNo 564 uint32_t lv=indexes[Normalizer2Impl::IX_MIN_YES_NO]; 565 // Hangul LVT encoded as minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER 566 uint32_t lvt=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]| 567 Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER; 568 if(Hangul::HANGUL_BASE<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) { 569 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=Hangul::HANGUL_BASE; 570 } 571 // Set the first LV, then write all other Hangul syllables as LVT, 572 // then overwrite the remaining LV. 573 umutablecptrie_set(norm16Trie, Hangul::HANGUL_BASE, lv, errorCode); 574 umutablecptrie_setRange(norm16Trie, Hangul::HANGUL_BASE+1, Hangul::HANGUL_END, lvt, errorCode); 575 UChar32 c=Hangul::HANGUL_BASE; 576 while((c+=Hangul::JAMO_T_COUNT)<=Hangul::HANGUL_END) { 577 umutablecptrie_set(norm16Trie, c, lv, errorCode); 578 } 579 errorCode.assertSuccess(); 580 } 581 582 LocalUCPTriePointer Normalizer2DataBuilder::processData() { 583 // Build composition lists before recursive decomposition, 584 // so that we still have the raw, pair-wise mappings. 585 CompositionBuilder compBuilder(norms); 586 norms.enumRanges(compBuilder); 587 588 // Recursively decompose all mappings. 589 Decomposer decomposer(norms); 590 do { 591 decomposer.didDecompose=false; 592 norms.enumRanges(decomposer); 593 } while(decomposer.didDecompose); 594 595 // Set the Norm::Type and other properties. 596 int32_t normsLength=norms.length(); 597 for(int32_t i=1; i<normsLength; ++i) { 598 postProcess(norms.getNormRefByIndex(i)); 599 } 600 601 // Write the properties, mappings and composition lists to 602 // appropriate parts of the "extra data" array. 603 ExtraData extra(norms, optimization==OPTIMIZE_FAST); 604 norms.enumRanges(extra); 605 606 extraData=extra.yesYesCompositions; 607 indexes[Normalizer2Impl::IX_MIN_YES_NO]=extraData.length()*2; 608 extraData.append(extra.yesNoMappingsAndCompositions); 609 indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]=extraData.length()*2; 610 extraData.append(extra.yesNoMappingsOnly); 611 indexes[Normalizer2Impl::IX_MIN_NO_NO]=extraData.length()*2; 612 extraData.append(extra.noNoMappingsCompYes); 613 indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]=extraData.length()*2; 614 extraData.append(extra.noNoMappingsCompBoundaryBefore); 615 indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_NO_MAYBE_CC]=extraData.length()*2; 616 extraData.append(extra.noNoMappingsCompNoMaybeCC); 617 indexes[Normalizer2Impl::IX_MIN_NO_NO_EMPTY]=extraData.length()*2; 618 extraData.append(extra.noNoMappingsEmpty); 619 indexes[Normalizer2Impl::IX_LIMIT_NO_NO]=extraData.length()*2; 620 621 int32_t maybeDataLength= 622 extra.maybeNoMappingsOnly.length()+ 623 extra.maybeNoMappingsAndCompositions.length()+ 624 extra.maybeYesCompositions.length(); 625 int32_t minMaybeNo=Normalizer2Impl::MIN_NORMAL_MAYBE_YES-maybeDataLength*2; 626 // Adjust minMaybeNo down to 8-align it, 627 // so that NO_NO_DELTA bits 2..1 can be used without subtracting the center. 628 minMaybeNo&=~7; 629 630 int32_t index=minMaybeNo; 631 indexes[Normalizer2Impl::IX_MIN_MAYBE_NO]=index; 632 extraData.append(extra.maybeNoMappingsOnly); 633 index+=extra.maybeNoMappingsOnly.length()*2; 634 indexes[Normalizer2Impl::IX_MIN_MAYBE_NO_COMBINES_FWD]=index; 635 extraData.append(extra.maybeNoMappingsAndCompositions); 636 index+=extra.maybeNoMappingsAndCompositions.length()*2; 637 indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]=index; 638 extraData.append(extra.maybeYesCompositions); 639 640 // Pad the extraData to even length for 4-byte alignment of following data. 641 if(extraData.length()&1) { 642 extraData.append(static_cast<char16_t>(0)); 643 } 644 645 int32_t minNoNoDelta=getMinNoNoDelta(); 646 U_ASSERT((minNoNoDelta&7)==0); 647 if(indexes[Normalizer2Impl::IX_LIMIT_NO_NO]>minNoNoDelta) { 648 fprintf(stderr, 649 "gennorm2 error: " 650 "data structure overflow, too much mapping composition data\n"); 651 exit(U_BUFFER_OVERFLOW_ERROR); 652 } 653 654 // writeNorm16() and setHangulData() reduce these as needed. 655 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=0x110000; 656 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=0x110000; 657 indexes[Normalizer2Impl::IX_MIN_LCCC_CP]=0x110000; 658 659 IcuToolErrorCode errorCode("gennorm2/processData()"); 660 UMutableCPTrie *norm16Trie = umutablecptrie_open( 661 Normalizer2Impl::INERT, Normalizer2Impl::INERT, errorCode); 662 errorCode.assertSuccess(); 663 664 // Map each code point to its norm16 value, 665 // including the properties that fit directly, 666 // and the offset to the "extra data" if necessary. 667 Norm16Writer norm16Writer(norm16Trie, norms, *this); 668 norms.enumRanges(norm16Writer); 669 // TODO: iterate via getRange() instead of callback? 670 671 setHangulData(norm16Trie); 672 673 // Look for the "worst" norm16 value of any supplementary code point 674 // corresponding to a lead surrogate, and set it as that surrogate's value. 675 // Enables UTF-16 quick check inner loops to look at only code units. 676 // 677 // We could be more sophisticated: 678 // We could collect a bit set for whether there are values in the different 679 // norm16 ranges (yesNo, maybeYes, yesYesWithCC etc.) 680 // and select the best value that only breaks the composition and/or decomposition 681 // inner loops if necessary. 682 // However, that seems like overkill for an optimization for supplementary characters. 683 // 684 // First check that surrogate code *points* are inert. 685 // The parser should have rejected values/mappings for them. 686 uint32_t value; 687 UChar32 end = umutablecptrie_getRange(norm16Trie, 0xd800, UCPMAP_RANGE_NORMAL, 0, 688 nullptr, nullptr, &value); 689 if (value != Normalizer2Impl::INERT || end < 0xdfff) { 690 fprintf(stderr, 691 "gennorm2 error: not all surrogate code points are inert: U+d800..U+%04x=%lx\n", 692 static_cast<int>(end), static_cast<long>(value)); 693 exit(U_INTERNAL_PROGRAM_ERROR); 694 } 695 uint32_t maxNorm16 = 0; 696 // ANDing values yields 0 bits where any value has a 0. 697 // Used for worst-case HAS_COMP_BOUNDARY_AFTER. 698 uint32_t andedNorm16 = 0; 699 end = 0; 700 for (UChar32 start = 0x10000;;) { 701 if (start > end) { 702 end = umutablecptrie_getRange(norm16Trie, start, UCPMAP_RANGE_NORMAL, 0, 703 nullptr, nullptr, &value); 704 if (end < 0) { break; } 705 } 706 if ((start & 0x3ff) == 0) { 707 // Data for a new lead surrogate. 708 maxNorm16 = andedNorm16 = value; 709 } else { 710 if (value > maxNorm16) { 711 maxNorm16 = value; 712 } 713 andedNorm16 &= value; 714 } 715 // Intersect each range with the code points for one lead surrogate. 716 UChar32 leadEnd = start | 0x3ff; 717 if (leadEnd <= end) { 718 // End of the supplementary block for a lead surrogate. 719 if (maxNorm16 >= static_cast<uint32_t>(indexes[Normalizer2Impl::IX_LIMIT_NO_NO])) { 720 // Set noNo ("worst" value) if it got into "less-bad" maybeYes or ccc!=0. 721 // Otherwise it might end up at something like JAMO_VT which stays in 722 // the inner decomposition quick check loop. 723 maxNorm16 = static_cast<uint32_t>(indexes[Normalizer2Impl::IX_LIMIT_NO_NO]); 724 } 725 maxNorm16 = 726 (maxNorm16 & ~Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER)| 727 (andedNorm16 & Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER); 728 if (maxNorm16 != Normalizer2Impl::INERT) { 729 umutablecptrie_set(norm16Trie, U16_LEAD(start), maxNorm16, errorCode); 730 } 731 if (value == Normalizer2Impl::INERT) { 732 // Potentially skip inert supplementary blocks for several lead surrogates. 733 start = (end + 1) & ~0x3ff; 734 } else { 735 start = leadEnd + 1; 736 } 737 } else { 738 start = end + 1; 739 } 740 } 741 742 // Adjust supplementary minimum code points to break quick check loops at their lead surrogates. 743 // For an empty data file, minCP=0x110000 turns into 0xdc00 (first trail surrogate) 744 // which is harmless. 745 // As a result, the minimum code points are always BMP code points. 746 int32_t minCP=indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]; 747 if(minCP>=0x10000) { 748 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=U16_LEAD(minCP); 749 } 750 minCP=indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]; 751 if(minCP>=0x10000) { 752 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=U16_LEAD(minCP); 753 } 754 minCP=indexes[Normalizer2Impl::IX_MIN_LCCC_CP]; 755 if(minCP>=0x10000) { 756 indexes[Normalizer2Impl::IX_MIN_LCCC_CP]=U16_LEAD(minCP); 757 } 758 759 LocalUCPTriePointer builtTrie( 760 umutablecptrie_buildImmutable(norm16Trie, UCPTRIE_TYPE_FAST, UCPTRIE_VALUE_BITS_16, errorCode)); 761 norm16TrieLength=ucptrie_toBinary(builtTrie.getAlias(), nullptr, 0, errorCode); 762 if(errorCode.get()!=U_BUFFER_OVERFLOW_ERROR) { 763 fprintf(stderr, "gennorm2 error: unable to build/serialize the normalization trie - %s\n", 764 errorCode.errorName()); 765 exit(errorCode.reset()); 766 } 767 umutablecptrie_close(norm16Trie); 768 errorCode.reset(); 769 norm16TrieBytes=new uint8_t[norm16TrieLength]; 770 ucptrie_toBinary(builtTrie.getAlias(), norm16TrieBytes, norm16TrieLength, errorCode); 771 errorCode.assertSuccess(); 772 773 int32_t offset = static_cast<int32_t>(sizeof(indexes)); 774 indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]=offset; 775 offset+=norm16TrieLength; 776 indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]=offset; 777 offset+=extraData.length()*2; 778 indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET]=offset; 779 offset+=sizeof(smallFCD); 780 int32_t totalSize=offset; 781 for(int32_t i=Normalizer2Impl::IX_RESERVED3_OFFSET; i<=Normalizer2Impl::IX_TOTAL_SIZE; ++i) { 782 indexes[i]=totalSize; 783 } 784 785 if(beVerbose) { 786 printf("size of normalization trie: %5ld bytes\n", static_cast<long>(norm16TrieLength)); 787 printf("size of 16-bit extra data: %5ld uint16_t\n", static_cast<long>(extraData.length())); 788 printf("size of small-FCD data: %5ld bytes\n", static_cast<long>(sizeof(smallFCD))); 789 printf("size of binary data file contents: %5ld bytes\n", static_cast<long>(totalSize)); 790 printf("minDecompNoCodePoint: U+%04lX\n", 791 static_cast<long>(indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP])); 792 printf("minCompNoMaybeCodePoint: U+%04lX\n", 793 static_cast<long>(indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP])); 794 printf("minLcccCodePoint: U+%04lX\n", 795 static_cast<long>(indexes[Normalizer2Impl::IX_MIN_LCCC_CP])); 796 printf("minYesNo: (with compositions) 0x%04x\n", 797 static_cast<int>(indexes[Normalizer2Impl::IX_MIN_YES_NO])); 798 printf("minYesNoMappingsOnly: 0x%04x\n", 799 static_cast<int>(indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY])); 800 printf("minNoNo: (comp-normalized) 0x%04x\n", 801 static_cast<int>(indexes[Normalizer2Impl::IX_MIN_NO_NO])); 802 printf("minNoNoCompBoundaryBefore: 0x%04x\n", 803 static_cast<int>(indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE])); 804 printf("minNoNoCompNoMaybeCC: 0x%04x\n", 805 static_cast<int>(indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_NO_MAYBE_CC])); 806 printf("minNoNoEmpty: 0x%04x\n", 807 static_cast<int>(indexes[Normalizer2Impl::IX_MIN_NO_NO_EMPTY])); 808 printf("limitNoNo: 0x%04x\n", 809 static_cast<int>(indexes[Normalizer2Impl::IX_LIMIT_NO_NO])); 810 printf("minNoNoDelta: 0x%04x\n", 811 static_cast<int>(minNoNoDelta)); 812 printf("minMaybeNo: 0x%04x\n", 813 static_cast<int>(indexes[Normalizer2Impl::IX_MIN_MAYBE_NO])); 814 printf("minMaybeNoCombinesFwd: 0x%04x\n", 815 static_cast<int>(indexes[Normalizer2Impl::IX_MIN_MAYBE_NO_COMBINES_FWD])); 816 printf("minMaybeYes: 0x%04x\n", 817 static_cast<int>(indexes[Normalizer2Impl::IX_MIN_MAYBE_YES])); 818 } 819 820 UVersionInfo nullVersion={ 0, 0, 0, 0 }; 821 if(0==memcmp(nullVersion, unicodeVersion, 4)) { 822 u_versionFromString(unicodeVersion, U_UNICODE_VERSION); 823 } 824 memcpy(dataInfo.dataVersion, unicodeVersion, 4); 825 return builtTrie; 826 } 827 828 void Normalizer2DataBuilder::writeBinaryFile(const char *filename) { 829 processData(); 830 831 IcuToolErrorCode errorCode("gennorm2/writeBinaryFile()"); 832 UNewDataMemory *pData= 833 udata_create(nullptr, nullptr, filename, &dataInfo, 834 haveCopyright ? U_COPYRIGHT_STRING : nullptr, errorCode); 835 if(errorCode.isFailure()) { 836 fprintf(stderr, "gennorm2 error: unable to create the output file %s - %s\n", 837 filename, errorCode.errorName()); 838 exit(errorCode.reset()); 839 } 840 udata_writeBlock(pData, indexes, sizeof(indexes)); 841 udata_writeBlock(pData, norm16TrieBytes, norm16TrieLength); 842 udata_writeUString(pData, toUCharPtr(extraData.getBuffer()), extraData.length()); 843 udata_writeBlock(pData, smallFCD, sizeof(smallFCD)); 844 int32_t writtenSize=udata_finish(pData, errorCode); 845 if(errorCode.isFailure()) { 846 fprintf(stderr, "gennorm2: error %s writing the output file\n", errorCode.errorName()); 847 exit(errorCode.reset()); 848 } 849 int32_t totalSize=indexes[Normalizer2Impl::IX_TOTAL_SIZE]; 850 if(writtenSize!=totalSize) { 851 fprintf(stderr, "gennorm2 error: written size %ld != calculated size %ld\n", 852 static_cast<long>(writtenSize), static_cast<long>(totalSize)); 853 exit(U_INTERNAL_PROGRAM_ERROR); 854 } 855 } 856 857 void 858 Normalizer2DataBuilder::writeCSourceFile(const char *filename) { 859 LocalUCPTriePointer norm16Trie = processData(); 860 861 IcuToolErrorCode errorCode("gennorm2/writeCSourceFile()"); 862 const char *basename=findBasename(filename); 863 CharString path(filename, static_cast<int32_t>(basename - filename), errorCode); 864 CharString dataName(basename, errorCode); 865 const char *extension=strrchr(basename, '.'); 866 if(extension!=nullptr) { 867 dataName.truncate(static_cast<int32_t>(extension - basename)); 868 } 869 const char *name=dataName.data(); 870 errorCode.assertSuccess(); 871 872 FILE *f=usrc_create(path.data(), basename, 2016, "icu/source/tools/gennorm2/n2builder.cpp"); 873 if(f==nullptr) { 874 fprintf(stderr, "gennorm2/writeCSourceFile() error: unable to create the output file %s\n", 875 filename); 876 exit(U_FILE_ACCESS_ERROR); 877 } 878 fputs("#ifdef INCLUDED_FROM_NORMALIZER2_CPP\n\n", f); 879 880 char line[100]; 881 snprintf(line, sizeof(line), "static const UVersionInfo %s_formatVersion={", name); 882 usrc_writeArray(f, line, dataInfo.formatVersion, 8, 4, "", "};\n"); 883 snprintf(line, sizeof(line), "static const UVersionInfo %s_dataVersion={", name); 884 usrc_writeArray(f, line, dataInfo.dataVersion, 8, 4, "", "};\n\n"); 885 snprintf(line, sizeof(line), "static const int32_t %s_indexes[Normalizer2Impl::IX_COUNT]={\n", name); 886 usrc_writeArray(f, line, indexes, 32, Normalizer2Impl::IX_COUNT, "", "\n};\n\n"); 887 888 usrc_writeUCPTrie(f, name, norm16Trie.getAlias(), UPRV_TARGET_SYNTAX_CCODE); 889 890 snprintf(line, sizeof(line), "static const uint16_t %s_extraData[%%ld]={\n", name); 891 usrc_writeArray(f, line, extraData.getBuffer(), 16, extraData.length(), "", "\n};\n\n"); 892 snprintf(line, sizeof(line), "static const uint8_t %s_smallFCD[%%ld]={\n", name); 893 usrc_writeArray(f, line, smallFCD, 8, sizeof(smallFCD), "", "\n};\n\n"); 894 895 fputs("#endif // INCLUDED_FROM_NORMALIZER2_CPP\n", f); 896 fclose(f); 897 } 898 899 namespace { 900 901 bool equalStrings(const UnicodeString *s1, const UnicodeString *s2) { 902 if(s1 == nullptr) { 903 return s2 == nullptr; 904 } else if(s2 == nullptr) { 905 return false; 906 } else { 907 return *s1 == *s2; 908 } 909 } 910 911 const char *typeChars = "?-=>"; 912 913 void writeMapping(FILE *f, const UnicodeString *m) { 914 if(m != nullptr && !m->isEmpty()) { 915 int32_t i = 0; 916 UChar32 c = m->char32At(i); 917 fprintf(f, "%04lX", static_cast<long>(c)); 918 while((i += U16_LENGTH(c)) < m->length()) { 919 c = m->char32At(i); 920 fprintf(f, " %04lX", static_cast<long>(c)); 921 } 922 } 923 fputs("\n", f); 924 } 925 926 } // namespace 927 928 void 929 Normalizer2DataBuilder::writeDataFile(const char *filename, bool writeRemoved) const { 930 // Do not processData() before writing the input-syntax data file. 931 FILE *f = fopen(filename, "w"); 932 if(f == nullptr) { 933 fprintf(stderr, "gennorm2/writeDataFile() error: unable to create the output file %s\n", 934 filename); 935 exit(U_FILE_ACCESS_ERROR); 936 return; 937 } 938 939 if(unicodeVersion[0] != 0 || unicodeVersion[1] != 0 || 940 unicodeVersion[2] != 0 || unicodeVersion[3] != 0) { 941 char uv[U_MAX_VERSION_STRING_LENGTH]; 942 u_versionToString(unicodeVersion, uv); 943 fprintf(f, "* Unicode %s\n\n", uv); 944 } 945 946 UnicodeSetIterator ccIter(norms.ccSet); 947 UChar32 start = U_SENTINEL; 948 UChar32 end = U_SENTINEL; 949 uint8_t prevCC = 0; 950 bool done = false; 951 bool didWrite = false; 952 do { 953 UChar32 c; 954 uint8_t cc; 955 if(ccIter.next() && !ccIter.isString()) { 956 c = ccIter.getCodepoint(); 957 cc = norms.getCC(c); 958 } else { 959 c = 0x110000; 960 cc = 0; 961 done = true; 962 } 963 if(cc == prevCC && c == (end + 1)) { 964 end = c; 965 } else { 966 if(prevCC != 0) { 967 if(start == end) { 968 fprintf(f, "%04lX:%d\n", static_cast<long>(start), static_cast<int>(prevCC)); 969 } else { 970 fprintf(f, "%04lX..%04lX:%d\n", static_cast<long>(start), static_cast<long>(end), static_cast<int>(prevCC)); 971 } 972 didWrite = true; 973 } 974 start = end = c; 975 prevCC = cc; 976 } 977 } while(!done); 978 if(didWrite) { 979 fputs("\n", f); 980 } 981 982 UnicodeSetIterator mIter(norms.mappingSet); 983 start = U_SENTINEL; 984 end = U_SENTINEL; 985 const UnicodeString *prevMapping = nullptr; 986 Norm::MappingType prevType = Norm::NONE; 987 done = false; 988 do { 989 UChar32 c; 990 const Norm *norm; 991 if(mIter.next() && !mIter.isString()) { 992 c = mIter.getCodepoint(); 993 norm = norms.getNorm(c); 994 } else { 995 c = 0x110000; 996 norm = nullptr; 997 done = true; 998 } 999 const UnicodeString *mapping; 1000 Norm::MappingType type; 1001 if(norm == nullptr) { 1002 mapping = nullptr; 1003 type = Norm::NONE; 1004 } else { 1005 type = norm->mappingType; 1006 if(type == Norm::NONE) { 1007 mapping = nullptr; 1008 } else { 1009 mapping = norm->mapping; 1010 } 1011 } 1012 if(type == prevType && equalStrings(mapping, prevMapping) && c == (end + 1)) { 1013 end = c; 1014 } else { 1015 if(writeRemoved ? prevType != Norm::NONE : prevType > Norm::REMOVED) { 1016 if(start == end) { 1017 fprintf(f, "%04lX%c", static_cast<long>(start), typeChars[prevType]); 1018 } else { 1019 fprintf(f, "%04lX..%04lX%c", static_cast<long>(start), static_cast<long>(end), typeChars[prevType]); 1020 } 1021 writeMapping(f, prevMapping); 1022 } 1023 start = end = c; 1024 prevMapping = mapping; 1025 prevType = type; 1026 } 1027 } while(!done); 1028 1029 fclose(f); 1030 } 1031 1032 void 1033 Normalizer2DataBuilder::computeDiff(const Normalizer2DataBuilder &b1, 1034 const Normalizer2DataBuilder &b2, 1035 Normalizer2DataBuilder &diff) { 1036 // Compute diff = b1 - b2 1037 // so that we should be able to get b1 = b2 + diff. 1038 if(0 != memcmp(b1.unicodeVersion, b2.unicodeVersion, U_MAX_VERSION_LENGTH)) { 1039 memcpy(diff.unicodeVersion, b1.unicodeVersion, U_MAX_VERSION_LENGTH); 1040 } 1041 1042 UnicodeSet ccSet(b1.norms.ccSet); 1043 ccSet.addAll(b2.norms.ccSet); 1044 UnicodeSetIterator ccIter(ccSet); 1045 while(ccIter.next() && !ccIter.isString()) { 1046 UChar32 c = ccIter.getCodepoint(); 1047 uint8_t cc1 = b1.norms.getCC(c); 1048 uint8_t cc2 = b2.norms.getCC(c); 1049 if(cc1 != cc2) { 1050 diff.setCC(c, cc1); 1051 } 1052 } 1053 1054 UnicodeSet mSet(b1.norms.mappingSet); 1055 mSet.addAll(b2.norms.mappingSet); 1056 UnicodeSetIterator mIter(mSet); 1057 while(mIter.next() && !mIter.isString()) { 1058 UChar32 c = mIter.getCodepoint(); 1059 const Norm *norm1 = b1.norms.getNorm(c); 1060 const Norm *norm2 = b2.norms.getNorm(c); 1061 const UnicodeString *mapping1; 1062 Norm::MappingType type1; 1063 if(norm1 == nullptr || !norm1->hasMapping()) { 1064 mapping1 = nullptr; 1065 type1 = Norm::NONE; 1066 } else { 1067 mapping1 = norm1->mapping; 1068 type1 = norm1->mappingType; 1069 } 1070 const UnicodeString *mapping2; 1071 Norm::MappingType type2; 1072 if(norm2 == nullptr || !norm2->hasMapping()) { 1073 mapping2 = nullptr; 1074 type2 = Norm::NONE; 1075 } else { 1076 mapping2 = norm2->mapping; 1077 type2 = norm2->mappingType; 1078 } 1079 if(type1 == type2 && equalStrings(mapping1, mapping2)) { 1080 // Nothing to do. 1081 } else if(type1 == Norm::NONE) { 1082 diff.removeMapping(c); 1083 } else if(type1 == Norm::ROUND_TRIP) { 1084 diff.setRoundTripMapping(c, *mapping1); 1085 } else if(type1 == Norm::ONE_WAY) { 1086 diff.setOneWayMapping(c, *mapping1); 1087 } 1088 } 1089 } 1090 1091 U_NAMESPACE_END 1092 1093 #endif /* #if !UCONFIG_NO_NORMALIZATION */ 1094 1095 /* 1096 * Hey, Emacs, please set the following: 1097 * 1098 * Local Variables: 1099 * indent-tabs-mode: nil 1100 * End: 1101 */