[ tor-browser ].git.dasho

n2builder.cpp (43661B)
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 2009-2016, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *   file name:  n2builder.cpp
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2009nov25
     16 *   created by: Markus W. Scherer
     17 *
     18 * Builds Normalizer2 data and writes a binary .nrm file.
     19 * For the file format see source/common/normalizer2impl.h.
     20 */
     21 
     22 #include "unicode/utypes.h"
     23 #include "n2builder.h"
     24 
     25 #include <stdio.h>
     26 #include <stdlib.h>
     27 #include <string.h>
     28 #include <vector>
     29 #include "unicode/errorcode.h"
     30 #include "unicode/localpointer.h"
     31 #include "unicode/putil.h"
     32 #include "unicode/ucptrie.h"
     33 #include "unicode/udata.h"
     34 #include "unicode/umutablecptrie.h"
     35 #include "unicode/uniset.h"
     36 #include "unicode/unistr.h"
     37 #include "unicode/usetiter.h"
     38 #include "unicode/ustring.h"
     39 #include "charstr.h"
     40 #include "extradata.h"
     41 #include "hash.h"
     42 #include "normalizer2impl.h"
     43 #include "norms.h"
     44 #include "toolutil.h"
     45 #include "unewdata.h"
     46 #include "uvectr32.h"
     47 #include "writesrc.h"
     48 
     49 #if !UCONFIG_NO_NORMALIZATION
     50 
     51 /* UDataInfo cf. udata.h */
     52 static UDataInfo dataInfo={
     53    sizeof(UDataInfo),
     54    0,
     55 
     56    U_IS_BIG_ENDIAN,
     57    U_CHARSET_FAMILY,
     58    U_SIZEOF_UCHAR,
     59    0,
     60 
     61    { 0x4e, 0x72, 0x6d, 0x32 }, /* dataFormat="Nrm2" */
     62    { 5, 0, 0, 0 },             /* formatVersion */
     63    { 16, 0, 0, 0 }             /* dataVersion (Unicode version) */
     64 };
     65 
     66 U_NAMESPACE_BEGIN
     67 
     68 class HangulIterator {
     69 public:
     70    struct Range {
     71        UChar32 start, end;
     72    };
     73 
     74    HangulIterator() : rangeIndex(0) {}
     75    const Range *nextRange() {
     76        if(rangeIndex<UPRV_LENGTHOF(ranges)) {
     77            return ranges+rangeIndex++;
     78        } else {
     79            return nullptr;
     80        }
     81    }
     82 private:
     83    static const Range ranges[4];
     84    int32_t rangeIndex;
     85 };
     86 
     87 const HangulIterator::Range HangulIterator::ranges[4]={
     88    { Hangul::JAMO_L_BASE, Hangul::JAMO_L_END },
     89    { Hangul::JAMO_V_BASE, Hangul::JAMO_V_END },
     90    // JAMO_T_BASE+1: not U+11A7
     91    { Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_END },
     92    { Hangul::HANGUL_BASE, Hangul::HANGUL_END },
     93 };
     94 
     95 Normalizer2DataBuilder::Normalizer2DataBuilder(UErrorCode &errorCode) :
     96        norms(errorCode),
     97        phase(0), overrideHandling(OVERRIDE_PREVIOUS), optimization(OPTIMIZE_NORMAL),
     98        norm16TrieBytes(nullptr), norm16TrieLength(0) {
     99    memset(unicodeVersion, 0, sizeof(unicodeVersion));
    100    memset(indexes, 0, sizeof(indexes));
    101    memset(smallFCD, 0, sizeof(smallFCD));
    102 }
    103 
    104 Normalizer2DataBuilder::~Normalizer2DataBuilder() {
    105    delete[] norm16TrieBytes;
    106 }
    107 
    108 void
    109 Normalizer2DataBuilder::setUnicodeVersion(const char *v) {
    110    UVersionInfo nullVersion={ 0, 0, 0, 0 };
    111    UVersionInfo version;
    112    u_versionFromString(version, v);
    113    if( 0!=memcmp(version, unicodeVersion, U_MAX_VERSION_LENGTH) &&
    114        0!=memcmp(nullVersion, unicodeVersion, U_MAX_VERSION_LENGTH)
    115    ) {
    116        char buffer[U_MAX_VERSION_STRING_LENGTH];
    117        u_versionToString(unicodeVersion, buffer);
    118        fprintf(stderr, "gennorm2 error: multiple inconsistent Unicode version numbers %s vs. %s\n",
    119                buffer, v);
    120        exit(U_ILLEGAL_ARGUMENT_ERROR);
    121    }
    122    memcpy(unicodeVersion, version, U_MAX_VERSION_LENGTH);
    123 }
    124 
    125 Norm *Normalizer2DataBuilder::checkNormForMapping(Norm *p, UChar32 c) {
    126    if(p!=nullptr) {
    127        if(p->mappingType!=Norm::NONE) {
    128            if( overrideHandling==OVERRIDE_NONE ||
    129                (overrideHandling==OVERRIDE_PREVIOUS && p->mappingPhase==phase)
    130            ) {
    131                fprintf(stderr,
    132                        "error in gennorm2 phase %d: "
    133                        "not permitted to override mapping for U+%04lX from phase %d\n",
    134                        static_cast<int>(phase), static_cast<long>(c), static_cast<int>(p->mappingPhase));
    135                exit(U_INVALID_FORMAT_ERROR);
    136            }
    137            delete p->mapping;
    138            p->mapping=nullptr;
    139        }
    140        p->mappingPhase=phase;
    141    }
    142    return p;
    143 }
    144 
    145 void Normalizer2DataBuilder::setOverrideHandling(OverrideHandling oh) {
    146    overrideHandling=oh;
    147    ++phase;
    148 }
    149 
    150 void Normalizer2DataBuilder::setCC(UChar32 c, uint8_t cc) {
    151    norms.createNorm(c)->cc=cc;
    152    norms.ccSet.add(c);
    153 }
    154 
    155 static UBool isWellFormed(const UnicodeString &s) {
    156    UErrorCode errorCode=U_ZERO_ERROR;
    157    u_strToUTF8(nullptr, 0, nullptr, toUCharPtr(s.getBuffer()), s.length(), &errorCode);
    158    return U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR;
    159 }
    160 
    161 void Normalizer2DataBuilder::setOneWayMapping(UChar32 c, const UnicodeString &m) {
    162    if(!isWellFormed(m)) {
    163        fprintf(stderr,
    164                "error in gennorm2 phase %d: "
    165                "illegal one-way mapping from U+%04lX to malformed string\n",
    166                static_cast<int>(phase), static_cast<long>(c));
    167        exit(U_INVALID_FORMAT_ERROR);
    168    }
    169    Norm *p=checkNormForMapping(norms.createNorm(c), c);
    170    p->mapping=new UnicodeString(m);
    171    p->mappingType=Norm::ONE_WAY;
    172    p->setMappingCP();
    173    norms.mappingSet.add(c);
    174 }
    175 
    176 void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString &m) {
    177    if(U_IS_SURROGATE(c)) {
    178        fprintf(stderr,
    179                "error in gennorm2 phase %d: "
    180                "illegal round-trip mapping from surrogate code point U+%04lX\n",
    181                static_cast<int>(phase), static_cast<long>(c));
    182        exit(U_INVALID_FORMAT_ERROR);
    183    }
    184    if(!isWellFormed(m)) {
    185        fprintf(stderr,
    186                "error in gennorm2 phase %d: "
    187                "illegal round-trip mapping from U+%04lX to malformed string\n",
    188                static_cast<int>(phase), static_cast<long>(c));
    189        exit(U_INVALID_FORMAT_ERROR);
    190    }
    191    int32_t numCP=u_countChar32(toUCharPtr(m.getBuffer()), m.length());
    192    if(numCP!=2) {
    193        fprintf(stderr,
    194                "error in gennorm2 phase %d: "
    195                "illegal round-trip mapping from U+%04lX to %d!=2 code points\n",
    196                static_cast<int>(phase), static_cast<long>(c), static_cast<int>(numCP));
    197        exit(U_INVALID_FORMAT_ERROR);
    198    }
    199    Norm *p=checkNormForMapping(norms.createNorm(c), c);
    200    p->mapping=new UnicodeString(m);
    201    p->mappingType=Norm::ROUND_TRIP;
    202    p->mappingCP=U_SENTINEL;
    203    norms.mappingSet.add(c);
    204 }
    205 
    206 void Normalizer2DataBuilder::removeMapping(UChar32 c) {
    207    // createNorm(c), not getNorm(c), to record a non-mapping and detect conflicting data.
    208    Norm *p=checkNormForMapping(norms.createNorm(c), c);
    209    p->mappingType=Norm::REMOVED;
    210    norms.mappingSet.add(c);
    211 }
    212 
    // Determines whether there is a composition boundary after a mapping,
    // that is, whether the end of the mapping cannot combine with following text.
    //
    // buffer: the mapping's code points and combining classes, read via
    //         charAt()/ccAt(); postProcess() passes the buffer after norms.reorder().
    // mappingType: Norm::ONE_WAY gets an extra check for a trailing mark with ccc>1.
    // Returns false for an empty mapping and for a mapping without a starter.
    213 UBool Normalizer2DataBuilder::mappingHasCompBoundaryAfter(const BuilderReorderingBuffer &buffer,
    214                                                          Norm::MappingType mappingType) const {
    215    if(buffer.isEmpty()) {
    216        return false;  // Maps-to-empty-string is no boundary of any kind.
    217    }
    218    int32_t lastStarterIndex=buffer.lastStarterIndex();
    219    if(lastStarterIndex<0) {
    220        return false;  // no starter
    221    }
    222    const int32_t lastIndex=buffer.length()-1;
    223    if(mappingType==Norm::ONE_WAY && lastStarterIndex<lastIndex && buffer.ccAt(lastIndex)>1) {
    224        // One-way mapping where after the last starter is at least one combining mark
    225        // with a combining class greater than 1,
    226        // which means that another combining mark can reorder before it.
    227        // By contrast, in a round-trip mapping this does not prevent a boundary as long as
    228        // the starter or composite does not combine-forward with a following combining mark.
    229        return false;
    230    }
    231    UChar32 starter=buffer.charAt(lastStarterIndex);
    232    if(lastStarterIndex==0 && norms.combinesBack(starter)) {
    233        // The last starter is at the beginning of the mapping and combines backward.
    234        return false;
    235    }
    236    if(Hangul::isJamoL(starter) ||
    237            (Hangul::isJamoV(starter) &&
    238            0<lastStarterIndex && Hangul::isJamoL(buffer.charAt(lastStarterIndex-1)))) {
    239        // A Jamo leading consonant or an LV pair combines-forward if it is at the end,
    240        // otherwise it is blocked.
    241        return lastStarterIndex!=lastIndex;
    242    }
    243    // Note: There can be no Hangul syllable in the fully decomposed mapping.
    244 
    245    // Multiple starters can combine into one.
    246    // Look for the first of the last sequence of starters, excluding Jamos.
    247    int32_t i=lastStarterIndex;
    248    UChar32 c;
           // The assignment to c inside the condition runs only when the two
           // preceding tests pass, so c is set whenever the loop body executes.
    249    while(0<i && buffer.ccAt(i-1)==0 && !Hangul::isJamo(c=buffer.charAt(i-1))) {
    250        starter=c;
    251        --i;
    252    }
    253    // Compose as far as possible, and see if further compositions with
    254    // characters following this mapping are possible.
    255    const Norm *starterNorm=norms.getNorm(starter);
    256    if(i==lastStarterIndex &&
    257            (starterNorm==nullptr || !starterNorm->combinesFwd())) {
    258        return true;  // The last starter does not combine forward.
    259    }
           // prevCC tracks the combining class of the previous character that was
           // not composed away; it is 0 directly after a starter.
    260    uint8_t prevCC=0;
           // Walk the characters after the starter at index i, simulating composition.
    261    while(++i<buffer.length()) {
    262        uint8_t cc=buffer.ccAt(i);  // !=0 if after last starter
               // NOTE(review): *starterNorm is dereferenced only when i>lastStarterIndex.
               // The checks at lastStarterIndex return true when starterNorm is nullptr,
               // so it appears to be non-null here — confirm.
    263        if(i>lastStarterIndex && norms.combinesWithCCBetween(*starterNorm, prevCC, cc)) {
    264            // The starter combines with a mark that reorders before the current one.
    265            return false;
    266        }
               // This c shadows the function-level c declared before the first loop.
    267        UChar32 c=buffer.charAt(i);
               // The last operand assigns the composite to starter as a side effect;
               // it is evaluated only if all preceding operands are true.
    268        if(starterNorm!=nullptr && (prevCC<cc || prevCC==0) &&
    269                norms.getNormRef(c).combinesBack && (starter=starterNorm->combine(c))>=0) {
    270            // The starter combines with c into a composite replacement starter.
    271            starterNorm=norms.getNorm(starter);
    272            if(i>=lastStarterIndex &&
    273                    (starterNorm==nullptr || !starterNorm->combinesFwd())) {
    274                return true;  // The composite does not combine further.
    275            }
    276            // Keep prevCC because we "removed" the combining mark.
    277        } else if(cc==0) {
    278            starterNorm=norms.getNorm(c);
    279            if(i==lastStarterIndex &&
    280                    (starterNorm==nullptr || !starterNorm->combinesFwd())) {
    281                return true;  // The new starter does not combine forward.
    282            }
    283            prevCC=0;
    284        } else {
    285            prevCC=cc;
    286        }
    287    }
    288    if(prevCC==0) {
    289        return false;  // forward-combining starter at the very end
    290    }
           // 256 is above every uint8_t combining class, so this asks about marks
           // with any class greater than prevCC.
    291    if(norms.combinesWithCCBetween(*starterNorm, prevCC, 256)) {
    292        // The starter combines with another mark.
    293        return false;
    294    }
    295    return true;
    296 }
    297 
    298 UBool Normalizer2DataBuilder::mappingRecomposes(const BuilderReorderingBuffer &buffer) const {
    299    if(buffer.lastStarterIndex()<0) {
    300        return false;  // no starter
    301    }
    302    const Norm *starterNorm=nullptr;
    303    uint8_t prevCC=0;
    304    for(int32_t i=0; i<buffer.length(); ++i) {
    305        UChar32 c=buffer.charAt(i);
    306        uint8_t cc=buffer.ccAt(i);
    307        if(starterNorm!=nullptr && (prevCC<cc || prevCC==0) &&
    308                norms.getNormRef(c).combinesBack && starterNorm->combine(c)>=0) {
    309            return true;  // normal composite
    310        } else if(cc==0) {
    311            if(Hangul::isJamoL(c)) {
    312                if((i+1)<buffer.length() && Hangul::isJamoV(buffer.charAt(i+1))) {
    313                    return true;  // Hangul syllable
    314                }
    315                starterNorm=nullptr;
    316            } else {
    317                starterNorm=norms.getNorm(c);
    318            }
    319        }
    320        prevCC=cc;
    321    }
    322    return false;
    323 }
    324 
    325 void Normalizer2DataBuilder::postProcess(Norm &norm) {
    326    // Prerequisites: Compositions are built, mappings are recursively decomposed.
    327    // Mappings are not yet in canonical order.
    328    //
    329    // This function works on a Norm struct. We do not know which code point(s) map(s) to it.
    330    // Therefore, we cannot compute algorithmic mapping deltas here.
    331    // Error conditions are checked, but printed later when we do know the offending code point.
    332    if(norm.hasMapping()) {
    333        if(norm.mapping->length()>Normalizer2Impl::MAPPING_LENGTH_MASK) {
    334            norm.error="mapping longer than maximum of 31";
    335            return;
    336        }
    337        // Ensure canonical order.
    338        BuilderReorderingBuffer buffer;
    339        if(norm.rawMapping!=nullptr) {
    340            norms.reorder(*norm.rawMapping, buffer);
    341            buffer.reset();
    342        }
    343        norms.reorder(*norm.mapping, buffer);
    344        if(buffer.isEmpty()) {
    345            // A character that is deleted (maps to an empty string) must
    346            // get the worst-case lccc and tccc values because arbitrary
    347            // characters on both sides will become adjacent.
    348            norm.leadCC=1;
    349            norm.trailCC=0xff;
    350        } else {
    351            norm.leadCC=buffer.ccAt(0);
    352            norm.trailCC=buffer.ccAt(buffer.length()-1);
    353        }
    354 
    355        norm.hasCompBoundaryBefore=
    356            !buffer.isEmpty() && norm.leadCC==0 && !norms.combinesBack(buffer.charAt(0));
    357        // No comp-boundary-after when norm.combinesBack:
    358        // MaybeNo character whose first mapping character may combine-back,
    359        // in which case we would not recompose to this character,
    360        // and may need more context.
    361        norm.hasCompBoundaryAfter=
    362            !norm.combinesBack && !norm.combinesFwd() &&
    363            mappingHasCompBoundaryAfter(buffer, norm.mappingType);
    364 
    365        if(norm.combinesBack) {
    366            if(norm.mappingType!=Norm::ROUND_TRIP) {
    367                // One-way mappings don't get NFC_QC=Maybe, and
    368                // should not have gotten combinesBack set.
    369                norm.error="combines-back and has a one-way mapping, "
    370                           "not possible in Unicode normalization";
    371            } else if(norm.combinesFwd()) {
    372                // Earlier code checked ccc=0.
    373                norm.type=Norm::MAYBE_NO_COMBINES_FWD;
    374            } else if(norm.cc==0) {
    375                norm.type=Norm::MAYBE_NO_MAPPING_ONLY;
    376            } else {
    377                norm.error="combines-back and decomposes with ccc!=0, "
    378                           "not possible in Unicode normalization";
    379                // ... because we don't reorder again after composition.
    380            }
    381        } else if(norm.mappingType==Norm::ROUND_TRIP) {
    382            if(norm.combinesFwd()) {
    383                norm.type=Norm::YES_NO_COMBINES_FWD;
    384            } else {
    385                norm.type=Norm::YES_NO_MAPPING_ONLY;
    386            }
    387        } else {  // one-way mapping
    388            if(norm.combinesFwd()) {
    389                norm.error="combines-forward and has a one-way mapping, "
    390                           "not possible in Unicode normalization";
    391            } else if(buffer.isEmpty()) {
    392                norm.type=Norm::NO_NO_EMPTY;
    393            } else if(!norm.hasCompBoundaryBefore) {
    394                norm.type=Norm::NO_NO_COMP_NO_MAYBE_CC;
    395            } else if(mappingRecomposes(buffer)) {
    396                norm.type=Norm::NO_NO_COMP_BOUNDARY_BEFORE;
    397            } else {
    398                // The mapping is comp-normalized.
    399                norm.type=Norm::NO_NO_COMP_YES;
    400            }
    401        }
    402    } else {  // no mapping
    403        norm.leadCC=norm.trailCC=norm.cc;
    404 
    405        norm.hasCompBoundaryBefore=
    406            norm.cc==0 && !norm.combinesBack;
    407        norm.hasCompBoundaryAfter=
    408            norm.cc==0 && !norm.combinesBack && !norm.combinesFwd();
    409 
    410        if(norm.combinesBack) {
    411            if(norm.combinesFwd()) {
    412                // Earlier code checked ccc=0.
    413                norm.type=Norm::MAYBE_YES_COMBINES_FWD;
    414            } else {
    415                norm.type=Norm::MAYBE_YES_SIMPLE;  // any ccc
    416            }
    417        } else if(norm.combinesFwd()) {
    418            // Earlier code checked ccc=0.
    419            norm.type=Norm::YES_YES_COMBINES_FWD;
    420        } else if(norm.cc!=0) {
    421            norm.type=Norm::YES_YES_WITH_CC;
    422        } else {
    423            norm.type=Norm::INERT;
    424        }
    425    }
    426 }
    427 
    428 class Norm16Writer : public Norms::Enumerator {
    429 public:
    430    Norm16Writer(UMutableCPTrie *trie, Norms &n, Normalizer2DataBuilder &b) :
    431            Norms::Enumerator(n), builder(b), norm16Trie(trie) {}
    432    void rangeHandler(UChar32 start, UChar32 end, Norm &norm) override {
    433        builder.writeNorm16(norm16Trie, start, end, norm);
    434    }
    435    Normalizer2DataBuilder &builder;
    436    UMutableCPTrie *norm16Trie;
    437 };
    438 
    439 void Normalizer2DataBuilder::setSmallFCD(UChar32 c) {
    440    UChar32 lead= c<=0xffff ? c : U16_LEAD(c);
    441    smallFCD[lead >> 8] |= static_cast<uint8_t>(1) << ((lead >> 5) & 7);
    442 }
    443 
    444 void Normalizer2DataBuilder::writeNorm16(UMutableCPTrie *norm16Trie, UChar32 start, UChar32 end, Norm &norm) {
    445    if((norm.leadCC|norm.trailCC)!=0) {
    446        for(UChar32 c=start; c<=end; ++c) {
    447            setSmallFCD(c);
    448        }
    449    }
    450 
    451    int32_t norm16;
    452    switch(norm.type) {
    453    case Norm::INERT:
    454        norm16=Normalizer2Impl::INERT;
    455        break;
    456    case Norm::YES_YES_COMBINES_FWD:
    457        norm16=norm.offset*2;
    458        break;
    459    case Norm::YES_NO_COMBINES_FWD:
    460        norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+norm.offset*2;
    461        break;
    462    case Norm::YES_NO_MAPPING_ONLY:
    463        norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+norm.offset*2;
    464        break;
    465    case Norm::NO_NO_COMP_YES:
    466        norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+norm.offset*2;
    467        break;
    468    case Norm::NO_NO_COMP_BOUNDARY_BEFORE:
    469        norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]+norm.offset*2;
    470        break;
    471    case Norm::NO_NO_COMP_NO_MAYBE_CC:
    472        norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_NO_MAYBE_CC]+norm.offset*2;
    473        break;
    474    case Norm::NO_NO_EMPTY:
    475        norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO_EMPTY]+norm.offset*2;
    476        break;
    477    case Norm::NO_NO_DELTA:
    478        {
    479            // Positive offset from minNoNoDelta, shifted left for additional bits.
    480            int32_t offset=(norm.offset+Normalizer2Impl::MAX_DELTA)<<Normalizer2Impl::DELTA_SHIFT;
    481            if(norm.trailCC==0) {
    482                // DELTA_TCCC_0==0
    483            } else if(norm.trailCC==1) {
    484                offset|=Normalizer2Impl::DELTA_TCCC_1;
    485            } else {
    486                offset|=Normalizer2Impl::DELTA_TCCC_GT_1;
    487            }
    488            norm16=getMinNoNoDelta()+offset;
    489            break;
    490        }
    491    case Norm::MAYBE_NO_MAPPING_ONLY:
    492        norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_NO]+norm.offset*2;
    493        break;
    494    case Norm::MAYBE_NO_COMBINES_FWD:
    495        norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_NO_COMBINES_FWD]+norm.offset*2;
    496        break;
    497    case Norm::MAYBE_YES_COMBINES_FWD:
    498        norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]+norm.offset*2;
    499        break;
    500    case Norm::MAYBE_YES_SIMPLE:
    501        norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+norm.cc*2;  // ccc=0..255
    502        break;
    503    case Norm::YES_YES_WITH_CC:
    504        U_ASSERT(norm.cc!=0);
    505        norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-2+norm.cc*2;  // ccc=1..255
    506        break;
    507    default:  // Should not occur.
    508        exit(U_INTERNAL_PROGRAM_ERROR);
    509    }
    510    U_ASSERT((norm16&1)==0);
    511    if(norm.hasCompBoundaryAfter) {
    512        norm16|=Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER;
    513    }
    514    IcuToolErrorCode errorCode("gennorm2/writeNorm16()");
    515    umutablecptrie_setRange(norm16Trie, start, end, static_cast<uint32_t>(norm16), errorCode);
    516 
    517    // Set the minimum code points for real data lookups in the quick check loops.
    518    UBool isDecompNo=
    519            (Norm::YES_NO_COMBINES_FWD<=norm.type && norm.type<=Norm::NO_NO_DELTA) ||
    520            norm.cc!=0;
    521    if(isDecompNo && start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
    522        indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=start;
    523    }
    524    UBool isCompNoMaybe= norm.type>=Norm::NO_NO_COMP_YES;
    525    if(isCompNoMaybe && start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) {
    526        indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=start;
    527    }
    528    if(norm.leadCC!=0 && start<indexes[Normalizer2Impl::IX_MIN_LCCC_CP]) {
    529        indexes[Normalizer2Impl::IX_MIN_LCCC_CP]=start;
    530    }
    531 }
    532 
    533 void Normalizer2DataBuilder::setHangulData(UMutableCPTrie *norm16Trie) {
    534    HangulIterator hi;
    535    const HangulIterator::Range *range;
    536    // Check that none of the Hangul/Jamo code points have data.
    537    while((range=hi.nextRange())!=nullptr) {
    538        for(UChar32 c=range->start; c<=range->end; ++c) {
    539            if(umutablecptrie_get(norm16Trie, c)>Normalizer2Impl::INERT) {
    540                fprintf(stderr,
    541                        "gennorm2 error: "
    542                        "illegal mapping/composition/ccc data for Hangul or Jamo U+%04lX\n",
    543                        static_cast<long>(c));
    544                exit(U_INVALID_FORMAT_ERROR);
    545            }
    546        }
    547    }
    548    // Set data for algorithmic runtime handling.
    549    IcuToolErrorCode errorCode("gennorm2/setHangulData()");
    550 
    551    // Jamo V/T are maybeYes
    552    if(Hangul::JAMO_V_BASE<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) {
    553        indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=Hangul::JAMO_V_BASE;
    554    }
    555    umutablecptrie_setRange(norm16Trie, Hangul::JAMO_L_BASE, Hangul::JAMO_L_END,
    556                            Normalizer2Impl::JAMO_L, errorCode);
    557    umutablecptrie_setRange(norm16Trie, Hangul::JAMO_V_BASE, Hangul::JAMO_V_END,
    558                            Normalizer2Impl::JAMO_VT, errorCode);
    559    // JAMO_T_BASE+1: not U+11A7
    560    umutablecptrie_setRange(norm16Trie, Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_END,
    561                            Normalizer2Impl::JAMO_VT, errorCode);
    562 
    563    // Hangul LV encoded as minYesNo
    564    uint32_t lv=indexes[Normalizer2Impl::IX_MIN_YES_NO];
    565    // Hangul LVT encoded as minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER
    566    uint32_t lvt=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]|
    567        Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER;
    568    if(Hangul::HANGUL_BASE<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
    569        indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=Hangul::HANGUL_BASE;
    570    }
    571    // Set the first LV, then write all other Hangul syllables as LVT,
    572    // then overwrite the remaining LV.
    573    umutablecptrie_set(norm16Trie, Hangul::HANGUL_BASE, lv, errorCode);
    574    umutablecptrie_setRange(norm16Trie, Hangul::HANGUL_BASE+1, Hangul::HANGUL_END, lvt, errorCode);
    575    UChar32 c=Hangul::HANGUL_BASE;
    576    while((c+=Hangul::JAMO_T_COUNT)<=Hangul::HANGUL_END) {
    577        umutablecptrie_set(norm16Trie, c, lv, errorCode);
    578    }
    579    errorCode.assertSuccess();
    580 }
    581 
    582 LocalUCPTriePointer Normalizer2DataBuilder::processData() {
    583    // Build composition lists before recursive decomposition,
    584    // so that we still have the raw, pair-wise mappings.
    585    CompositionBuilder compBuilder(norms);
    586    norms.enumRanges(compBuilder);
    587 
    588    // Recursively decompose all mappings.
    589    Decomposer decomposer(norms);
    590    do {
    591        decomposer.didDecompose=false;
    592        norms.enumRanges(decomposer);
    593    } while(decomposer.didDecompose);
    594 
    595    // Set the Norm::Type and other properties.
    596    int32_t normsLength=norms.length();
    597    for(int32_t i=1; i<normsLength; ++i) {
    598        postProcess(norms.getNormRefByIndex(i));
    599    }
    600 
    601    // Write the properties, mappings and composition lists to
    602    // appropriate parts of the "extra data" array.
    603    ExtraData extra(norms, optimization==OPTIMIZE_FAST);
    604    norms.enumRanges(extra);
    605 
    606    extraData=extra.yesYesCompositions;
    607    indexes[Normalizer2Impl::IX_MIN_YES_NO]=extraData.length()*2;
    608    extraData.append(extra.yesNoMappingsAndCompositions);
    609    indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]=extraData.length()*2;
    610    extraData.append(extra.yesNoMappingsOnly);
    611    indexes[Normalizer2Impl::IX_MIN_NO_NO]=extraData.length()*2;
    612    extraData.append(extra.noNoMappingsCompYes);
    613    indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]=extraData.length()*2;
    614    extraData.append(extra.noNoMappingsCompBoundaryBefore);
    615    indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_NO_MAYBE_CC]=extraData.length()*2;
    616    extraData.append(extra.noNoMappingsCompNoMaybeCC);
    617    indexes[Normalizer2Impl::IX_MIN_NO_NO_EMPTY]=extraData.length()*2;
    618    extraData.append(extra.noNoMappingsEmpty);
    619    indexes[Normalizer2Impl::IX_LIMIT_NO_NO]=extraData.length()*2;
    620 
    621    int32_t maybeDataLength=
    622        extra.maybeNoMappingsOnly.length()+
    623        extra.maybeNoMappingsAndCompositions.length()+
    624        extra.maybeYesCompositions.length();
    625    int32_t minMaybeNo=Normalizer2Impl::MIN_NORMAL_MAYBE_YES-maybeDataLength*2;
    626    // Adjust minMaybeNo down to 8-align it,
    627    // so that NO_NO_DELTA bits 2..1 can be used without subtracting the center.
    628    minMaybeNo&=~7;
    629 
    630    int32_t index=minMaybeNo;
    631    indexes[Normalizer2Impl::IX_MIN_MAYBE_NO]=index;
    632    extraData.append(extra.maybeNoMappingsOnly);
    633    index+=extra.maybeNoMappingsOnly.length()*2;
    634    indexes[Normalizer2Impl::IX_MIN_MAYBE_NO_COMBINES_FWD]=index;
    635    extraData.append(extra.maybeNoMappingsAndCompositions);
    636    index+=extra.maybeNoMappingsAndCompositions.length()*2;
    637    indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]=index;
    638    extraData.append(extra.maybeYesCompositions);
    639 
    640    // Pad the extraData to even length for 4-byte alignment of following data.
    641    if(extraData.length()&1) {
    642        extraData.append(static_cast<char16_t>(0));
    643    }
    644 
    645    int32_t minNoNoDelta=getMinNoNoDelta();
    646    U_ASSERT((minNoNoDelta&7)==0);
    647    if(indexes[Normalizer2Impl::IX_LIMIT_NO_NO]>minNoNoDelta) {
    648        fprintf(stderr,
    649                "gennorm2 error: "
    650                "data structure overflow, too much mapping composition data\n");
    651        exit(U_BUFFER_OVERFLOW_ERROR);
    652    }
    653 
    654    // writeNorm16() and setHangulData() reduce these as needed.
    655    indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=0x110000;
    656    indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=0x110000;
    657    indexes[Normalizer2Impl::IX_MIN_LCCC_CP]=0x110000;
    658 
    659    IcuToolErrorCode errorCode("gennorm2/processData()");
    660    UMutableCPTrie *norm16Trie = umutablecptrie_open(
    661        Normalizer2Impl::INERT, Normalizer2Impl::INERT, errorCode);
    662    errorCode.assertSuccess();
    663 
    664    // Map each code point to its norm16 value,
    665    // including the properties that fit directly,
    666    // and the offset to the "extra data" if necessary.
    667    Norm16Writer norm16Writer(norm16Trie, norms, *this);
    668    norms.enumRanges(norm16Writer);
    669    // TODO: iterate via getRange() instead of callback?
    670 
    671    setHangulData(norm16Trie);
    672 
    673    // Look for the "worst" norm16 value of any supplementary code point
    674    // corresponding to a lead surrogate, and set it as that surrogate's value.
    675    // Enables UTF-16 quick check inner loops to look at only code units.
    676    //
    677    // We could be more sophisticated:
    678    // We could collect a bit set for whether there are values in the different
    679    // norm16 ranges (yesNo, maybeYes, yesYesWithCC etc.)
    680    // and select the best value that only breaks the composition and/or decomposition
    681    // inner loops if necessary.
    682    // However, that seems like overkill for an optimization for supplementary characters.
    683    //
    684    // First check that surrogate code *points* are inert.
    685    // The parser should have rejected values/mappings for them.
    686    uint32_t value;
    687    UChar32 end = umutablecptrie_getRange(norm16Trie, 0xd800, UCPMAP_RANGE_NORMAL, 0,
    688                                          nullptr, nullptr, &value);
    689    if (value != Normalizer2Impl::INERT || end < 0xdfff) {
    690        fprintf(stderr,
    691                "gennorm2 error: not all surrogate code points are inert: U+d800..U+%04x=%lx\n",
    692                static_cast<int>(end), static_cast<long>(value));
    693        exit(U_INTERNAL_PROGRAM_ERROR);
    694    }
    695    uint32_t maxNorm16 = 0;
    696    // ANDing values yields 0 bits where any value has a 0.
    697    // Used for worst-case HAS_COMP_BOUNDARY_AFTER.
    698    uint32_t andedNorm16 = 0;
    699    end = 0;
    700    for (UChar32 start = 0x10000;;) {
    701        if (start > end) {
    702            end = umutablecptrie_getRange(norm16Trie, start, UCPMAP_RANGE_NORMAL, 0,
    703                                          nullptr, nullptr, &value);
    704            if (end < 0) { break; }
    705        }
    706        if ((start & 0x3ff) == 0) {
    707            // Data for a new lead surrogate.
    708            maxNorm16 = andedNorm16 = value;
    709        } else {
    710            if (value > maxNorm16) {
    711                maxNorm16 = value;
    712            }
    713            andedNorm16 &= value;
    714        }
    715        // Intersect each range with the code points for one lead surrogate.
    716        UChar32 leadEnd = start | 0x3ff;
    717        if (leadEnd <= end) {
    718            // End of the supplementary block for a lead surrogate.
    719            if (maxNorm16 >= static_cast<uint32_t>(indexes[Normalizer2Impl::IX_LIMIT_NO_NO])) {
    720                // Set noNo ("worst" value) if it got into "less-bad" maybeYes or ccc!=0.
    721                // Otherwise it might end up at something like JAMO_VT which stays in
    722                // the inner decomposition quick check loop.
    723                maxNorm16 = static_cast<uint32_t>(indexes[Normalizer2Impl::IX_LIMIT_NO_NO]);
    724            }
    725            maxNorm16 =
    726                (maxNorm16 & ~Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER)|
    727                (andedNorm16 & Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER);
    728            if (maxNorm16 != Normalizer2Impl::INERT) {
    729                umutablecptrie_set(norm16Trie, U16_LEAD(start), maxNorm16, errorCode);
    730            }
    731            if (value == Normalizer2Impl::INERT) {
    732                // Potentially skip inert supplementary blocks for several lead surrogates.
    733                start = (end + 1) & ~0x3ff;
    734            } else {
    735                start = leadEnd + 1;
    736            }
    737        } else {
    738            start = end + 1;
    739        }
    740    }
    741 
    742    // Adjust supplementary minimum code points to break quick check loops at their lead surrogates.
    743    // For an empty data file, minCP=0x110000 turns into 0xdc00 (first trail surrogate)
    744    // which is harmless.
    745    // As a result, the minimum code points are always BMP code points.
    746    int32_t minCP=indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP];
    747    if(minCP>=0x10000) {
    748        indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=U16_LEAD(minCP);
    749    }
    750    minCP=indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP];
    751    if(minCP>=0x10000) {
    752        indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=U16_LEAD(minCP);
    753    }
    754    minCP=indexes[Normalizer2Impl::IX_MIN_LCCC_CP];
    755    if(minCP>=0x10000) {
    756        indexes[Normalizer2Impl::IX_MIN_LCCC_CP]=U16_LEAD(minCP);
    757    }
    758 
    759    LocalUCPTriePointer builtTrie(
    760        umutablecptrie_buildImmutable(norm16Trie, UCPTRIE_TYPE_FAST, UCPTRIE_VALUE_BITS_16, errorCode));
    761    norm16TrieLength=ucptrie_toBinary(builtTrie.getAlias(), nullptr, 0, errorCode);
    762    if(errorCode.get()!=U_BUFFER_OVERFLOW_ERROR) {
    763        fprintf(stderr, "gennorm2 error: unable to build/serialize the normalization trie - %s\n",
    764                errorCode.errorName());
    765        exit(errorCode.reset());
    766    }
    767    umutablecptrie_close(norm16Trie);
    768    errorCode.reset();
    769    norm16TrieBytes=new uint8_t[norm16TrieLength];
    770    ucptrie_toBinary(builtTrie.getAlias(), norm16TrieBytes, norm16TrieLength, errorCode);
    771    errorCode.assertSuccess();
    772 
    773    int32_t offset = static_cast<int32_t>(sizeof(indexes));
    774    indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]=offset;
    775    offset+=norm16TrieLength;
    776    indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]=offset;
    777    offset+=extraData.length()*2;
    778    indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET]=offset;
    779    offset+=sizeof(smallFCD);
    780    int32_t totalSize=offset;
    781    for(int32_t i=Normalizer2Impl::IX_RESERVED3_OFFSET; i<=Normalizer2Impl::IX_TOTAL_SIZE; ++i) {
    782        indexes[i]=totalSize;
    783    }
    784 
    785    if(beVerbose) {
    786        printf("size of normalization trie:         %5ld bytes\n", static_cast<long>(norm16TrieLength));
    787        printf("size of 16-bit extra data:          %5ld uint16_t\n", static_cast<long>(extraData.length()));
    788        printf("size of small-FCD data:             %5ld bytes\n", static_cast<long>(sizeof(smallFCD)));
    789        printf("size of binary data file contents:  %5ld bytes\n", static_cast<long>(totalSize));
    790        printf("minDecompNoCodePoint:              U+%04lX\n",
    791               static_cast<long>(indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]));
    792        printf("minCompNoMaybeCodePoint:           U+%04lX\n",
    793               static_cast<long>(indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]));
    794        printf("minLcccCodePoint:                  U+%04lX\n",
    795               static_cast<long>(indexes[Normalizer2Impl::IX_MIN_LCCC_CP]));
    796        printf("minYesNo: (with compositions)      0x%04x\n",
    797               static_cast<int>(indexes[Normalizer2Impl::IX_MIN_YES_NO]));
    798        printf("minYesNoMappingsOnly:              0x%04x\n",
    799               static_cast<int>(indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]));
    800        printf("minNoNo: (comp-normalized)         0x%04x\n",
    801               static_cast<int>(indexes[Normalizer2Impl::IX_MIN_NO_NO]));
    802        printf("minNoNoCompBoundaryBefore:         0x%04x\n",
    803               static_cast<int>(indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]));
    804        printf("minNoNoCompNoMaybeCC:              0x%04x\n",
    805               static_cast<int>(indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_NO_MAYBE_CC]));
    806        printf("minNoNoEmpty:                      0x%04x\n",
    807               static_cast<int>(indexes[Normalizer2Impl::IX_MIN_NO_NO_EMPTY]));
    808        printf("limitNoNo:                         0x%04x\n",
    809               static_cast<int>(indexes[Normalizer2Impl::IX_LIMIT_NO_NO]));
    810        printf("minNoNoDelta:                      0x%04x\n",
    811               static_cast<int>(minNoNoDelta));
    812        printf("minMaybeNo:                        0x%04x\n",
    813               static_cast<int>(indexes[Normalizer2Impl::IX_MIN_MAYBE_NO]));
    814        printf("minMaybeNoCombinesFwd:             0x%04x\n",
    815               static_cast<int>(indexes[Normalizer2Impl::IX_MIN_MAYBE_NO_COMBINES_FWD]));
    816        printf("minMaybeYes:                       0x%04x\n",
    817               static_cast<int>(indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]));
    818    }
    819 
    820    UVersionInfo nullVersion={ 0, 0, 0, 0 };
    821    if(0==memcmp(nullVersion, unicodeVersion, 4)) {
    822        u_versionFromString(unicodeVersion, U_UNICODE_VERSION);
    823    }
    824    memcpy(dataInfo.dataVersion, unicodeVersion, 4);
    825    return builtTrie;
    826 }
    827 
    828 void Normalizer2DataBuilder::writeBinaryFile(const char *filename) {
    829    processData();
    830 
    831    IcuToolErrorCode errorCode("gennorm2/writeBinaryFile()");
    832    UNewDataMemory *pData=
    833        udata_create(nullptr, nullptr, filename, &dataInfo,
    834                     haveCopyright ? U_COPYRIGHT_STRING : nullptr, errorCode);
    835    if(errorCode.isFailure()) {
    836        fprintf(stderr, "gennorm2 error: unable to create the output file %s - %s\n",
    837                filename, errorCode.errorName());
    838        exit(errorCode.reset());
    839    }
    840    udata_writeBlock(pData, indexes, sizeof(indexes));
    841    udata_writeBlock(pData, norm16TrieBytes, norm16TrieLength);
    842    udata_writeUString(pData, toUCharPtr(extraData.getBuffer()), extraData.length());
    843    udata_writeBlock(pData, smallFCD, sizeof(smallFCD));
    844    int32_t writtenSize=udata_finish(pData, errorCode);
    845    if(errorCode.isFailure()) {
    846        fprintf(stderr, "gennorm2: error %s writing the output file\n", errorCode.errorName());
    847        exit(errorCode.reset());
    848    }
    849    int32_t totalSize=indexes[Normalizer2Impl::IX_TOTAL_SIZE];
    850    if(writtenSize!=totalSize) {
    851        fprintf(stderr, "gennorm2 error: written size %ld != calculated size %ld\n",
    852            static_cast<long>(writtenSize), static_cast<long>(totalSize));
    853        exit(U_INTERNAL_PROGRAM_ERROR);
    854    }
    855 }
    856 
    857 void
    858 Normalizer2DataBuilder::writeCSourceFile(const char *filename) {
    859    LocalUCPTriePointer norm16Trie = processData();
    860 
    861    IcuToolErrorCode errorCode("gennorm2/writeCSourceFile()");
    862    const char *basename=findBasename(filename);
    863    CharString path(filename, static_cast<int32_t>(basename - filename), errorCode);
    864    CharString dataName(basename, errorCode);
    865    const char *extension=strrchr(basename, '.');
    866    if(extension!=nullptr) {
    867        dataName.truncate(static_cast<int32_t>(extension - basename));
    868    }
    869    const char *name=dataName.data();
    870    errorCode.assertSuccess();
    871 
    872    FILE *f=usrc_create(path.data(), basename, 2016, "icu/source/tools/gennorm2/n2builder.cpp");
    873    if(f==nullptr) {
    874        fprintf(stderr, "gennorm2/writeCSourceFile() error: unable to create the output file %s\n",
    875                filename);
    876        exit(U_FILE_ACCESS_ERROR);
    877    }
    878    fputs("#ifdef INCLUDED_FROM_NORMALIZER2_CPP\n\n", f);
    879 
    880    char line[100];
    881    snprintf(line, sizeof(line), "static const UVersionInfo %s_formatVersion={", name);
    882    usrc_writeArray(f, line, dataInfo.formatVersion, 8, 4, "", "};\n");
    883    snprintf(line, sizeof(line), "static const UVersionInfo %s_dataVersion={", name);
    884    usrc_writeArray(f, line, dataInfo.dataVersion, 8, 4, "", "};\n\n");
    885    snprintf(line, sizeof(line), "static const int32_t %s_indexes[Normalizer2Impl::IX_COUNT]={\n", name);
    886    usrc_writeArray(f, line, indexes, 32, Normalizer2Impl::IX_COUNT, "", "\n};\n\n");
    887 
    888    usrc_writeUCPTrie(f, name, norm16Trie.getAlias(), UPRV_TARGET_SYNTAX_CCODE);
    889 
    890    snprintf(line, sizeof(line), "static const uint16_t %s_extraData[%%ld]={\n", name);
    891    usrc_writeArray(f, line, extraData.getBuffer(), 16, extraData.length(), "", "\n};\n\n");
    892    snprintf(line, sizeof(line), "static const uint8_t %s_smallFCD[%%ld]={\n", name);
    893    usrc_writeArray(f, line, smallFCD, 8, sizeof(smallFCD), "", "\n};\n\n");
    894 
    895    fputs("#endif  // INCLUDED_FROM_NORMALIZER2_CPP\n", f);
    896    fclose(f);
    897 }
    898 
    899 namespace {
    900 
    901 bool equalStrings(const UnicodeString *s1, const UnicodeString *s2) {
    902    if(s1 == nullptr) {
    903        return s2 == nullptr;
    904    } else if(s2 == nullptr) {
    905        return false;
    906    } else {
    907        return *s1 == *s2;
    908    }
    909 }
    910 
    911 const char *typeChars = "?-=>";
    912 
    913 void writeMapping(FILE *f, const UnicodeString *m) {
    914    if(m != nullptr && !m->isEmpty()) {
    915        int32_t i = 0;
    916        UChar32 c = m->char32At(i);
    917        fprintf(f, "%04lX", static_cast<long>(c));
    918        while((i += U16_LENGTH(c)) < m->length()) {
    919            c = m->char32At(i);
    920            fprintf(f, " %04lX", static_cast<long>(c));
    921        }
    922    }
    923    fputs("\n", f);
    924 }
    925 
    926 }  // namespace
    927 
    928 void
    929 Normalizer2DataBuilder::writeDataFile(const char *filename, bool writeRemoved) const {
    930    // Do not processData() before writing the input-syntax data file.
    931    FILE *f = fopen(filename, "w");
    932    if(f == nullptr) {
    933        fprintf(stderr, "gennorm2/writeDataFile() error: unable to create the output file %s\n",
    934                filename);
    935        exit(U_FILE_ACCESS_ERROR);
    936        return;
    937    }
    938 
    939    if(unicodeVersion[0] != 0 || unicodeVersion[1] != 0 ||
    940            unicodeVersion[2] != 0 || unicodeVersion[3] != 0) {
    941        char uv[U_MAX_VERSION_STRING_LENGTH];
    942        u_versionToString(unicodeVersion, uv);
    943        fprintf(f, "* Unicode %s\n\n", uv);
    944    }
    945 
    946    UnicodeSetIterator ccIter(norms.ccSet);
    947    UChar32 start = U_SENTINEL;
    948    UChar32 end = U_SENTINEL;
    949    uint8_t prevCC = 0;
    950    bool done = false;
    951    bool didWrite = false;
    952    do {
    953        UChar32 c;
    954        uint8_t cc;
    955        if(ccIter.next() && !ccIter.isString()) {
    956            c = ccIter.getCodepoint();
    957            cc = norms.getCC(c);
    958        } else {
    959            c = 0x110000;
    960            cc = 0;
    961            done = true;
    962        }
    963        if(cc == prevCC && c == (end + 1)) {
    964            end = c;
    965        } else {
    966            if(prevCC != 0) {
    967                if(start == end) {
    968                    fprintf(f, "%04lX:%d\n", static_cast<long>(start), static_cast<int>(prevCC));
    969                } else {
    970                    fprintf(f, "%04lX..%04lX:%d\n", static_cast<long>(start), static_cast<long>(end), static_cast<int>(prevCC));
    971                }
    972                didWrite = true;
    973            }
    974            start = end = c;
    975            prevCC = cc;
    976        }
    977    } while(!done);
    978    if(didWrite) {
    979        fputs("\n", f);
    980    }
    981 
    982    UnicodeSetIterator mIter(norms.mappingSet);
    983    start = U_SENTINEL;
    984    end = U_SENTINEL;
    985    const UnicodeString *prevMapping = nullptr;
    986    Norm::MappingType prevType = Norm::NONE;
    987    done = false;
    988    do {
    989        UChar32 c;
    990        const Norm *norm;
    991        if(mIter.next() && !mIter.isString()) {
    992            c = mIter.getCodepoint();
    993            norm = norms.getNorm(c);
    994        } else {
    995            c = 0x110000;
    996            norm = nullptr;
    997            done = true;
    998        }
    999        const UnicodeString *mapping;
   1000        Norm::MappingType type;
   1001        if(norm == nullptr) {
   1002            mapping = nullptr;
   1003            type = Norm::NONE;
   1004        } else {
   1005            type = norm->mappingType;
   1006            if(type == Norm::NONE) {
   1007                mapping = nullptr;
   1008            } else {
   1009                mapping = norm->mapping;
   1010            }
   1011        }
   1012        if(type == prevType && equalStrings(mapping, prevMapping) && c == (end + 1)) {
   1013            end = c;
   1014        } else {
   1015            if(writeRemoved ? prevType != Norm::NONE : prevType > Norm::REMOVED) {
   1016                if(start == end) {
   1017                    fprintf(f, "%04lX%c", static_cast<long>(start), typeChars[prevType]);
   1018                } else {
   1019                    fprintf(f, "%04lX..%04lX%c", static_cast<long>(start), static_cast<long>(end), typeChars[prevType]);
   1020                }
   1021                writeMapping(f, prevMapping);
   1022            }
   1023            start = end = c;
   1024            prevMapping = mapping;
   1025            prevType = type;
   1026        }
   1027    } while(!done);
   1028 
   1029    fclose(f);
   1030 }
   1031 
   1032 void
   1033 Normalizer2DataBuilder::computeDiff(const Normalizer2DataBuilder &b1,
   1034                                    const Normalizer2DataBuilder &b2,
   1035                                    Normalizer2DataBuilder &diff) {
   1036    // Compute diff = b1 - b2
   1037    // so that we should be able to get b1 = b2 + diff.
   1038    if(0 != memcmp(b1.unicodeVersion, b2.unicodeVersion, U_MAX_VERSION_LENGTH)) {
   1039        memcpy(diff.unicodeVersion, b1.unicodeVersion, U_MAX_VERSION_LENGTH);
   1040    }
   1041 
   1042    UnicodeSet ccSet(b1.norms.ccSet);
   1043    ccSet.addAll(b2.norms.ccSet);
   1044    UnicodeSetIterator ccIter(ccSet);
   1045    while(ccIter.next() && !ccIter.isString()) {
   1046        UChar32 c = ccIter.getCodepoint();
   1047        uint8_t cc1 = b1.norms.getCC(c);
   1048        uint8_t cc2 = b2.norms.getCC(c);
   1049        if(cc1 != cc2) {
   1050            diff.setCC(c, cc1);
   1051        }
   1052    }
   1053 
   1054    UnicodeSet mSet(b1.norms.mappingSet);
   1055    mSet.addAll(b2.norms.mappingSet);
   1056    UnicodeSetIterator mIter(mSet);
   1057    while(mIter.next() && !mIter.isString()) {
   1058        UChar32 c = mIter.getCodepoint();
   1059        const Norm *norm1 = b1.norms.getNorm(c);
   1060        const Norm *norm2 = b2.norms.getNorm(c);
   1061        const UnicodeString *mapping1;
   1062        Norm::MappingType type1;
   1063        if(norm1 == nullptr || !norm1->hasMapping()) {
   1064            mapping1 = nullptr;
   1065            type1 = Norm::NONE;
   1066        } else {
   1067            mapping1 = norm1->mapping;
   1068            type1 = norm1->mappingType;
   1069        }
   1070        const UnicodeString *mapping2;
   1071        Norm::MappingType type2;
   1072        if(norm2 == nullptr || !norm2->hasMapping()) {
   1073            mapping2 = nullptr;
   1074            type2 = Norm::NONE;
   1075        } else {
   1076            mapping2 = norm2->mapping;
   1077            type2 = norm2->mappingType;
   1078        }
   1079        if(type1 == type2 && equalStrings(mapping1, mapping2)) {
   1080            // Nothing to do.
   1081        } else if(type1 == Norm::NONE) {
   1082            diff.removeMapping(c);
   1083        } else if(type1 == Norm::ROUND_TRIP) {
   1084            diff.setRoundTripMapping(c, *mapping1);
   1085        } else if(type1 == Norm::ONE_WAY) {
   1086            diff.setOneWayMapping(c, *mapping1);
   1087        }
   1088    }
   1089 }
   1090 
   1091 U_NAMESPACE_END
   1092 
   1093 #endif /* #if !UCONFIG_NO_NORMALIZATION */
   1094 
   1095 /*
   1096 * Hey, Emacs, please set the following:
   1097 *
   1098 * Local Variables:
   1099 * indent-tabs-mode: nil
   1100 * End:
   1101 */
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE