tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

norms.cpp (11676B)


      1 // © 2017 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 
      4 // norms.cpp
      5 // created: 2017jun04 Markus W. Scherer
      6 // (pulled out of n2builder.cpp)
      7 
      8 #include "unicode/utypes.h"
      9 
     10 #if !UCONFIG_NO_NORMALIZATION
     11 
     12 #include <stdio.h>
     13 #include <stdlib.h>
     14 #include "unicode/errorcode.h"
     15 #include "unicode/umutablecptrie.h"
     16 #include "unicode/unistr.h"
     17 #include "unicode/utf16.h"
     18 #include "normalizer2impl.h"
     19 #include "norms.h"
     20 #include "toolutil.h"
     21 #include "uvectr32.h"
     22 
     23 U_NAMESPACE_BEGIN
     24 
     25 void BuilderReorderingBuffer::append(UChar32 c, uint8_t cc) {
     26    if(cc==0 || fLength==0 || ccAt(fLength-1)<=cc) {
     27        if(cc==0) {
     28            fLastStarterIndex=fLength;
     29        }
     30        fArray[fLength++]=(c<<8)|cc;
     31        return;
     32    }
     33    // Let this character bubble back to its canonical order.
     34    int32_t i=fLength-1;
     35    while(i>fLastStarterIndex && ccAt(i)>cc) {
     36        --i;
     37    }
     38    ++i;  // after the last starter or prevCC<=cc
     39    // Move this and the following characters forward one to make space.
     40    for(int32_t j=fLength; i<j; --j) {
     41        fArray[j]=fArray[j-1];
     42    }
     43    fArray[i]=(c<<8)|cc;
     44    ++fLength;
     45    fDidReorder=true;
     46 }
     47 
     48 void BuilderReorderingBuffer::toString(UnicodeString &dest) const {
     49    dest.remove();
     50    for(int32_t i=0; i<fLength; ++i) {
     51        dest.append(charAt(i));
     52    }
     53 }
     54 
     55 UChar32 Norm::combine(UChar32 trail) const {
     56    int32_t length;
     57    const CompositionPair *pairs=getCompositionPairs(length);
     58    for(int32_t i=0; i<length; ++i) {
     59        if(trail==pairs[i].trail) {
     60            return pairs[i].composite;
     61        }
     62        if(trail<pairs[i].trail) {
     63            break;
     64        }
     65    }
     66    return U_SENTINEL;
     67 }
     68 
     69 Norms::Norms(UErrorCode &errorCode) {
     70    normTrie = umutablecptrie_open(0, 0, &errorCode);
     71    normMem=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm));
     72    // Default "inert" Norm struct at index 0. Practically immutable.
     73    norms=allocNorm();
     74    norms->type=Norm::INERT;
     75 }
     76 
     77 Norms::~Norms() {
     78    umutablecptrie_close(normTrie);
     79    int32_t normsLength=utm_countItems(normMem);
     80    for(int32_t i=1; i<normsLength; ++i) {
     81        delete norms[i].mapping;
     82        delete norms[i].rawMapping;
     83        delete norms[i].compositions;
     84    }
     85    utm_close(normMem);
     86 }
     87 
     88 Norm *Norms::allocNorm() {
     89    Norm* p = static_cast<Norm*>(utm_alloc(normMem));
     90    norms = static_cast<Norm*>(utm_getStart(normMem)); // in case it got reallocated
     91    return p;
     92 }
     93 
     94 Norm *Norms::getNorm(UChar32 c) {
     95    uint32_t i = umutablecptrie_get(normTrie, c);
     96    if(i==0) {
     97        return nullptr;
     98    }
     99    return norms+i;
    100 }
    101 
    102 const Norm *Norms::getNorm(UChar32 c) const {
    103    uint32_t i = umutablecptrie_get(normTrie, c);
    104    if(i==0) {
    105        return nullptr;
    106    }
    107    return norms+i;
    108 }
    109 
    110 const Norm &Norms::getNormRef(UChar32 c) const {
    111    return norms[umutablecptrie_get(normTrie, c)];
    112 }
    113 
    114 Norm *Norms::createNorm(UChar32 c) {
    115    uint32_t i=umutablecptrie_get(normTrie, c);
    116    if(i!=0) {
    117        return norms+i;
    118    } else {
    119        /* allocate Norm */
    120        Norm *p=allocNorm();
    121        IcuToolErrorCode errorCode("gennorm2/createNorm()");
    122        umutablecptrie_set(normTrie, c, static_cast<uint32_t>(p - norms), errorCode);
    123        return p;
    124    }
    125 }
    126 
    127 void Norms::reorder(UnicodeString &mapping, BuilderReorderingBuffer &buffer) const {
    128    int32_t length=mapping.length();
    129    U_ASSERT(length<=Normalizer2Impl::MAPPING_LENGTH_MASK);
    130    const char16_t *s=mapping.getBuffer();
    131    int32_t i=0;
    132    UChar32 c;
    133    while(i<length) {
    134        U16_NEXT(s, i, length, c);
    135        buffer.append(c, getCC(c));
    136    }
    137    if(buffer.didReorder()) {
    138        buffer.toString(mapping);
    139    }
    140 }
    141 
    142 UBool Norms::combinesWithCCBetween(const Norm &norm, uint8_t lowCC, int32_t highCC) const {
    143    if((highCC-lowCC)>=2) {
    144        int32_t length;
    145        const CompositionPair *pairs=norm.getCompositionPairs(length);
    146        for(int32_t i=0; i<length; ++i) {
    147            uint8_t trailCC=getCC(pairs[i].trail);
    148            if(lowCC<trailCC && trailCC<highCC) {
    149                return true;
    150            }
    151        }
    152    }
    153    return false;
    154 }
    155 
    156 void Norms::enumRanges(Enumerator &e) {
    157    UChar32 start = 0, end;
    158    uint32_t i;
    159    while ((end = umutablecptrie_getRange(normTrie, start, UCPMAP_RANGE_NORMAL, 0,
    160                                          nullptr, nullptr, &i)) >= 0) {
    161        if (i > 0) {
    162            e.rangeHandler(start, end, norms[i]);
    163        }
    164        start = end + 1;
    165    }
    166 }
    167 
    168 Norms::Enumerator::~Enumerator() {}
    169 
    170 void CompositionBuilder::rangeHandler(UChar32 start, UChar32 end, Norm &norm) {
    171    if(norm.mappingType!=Norm::ROUND_TRIP) { return; }
    172    if(start!=end) {
    173        fprintf(stderr,
    174                "gennorm2 error: same round-trip mapping for "
    175                "more than 1 code point U+%04lX..U+%04lX\n",
    176                static_cast<long>(start), static_cast<long>(end));
    177        exit(U_INVALID_FORMAT_ERROR);
    178    }
    179    if(norm.cc!=0) {
    180        fprintf(stderr,
    181                "gennorm2 error: "
    182                "U+%04lX has a round-trip mapping and ccc!=0, "
    183                "not possible in Unicode normalization\n",
    184                static_cast<long>(start));
    185        exit(U_INVALID_FORMAT_ERROR);
    186    }
    187    // setRoundTripMapping() ensured that there are exactly two code points.
    188    const UnicodeString &m=*norm.mapping;
    189    UChar32 lead=m.char32At(0);
    190    UChar32 trail=m.char32At(m.length()-1);
    191    if(norms.getCC(lead)!=0) {
    192        fprintf(stderr,
    193                "gennorm2 error: "
    194                "U+%04lX's round-trip mapping's starter U+%04lX has ccc!=0, "
    195                "not possible in Unicode normalization\n",
    196                static_cast<long>(start), static_cast<long>(lead));
    197        exit(U_INVALID_FORMAT_ERROR);
    198    }
    199    // Flag for trailing character.
    200    norms.createNorm(trail)->combinesBack=true;
    201    // Insert (trail, composite) pair into compositions list for the lead character.
    202    IcuToolErrorCode errorCode("gennorm2/addComposition()");
    203    Norm *leadNorm=norms.createNorm(lead);
    204    UVector32 *compositions=leadNorm->compositions;
    205    int32_t i;
    206    if(compositions==nullptr) {
    207        compositions=leadNorm->compositions=new UVector32(errorCode);
    208        i=0;  // "insert" the first pair at index 0
    209    } else {
    210        // Insertion sort, and check for duplicate trail characters.
    211        int32_t length;
    212        const CompositionPair *pairs=leadNorm->getCompositionPairs(length);
    213        for(i=0; i<length; ++i) {
    214            if(trail==pairs[i].trail) {
    215                fprintf(stderr,
    216                        "gennorm2 error: same round-trip mapping for "
    217                        "more than 1 code point (e.g., U+%04lX) to U+%04lX + U+%04lX\n",
    218                        static_cast<long>(start), static_cast<long>(lead), static_cast<long>(trail));
    219                exit(U_INVALID_FORMAT_ERROR);
    220            }
    221            if(trail<pairs[i].trail) {
    222                break;
    223            }
    224        }
    225    }
    226    compositions->insertElementAt(trail, 2*i, errorCode);
    227    compositions->insertElementAt(start, 2*i+1, errorCode);
    228 }
    229 
    230 void Decomposer::rangeHandler(UChar32 start, UChar32 end, Norm &norm) {
    231    if(!norm.hasMapping()) { return; }
    232    const UnicodeString &m=*norm.mapping;
    233    UnicodeString *decomposed=nullptr;
    234    const char16_t *s=toUCharPtr(m.getBuffer());
    235    int32_t length=m.length();
    236    int32_t prev, i=0;
    237    UChar32 c;
    238    while(i<length) {
    239        prev=i;
    240        U16_NEXT(s, i, length, c);
    241        if(start<=c && c<=end) {
    242            fprintf(stderr,
    243                    "gennorm2 error: U+%04lX maps to itself directly or indirectly\n",
    244                    static_cast<long>(c));
    245            exit(U_INVALID_FORMAT_ERROR);
    246        }
    247        const Norm &cNorm=norms.getNormRef(c);
    248        if(norm.mappingType==Norm::ROUND_TRIP && prev==0 &&
    249                !norm.combinesBack && cNorm.combinesBack) {
    250            // If a two-way mapping starts with an NFC_QC=Maybe character,
    251            // then mark the composite as NFC_QC=Maybe as well,
    252            // so that we trigger decomposition and recomposition.
    253            norm.combinesBack=true;
    254            didDecompose|=true;
    255        }
    256        if(cNorm.hasMapping()) {
    257            if(norm.mappingType==Norm::ROUND_TRIP) {
    258                if(prev==0) {
    259                    if(cNorm.mappingType!=Norm::ROUND_TRIP) {
    260                        fprintf(stderr,
    261                                "gennorm2 error: "
    262                                "U+%04lX's round-trip mapping's starter "
    263                                "U+%04lX one-way-decomposes, "
    264                                "not possible in Unicode normalization\n",
    265                                static_cast<long>(start), static_cast<long>(c));
    266                        exit(U_INVALID_FORMAT_ERROR);
    267                    }
    268                    uint8_t myTrailCC=norms.getCC(m.char32At(i));
    269                    UChar32 cTrailChar=cNorm.mapping->char32At(cNorm.mapping->length()-1);
    270                    uint8_t cTrailCC=norms.getCC(cTrailChar);
    271                    if(cTrailCC>myTrailCC) {
    272                        fprintf(stderr,
    273                                "gennorm2 error: "
    274                                "U+%04lX's round-trip mapping's starter "
    275                                "U+%04lX decomposes and the "
    276                                "inner/earlier tccc=%hu > outer/following tccc=%hu, "
    277                                "not possible in Unicode normalization\n",
    278                                static_cast<long>(start), static_cast<long>(c),
    279                                static_cast<short>(cTrailCC), static_cast<short>(myTrailCC));
    280                        exit(U_INVALID_FORMAT_ERROR);
    281                    }
    282                } else {
    283                    fprintf(stderr,
    284                            "gennorm2 error: "
    285                            "U+%04lX's round-trip mapping's non-starter "
    286                            "U+%04lX decomposes, "
    287                            "not possible in Unicode normalization\n",
    288                            static_cast<long>(start), static_cast<long>(c));
    289                    exit(U_INVALID_FORMAT_ERROR);
    290                }
    291            }
    292            if(decomposed==nullptr) {
    293                decomposed=new UnicodeString(m, 0, prev);
    294            }
    295            decomposed->append(*cNorm.mapping);
    296        } else if(Hangul::isHangul(c)) {
    297            char16_t buffer[3];
    298            int32_t hangulLength=Hangul::decompose(c, buffer);
    299            if(norm.mappingType==Norm::ROUND_TRIP && prev!=0) {
    300                fprintf(stderr,
    301                        "gennorm2 error: "
    302                        "U+%04lX's round-trip mapping's non-starter "
    303                        "U+%04lX decomposes, "
    304                        "not possible in Unicode normalization\n",
    305                        static_cast<long>(start), static_cast<long>(c));
    306                exit(U_INVALID_FORMAT_ERROR);
    307            }
    308            if(decomposed==nullptr) {
    309                decomposed=new UnicodeString(m, 0, prev);
    310            }
    311            decomposed->append(buffer, hangulLength);
    312        } else if(decomposed!=nullptr) {
    313            decomposed->append(m, prev, i-prev);
    314        }
    315    }
    316    if(decomposed!=nullptr) {
    317        if(norm.rawMapping==nullptr) {
    318            // Remember the original mapping when decomposing recursively.
    319            norm.rawMapping=norm.mapping;
    320        } else {
    321            delete norm.mapping;
    322        }
    323        norm.mapping=decomposed;
    324        // Not  norm.setMappingCP();  because the original mapping
    325        // is most likely to be encodable as a delta.
    326        didDecompose|=true;
    327    }
    328 }
    329 
    330 U_NAMESPACE_END
    331 
    332 #endif // #if !UCONFIG_NO_NORMALIZATION