tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

extradata.cpp (10462B)


      1 // © 2017 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 
      4 // extradata.cpp
      5 // created: 2017jun04 Markus W. Scherer
      6 // (pulled out of n2builder.cpp)
      7 
      8 #include "unicode/utypes.h"
      9 
     10 #if !UCONFIG_NO_NORMALIZATION
     11 
     12 #include <stdio.h>
     13 #include <stdlib.h>
     14 #include "unicode/errorcode.h"
     15 #include "unicode/unistr.h"
     16 #include "unicode/utf16.h"
     17 #include "extradata.h"
     18 #include "normalizer2impl.h"
     19 #include "norms.h"
     20 #include "toolutil.h"
     21 #include "utrie2.h"
     22 #include "uvectr32.h"
     23 
     24 U_NAMESPACE_BEGIN
     25 
     26 ExtraData::ExtraData(Norms &n, UBool fast) :
     27        Norms::Enumerator(n),
     28        yesYesCompositions(1000, static_cast<UChar32>(0xffff), 2), // 0=inert, 1=Jamo L, 2=start of compositions
     29        yesNoMappingsAndCompositions(1000, static_cast<UChar32>(0), 1), // 0=Hangul LV, 1=start of normal data
     30        yesNoMappingsOnly(1000, static_cast<UChar32>(0), 1), // 0=Hangul LVT, 1=start of normal data
     31        optimizeFast(fast) {
     32    // Hangul LV algorithmically decomposes to two Jamo.
     33    // Some code may harmlessly read this firstUnit.
     34    yesNoMappingsAndCompositions.setCharAt(0, 2);
     35    // Hangul LVT algorithmically decomposes to three Jamo.
     36    // Some code may harmlessly read this firstUnit.
     37    yesNoMappingsOnly.setCharAt(0, 3);
     38 }
     39 
     40 int32_t ExtraData::writeMapping(UChar32 c, const Norm &norm, UnicodeString &dataString) {
     41    UnicodeString &m=*norm.mapping;
     42    int32_t length=m.length();
     43    // Write the mapping & raw mapping extraData.
     44    int32_t firstUnit=length|(norm.trailCC<<8);
     45    int32_t preMappingLength=0;
     46    if(norm.rawMapping!=nullptr) {
     47        UnicodeString &rm=*norm.rawMapping;
     48        int32_t rmLength=rm.length();
     49        if(rmLength>Normalizer2Impl::MAPPING_LENGTH_MASK) {
     50            fprintf(stderr,
     51                    "gennorm2 error: "
     52                    "raw mapping for U+%04lX longer than maximum of %d\n",
     53                    static_cast<long>(c), Normalizer2Impl::MAPPING_LENGTH_MASK);
     54            exit(U_INVALID_FORMAT_ERROR);
     55        }
     56        char16_t rm0=rm.charAt(0);
     57        if( rmLength==length-1 &&
     58            // 99: overlong substring lengths get pinned to remainder lengths anyway
     59            0==rm.compare(1, 99, m, 2, 99) &&
     60            rm0>Normalizer2Impl::MAPPING_LENGTH_MASK
     61        ) {
     62            // Compression:
     63            // rawMapping=rm0+mapping.substring(2) -> store only rm0
     64            //
     65            // The raw mapping is the same as the final mapping after replacing
     66            // the final mapping's first two code units with the raw mapping's first one.
     67            // In this case, we store only that first unit, rm0.
     68            // This helps with a few hundred mappings.
     69            dataString.append(rm0);
     70            preMappingLength=1;
     71        } else {
     72            // Store the raw mapping with its length.
     73            dataString.append(rm);
     74            dataString.append(static_cast<char16_t>(rmLength));
     75            preMappingLength=rmLength+1;
     76        }
     77        firstUnit|=Normalizer2Impl::MAPPING_HAS_RAW_MAPPING;
     78    }
     79    int32_t cccLccc=norm.cc|(norm.leadCC<<8);
     80    if(cccLccc!=0) {
     81        dataString.append(static_cast<char16_t>(cccLccc));
     82        ++preMappingLength;
     83        firstUnit|=Normalizer2Impl::MAPPING_HAS_CCC_LCCC_WORD;
     84    }
     85    dataString.append(static_cast<char16_t>(firstUnit));
     86    dataString.append(m);
     87    return preMappingLength;
     88 }
     89 
     90 int32_t ExtraData::writeNoNoMapping(UChar32 c, const Norm &norm,
     91                                    UnicodeString &dataString,
     92                                    Hashtable &previousMappings) {
     93    UnicodeString newMapping;
     94    int32_t offset=writeMapping(c, norm, newMapping);
     95    UBool found=false;
     96    int32_t previousOffset=previousMappings.getiAndFound(newMapping, found);
     97    if(found) {
     98        // Duplicate, point to the identical mapping that has already been stored.
     99        offset=previousOffset;
    100    } else {
    101        // Append this new mapping and
    102        // enter it into the hashtable, avoiding value 0 which is "not found".
    103        offset=dataString.length()+offset;
    104        dataString.append(newMapping);
    105        IcuToolErrorCode errorCode("gennorm2/writeExtraData()/Hashtable.putiAllowZero()");
    106        previousMappings.putiAllowZero(newMapping, offset, errorCode);
    107    }
    108    return offset;
    109 }
    110 
    111 UBool ExtraData::setNoNoDelta(UChar32 c, Norm &norm) const {
    112    // Try a compact, algorithmic encoding to a single compYesAndZeroCC code point.
    113    // Do not map from ASCII to non-ASCII.
    114    if(norm.mappingCP>=0 &&
    115            !(c<=0x7f && norm.mappingCP>0x7f) &&
    116            norms.getNormRef(norm.mappingCP).type<Norm::NO_NO_COMP_YES) {
    117        int32_t delta=norm.mappingCP-c;
    118        if(-Normalizer2Impl::MAX_DELTA<=delta && delta<=Normalizer2Impl::MAX_DELTA) {
    119            norm.type=Norm::NO_NO_DELTA;
    120            norm.offset=delta;
    121            return true;
    122        }
    123    }
    124    return false;
    125 }
    126 
    127 void ExtraData::writeCompositions(UChar32 c, const Norm &norm, UnicodeString &dataString) {
    128    if(norm.cc!=0) {
    129        fprintf(stderr,
    130                "gennorm2 error: "
    131                "U+%04lX combines-forward and has ccc!=0, not possible in Unicode normalization\n",
    132                static_cast<long>(c));
    133        exit(U_INVALID_FORMAT_ERROR);
    134    }
    135    int32_t length;
    136    const CompositionPair *pairs=norm.getCompositionPairs(length);
    137    for(int32_t i=0; i<length; ++i) {
    138        const CompositionPair &pair=pairs[i];
    139        // 22 bits for the composite character and whether it combines forward.
    140        UChar32 compositeAndFwd=pair.composite<<1;
    141        if(norms.getNormRef(pair.composite).combinesFwd()) {
    142            compositeAndFwd|=1;  // The composite character also combines-forward.
    143        }
    144        // Encode most pairs in two units and some in three.
    145        int32_t firstUnit, secondUnit, thirdUnit;
    146        if(pair.trail<Normalizer2Impl::COMP_1_TRAIL_LIMIT) {
    147            if(compositeAndFwd<=0xffff) {
    148                firstUnit=pair.trail<<1;
    149                secondUnit=compositeAndFwd;
    150                thirdUnit=-1;
    151            } else {
    152                firstUnit=(pair.trail<<1)|Normalizer2Impl::COMP_1_TRIPLE;
    153                secondUnit=compositeAndFwd>>16;
    154                thirdUnit=compositeAndFwd;
    155            }
    156        } else {
    157            firstUnit=(Normalizer2Impl::COMP_1_TRAIL_LIMIT+
    158                       (pair.trail>>Normalizer2Impl::COMP_1_TRAIL_SHIFT))|
    159                      Normalizer2Impl::COMP_1_TRIPLE;
    160            secondUnit=(pair.trail<<Normalizer2Impl::COMP_2_TRAIL_SHIFT)|
    161                       (compositeAndFwd>>16);
    162            thirdUnit=compositeAndFwd;
    163        }
    164        // Set the high bit of the first unit if this is the last composition pair.
    165        if(i==(length-1)) {
    166            firstUnit|=Normalizer2Impl::COMP_1_LAST_TUPLE;
    167        }
    168        dataString.append(static_cast<char16_t>(firstUnit)).append(static_cast<char16_t>(secondUnit));
    169        if(thirdUnit>=0) {
    170            dataString.append(static_cast<char16_t>(thirdUnit));
    171        }
    172    }
    173 }
    174 
    175 void ExtraData::rangeHandler(UChar32 start, UChar32 end, Norm &norm) {
    176    if(start!=end) {
    177        fprintf(stderr,
    178                "gennorm2 error: unexpected shared data for "
    179                "multiple code points U+%04lX..U+%04lX\n",
    180                static_cast<long>(start), static_cast<long>(end));
    181        exit(U_INTERNAL_PROGRAM_ERROR);
    182    }
    183    if(norm.error!=nullptr) {
    184        fprintf(stderr, "gennorm2 error: U+%04lX %s\n", static_cast<long>(start), norm.error);
    185        exit(U_INVALID_FORMAT_ERROR);
    186    }
    187    writeExtraData(start, norm);
    188 }
    189 
    190 //  Ticket #13342 - Disable optimizations on MSVC for this function as a workaround.
    191 #if (defined(_MSC_VER) && (_MSC_VER >= 1900) && defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 190024210))
    192 #pragma optimize( "", off )
    193 #endif
    194 
    195 void ExtraData::writeExtraData(UChar32 c, Norm &norm) {
    196    switch(norm.type) {
    197    case Norm::INERT:
    198        break;  // no extra data
    199    case Norm::YES_YES_COMBINES_FWD:
    200        norm.offset=yesYesCompositions.length();
    201        writeCompositions(c, norm, yesYesCompositions);
    202        break;
    203    case Norm::YES_NO_COMBINES_FWD:
    204        norm.offset=yesNoMappingsAndCompositions.length()+
    205                writeMapping(c, norm, yesNoMappingsAndCompositions);
    206        writeCompositions(c, norm, yesNoMappingsAndCompositions);
    207        break;
    208    case Norm::YES_NO_MAPPING_ONLY:
    209        norm.offset=yesNoMappingsOnly.length()+
    210                writeMapping(c, norm, yesNoMappingsOnly);
    211        break;
    212    case Norm::NO_NO_COMP_YES:
    213        if(!optimizeFast && setNoNoDelta(c, norm)) {
    214            break;
    215        }
    216        norm.offset=writeNoNoMapping(c, norm, noNoMappingsCompYes, previousNoNoMappingsCompYes);
    217        break;
    218    case Norm::NO_NO_COMP_BOUNDARY_BEFORE:
    219        if(!optimizeFast && setNoNoDelta(c, norm)) {
    220            break;
    221        }
    222        norm.offset=writeNoNoMapping(
    223            c, norm, noNoMappingsCompBoundaryBefore, previousNoNoMappingsCompBoundaryBefore);
    224        break;
    225    case Norm::NO_NO_COMP_NO_MAYBE_CC:
    226        norm.offset=writeNoNoMapping(
    227            c, norm, noNoMappingsCompNoMaybeCC, previousNoNoMappingsCompNoMaybeCC);
    228        break;
    229    case Norm::NO_NO_EMPTY:
    230        // There can be multiple extra data entries for mappings to the empty string
    231        // if they have different raw mappings.
    232        norm.offset=writeNoNoMapping(c, norm, noNoMappingsEmpty, previousNoNoMappingsEmpty);
    233        break;
    234    case Norm::MAYBE_NO_MAPPING_ONLY:
    235        norm.offset=maybeNoMappingsOnly.length()+
    236                writeMapping(c, norm, maybeNoMappingsOnly);
    237        break;
    238    case Norm::MAYBE_NO_COMBINES_FWD:
    239        norm.offset=maybeNoMappingsAndCompositions.length()+
    240                writeMapping(c, norm, maybeNoMappingsAndCompositions);
    241        writeCompositions(c, norm, maybeNoMappingsAndCompositions);
    242        break;
    243    case Norm::MAYBE_YES_COMBINES_FWD:
    244        norm.offset=maybeYesCompositions.length();
    245        writeCompositions(c, norm, maybeYesCompositions);
    246        break;
    247    case Norm::MAYBE_YES_SIMPLE:
    248        break;  // no extra data
    249    case Norm::YES_YES_WITH_CC:
    250        break;  // no extra data
    251    default:  // Should not occur.
    252        exit(U_INTERNAL_PROGRAM_ERROR);
    253    }
    254 }
    255 
    256 // Ticket #13342 - Turn optimization back on.
    257 #if (defined(_MSC_VER) && (_MSC_VER >= 1900) && defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 190024210))
    258 #pragma optimize( "", on )
    259 #endif
    260 
    261 U_NAMESPACE_END
    262 
    263 #endif // #if !UCONFIG_NO_NORMALIZATION