tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

collationfcd.h (4926B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 2012-2014, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 * collationfcd.h
      9 *
     10 * created on: 2012aug18
     11 * created by: Markus W. Scherer
     12 */
     13 
     14 #ifndef __COLLATIONFCD_H__
     15 #define __COLLATIONFCD_H__
     16 
     17 #include "unicode/utypes.h"
     18 
     19 #if !UCONFIG_NO_COLLATION
     20 
     21 #include "unicode/utf16.h"
     22 
     23 U_NAMESPACE_BEGIN
     24 
     25 /**
     26 * Data and functions for the FCD check fast path.
     27 *
     28 * The fast path looks at a pair of 16-bit code units and checks
     29 * whether there is an FCD boundary between them;
     30 * there is if the first unit has a trailing ccc=0 (!hasTccc(first))
     31 * or the second unit has a leading ccc=0 (!hasLccc(second)),
     32 * or both.
     33 * When the fast path finds a possible non-boundary,
     34 * then the FCD check slow path looks at the actual sequence of FCD values.
     35 *
     36 * This is a pure optimization.
     37 * The fast path must at least find all possible non-boundaries.
     38 * If the fast path is too pessimistic, it costs performance.
     39 *
     40 * For a pair of BMP characters, the fast path tests are precise (1 bit per character).
     41 *
     42 * For a supplementary code point, the two units are its lead and trail surrogates.
     43 * We set hasTccc(lead)=true if any of its 1024 associated supplementary code points
     44 * has lccc!=0 or tccc!=0.
     45 * We set hasLccc(trail)=true for all trail surrogates.
     46 * As a result, we leave the fast path if the lead surrogate might start a
     47 * supplementary code point that is not FCD-inert.
     48 * (So the fast path need not detect that there is a surrogate pair,
     49 * nor look ahead to the next full code point.)
     50 *
     51 * hasLccc(lead)=true if any of its 1024 associated supplementary code points
     52 * has lccc!=0, for fast boundary checking between BMP & supplementary.
     53 *
     54 * hasTccc(trail)=false:
     55 * It should only be tested for unpaired trail surrogates which are FCD-inert.
     56 */
     57 class U_I18N_API CollationFCD {
     58 public:
     59    /**
     60     * Tests the lccc!=0 bit of a single 16-bit code unit.
     61     * Precondition: c <= 0xffff. A negative c (e.g., U_SENTINEL from
     62     * UCharIterator) is rejected by the range test.
     63     */
     64    static inline UBool hasLccc(UChar32 c) {
     65        // No character below U+0300 has lccc!=0.
     66        if(c < 0x300) { return false; }
     67        // An index of 0 means that no unit in this block of 32 has its bit set.
     68        const int32_t row = lcccIndex[c >> 5];
     69        return row != 0 && ((lcccBits[row] >> (c & 0x1f)) & 1) != 0;
     70    }
     71 
     72    /**
     73     * Tests the tccc!=0 bit of a single 16-bit code unit.
     74     * Precondition: c <= 0xffff. A negative c (e.g., U_SENTINEL from
     75     * UCharIterator) is rejected by the range test.
     76     */
     77    static inline UBool hasTccc(UChar32 c) {
     78        // No character below U+00C0 has tccc!=0.
     79        if(c < 0xc0) { return false; }
     80        const int32_t row = tcccIndex[c >> 5];
     81        return row != 0 && ((tcccBits[row] >> (c & 0x1f)) & 1) != 0;
     82    }
     83 
     84    /**
     85     * Like hasLccc() but accepts all of Unicode 0..10FFFF;
     86     * a supplementary code point is tested via its lead surrogate.
     87     * c can be negative, e.g., U_SENTINEL.
     88     */
     89    static inline UBool mayHaveLccc(UChar32 c) {
     90        // No character below U+0300 has lccc!=0.
     91        if(c < 0x300) { return false; }
     92        const UChar32 unit = c <= 0xffff ? c : U16_LEAD(c);
     93        const int32_t row = lcccIndex[unit >> 5];
     94        return row != 0 && ((lcccBits[row] >> (unit & 0x1f)) & 1) != 0;
     95    }
     96 
     97    /**
     98     * Fast and imprecise test for the Tibetan composite vowel signs
     99     * (U+0F73, U+0F75, U+0F81), which must be decomposed before reaching the
    100     * core collation code, or else some sequences including them, even ones
    101     * passing the FCD check, do not yield canonically equivalent results.
    102     *
    103     * @param c a code point
    104     * @return true if c is U+0F73, U+0F75 or U+0F81 or one of several other Tibetan characters
    105     */
    106    static inline UBool maybeTibetanCompositeVowel(UChar32 c) {
    107        // Among code points, this selects exactly the odd ones in U+0F01..U+0FFF.
    108        const UChar32 masked = c & 0x1fff01;
    109        return masked == 0xf01;
    110    }
    111 
    112    /**
    113     * Precise test, on the FCD value, for the Tibetan composite vowel signs
    114     * (U+0F73, U+0F75, U+0F81), which must be decomposed before reaching
    115     * the core collation code.
    116     * They have distinct lccc/tccc combinations: 129/130 or 129/132.
    117     *
    118     * @param fcd16 the FCD value (lccc/tccc combination) of a code point
    119     * @return true if fcd16 is from U+0F73, U+0F75 or U+0F81
    120     */
    121    static inline UBool isFCD16OfTibetanCompositeVowel(uint16_t fcd16) {
    122        return fcd16 == 0x8184 || fcd16 == 0x8182;
    123    }
    124 
    125 private:
    126    CollationFCD() = delete;  // Static-only class: no instantiation.
    127 
    128    static const uint8_t lcccIndex[2048];
    129    static const uint8_t tcccIndex[2048];
    130    static const uint32_t lcccBits[];
    131    static const uint32_t tcccBits[];
    132 };
    133 
    134 U_NAMESPACE_END
    135 
    136 #endif  // !UCONFIG_NO_COLLATION
    137 #endif  // __COLLATIONFCD_H__