tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

dictionarydata.cpp (8417B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 2014-2016, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
* dictionarydata.cpp
      9 *
     10 * created on: 2012may31
     11 * created by: Markus W. Scherer & Maxime Serrano
     12 */
     13 
     14 #include "dictionarydata.h"
     15 #include "unicode/ucharstrie.h"
     16 #include "unicode/bytestrie.h"
     17 #include "unicode/udata.h"
     18 #include "cmemory.h"
     19 
     20 #if !UCONFIG_NO_BREAK_ITERATION
     21 
     22 U_NAMESPACE_BEGIN
     23 
     24 const int32_t  DictionaryData::TRIE_TYPE_BYTES = 0;
     25 const int32_t  DictionaryData::TRIE_TYPE_UCHARS = 1;
     26 const int32_t  DictionaryData::TRIE_TYPE_MASK = 7;
     27 const int32_t  DictionaryData::TRIE_HAS_VALUES = 8;
     28 
     29 const int32_t  DictionaryData::TRANSFORM_NONE = 0;
     30 const int32_t  DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000;
     31 const int32_t  DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000;
     32 const int32_t  DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff;
     33    
     34 DictionaryMatcher::~DictionaryMatcher() {
     35 }
     36 
     37 UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
     38    udata_close(file);
     39 }
     40 
     41 int32_t UCharsDictionaryMatcher::getType() const {
     42    return DictionaryData::TRIE_TYPE_UCHARS;
     43 }
     44 
     45 int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
     46                            int32_t *lengths, int32_t *cpLengths, int32_t *values,
     47                            int32_t *prefix) const {
     48 
     49    UCharsTrie uct(characters);
     50    int32_t startingTextIndex = static_cast<int32_t>(utext_getNativeIndex(text));
     51    int32_t wordCount = 0;
     52    int32_t codePointsMatched = 0;
     53 
     54    for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
     55        UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c);
     56        int32_t lengthMatched = static_cast<int32_t>(utext_getNativeIndex(text)) - startingTextIndex;
     57        codePointsMatched += 1;
     58        if (USTRINGTRIE_HAS_VALUE(result)) {
     59            if (wordCount < limit) {
     60                if (values != nullptr) {
     61                    values[wordCount] = uct.getValue();
     62                }
     63                if (lengths != nullptr) {
     64                    lengths[wordCount] = lengthMatched;
     65                }
     66                if (cpLengths != nullptr) {
     67                    cpLengths[wordCount] = codePointsMatched;
     68                }
     69                ++wordCount;
     70            }
     71            if (result == USTRINGTRIE_FINAL_VALUE) {
     72                break;
     73            }
     74        }
     75        else if (result == USTRINGTRIE_NO_MATCH) {
     76            break;
     77        }
     78        if (lengthMatched >= maxLength) {
     79            break;
     80        }
     81    }
     82 
     83    if (prefix != nullptr) {
     84        *prefix = codePointsMatched;
     85    }
     86    return wordCount;
     87 }
     88 
     89 BytesDictionaryMatcher::~BytesDictionaryMatcher() {
     90    udata_close(file);
     91 }
     92 
     93 UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
     94    if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) {
     95        if (c == 0x200D) {
     96            return 0xFF;
     97        } else if (c == 0x200C) {
     98            return 0xFE;
     99        }
    100        int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);
    101        if (delta < 0 || 0xFD < delta) {
    102            return U_SENTINEL;
    103        }
    104        return static_cast<UChar32>(delta);
    105    }
    106    return c;
    107 }
    108 
    109 int32_t BytesDictionaryMatcher::getType() const {
    110    return DictionaryData::TRIE_TYPE_BYTES;
    111 }
    112 
    113 int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
    114                            int32_t *lengths, int32_t *cpLengths, int32_t *values,
    115                            int32_t *prefix) const {
    116    BytesTrie bt(characters);
    117    int32_t startingTextIndex = static_cast<int32_t>(utext_getNativeIndex(text));
    118    int32_t wordCount = 0;
    119    int32_t codePointsMatched = 0;
    120 
    121    for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
    122        UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c));
    123        int32_t lengthMatched = static_cast<int32_t>(utext_getNativeIndex(text)) - startingTextIndex;
    124        codePointsMatched += 1;
    125        if (USTRINGTRIE_HAS_VALUE(result)) {
    126            if (wordCount < limit) {
    127                if (values != nullptr) {
    128                    values[wordCount] = bt.getValue();
    129                }
    130                if (lengths != nullptr) {
    131                    lengths[wordCount] = lengthMatched;
    132                }
    133                if (cpLengths != nullptr) {
    134                    cpLengths[wordCount] = codePointsMatched;
    135                }
    136                ++wordCount;
    137            }
    138            if (result == USTRINGTRIE_FINAL_VALUE) {
    139                break;
    140            }
    141        }
    142        else if (result == USTRINGTRIE_NO_MATCH) {
    143            break;
    144        }
    145        if (lengthMatched >= maxLength) {
    146            break;
    147        }
    148    }
    149 
    150    if (prefix != nullptr) {
    151        *prefix = codePointsMatched;
    152    }
    153    return wordCount;
    154 }
    155 
    156 
    157 U_NAMESPACE_END
    158 
    159 U_NAMESPACE_USE
    160 
    161 U_CAPI int32_t U_EXPORT2
    162 udict_swap(const UDataSwapper *ds, const void *inData, int32_t length,
    163           void *outData, UErrorCode *pErrorCode) {
    164    const UDataInfo *pInfo;
    165    int32_t headerSize;
    166    const uint8_t *inBytes;
    167    uint8_t *outBytes;
    168    const int32_t *inIndexes;
    169    int32_t indexes[DictionaryData::IX_COUNT];
    170    int32_t i, offset, size;
    171 
    172    headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
    173    if (pErrorCode == nullptr || U_FAILURE(*pErrorCode)) return 0;
    174    pInfo = (const UDataInfo *)((const char *)inData + 4);
    175    if (!(pInfo->dataFormat[0] == 0x44 && 
    176          pInfo->dataFormat[1] == 0x69 && 
    177          pInfo->dataFormat[2] == 0x63 && 
    178          pInfo->dataFormat[3] == 0x74 && 
    179          pInfo->formatVersion[0] == 1)) {
    180        udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
    181                         pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);
    182        *pErrorCode = U_UNSUPPORTED_ERROR;
    183        return 0;
    184    }
    185 
    186    inBytes = (const uint8_t *)inData + headerSize;
    187    outBytes = (outData == nullptr) ? nullptr : (uint8_t *)outData + headerSize;
    188 
    189    inIndexes = (const int32_t *)inBytes;
    190    if (length >= 0) {
    191        length -= headerSize;
    192        if (length < (int32_t)(sizeof(indexes))) {
    193            udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
    194            *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
    195            return 0;
    196        }
    197    }
    198 
    199    for (i = 0; i < DictionaryData::IX_COUNT; i++) {
    200        indexes[i] = udata_readInt32(ds, inIndexes[i]);
    201    }
    202 
    203    size = indexes[DictionaryData::IX_TOTAL_SIZE];
    204 
    205    if (length >= 0) {
    206        if (length < size) {
    207            udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
    208            *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
    209            return 0;
    210        }
    211 
    212        if (inBytes != outBytes) {
    213            uprv_memcpy(outBytes, inBytes, size);
    214        }
    215 
    216        offset = 0;
    217        ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode);
    218        offset = (int32_t)sizeof(indexes);
    219        int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
    220        int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET];
    221 
    222        if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
    223            ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode);
    224        } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
    225            // nothing to do
    226        } else {
    227            udata_printError(ds, "udict_swap(): unknown trie type!\n");
    228            *pErrorCode = U_UNSUPPORTED_ERROR;
    229            return 0;
    230        }
    231 
    232        // these next two sections are empty in the current format,
    233        // but may be used later.
    234        offset = nextOffset;
    235        nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET];
    236        offset = nextOffset;
    237        nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE];
    238        offset = nextOffset;
    239    }
    240    return headerSize + size;
    241 }
    242 #endif