tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

rbbirb.cpp (11752B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 //
      4 //  file:  rbbirb.cpp
      5 //
      6 //  Copyright (C) 2002-2011, International Business Machines Corporation and others.
      7 //  All Rights Reserved.
      8 //
      9 //  This file contains the RBBIRuleBuilder class implementation.  This is the main class for
     10 //    building (compiling) break rules into the tables required by the runtime
     11 //    RBBI engine.
     12 //
     13 
     14 #include "unicode/utypes.h"
     15 
     16 #if !UCONFIG_NO_BREAK_ITERATION
     17 
     18 #include "unicode/brkiter.h"
     19 #include "unicode/rbbi.h"
     20 #include "unicode/ubrk.h"
     21 #include "unicode/unistr.h"
     22 #include "unicode/uniset.h"
     23 #include "unicode/uchar.h"
     24 #include "unicode/uchriter.h"
     25 #include "unicode/ustring.h"
     26 #include "unicode/parsepos.h"
     27 #include "unicode/parseerr.h"
     28 
     29 #include "cmemory.h"
     30 #include "cstring.h"
     31 #include "rbbirb.h"
     32 #include "rbbinode.h"
     33 #include "rbbiscan.h"
     34 #include "rbbisetb.h"
     35 #include "rbbitblb.h"
     36 #include "rbbidata.h"
     37 #include "uassert.h"
     38 
     39 
     40 U_NAMESPACE_BEGIN
     41 
     42 
     43 //----------------------------------------------------------------------------------------
     44 //
     45 //  Constructor.
     46 //
     47 //----------------------------------------------------------------------------------------
     48 RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString   &rules,
     49                                       UParseError     *parseErr,
     50                                       UErrorCode      &status)
     51 : fRules(rules), fStrippedRules(rules)
     52 {
     53    fStatus = &status; // status is checked below
     54    fParseError = parseErr;
     55    fDebugEnv   = nullptr;
     56 #ifdef RBBI_DEBUG
     57    fDebugEnv   = getenv("U_RBBIDEBUG");
     58 #endif
     59 
     60 
     61    fForwardTree        = nullptr;
     62    fReverseTree        = nullptr;
     63    fSafeFwdTree        = nullptr;
     64    fSafeRevTree        = nullptr;
     65    fDefaultTree        = &fForwardTree;
     66    fForwardTable       = nullptr;
     67    fRuleStatusVals     = nullptr;
     68    fChainRules         = false;
     69    fLookAheadHardBreak = false;
     70    fUSetNodes          = nullptr;
     71    fRuleStatusVals     = nullptr;
     72    fScanner            = nullptr;
     73    fSetBuilder         = nullptr;
     74    if (parseErr) {
     75        uprv_memset(parseErr, 0, sizeof(UParseError));
     76    }
     77 
     78    if (U_FAILURE(status)) {
     79        return;
     80    }
     81 
     82    fUSetNodes          = new UVector(status); // bcos status gets overwritten here
     83    fRuleStatusVals     = new UVector(status);
     84    fScanner            = new RBBIRuleScanner(this);
     85    fSetBuilder         = new RBBISetBuilder(this);
     86    if (U_FAILURE(status)) {
     87        return;
     88    }
     89    if (fSetBuilder == nullptr || fScanner == nullptr ||
     90        fUSetNodes == nullptr || fRuleStatusVals == nullptr) {
     91        status = U_MEMORY_ALLOCATION_ERROR;
     92    }
     93 }
     94 
     95 
     96 
     97 //----------------------------------------------------------------------------------------
     98 //
     99 //  Destructor
    100 //
    101 //----------------------------------------------------------------------------------------
    102 RBBIRuleBuilder::~RBBIRuleBuilder() {
    103 
    104    int        i;
    105    for (i=0; ; i++) {
    106        RBBINode* n = static_cast<RBBINode*>(fUSetNodes->elementAt(i));
    107        if (n==nullptr) {
    108            break;
    109        }
    110        delete n;
    111    }
    112 
    113    delete fUSetNodes;
    114    delete fSetBuilder;
    115    delete fForwardTable;
    116    delete fForwardTree;
    117    delete fReverseTree;
    118    delete fSafeFwdTree;
    119    delete fSafeRevTree;
    120    delete fScanner;
    121    delete fRuleStatusVals;
    122 }
    123 
    124 
    125 
    126 
    127 
    128 //----------------------------------------------------------------------------------------
    129 //
    130 //   flattenData() -  Collect up the compiled RBBI rule data and put it into
    131 //                    the format for saving in ICU data files,
    132 //                    which is also the format needed by the RBBI runtime engine.
    133 //
    134 //----------------------------------------------------------------------------------------
    135 static int32_t align8(int32_t i) {return (i+7) & 0xfffffff8;}
    136 
    137 RBBIDataHeader *RBBIRuleBuilder::flattenData() {
    138    int32_t    i;
    139 
    140    if (U_FAILURE(*fStatus)) {
    141        return nullptr;
    142    }
    143 
    144    // Remove whitespace from the rules to make it smaller.
    145    // The rule parser has already removed comments.
    146    fStrippedRules = fScanner->stripRules(fStrippedRules);
    147 
    148    // Calculate the size of each section in the data.
    149    //   Sizes here are padded up to a multiple of 8 for better memory alignment.
    150    //   Sections sizes actually stored in the header are for the actual data
    151    //     without the padding.
    152    //
    153    int32_t headerSize        = align8(sizeof(RBBIDataHeader));
    154    int32_t forwardTableSize  = align8(fForwardTable->getTableSize());
    155    int32_t reverseTableSize  = align8(fForwardTable->getSafeTableSize());
    156    int32_t trieSize          = align8(fSetBuilder->getTrieSize());
    157    int32_t statusTableSize   = align8(fRuleStatusVals->size() * sizeof(int32_t));
    158 
    159    int32_t rulesLengthInUTF8 = 0;
    160    u_strToUTF8WithSub(nullptr, 0, &rulesLengthInUTF8,
    161                       fStrippedRules.getBuffer(), fStrippedRules.length(),
    162                       0xfffd, nullptr, fStatus);
    163    *fStatus = U_ZERO_ERROR;
    164 
    165    int32_t rulesSize         = align8((rulesLengthInUTF8+1));
    166 
    167    int32_t         totalSize = headerSize
    168                                + forwardTableSize
    169                                + reverseTableSize
    170                                + statusTableSize + trieSize + rulesSize;
    171 
    172 #ifdef RBBI_DEBUG
    173    if (fDebugEnv && uprv_strstr(fDebugEnv, "size")) {
    174        RBBIDebugPrintf("Header Size:        %8d\n", headerSize);
    175        RBBIDebugPrintf("Forward Table Size: %8d\n", forwardTableSize);
    176        RBBIDebugPrintf("Reverse Table Size: %8d\n", reverseTableSize);
    177        RBBIDebugPrintf("Trie Size:          %8d\n", trieSize);
    178        RBBIDebugPrintf("Status Table Size:  %8d\n", statusTableSize);
    179        RBBIDebugPrintf("Rules Size:         %8d\n", rulesSize);
    180        RBBIDebugPrintf("-----------------------------\n");
    181        RBBIDebugPrintf("Total Size:         %8d\n", totalSize);
    182    }
    183 #endif
    184 
    185    LocalMemory<RBBIDataHeader> data(static_cast<RBBIDataHeader*>(uprv_malloc(totalSize)));
    186    if (data.isNull()) {
    187        *fStatus = U_MEMORY_ALLOCATION_ERROR;
    188        return nullptr;
    189    }
    190    uprv_memset(data.getAlias(), 0, totalSize);
    191 
    192 
    193    data->fMagic            = 0xb1a0;
    194    data->fFormatVersion[0] = RBBI_DATA_FORMAT_VERSION[0];
    195    data->fFormatVersion[1] = RBBI_DATA_FORMAT_VERSION[1];
    196    data->fFormatVersion[2] = RBBI_DATA_FORMAT_VERSION[2];
    197    data->fFormatVersion[3] = RBBI_DATA_FORMAT_VERSION[3];
    198    data->fLength           = totalSize;
    199    data->fCatCount         = fSetBuilder->getNumCharCategories();
    200 
    201    data->fFTable        = headerSize;
    202    data->fFTableLen     = forwardTableSize;
    203 
    204    data->fRTable        = data->fFTable  + data->fFTableLen;
    205    data->fRTableLen     = reverseTableSize;
    206 
    207    data->fTrie          = data->fRTable + data->fRTableLen;
    208    data->fTrieLen       = trieSize;
    209    data->fStatusTable   = data->fTrie    + data->fTrieLen;
    210    data->fStatusTableLen= statusTableSize;
    211    data->fRuleSource    = data->fStatusTable + statusTableSize;
    212    data->fRuleSourceLen = rulesLengthInUTF8;
    213 
    214    uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
    215 
    216    fForwardTable->exportTable(reinterpret_cast<uint8_t*>(data.getAlias()) + data->fFTable);
    217    fForwardTable->exportSafeTable(reinterpret_cast<uint8_t*>(data.getAlias()) + data->fRTable);
    218    fSetBuilder->serializeTrie(reinterpret_cast<uint8_t*>(data.getAlias()) + data->fTrie);
    219 
    220    int32_t* ruleStatusTable = reinterpret_cast<int32_t*>(reinterpret_cast<uint8_t*>(data.getAlias()) + data->fStatusTable);
    221    for (i=0; i<fRuleStatusVals->size(); i++) {
    222        ruleStatusTable[i] = fRuleStatusVals->elementAti(i);
    223    }
    224 
    225    u_strToUTF8WithSub(reinterpret_cast<char*>(data.getAlias()) + data->fRuleSource, rulesSize, &rulesLengthInUTF8,
    226                       fStrippedRules.getBuffer(), fStrippedRules.length(),
    227                       0xfffd, nullptr, fStatus);
    228    if (U_FAILURE(*fStatus)) {
    229        return nullptr;
    230    }
    231 
    232    return data.orphan();
    233 }
    234 
    235 
    236 //----------------------------------------------------------------------------------------
    237 //
    238 //  createRuleBasedBreakIterator    construct from source rules that are passed in
    239 //                                  in a UnicodeString
    240 //
    241 //----------------------------------------------------------------------------------------
    242 BreakIterator *
    243 RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString    &rules,
    244                                    UParseError      *parseError,
    245                                    UErrorCode       &status)
    246 {
    247    //
    248    // Read the input rules, generate a parse tree, symbol table,
    249    // and list of all Unicode Sets referenced by the rules.
    250    //
    251    RBBIRuleBuilder  builder(rules, parseError, status);
    252    if (U_FAILURE(status)) { // status checked here bcos build below doesn't
    253        return nullptr;
    254    }
    255 
    256    RBBIDataHeader *data = builder.build(status);
    257 
    258    if (U_FAILURE(status)) {
    259        return nullptr;
    260    }
    261 
    262    //
    263    //  Create a break iterator from the compiled rules.
    264    //     (Identical to creation from stored pre-compiled rules)
    265    //
    266    // status is checked after init in construction.
    267    RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status);
    268    if (U_FAILURE(status)) {
    269        delete This;
    270        This = nullptr;
    271    } 
    272    else if(This == nullptr) { // test for nullptr
    273        status = U_MEMORY_ALLOCATION_ERROR;
    274    }
    275    return This;
    276 }
    277 
    278 RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
    279    if (U_FAILURE(status)) {
    280        return nullptr;
    281    }
    282 
    283    fScanner->parse();
    284    if (U_FAILURE(status)) {
    285        return nullptr;
    286    }
    287 
    288    //
    289    // UnicodeSet processing.
    290    //    Munge the Unicode Sets to create an initial set of character categories.
    291    //
    292    fSetBuilder->buildRanges();
    293 
    294    //
    295    //   Generate the DFA state transition table.
    296    //
    297    fForwardTable = new RBBITableBuilder(this, &fForwardTree, status);
    298    if (fForwardTable == nullptr) {
    299        status = U_MEMORY_ALLOCATION_ERROR;
    300        return nullptr;
    301    }
    302 
    303    fForwardTable->buildForwardTable();
    304 
    305    // State table and character category optimization.
    306    // Merge equivalent rows and columns.
    307    // Note that this process alters the initial set of character categories,
    308    // causing the representation of UnicodeSets in the parse tree to become invalid.
    309 
    310    optimizeTables();
    311    fForwardTable->buildSafeReverseTable(status);
    312 
    313 
    314 #ifdef RBBI_DEBUG
    315    if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) {
    316        fForwardTable->printStates();
    317        fForwardTable->printRuleStatusTable();
    318        fForwardTable->printReverseTable();
    319    }
    320 #endif
    321 
    322    //    Generate the mapping tables (TRIE) from input code points to
    323    //    the character categories.
    324    //
    325    fSetBuilder->buildTrie();
    326 
    327    //
    328    //   Package up the compiled data into a memory image
    329    //      in the run-time format.
    330    //
    331    RBBIDataHeader *data = flattenData(); // returns nullptr if error
    332    if (U_FAILURE(status)) {
    333        return nullptr;
    334    }
    335    return data;
    336 }
    337 
    338 void RBBIRuleBuilder::optimizeTables() {
    339    bool didSomething;
    340    do {
    341        didSomething = false;
    342 
    343        // Begin looking for duplicates with char class 3.
    344        // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively,
    345        // and should not have other categories merged into them.
    346        IntPair duplPair = {3, 0};
    347        while (fForwardTable->findDuplCharClassFrom(&duplPair)) {
    348            fSetBuilder->mergeCategories(duplPair);
    349            fForwardTable->removeColumn(duplPair.second);
    350            didSomething = true;
    351        }
    352 
    353        while (fForwardTable->removeDuplicateStates() > 0) {
    354            didSomething = true;
    355        }
    356    } while (didSomething);
    357 }
    358 
    359 U_NAMESPACE_END
    360 
    361 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */