rbbirb.cpp (11752B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 // 4 // file: rbbirb.cpp 5 // 6 // Copyright (C) 2002-2011, International Business Machines Corporation and others. 7 // All Rights Reserved. 8 // 9 // This file contains the RBBIRuleBuilder class implementation. This is the main class for 10 // building (compiling) break rules into the tables required by the runtime 11 // RBBI engine. 12 // 13 14 #include "unicode/utypes.h" 15 16 #if !UCONFIG_NO_BREAK_ITERATION 17 18 #include "unicode/brkiter.h" 19 #include "unicode/rbbi.h" 20 #include "unicode/ubrk.h" 21 #include "unicode/unistr.h" 22 #include "unicode/uniset.h" 23 #include "unicode/uchar.h" 24 #include "unicode/uchriter.h" 25 #include "unicode/ustring.h" 26 #include "unicode/parsepos.h" 27 #include "unicode/parseerr.h" 28 29 #include "cmemory.h" 30 #include "cstring.h" 31 #include "rbbirb.h" 32 #include "rbbinode.h" 33 #include "rbbiscan.h" 34 #include "rbbisetb.h" 35 #include "rbbitblb.h" 36 #include "rbbidata.h" 37 #include "uassert.h" 38 39 40 U_NAMESPACE_BEGIN 41 42 43 //---------------------------------------------------------------------------------------- 44 // 45 // Constructor. 46 // 47 //---------------------------------------------------------------------------------------- 48 RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules, 49 UParseError *parseErr, 50 UErrorCode &status) 51 : fRules(rules), fStrippedRules(rules) 52 { 53 fStatus = &status; // status is checked below 54 fParseError = parseErr; 55 fDebugEnv = nullptr; 56 #ifdef RBBI_DEBUG 57 fDebugEnv = getenv("U_RBBIDEBUG"); 58 #endif 59 60 61 fForwardTree = nullptr; 62 fReverseTree = nullptr; 63 fSafeFwdTree = nullptr; 64 fSafeRevTree = nullptr; 65 fDefaultTree = &fForwardTree; 66 fForwardTable = nullptr; 67 fRuleStatusVals = nullptr; 68 fChainRules = false; 69 fLookAheadHardBreak = false; 70 fUSetNodes = nullptr; 71 fRuleStatusVals = nullptr; 72 fScanner = nullptr; 73 fSetBuilder = nullptr; 74 if (parseErr) { 75 uprv_memset(parseErr, 0, sizeof(UParseError)); 76 } 77 78 if (U_FAILURE(status)) { 79 return; 80 } 81 82 fUSetNodes = new UVector(status); // bcos status gets overwritten here 83 fRuleStatusVals = new UVector(status); 84 fScanner = new RBBIRuleScanner(this); 85 fSetBuilder = new RBBISetBuilder(this); 86 if (U_FAILURE(status)) { 87 return; 88 } 89 if (fSetBuilder == nullptr || fScanner == nullptr || 90 fUSetNodes == nullptr || fRuleStatusVals == nullptr) { 91 status = U_MEMORY_ALLOCATION_ERROR; 92 } 93 } 94 95 96 97 //---------------------------------------------------------------------------------------- 98 // 99 // Destructor 100 // 101 //---------------------------------------------------------------------------------------- 102 RBBIRuleBuilder::~RBBIRuleBuilder() { 103 104 int i; 105 for (i=0; ; i++) { 106 RBBINode* n = static_cast<RBBINode*>(fUSetNodes->elementAt(i)); 107 if (n==nullptr) { 108 break; 109 } 110 delete n; 111 } 112 113 delete fUSetNodes; 114 delete fSetBuilder; 115 delete fForwardTable; 116 delete fForwardTree; 117 delete fReverseTree; 118 delete fSafeFwdTree; 119 delete fSafeRevTree; 120 delete fScanner; 121 delete fRuleStatusVals; 122 } 123 124 125 126 127 128 //---------------------------------------------------------------------------------------- 129 // 130 // flattenData() - Collect up the compiled RBBI rule data and put it into 131 // the format for saving in ICU data files, 132 // which is also the format needed by the RBBI runtime engine. 133 // 134 //---------------------------------------------------------------------------------------- 135 static int32_t align8(int32_t i) {return (i+7) & 0xfffffff8;} 136 137 RBBIDataHeader *RBBIRuleBuilder::flattenData() { 138 int32_t i; 139 140 if (U_FAILURE(*fStatus)) { 141 return nullptr; 142 } 143 144 // Remove whitespace from the rules to make it smaller. 145 // The rule parser has already removed comments. 146 fStrippedRules = fScanner->stripRules(fStrippedRules); 147 148 // Calculate the size of each section in the data. 149 // Sizes here are padded up to a multiple of 8 for better memory alignment. 150 // Sections sizes actually stored in the header are for the actual data 151 // without the padding. 152 // 153 int32_t headerSize = align8(sizeof(RBBIDataHeader)); 154 int32_t forwardTableSize = align8(fForwardTable->getTableSize()); 155 int32_t reverseTableSize = align8(fForwardTable->getSafeTableSize()); 156 int32_t trieSize = align8(fSetBuilder->getTrieSize()); 157 int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t)); 158 159 int32_t rulesLengthInUTF8 = 0; 160 u_strToUTF8WithSub(nullptr, 0, &rulesLengthInUTF8, 161 fStrippedRules.getBuffer(), fStrippedRules.length(), 162 0xfffd, nullptr, fStatus); 163 *fStatus = U_ZERO_ERROR; 164 165 int32_t rulesSize = align8((rulesLengthInUTF8+1)); 166 167 int32_t totalSize = headerSize 168 + forwardTableSize 169 + reverseTableSize 170 + statusTableSize + trieSize + rulesSize; 171 172 #ifdef RBBI_DEBUG 173 if (fDebugEnv && uprv_strstr(fDebugEnv, "size")) { 174 RBBIDebugPrintf("Header Size: %8d\n", headerSize); 175 RBBIDebugPrintf("Forward Table Size: %8d\n", forwardTableSize); 176 RBBIDebugPrintf("Reverse Table Size: %8d\n", reverseTableSize); 177 RBBIDebugPrintf("Trie Size: %8d\n", trieSize); 178 RBBIDebugPrintf("Status Table Size: %8d\n", statusTableSize); 179 RBBIDebugPrintf("Rules Size: %8d\n", rulesSize); 180 RBBIDebugPrintf("-----------------------------\n"); 181 RBBIDebugPrintf("Total Size: %8d\n", totalSize); 182 } 183 #endif 184 185 LocalMemory<RBBIDataHeader> data(static_cast<RBBIDataHeader*>(uprv_malloc(totalSize))); 186 if (data.isNull()) { 187 *fStatus = U_MEMORY_ALLOCATION_ERROR; 188 return nullptr; 189 } 190 uprv_memset(data.getAlias(), 0, totalSize); 191 192 193 data->fMagic = 0xb1a0; 194 data->fFormatVersion[0] = RBBI_DATA_FORMAT_VERSION[0]; 195 data->fFormatVersion[1] = RBBI_DATA_FORMAT_VERSION[1]; 196 data->fFormatVersion[2] = RBBI_DATA_FORMAT_VERSION[2]; 197 data->fFormatVersion[3] = RBBI_DATA_FORMAT_VERSION[3]; 198 data->fLength = totalSize; 199 data->fCatCount = fSetBuilder->getNumCharCategories(); 200 201 data->fFTable = headerSize; 202 data->fFTableLen = forwardTableSize; 203 204 data->fRTable = data->fFTable + data->fFTableLen; 205 data->fRTableLen = reverseTableSize; 206 207 data->fTrie = data->fRTable + data->fRTableLen; 208 data->fTrieLen = trieSize; 209 data->fStatusTable = data->fTrie + data->fTrieLen; 210 data->fStatusTableLen= statusTableSize; 211 data->fRuleSource = data->fStatusTable + statusTableSize; 212 data->fRuleSourceLen = rulesLengthInUTF8; 213 214 uprv_memset(data->fReserved, 0, sizeof(data->fReserved)); 215 216 fForwardTable->exportTable(reinterpret_cast<uint8_t*>(data.getAlias()) + data->fFTable); 217 fForwardTable->exportSafeTable(reinterpret_cast<uint8_t*>(data.getAlias()) + data->fRTable); 218 fSetBuilder->serializeTrie(reinterpret_cast<uint8_t*>(data.getAlias()) + data->fTrie); 219 220 int32_t* ruleStatusTable = reinterpret_cast<int32_t*>(reinterpret_cast<uint8_t*>(data.getAlias()) + data->fStatusTable); 221 for (i=0; i<fRuleStatusVals->size(); i++) { 222 ruleStatusTable[i] = fRuleStatusVals->elementAti(i); 223 } 224 225 u_strToUTF8WithSub(reinterpret_cast<char*>(data.getAlias()) + data->fRuleSource, rulesSize, &rulesLengthInUTF8, 226 fStrippedRules.getBuffer(), fStrippedRules.length(), 227 0xfffd, nullptr, fStatus); 228 if (U_FAILURE(*fStatus)) { 229 return nullptr; 230 } 231 232 return data.orphan(); 233 } 234 235 236 //---------------------------------------------------------------------------------------- 237 // 238 // createRuleBasedBreakIterator construct from source rules that are passed in 239 // in a UnicodeString 240 // 241 //---------------------------------------------------------------------------------------- 242 BreakIterator * 243 RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules, 244 UParseError *parseError, 245 UErrorCode &status) 246 { 247 // 248 // Read the input rules, generate a parse tree, symbol table, 249 // and list of all Unicode Sets referenced by the rules. 250 // 251 RBBIRuleBuilder builder(rules, parseError, status); 252 if (U_FAILURE(status)) { // status checked here bcos build below doesn't 253 return nullptr; 254 } 255 256 RBBIDataHeader *data = builder.build(status); 257 258 if (U_FAILURE(status)) { 259 return nullptr; 260 } 261 262 // 263 // Create a break iterator from the compiled rules. 264 // (Identical to creation from stored pre-compiled rules) 265 // 266 // status is checked after init in construction. 267 RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status); 268 if (U_FAILURE(status)) { 269 delete This; 270 This = nullptr; 271 } 272 else if(This == nullptr) { // test for nullptr 273 status = U_MEMORY_ALLOCATION_ERROR; 274 } 275 return This; 276 } 277 278 RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) { 279 if (U_FAILURE(status)) { 280 return nullptr; 281 } 282 283 fScanner->parse(); 284 if (U_FAILURE(status)) { 285 return nullptr; 286 } 287 288 // 289 // UnicodeSet processing. 290 // Munge the Unicode Sets to create an initial set of character categories. 291 // 292 fSetBuilder->buildRanges(); 293 294 // 295 // Generate the DFA state transition table. 296 // 297 fForwardTable = new RBBITableBuilder(this, &fForwardTree, status); 298 if (fForwardTable == nullptr) { 299 status = U_MEMORY_ALLOCATION_ERROR; 300 return nullptr; 301 } 302 303 fForwardTable->buildForwardTable(); 304 305 // State table and character category optimization. 306 // Merge equivalent rows and columns. 307 // Note that this process alters the initial set of character categories, 308 // causing the representation of UnicodeSets in the parse tree to become invalid. 309 310 optimizeTables(); 311 fForwardTable->buildSafeReverseTable(status); 312 313 314 #ifdef RBBI_DEBUG 315 if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) { 316 fForwardTable->printStates(); 317 fForwardTable->printRuleStatusTable(); 318 fForwardTable->printReverseTable(); 319 } 320 #endif 321 322 // Generate the mapping tables (TRIE) from input code points to 323 // the character categories. 324 // 325 fSetBuilder->buildTrie(); 326 327 // 328 // Package up the compiled data into a memory image 329 // in the run-time format. 330 // 331 RBBIDataHeader *data = flattenData(); // returns nullptr if error 332 if (U_FAILURE(status)) { 333 return nullptr; 334 } 335 return data; 336 } 337 338 void RBBIRuleBuilder::optimizeTables() { 339 bool didSomething; 340 do { 341 didSomething = false; 342 343 // Begin looking for duplicates with char class 3. 344 // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively, 345 // and should not have other categories merged into them. 346 IntPair duplPair = {3, 0}; 347 while (fForwardTable->findDuplCharClassFrom(&duplPair)) { 348 fSetBuilder->mergeCategories(duplPair); 349 fForwardTable->removeColumn(duplPair.second); 350 didSomething = true; 351 } 352 353 while (fForwardTable->removeDuplicateStates() > 0) { 354 didSomething = true; 355 } 356 } while (didSomething); 357 } 358 359 U_NAMESPACE_END 360 361 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */