rbbidata.cpp (17295B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 *************************************************************************** 5 * Copyright (C) 1999-2014 International Business Machines Corporation * 6 * and others. All rights reserved. * 7 *************************************************************************** 8 */ 9 10 #include "unicode/utypes.h" 11 12 #if !UCONFIG_NO_BREAK_ITERATION 13 14 #include "unicode/ucptrie.h" 15 #include "unicode/utypes.h" 16 #include "rbbidata.h" 17 #include "rbbirb.h" 18 #include "udatamem.h" 19 #include "cmemory.h" 20 #include "cstring.h" 21 #include "umutex.h" 22 23 #include "uassert.h" 24 25 26 U_NAMESPACE_BEGIN 27 28 //----------------------------------------------------------------------------- 29 // 30 // Constructors. 31 // 32 //----------------------------------------------------------------------------- 33 RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status) { 34 init0(); 35 init(data, status); 36 } 37 38 RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt, UErrorCode &status) { 39 init0(); 40 init(data, status); 41 fDontFreeData = true; 42 } 43 44 RBBIDataWrapper::RBBIDataWrapper(UDataMemory* udm, UErrorCode &status) { 45 init0(); 46 if (U_FAILURE(status)) { 47 return; 48 } 49 const DataHeader *dh = udm->pHeader; 50 int32_t headerSize = dh->dataHeader.headerSize; 51 if ( !(headerSize >= 20 && 52 dh->info.isBigEndian == U_IS_BIG_ENDIAN && 53 dh->info.charsetFamily == U_CHARSET_FAMILY && 54 dh->info.dataFormat[0] == 0x42 && // dataFormat="Brk " 55 dh->info.dataFormat[1] == 0x72 && 56 dh->info.dataFormat[2] == 0x6b && 57 dh->info.dataFormat[3] == 0x20 && 58 isDataVersionAcceptable(dh->info.formatVersion)) 59 ) { 60 status = U_INVALID_FORMAT_ERROR; 61 return; 62 } 63 const char *dataAsBytes = reinterpret_cast<const char *>(dh); 64 const RBBIDataHeader *rbbidh = reinterpret_cast<const RBBIDataHeader *>(dataAsBytes + headerSize); 65 init(rbbidh, status); 66 fUDataMem = udm; 67 } 68 69 UBool RBBIDataWrapper::isDataVersionAcceptable(const UVersionInfo version) { 70 return RBBI_DATA_FORMAT_VERSION[0] == version[0]; 71 } 72 73 74 //----------------------------------------------------------------------------- 75 // 76 // init(). Does most of the work of construction, shared between the 77 // constructors. 78 // 79 //----------------------------------------------------------------------------- 80 void RBBIDataWrapper::init0() { 81 fHeader = nullptr; 82 fForwardTable = nullptr; 83 fReverseTable = nullptr; 84 fRuleSource = nullptr; 85 fRuleStatusTable = nullptr; 86 fTrie = nullptr; 87 fUDataMem = nullptr; 88 fRefCount = 0; 89 fDontFreeData = true; 90 } 91 92 void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) { 93 if (U_FAILURE(status)) { 94 return; 95 } 96 fHeader = data; 97 if (fHeader->fMagic != 0xb1a0 || !isDataVersionAcceptable(fHeader->fFormatVersion)) { 98 status = U_INVALID_FORMAT_ERROR; 99 return; 100 } 101 // Note: in ICU version 3.2 and earlier, there was a formatVersion 1 102 // that is no longer supported. At that time fFormatVersion was 103 // an int32_t field, rather than an array of 4 bytes. 104 105 fDontFreeData = false; 106 if (data->fFTableLen != 0) { 107 fForwardTable = reinterpret_cast<const RBBIStateTable*>(reinterpret_cast<const char*>(data) + fHeader->fFTable); 108 } 109 if (data->fRTableLen != 0) { 110 fReverseTable = reinterpret_cast<const RBBIStateTable*>(reinterpret_cast<const char*>(data) + fHeader->fRTable); 111 } 112 113 fTrie = ucptrie_openFromBinary(UCPTRIE_TYPE_FAST, 114 UCPTRIE_VALUE_BITS_ANY, 115 (uint8_t *)data + fHeader->fTrie, 116 fHeader->fTrieLen, 117 nullptr, // *actual length 118 &status); 119 if (U_FAILURE(status)) { 120 return; 121 } 122 123 UCPTrieValueWidth width = ucptrie_getValueWidth(fTrie); 124 if (!(width == UCPTRIE_VALUE_BITS_8 || width == UCPTRIE_VALUE_BITS_16)) { 125 status = U_INVALID_FORMAT_ERROR; 126 return; 127 } 128 129 fRuleSource = ((char *)data + fHeader->fRuleSource); 130 fRuleString = UnicodeString::fromUTF8(StringPiece(fRuleSource, fHeader->fRuleSourceLen)); 131 U_ASSERT(data->fRuleSourceLen > 0); 132 133 fRuleStatusTable = reinterpret_cast<const int32_t*>(reinterpret_cast<const char*>(data) + fHeader->fStatusTable); 134 fStatusMaxIdx = data->fStatusTableLen / sizeof(int32_t); 135 136 fRefCount = 1; 137 138 #ifdef RBBI_DEBUG 139 char *debugEnv = getenv("U_RBBIDEBUG"); 140 if (debugEnv && uprv_strstr(debugEnv, "data")) {this->printData();} 141 #endif 142 } 143 144 145 //----------------------------------------------------------------------------- 146 // 147 // Destructor. Don't call this - use removeReference() instead. 148 // 149 //----------------------------------------------------------------------------- 150 RBBIDataWrapper::~RBBIDataWrapper() { 151 U_ASSERT(fRefCount == 0); 152 ucptrie_close(fTrie); 153 fTrie = nullptr; 154 if (fUDataMem) { 155 udata_close(fUDataMem); 156 } else if (!fDontFreeData) { 157 uprv_free((void *)fHeader); 158 } 159 } 160 161 162 163 //----------------------------------------------------------------------------- 164 // 165 // Operator == Consider two RBBIDataWrappers to be equal if they 166 // refer to the same underlying data. Although 167 // the data wrappers are normally shared between 168 // iterator instances, it's possible to independently 169 // open the same data twice, and get two instances, which 170 // should still be ==. 171 // 172 //----------------------------------------------------------------------------- 173 bool RBBIDataWrapper::operator ==(const RBBIDataWrapper &other) const { 174 if (fHeader == other.fHeader) { 175 return true; 176 } 177 if (fHeader->fLength != other.fHeader->fLength) { 178 return false; 179 } 180 if (uprv_memcmp(fHeader, other.fHeader, fHeader->fLength) == 0) { 181 return true; 182 } 183 return false; 184 } 185 186 int32_t RBBIDataWrapper::hashCode() { 187 return fHeader->fFTableLen; 188 } 189 190 191 192 //----------------------------------------------------------------------------- 193 // 194 // Reference Counting. A single RBBIDataWrapper object is shared among 195 // however many RulesBasedBreakIterator instances are 196 // referencing the same data. 197 // 198 //----------------------------------------------------------------------------- 199 void RBBIDataWrapper::removeReference() { 200 if (umtx_atomic_dec(&fRefCount) == 0) { 201 delete this; 202 } 203 } 204 205 206 RBBIDataWrapper *RBBIDataWrapper::addReference() { 207 umtx_atomic_inc(&fRefCount); 208 return this; 209 } 210 211 212 213 //----------------------------------------------------------------------------- 214 // 215 // getRuleSourceString 216 // 217 //----------------------------------------------------------------------------- 218 const UnicodeString &RBBIDataWrapper::getRuleSourceString() const { 219 return fRuleString; 220 } 221 222 223 //----------------------------------------------------------------------------- 224 // 225 // print - debugging function to dump the runtime data tables. 226 // 227 //----------------------------------------------------------------------------- 228 #ifdef RBBI_DEBUG 229 void RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *table) { 230 uint32_t c; 231 uint32_t s; 232 233 RBBIDebugPrintf("%s\n", heading); 234 235 RBBIDebugPrintf(" fDictCategoriesStart: %d\n", table->fDictCategoriesStart); 236 RBBIDebugPrintf(" fLookAheadResultsSize: %d\n", table->fLookAheadResultsSize); 237 RBBIDebugPrintf(" Flags: %4x RBBI_LOOKAHEAD_HARD_BREAK=%s RBBI_BOF_REQUIRED=%s RBBI_8BITS_ROWS=%s\n", 238 table->fFlags, 239 table->fFlags & RBBI_LOOKAHEAD_HARD_BREAK ? "T" : "F", 240 table->fFlags & RBBI_BOF_REQUIRED ? "T" : "F", 241 table->fFlags & RBBI_8BITS_ROWS ? "T" : "F"); 242 RBBIDebugPrintf("\nState | Acc LA TagIx"); 243 for (c=0; c<fHeader->fCatCount; c++) {RBBIDebugPrintf("%3d ", c);} 244 RBBIDebugPrintf("\n------|---------------"); for (c=0;c<fHeader->fCatCount; c++) { 245 RBBIDebugPrintf("----"); 246 } 247 RBBIDebugPrintf("\n"); 248 249 if (table == nullptr) { 250 RBBIDebugPrintf(" N U L L T A B L E\n\n"); 251 return; 252 } 253 UBool use8Bits = table->fFlags & RBBI_8BITS_ROWS; 254 for (s=0; s<table->fNumStates; s++) { 255 RBBIStateTableRow *row = (RBBIStateTableRow *) 256 (table->fTableData + (table->fRowLen * s)); 257 if (use8Bits) { 258 RBBIDebugPrintf("%4d | %3d %3d %3d ", s, row->r8.fAccepting, row->r8.fLookAhead, row->r8.fTagsIdx); 259 for (c=0; c<fHeader->fCatCount; c++) { 260 RBBIDebugPrintf("%3d ", row->r8.fNextState[c]); 261 } 262 } else { 263 RBBIDebugPrintf("%4d | %3d %3d %3d ", s, row->r16.fAccepting, row->r16.fLookAhead, row->r16.fTagsIdx); 264 for (c=0; c<fHeader->fCatCount; c++) { 265 RBBIDebugPrintf("%3d ", row->r16.fNextState[c]); 266 } 267 } 268 RBBIDebugPrintf("\n"); 269 } 270 RBBIDebugPrintf("\n"); 271 } 272 #endif 273 274 275 void RBBIDataWrapper::printData() { 276 #ifdef RBBI_DEBUG 277 RBBIDebugPrintf("RBBI Data at %p\n", (void *)fHeader); 278 RBBIDebugPrintf(" Version = {%d %d %d %d}\n", fHeader->fFormatVersion[0], fHeader->fFormatVersion[1], 279 fHeader->fFormatVersion[2], fHeader->fFormatVersion[3]); 280 RBBIDebugPrintf(" total length of data = %d\n", fHeader->fLength); 281 RBBIDebugPrintf(" number of character categories = %d\n\n", fHeader->fCatCount); 282 283 printTable("Forward State Transition Table", fForwardTable); 284 printTable("Reverse State Transition Table", fReverseTable); 285 286 RBBIDebugPrintf("\nOriginal Rules source:\n"); 287 for (int32_t c=0; fRuleSource[c] != 0; c++) { 288 RBBIDebugPrintf("%c", fRuleSource[c]); 289 } 290 RBBIDebugPrintf("\n\n"); 291 #endif 292 } 293 294 295 U_NAMESPACE_END 296 U_NAMESPACE_USE 297 298 //----------------------------------------------------------------------------- 299 // 300 // ubrk_swap - byte swap and char encoding swap of RBBI data 301 // 302 //----------------------------------------------------------------------------- 303 304 U_CAPI int32_t U_EXPORT2 305 ubrk_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, 306 UErrorCode *status) { 307 308 if (status == nullptr || U_FAILURE(*status)) { 309 return 0; 310 } 311 if(ds==nullptr || inData==nullptr || length<-1 || (length>0 && outData==nullptr)) { 312 *status=U_ILLEGAL_ARGUMENT_ERROR; 313 return 0; 314 } 315 316 // 317 // Check that the data header is for for break data. 318 // (Header contents are defined in genbrk.cpp) 319 // 320 const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4); 321 if(!( pInfo->dataFormat[0]==0x42 && /* dataFormat="Brk " */ 322 pInfo->dataFormat[1]==0x72 && 323 pInfo->dataFormat[2]==0x6b && 324 pInfo->dataFormat[3]==0x20 && 325 RBBIDataWrapper::isDataVersionAcceptable(pInfo->formatVersion) )) { 326 udata_printError(ds, "ubrk_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n", 327 pInfo->dataFormat[0], pInfo->dataFormat[1], 328 pInfo->dataFormat[2], pInfo->dataFormat[3], 329 pInfo->formatVersion[0]); 330 *status=U_UNSUPPORTED_ERROR; 331 return 0; 332 } 333 334 // 335 // Swap the data header. (This is the generic ICU Data Header, not the RBBI Specific 336 // RBBIDataHeader). This swap also conveniently gets us 337 // the size of the ICU d.h., which lets us locate the start 338 // of the RBBI specific data. 339 // 340 int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status); 341 342 343 // 344 // Get the RRBI Data Header, and check that it appears to be OK. 345 // 346 const uint8_t *inBytes =(const uint8_t *)inData+headerSize; 347 RBBIDataHeader *rbbiDH = (RBBIDataHeader *)inBytes; 348 if (ds->readUInt32(rbbiDH->fMagic) != 0xb1a0 || 349 !RBBIDataWrapper::isDataVersionAcceptable(rbbiDH->fFormatVersion) || 350 ds->readUInt32(rbbiDH->fLength) < sizeof(RBBIDataHeader)) { 351 udata_printError(ds, "ubrk_swap(): RBBI Data header is invalid.\n"); 352 *status=U_UNSUPPORTED_ERROR; 353 return 0; 354 } 355 356 // 357 // Prefight operation? Just return the size 358 // 359 int32_t breakDataLength = ds->readUInt32(rbbiDH->fLength); 360 int32_t totalSize = headerSize + breakDataLength; 361 if (length < 0) { 362 return totalSize; 363 } 364 365 // 366 // Check that length passed in is consistent with length from RBBI data header. 367 // 368 if (length < totalSize) { 369 udata_printError(ds, "ubrk_swap(): too few bytes (%d after ICU Data header) for break data.\n", 370 breakDataLength); 371 *status=U_INDEX_OUTOFBOUNDS_ERROR; 372 return 0; 373 } 374 375 376 // 377 // Swap the Data. Do the data itself first, then the RBBI Data Header, because 378 // we need to reference the header to locate the data, and an 379 // inplace swap of the header leaves it unusable. 380 // 381 uint8_t *outBytes = (uint8_t *)outData + headerSize; 382 RBBIDataHeader *outputDH = (RBBIDataHeader *)outBytes; 383 384 int32_t tableStartOffset; 385 int32_t tableLength; 386 387 // 388 // If not swapping in place, zero out the output buffer before starting. 389 // Individual tables and other data items within are aligned to 8 byte boundaries 390 // when originally created. Any unused space between items needs to be zero. 391 // 392 if (inBytes != outBytes) { 393 uprv_memset(outBytes, 0, breakDataLength); 394 } 395 396 // 397 // Each state table begins with several 32 bit fields. Calculate the size 398 // in bytes of these. 399 // 400 int32_t topSize = offsetof(RBBIStateTable, fTableData); 401 402 // Forward state table. 403 tableStartOffset = ds->readUInt32(rbbiDH->fFTable); 404 tableLength = ds->readUInt32(rbbiDH->fFTableLen); 405 406 if (tableLength > 0) { 407 RBBIStateTable *rbbiST = (RBBIStateTable *)(inBytes+tableStartOffset); 408 UBool use8Bits = ds->readUInt32(rbbiST->fFlags) & RBBI_8BITS_ROWS; 409 410 ds->swapArray32(ds, inBytes+tableStartOffset, topSize, 411 outBytes+tableStartOffset, status); 412 413 // Swap the state table if the table is in 16 bits. 414 if (use8Bits) { 415 if (outBytes != inBytes) { 416 uprv_memmove(outBytes+tableStartOffset+topSize, 417 inBytes+tableStartOffset+topSize, 418 tableLength-topSize); 419 } 420 } else { 421 ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize, 422 outBytes+tableStartOffset+topSize, status); 423 } 424 } 425 426 // Reverse state table. Same layout as forward table, above. 427 tableStartOffset = ds->readUInt32(rbbiDH->fRTable); 428 tableLength = ds->readUInt32(rbbiDH->fRTableLen); 429 430 if (tableLength > 0) { 431 RBBIStateTable *rbbiST = (RBBIStateTable *)(inBytes+tableStartOffset); 432 UBool use8Bits = ds->readUInt32(rbbiST->fFlags) & RBBI_8BITS_ROWS; 433 434 ds->swapArray32(ds, inBytes+tableStartOffset, topSize, 435 outBytes+tableStartOffset, status); 436 437 // Swap the state table if the table is in 16 bits. 438 if (use8Bits) { 439 if (outBytes != inBytes) { 440 uprv_memmove(outBytes+tableStartOffset+topSize, 441 inBytes+tableStartOffset+topSize, 442 tableLength-topSize); 443 } 444 } else { 445 ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize, 446 outBytes+tableStartOffset+topSize, status); 447 } 448 } 449 450 // Trie table for character categories 451 ucptrie_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen), 452 outBytes+ds->readUInt32(rbbiDH->fTrie), status); 453 454 // Source Rules Text. It's UTF8 data 455 if (outBytes != inBytes) { 456 uprv_memmove(outBytes+ds->readUInt32(rbbiDH->fRuleSource), 457 inBytes+ds->readUInt32(rbbiDH->fRuleSource), 458 ds->readUInt32(rbbiDH->fRuleSourceLen)); 459 } 460 461 // Table of rule status values. It's all int_32 values 462 ds->swapArray32(ds, inBytes+ds->readUInt32(rbbiDH->fStatusTable), ds->readUInt32(rbbiDH->fStatusTableLen), 463 outBytes+ds->readUInt32(rbbiDH->fStatusTable), status); 464 465 // And, last, the header. 466 // It is all int32_t values except for fFormataVersion, which is an array of four bytes. 467 // Swap the whole thing as int32_t, then re-swap the one field. 468 // 469 ds->swapArray32(ds, inBytes, sizeof(RBBIDataHeader), outBytes, status); 470 ds->swapArray32(ds, outputDH->fFormatVersion, 4, outputDH->fFormatVersion, status); 471 472 return totalSize; 473 } 474 475 476 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */