brkiter.cpp (16590B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 1997-2015, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 * 9 * File brkiter.cpp 10 * 11 * Modification History: 12 * 13 * Date Name Description 14 * 02/18/97 aliu Converted from OpenClass. Added DONE. 15 * 01/13/2000 helena Added UErrorCode parameter to createXXXInstance methods. 16 ***************************************************************************************** 17 */ 18 19 // ***************************************************************************** 20 // This file was generated from the java source file BreakIterator.java 21 // ***************************************************************************** 22 23 #include "unicode/utypes.h" 24 25 #if !UCONFIG_NO_BREAK_ITERATION 26 27 #include "unicode/rbbi.h" 28 #include "unicode/brkiter.h" 29 #include "unicode/udata.h" 30 #include "unicode/uloc.h" 31 #include "unicode/ures.h" 32 #include "unicode/ustring.h" 33 #include "unicode/filteredbrk.h" 34 #include "bytesinkutil.h" 35 #include "ucln_cmn.h" 36 #include "cstring.h" 37 #include "umutex.h" 38 #include "servloc.h" 39 #include "locbased.h" 40 #include "uresimp.h" 41 #include "uassert.h" 42 #include "ubrkimpl.h" 43 #include "utracimp.h" 44 #include "charstr.h" 45 46 // ***************************************************************************** 47 // class BreakIterator 48 // This class implements methods for finding the location of boundaries in text. 49 // Instances of BreakIterator maintain a current position and scan over text 50 // returning the index of characters where boundaries occur. 51 // ***************************************************************************** 52 53 U_NAMESPACE_BEGIN 54 55 // ------------------------------------- 56 57 BreakIterator* 58 BreakIterator::buildInstance(const Locale& loc, const char *type, UErrorCode &status) 59 { 60 char fnbuff[256]; 61 char ext[4]={'\0'}; 62 CharString actual; 63 int32_t size; 64 const char16_t* brkfname = nullptr; 65 UResourceBundle brkRulesStack; 66 UResourceBundle brkNameStack; 67 UResourceBundle *brkRules = &brkRulesStack; 68 UResourceBundle *brkName = &brkNameStack; 69 RuleBasedBreakIterator *result = nullptr; 70 71 if (U_FAILURE(status)) 72 return nullptr; 73 74 ures_initStackObject(brkRules); 75 ures_initStackObject(brkName); 76 77 // Get the locale 78 UResourceBundle *b = ures_openNoDefault(U_ICUDATA_BRKITR, loc.getName(), &status); 79 80 // Get the "boundaries" array. 81 if (U_SUCCESS(status)) { 82 brkRules = ures_getByKeyWithFallback(b, "boundaries", brkRules, &status); 83 // Get the string object naming the rules file 84 brkName = ures_getByKeyWithFallback(brkRules, type, brkName, &status); 85 // Get the actual string 86 brkfname = ures_getString(brkName, &size, &status); 87 U_ASSERT((size_t)size<sizeof(fnbuff)); 88 if (static_cast<size_t>(size) >= sizeof(fnbuff)) { 89 size=0; 90 if (U_SUCCESS(status)) { 91 status = U_BUFFER_OVERFLOW_ERROR; 92 } 93 } 94 95 // Use the string if we found it 96 if (U_SUCCESS(status) && brkfname) { 97 actual.append(ures_getLocaleInternal(brkName, &status), -1, status); 98 99 char16_t* extStart=u_strchr(brkfname, 0x002e); 100 int len = 0; 101 if (extStart != nullptr){ 102 len = static_cast<int>(extStart - brkfname); 103 u_UCharsToChars(extStart+1, ext, sizeof(ext)); // nul terminates the buff 104 u_UCharsToChars(brkfname, fnbuff, len); 105 } 106 fnbuff[len]=0; // nul terminate 107 } 108 } 109 110 ures_close(brkRules); 111 ures_close(brkName); 112 113 UDataMemory* file = udata_open(U_ICUDATA_BRKITR, ext, fnbuff, &status); 114 if (U_FAILURE(status)) { 115 ures_close(b); 116 return nullptr; 117 } 118 119 // Create a RuleBasedBreakIterator 120 result = new RuleBasedBreakIterator(file, uprv_strstr(type, "phrase") != nullptr, status); 121 122 // If there is a result, set the valid locale and actual locale, and the kind 123 if (U_SUCCESS(status) && result != nullptr) { 124 result->actualLocale = Locale(actual.data()); 125 result->validLocale = Locale(ures_getLocaleByType(b, ULOC_VALID_LOCALE, &status)); 126 result->requestLocale = loc; 127 } 128 129 ures_close(b); 130 131 if (U_FAILURE(status) && result != nullptr) { // Sometimes redundant check, but simple 132 delete result; 133 return nullptr; 134 } 135 136 if (result == nullptr) { 137 udata_close(file); 138 if (U_SUCCESS(status)) { 139 status = U_MEMORY_ALLOCATION_ERROR; 140 } 141 } 142 143 return result; 144 } 145 146 // Creates a break iterator for word breaks. 147 BreakIterator* U_EXPORT2 148 BreakIterator::createWordInstance(const Locale& key, UErrorCode& status) 149 { 150 return createInstance(key, UBRK_WORD, status); 151 } 152 153 // ------------------------------------- 154 155 // Creates a break iterator for line breaks. 156 BreakIterator* U_EXPORT2 157 BreakIterator::createLineInstance(const Locale& key, UErrorCode& status) 158 { 159 return createInstance(key, UBRK_LINE, status); 160 } 161 162 // ------------------------------------- 163 164 // Creates a break iterator for character breaks. 165 BreakIterator* U_EXPORT2 166 BreakIterator::createCharacterInstance(const Locale& key, UErrorCode& status) 167 { 168 return createInstance(key, UBRK_CHARACTER, status); 169 } 170 171 // ------------------------------------- 172 173 // Creates a break iterator for sentence breaks. 174 BreakIterator* U_EXPORT2 175 BreakIterator::createSentenceInstance(const Locale& key, UErrorCode& status) 176 { 177 return createInstance(key, UBRK_SENTENCE, status); 178 } 179 180 // ------------------------------------- 181 182 // Creates a break iterator for title casing breaks. 183 BreakIterator* U_EXPORT2 184 BreakIterator::createTitleInstance(const Locale& key, UErrorCode& status) 185 { 186 return createInstance(key, UBRK_TITLE, status); 187 } 188 189 // ------------------------------------- 190 191 // Gets all the available locales that has localized text boundary data. 192 const Locale* U_EXPORT2 193 BreakIterator::getAvailableLocales(int32_t& count) 194 { 195 return Locale::getAvailableLocales(count); 196 } 197 198 // ------------------------------------------ 199 // 200 // Constructors, destructor and assignment operator 201 // 202 //------------------------------------------- 203 204 BreakIterator::BreakIterator() 205 : actualLocale(Locale::getRoot()), validLocale(Locale::getRoot()), requestLocale(Locale::getRoot()) 206 { 207 } 208 209 BreakIterator::BreakIterator(const BreakIterator &other) 210 : UObject(other), 211 actualLocale(other.actualLocale), 212 validLocale(other.validLocale), 213 requestLocale(other.requestLocale) { 214 } 215 216 BreakIterator &BreakIterator::operator =(const BreakIterator &other) { 217 if (this != &other) { 218 actualLocale = other.actualLocale; 219 validLocale = other.validLocale; 220 requestLocale = other.requestLocale; 221 } 222 return *this; 223 } 224 225 BreakIterator::~BreakIterator() 226 { 227 } 228 229 // ------------------------------------------ 230 // 231 // Registration 232 // 233 //------------------------------------------- 234 #if !UCONFIG_NO_SERVICE 235 236 // ------------------------------------- 237 238 class ICUBreakIteratorFactory : public ICUResourceBundleFactory { 239 public: 240 virtual ~ICUBreakIteratorFactory(); 241 protected: 242 virtual UObject* handleCreate(const Locale& loc, int32_t kind, const ICUService* /*service*/, UErrorCode& status) const override { 243 return BreakIterator::makeInstance(loc, kind, status); 244 } 245 }; 246 247 ICUBreakIteratorFactory::~ICUBreakIteratorFactory() {} 248 249 // ------------------------------------- 250 251 class ICUBreakIteratorService : public ICULocaleService { 252 public: 253 ICUBreakIteratorService() 254 : ICULocaleService(UNICODE_STRING("Break Iterator", 14)) 255 { 256 UErrorCode status = U_ZERO_ERROR; 257 registerFactory(new ICUBreakIteratorFactory(), status); 258 } 259 260 virtual ~ICUBreakIteratorService(); 261 262 virtual UObject* cloneInstance(UObject* instance) const override { 263 return ((BreakIterator*)instance)->clone(); 264 } 265 266 virtual UObject* handleDefault(const ICUServiceKey& key, UnicodeString* /*actualID*/, UErrorCode& status) const override { 267 LocaleKey& lkey = static_cast<LocaleKey&>(const_cast<ICUServiceKey&>(key)); 268 int32_t kind = lkey.kind(); 269 Locale loc; 270 lkey.currentLocale(loc); 271 return BreakIterator::makeInstance(loc, kind, status); 272 } 273 274 virtual UBool isDefault() const override { 275 return countFactories() == 1; 276 } 277 }; 278 279 ICUBreakIteratorService::~ICUBreakIteratorService() {} 280 281 // ------------------------------------- 282 283 // defined in ucln_cmn.h 284 U_NAMESPACE_END 285 286 static icu::UInitOnce gInitOnceBrkiter {}; 287 static icu::ICULocaleService* gService = nullptr; 288 289 290 291 /** 292 * Release all static memory held by breakiterator. 293 */ 294 U_CDECL_BEGIN 295 static UBool U_CALLCONV breakiterator_cleanup() { 296 #if !UCONFIG_NO_SERVICE 297 if (gService) { 298 delete gService; 299 gService = nullptr; 300 } 301 gInitOnceBrkiter.reset(); 302 #endif 303 return true; 304 } 305 U_CDECL_END 306 U_NAMESPACE_BEGIN 307 308 static void U_CALLCONV 309 initService() { 310 gService = new ICUBreakIteratorService(); 311 ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR, breakiterator_cleanup); 312 } 313 314 static ICULocaleService* 315 getService() 316 { 317 umtx_initOnce(gInitOnceBrkiter, &initService); 318 return gService; 319 } 320 321 322 // ------------------------------------- 323 324 static inline UBool 325 hasService() 326 { 327 return !gInitOnceBrkiter.isReset() && getService() != nullptr; 328 } 329 330 // ------------------------------------- 331 332 URegistryKey U_EXPORT2 333 BreakIterator::registerInstance(BreakIterator* toAdopt, const Locale& locale, UBreakIteratorType kind, UErrorCode& status) 334 { 335 ICULocaleService *service = getService(); 336 if (service == nullptr) { 337 status = U_MEMORY_ALLOCATION_ERROR; 338 return nullptr; 339 } 340 return service->registerInstance(toAdopt, locale, kind, status); 341 } 342 343 // ------------------------------------- 344 345 UBool U_EXPORT2 346 BreakIterator::unregister(URegistryKey key, UErrorCode& status) 347 { 348 if (U_SUCCESS(status)) { 349 if (hasService()) { 350 return gService->unregister(key, status); 351 } 352 status = U_MEMORY_ALLOCATION_ERROR; 353 } 354 return false; 355 } 356 357 // ------------------------------------- 358 359 StringEnumeration* U_EXPORT2 360 BreakIterator::getAvailableLocales() 361 { 362 ICULocaleService *service = getService(); 363 if (service == nullptr) { 364 return nullptr; 365 } 366 return service->getAvailableLocales(); 367 } 368 #endif /* UCONFIG_NO_SERVICE */ 369 370 // ------------------------------------- 371 372 BreakIterator* 373 BreakIterator::createInstance(const Locale& loc, int32_t kind, UErrorCode& status) 374 { 375 if (U_FAILURE(status)) { 376 return nullptr; 377 } 378 379 #if !UCONFIG_NO_SERVICE 380 if (hasService()) { 381 Locale actualLoc(""); 382 BreakIterator *result = (BreakIterator*)gService->get(loc, kind, &actualLoc, status); 383 // TODO: The way the service code works in ICU 2.8 is that if 384 // there is a real registered break iterator, the actualLoc 385 // will be populated, but if the handleDefault path is taken 386 // (because nothing is registered that can handle the 387 // requested locale) then the actualLoc comes back empty. In 388 // that case, the returned object already has its actual/valid 389 // locale data populated (by makeInstance, which is what 390 // handleDefault calls), so we don't touch it. YES, A COMMENT 391 // THIS LONG is a sign of bad code -- so the action item is to 392 // revisit this in ICU 3.0 and clean it up/fix it/remove it. 393 if (U_SUCCESS(status) && (result != nullptr) && *actualLoc.getName() != 0) { 394 result->actualLocale = actualLoc; 395 result->validLocale = actualLoc; 396 } 397 return result; 398 } 399 else 400 #endif 401 { 402 return makeInstance(loc, kind, status); 403 } 404 } 405 406 // ------------------------------------- 407 enum { kKeyValueLenMax = 32 }; 408 409 BreakIterator* 410 BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status) 411 { 412 413 if (U_FAILURE(status)) { 414 return nullptr; 415 } 416 417 BreakIterator *result = nullptr; 418 switch (kind) { 419 case UBRK_CHARACTER: 420 { 421 UTRACE_ENTRY(UTRACE_UBRK_CREATE_CHARACTER); 422 result = BreakIterator::buildInstance(loc, "grapheme", status); 423 UTRACE_EXIT_STATUS(status); 424 } 425 break; 426 case UBRK_WORD: 427 { 428 UTRACE_ENTRY(UTRACE_UBRK_CREATE_WORD); 429 result = BreakIterator::buildInstance(loc, "word", status); 430 UTRACE_EXIT_STATUS(status); 431 } 432 break; 433 case UBRK_LINE: 434 { 435 char lb_lw[kKeyValueLenMax]; 436 UTRACE_ENTRY(UTRACE_UBRK_CREATE_LINE); 437 uprv_strcpy(lb_lw, "line"); 438 UErrorCode kvStatus = U_ZERO_ERROR; 439 auto value = loc.getKeywordValue<CharString>("lb", kvStatus); 440 if (U_SUCCESS(kvStatus) && (value == "strict" || value == "normal" || value == "loose")) { 441 uprv_strcat(lb_lw, "_"); 442 uprv_strcat(lb_lw, value.data()); 443 } 444 // lw=phrase is only supported in Japanese and Korean 445 if (uprv_strcmp(loc.getLanguage(), "ja") == 0 || uprv_strcmp(loc.getLanguage(), "ko") == 0) { 446 value = loc.getKeywordValue<CharString>("lw", kvStatus); 447 if (U_SUCCESS(kvStatus) && value == "phrase") { 448 uprv_strcat(lb_lw, "_"); 449 uprv_strcat(lb_lw, value.data()); 450 } 451 } 452 result = BreakIterator::buildInstance(loc, lb_lw, status); 453 454 UTRACE_DATA1(UTRACE_INFO, "lb_lw=%s", lb_lw); 455 UTRACE_EXIT_STATUS(status); 456 } 457 break; 458 case UBRK_SENTENCE: 459 { 460 UTRACE_ENTRY(UTRACE_UBRK_CREATE_SENTENCE); 461 result = BreakIterator::buildInstance(loc, "sentence", status); 462 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION 463 char ssKeyValue[kKeyValueLenMax] = {0}; 464 UErrorCode kvStatus = U_ZERO_ERROR; 465 int32_t kLen = loc.getKeywordValue("ss", ssKeyValue, kKeyValueLenMax, kvStatus); 466 if (U_SUCCESS(kvStatus) && kLen > 0 && uprv_strcmp(ssKeyValue,"standard")==0) { 467 FilteredBreakIteratorBuilder* fbiBuilder = FilteredBreakIteratorBuilder::createInstance(loc, kvStatus); 468 if (U_SUCCESS(kvStatus)) { 469 result = fbiBuilder->build(result, status); 470 delete fbiBuilder; 471 } 472 } 473 #endif 474 UTRACE_EXIT_STATUS(status); 475 } 476 break; 477 case UBRK_TITLE: 478 { 479 UTRACE_ENTRY(UTRACE_UBRK_CREATE_TITLE); 480 result = BreakIterator::buildInstance(loc, "title", status); 481 UTRACE_EXIT_STATUS(status); 482 } 483 break; 484 default: 485 status = U_ILLEGAL_ARGUMENT_ERROR; 486 } 487 488 if (U_FAILURE(status)) { 489 delete result; 490 return nullptr; 491 } 492 493 return result; 494 } 495 496 Locale 497 BreakIterator::getLocale(ULocDataLocaleType type, UErrorCode& status) const { 498 if (U_FAILURE(status)) { 499 return Locale::getRoot(); 500 } 501 if (type == ULOC_REQUESTED_LOCALE) { 502 return requestLocale; 503 } 504 return LocaleBased::getLocale(validLocale, actualLocale, type, status); 505 } 506 507 const char * 508 BreakIterator::getLocaleID(ULocDataLocaleType type, UErrorCode& status) const { 509 if (U_FAILURE(status)) { 510 return nullptr; 511 } 512 if (type == ULOC_REQUESTED_LOCALE) { 513 return requestLocale.getName(); 514 } 515 return LocaleBased::getLocaleID(validLocale, actualLocale, type, status); 516 } 517 518 519 // This implementation of getRuleStatus is a do-nothing stub, here to 520 // provide a default implementation for any derived BreakIterator classes that 521 // do not implement it themselves. 522 int32_t BreakIterator::getRuleStatus() const { 523 return 0; 524 } 525 526 // This implementation of getRuleStatusVec is a do-nothing stub, here to 527 // provide a default implementation for any derived BreakIterator classes that 528 // do not implement it themselves. 529 int32_t BreakIterator::getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status) { 530 if (U_FAILURE(status)) { 531 return 0; 532 } 533 if (capacity < 1) { 534 status = U_BUFFER_OVERFLOW_ERROR; 535 return 1; 536 } 537 *fillInVec = 0; 538 return 1; 539 } 540 541 BreakIterator::BreakIterator(const Locale& valid, const Locale& actual) 542 : actualLocale(actual), validLocale(valid), requestLocale(Locale::getRoot()) { 543 } 544 545 U_NAMESPACE_END 546 547 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 548 549 //eof