brkeng.cpp (12200B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ************************************************************************************ 5 * Copyright (C) 2006-2016, International Business Machines Corporation 6 * and others. All Rights Reserved. 7 ************************************************************************************ 8 */ 9 10 #include "unicode/utypes.h" 11 12 #if !UCONFIG_NO_BREAK_ITERATION 13 14 #include "unicode/uchar.h" 15 #include "unicode/uniset.h" 16 #include "unicode/chariter.h" 17 #include "unicode/ures.h" 18 #include "unicode/udata.h" 19 #include "unicode/putil.h" 20 #include "unicode/ustring.h" 21 #include "unicode/uscript.h" 22 #include "unicode/ucharstrie.h" 23 #include "unicode/bytestrie.h" 24 #include "unicode/rbbi.h" 25 26 #include "brkeng.h" 27 #include "cmemory.h" 28 #include "dictbe.h" 29 #include "lstmbe.h" 30 #include "charstr.h" 31 #include "dictionarydata.h" 32 #include "mutex.h" 33 #include "uvector.h" 34 #include "umutex.h" 35 #include "uresimp.h" 36 #include "ubrkimpl.h" 37 38 U_NAMESPACE_BEGIN 39 40 /* 41 ****************************************************************** 42 */ 43 44 LanguageBreakEngine::LanguageBreakEngine() { 45 } 46 47 LanguageBreakEngine::~LanguageBreakEngine() { 48 } 49 50 /* 51 ****************************************************************** 52 */ 53 54 LanguageBreakFactory::LanguageBreakFactory() { 55 } 56 57 LanguageBreakFactory::~LanguageBreakFactory() { 58 } 59 60 /* 61 ****************************************************************** 62 */ 63 64 UnhandledEngine::UnhandledEngine(UErrorCode &status) : fHandled(nullptr) { 65 (void)status; 66 } 67 68 UnhandledEngine::~UnhandledEngine() { 69 delete fHandled; 70 fHandled = nullptr; 71 } 72 73 UBool 74 UnhandledEngine::handles(UChar32 c, const char* locale) const { 75 (void)locale; // Unused 76 return fHandled && fHandled->contains(c); 77 } 78 79 int32_t 80 UnhandledEngine::findBreaks( UText *text, 81 int32_t startPos, 82 int32_t endPos, 83 UVector32 &/*foundBreaks*/, 84 UBool /* isPhraseBreaking */, 85 UErrorCode &status) const { 86 if (U_FAILURE(status)) return 0; 87 utext_setNativeIndex(text, startPos); 88 UChar32 c = utext_current32(text); 89 while (static_cast<int32_t>(utext_getNativeIndex(text)) < endPos && fHandled->contains(c)) { 90 utext_next32(text); // TODO: recast loop to work with post-increment operations. 91 c = utext_current32(text); 92 } 93 return 0; 94 } 95 96 void 97 UnhandledEngine::handleCharacter(UChar32 c) { 98 if (fHandled == nullptr) { 99 fHandled = new UnicodeSet(); 100 if (fHandled == nullptr) { 101 return; 102 } 103 } 104 if (!fHandled->contains(c)) { 105 UErrorCode status = U_ZERO_ERROR; 106 // Apply the entire script of the character. 107 int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT); 108 fHandled->applyIntPropertyValue(UCHAR_SCRIPT, script, status); 109 } 110 } 111 112 /* 113 ****************************************************************** 114 */ 115 116 ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) { 117 fEngines = nullptr; 118 } 119 120 ICULanguageBreakFactory::~ICULanguageBreakFactory() { 121 delete fEngines; 122 } 123 124 void ICULanguageBreakFactory::ensureEngines(UErrorCode& status) { 125 static UMutex gBreakEngineMutex; 126 Mutex m(&gBreakEngineMutex); 127 if (fEngines == nullptr) { 128 LocalPointer<UStack> engines(new UStack(uprv_deleteUObject, nullptr, status), status); 129 if (U_SUCCESS(status)) { 130 fEngines = engines.orphan(); 131 } 132 } 133 } 134 135 const LanguageBreakEngine * 136 ICULanguageBreakFactory::getEngineFor(UChar32 c, const char* locale) { 137 const LanguageBreakEngine *lbe = nullptr; 138 UErrorCode status = U_ZERO_ERROR; 139 ensureEngines(status); 140 if (U_FAILURE(status) ) { 141 // Note: no way to return error code to caller. 142 return nullptr; 143 } 144 145 static UMutex gBreakEngineMutex; 146 Mutex m(&gBreakEngineMutex); 147 int32_t i = fEngines->size(); 148 while (--i >= 0) { 149 lbe = static_cast<const LanguageBreakEngine*>(fEngines->elementAt(i)); 150 if (lbe != nullptr && lbe->handles(c, locale)) { 151 return lbe; 152 } 153 } 154 155 // We didn't find an engine. Create one. 156 lbe = loadEngineFor(c, locale); 157 if (lbe != nullptr) { 158 fEngines->push((void *)lbe, status); 159 } 160 return U_SUCCESS(status) ? lbe : nullptr; 161 } 162 163 const LanguageBreakEngine * 164 ICULanguageBreakFactory::loadEngineFor(UChar32 c, const char*) { 165 UErrorCode status = U_ZERO_ERROR; 166 UScriptCode code = uscript_getScript(c, &status); 167 if (U_SUCCESS(status)) { 168 const LanguageBreakEngine *engine = nullptr; 169 // Try to use LSTM first 170 const LSTMData *data = CreateLSTMDataForScript(code, status); 171 if (U_SUCCESS(status)) { 172 if (data != nullptr) { 173 engine = CreateLSTMBreakEngine(code, data, status); 174 if (U_SUCCESS(status) && engine != nullptr) { 175 return engine; 176 } 177 if (engine != nullptr) { 178 delete engine; 179 engine = nullptr; 180 } else { 181 DeleteLSTMData(data); 182 } 183 } 184 } 185 status = U_ZERO_ERROR; // fallback to dictionary based 186 DictionaryMatcher *m = loadDictionaryMatcherFor(code); 187 if (m != nullptr) { 188 switch(code) { 189 case USCRIPT_THAI: 190 engine = new ThaiBreakEngine(m, status); 191 break; 192 case USCRIPT_LAO: 193 engine = new LaoBreakEngine(m, status); 194 break; 195 case USCRIPT_MYANMAR: 196 engine = new BurmeseBreakEngine(m, status); 197 break; 198 case USCRIPT_KHMER: 199 engine = new KhmerBreakEngine(m, status); 200 break; 201 202 #if !UCONFIG_NO_NORMALIZATION 203 // CJK not available w/o normalization 204 case USCRIPT_HANGUL: 205 engine = new CjkBreakEngine(m, kKorean, status); 206 break; 207 208 // use same BreakEngine and dictionary for both Chinese and Japanese 209 case USCRIPT_HIRAGANA: 210 case USCRIPT_KATAKANA: 211 case USCRIPT_HAN: 212 engine = new CjkBreakEngine(m, kChineseJapanese, status); 213 break; 214 #if 0 215 // TODO: Have to get some characters with script=common handled 216 // by CjkBreakEngine (e.g. U+309B). Simply subjecting 217 // them to CjkBreakEngine does not work. The engine has to 218 // special-case them. 219 case USCRIPT_COMMON: 220 { 221 UBlockCode block = ublock_getCode(code); 222 if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA) 223 engine = new CjkBreakEngine(dict, kChineseJapanese, status); 224 break; 225 } 226 #endif 227 #endif 228 229 default: 230 break; 231 } 232 if (engine == nullptr) { 233 delete m; 234 } 235 else if (U_FAILURE(status)) { 236 delete engine; 237 engine = nullptr; 238 } 239 return engine; 240 } 241 } 242 return nullptr; 243 } 244 245 DictionaryMatcher * 246 ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) { 247 UErrorCode status = U_ZERO_ERROR; 248 // open root from brkitr tree. 249 UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status); 250 b = ures_getByKeyWithFallback(b, "dictionaries", b, &status); 251 int32_t dictnlength = 0; 252 const char16_t *dictfname = 253 ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status); 254 if (U_FAILURE(status)) { 255 ures_close(b); 256 return nullptr; 257 } 258 CharString dictnbuf; 259 CharString ext; 260 const char16_t *extStart = u_memrchr(dictfname, 0x002e, dictnlength); // last dot 261 if (extStart != nullptr) { 262 int32_t len = static_cast<int32_t>(extStart - dictfname); 263 ext.appendInvariantChars(UnicodeString(false, extStart + 1, dictnlength - len - 1), status); 264 dictnlength = len; 265 } 266 dictnbuf.appendInvariantChars(UnicodeString(false, dictfname, dictnlength), status); 267 ures_close(b); 268 269 UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status); 270 if (U_SUCCESS(status)) { 271 // build trie 272 const uint8_t* data = static_cast<const uint8_t*>(udata_getMemory(file)); 273 const int32_t* indexes = reinterpret_cast<const int32_t*>(data); 274 const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET]; 275 const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK; 276 DictionaryMatcher *m = nullptr; 277 if (trieType == DictionaryData::TRIE_TYPE_BYTES) { 278 const int32_t transform = indexes[DictionaryData::IX_TRANSFORM]; 279 const char* characters = reinterpret_cast<const char*>(data + offset); 280 m = new BytesDictionaryMatcher(characters, transform, file); 281 } 282 else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) { 283 const char16_t* characters = reinterpret_cast<const char16_t*>(data + offset); 284 m = new UCharsDictionaryMatcher(characters, file); 285 } 286 if (m == nullptr) { 287 // no matcher exists to take ownership - either we are an invalid 288 // type or memory allocation failed 289 udata_close(file); 290 } 291 return m; 292 } else if (dictfname != nullptr) { 293 // we don't have a dictionary matcher. 294 // returning nullptr here will cause us to fail to find a dictionary break engine, as expected 295 status = U_ZERO_ERROR; 296 return nullptr; 297 } 298 return nullptr; 299 } 300 301 302 void ICULanguageBreakFactory::addExternalEngine( 303 ExternalBreakEngine* external, UErrorCode& status) { 304 LocalPointer<ExternalBreakEngine> engine(external, status); 305 ensureEngines(status); 306 LocalPointer<BreakEngineWrapper> wrapper( 307 new BreakEngineWrapper(engine.orphan(), status), status); 308 static UMutex gBreakEngineMutex; 309 Mutex m(&gBreakEngineMutex); 310 fEngines->push(wrapper.getAlias(), status); 311 wrapper.orphan(); 312 } 313 314 BreakEngineWrapper::BreakEngineWrapper( 315 ExternalBreakEngine* engine, UErrorCode &status) : delegate(engine, status) { 316 } 317 318 BreakEngineWrapper::~BreakEngineWrapper() { 319 } 320 321 UBool BreakEngineWrapper::handles(UChar32 c, const char* locale) const { 322 return delegate->isFor(c, locale); 323 } 324 325 int32_t BreakEngineWrapper::findBreaks( 326 UText *text, 327 int32_t startPos, 328 int32_t endPos, 329 UVector32 &foundBreaks, 330 UBool /* isPhraseBreaking */, 331 UErrorCode &status) const { 332 if (U_FAILURE(status)) return 0; 333 int32_t result = 0; 334 335 // Find the span of characters included in the set. 336 // The span to break begins at the current position in the text, and 337 // extends towards the start or end of the text, depending on 'reverse'. 338 339 utext_setNativeIndex(text, startPos); 340 int32_t start = static_cast<int32_t>(utext_getNativeIndex(text)); 341 int32_t current; 342 int32_t rangeStart; 343 int32_t rangeEnd; 344 UChar32 c = utext_current32(text); 345 while ((current = static_cast<int32_t>(utext_getNativeIndex(text))) < endPos && delegate->handles(c)) { 346 utext_next32(text); // TODO: recast loop for postincrement 347 c = utext_current32(text); 348 } 349 rangeStart = start; 350 rangeEnd = current; 351 int32_t beforeSize = foundBreaks.size(); 352 int32_t additionalCapacity = rangeEnd - rangeStart + 1; 353 // enlarge to contains (rangeEnd-rangeStart+1) more items 354 foundBreaks.ensureCapacity(beforeSize+additionalCapacity, status); 355 if (U_FAILURE(status)) return 0; 356 foundBreaks.setSize(beforeSize + beforeSize+additionalCapacity); 357 result = delegate->fillBreaks(text, rangeStart, rangeEnd, foundBreaks.getBuffer()+beforeSize, 358 additionalCapacity, status); 359 if (U_FAILURE(status)) return 0; 360 foundBreaks.setSize(beforeSize + result); 361 utext_setNativeIndex(text, current); 362 return result; 363 } 364 365 U_NAMESPACE_END 366 367 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */