rbt_set.cpp (16003B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (C) 1999-2011, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 * Date Name Description 9 * 11/17/99 aliu Creation. 10 ********************************************************************** 11 */ 12 13 #include "unicode/utypes.h" 14 15 #if !UCONFIG_NO_TRANSLITERATION 16 17 #include "unicode/unistr.h" 18 #include "unicode/uniset.h" 19 #include "unicode/utf16.h" 20 #include "rbt_set.h" 21 #include "rbt_rule.h" 22 #include "cmemory.h" 23 #include "putilimp.h" 24 25 U_CDECL_BEGIN 26 static void U_CALLCONV _deleteRule(void *rule) { 27 delete (icu::TransliterationRule *)rule; 28 } 29 U_CDECL_END 30 31 //---------------------------------------------------------------------- 32 // BEGIN Debugging support 33 //---------------------------------------------------------------------- 34 35 // #define DEBUG_RBT 36 37 #ifdef DEBUG_RBT 38 #include <stdio.h> 39 #include "charstr.h" 40 41 /** 42 * @param appendTo result is appended to this param. 43 * @param input the string being transliterated 44 * @param pos the index struct 45 */ 46 static UnicodeString& _formatInput(UnicodeString &appendTo, 47 const UnicodeString& input, 48 const UTransPosition& pos) { 49 // Output a string of the form aaa{bbb|ccc|ddd}eee, where 50 // the {} indicate the context start and limit, and the || 51 // indicate the start and limit. 52 if (0 <= pos.contextStart && 53 pos.contextStart <= pos.start && 54 pos.start <= pos.limit && 55 pos.limit <= pos.contextLimit && 56 pos.contextLimit <= input.length()) { 57 58 UnicodeString a, b, c, d, e; 59 input.extractBetween(0, pos.contextStart, a); 60 input.extractBetween(pos.contextStart, pos.start, b); 61 input.extractBetween(pos.start, pos.limit, c); 62 input.extractBetween(pos.limit, pos.contextLimit, d); 63 input.extractBetween(pos.contextLimit, input.length(), e); 64 appendTo.append(a).append((char16_t)123/*{*/).append(b). 65 append((char16_t)124/*|*/).append(c).append((char16_t)124/*|*/).append(d). 66 append((char16_t)125/*}*/).append(e); 67 } else { 68 appendTo.append("INVALID UTransPosition"); 69 //appendTo.append((UnicodeString)"INVALID UTransPosition {cs=" + 70 // pos.contextStart + ", s=" + pos.start + ", l=" + 71 // pos.limit + ", cl=" + pos.contextLimit + "} on " + 72 // input); 73 } 74 return appendTo; 75 } 76 77 // Append a hex string to the target 78 UnicodeString& _appendHex(uint32_t number, 79 int32_t digits, 80 UnicodeString& target) { 81 static const char16_t digitString[] = { 82 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 83 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0 84 }; 85 while (digits--) { 86 target += digitString[(number >> (digits*4)) & 0xF]; 87 } 88 return target; 89 } 90 91 // Replace nonprintable characters with unicode escapes 92 UnicodeString& _escape(const UnicodeString &source, 93 UnicodeString &target) { 94 for (int32_t i = 0; i < source.length(); ) { 95 UChar32 ch = source.char32At(i); 96 i += U16_LENGTH(ch); 97 if (ch < 0x09 || (ch > 0x0A && ch < 0x20)|| ch > 0x7E) { 98 if (ch <= 0xFFFF) { 99 target += "\\u"; 100 _appendHex(ch, 4, target); 101 } else { 102 target += "\\U"; 103 _appendHex(ch, 8, target); 104 } 105 } else { 106 target += ch; 107 } 108 } 109 return target; 110 } 111 112 inline void _debugOut(const char* msg, TransliterationRule* rule, 113 const Replaceable& theText, UTransPosition& pos) { 114 UnicodeString buf(msg, ""); 115 if (rule) { 116 UnicodeString r; 117 rule->toRule(r, true); 118 buf.append((char16_t)32).append(r); 119 } 120 buf.append(UnicodeString(" => ", "")); 121 UnicodeString* text = (UnicodeString*)&theText; 122 _formatInput(buf, *text, pos); 123 UnicodeString esc; 124 _escape(buf, esc); 125 CharString cbuf(esc); 126 printf("%s\n", (const char*) cbuf); 127 } 128 129 #else 130 #define _debugOut(msg, rule, theText, pos) 131 #endif 132 133 //---------------------------------------------------------------------- 134 // END Debugging support 135 //---------------------------------------------------------------------- 136 137 // Fill the precontext and postcontext with the patterns of the rules 138 // that are masking one another. 139 static void maskingError(const icu::TransliterationRule& rule1, 140 const icu::TransliterationRule& rule2, 141 UParseError& parseError) { 142 icu::UnicodeString r; 143 int32_t len; 144 145 parseError.line = parseError.offset = -1; 146 147 // for pre-context 148 rule1.toRule(r, false); 149 len = uprv_min(r.length(), U_PARSE_CONTEXT_LEN-1); 150 r.extract(0, len, parseError.preContext); 151 parseError.preContext[len] = 0; 152 153 //for post-context 154 r.truncate(0); 155 rule2.toRule(r, false); 156 len = uprv_min(r.length(), U_PARSE_CONTEXT_LEN-1); 157 r.extract(0, len, parseError.postContext); 158 parseError.postContext[len] = 0; 159 } 160 161 U_NAMESPACE_BEGIN 162 163 /** 164 * Construct a new empty rule set. 165 */ 166 TransliterationRuleSet::TransliterationRuleSet(UErrorCode& status) : 167 UMemory(), ruleVector(nullptr), rules(nullptr), index {}, maxContextLength(0) { 168 LocalPointer<UVector> lpRuleVector(new UVector(_deleteRule, nullptr, status), status); 169 if (U_FAILURE(status)) { 170 return; 171 } 172 ruleVector = lpRuleVector.orphan(); 173 } 174 175 /** 176 * Copy constructor. 177 */ 178 TransliterationRuleSet::TransliterationRuleSet(const TransliterationRuleSet& other) : 179 UMemory(other), 180 ruleVector(nullptr), 181 rules(nullptr), 182 maxContextLength(other.maxContextLength) { 183 184 int32_t i, len; 185 uprv_memcpy(index, other.index, sizeof(index)); 186 UErrorCode status = U_ZERO_ERROR; 187 LocalPointer<UVector> lpRuleVector(new UVector(_deleteRule, nullptr, status), status); 188 if (U_FAILURE(status)) { 189 return; 190 } 191 ruleVector = lpRuleVector.orphan(); 192 if (other.ruleVector != nullptr && U_SUCCESS(status)) { 193 len = other.ruleVector->size(); 194 for (i=0; i<len && U_SUCCESS(status); ++i) { 195 LocalPointer<TransliterationRule> tempTranslitRule( 196 new TransliterationRule(*static_cast<TransliterationRule*>(other.ruleVector->elementAt(i))), status); 197 ruleVector->adoptElement(tempTranslitRule.orphan(), status); 198 } 199 } 200 if (other.rules != nullptr && U_SUCCESS(status)) { 201 UParseError p; 202 freeze(p, status); 203 } 204 } 205 206 /** 207 * Destructor. 208 */ 209 TransliterationRuleSet::~TransliterationRuleSet() { 210 delete ruleVector; // This deletes the contained rules 211 uprv_free(rules); 212 } 213 214 void TransliterationRuleSet::setData(const TransliterationRuleData* d) { 215 /** 216 * We assume that the ruleset has already been frozen. 217 */ 218 int32_t len = index[256]; // see freeze() 219 for (int32_t i=0; i<len; ++i) { 220 rules[i]->setData(d); 221 } 222 } 223 224 /** 225 * Return the maximum context length. 226 * @return the length of the longest preceding context. 227 */ 228 int32_t TransliterationRuleSet::getMaximumContextLength() const { 229 return maxContextLength; 230 } 231 232 /** 233 * Add a rule to this set. Rules are added in order, and order is 234 * significant. The last call to this method must be followed by 235 * a call to <code>freeze()</code> before the rule set is used. 236 * 237 * <p>If freeze() has already been called, calling addRule() 238 * unfreezes the rules, and freeze() must be called again. 239 * 240 * @param adoptedRule the rule to add 241 */ 242 void TransliterationRuleSet::addRule(TransliterationRule* adoptedRule, 243 UErrorCode& status) { 244 LocalPointer<TransliterationRule> lpAdoptedRule(adoptedRule); 245 ruleVector->adoptElement(lpAdoptedRule.orphan(), status); 246 if (U_FAILURE(status)) { 247 return; 248 } 249 250 int32_t len; 251 if ((len = adoptedRule->getContextLength()) > maxContextLength) { 252 maxContextLength = len; 253 } 254 255 uprv_free(rules); 256 rules = nullptr; 257 } 258 259 /** 260 * Check this for masked rules and index it to optimize performance. 261 * The sequence of operations is: (1) add rules to a set using 262 * <code>addRule()</code>; (2) freeze the set using 263 * <code>freeze()</code>; (3) use the rule set. If 264 * <code>addRule()</code> is called after calling this method, it 265 * invalidates this object, and this method must be called again. 266 * That is, <code>freeze()</code> may be called multiple times, 267 * although for optimal performance it shouldn't be. 268 */ 269 void TransliterationRuleSet::freeze(UParseError& parseError,UErrorCode& status) { 270 /* Construct the rule array and index table. We reorder the 271 * rules by sorting them into 256 bins. Each bin contains all 272 * rules matching the index value for that bin. A rule 273 * matches an index value if string whose first key character 274 * has a low byte equal to the index value can match the rule. 275 * 276 * Each bin contains zero or more rules, in the same order 277 * they were found originally. However, the total rules in 278 * the bins may exceed the number in the original vector, 279 * since rules that have a variable as their first key 280 * character will generally fall into more than one bin. 281 * 282 * That is, each bin contains all rules that either have that 283 * first index value as their first key character, or have 284 * a set containing the index value as their first character. 285 */ 286 int32_t n = ruleVector->size(); 287 int32_t j; 288 int16_t x; 289 UVector v(2*n, status); // heuristic; adjust as needed 290 291 if (U_FAILURE(status)) { 292 return; 293 } 294 295 /* Precompute the index values. This saves a LOT of time. 296 * Be careful not to call malloc(0). 297 */ 298 int16_t* indexValue = static_cast<int16_t*>(uprv_malloc(sizeof(int16_t) * (n > 0 ? n : 1))); 299 /* test for nullptr */ 300 if (indexValue == nullptr) { 301 status = U_MEMORY_ALLOCATION_ERROR; 302 return; 303 } 304 for (j=0; j<n; ++j) { 305 TransliterationRule* r = static_cast<TransliterationRule*>(ruleVector->elementAt(j)); 306 indexValue[j] = r->getIndexValue(); 307 } 308 for (x=0; x<256; ++x) { 309 index[x] = v.size(); 310 for (j=0; j<n; ++j) { 311 if (indexValue[j] >= 0) { 312 if (indexValue[j] == x) { 313 v.addElement(ruleVector->elementAt(j), status); 314 } 315 } else { 316 // If the indexValue is < 0, then the first key character is 317 // a set, and we must use the more time-consuming 318 // matchesIndexValue check. In practice this happens 319 // rarely, so we seldom treat this code path. 320 TransliterationRule* r = static_cast<TransliterationRule*>(ruleVector->elementAt(j)); 321 if (r->matchesIndexValue(static_cast<uint8_t>(x))) { 322 v.addElement(r, status); 323 } 324 } 325 } 326 } 327 uprv_free(indexValue); 328 index[256] = v.size(); 329 if (U_FAILURE(status)) { 330 return; 331 } 332 333 /* Freeze things into an array. 334 */ 335 uprv_free(rules); // Contains alias pointers 336 337 /* You can't do malloc(0)! */ 338 if (v.size() == 0) { 339 rules = nullptr; 340 return; 341 } 342 rules = static_cast<TransliterationRule**>(uprv_malloc(v.size() * sizeof(TransliterationRule*))); 343 /* test for nullptr */ 344 if (rules == nullptr) { 345 status = U_MEMORY_ALLOCATION_ERROR; 346 return; 347 } 348 for (j=0; j<v.size(); ++j) { 349 rules[j] = static_cast<TransliterationRule*>(v.elementAt(j)); 350 } 351 352 // TODO Add error reporting that indicates the rules that 353 // are being masked. 354 //UnicodeString errors; 355 356 /* Check for masking. This is MUCH faster than our old check, 357 * which was each rule against each following rule, since we 358 * only have to check for masking within each bin now. It's 359 * 256*O(n2^2) instead of O(n1^2), where n1 is the total rule 360 * count, and n2 is the per-bin rule count. But n2<<n1, so 361 * it's a big win. 362 */ 363 for (x=0; x<256; ++x) { 364 for (j=index[x]; j<index[x+1]-1; ++j) { 365 TransliterationRule* r1 = rules[j]; 366 for (int32_t k=j+1; k<index[x+1]; ++k) { 367 TransliterationRule* r2 = rules[k]; 368 if (r1->masks(*r2)) { 369 //| if (errors == null) { 370 //| errors = new StringBuffer(); 371 //| } else { 372 //| errors.append("\n"); 373 //| } 374 //| errors.append("Rule " + r1 + " masks " + r2); 375 status = U_RULE_MASK_ERROR; 376 maskingError(*r1, *r2, parseError); 377 return; 378 } 379 } 380 } 381 } 382 383 //if (errors != null) { 384 // throw new IllegalArgumentException(errors.toString()); 385 //} 386 } 387 388 /** 389 * Transliterate the given text with the given UTransPosition 390 * indices. Return true if the transliteration should continue 391 * or false if it should halt (because of a U_PARTIAL_MATCH match). 392 * Note that false is only ever returned if isIncremental is true. 393 * @param text the text to be transliterated 394 * @param pos the position indices, which will be updated 395 * @param incremental if true, assume new text may be inserted 396 * at index.limit, and return false if there is a partial match. 397 * @return true unless a U_PARTIAL_MATCH has been obtained, 398 * indicating that transliteration should stop until more text 399 * arrives. 400 */ 401 UBool TransliterationRuleSet::transliterate(Replaceable& text, 402 UTransPosition& pos, 403 UBool incremental) { 404 int16_t indexByte = static_cast<int16_t>(text.char32At(pos.start) & 0xFF); 405 for (int32_t i=index[indexByte]; i<index[indexByte+1]; ++i) { 406 UMatchDegree m = rules[i]->matchAndReplace(text, pos, incremental); 407 switch (m) { 408 case U_MATCH: 409 _debugOut("match", rules[i], text, pos); 410 return true; 411 case U_PARTIAL_MATCH: 412 _debugOut("partial match", rules[i], text, pos); 413 return false; 414 default: /* Ram: added default to make GCC happy */ 415 break; 416 } 417 } 418 // No match or partial match from any rule 419 pos.start += U16_LENGTH(text.char32At(pos.start)); 420 _debugOut("no match", nullptr, text, pos); 421 return true; 422 } 423 424 /** 425 * Create rule strings that represents this rule set. 426 */ 427 UnicodeString& TransliterationRuleSet::toRules(UnicodeString& ruleSource, 428 UBool escapeUnprintable) const { 429 int32_t i; 430 int32_t count = ruleVector->size(); 431 ruleSource.truncate(0); 432 for (i=0; i<count; ++i) { 433 if (i != 0) { 434 ruleSource.append(static_cast<char16_t>(0x000A) /*\n*/); 435 } 436 TransliterationRule *r = 437 static_cast<TransliterationRule*>(ruleVector->elementAt(i)); 438 r->toRule(ruleSource, escapeUnprintable); 439 } 440 return ruleSource; 441 } 442 443 /** 444 * Return the set of all characters that may be modified 445 * (getTarget=false) or emitted (getTarget=true) by this set. 446 */ 447 UnicodeSet& TransliterationRuleSet::getSourceTargetSet(UnicodeSet& result, 448 UBool getTarget) const 449 { 450 result.clear(); 451 int32_t count = ruleVector->size(); 452 for (int32_t i=0; i<count; ++i) { 453 TransliterationRule* r = 454 static_cast<TransliterationRule*>(ruleVector->elementAt(i)); 455 if (getTarget) { 456 r->addTargetSetTo(result); 457 } else { 458 r->addSourceSetTo(result); 459 } 460 } 461 return result; 462 } 463 464 U_NAMESPACE_END 465 466 #endif /* #if !UCONFIG_NO_TRANSLITERATION */