uniset_closure.cpp (12080B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2011, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: uniset_closure.cpp 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2011may30 16 * created by: Markus W. Scherer 17 * 18 * UnicodeSet::closeOver() and related methods moved here from uniset_props.cpp 19 * to simplify dependencies. 20 * In particular, this depends on the BreakIterator, but the BreakIterator 21 * code also builds UnicodeSets from patterns and needs uniset_props. 22 */ 23 24 #include "unicode/brkiter.h" 25 #include "unicode/locid.h" 26 #include "unicode/parsepos.h" 27 #include "unicode/uniset.h" 28 #include "unicode/utf16.h" 29 #include "cmemory.h" 30 #include "ruleiter.h" 31 #include "ucase.h" 32 #include "uprops.h" 33 #include "util.h" 34 #include "uvector.h" 35 36 U_NAMESPACE_BEGIN 37 38 // TODO memory debugging provided inside uniset.cpp 39 // could be made available here but probably obsolete with use of modern 40 // memory leak checker tools 41 #define _dbgct(me) 42 43 //---------------------------------------------------------------- 44 // Constructors &c 45 //---------------------------------------------------------------- 46 47 UnicodeSet::UnicodeSet(const UnicodeString& pattern, 48 uint32_t options, 49 const SymbolTable* symbols, 50 UErrorCode& status) { 51 applyPattern(pattern, options, symbols, status); 52 _dbgct(this); 53 } 54 55 UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos, 56 uint32_t options, 57 const SymbolTable* symbols, 58 UErrorCode& status) { 59 applyPattern(pattern, pos, options, symbols, status); 60 _dbgct(this); 61 } 62 63 //---------------------------------------------------------------- 64 // Public API 65 //---------------------------------------------------------------- 66 67 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, 68 uint32_t options, 69 const SymbolTable* symbols, 70 UErrorCode& status) { 71 ParsePosition pos(0); 72 applyPattern(pattern, pos, options, symbols, status); 73 if (U_FAILURE(status)) return *this; 74 75 int32_t i = pos.getIndex(); 76 77 if (options & USET_IGNORE_SPACE) { 78 // Skip over trailing whitespace 79 ICU_Utility::skipWhitespace(pattern, i, true); 80 } 81 82 if (i != pattern.length()) { 83 status = U_ILLEGAL_ARGUMENT_ERROR; 84 } 85 return *this; 86 } 87 88 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, 89 ParsePosition& pos, 90 uint32_t options, 91 const SymbolTable* symbols, 92 UErrorCode& status) { 93 if (U_FAILURE(status)) { 94 return *this; 95 } 96 if (isFrozen()) { 97 status = U_NO_WRITE_PERMISSION; 98 return *this; 99 } 100 // Need to build the pattern in a temporary string because 101 // _applyPattern calls add() etc., which set pat to empty. 102 UnicodeString rebuiltPat; 103 RuleCharacterIterator chars(pattern, symbols, pos); 104 applyPattern(chars, symbols, rebuiltPat, options, &UnicodeSet::closeOver, 0, status); 105 if (U_FAILURE(status)) return *this; 106 if (chars.inVariable()) { 107 // syntaxError(chars, "Extra chars in variable value"); 108 status = U_MALFORMED_SET; 109 return *this; 110 } 111 setPattern(rebuiltPat); 112 return *this; 113 } 114 115 // USetAdder implementation 116 // Does not use uset.h to reduce code dependencies 117 static void U_CALLCONV 118 _set_add(USet *set, UChar32 c) { 119 reinterpret_cast<UnicodeSet*>(set)->add(c); 120 } 121 122 static void U_CALLCONV 123 _set_addRange(USet *set, UChar32 start, UChar32 end) { 124 reinterpret_cast<UnicodeSet*>(set)->add(start, end); 125 } 126 127 static void U_CALLCONV 128 _set_addString(USet *set, const char16_t *str, int32_t length) { 129 reinterpret_cast<UnicodeSet*>(set)->add(UnicodeString(static_cast<UBool>(length < 0), str, length)); 130 } 131 132 //---------------------------------------------------------------- 133 // Case folding API 134 //---------------------------------------------------------------- 135 136 // add the result of a full case mapping to the set 137 // use str as a temporary string to avoid constructing one 138 static inline void 139 addCaseMapping(UnicodeSet &set, int32_t result, const char16_t *full, UnicodeString &str) { 140 if(result >= 0) { 141 if(result > UCASE_MAX_STRING_LENGTH) { 142 // add a single-code point case mapping 143 set.add(result); 144 } else { 145 // add a string case mapping from full with length result 146 str.setTo(static_cast<UBool>(false), full, result); 147 set.add(str); 148 } 149 } 150 // result < 0: the code point mapped to itself, no need to add it 151 // see ucase.h 152 } 153 154 namespace { 155 156 /** For case closure on a large set, look only at code points with relevant properties. */ 157 const UnicodeSet &maybeOnlyCaseSensitive(const UnicodeSet &src, UnicodeSet &subset) { 158 // The subset must have been constructed with all code points, 159 // so that the retainAll() intersection effectively copies all single code points from src. 160 U_ASSERT(subset.contains(0, 0x10ffff)); 161 if (src.size() < 30) { 162 return src; 163 } 164 // Return the intersection of the src code points with Case_Sensitive ones. 165 UErrorCode errorCode = U_ZERO_ERROR; 166 const UnicodeSet *sensitive = 167 CharacterProperties::getBinaryPropertySet(UCHAR_CASE_SENSITIVE, errorCode); 168 if (U_FAILURE(errorCode)) { 169 return src; 170 } 171 // Start by copying the "smaller" set. 172 // (We "copy" by intersecting all Unicode *code points* with the first set, 173 // which omits any strings.) 174 if (src.getRangeCount() > sensitive->getRangeCount()) { 175 subset.retainAll(*sensitive); 176 subset.retainAll(src); 177 } else { 178 subset.retainAll(src); 179 subset.retainAll(*sensitive); 180 } 181 return subset; 182 } 183 184 // Per-character scf = Simple_Case_Folding of a string. 185 // (Normally when we case-fold a string we use full case foldings.) 186 bool scfString(const UnicodeString &s, UnicodeString &scf) { 187 // Iterate over the raw buffer for best performance. 188 const char16_t *p = s.getBuffer(); 189 int32_t length = s.length(); 190 // Loop while not needing modification. 191 for (int32_t i = 0; i < length;) { 192 UChar32 c; 193 U16_NEXT(p, i, length, c); // post-increments i 194 UChar32 scfChar = u_foldCase(c, U_FOLD_CASE_DEFAULT); 195 if (scfChar != c) { 196 // Copy the characters before c. 197 scf.setTo(p, i - U16_LENGTH(c)); 198 // Loop over the rest of the string and keep case-folding. 199 for (;;) { 200 scf.append(scfChar); 201 if (i == length) { 202 return true; 203 } 204 U16_NEXT(p, i, length, c); // post-increments i 205 scfChar = u_foldCase(c, U_FOLD_CASE_DEFAULT); 206 } 207 } 208 } 209 return false; 210 } 211 212 } // namespace 213 214 UnicodeSet& UnicodeSet::closeOver(int32_t attribute) { 215 if (isFrozen() || isBogus()) { 216 return *this; 217 } 218 switch (attribute & USET_CASE_MASK) { 219 case 0: 220 break; 221 case USET_CASE_INSENSITIVE: 222 closeOverCaseInsensitive(/* simple= */ false); 223 break; 224 case USET_ADD_CASE_MAPPINGS: 225 closeOverAddCaseMappings(); 226 break; 227 case USET_SIMPLE_CASE_INSENSITIVE: 228 closeOverCaseInsensitive(/* simple= */ true); 229 break; 230 default: 231 // bad option (unreachable) 232 break; 233 } 234 return *this; 235 } 236 237 void UnicodeSet::closeOverCaseInsensitive(bool simple) { 238 // Start with input set to guarantee inclusion. 239 UnicodeSet foldSet(*this); 240 // Full case mappings closure: 241 // Remove strings because the strings will actually be reduced (folded); 242 // therefore, start with no strings and add only those needed. 243 // Do this before processing code points, because they may add strings. 244 if (!simple && foldSet.hasStrings()) { 245 foldSet.strings_->removeAllElements(); 246 } 247 248 USetAdder sa = { 249 foldSet.toUSet(), 250 _set_add, 251 _set_addRange, 252 _set_addString, 253 nullptr, // don't need remove() 254 nullptr // don't need removeRange() 255 }; 256 257 UnicodeSet subset(0, 0x10ffff); 258 const UnicodeSet &codePoints = maybeOnlyCaseSensitive(*this, subset); 259 260 // Iterate over the ranges of single code points. Nested loop for each code point. 261 int32_t n = codePoints.getRangeCount(); 262 263 for (int32_t i=0; i<n; ++i) { 264 UChar32 start = codePoints.getRangeStart(i); 265 UChar32 end = codePoints.getRangeEnd(i); 266 267 if (simple) { 268 for (UChar32 cp=start; cp<=end; ++cp) { 269 ucase_addSimpleCaseClosure(cp, &sa); 270 } 271 } else { 272 for (UChar32 cp=start; cp<=end; ++cp) { 273 ucase_addCaseClosure(cp, &sa); 274 } 275 } 276 } 277 if (hasStrings()) { 278 UnicodeString str; 279 for (int32_t j=0; j<strings_->size(); ++j) { 280 const UnicodeString* pStr = static_cast<const UnicodeString*>(strings_->elementAt(j)); 281 if (simple) { 282 if (scfString(*pStr, str)) { 283 foldSet.remove(*pStr).add(str); 284 } 285 } else { 286 str = *pStr; 287 str.foldCase(); 288 if(!ucase_addStringCaseClosure(str.getBuffer(), str.length(), &sa)) { 289 foldSet.add(str); // does not map to code points: add the folded string itself 290 } 291 } 292 } 293 } 294 *this = foldSet; 295 } 296 297 void UnicodeSet::closeOverAddCaseMappings() { 298 // Start with input set to guarantee inclusion. 299 UnicodeSet foldSet(*this); 300 301 UnicodeSet subset(0, 0x10ffff); 302 const UnicodeSet &codePoints = maybeOnlyCaseSensitive(*this, subset); 303 304 // Iterate over the ranges of single code points. Nested loop for each code point. 305 int32_t n = codePoints.getRangeCount(); 306 UChar32 result; 307 const char16_t *full; 308 UnicodeString str; 309 310 for (int32_t i=0; i<n; ++i) { 311 UChar32 start = codePoints.getRangeStart(i); 312 UChar32 end = codePoints.getRangeEnd(i); 313 314 // add case mappings 315 // (does not add long s for regular s, or Kelvin for k, for example) 316 for (UChar32 cp=start; cp<=end; ++cp) { 317 result = ucase_toFullLower(cp, nullptr, nullptr, &full, UCASE_LOC_ROOT); 318 addCaseMapping(foldSet, result, full, str); 319 320 result = ucase_toFullTitle(cp, nullptr, nullptr, &full, UCASE_LOC_ROOT); 321 addCaseMapping(foldSet, result, full, str); 322 323 result = ucase_toFullUpper(cp, nullptr, nullptr, &full, UCASE_LOC_ROOT); 324 addCaseMapping(foldSet, result, full, str); 325 326 result = ucase_toFullFolding(cp, &full, 0); 327 addCaseMapping(foldSet, result, full, str); 328 } 329 } 330 if (hasStrings()) { 331 Locale root(""); 332 #if !UCONFIG_NO_BREAK_ITERATION 333 UErrorCode status = U_ZERO_ERROR; 334 BreakIterator *bi = BreakIterator::createWordInstance(root, status); 335 if (U_SUCCESS(status)) { 336 #endif 337 for (int32_t j=0; j<strings_->size(); ++j) { 338 const UnicodeString* pStr = static_cast<const UnicodeString*>(strings_->elementAt(j)); 339 (str = *pStr).toLower(root); 340 foldSet.add(str); 341 #if !UCONFIG_NO_BREAK_ITERATION 342 (str = *pStr).toTitle(bi, root); 343 foldSet.add(str); 344 #endif 345 (str = *pStr).toUpper(root); 346 foldSet.add(str); 347 (str = *pStr).foldCase(); 348 foldSet.add(str); 349 } 350 #if !UCONFIG_NO_BREAK_ITERATION 351 } 352 delete bi; 353 #endif 354 } 355 *this = foldSet; 356 } 357 358 U_NAMESPACE_END