rbt.cpp (10647B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (C) 1999-2015, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 * Date Name Description 9 * 11/17/99 aliu Creation. 10 ********************************************************************** 11 */ 12 13 #include "unicode/utypes.h" 14 15 #if !UCONFIG_NO_TRANSLITERATION 16 17 #include "unicode/rep.h" 18 #include "unicode/uniset.h" 19 #include "rbt_pars.h" 20 #include "rbt_data.h" 21 #include "rbt_rule.h" 22 #include "rbt.h" 23 #include "mutex.h" 24 #include "umutex.h" 25 26 U_NAMESPACE_BEGIN 27 28 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedTransliterator) 29 30 static Replaceable *gLockedText = nullptr; 31 32 void RuleBasedTransliterator::_construct(const UnicodeString& rules, 33 UTransDirection direction, 34 UParseError& parseError, 35 UErrorCode& status) { 36 fData = nullptr; 37 isDataOwned = true; 38 if (U_FAILURE(status)) { 39 return; 40 } 41 42 TransliteratorParser parser(status); 43 parser.parse(rules, direction, parseError, status); 44 if (U_FAILURE(status)) { 45 return; 46 } 47 48 if (parser.idBlockVector.size() != 0 || 49 parser.compoundFilter != nullptr || 50 parser.dataVector.size() == 0) { 51 status = U_INVALID_RBT_SYNTAX; // ::ID blocks disallowed in RBT 52 return; 53 } 54 55 fData = static_cast<TransliterationRuleData*>(parser.dataVector.orphanElementAt(0)); 56 setMaximumContextLength(fData->ruleSet.getMaximumContextLength()); 57 } 58 59 /** 60 * Constructs a new transliterator from the given rules. 61 * @param id the id for the transliterator. 62 * @param rules rules, separated by ';' 63 * @param direction either FORWARD or REVERSE. 64 * @param adoptedFilter the filter for this transliterator. 65 * @param parseError Struct to receive information on position 66 * of error if an error is encountered 67 * @param status Output param set to success/failure code. 68 * @exception IllegalArgumentException if rules are malformed 69 * or direction is invalid. 70 */ 71 RuleBasedTransliterator::RuleBasedTransliterator( 72 const UnicodeString& id, 73 const UnicodeString& rules, 74 UTransDirection direction, 75 UnicodeFilter* adoptedFilter, 76 UParseError& parseError, 77 UErrorCode& status) : 78 Transliterator(id, adoptedFilter) { 79 _construct(rules, direction,parseError,status); 80 } 81 82 /** 83 * Constructs a new transliterator from the given rules. 84 * @param id the id for the transliterator. 85 * @param rules rules, separated by ';' 86 * @param direction either FORWARD or REVERSE. 87 * @param adoptedFilter the filter for this transliterator. 88 * @param status Output param set to success/failure code. 89 * @exception IllegalArgumentException if rules are malformed 90 * or direction is invalid. 91 */ 92 /*RuleBasedTransliterator::RuleBasedTransliterator( 93 const UnicodeString& id, 94 const UnicodeString& rules, 95 UTransDirection direction, 96 UnicodeFilter* adoptedFilter, 97 UErrorCode& status) : 98 Transliterator(id, adoptedFilter) { 99 UParseError parseError; 100 _construct(rules, direction,parseError, status); 101 }*/ 102 103 /** 104 * Convenience constructor with no filter. 105 */ 106 /*RuleBasedTransliterator::RuleBasedTransliterator( 107 const UnicodeString& id, 108 const UnicodeString& rules, 109 UTransDirection direction, 110 UErrorCode& status) : 111 Transliterator(id, 0) { 112 UParseError parseError; 113 _construct(rules, direction,parseError, status); 114 }*/ 115 116 /** 117 * Convenience constructor with no filter and FORWARD direction. 118 */ 119 /*RuleBasedTransliterator::RuleBasedTransliterator( 120 const UnicodeString& id, 121 const UnicodeString& rules, 122 UErrorCode& status) : 123 Transliterator(id, 0) { 124 UParseError parseError; 125 _construct(rules, UTRANS_FORWARD, parseError, status); 126 }*/ 127 128 /** 129 * Convenience constructor with FORWARD direction. 130 */ 131 /*RuleBasedTransliterator::RuleBasedTransliterator( 132 const UnicodeString& id, 133 const UnicodeString& rules, 134 UnicodeFilter* adoptedFilter, 135 UErrorCode& status) : 136 Transliterator(id, adoptedFilter) { 137 UParseError parseError; 138 _construct(rules, UTRANS_FORWARD,parseError, status); 139 }*/ 140 141 RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id, 142 const TransliterationRuleData* theData, 143 UnicodeFilter* adoptedFilter) : 144 Transliterator(id, adoptedFilter), 145 fData(const_cast<TransliterationRuleData*>(theData)), // cast away const 146 isDataOwned(false) { 147 setMaximumContextLength(fData->ruleSet.getMaximumContextLength()); 148 } 149 150 /** 151 * Internal constructor. 152 */ 153 RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id, 154 TransliterationRuleData* theData, 155 UBool isDataAdopted) : 156 Transliterator(id, nullptr), 157 fData(theData), 158 isDataOwned(isDataAdopted) { 159 setMaximumContextLength(fData->ruleSet.getMaximumContextLength()); 160 } 161 162 /** 163 * Copy constructor. 164 */ 165 RuleBasedTransliterator::RuleBasedTransliterator( 166 const RuleBasedTransliterator& other) : 167 Transliterator(other), fData(other.fData), 168 isDataOwned(other.isDataOwned) { 169 170 // The data object may or may not be owned. If it is not owned we 171 // share it; it is invariant. If it is owned, it's still 172 // invariant, but we need to copy it to prevent double-deletion. 173 // If this becomes a performance issue (if people do a lot of RBT 174 // copying -- unlikely) we can reference count the data object. 175 176 // Only do a deep copy if this is owned data, that is, data that 177 // will be later deleted. System transliterators contain 178 // non-owned data. 179 if (isDataOwned) { 180 fData = new TransliterationRuleData(*other.fData); 181 } 182 } 183 184 /** 185 * Destructor. 186 */ 187 RuleBasedTransliterator::~RuleBasedTransliterator() { 188 // Delete the data object only if we own it. 189 if (isDataOwned) { 190 delete fData; 191 } 192 } 193 194 RuleBasedTransliterator* 195 RuleBasedTransliterator::clone() const { 196 return new RuleBasedTransliterator(*this); 197 } 198 199 /** 200 * Implements {@link Transliterator#handleTransliterate}. 201 */ 202 void 203 RuleBasedTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index, 204 UBool isIncremental) const { 205 /* We keep contextStart and contextLimit fixed the entire time, 206 * relative to the text -- contextLimit may move numerically if 207 * text is inserted or removed. The start offset moves toward 208 * limit, with replacements happening under it. 209 * 210 * Example: rules 1. ab>x|y 211 * 2. yc>z 212 * 213 * |eabcd begin - no match, advance start 214 * e|abcd match rule 1 - change text & adjust start 215 * ex|ycd match rule 2 - change text & adjust start 216 * exz|d no match, advance start 217 * exzd| done 218 */ 219 220 /* A rule like 221 * a>b|a 222 * creates an infinite loop. To prevent that, we put an arbitrary 223 * limit on the number of iterations that we take, one that is 224 * high enough that any reasonable rules are ok, but low enough to 225 * prevent a server from hanging. The limit is 16 times the 226 * number of characters n, unless n is so large that 16n exceeds a 227 * uint32_t. 228 */ 229 uint32_t loopCount = 0; 230 uint32_t loopLimit = index.limit - index.start; 231 if (loopLimit >= 0x10000000) { 232 loopLimit = 0xFFFFFFFF; 233 } else { 234 loopLimit <<= 4; 235 } 236 237 // Transliterator locking. Rule-based Transliterators are not thread safe; concurrent 238 // operations must be prevented. 239 // A Complication: compound transliterators can result in recursive entries to this 240 // function, sometimes with different "This" objects, always with the same text. 241 // Double-locking must be prevented in these cases. 242 // 243 244 UBool lockedMutexAtThisLevel = false; 245 246 // Test whether this request is operating on the same text string as 247 // some other transliteration that is still in progress and holding the 248 // transliteration mutex. If so, do not lock the transliteration 249 // mutex again. 250 // 251 // gLockedText variable is protected by the global ICU mutex. 252 // Shared RBT data protected by transliteratorDataMutex. 253 // 254 // TODO(andy): Need a better scheme for handling this. 255 256 static UMutex transliteratorDataMutex; 257 UBool needToLock; 258 { 259 Mutex m; 260 needToLock = (&text != gLockedText); 261 } 262 if (needToLock) { 263 umtx_lock(&transliteratorDataMutex); // Contention, longish waits possible here. 264 Mutex m; 265 gLockedText = &text; 266 lockedMutexAtThisLevel = true; 267 } 268 269 // Check to make sure we don't dereference a null pointer. 270 if (fData != nullptr) { 271 while (index.start < index.limit && 272 loopCount <= loopLimit && 273 fData->ruleSet.transliterate(text, index, isIncremental)) { 274 ++loopCount; 275 } 276 } 277 if (lockedMutexAtThisLevel) { 278 { 279 Mutex m; 280 gLockedText = nullptr; 281 } 282 umtx_unlock(&transliteratorDataMutex); 283 } 284 } 285 286 UnicodeString& RuleBasedTransliterator::toRules(UnicodeString& rulesSource, 287 UBool escapeUnprintable) const { 288 return fData->ruleSet.toRules(rulesSource, escapeUnprintable); 289 } 290 291 /** 292 * Implement Transliterator framework 293 */ 294 void RuleBasedTransliterator::handleGetSourceSet(UnicodeSet& result) const { 295 fData->ruleSet.getSourceTargetSet(result, false); 296 } 297 298 /** 299 * Override Transliterator framework 300 */ 301 UnicodeSet& RuleBasedTransliterator::getTargetSet(UnicodeSet& result) const { 302 return fData->ruleSet.getSourceTargetSet(result, true); 303 } 304 305 U_NAMESPACE_END 306 307 #endif /* #if !UCONFIG_NO_TRANSLITERATION */