unesctrn.cpp (9564B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (c) 2001-2011, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 * Date Name Description 9 * 11/19/2001 aliu Creation. 10 ********************************************************************** 11 */ 12 13 #include "unicode/utypes.h" 14 15 #if !UCONFIG_NO_TRANSLITERATION 16 17 #include "unicode/uchar.h" 18 #include "unicode/utf16.h" 19 #include "unesctrn.h" 20 #include "util.h" 21 22 #include "cmemory.h" 23 24 U_NAMESPACE_BEGIN 25 26 /** 27 * Special character marking the end of the spec[] array. 28 */ 29 static const char16_t END = 0xFFFF; 30 31 // Unicode: "U+10FFFF" hex, min=4, max=6 32 static const char16_t SPEC_Unicode[] = { 33 2, 0, 16, 4, 6, 85/*U*/, 43/*+*/, 34 END 35 }; 36 37 // Java: "\\uFFFF" hex, min=4, max=4 38 static const char16_t SPEC_Java[] = { 39 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/, 40 END 41 }; 42 43 // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8 44 static const char16_t SPEC_C[] = { 45 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/, 46 2, 0, 16, 8, 8, 92/*\*/, 85/*U*/, 47 END 48 }; 49 50 // XML: "" hex, min=1, max=6 51 static const char16_t SPEC_XML[] = { 52 3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/, 53 END 54 }; 55 56 // XML10: "" dec, min=1, max=7 (not really "Hex-Any") 57 static const char16_t SPEC_XML10[] = { 58 2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/, 59 END 60 }; 61 62 // Perl: "\\x{263A}" hex, min=1, max=6 63 static const char16_t SPEC_Perl[] = { 64 3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, 65 END 66 }; 67 68 // All: Java, C, Perl, XML, XML10, Unicode 69 static const char16_t SPEC_Any[] = { 70 2, 0, 16, 4, 6, 85/*U*/, 43/*+*/, // Unicode 71 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/, // Java 72 2, 0, 16, 8, 8, 92/*\*/, 85/*U*/, // C (surrogates) 73 3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/, // XML 74 2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/, // XML10 75 3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, // Perl 76 END 77 }; 78 79 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator) 80 81 static char16_t* copySpec(const char16_t* spec) { 82 int32_t len = 0; 83 while (spec[len] != END) { 84 ++len; 85 } 86 ++len; 87 char16_t* result = static_cast<char16_t*>(uprv_malloc(len * sizeof(char16_t))); 88 // Check for memory allocation error. 89 if (result != nullptr) { 90 uprv_memcpy(result, spec, (size_t)len*sizeof(result[0])); 91 } 92 return result; 93 } 94 95 /** 96 * Factory methods. Ignore the context. 97 */ 98 static Transliterator* _createUnicode(const UnicodeString& ID, Transliterator::Token /*context*/) { 99 return new UnescapeTransliterator(ID, SPEC_Unicode); 100 } 101 static Transliterator* _createJava(const UnicodeString& ID, Transliterator::Token /*context*/) { 102 return new UnescapeTransliterator(ID, SPEC_Java); 103 } 104 static Transliterator* _createC(const UnicodeString& ID, Transliterator::Token /*context*/) { 105 return new UnescapeTransliterator(ID, SPEC_C); 106 } 107 static Transliterator* _createXML(const UnicodeString& ID, Transliterator::Token /*context*/) { 108 return new UnescapeTransliterator(ID, SPEC_XML); 109 } 110 static Transliterator* _createXML10(const UnicodeString& ID, Transliterator::Token /*context*/) { 111 return new UnescapeTransliterator(ID, SPEC_XML10); 112 } 113 static Transliterator* _createPerl(const UnicodeString& ID, Transliterator::Token /*context*/) { 114 return new UnescapeTransliterator(ID, SPEC_Perl); 115 } 116 static Transliterator* _createAny(const UnicodeString& ID, Transliterator::Token /*context*/) { 117 return new UnescapeTransliterator(ID, SPEC_Any); 118 } 119 120 /** 121 * Registers standard variants with the system. Called by 122 * Transliterator during initialization. 123 */ 124 void UnescapeTransliterator::registerIDs() { 125 Token t = integerToken(0); 126 127 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode, t); 128 129 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava, t); 130 131 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC, t); 132 133 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML, t); 134 135 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10, t); 136 137 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl, t); 138 139 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any"), _createAny, t); 140 } 141 142 /** 143 * Constructor. Takes the encoded spec array. 144 */ 145 UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& newID, 146 const char16_t *newSpec) : 147 Transliterator(newID, nullptr) 148 { 149 this->spec = copySpec(newSpec); 150 } 151 152 /** 153 * Copy constructor. 154 */ 155 UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o) : 156 Transliterator(o) { 157 this->spec = copySpec(o.spec); 158 } 159 160 UnescapeTransliterator::~UnescapeTransliterator() { 161 uprv_free(spec); 162 } 163 164 /** 165 * Transliterator API. 166 */ 167 UnescapeTransliterator* UnescapeTransliterator::clone() const { 168 return new UnescapeTransliterator(*this); 169 } 170 171 /** 172 * Implements {@link Transliterator#handleTransliterate}. 173 */ 174 void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos, 175 UBool isIncremental) const { 176 int32_t start = pos.start; 177 int32_t limit = pos.limit; 178 int32_t i, ipat; 179 180 while (start < limit) { 181 // Loop over the forms in spec[]. Exit this loop when we 182 // match one of the specs. Exit the outer loop if a 183 // partial match is detected and isIncremental is true. 184 for (ipat=0; spec[ipat] != END;) { 185 186 // Read the header 187 int32_t prefixLen = spec[ipat++]; 188 int32_t suffixLen = spec[ipat++]; 189 int8_t radix = static_cast<int8_t>(spec[ipat++]); 190 int32_t minDigits = spec[ipat++]; 191 int32_t maxDigits = spec[ipat++]; 192 193 // s is a copy of start that is advanced over the 194 // characters as we parse them. 195 int32_t s = start; 196 UBool match = true; 197 198 for (i=0; i<prefixLen; ++i) { 199 if (s >= limit) { 200 if (i > 0) { 201 // We've already matched a character. This is 202 // a partial match, so we return if in 203 // incremental mode. In non-incremental mode, 204 // go to the next spec. 205 if (isIncremental) { 206 goto exit; 207 } 208 match = false; 209 break; 210 } 211 } 212 char16_t c = text.charAt(s++); 213 if (c != spec[ipat + i]) { 214 match = false; 215 break; 216 } 217 } 218 219 if (match) { 220 UChar32 u = 0; 221 int32_t digitCount = 0; 222 for (;;) { 223 if (s >= limit) { 224 // Check for partial match in incremental mode. 225 if (s > start && isIncremental) { 226 goto exit; 227 } 228 break; 229 } 230 UChar32 ch = text.char32At(s); 231 int32_t digit = u_digit(ch, radix); 232 if (digit < 0) { 233 break; 234 } 235 s += U16_LENGTH(ch); 236 u = (u * radix) + digit; 237 if (++digitCount == maxDigits) { 238 break; 239 } 240 } 241 242 match = (digitCount >= minDigits); 243 244 if (match) { 245 for (i=0; i<suffixLen; ++i) { 246 if (s >= limit) { 247 // Check for partial match in incremental mode. 248 if (s > start && isIncremental) { 249 goto exit; 250 } 251 match = false; 252 break; 253 } 254 char16_t c = text.charAt(s++); 255 if (c != spec[ipat + prefixLen + i]) { 256 match = false; 257 break; 258 } 259 } 260 261 if (match) { 262 // At this point, we have a match 263 UnicodeString str(u); 264 text.handleReplaceBetween(start, s, str); 265 limit -= s - start - str.length(); 266 // The following break statement leaves the 267 // loop that is traversing the forms in 268 // spec[]. We then parse the next input 269 // character. 270 break; 271 } 272 } 273 } 274 275 ipat += prefixLen + suffixLen; 276 } 277 278 if (start < limit) { 279 start += U16_LENGTH(text.char32At(start)); 280 } 281 } 282 283 exit: 284 pos.contextLimit += limit - pos.limit; 285 pos.limit = limit; 286 pos.start = start; 287 } 288 289 U_NAMESPACE_END 290 291 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ 292 293 //eof