tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

unesctrn.cpp (9564B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (c) 2001-2011, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 *   Date        Name        Description
      9 *   11/19/2001  aliu        Creation.
     10 **********************************************************************
     11 */
     12 
     13 #include "unicode/utypes.h"
     14 
     15 #if !UCONFIG_NO_TRANSLITERATION
     16 
     17 #include "unicode/uchar.h"
     18 #include "unicode/utf16.h"
     19 #include "unesctrn.h"
     20 #include "util.h"
     21 
     22 #include "cmemory.h"
     23 
     24 U_NAMESPACE_BEGIN
     25 
     26 /**
     27 * Special character marking the end of the spec[] array.
     28 */
     29 static const char16_t END = 0xFFFF;
     30 
     31 // Unicode: "U+10FFFF" hex, min=4, max=6
     32 static const char16_t SPEC_Unicode[] = {
     33    2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,
     34    END
     35 };
     36 
     37 // Java: "\\uFFFF" hex, min=4, max=4
     38 static const char16_t SPEC_Java[] = {
     39    2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
     40    END
     41 };
     42 
     43 // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8
     44 static const char16_t SPEC_C[] = {
     45    2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
     46    2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,
     47    END
     48 };
     49 
     50 // XML: "" hex, min=1, max=6
     51 static const char16_t SPEC_XML[] = {
     52    3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,
     53    END
     54 };
     55 
     56 // XML10: "" dec, min=1, max=7 (not really "Hex-Any")
     57 static const char16_t SPEC_XML10[] = {
     58    2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,
     59    END
     60 };
     61 
     62 // Perl: "\\x{263A}" hex, min=1, max=6
     63 static const char16_t SPEC_Perl[] = {
     64    3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/,
     65    END
     66 };
     67 
     68 // All: Java, C, Perl, XML, XML10, Unicode
     69 static const char16_t SPEC_Any[] = {
     70    2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,                      // Unicode
     71    2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,                     // Java
     72    2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,                      // C (surrogates)
     73    3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,   // XML
     74    2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,             // XML10
     75    3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, // Perl
     76    END
     77 };
     78 
     79 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator)
     80 
     81 static char16_t* copySpec(const char16_t* spec) {
     82    int32_t len = 0;
     83    while (spec[len] != END) {
     84        ++len;
     85    }
     86    ++len;
     87    char16_t* result = static_cast<char16_t*>(uprv_malloc(len * sizeof(char16_t)));
     88    // Check for memory allocation error. 
     89    if (result != nullptr) {
     90    	uprv_memcpy(result, spec, (size_t)len*sizeof(result[0]));
     91    }
     92    return result;
     93 }
     94 
     95 /**
     96 * Factory methods.  Ignore the context.
     97 */
     98 static Transliterator* _createUnicode(const UnicodeString& ID, Transliterator::Token /*context*/) {
     99    return new UnescapeTransliterator(ID, SPEC_Unicode);
    100 }
    101 static Transliterator* _createJava(const UnicodeString& ID, Transliterator::Token /*context*/) {
    102    return new UnescapeTransliterator(ID, SPEC_Java);
    103 }
    104 static Transliterator* _createC(const UnicodeString& ID, Transliterator::Token /*context*/) {
    105    return new UnescapeTransliterator(ID, SPEC_C);
    106 }
    107 static Transliterator* _createXML(const UnicodeString& ID, Transliterator::Token /*context*/) {
    108    return new UnescapeTransliterator(ID, SPEC_XML);
    109 }
    110 static Transliterator* _createXML10(const UnicodeString& ID, Transliterator::Token /*context*/) {
    111    return new UnescapeTransliterator(ID, SPEC_XML10);
    112 }
    113 static Transliterator* _createPerl(const UnicodeString& ID, Transliterator::Token /*context*/) {
    114    return new UnescapeTransliterator(ID, SPEC_Perl);
    115 }
    116 static Transliterator* _createAny(const UnicodeString& ID, Transliterator::Token /*context*/) {
    117    return new UnescapeTransliterator(ID, SPEC_Any);
    118 }
    119 
    120 /**
    121 * Registers standard variants with the system.  Called by
    122 * Transliterator during initialization.
    123 */
    124 void UnescapeTransliterator::registerIDs() {
    125    Token t = integerToken(0);
    126 
    127    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode, t);
    128 
    129    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava, t);
    130 
    131    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC, t);
    132 
    133    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML, t);
    134 
    135    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10, t);
    136 
    137    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl, t);
    138 
    139    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any"), _createAny, t);
    140 }
    141 
    142 /**
    143 * Constructor.  Takes the encoded spec array.
    144 */
    145 UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& newID,
    146                                               const char16_t *newSpec) :
    147    Transliterator(newID, nullptr)
    148 {
    149    this->spec = copySpec(newSpec);
    150 }
    151 
    152 /**
    153 * Copy constructor.
    154 */
    155 UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o) :
    156    Transliterator(o) {
    157    this->spec = copySpec(o.spec);
    158 }
    159 
    160 UnescapeTransliterator::~UnescapeTransliterator() {
    161    uprv_free(spec);
    162 }
    163 
    164 /**
    165 * Transliterator API.
    166 */
    167 UnescapeTransliterator* UnescapeTransliterator::clone() const {
    168    return new UnescapeTransliterator(*this);
    169 }
    170 
    171 /**
    172 * Implements {@link Transliterator#handleTransliterate}.
    173 */
    174 void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
    175                                                 UBool isIncremental) const {
    176    int32_t start = pos.start;
    177    int32_t limit = pos.limit;
    178    int32_t i, ipat;
    179 
    180    while (start < limit) {
    181        // Loop over the forms in spec[].  Exit this loop when we
    182        // match one of the specs.  Exit the outer loop if a
    183        // partial match is detected and isIncremental is true.
    184        for (ipat=0; spec[ipat] != END;) {
    185 
    186            // Read the header
    187            int32_t prefixLen = spec[ipat++];
    188            int32_t suffixLen = spec[ipat++];
    189            int8_t radix = static_cast<int8_t>(spec[ipat++]);
    190            int32_t minDigits = spec[ipat++];
    191            int32_t maxDigits = spec[ipat++];
    192 
    193            // s is a copy of start that is advanced over the
    194            // characters as we parse them.
    195            int32_t s = start;
    196            UBool match = true;
    197 
    198            for (i=0; i<prefixLen; ++i) {
    199                if (s >= limit) {
    200                    if (i > 0) {
    201                        // We've already matched a character.  This is
    202                        // a partial match, so we return if in
    203                        // incremental mode.  In non-incremental mode,
    204                        // go to the next spec.
    205                        if (isIncremental) {
    206                            goto exit;
    207                        }
    208                        match = false;
    209                        break;
    210                    }
    211                }
    212                char16_t c = text.charAt(s++);
    213                if (c != spec[ipat + i]) {
    214                    match = false;
    215                    break;
    216                }
    217            }
    218 
    219            if (match) {
    220                UChar32 u = 0;
    221                int32_t digitCount = 0;
    222                for (;;) {
    223                    if (s >= limit) {
    224                        // Check for partial match in incremental mode.
    225                        if (s > start && isIncremental) {
    226                            goto exit;
    227                        }
    228                        break;
    229                    }
    230                    UChar32 ch = text.char32At(s);
    231                    int32_t digit = u_digit(ch, radix);
    232                    if (digit < 0) {
    233                        break;
    234                    }
    235                    s += U16_LENGTH(ch);
    236                    u = (u * radix) + digit;
    237                    if (++digitCount == maxDigits) {
    238                        break;
    239                    }
    240                }
    241 
    242                match = (digitCount >= minDigits);
    243 
    244                if (match) {
    245                    for (i=0; i<suffixLen; ++i) {
    246                        if (s >= limit) {
    247                            // Check for partial match in incremental mode.
    248                            if (s > start && isIncremental) {
    249                                goto exit;
    250                            }
    251                            match = false;
    252                            break;
    253                        }
    254                        char16_t c = text.charAt(s++);
    255                        if (c != spec[ipat + prefixLen + i]) {
    256                            match = false;
    257                            break;
    258                        }
    259                    }
    260 
    261                    if (match) {
    262                        // At this point, we have a match
    263                        UnicodeString str(u);
    264                        text.handleReplaceBetween(start, s, str);
    265                        limit -= s - start - str.length();
    266                        // The following break statement leaves the
    267                        // loop that is traversing the forms in
    268                        // spec[].  We then parse the next input
    269                        // character.
    270                        break;
    271                    }
    272                }
    273            }
    274 
    275            ipat += prefixLen + suffixLen;
    276        }
    277 
    278        if (start < limit) {
    279            start += U16_LENGTH(text.char32At(start));
    280        }
    281    }
    282 
    283  exit:
    284    pos.contextLimit += limit - pos.limit;
    285    pos.limit = limit;
    286    pos.start = start;
    287 }
    288 
    289 U_NAMESPACE_END
    290 
    291 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
    292 
    293 //eof