tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

csdetect.cpp (14119B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (C) 2005-2016, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 */
      9 
     10 #include "unicode/utypes.h"
     11 
     12 #if !UCONFIG_NO_CONVERSION
     13 
     14 #include "unicode/ucsdet.h"
     15 
     16 #include "csdetect.h"
     17 #include "csmatch.h"
     18 #include "uenumimp.h"
     19 
     20 #include "cmemory.h"
     21 #include "cstring.h"
     22 #include "umutex.h"
     23 #include "ucln_in.h"
     24 #include "uarrsort.h"
     25 #include "inputext.h"
     26 #include "csrsbcs.h"
     27 #include "csrmbcs.h"
     28 #include "csrutf8.h"
     29 #include "csrucode.h"
     30 #include "csr2022.h"
     31 
     32 #define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
     33 #define DELETE_ARRAY(array) uprv_free((void *) (array))
     34 
     35 U_NAMESPACE_BEGIN
     36 
     37 struct CSRecognizerInfo : public UMemory {
     38    CSRecognizerInfo(CharsetRecognizer *recognizer, UBool isDefaultEnabled)
     39        : recognizer(recognizer), isDefaultEnabled(isDefaultEnabled) {}
     40 
     41    ~CSRecognizerInfo() {delete recognizer;}
     42 
     43    CharsetRecognizer *recognizer;
     44    UBool isDefaultEnabled;
     45 };
     46 
     47 U_NAMESPACE_END
     48 
     49 static icu::CSRecognizerInfo **fCSRecognizers = nullptr;
     50 static icu::UInitOnce gCSRecognizersInitOnce {};
     51 static int32_t fCSRecognizers_size = 0;
     52 
     53 U_CDECL_BEGIN
     54 static UBool U_CALLCONV csdet_cleanup()
     55 {
     56    U_NAMESPACE_USE
     57    if (fCSRecognizers != nullptr) {
     58        for(int32_t r = 0; r < fCSRecognizers_size; r += 1) {
     59            delete fCSRecognizers[r];
     60            fCSRecognizers[r] = nullptr;
     61        }
     62 
     63        DELETE_ARRAY(fCSRecognizers);
     64        fCSRecognizers = nullptr;
     65        fCSRecognizers_size = 0;
     66    }
     67    gCSRecognizersInitOnce.reset();
     68 
     69    return true;
     70 }
     71 
     72 static int32_t U_CALLCONV
     73 charsetMatchComparator(const void * /*context*/, const void *left, const void *right)
     74 {
     75    U_NAMESPACE_USE
     76 
     77    const CharsetMatch **csm_l = (const CharsetMatch **) left;
     78    const CharsetMatch **csm_r = (const CharsetMatch **) right;
     79 
     80    // NOTE: compare is backwards to sort from highest to lowest.
     81    return (*csm_r)->getConfidence() - (*csm_l)->getConfidence();
     82 }
     83 
     84 static void U_CALLCONV initRecognizers(UErrorCode &status) {
     85    U_NAMESPACE_USE
     86    ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
     87    CSRecognizerInfo *tempArray[] = {
     88        new CSRecognizerInfo(new CharsetRecog_UTF8(), true),
     89 
     90        new CSRecognizerInfo(new CharsetRecog_UTF_16_BE(), true),
     91        new CSRecognizerInfo(new CharsetRecog_UTF_16_LE(), true),
     92        new CSRecognizerInfo(new CharsetRecog_UTF_32_BE(), true),
     93        new CSRecognizerInfo(new CharsetRecog_UTF_32_LE(), true),
     94 
     95        new CSRecognizerInfo(new CharsetRecog_8859_1(), true),
     96        new CSRecognizerInfo(new CharsetRecog_8859_2(), true),
     97        new CSRecognizerInfo(new CharsetRecog_8859_5_ru(), true),
     98        new CSRecognizerInfo(new CharsetRecog_8859_6_ar(), true),
     99        new CSRecognizerInfo(new CharsetRecog_8859_7_el(), true),
    100        new CSRecognizerInfo(new CharsetRecog_8859_8_I_he(), true),
    101        new CSRecognizerInfo(new CharsetRecog_8859_8_he(), true),
    102        new CSRecognizerInfo(new CharsetRecog_windows_1251(), true),
    103        new CSRecognizerInfo(new CharsetRecog_windows_1256(), true),
    104        new CSRecognizerInfo(new CharsetRecog_KOI8_R(), true),
    105        new CSRecognizerInfo(new CharsetRecog_8859_9_tr(), true),
    106        new CSRecognizerInfo(new CharsetRecog_sjis(), true),
    107        new CSRecognizerInfo(new CharsetRecog_gb_18030(), true),
    108        new CSRecognizerInfo(new CharsetRecog_euc_jp(), true),
    109        new CSRecognizerInfo(new CharsetRecog_euc_kr(), true),
    110        new CSRecognizerInfo(new CharsetRecog_big5(), true),
    111 
    112        new CSRecognizerInfo(new CharsetRecog_2022JP(), true),
    113 #if !UCONFIG_ONLY_HTML_CONVERSION
    114        new CSRecognizerInfo(new CharsetRecog_2022KR(), true),
    115        new CSRecognizerInfo(new CharsetRecog_2022CN(), true),
    116 
    117        new CSRecognizerInfo(new CharsetRecog_IBM424_he_rtl(), false),
    118        new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), false),
    119        new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), false),
    120        new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), false)
    121 #endif
    122    };
    123    int32_t rCount = UPRV_LENGTHOF(tempArray);
    124 
    125    fCSRecognizers = NEW_ARRAY(CSRecognizerInfo *, rCount);
    126 
    127    if (fCSRecognizers == nullptr) {
    128        status = U_MEMORY_ALLOCATION_ERROR;
    129    } 
    130    else {
    131        fCSRecognizers_size = rCount;
    132        for (int32_t r = 0; r < rCount; r += 1) {
    133            fCSRecognizers[r] = tempArray[r];
    134            if (fCSRecognizers[r] == nullptr) {
    135                status = U_MEMORY_ALLOCATION_ERROR;
    136            }
    137        }
    138    }
    139 }
    140 
    141 U_CDECL_END
    142 
    143 U_NAMESPACE_BEGIN
    144 
    145 void CharsetDetector::setRecognizers(UErrorCode &status)
    146 {
    147    umtx_initOnce(gCSRecognizersInitOnce, &initRecognizers, status);
    148 }
    149 
    150 CharsetDetector::CharsetDetector(UErrorCode &status)
    151  : textIn(new InputText(status)), resultArray(nullptr),
    152    resultCount(0), fStripTags(false), fFreshTextSet(false),
    153    fEnabledRecognizers(nullptr)
    154 {
    155    if (U_FAILURE(status)) {
    156        return;
    157    }
    158 
    159    setRecognizers(status);
    160 
    161    if (U_FAILURE(status)) {
    162        return;
    163    }
    164 
    165    resultArray = static_cast<CharsetMatch**>(uprv_malloc(sizeof(CharsetMatch*) * fCSRecognizers_size));
    166 
    167    if (resultArray == nullptr) {
    168        status = U_MEMORY_ALLOCATION_ERROR;
    169        return;
    170    }
    171 
    172    for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
    173        resultArray[i] = new CharsetMatch();
    174 
    175        if (resultArray[i] == nullptr) {
    176            status = U_MEMORY_ALLOCATION_ERROR;
    177            break;
    178        }
    179    }
    180 }
    181 
    182 CharsetDetector::~CharsetDetector()
    183 {
    184    delete textIn;
    185 
    186    for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
    187        delete resultArray[i];
    188    }
    189 
    190    uprv_free(resultArray);
    191 
    192    if (fEnabledRecognizers) {
    193        uprv_free(fEnabledRecognizers);
    194    }
    195 }
    196 
    197 void CharsetDetector::setText(const char *in, int32_t len)
    198 {
    199    textIn->setText(in, len);
    200    fFreshTextSet = true;
    201 }
    202 
    203 UBool CharsetDetector::setStripTagsFlag(UBool flag)
    204 {
    205    UBool temp = fStripTags;
    206    fStripTags = flag;
    207    fFreshTextSet = true;
    208    return temp;
    209 }
    210 
    211 UBool CharsetDetector::getStripTagsFlag() const
    212 {
    213    return fStripTags;
    214 }
    215 
    216 void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
    217 {
    218    textIn->setDeclaredEncoding(encoding,len);
    219 }
    220 
    221 int32_t CharsetDetector::getDetectableCount()
    222 {
    223    UErrorCode status = U_ZERO_ERROR;
    224 
    225    setRecognizers(status);
    226 
    227    return fCSRecognizers_size; 
    228 }
    229 
    230 const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
    231 {
    232    int32_t maxMatchesFound = 0;
    233 
    234    detectAll(maxMatchesFound, status);
    235 
    236    if(maxMatchesFound > 0) {
    237        return resultArray[0];
    238    } else {
    239        return nullptr;
    240    }
    241 }
    242 
    243 const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
    244 {
    245    if(!textIn->isSet()) {
    246        status = U_MISSING_RESOURCE_ERROR;// TODO:  Need to set proper status code for input text not set
    247 
    248        return nullptr;
    249    } else if (fFreshTextSet) {
    250        CharsetRecognizer *csr;
    251        int32_t            i;
    252 
    253        textIn->MungeInput(fStripTags);
    254 
    255        // Iterate over all possible charsets, remember all that
    256        // give a match quality > 0.
    257        resultCount = 0;
    258        for (i = 0; i < fCSRecognizers_size; i += 1) {
    259            csr = fCSRecognizers[i]->recognizer;
    260            if (csr->match(textIn, resultArray[resultCount])) {
    261                resultCount++;
    262            }
    263        }
    264 
    265        if (resultCount > 1) {
    266            uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, nullptr, true, &status);
    267        }
    268        fFreshTextSet = false;
    269    }
    270 
    271    maxMatchesFound = resultCount;
    272 
    273    if (maxMatchesFound == 0) {
    274        status = U_INVALID_CHAR_FOUND;
    275        return nullptr;
    276    }
    277 
    278    return resultArray;
    279 }
    280 
    281 void CharsetDetector::setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status)
    282 {
    283    if (U_FAILURE(status)) {
    284        return;
    285    }
    286 
    287    int32_t modIdx = -1;
    288    UBool isDefaultVal = false;
    289    for (int32_t i = 0; i < fCSRecognizers_size; i++) {
    290        CSRecognizerInfo *csrinfo = fCSRecognizers[i];
    291        if (uprv_strcmp(csrinfo->recognizer->getName(), encoding) == 0) {
    292            modIdx = i;
    293            isDefaultVal = (csrinfo->isDefaultEnabled == enabled);
    294            break;
    295        }
    296    }
    297    if (modIdx < 0) {
    298        // No matching encoding found
    299        status = U_ILLEGAL_ARGUMENT_ERROR;
    300        return;
    301    }
    302 
    303    if (fEnabledRecognizers == nullptr && !isDefaultVal) {
    304        // Create an array storing the non default setting
    305        fEnabledRecognizers = NEW_ARRAY(UBool, fCSRecognizers_size);
    306        if (fEnabledRecognizers == nullptr) {
    307            status = U_MEMORY_ALLOCATION_ERROR;
    308            return;
    309        }
    310        // Initialize the array with default info
    311        for (int32_t i = 0; i < fCSRecognizers_size; i++) {
    312            fEnabledRecognizers[i] = fCSRecognizers[i]->isDefaultEnabled;
    313        }
    314    }
    315 
    316    if (fEnabledRecognizers != nullptr) {
    317        fEnabledRecognizers[modIdx] = enabled;
    318    }
    319 }
    320 
    321 /*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
    322 {
    323    if( index > fCSRecognizers_size-1 || index < 0) {
    324        status = U_INDEX_OUTOFBOUNDS_ERROR;
    325 
    326        return 0;
    327    } else {
    328        return fCSRecognizers[index]->getName();
    329    }
    330 }*/
    331 
    332 U_NAMESPACE_END
    333 
    334 U_CDECL_BEGIN
    335 typedef struct {
    336    int32_t currIndex;
    337    UBool all;
    338    UBool *enabledRecognizers;
    339 } Context;
    340 
    341 
    342 
    343 static void U_CALLCONV
    344 enumClose(UEnumeration *en) {
    345    if(en->context != nullptr) {
    346        DELETE_ARRAY(en->context);
    347    }
    348 
    349    DELETE_ARRAY(en);
    350 }
    351 
    352 static int32_t U_CALLCONV
    353 enumCount(UEnumeration *en, UErrorCode *) {
    354    if (((Context *)en->context)->all) {
    355        // ucsdet_getAllDetectableCharsets, all charset detector names
    356        return fCSRecognizers_size;
    357    }
    358 
    359    // Otherwise, ucsdet_getDetectableCharsets - only enabled ones
    360    int32_t count = 0;
    361    UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
    362    if (enabledArray != nullptr) {
    363        // custom set
    364        for (int32_t i = 0; i < fCSRecognizers_size; i++) {
    365            if (enabledArray[i]) {
    366                count++;
    367            }
    368        }
    369    } else {
    370        // default set
    371        for (int32_t i = 0; i < fCSRecognizers_size; i++) {
    372            if (fCSRecognizers[i]->isDefaultEnabled) {
    373                count++;
    374            }
    375        }
    376    }
    377    return count;
    378 }
    379 
    380 static const char* U_CALLCONV
    381 enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
    382    const char *currName = nullptr;
    383 
    384    if (((Context *)en->context)->currIndex < fCSRecognizers_size) {
    385        if (((Context *)en->context)->all) {
    386            // ucsdet_getAllDetectableCharsets, all charset detector names
    387            currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
    388            ((Context *)en->context)->currIndex++;
    389        } else {
    390            // ucsdet_getDetectableCharsets
    391            UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
    392            if (enabledArray != nullptr) {
    393                // custom set
    394                while (currName == nullptr && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
    395                    if (enabledArray[((Context *)en->context)->currIndex]) {
    396                        currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
    397                    }
    398                    ((Context *)en->context)->currIndex++;
    399                }
    400            } else {
    401                // default set
    402                while (currName == nullptr && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
    403                    if (fCSRecognizers[((Context *)en->context)->currIndex]->isDefaultEnabled) {
    404                        currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
    405                    }
    406                    ((Context *)en->context)->currIndex++;
    407                }
    408            }
    409        }
    410    }
    411 
    412    if(resultLength != nullptr) {
    413        *resultLength = currName == nullptr ? 0 : (int32_t)uprv_strlen(currName);
    414    }
    415 
    416    return currName;
    417 }
    418 
    419 
    420 static void U_CALLCONV
    421 enumReset(UEnumeration *en, UErrorCode *) {
    422    ((Context *)en->context)->currIndex = 0;
    423 }
    424 
    425 static const UEnumeration gCSDetEnumeration = {
    426    nullptr,
    427    nullptr,
    428    enumClose,
    429    enumCount,
    430    uenum_unextDefault,
    431    enumNext,
    432    enumReset
    433 };
    434 
    435 U_CDECL_END
    436 
    437 U_NAMESPACE_BEGIN
    438 
    439 UEnumeration * CharsetDetector::getAllDetectableCharsets(UErrorCode &status)
    440 {
    441 
    442    /* Initialize recognized charsets. */
    443    setRecognizers(status);
    444 
    445    if(U_FAILURE(status)) {
    446        return nullptr;
    447    }
    448 
    449    UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
    450    if (en == nullptr) {
    451        status = U_MEMORY_ALLOCATION_ERROR;
    452        return nullptr;
    453    }
    454    memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
    455    en->context = (void*)NEW_ARRAY(Context, 1);
    456    if (en->context == nullptr) {
    457        status = U_MEMORY_ALLOCATION_ERROR;
    458        DELETE_ARRAY(en);
    459        return nullptr;
    460    }
    461    uprv_memset(en->context, 0, sizeof(Context));
    462    static_cast<Context*>(en->context)->all = true;
    463    return en;
    464 }
    465 
    466 UEnumeration * CharsetDetector::getDetectableCharsets(UErrorCode &status) const
    467 {
    468    if(U_FAILURE(status)) {
    469        return nullptr;
    470    }
    471 
    472    UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
    473    if (en == nullptr) {
    474        status = U_MEMORY_ALLOCATION_ERROR;
    475        return nullptr;
    476    }
    477    memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
    478    en->context = (void*)NEW_ARRAY(Context, 1);
    479    if (en->context == nullptr) {
    480        status = U_MEMORY_ALLOCATION_ERROR;
    481        DELETE_ARRAY(en);
    482        return nullptr;
    483    }
    484    uprv_memset(en->context, 0, sizeof(Context));
    485    static_cast<Context*>(en->context)->all = false;
    486    static_cast<Context*>(en->context)->enabledRecognizers = fEnabledRecognizers;
    487    return en;
    488 }
    489 
    490 U_NAMESPACE_END
    491 
    492 #endif