[ tor-browser ].git.dasho

csrmbcs.cpp (16125B)
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (C) 2005-2016, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 */
      9 
     10 #include "unicode/utypes.h"
     11 
     12 #if !UCONFIG_NO_CONVERSION
     13 
     14 #include "cmemory.h"
     15 #include "csmatch.h"
     16 #include "csrmbcs.h"
     17 
     18 #include <math.h>
     19 
     20 U_NAMESPACE_BEGIN
     21 
     22 #define min(x,y) (((x)<(y))?(x):(y))
     23 
     24 static const uint16_t commonChars_sjis [] = {
     25 // TODO:  This set of data comes from the character frequency-
     26 //        of-occurrence analysis tool.  The data needs to be moved
     27 //        into a resource and loaded from there.
     28 0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
     29 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
     30 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
     31 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
     32 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
     33 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
     34 
     35 static const uint16_t commonChars_euc_jp[] = {
     36 // TODO:  This set of data comes from the character frequency-
     37 //        of-occurrence analysis tool.  The data needs to be moved
     38 //        into a resource and loaded from there.
     39 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
     40 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
     41 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
     42 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
     43 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
     44 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
     45 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
     46 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
     47 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
     48 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
     49 
     50 static const uint16_t commonChars_euc_kr[] = {
     51 // TODO:  This set of data comes from the character frequency-
     52 //        of-occurrence analysis tool.  The data needs to be moved
     53 //        into a resource and loaded from there.
     54 0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
     55 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
     56 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
     57 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
     58 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
     59 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
     60 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
     61 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
     62 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
     63 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
     64 
     65 static const uint16_t commonChars_big5[] = {
     66 // TODO:  This set of data comes from the character frequency-
     67 //        of-occurrence analysis tool.  The data needs to be moved
     68 //        into a resource and loaded from there.
     69 0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
     70 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
     71 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
     72 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
     73 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
     74 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
     75 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
     76 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
     77 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
     78 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
     79 
     80 static const uint16_t commonChars_gb_18030[] = {
     81 // TODO:  This set of data comes from the character frequency-
     82 //        of-occurrence analysis tool.  The data needs to be moved
     83 //        into a resource and loaded from there.
     84 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
     85 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
     86 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
     87 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
     88 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
     89 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
     90 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
     91 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
     92 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
     93 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
     94 
     95 static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
     96 {
     97    int32_t start = 0, end = len-1;
     98    int32_t mid = (start+end)/2;
     99 
    100    while(start <= end) {
    101        if(array[mid] == value) {
    102            return mid;
    103        }
    104 
    105        if(array[mid] < value){
    106            start = mid+1;
    107        } else {
    108            end = mid-1;
    109        }
    110 
    111        mid = (start+end)/2;
    112    }
    113 
    114    return -1;
    115 }
    116 
    117 IteratedChar::IteratedChar() : 
    118 charValue(0), index(-1), nextIndex(0), error(false), done(false)
    119 {
    120    // nothing else to do.
    121 }
    122 
    123 /*void IteratedChar::reset()
    124 {
    125    charValue = 0;
    126    index     = -1;
    127    nextIndex = 0;
    128    error     = false;
    129    done      = false;
    130 }*/
    131 
    132 int32_t IteratedChar::nextByte(InputText *det)
    133 {
    134    if (nextIndex >= det->fRawLength) {
    135        done = true;
    136 
    137        return -1;
    138    }
    139 
    140    return det->fRawInput[nextIndex++];
    141 }
    142 
    143 CharsetRecog_mbcs::~CharsetRecog_mbcs()
    144 {
    145    // nothing to do.
    146 }
    147 
    148 int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const {
    149    int32_t doubleByteCharCount = 0;
    150    int32_t commonCharCount     = 0;
    151    int32_t badCharCount        = 0;
    152    int32_t totalCharCount      = 0;
    153    int32_t confidence          = 0;
    154    IteratedChar iter;
    155 
    156    while (nextChar(&iter, det)) {
    157        totalCharCount++;
    158 
    159        if (iter.error) {
    160            badCharCount++;
    161        } else {
    162            if (iter.charValue > 0xFF) {
    163                doubleByteCharCount++;
    164 
    165                if (commonChars != nullptr) {
    166                    if (binarySearch(commonChars, commonCharsLen, static_cast<uint16_t>(iter.charValue)) >= 0){
    167                        commonCharCount += 1;
    168                    }
    169                }
    170            }
    171        }
    172 
    173 
    174        if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
    175            // Bail out early if the byte data is not matching the encoding scheme.
    176            // break detectBlock;
    177            return confidence;
    178        }
    179    }
    180 
    181    if (doubleByteCharCount <= 10 && badCharCount == 0) {
    182        // Not many multi-byte chars.
    183        if (doubleByteCharCount == 0 && totalCharCount < 10) {
    184            // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
    185            // We don't have enough data to have any confidence.
    186            // Statistical analysis of single byte non-ASCII characters would probably help here.
    187            confidence = 0;
    188        }
    189        else {
    190            //   ASCII or ISO file?  It's probably not our encoding,
    191            //   but is not incompatible with our encoding, so don't give it a zero.
    192            confidence = 10;
    193        }
    194 
    195        return confidence;
    196    }
    197 
    198    //
    199    //  No match if there are too many characters that don't fit the encoding scheme.
    200    //    (should we have zero tolerance for these?)
    201    //
    202    if (doubleByteCharCount < 20*badCharCount) {
    203        confidence = 0;
    204 
    205        return confidence;
    206    }
    207 
    208    if (commonChars == nullptr) {
    209        // We have no statistics on frequently occurring characters.
    210        //  Assess confidence purely on having a reasonable number of
    211        //  multi-byte characters (the more the better)
    212        confidence = 30 + doubleByteCharCount - 20*badCharCount;
    213 
    214        if (confidence > 100) {
    215            confidence = 100;
    216        }
    217    } else {
    218        //
    219        // Frequency of occurrence statistics exist.
    220        //
    221 
    222        double maxVal = log(static_cast<double>(doubleByteCharCount) / 4); /*(float)?*/
    223        double scaleFactor = 90.0 / maxVal;
    224        confidence = static_cast<int32_t>(log(static_cast<double>(commonCharCount) + 1) * scaleFactor + 10.0);
    225 
    226        confidence = min(confidence, 100);
    227    }
    228 
    229    if (confidence < 0) {
    230        confidence = 0;
    231    }
    232 
    233    return confidence;
    234 }
    235 
    236 CharsetRecog_sjis::~CharsetRecog_sjis()
    237 {
    238    // nothing to do
    239 }
    240 
    241 UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const {
    242    it->index = it->nextIndex;
    243    it->error = false;
    244 
    245    int32_t firstByte = it->charValue = it->nextByte(det);
    246 
    247    if (firstByte < 0) {
    248        return false;
    249    }
    250 
    251    if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) {
    252        return true;
    253    }
    254 
    255    int32_t secondByte = it->nextByte(det);
    256    if (secondByte >= 0) {
    257        it->charValue = (firstByte << 8) | secondByte;
    258    }
    259    // else we'll handle the error later.
    260 
    261    if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) {
    262        // Illegal second byte value.
    263        it->error = true;
    264    }
    265 
    266    return true;
    267 }
    268 
    269 UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const {
    270    int32_t confidence = match_mbcs(det, commonChars_sjis, UPRV_LENGTHOF(commonChars_sjis));
    271    results->set(det, this, confidence);
    272    return (confidence > 0);
    273 }
    274 
    275 const char *CharsetRecog_sjis::getName() const
    276 {
    277    return "Shift_JIS";
    278 }
    279 
    280 const char *CharsetRecog_sjis::getLanguage() const
    281 {
    282    return "ja";
    283 }
    284 
    285 CharsetRecog_euc::~CharsetRecog_euc()
    286 {
    287    // nothing to do
    288 }
    289 
    290 UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const {
    291    int32_t firstByte  = 0;
    292    int32_t secondByte = 0;
    293    int32_t thirdByte  = 0;
    294 
    295    it->index = it->nextIndex;
    296    it->error = false;
    297    firstByte = it->charValue = it->nextByte(det);
    298 
    299    if (firstByte < 0) {
    300        // Ran off the end of the input data
    301        return false;
    302    }
    303 
    304    if (firstByte <= 0x8D) {
    305        // single byte char
    306        return true;
    307    }
    308 
    309    secondByte = it->nextByte(det);
    310    if (secondByte >= 0) {
    311        it->charValue = (it->charValue << 8) | secondByte;
    312    }
    313    // else we'll handle the error later.
    314 
    315    if (firstByte >= 0xA1 && firstByte <= 0xFE) {
    316        // Two byte Char
    317        if (secondByte < 0xA1) {
    318            it->error = true;
    319        }
    320 
    321        return true;
    322    }
    323 
    324    if (firstByte == 0x8E) {
    325        // Code Set 2.
    326        //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
    327        //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
    328        // We don't know which we've got.
    329        // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
    330        //   bytes will look like a well formed 2 byte char.
    331        if (secondByte < 0xA1) {
    332            it->error = true;
    333        }
    334 
    335        return true;
    336    }
    337 
    338    if (firstByte == 0x8F) {
    339        // Code set 3.
    340        // Three byte total char size, two bytes of actual char value.
    341        thirdByte    = it->nextByte(det);
    342        it->charValue = (it->charValue << 8) | thirdByte;
    343 
    344        if (thirdByte < 0xa1) {
    345            // Bad second byte or ran off the end of the input data with a non-ASCII first byte.
    346            it->error = true;
    347        }
    348    }
    349 
    350    return true;
    351 
    352 }
    353 
    354 CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
    355 {
    356    // nothing to do
    357 }
    358 
    359 const char *CharsetRecog_euc_jp::getName() const
    360 {
    361    return "EUC-JP";
    362 }
    363 
    364 const char *CharsetRecog_euc_jp::getLanguage() const
    365 {
    366    return "ja";
    367 }
    368 
    369 UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const
    370 {
    371    int32_t confidence = match_mbcs(det, commonChars_euc_jp, UPRV_LENGTHOF(commonChars_euc_jp));
    372    results->set(det, this, confidence);
    373    return (confidence > 0);
    374 }
    375 
    376 CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
    377 {
    378    // nothing to do
    379 }
    380 
    381 const char *CharsetRecog_euc_kr::getName() const
    382 {
    383    return "EUC-KR";
    384 }
    385 
    386 const char *CharsetRecog_euc_kr::getLanguage() const
    387 {
    388    return "ko";
    389 }
    390 
    391 UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const
    392 {
    393    int32_t confidence =  match_mbcs(det, commonChars_euc_kr, UPRV_LENGTHOF(commonChars_euc_kr));
    394    results->set(det, this, confidence);
    395    return (confidence > 0);
    396 }
    397 
    398 CharsetRecog_big5::~CharsetRecog_big5()
    399 {
    400    // nothing to do
    401 }
    402 
    403 UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const
    404 {
    405    int32_t firstByte;
    406 
    407    it->index = it->nextIndex;
    408    it->error = false;
    409    firstByte = it->charValue = it->nextByte(det);
    410 
    411    if (firstByte < 0) {
    412        return false;
    413    }
    414 
    415    if (firstByte <= 0x7F || firstByte == 0xFF) {
    416        // single byte character.
    417        return true;
    418    }
    419 
    420    int32_t secondByte = it->nextByte(det);
    421    if (secondByte >= 0)  {
    422        it->charValue = (it->charValue << 8) | secondByte;
    423    }
    424    // else we'll handle the error later.
    425 
    426    if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) {
    427        it->error = true;
    428    }
    429 
    430    return true;
    431 }
    432 
    433 const char *CharsetRecog_big5::getName() const
    434 {
    435    return "Big5";
    436 }
    437 
    438 const char *CharsetRecog_big5::getLanguage() const
    439 {
    440    return "zh";
    441 }
    442 
    443 UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const
    444 {
    445    int32_t confidence = match_mbcs(det, commonChars_big5, UPRV_LENGTHOF(commonChars_big5));
    446    results->set(det, this, confidence);
    447    return (confidence > 0);
    448 }
    449 
    450 CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
    451 {
    452    // nothing to do
    453 }
    454 
    455 UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const {
    456    int32_t firstByte  = 0;
    457    int32_t secondByte = 0;
    458    int32_t thirdByte  = 0;
    459    int32_t fourthByte = 0;
    460 
    461    it->index = it->nextIndex;
    462    it->error = false;
    463    firstByte = it->charValue = it->nextByte(det);
    464 
    465    if (firstByte < 0) {
    466        // Ran off the end of the input data
    467        return false;
    468    }
    469 
    470    if (firstByte <= 0x80) {
    471        // single byte char
    472        return true;
    473    }
    474 
    475    secondByte = it->nextByte(det);
    476    if (secondByte >= 0) {
    477        it->charValue = (it->charValue << 8) | secondByte;
    478    }
    479    // else we'll handle the error later.
    480 
    481    if (firstByte >= 0x81 && firstByte <= 0xFE) {
    482        // Two byte Char
    483        if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) {
    484            return true;
    485        }
    486 
    487        // Four byte char
    488        if (secondByte >= 0x30 && secondByte <= 0x39) {
    489            thirdByte = it->nextByte(det);
    490 
    491            if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
    492                fourthByte = it->nextByte(det);
    493 
    494                if (fourthByte >= 0x30 && fourthByte <= 0x39) {
    495                    it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;
    496 
    497                    return true;
    498                }
    499            }
    500        }
    501 
    502        // Something wasn't valid, or we ran out of data (-1).
    503        it->error = true;
    504    }
    505 
    506    return true;
    507 }
    508 
    509 const char *CharsetRecog_gb_18030::getName() const
    510 {
    511    return "GB18030";
    512 }
    513 
    514 const char *CharsetRecog_gb_18030::getLanguage() const
    515 {
    516    return "zh";
    517 }
    518 
    519 UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const
    520 {
    521    int32_t confidence = match_mbcs(det, commonChars_gb_18030, UPRV_LENGTHOF(commonChars_gb_18030));
    522    results->set(det, this, confidence);
    523    return (confidence > 0);
    524 }
    525 
    526 U_NAMESPACE_END
    527 #endif
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE