csrmbcs.cpp (16125B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (C) 2005-2016, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 */ 9 10 #include "unicode/utypes.h" 11 12 #if !UCONFIG_NO_CONVERSION 13 14 #include "cmemory.h" 15 #include "csmatch.h" 16 #include "csrmbcs.h" 17 18 #include <math.h> 19 20 U_NAMESPACE_BEGIN 21 22 #define min(x,y) (((x)<(y))?(x):(y)) 23 24 static const uint16_t commonChars_sjis [] = { 25 // TODO: This set of data comes from the character frequency- 26 // of-occurrence analysis tool. The data needs to be moved 27 // into a resource and loaded from there. 28 0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0, 29 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5, 30 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc, 31 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341, 32 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389, 33 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa}; 34 35 static const uint16_t commonChars_euc_jp[] = { 36 // TODO: This set of data comes from the character frequency- 37 // of-occurrence analysis tool. The data needs to be moved 38 // into a resource and loaded from there. 39 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2, 40 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3, 41 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4, 42 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de, 43 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef, 44 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af, 45 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7, 46 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1, 47 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee, 48 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1}; 49 50 static const uint16_t commonChars_euc_kr[] = { 51 // TODO: This set of data comes from the character frequency- 52 // of-occurrence analysis tool. The data needs to be moved 53 // into a resource and loaded from there. 54 0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc, 55 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9, 56 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce, 57 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce, 58 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba, 59 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee, 60 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7, 61 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6, 62 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6, 63 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad}; 64 65 static const uint16_t commonChars_big5[] = { 66 // TODO: This set of data comes from the character frequency- 67 // of-occurrence analysis tool. The data needs to be moved 68 // into a resource and loaded from there. 69 0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446, 70 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3, 71 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548, 72 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8, 73 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da, 74 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3, 75 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59, 76 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c, 77 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44, 78 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f}; 79 80 static const uint16_t commonChars_gb_18030[] = { 81 // TODO: This set of data comes from the character frequency- 82 // of-occurrence analysis tool. The data needs to be moved 83 // into a resource and loaded from there. 84 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac, 85 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4, 86 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4, 87 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6, 88 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6, 89 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7, 90 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7, 91 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5, 92 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2, 93 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0}; 94 95 static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value) 96 { 97 int32_t start = 0, end = len-1; 98 int32_t mid = (start+end)/2; 99 100 while(start <= end) { 101 if(array[mid] == value) { 102 return mid; 103 } 104 105 if(array[mid] < value){ 106 start = mid+1; 107 } else { 108 end = mid-1; 109 } 110 111 mid = (start+end)/2; 112 } 113 114 return -1; 115 } 116 117 IteratedChar::IteratedChar() : 118 charValue(0), index(-1), nextIndex(0), error(false), done(false) 119 { 120 // nothing else to do. 121 } 122 123 /*void IteratedChar::reset() 124 { 125 charValue = 0; 126 index = -1; 127 nextIndex = 0; 128 error = false; 129 done = false; 130 }*/ 131 132 int32_t IteratedChar::nextByte(InputText *det) 133 { 134 if (nextIndex >= det->fRawLength) { 135 done = true; 136 137 return -1; 138 } 139 140 return det->fRawInput[nextIndex++]; 141 } 142 143 CharsetRecog_mbcs::~CharsetRecog_mbcs() 144 { 145 // nothing to do. 146 } 147 148 int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const { 149 int32_t doubleByteCharCount = 0; 150 int32_t commonCharCount = 0; 151 int32_t badCharCount = 0; 152 int32_t totalCharCount = 0; 153 int32_t confidence = 0; 154 IteratedChar iter; 155 156 while (nextChar(&iter, det)) { 157 totalCharCount++; 158 159 if (iter.error) { 160 badCharCount++; 161 } else { 162 if (iter.charValue > 0xFF) { 163 doubleByteCharCount++; 164 165 if (commonChars != nullptr) { 166 if (binarySearch(commonChars, commonCharsLen, static_cast<uint16_t>(iter.charValue)) >= 0){ 167 commonCharCount += 1; 168 } 169 } 170 } 171 } 172 173 174 if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) { 175 // Bail out early if the byte data is not matching the encoding scheme. 176 // break detectBlock; 177 return confidence; 178 } 179 } 180 181 if (doubleByteCharCount <= 10 && badCharCount == 0) { 182 // Not many multi-byte chars. 183 if (doubleByteCharCount == 0 && totalCharCount < 10) { 184 // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes. 185 // We don't have enough data to have any confidence. 186 // Statistical analysis of single byte non-ASCII characters would probably help here. 187 confidence = 0; 188 } 189 else { 190 // ASCII or ISO file? It's probably not our encoding, 191 // but is not incompatible with our encoding, so don't give it a zero. 192 confidence = 10; 193 } 194 195 return confidence; 196 } 197 198 // 199 // No match if there are too many characters that don't fit the encoding scheme. 200 // (should we have zero tolerance for these?) 201 // 202 if (doubleByteCharCount < 20*badCharCount) { 203 confidence = 0; 204 205 return confidence; 206 } 207 208 if (commonChars == nullptr) { 209 // We have no statistics on frequently occurring characters. 210 // Assess confidence purely on having a reasonable number of 211 // multi-byte characters (the more the better) 212 confidence = 30 + doubleByteCharCount - 20*badCharCount; 213 214 if (confidence > 100) { 215 confidence = 100; 216 } 217 } else { 218 // 219 // Frequency of occurrence statistics exist. 220 // 221 222 double maxVal = log(static_cast<double>(doubleByteCharCount) / 4); /*(float)?*/ 223 double scaleFactor = 90.0 / maxVal; 224 confidence = static_cast<int32_t>(log(static_cast<double>(commonCharCount) + 1) * scaleFactor + 10.0); 225 226 confidence = min(confidence, 100); 227 } 228 229 if (confidence < 0) { 230 confidence = 0; 231 } 232 233 return confidence; 234 } 235 236 CharsetRecog_sjis::~CharsetRecog_sjis() 237 { 238 // nothing to do 239 } 240 241 UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const { 242 it->index = it->nextIndex; 243 it->error = false; 244 245 int32_t firstByte = it->charValue = it->nextByte(det); 246 247 if (firstByte < 0) { 248 return false; 249 } 250 251 if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) { 252 return true; 253 } 254 255 int32_t secondByte = it->nextByte(det); 256 if (secondByte >= 0) { 257 it->charValue = (firstByte << 8) | secondByte; 258 } 259 // else we'll handle the error later. 260 261 if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) { 262 // Illegal second byte value. 263 it->error = true; 264 } 265 266 return true; 267 } 268 269 UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const { 270 int32_t confidence = match_mbcs(det, commonChars_sjis, UPRV_LENGTHOF(commonChars_sjis)); 271 results->set(det, this, confidence); 272 return (confidence > 0); 273 } 274 275 const char *CharsetRecog_sjis::getName() const 276 { 277 return "Shift_JIS"; 278 } 279 280 const char *CharsetRecog_sjis::getLanguage() const 281 { 282 return "ja"; 283 } 284 285 CharsetRecog_euc::~CharsetRecog_euc() 286 { 287 // nothing to do 288 } 289 290 UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const { 291 int32_t firstByte = 0; 292 int32_t secondByte = 0; 293 int32_t thirdByte = 0; 294 295 it->index = it->nextIndex; 296 it->error = false; 297 firstByte = it->charValue = it->nextByte(det); 298 299 if (firstByte < 0) { 300 // Ran off the end of the input data 301 return false; 302 } 303 304 if (firstByte <= 0x8D) { 305 // single byte char 306 return true; 307 } 308 309 secondByte = it->nextByte(det); 310 if (secondByte >= 0) { 311 it->charValue = (it->charValue << 8) | secondByte; 312 } 313 // else we'll handle the error later. 314 315 if (firstByte >= 0xA1 && firstByte <= 0xFE) { 316 // Two byte Char 317 if (secondByte < 0xA1) { 318 it->error = true; 319 } 320 321 return true; 322 } 323 324 if (firstByte == 0x8E) { 325 // Code Set 2. 326 // In EUC-JP, total char size is 2 bytes, only one byte of actual char value. 327 // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value. 328 // We don't know which we've got. 329 // Treat it like EUC-JP. If the data really was EUC-TW, the following two 330 // bytes will look like a well formed 2 byte char. 331 if (secondByte < 0xA1) { 332 it->error = true; 333 } 334 335 return true; 336 } 337 338 if (firstByte == 0x8F) { 339 // Code set 3. 340 // Three byte total char size, two bytes of actual char value. 341 thirdByte = it->nextByte(det); 342 it->charValue = (it->charValue << 8) | thirdByte; 343 344 if (thirdByte < 0xa1) { 345 // Bad second byte or ran off the end of the input data with a non-ASCII first byte. 346 it->error = true; 347 } 348 } 349 350 return true; 351 352 } 353 354 CharsetRecog_euc_jp::~CharsetRecog_euc_jp() 355 { 356 // nothing to do 357 } 358 359 const char *CharsetRecog_euc_jp::getName() const 360 { 361 return "EUC-JP"; 362 } 363 364 const char *CharsetRecog_euc_jp::getLanguage() const 365 { 366 return "ja"; 367 } 368 369 UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const 370 { 371 int32_t confidence = match_mbcs(det, commonChars_euc_jp, UPRV_LENGTHOF(commonChars_euc_jp)); 372 results->set(det, this, confidence); 373 return (confidence > 0); 374 } 375 376 CharsetRecog_euc_kr::~CharsetRecog_euc_kr() 377 { 378 // nothing to do 379 } 380 381 const char *CharsetRecog_euc_kr::getName() const 382 { 383 return "EUC-KR"; 384 } 385 386 const char *CharsetRecog_euc_kr::getLanguage() const 387 { 388 return "ko"; 389 } 390 391 UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const 392 { 393 int32_t confidence = match_mbcs(det, commonChars_euc_kr, UPRV_LENGTHOF(commonChars_euc_kr)); 394 results->set(det, this, confidence); 395 return (confidence > 0); 396 } 397 398 CharsetRecog_big5::~CharsetRecog_big5() 399 { 400 // nothing to do 401 } 402 403 UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const 404 { 405 int32_t firstByte; 406 407 it->index = it->nextIndex; 408 it->error = false; 409 firstByte = it->charValue = it->nextByte(det); 410 411 if (firstByte < 0) { 412 return false; 413 } 414 415 if (firstByte <= 0x7F || firstByte == 0xFF) { 416 // single byte character. 417 return true; 418 } 419 420 int32_t secondByte = it->nextByte(det); 421 if (secondByte >= 0) { 422 it->charValue = (it->charValue << 8) | secondByte; 423 } 424 // else we'll handle the error later. 425 426 if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) { 427 it->error = true; 428 } 429 430 return true; 431 } 432 433 const char *CharsetRecog_big5::getName() const 434 { 435 return "Big5"; 436 } 437 438 const char *CharsetRecog_big5::getLanguage() const 439 { 440 return "zh"; 441 } 442 443 UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const 444 { 445 int32_t confidence = match_mbcs(det, commonChars_big5, UPRV_LENGTHOF(commonChars_big5)); 446 results->set(det, this, confidence); 447 return (confidence > 0); 448 } 449 450 CharsetRecog_gb_18030::~CharsetRecog_gb_18030() 451 { 452 // nothing to do 453 } 454 455 UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const { 456 int32_t firstByte = 0; 457 int32_t secondByte = 0; 458 int32_t thirdByte = 0; 459 int32_t fourthByte = 0; 460 461 it->index = it->nextIndex; 462 it->error = false; 463 firstByte = it->charValue = it->nextByte(det); 464 465 if (firstByte < 0) { 466 // Ran off the end of the input data 467 return false; 468 } 469 470 if (firstByte <= 0x80) { 471 // single byte char 472 return true; 473 } 474 475 secondByte = it->nextByte(det); 476 if (secondByte >= 0) { 477 it->charValue = (it->charValue << 8) | secondByte; 478 } 479 // else we'll handle the error later. 480 481 if (firstByte >= 0x81 && firstByte <= 0xFE) { 482 // Two byte Char 483 if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) { 484 return true; 485 } 486 487 // Four byte char 488 if (secondByte >= 0x30 && secondByte <= 0x39) { 489 thirdByte = it->nextByte(det); 490 491 if (thirdByte >= 0x81 && thirdByte <= 0xFE) { 492 fourthByte = it->nextByte(det); 493 494 if (fourthByte >= 0x30 && fourthByte <= 0x39) { 495 it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte; 496 497 return true; 498 } 499 } 500 } 501 502 // Something wasn't valid, or we ran out of data (-1). 503 it->error = true; 504 } 505 506 return true; 507 } 508 509 const char *CharsetRecog_gb_18030::getName() const 510 { 511 return "GB18030"; 512 } 513 514 const char *CharsetRecog_gb_18030::getLanguage() const 515 { 516 return "zh"; 517 } 518 519 UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const 520 { 521 int32_t confidence = match_mbcs(det, commonChars_gb_18030, UPRV_LENGTHOF(commonChars_gb_18030)); 522 results->set(det, this, confidence); 523 return (confidence > 0); 524 } 525 526 U_NAMESPACE_END 527 #endif