ucnvmbcs.cpp (221477B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ****************************************************************************** 5 * 6 * Copyright (C) 2000-2016, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ****************************************************************************** 10 * file name: ucnvmbcs.cpp 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2000jul03 16 * created by: Markus W. Scherer 17 * 18 * The current code in this file replaces the previous implementation 19 * of conversion code from multi-byte codepages to Unicode and back. 20 * This implementation supports the following: 21 * - legacy variable-length codepages with up to 4 bytes per character 22 * - all Unicode code points (up to 0x10ffff) 23 * - efficient distinction of unassigned vs. illegal byte sequences 24 * - it is possible in fromUnicode() to directly deal with simple 25 * stateful encodings (used for EBCDIC_STATEFUL) 26 * - it is possible to convert Unicode code points 27 * to a single zero byte (but not as a fallback except for SBCS) 28 * 29 * Remaining limitations in fromUnicode: 30 * - byte sequences must not have leading zero bytes 31 * - except for SBCS codepages: no fallback mapping from Unicode to a zero byte 32 * - limitation to up to 4 bytes per character 33 * 34 * ICU 2.8 (late 2003) adds a secondary data structure which lifts some of these 35 * limitations and adds m:n character mappings and other features. 36 * See ucnv_ext.h for details. 
37 * 38 * Change history: 39 * 40 * 5/6/2001 Ram Moved MBCS_SINGLE_RESULT_FROM_U,MBCS_STAGE_2_FROM_U, 41 * MBCS_VALUE_2_FROM_STAGE_2, MBCS_VALUE_4_FROM_STAGE_2 42 * macros to ucnvmbcs.h file 43 */ 44 45 #include "unicode/utypes.h" 46 47 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION 48 49 #include "unicode/ucnv.h" 50 #include "unicode/ucnv_cb.h" 51 #include "unicode/udata.h" 52 #include "unicode/uset.h" 53 #include "unicode/utf8.h" 54 #include "unicode/utf16.h" 55 #include "ucnv_bld.h" 56 #include "ucnvmbcs.h" 57 #include "ucnv_ext.h" 58 #include "ucnv_cnv.h" 59 #include "cmemory.h" 60 #include "cstring.h" 61 #include "umutex.h" 62 #include "ustr_imp.h" 63 64 /* control optimizations according to the platform */ 65 #define MBCS_UNROLL_SINGLE_TO_BMP 1 66 #define MBCS_UNROLL_SINGLE_FROM_BMP 0 67 68 /* 69 * _MBCSHeader versions 5.3 & 4.3 70 * (Note that the _MBCSHeader version is in addition to the converter formatVersion.) 71 * 72 * This version is optional. Version 5 is used for incompatible data format changes. 73 * makeconv will continue to generate version 4 files if possible. 74 * 75 * Changes from version 4: 76 * 77 * The main difference is an additional _MBCSHeader field with 78 * - the length (number of uint32_t) of the _MBCSHeader 79 * - flags for further incompatible data format changes 80 * - flags for further, backward compatible data format changes 81 * 82 * The MBCS_OPT_FROM_U flag indicates that most of the fromUnicode data is omitted from 83 * the file and needs to be reconstituted at load time. 84 * This requires a utf8Friendly format with an additional mbcsIndex table for fast 85 * (and UTF-8-friendly) fromUnicode conversion for Unicode code points up to maxFastUChar. 86 * (For details about these structures see below, and see ucnvmbcs.h.) 87 * 88 * utf8Friendly also implies that the fromUnicode mappings are stored in ascending order 89 * of the Unicode code points. (This requires that the .ucm file has the |0 etc. 
* precision markers for all mappings.)
 *
 * All fallbacks have been moved to the extension table, leaving only roundtrips in the
 * omitted data that can be reconstituted from the toUnicode data.
 *
 * Of the stage 2 table, the part corresponding to maxFastUChar and below is omitted.
 * With only roundtrip mappings in the base fromUnicode data, this part is fully
 * redundant with the mbcsIndex and will be reconstituted from that (also using the
 * stage 1 table which contains the information about how stage 2 was compacted).
 *
 * The rest of the stage 2 table, the part for code points above maxFastUChar,
 * is stored in the file and will be appended to the reconstituted part.
 *
 * The entire fromUBytes array is omitted from the file and will be reconstituted.
 * This is done by enumerating all toUnicode roundtrip mappings, performing
 * each mapping (using the stage 1 and reconstituted stage 2 tables) and
 * writing instead of reading the byte values.
 *
 * _MBCSHeader version 4.3
 *
 * Change from version 4.2:
 * - Optional utf8Friendly data structures, with 64-entry stage 3 block
 *   allocation for parts of the BMP, and an additional mbcsIndex in non-SBCS
 *   files which can be used instead of stages 1 & 2.
 *   Faster lookups for roundtrips from most commonly used characters,
 *   and lookups from UTF-8 byte sequences with a natural bit distribution.
 *   See ucnvmbcs.h for more details.
 *
 * Change from version 4.1:
 * - Added an optional extension table structure at the end of the .cnv file.
 *   It is present if the upper bits of the header flags field contain a non-zero
 *   byte offset to it.
 *   Files that contain only a conversion table and no base table
 *   use the special outputType MBCS_OUTPUT_EXT_ONLY.
 *   These contain the base table name between the MBCS header and the extension
 *   data.
126 * 127 * Change from version 4.0: 128 * - Replace header.reserved with header.fromUBytesLength so that all 129 * fields in the data have length. 130 * 131 * Changes from version 3 (for performance improvements): 132 * - new bit distribution for state table entries 133 * - reordered action codes 134 * - new data structure for single-byte fromUnicode 135 * + stage 2 only contains indexes 136 * + stage 3 stores 16 bits per character with classification bits 15..8 137 * - no multiplier for stage 1 entries 138 * - stage 2 for non-single-byte codepages contains the index and the flags in 139 * one 32-bit value 140 * - 2-byte and 4-byte fromUnicode results are stored directly as 16/32-bit integers 141 * 142 * For more details about old versions of the MBCS data structure, see 143 * the corresponding versions of this file. 144 * 145 * Converting stateless codepage data ---------------------------------------*** 146 * (or codepage data with simple states) to Unicode. 147 * 148 * Data structure and algorithm for converting from complex legacy codepages 149 * to Unicode. (Designed before 2000-may-22.) 150 * 151 * The basic idea is that the structure of legacy codepages can be described 152 * with state tables. 153 * When reading a byte stream, each input byte causes a state transition. 154 * Some transitions result in the output of a code point, some result in 155 * "unassigned" or "illegal" output. 156 * This is used here for character conversion. 157 * 158 * The data structure begins with a state table consisting of a row 159 * per state, with 256 entries (columns) per row for each possible input 160 * byte value. 161 * Each entry is 32 bits wide, with two formats distinguished by 162 * the sign bit (bit 31): 163 * 164 * One format for transitional entries (bit 31 not set) for non-final bytes, and 165 * one format for final entries (bit 31 set). 166 * Both formats contain the number of the next state in the same bit 167 * positions. 168 * State 0 is the initial state. 
*
 * Most of the time, the offset values of subsequent states are added
 * up to a scalar value. This value will eventually be the index of
 * the Unicode code point in a table that follows the state table.
 * The effect is that the code points for final state table rows
 * are contiguous. The code points of final state rows follow each other
 * in the order of the references to those final states by previous
 * states, etc.
 *
 * For some terminal states, the offset is itself the output Unicode
 * code point (16 bits for a BMP code point or 20 bits for a supplementary
 * code point, stored as code point minus 0x10000 so that 20 bits are enough).
 * For others, the code point in the Unicode table is stored with either
 * one or two code units: one for BMP code points, two for a pair of
 * surrogates.
 * All code points for a final state entry take up the same number of code
 * units, regardless of whether they all actually _use_ the same number
 * of code units. This is necessary for simple array access.
 *
 * An additional feature comes in with what in ICU is called "fallback"
 * mappings:
 *
 * In addition to round-trippable, precise, 1:1 mappings, there are often
 * mappings defined between similar, though not the same, characters.
 * Typically, such mappings occur only in fromUnicode mapping tables because
 * Unicode has a superset repertoire of most other codepages. However, it
 * is possible to provide such mappings in the toUnicode tables, too.
 * In this case, the fallback mappings are partly integrated into the
 * general state tables because the structure of the encoding includes their
 * byte sequences.
 * For final entries in an initial state, fallback mappings are stored in
 * the entry itself like with roundtrip mappings.
 * For other final entries, they are stored in the code units table if
 * the entry is for a pair of code units.
203 * For single-unit results in the code units table, there is no space to 204 * alternatively hold a fallback mapping; in this case, the code unit 205 * is stored as U+fffe (unassigned), and the fallback mapping needs to 206 * be looked up by the scalar offset value in a separate table. 207 * 208 * "Unassigned" state entries really mean "structurally unassigned", 209 * i.e., such a byte sequence will never have a mapping result. 210 * 211 * The interpretation of the bits in each entry is as follows: 212 * 213 * Bit 31 not set, not a terminal entry ("transitional"): 214 * 30..24 next state 215 * 23..0 offset delta, to be added up 216 * 217 * Bit 31 set, terminal ("final") entry: 218 * 30..24 next state (regardless of action code) 219 * 23..20 action code: 220 * action codes 0 and 1 result in precise-mapping Unicode code points 221 * 0 valid byte sequence 222 * 19..16 not used, 0 223 * 15..0 16-bit Unicode BMP code point 224 * never U+fffe or U+ffff 225 * 1 valid byte sequence 226 * 19..0 20-bit Unicode supplementary code point 227 * never U+fffe or U+ffff 228 * 229 * action codes 2 and 3 result in fallback (unidirectional-mapping) Unicode code points 230 * 2 valid byte sequence (fallback) 231 * 19..16 not used, 0 232 * 15..0 16-bit Unicode BMP code point as fallback result 233 * 3 valid byte sequence (fallback) 234 * 19..0 20-bit Unicode supplementary code point as fallback result 235 * 236 * action codes 4 and 5 may result in roundtrip/fallback/unassigned/illegal results 237 * depending on the code units they result in 238 * 4 valid byte sequence 239 * 19..9 not used, 0 240 * 8..0 final offset delta 241 * pointing to one 16-bit code unit which may be 242 * fffe unassigned -- look for a fallback for this offset 243 * ffff illegal 244 * 5 valid byte sequence 245 * 19..9 not used, 0 246 * 8..0 final offset delta 247 * pointing to two 16-bit code units 248 * (typically UTF-16 surrogates) 249 * the result depends on the first code unit as follows: 250 * 0000..d7ff 
roundtrip BMP code point (1st alone) 251 * d800..dbff roundtrip surrogate pair (1st, 2nd) 252 * dc00..dfff fallback surrogate pair (1st-400, 2nd) 253 * e000 roundtrip BMP code point (2nd alone) 254 * e001 fallback BMP code point (2nd alone) 255 * fffe unassigned 256 * ffff illegal 257 * (the final offset deltas are at most 255 * 2, 258 * times 2 because of storing code unit pairs) 259 * 260 * 6 unassigned byte sequence 261 * 19..16 not used, 0 262 * 15..0 16-bit Unicode BMP code point U+fffe (new with version 2) 263 * this does not contain a final offset delta because the main 264 * purpose of this action code is to save scalar offset values; 265 * therefore, fallback values cannot be assigned to byte 266 * sequences that result in this action code 267 * 7 illegal byte sequence 268 * 19..16 not used, 0 269 * 15..0 16-bit Unicode BMP code point U+ffff (new with version 2) 270 * 8 state change only 271 * 19..0 not used, 0 272 * useful for state changes in simple stateful encodings, 273 * at Shift-In/Shift-Out codes 274 * 275 * 276 * 9..15 reserved for future use 277 * current implementations will only perform a state change 278 * and ignore bits 19..0 279 * 280 * An encoding with contiguous ranges of unassigned byte sequences, like 281 * Shift-JIS and especially EUC-TW, can be stored efficiently by having 282 * at least two states for the trail bytes: 283 * One trail byte state that results in code points, and one that only 284 * has "unassigned" and "illegal" terminal states. 285 * 286 * Note: partly by accident, this data structure supports simple stateful 287 * encodings without any additional logic. 288 * Currently, only simple Shift-In/Shift-Out schemes are handled with 289 * appropriate state tables (especially EBCDIC_STATEFUL!). 
290 * 291 * MBCS version 2 added: 292 * unassigned and illegal action codes have U+fffe and U+ffff 293 * instead of unused bits; this is useful for _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP() 294 * 295 * Converting from Unicode to codepage bytes --------------------------------*** 296 * 297 * The conversion data structure for fromUnicode is designed for the known 298 * structure of Unicode. It maps from 21-bit code points (0..0x10ffff) to 299 * a sequence of 1..4 bytes, in addition to a flag that indicates if there is 300 * a roundtrip mapping. 301 * 302 * The lookup is done with a 3-stage trie, using 11/6/4 bits for stage 1/2/3 303 * like in the character properties table. 304 * The beginning of the trie is at offsetFromUTable, the beginning of stage 3 305 * with the resulting bytes is at offsetFromUBytes. 306 * 307 * Beginning with version 4, single-byte codepages have a significantly different 308 * trie compared to other codepages. 309 * In all cases, the entry in stage 1 is directly the index of the block of 310 * 64 entries in stage 2. 311 * 312 * Single-byte lookup: 313 * 314 * Stage 2 only contains 16-bit indexes directly to the 16-blocks in stage 3. 315 * Stage 3 contains one 16-bit word per result: 316 * Bits 15..8 indicate the kind of result: 317 * f roundtrip result 318 * c fallback result from private-use code point 319 * 8 fallback result from other code points 320 * 0 unassigned 321 * Bits 7..0 contain the codepage byte. A zero byte is always possible. 322 * 323 * In version 4.3, the runtime code can build an sbcsIndex for a utf8Friendly 324 * file. For 2-byte UTF-8 byte sequences and some 3-byte sequences the lookup 325 * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3. 326 * ASCII code points can be looked up with a linear array access into stage 3. 327 * See maxFastUChar and other details in ucnvmbcs.h. 
328 * 329 * Multi-byte lookup: 330 * 331 * Stage 2 contains a 32-bit word for each 16-block in stage 3: 332 * Bits 31..16 contain flags for which stage 3 entries contain roundtrip results 333 * test: MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) 334 * If this test is false, then a non-zero result will be interpreted as 335 * a fallback mapping. 336 * Bits 15..0 contain the index to stage 3, which must be multiplied by 16*(bytes per char) 337 * 338 * Stage 3 contains 2, 3, or 4 bytes per result. 339 * 2 or 4 bytes are stored as uint16_t/uint32_t in platform endianness, 340 * while 3 bytes are stored as bytes in big-endian order. 341 * Leading zero bytes are ignored, and the number of bytes is counted. 342 * A zero byte mapping result is possible as a roundtrip result. 343 * For some output types, the actual result is processed from this; 344 * see ucnv_MBCSFromUnicodeWithOffsets(). 345 * 346 * Note that stage 1 always contains 0x440=1088 entries (0x440==0x110000>>10), 347 * or (version 3 and up) for BMP-only codepages, it contains 64 entries. 348 * 349 * In version 4.3, a utf8Friendly file contains an mbcsIndex table. 350 * For 2-byte UTF-8 byte sequences and most 3-byte sequences the lookup 351 * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3. 352 * ASCII code points can be looked up with a linear array access into stage 3. 353 * See maxFastUChar, mbcsIndex and other details in ucnvmbcs.h. 354 * 355 * In version 3, stage 2 blocks may overlap by multiples of the multiplier 356 * for compaction. 357 * In version 4, stage 2 blocks (and for single-byte codepages, stage 3 blocks) 358 * may overlap by any number of entries. 
359 * 360 * MBCS version 2 added: 361 * the converter checks for known output types, which allows 362 * adding new ones without crashing an unaware converter 363 */ 364 365 /** 366 * Callback from ucnv_MBCSEnumToUnicode(), takes 32 mappings from 367 * consecutive sequences of bytes, starting from the one encoded in value, 368 * to Unicode code points. (Multiple mappings to reduce per-function call overhead.) 369 * Does not currently support m:n mappings or reverse fallbacks. 370 * This function will not be called for sequences of bytes with leading zeros. 371 * 372 * @param context an opaque pointer, as passed into ucnv_MBCSEnumToUnicode() 373 * @param value contains 1..4 bytes of the first byte sequence, right-aligned 374 * @param codePoints resulting Unicode code points, or negative if a byte sequence does 375 * not map to anything 376 * @return true to continue enumeration, false to stop 377 */ 378 typedef UBool U_CALLCONV 379 UConverterEnumToUCallback(const void *context, uint32_t value, UChar32 codePoints[32]); 380 381 static void U_CALLCONV 382 ucnv_MBCSLoad(UConverterSharedData *sharedData, 383 UConverterLoadArgs *pArgs, 384 const uint8_t *raw, 385 UErrorCode *pErrorCode); 386 387 static void U_CALLCONV 388 ucnv_MBCSUnload(UConverterSharedData *sharedData); 389 390 static void U_CALLCONV 391 ucnv_MBCSOpen(UConverter *cnv, 392 UConverterLoadArgs *pArgs, 393 UErrorCode *pErrorCode); 394 395 static UChar32 U_CALLCONV 396 ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs, 397 UErrorCode *pErrorCode); 398 399 static void U_CALLCONV 400 ucnv_MBCSGetStarters(const UConverter* cnv, 401 UBool starters[256], 402 UErrorCode *pErrorCode); 403 404 U_CDECL_BEGIN 405 static const char* U_CALLCONV 406 ucnv_MBCSGetName(const UConverter *cnv); 407 U_CDECL_END 408 409 static void U_CALLCONV 410 ucnv_MBCSWriteSub(UConverterFromUnicodeArgs *pArgs, 411 int32_t offsetIndex, 412 UErrorCode *pErrorCode); 413 414 static UChar32 U_CALLCONV 415 
ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs, 416 UErrorCode *pErrorCode); 417 418 static void U_CALLCONV 419 ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs, 420 UConverterToUnicodeArgs *pToUArgs, 421 UErrorCode *pErrorCode); 422 423 static void U_CALLCONV 424 ucnv_MBCSGetUnicodeSet(const UConverter *cnv, 425 const USetAdder *sa, 426 UConverterUnicodeSet which, 427 UErrorCode *pErrorCode); 428 429 static void U_CALLCONV 430 ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs, 431 UConverterToUnicodeArgs *pToUArgs, 432 UErrorCode *pErrorCode); 433 434 static const UConverterImpl _SBCSUTF8Impl={ 435 UCNV_MBCS, 436 437 ucnv_MBCSLoad, 438 ucnv_MBCSUnload, 439 440 ucnv_MBCSOpen, 441 nullptr, 442 nullptr, 443 444 ucnv_MBCSToUnicodeWithOffsets, 445 ucnv_MBCSToUnicodeWithOffsets, 446 ucnv_MBCSFromUnicodeWithOffsets, 447 ucnv_MBCSFromUnicodeWithOffsets, 448 ucnv_MBCSGetNextUChar, 449 450 ucnv_MBCSGetStarters, 451 ucnv_MBCSGetName, 452 ucnv_MBCSWriteSub, 453 nullptr, 454 ucnv_MBCSGetUnicodeSet, 455 456 nullptr, 457 ucnv_SBCSFromUTF8 458 }; 459 460 static const UConverterImpl _DBCSUTF8Impl={ 461 UCNV_MBCS, 462 463 ucnv_MBCSLoad, 464 ucnv_MBCSUnload, 465 466 ucnv_MBCSOpen, 467 nullptr, 468 nullptr, 469 470 ucnv_MBCSToUnicodeWithOffsets, 471 ucnv_MBCSToUnicodeWithOffsets, 472 ucnv_MBCSFromUnicodeWithOffsets, 473 ucnv_MBCSFromUnicodeWithOffsets, 474 ucnv_MBCSGetNextUChar, 475 476 ucnv_MBCSGetStarters, 477 ucnv_MBCSGetName, 478 ucnv_MBCSWriteSub, 479 nullptr, 480 ucnv_MBCSGetUnicodeSet, 481 482 nullptr, 483 ucnv_DBCSFromUTF8 484 }; 485 486 static const UConverterImpl _MBCSImpl={ 487 UCNV_MBCS, 488 489 ucnv_MBCSLoad, 490 ucnv_MBCSUnload, 491 492 ucnv_MBCSOpen, 493 nullptr, 494 nullptr, 495 496 ucnv_MBCSToUnicodeWithOffsets, 497 ucnv_MBCSToUnicodeWithOffsets, 498 ucnv_MBCSFromUnicodeWithOffsets, 499 ucnv_MBCSFromUnicodeWithOffsets, 500 ucnv_MBCSGetNextUChar, 501 502 ucnv_MBCSGetStarters, 503 ucnv_MBCSGetName, 504 ucnv_MBCSWriteSub, 505 nullptr, 506 
ucnv_MBCSGetUnicodeSet, 507 nullptr, 508 nullptr 509 }; 510 511 /* Static data is in tools/makeconv/ucnvstat.c for data-based 512 * converters. Be sure to update it as well. 513 */ 514 515 const UConverterSharedData _MBCSData={ 516 sizeof(UConverterSharedData), 1, 517 nullptr, nullptr, false, true, &_MBCSImpl, 518 0, UCNV_MBCS_TABLE_INITIALIZER 519 }; 520 521 522 /* GB 18030 data ------------------------------------------------------------ */ 523 524 /* helper macros for linear values for GB 18030 four-byte sequences */ 525 #define LINEAR_18030(a, b, c, d) ((((a)*10+(b))*126L+(c))*10L+(d)) 526 527 #define LINEAR_18030_BASE LINEAR_18030(0x81, 0x30, 0x81, 0x30) 528 529 #define LINEAR(x) LINEAR_18030(x>>24, (x>>16)&0xff, (x>>8)&0xff, x&0xff) 530 531 /* 532 * Some ranges of GB 18030 where both the Unicode code points and the 533 * GB four-byte sequences are contiguous and are handled algorithmically by 534 * the special callback functions below. 535 * The values are start & end of Unicode & GB codes. 536 * 537 * Note that single surrogates are not mapped by GB 18030 538 * as of the re-released mapping tables from 2000-nov-30. 
539 */ 540 static const uint32_t 541 gb18030Ranges[14][4]={ 542 {0x10000, 0x10FFFF, LINEAR(0x90308130), LINEAR(0xE3329A35)}, 543 {0x9FA6, 0xD7FF, LINEAR(0x82358F33), LINEAR(0x8336C738)}, 544 {0x0452, 0x1E3E, LINEAR(0x8130D330), LINEAR(0x8135F436)}, 545 {0x1E40, 0x200F, LINEAR(0x8135F438), LINEAR(0x8136A531)}, 546 {0xE865, 0xF92B, LINEAR(0x8336D030), LINEAR(0x84308534)}, 547 {0x2643, 0x2E80, LINEAR(0x8137A839), LINEAR(0x8138FD38)}, 548 {0xFA2A, 0xFE2F, LINEAR(0x84309C38), LINEAR(0x84318537)}, 549 {0x3CE1, 0x4055, LINEAR(0x8231D438), LINEAR(0x8232AF32)}, 550 {0x361B, 0x3917, LINEAR(0x8230A633), LINEAR(0x8230F237)}, 551 {0x49B8, 0x4C76, LINEAR(0x8234A131), LINEAR(0x8234E733)}, 552 {0x4160, 0x4336, LINEAR(0x8232C937), LINEAR(0x8232F837)}, 553 {0x478E, 0x4946, LINEAR(0x8233E838), LINEAR(0x82349638)}, 554 {0x44D7, 0x464B, LINEAR(0x8233A339), LINEAR(0x8233C931)}, 555 {0xFFE6, 0xFFFF, LINEAR(0x8431A234), LINEAR(0x8431A439)} 556 }; 557 558 /* bit flag for UConverter.options indicating GB 18030 special handling */ 559 #define _MBCS_OPTION_GB18030 0x8000 560 561 /* bit flag for UConverter.options indicating KEIS,JEF,JIF special handling */ 562 #define _MBCS_OPTION_KEIS 0x01000 563 #define _MBCS_OPTION_JEF 0x02000 564 #define _MBCS_OPTION_JIPS 0x04000 565 566 #define KEIS_SO_CHAR_1 0x0A 567 #define KEIS_SO_CHAR_2 0x42 568 #define KEIS_SI_CHAR_1 0x0A 569 #define KEIS_SI_CHAR_2 0x41 570 571 #define JEF_SO_CHAR 0x28 572 #define JEF_SI_CHAR 0x29 573 574 #define JIPS_SO_CHAR_1 0x1A 575 #define JIPS_SO_CHAR_2 0x70 576 #define JIPS_SI_CHAR_1 0x1A 577 #define JIPS_SI_CHAR_2 0x71 578 579 enum SISO_Option { 580 SI, 581 SO 582 }; 583 typedef enum SISO_Option SISO_Option; 584 585 static int32_t getSISOBytes(SISO_Option option, uint32_t cnvOption, uint8_t *value) { 586 int32_t SISOLength = 0; 587 588 switch (option) { 589 case SI: 590 if ((cnvOption&_MBCS_OPTION_KEIS)!=0) { 591 value[0] = KEIS_SI_CHAR_1; 592 value[1] = KEIS_SI_CHAR_2; 593 SISOLength = 2; 594 } else if 
((cnvOption&_MBCS_OPTION_JEF)!=0) { 595 value[0] = JEF_SI_CHAR; 596 SISOLength = 1; 597 } else if ((cnvOption&_MBCS_OPTION_JIPS)!=0) { 598 value[0] = JIPS_SI_CHAR_1; 599 value[1] = JIPS_SI_CHAR_2; 600 SISOLength = 2; 601 } else { 602 value[0] = UCNV_SI; 603 SISOLength = 1; 604 } 605 break; 606 case SO: 607 if ((cnvOption&_MBCS_OPTION_KEIS)!=0) { 608 value[0] = KEIS_SO_CHAR_1; 609 value[1] = KEIS_SO_CHAR_2; 610 SISOLength = 2; 611 } else if ((cnvOption&_MBCS_OPTION_JEF)!=0) { 612 value[0] = JEF_SO_CHAR; 613 SISOLength = 1; 614 } else if ((cnvOption&_MBCS_OPTION_JIPS)!=0) { 615 value[0] = JIPS_SO_CHAR_1; 616 value[1] = JIPS_SO_CHAR_2; 617 SISOLength = 2; 618 } else { 619 value[0] = UCNV_SO; 620 SISOLength = 1; 621 } 622 break; 623 default: 624 /* Should never happen. */ 625 break; 626 } 627 628 return SISOLength; 629 } 630 631 /* Miscellaneous ------------------------------------------------------------ */ 632 633 /* similar to ucnv_MBCSGetNextUChar() but recursive */ 634 static UBool 635 enumToU(UConverterMBCSTable *mbcsTable, int8_t stateProps[], 636 int32_t state, uint32_t offset, 637 uint32_t value, 638 UConverterEnumToUCallback *callback, const void *context, 639 UErrorCode *pErrorCode) { 640 UChar32 codePoints[32]; 641 const int32_t *row; 642 const uint16_t *unicodeCodeUnits; 643 UChar32 anyCodePoints; 644 int32_t b, limit; 645 646 row=mbcsTable->stateTable[state]; 647 unicodeCodeUnits=mbcsTable->unicodeCodeUnits; 648 649 value<<=8; 650 anyCodePoints=-1; /* becomes non-negative if there is a mapping */ 651 652 b=(stateProps[state]&0x38)<<2; 653 if(b==0 && stateProps[state]>=0x40) { 654 /* skip byte sequences with leading zeros because they are not stored in the fromUnicode table */ 655 codePoints[0]=U_SENTINEL; 656 b=1; 657 } 658 limit=((stateProps[state]&7)+1)<<5; 659 while(b<limit) { 660 int32_t entry=row[b]; 661 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 662 int32_t nextState=MBCS_ENTRY_TRANSITION_STATE(entry); 663 if(stateProps[nextState]>=0) { 664 /* recurse to 
a state with non-ignorable actions */ 665 if(!enumToU( 666 mbcsTable, stateProps, nextState, 667 offset+MBCS_ENTRY_TRANSITION_OFFSET(entry), 668 value | static_cast<uint32_t>(b), 669 callback, context, 670 pErrorCode)) { 671 return false; 672 } 673 } 674 codePoints[b&0x1f]=U_SENTINEL; 675 } else { 676 UChar32 c; 677 int32_t action; 678 679 /* 680 * An if-else-if chain provides more reliable performance for 681 * the most common cases compared to a switch. 682 */ 683 action=MBCS_ENTRY_FINAL_ACTION(entry); 684 if(action==MBCS_STATE_VALID_DIRECT_16) { 685 /* output BMP code point */ 686 c = static_cast<char16_t>(MBCS_ENTRY_FINAL_VALUE_16(entry)); 687 } else if(action==MBCS_STATE_VALID_16) { 688 int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); 689 c=unicodeCodeUnits[finalOffset]; 690 if(c<0xfffe) { 691 /* output BMP code point */ 692 } else { 693 c=U_SENTINEL; 694 } 695 } else if(action==MBCS_STATE_VALID_16_PAIR) { 696 int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); 697 c=unicodeCodeUnits[finalOffset++]; 698 if(c<0xd800) { 699 /* output BMP code point below 0xd800 */ 700 } else if(c<=0xdbff) { 701 /* output roundtrip or fallback supplementary code point */ 702 c=((c&0x3ff)<<10)+unicodeCodeUnits[finalOffset]+(0x10000-0xdc00); 703 } else if(c==0xe000) { 704 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 705 c=unicodeCodeUnits[finalOffset]; 706 } else { 707 c=U_SENTINEL; 708 } 709 } else if(action==MBCS_STATE_VALID_DIRECT_20) { 710 /* output supplementary code point */ 711 c = static_cast<UChar32>(MBCS_ENTRY_FINAL_VALUE(entry) + 0x10000); 712 } else { 713 c=U_SENTINEL; 714 } 715 716 codePoints[b&0x1f]=c; 717 anyCodePoints&=c; 718 } 719 if(((++b)&0x1f)==0) { 720 if(anyCodePoints>=0) { 721 if (!callback(context, value | static_cast<uint32_t>(b - 0x20), codePoints)) { 722 return false; 723 } 724 anyCodePoints=-1; 725 } 726 } 727 } 728 return true; 729 } 730 731 /* 732 * Only called if stateProps[state]==-1. 
733 * A recursive call may do stateProps[state]|=0x40 if this state is the target of an 734 * MBCS_STATE_CHANGE_ONLY. 735 */ 736 static int8_t 737 getStateProp(const int32_t (*stateTable)[256], int8_t stateProps[], int state) { 738 const int32_t *row; 739 int32_t min, max, entry, nextState; 740 741 row=stateTable[state]; 742 stateProps[state]=0; 743 744 /* find first non-ignorable state */ 745 for(min=0;; ++min) { 746 entry=row[min]; 747 nextState=MBCS_ENTRY_STATE(entry); 748 if(stateProps[nextState]==-1) { 749 getStateProp(stateTable, stateProps, nextState); 750 } 751 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 752 if(stateProps[nextState]>=0) { 753 break; 754 } 755 } else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) { 756 break; 757 } 758 if(min==0xff) { 759 stateProps[state]=-0x40; /* (int8_t)0xc0 */ 760 return stateProps[state]; 761 } 762 } 763 stateProps[state] |= static_cast<int8_t>((min >> 5) << 3); 764 765 /* find last non-ignorable state */ 766 for(max=0xff; min<max; --max) { 767 entry=row[max]; 768 nextState=MBCS_ENTRY_STATE(entry); 769 if(stateProps[nextState]==-1) { 770 getStateProp(stateTable, stateProps, nextState); 771 } 772 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 773 if(stateProps[nextState]>=0) { 774 break; 775 } 776 } else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) { 777 break; 778 } 779 } 780 stateProps[state] |= static_cast<int8_t>(max >> 5); 781 782 /* recurse further and collect direct-state information */ 783 while(min<=max) { 784 entry=row[min]; 785 nextState=MBCS_ENTRY_STATE(entry); 786 if(stateProps[nextState]==-1) { 787 getStateProp(stateTable, stateProps, nextState); 788 } 789 if(MBCS_ENTRY_IS_FINAL(entry)) { 790 stateProps[nextState]|=0x40; 791 if(MBCS_ENTRY_FINAL_ACTION(entry)<=MBCS_STATE_FALLBACK_DIRECT_20) { 792 stateProps[state]|=0x40; 793 } 794 } 795 ++min; 796 } 797 return stateProps[state]; 798 } 799 800 /* 801 * Internal function enumerating the toUnicode data of an MBCS converter. 
* Currently only used for reconstituting data for a MBCS_OPT_NO_FROM_U
 * table, but could also be used for a future ucnv_getUnicodeSet() option
 * that includes reverse fallbacks (after updating this function's implementation).
 * Currently only handles roundtrip mappings.
 * Does not currently handle extensions.
 *
 * Computes per-state properties with getStateProp(), starting at state 0,
 * and then calls enumToU() once for each state that was marked as a
 * direct/initial state, passing the callback and context through.
 */
static void
ucnv_MBCSEnumToUnicode(UConverterMBCSTable *mbcsTable,
                       UConverterEnumToUCallback *callback, const void *context,
                       UErrorCode *pErrorCode) {
    /*
     * Properties for each state, to speed up the enumeration.
     * Ignorable actions are unassigned/illegal/state-change-only:
     * They do not lead to mappings.
     *
     * Bits 7..6:
     * 1 direct/initial state (stateful converters have multiple)
     * 0 non-initial state with transitions or with non-ignorable result actions
     * -1 final state with only ignorable actions
     *
     * Bits 5..3:
     * The lowest byte value with non-ignorable actions is
     * value<<5 (rounded down).
     *
     * Bits 2..0:
     * The highest byte value with non-ignorable actions is
     * (value<<5)|0x1f (rounded up).
     */
    int8_t stateProps[MBCS_MAX_STATE_COUNT];
    int32_t state;

    /* -1 (all bits set) marks a state as not yet visited / ignorable */
    uprv_memset(stateProps, -1, sizeof(stateProps));

    /* recurse from state 0 and set all stateProps */
    getStateProp(mbcsTable->stateTable, stateProps, 0);

    for(state=0; state<mbcsTable->countStates; ++state) {
        /*if(stateProps[state]==-1) {
            printf("unused/unreachable <icu:state> %d\n", state);
        }*/
        /* >=0x40 means bits 7..6 hold the value 1: a direct/initial state */
        if(stateProps[state]>=0x40) {
            /* start from each direct state */
            enumToU(
                mbcsTable, stateProps, state, 0, 0,
                callback, context,
                pErrorCode);
        }
    }
}

/*
 * Enumerates the from-Unicode trie of the converter's MBCS table and calls
 * sa->add() for each code point that has a mapping selected by `which`
 * (roundtrips only, or roundtrips and fallbacks) and accepted by `filter`.
 * Afterwards calls ucnv_extGetUnicodeSet() for the extension data.
 *
 * For MBCS_OUTPUT_1 tables the `filter` argument is not consulted in the
 * trie walk. For all other output types an unrecognized `filter` value
 * sets U_INTERNAL_PROGRAM_ERROR and returns immediately, without calling
 * ucnv_extGetUnicodeSet().
 */
U_CFUNC void
ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
                                         const USetAdder *sa,
                                         UConverterUnicodeSet which,
                                         UConverterSetFilter filter,
                                         UErrorCode *pErrorCode) {
    const UConverterMBCSTable *mbcsTable;
    const uint16_t *table;

    uint32_t st3;
    uint16_t st1, maxStage1, st2;

    UChar32 c;

    /* enumerate the from-Unicode trie table */
    mbcsTable=&sharedData->mbcs;
    table=mbcsTable->fromUnicodeTable;
    /*
     * Each stage 1 entry covers 1024 code points (see the c+=1024 skips below):
     * 0x440 entries cover U+0000..U+10FFFF, 0x40 entries cover the BMP only.
     */
    if(mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
        maxStage1=0x440;
    } else {
        maxStage1=0x40;
    }

    c=0; /* keep track of the current code point while enumerating */

    if(mbcsTable->outputType==MBCS_OUTPUT_1) {
        /* single-byte output: stage 2 and stage 3 entries are 16-bit units */
        const uint16_t *stage2, *stage3, *results;
        uint16_t minValue;

        results=(const uint16_t *)mbcsTable->fromUnicodeBytes;

        /*
         * Set a threshold variable for selecting which mappings to use.
         * See ucnv_MBCSSingleFromBMPWithOffsets() and
         * MBCS_SINGLE_RESULT_FROM_U() for details.
         */
        if(which==UCNV_ROUNDTRIP_SET) {
            /* use only roundtrips */
            minValue=0xf00;
        } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ {
            /* use all roundtrip and fallback results */
            minValue=0x800;
        }

        for(st1=0; st1<maxStage1; ++st1) {
            st2=table[st1];
            if(st2>maxStage1) {
                stage2=table+st2;
                /* 64 stage 2 entries per block, each covering 16 code points */
                for(st2=0; st2<64; ++st2) {
                    if((st3=stage2[st2])!=0) {
                        /* read the stage 3 block */
                        stage3=results+st3;

                        /* c is advanced once per stage 3 entry until the next multiple of 16 */
                        do {
                            if(*stage3++>=minValue) {
                                sa->add(sa->set, c);
                            }
                        } while((++c&0xf)!=0);
                    } else {
                        c+=16; /* empty stage 3 block */
                    }
                }
            } else {
                c+=1024; /* empty stage 2 block */
            }
        }
    } else {
        /* multi-byte output: stage 2 entries are 32-bit, stage 3 holds raw bytes */
        const uint32_t *stage2;
        const uint8_t *stage3, *bytes;
        uint32_t st3Multiplier;
        uint32_t value;
        UBool useFallback;

        bytes=mbcsTable->fromUnicodeBytes;

        useFallback = which == UCNV_ROUNDTRIP_AND_FALLBACK_SET;

        /* number of stage 3 bytes stored per code point for this output type */
        switch(mbcsTable->outputType) {
        case MBCS_OUTPUT_3:
        case MBCS_OUTPUT_4_EUC:
            st3Multiplier=3;
            break;
        case MBCS_OUTPUT_4:
            st3Multiplier=4;
            break;
        default:
            st3Multiplier=2;
            break;
        }

        for(st1=0; st1<maxStage1; ++st1) {
            st2=table[st1];
            /* stage 2 is indexed in 32-bit units here, hence the halved threshold */
            if(st2>(maxStage1>>1)) {
                stage2=(const uint32_t *)table+st2;
                for(st2=0; st2<64; ++st2) {
                    if((st3=stage2[st2])!=0) {
                        /* read the stage 3 block */
                        /* the low 16 bits of the stage 2 entry are the stage 3 block index */
                        stage3=bytes+st3Multiplier*16*(uint32_t)(uint16_t)st3;

                        /* get the roundtrip flags for the stage 3 block */
                        st3>>=16;

                        /*
                         * Add code points for which the roundtrip flag is set,
                         * or which map to non-zero bytes if we use fallbacks.
                         * See ucnv_MBCSFromUnicodeWithOffsets() for details.
                         */
                        switch(filter) {
                        case UCNV_SET_FILTER_NONE:
                            do {
                                if(st3&1) {
                                    sa->add(sa->set, c);
                                    stage3+=st3Multiplier;
                                } else if(useFallback) {
                                    /*
                                     * OR together all st3Multiplier bytes of this entry;
                                     * the fallthrough consumes 4, 3 or 2 bytes in total.
                                     */
                                    uint8_t b=0;
                                    switch(st3Multiplier) {
                                    case 4:
                                        b|=*stage3++;
                                        U_FALLTHROUGH;
                                    case 3:
                                        b|=*stage3++;
                                        U_FALLTHROUGH;
                                    case 2:
                                        b|=stage3[0]|stage3[1];
                                        stage3+=2;
                                        U_FALLTHROUGH;
                                    default:
                                        break;
                                    }
                                    if(b!=0) {
                                        sa->add(sa->set, c);
                                    }
                                }
                                /*
                                 * Note: stage3 is not advanced when the roundtrip flag is clear
                                 * and useFallback is false; in that mode stage3 is never read.
                                 */
                                st3>>=1;
                            } while((++c&0xf)!=0);
                            break;
                        case UCNV_SET_FILTER_DBCS_ONLY:
                             /* Ignore single-byte results (<0x100). */
                            do {
                                if(((st3&1)!=0 || useFallback) && *((const uint16_t *)stage3)>=0x100) {
                                    sa->add(sa->set, c);
                                }
                                st3>>=1;
                                stage3+=2;  /* +=st3Multiplier */
                            } while((++c&0xf)!=0);
                            break;
                        case UCNV_SET_FILTER_2022_CN:
                             /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */
                            do {
                                if(((st3&1)!=0 || useFallback) && ((value=*stage3)==0x81 || value==0x82)) {
                                    sa->add(sa->set, c);
                                }
                                st3>>=1;
                                stage3+=3;  /* +=st3Multiplier */
                            } while((++c&0xf)!=0);
                            break;
                        case UCNV_SET_FILTER_SJIS:
                             /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */
                            do {
                                if(((st3&1)!=0 || useFallback) && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) {
                                    sa->add(sa->set, c);
                                }
                                st3>>=1;
                                stage3+=2;  /* +=st3Multiplier */
                            } while((++c&0xf)!=0);
                            break;
                        case UCNV_SET_FILTER_GR94DBCS:
                            /* Only add code points that map to ISO 2022 GR 94 DBCS codes (each byte A1..FE). */
                            do {
                                /* first test: 16-bit value in A1A1..FEFE; second test: low byte in A1..FE */
                                if( ((st3&1)!=0 || useFallback) &&
                                    (uint16_t)((value=*((const uint16_t *)stage3)) - 0xa1a1)<=(0xfefe - 0xa1a1) &&
                                    (uint8_t)(value-0xa1)<=(0xfe - 0xa1)
                                ) {
                                    sa->add(sa->set, c);
                                }
                                st3>>=1;
                                stage3+=2;  /* +=st3Multiplier */
                            } while((++c&0xf)!=0);
                            break;
                        case UCNV_SET_FILTER_HZ:
                            /* Only add code points that are suitable for HZ DBCS (lead byte A1..FD). */
                            do {
                                /* first test: 16-bit value in A1A1..FDFE; second test: low byte in A1..FE */
                                if( ((st3&1)!=0 || useFallback) &&
                                    (uint16_t)((value=*((const uint16_t *)stage3))-0xa1a1)<=(0xfdfe - 0xa1a1) &&
                                    (uint8_t)(value-0xa1)<=(0xfe - 0xa1)
                                ) {
                                    sa->add(sa->set, c);
                                }
                                st3>>=1;
                                stage3+=2;  /* +=st3Multiplier */
                            } while((++c&0xf)!=0);
                            break;
                        default:
                            /* unknown filter value: report and stop without enumerating extensions */
                            *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
                            return;
                        }
                    } else {
                        c+=16; /* empty stage 3 block */
                    }
                }
            } else {
                c+=1024; /* empty stage 2 block */
            }
        }
    }

    ucnv_extGetUnicodeSet(sharedData, sa, which, filter, pErrorCode);
}

/*
 * Convenience wrapper around ucnv_MBCSGetFilteredUnicodeSetForUnicode():
 * uses UCNV_SET_FILTER_DBCS_ONLY when the table's outputType is
 * MBCS_OUTPUT_DBCS_ONLY, and UCNV_SET_FILTER_NONE otherwise.
 */
U_CFUNC void
ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
                                 const USetAdder *sa,
                                 UConverterUnicodeSet which,
                                 UErrorCode *pErrorCode) {
    ucnv_MBCSGetFilteredUnicodeSetForUnicode(
        sharedData, sa, which,
        sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ?
            UCNV_SET_FILTER_DBCS_ONLY :
            UCNV_SET_FILTER_NONE,
        pErrorCode);
}

/*
 * Adds the converter's Unicode set to `sa`.
 * With the GB 18030 option set, adds U+0000..U+D7FF and U+E000..U+10FFFF
 * (everything except the surrogate code points) without consulting the tables;
 * otherwise defers to ucnv_MBCSGetUnicodeSetForUnicode().
 */
static void U_CALLCONV
ucnv_MBCSGetUnicodeSet(const UConverter *cnv,
                   const USetAdder *sa,
                   UConverterUnicodeSet which,
                   UErrorCode *pErrorCode) {
    if(cnv->options&_MBCS_OPTION_GB18030) {
        sa->addRange(sa->set, 0, 0xd7ff);
        sa->addRange(sa->set, 0xe000, 0x10ffff);
    } else {
        ucnv_MBCSGetUnicodeSetForUnicode(cnv->sharedData, sa, which, pErrorCode);
    }
}

/* conversion extensions for input not in the main table -------------------- */

/*
 * Hardcoded extension handling for GB 18030.
 * Definition of LINEAR macros and gb18030Ranges see near the beginning of the file.
 *
 * In the future, conversion extensions may handle m:n mappings and delta tables,
 * see https://htmlpreview.github.io/?https://github.com/unicode-org/icu-docs/blob/main/design/conversion/conversion_extensions.html
 *
 * If an input character cannot be mapped, then these functions set an error
 * code. The framework will then call the callback function.
*/

/*
 * Handles a code point that the main from-Unicode table could not map.
 *
 * Order of attempts, as coded below:
 * 1. the extension table, if sharedData->mbcs.extIndexes is set
 *    (ucnv_extInitialMatchFromU());
 * 2. the GB 18030 ranges, if the converter has _MBCS_OPTION_GB18030;
 * 3. otherwise U_INVALID_CHAR_FOUND is set.
 *
 * Always resets cnv->useSubChar1 to false first.
 *
 * @return if(U_FAILURE) return the code point for cnv->fromUChar32
 *         else return 0 after output has been written to the target
 */
static UChar32
_extFromU(UConverter *cnv, const UConverterSharedData *sharedData,
          UChar32 cp,
          const char16_t **source, const char16_t *sourceLimit,
          uint8_t **target, const uint8_t *targetLimit,
          int32_t **offsets, int32_t sourceIndex,
          UBool flush,
          UErrorCode *pErrorCode) {
    const int32_t *cx;

    cnv->useSubChar1=false;

    if( (cx=sharedData->mbcs.extIndexes)!=nullptr &&
        ucnv_extInitialMatchFromU(
            cnv, cx,
            cp, source, sourceLimit,
            reinterpret_cast<char**>(target), reinterpret_cast<const char*>(targetLimit),
            offsets, sourceIndex,
            flush,
            pErrorCode)
    ) {
        return 0; /* an extension mapping handled the input */
    }

    /* GB 18030 */
    if((cnv->options&_MBCS_OPTION_GB18030)!=0) {
        const uint32_t *range;
        int32_t i;

        /* each gb18030Ranges row is { first cp, last cp, first linear value, last linear value } */
        range=gb18030Ranges[0];
        for(i=0; i<UPRV_LENGTHOF(gb18030Ranges); range+=4, ++i) {
            if (range[0] <= static_cast<uint32_t>(cp) && static_cast<uint32_t>(cp) <= range[1]) {
                /* found the Unicode code point, output the four-byte sequence for it */
                uint32_t linear;
                char bytes[4];

                /* get the linear value of the first GB 18030 code in this range */
                linear=range[2]-LINEAR_18030_BASE;

                /* add the offset from the beginning of the range */
                linear += (static_cast<uint32_t>(cp) - range[0]);

                /* turn this into a four-byte sequence */
                /* mixed radix, last byte first: 10 (from 0x30), 126 (from 0x81), 10 (from 0x30), remainder (from 0x81) */
                bytes[3] = static_cast<char>(0x30 + linear % 10); linear /= 10;
                bytes[2] = static_cast<char>(0x81 + linear % 126); linear /= 126;
                bytes[1] = static_cast<char>(0x30 + linear % 10); linear /= 10;
                bytes[0] = static_cast<char>(0x81 + linear);

                /* output this sequence */
                ucnv_fromUWriteBytes(cnv,
                                     bytes, 4, reinterpret_cast<char**>(target), reinterpret_cast<const char*>(targetLimit),
                                     offsets, sourceIndex, pErrorCode);
                return 0;
            }
        }
    }

    /* no mapping */
    *pErrorCode=U_INVALID_CHAR_FOUND;
    return cp;
}

/*
 * Handles a byte sequence that the main to-Unicode table could not map.
 * Tries the extension table first (if extIndexes is set), then, for
 * four-byte input on a GB 18030 converter, the gb18030Ranges table;
 * otherwise sets U_INVALID_CHAR_FOUND.
 *
 * Input sequence: cnv->toUBytes[0..length[
 * @return if(U_FAILURE) return the length (toULength, byteIndex) for the input
 *         else return 0 after output has been written to the target
 */
static int8_t
_extToU(UConverter *cnv, const UConverterSharedData *sharedData,
        int8_t length,
        const uint8_t **source, const uint8_t *sourceLimit,
        char16_t **target, const char16_t *targetLimit,
        int32_t **offsets, int32_t sourceIndex,
        UBool flush,
        UErrorCode *pErrorCode) {
    const int32_t *cx;

    if( (cx=sharedData->mbcs.extIndexes)!=nullptr &&
        ucnv_extInitialMatchToU(
            cnv, cx,
            length, reinterpret_cast<const char**>(source), reinterpret_cast<const char*>(sourceLimit),
            target, targetLimit,
            offsets, sourceIndex,
            flush,
            pErrorCode)
    ) {
        return 0; /* an extension mapping handled the input */
    }

    /* GB 18030 */
    if(length==4 && (cnv->options&_MBCS_OPTION_GB18030)!=0) {
        const uint32_t *range;
        uint32_t linear;
        int32_t i;

        linear=LINEAR_18030(cnv->toUBytes[0], cnv->toUBytes[1], cnv->toUBytes[2], cnv->toUBytes[3]);
        range=gb18030Ranges[0];
        for(i=0; i<UPRV_LENGTHOF(gb18030Ranges); range+=4, ++i) {
            if(range[2]<=linear && linear<=range[3]) {
                /* found the sequence, output the Unicode code point for it */
                /* clear whatever error code was set on entry before writing the output */
                *pErrorCode=U_ZERO_ERROR;

                /* add the linear difference between the input and start sequences to the start code point */
                linear=range[0]+(linear-range[2]);

                /* output this code point */
                ucnv_toUWriteCodePoint(cnv, linear, target, targetLimit, offsets, sourceIndex, pErrorCode);

                return 0;
            }
        }
    }

    /* no mapping */
    *pErrorCode=U_INVALID_CHAR_FOUND;
    return length;
}

/* EBCDIC swap LF<->NL ------------------------------------------------------ */

/*
 * This code modifies a standard EBCDIC<->Unicode mapping table for
 * OS/390 (z/OS) Unix System Services (Open Edition).
 * The difference is in the mapping of Line Feed and New Line control codes:
 * Standard EBCDIC maps
 *
 *   <U000A> \x25 |0
 *   <U0085> \x15 |0
 *
 * but OS/390 USS EBCDIC swaps the control codes for LF and NL,
 * mapping
 *
 *   <U000A> \x15 |0
 *   <U0085> \x25 |0
 *
 * This code modifies a loaded standard EBCDIC<->Unicode mapping table
 * by copying it into allocated memory and swapping the LF and NL values.
 * This makes it possible to support the same EBCDIC charset in both
 * versions without duplicating the entire installed table.
 */

/* standard EBCDIC codes */
#define EBCDIC_LF 0x25
#define EBCDIC_NL 0x15

/* standard EBCDIC codes with roundtrip flag as stored in Unicode-to-single-byte tables */
#define EBCDIC_RT_LF 0xf25
#define EBCDIC_RT_NL 0xf15

/* Unicode code points */
#define U_LF 0x0a
#define U_NL 0x85

/*
 * Builds the LF/NL-swapped copies of the to-Unicode state table and the
 * from-Unicode bytes, plus a converter name with the swap option appended,
 * and stores them in sharedData->mbcs (swapLFNLStateTable,
 * swapLFNLFromUnicodeBytes, swapLFNLName).
 *
 * All three pieces live in one uprv_malloc() allocation, in this order:
 * state table copy, from-Unicode bytes copy, name string. The state table
 * pointer is therefore also the pointer that owns the allocation.
 *
 * @return false without setting an error if the table is not a standard
 *         EBCDIC SBCS/SISO table (the option does not apply);
 *         false with *pErrorCode set if the from-Unicode size is unknown
 *         (U_INVALID_FORMAT_ERROR) or allocation fails
 *         (U_MEMORY_ALLOCATION_ERROR);
 *         true if the swapped data is available afterwards, whether it was
 *         installed by this call or found already installed under the mutex.
 */
static UBool
_EBCDICSwapLFNL(UConverterSharedData *sharedData, UErrorCode *pErrorCode) {
    UConverterMBCSTable *mbcsTable;

    const uint16_t *table, *results;
    const uint8_t *bytes;

    int32_t (*newStateTable)[256];
    uint16_t *newResults;
    uint8_t *p;
    char *name;

    uint32_t stage2Entry;
    uint32_t size, sizeofFromUBytes;

    mbcsTable=&sharedData->mbcs;

    table=mbcsTable->fromUnicodeTable;
    bytes=mbcsTable->fromUnicodeBytes;
    results = reinterpret_cast<const uint16_t*>(bytes);

    /*
     * Check that this is an EBCDIC table with SBCS portion -
     * SBCS or EBCDIC_STATEFUL with standard EBCDIC LF and NL mappings.
     *
     * If not, ignore the option. Options are always ignored if they do not apply.
     */
    if(!(
         (mbcsTable->outputType==MBCS_OUTPUT_1 || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) &&
         mbcsTable->stateTable[0][EBCDIC_LF]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF) &&
         mbcsTable->stateTable[0][EBCDIC_NL]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL)
    )) {
        return false;
    }

    /* also require the standard roundtrip mappings in the from-Unicode direction */
    if(mbcsTable->outputType==MBCS_OUTPUT_1) {
        if(!(
             EBCDIC_RT_LF==MBCS_SINGLE_RESULT_FROM_U(table, results, U_LF) &&
             EBCDIC_RT_NL==MBCS_SINGLE_RESULT_FROM_U(table, results, U_NL)
        )) {
            return false;
        }
    } else /* MBCS_OUTPUT_2_SISO */ {
        stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF);
        if(!(
             MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_LF)!=0 &&
             EBCDIC_LF==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_LF)
        )) {
            return false;
        }

        stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL);
        if(!(
             MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_NL)!=0 &&
             EBCDIC_NL==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_NL)
        )) {
            return false;
        }
    }

    if(mbcsTable->fromUBytesLength>0) {
        /*
         * We _know_ the number of bytes in the fromUnicodeBytes array
         * starting with header.version 4.1.
         */
        sizeofFromUBytes=mbcsTable->fromUBytesLength;
    } else {
        /*
         * Otherwise:
         * There used to be code to enumerate the fromUnicode
         * trie and find the highest entry, but it was removed in ICU 3.2
         * because it was not tested and caused a low code coverage number.
         * See Jitterbug 3674.
         * This affects only some .cnv file formats with a header.version
         * below 4.1, and only when swaplfnl is requested.
         *
         * ucnvmbcs.c revision 1.99 is the last one with the
         * ucnv_MBCSSizeofFromUBytes() function.
         */
        *pErrorCode=U_INVALID_FORMAT_ERROR;
        return false;
    }

    /*
     * The table has an appropriate format.
     * Allocate and build
     * - a modified to-Unicode state table
     * - a modified from-Unicode output array
     * - a converter name string with the swap option appended
     */
    /* 1024 = 256 entries * 4 bytes per state table row */
    size=
        mbcsTable->countStates*1024+
        sizeofFromUBytes+
        UCNV_MAX_CONVERTER_NAME_LENGTH+20;
    p = static_cast<uint8_t*>(uprv_malloc(size));
    if(p==nullptr) {
        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
        return false;
    }

    /* copy and modify the to-Unicode state table */
    newStateTable = reinterpret_cast<int32_t(*)[256]>(p);
    uprv_memcpy(newStateTable, mbcsTable->stateTable, mbcsTable->countStates*1024);

    newStateTable[0][EBCDIC_LF]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL);
    newStateTable[0][EBCDIC_NL]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF);

    /* copy and modify the from-Unicode result table */
    /* the results copy starts directly after the last state table row */
    newResults = reinterpret_cast<uint16_t*>(newStateTable[mbcsTable->countStates]);
    uprv_memcpy(newResults, bytes, sizeofFromUBytes);

    /* conveniently, the table access macros work on the left side of expressions */
    if(mbcsTable->outputType==MBCS_OUTPUT_1) {
        MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_LF)=EBCDIC_RT_NL;
        MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_NL)=EBCDIC_RT_LF;
    } else /* MBCS_OUTPUT_2_SISO */ {
        stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF);
        MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_LF)=EBCDIC_NL;

        stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL);
        MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_NL)=EBCDIC_LF;
    }

    /* set the canonical converter name */
    /* the name string follows the from-Unicode bytes copy in the same allocation */
    name = reinterpret_cast<char*>(newResults) + sizeofFromUBytes;
    uprv_strcpy(name, sharedData->staticData->name);
    uprv_strcat(name, UCNV_SWAP_LFNL_OPTION_STRING);

    /* set the pointers */
    icu::umtx_lock(nullptr);
    if(mbcsTable->swapLFNLStateTable==nullptr) {
        mbcsTable->swapLFNLStateTable=newStateTable;
        mbcsTable->swapLFNLFromUnicodeBytes = reinterpret_cast<uint8_t*>(newResults);
        mbcsTable->swapLFNLName=name;

        /* ownership moved to mbcsTable; nullptr tells the code below not to free it */
        newStateTable=nullptr;
    }
    icu::umtx_unlock(nullptr);

    /* release the allocated memory if another thread beat us to it */
    if(newStateTable!=nullptr) {
        uprv_free(newStateTable);
    }
    return true;
}

/* reconstitute omitted fromUnicode data ------------------------------------ */

/*
 * Callback passed to ucnv_MBCSEnumToUnicode() by reconstituteData().
 * `context` is the UConverterMBCSTable being rebuilt. For each i in 0..31
 * with codePoints[i]>=0, writes the codepage value (value+i, after the EUC
 * adjustment below) into the stage 3 slot for that code point and sets the
 * code point's roundtrip flag in its stage 2 entry. Always returns true.
 *
 * for details, compare with genmbcs.c MBCSAddFromUnicode() and transformEUC()
 */
static UBool U_CALLCONV
writeStage3Roundtrip(const void *context, uint32_t value, UChar32 codePoints[32]) {
    UConverterMBCSTable *mbcsTable=(UConverterMBCSTable *)context;
    const uint16_t *table;
    uint32_t *stage2;
    uint8_t *bytes, *p;
    UChar32 c;
    int32_t i, st3;

    table=mbcsTable->fromUnicodeTable;
    /* writing through the const pointer; reconstituteData() points it into memory it allocated */
    bytes = const_cast<uint8_t*>(mbcsTable->fromUnicodeBytes);

    /* for EUC outputTypes, modify the value like genmbcs.c's transformEUC() */
    switch(mbcsTable->outputType) {
    case MBCS_OUTPUT_3_EUC:
        if(value<=0xffff) {
            /* short sequences are stored directly */
            /* code set 0 or 1 */
        } else if(value<=0x8effff) {
            /* code set 2 */
            value&=0x7fff;
        } else /* first byte is 0x8f */ {
            /* code set 3 */
            value&=0xff7f;
        }
        break;
    case MBCS_OUTPUT_4_EUC:
        if(value<=0xffffff) {
            /* short sequences are stored directly */
            /* code set 0 or 1 */
        } else if(value<=0x8effffff) {
            /* code set 2 */
            value&=0x7fffff;
        } else /* first byte is 0x8f */ {
            /* code set 3 */
            value&=0xff7fff;
        }
        break;
    default:
        break;
    }

    /* value is incremented together with i, including for skipped (negative) code points */
    for(i=0; i<=0x1f; ++value, ++i) {
        c=codePoints[i];
        if(c<0) {
            continue;
        }

        /* locate the stage 2 & 3 data */
        stage2=((uint32_t *)table)+table[c>>10]+((c>>4)&0x3f);
        p=bytes;
        /* stage 3 entry index: 16 entries per block, block index in the low 16 bits of the stage 2 entry */
        st3 = static_cast<int32_t>(static_cast<uint16_t>(*stage2)) * 16 + (c & 0xf);

        /* write the codepage bytes into stage 3 */
        switch(mbcsTable->outputType) {
        case MBCS_OUTPUT_3:
        case MBCS_OUTPUT_4_EUC:
            p+=st3*3;
            p[0] = static_cast<uint8_t>(value >> 16);
            p[1] = static_cast<uint8_t>(value >> 8);
            p[2] = static_cast<uint8_t>(value);
            break;
        case MBCS_OUTPUT_4:
            reinterpret_cast<uint32_t*>(p)[st3] = value;
            break;
        default:
            /* 2 bytes per character */
            reinterpret_cast<uint16_t*>(p)[st3] = static_cast<uint16_t>(value);
            break;
        }

        /* set the roundtrip flag */
        *stage2|=(1UL<<(16+(c&0xf)));
    }
    return true;
}

/*
 * Rebuilds the from-Unicode data that was omitted from the file
 * (called from ucnv_MBCSLoad() when MBCS_OPT_NO_FROM_U is set).
 *
 * Allocates one zero-filled buffer (mbcsTable->reconstitutedData) holding
 * stage 1, the full stage 2, and the from-Unicode bytes; copies the stored
 * stage 1 and the stored part of stage 2 into it; fills the initial part of
 * stage 2 from mbcsIndex; then enumerates the to-Unicode mappings with
 * writeStage3Roundtrip() to fill in stage 3 and the roundtrip flags.
 *
 * Sets U_MEMORY_ALLOCATION_ERROR and returns if the allocation fails.
 */
static void
reconstituteData(UConverterMBCSTable *mbcsTable,
                 uint32_t stage1Length, uint32_t stage2Length,
                 uint32_t fullStage2Length,  /* lengths are numbers of units, not bytes */
                 UErrorCode *pErrorCode) {
    uint16_t *stage1;
    uint32_t *stage2;
    /* stage 1 units are 2 bytes, stage 2 units are 4 bytes */
    uint32_t dataLength=stage1Length*2+fullStage2Length*4+mbcsTable->fromUBytesLength;
    mbcsTable->reconstitutedData = static_cast<uint8_t*>(uprv_malloc(dataLength));
    if(mbcsTable->reconstitutedData==nullptr) {
        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    uprv_memset(mbcsTable->reconstitutedData, 0, dataLength);

    /* copy existing data and reroute the pointers */
    stage1 = reinterpret_cast<uint16_t*>(mbcsTable->reconstitutedData);
    uprv_memcpy(stage1, mbcsTable->fromUnicodeTable, stage1Length*2);

    /* the stored stage 2 data is the tail of the full stage 2; the head is rebuilt below */
    stage2 = reinterpret_cast<uint32_t*>(stage1 + stage1Length);
    uprv_memcpy(stage2+(fullStage2Length-stage2Length),
                mbcsTable->fromUnicodeTable+stage1Length,
                stage2Length*4);

    mbcsTable->fromUnicodeTable=stage1;
    mbcsTable->fromUnicodeBytes = reinterpret_cast<uint8_t*>(stage2 + fullStage2Length);

    /* indexes into stage 2 count from the bottom of the fromUnicodeTable */
    stage2 = reinterpret_cast<uint32_t*>(stage1);

    /* reconstitute the initial part of stage 2 from the mbcsIndex */
    {
        /* one mbcsIndex entry per 64 code points up to maxFastUChar */
        int32_t stageUTF8Length = (static_cast<int32_t>(mbcsTable->maxFastUChar) + 1) >> 6;
        int32_t stageUTF8Index=0;
        int32_t st1, st2, st3, i;

        for(st1=0; stageUTF8Index<stageUTF8Length; ++st1) {
            st2=stage1[st1];
            /* NOTE(review): stage1Length/2 appears to be the index of the shared all-unassigned stage 2 block — confirm against genmbcs */
            if (st2 != static_cast<int32_t>(stage1Length) / 2) {
                /* each stage 2 block has 64 entries corresponding to 16 entries in the mbcsIndex */
                for(i=0; i<16; ++i) {
                    st3=mbcsTable->mbcsIndex[stageUTF8Index++];
                    if(st3!=0) {
                        /* a stage 2 entry's index is per stage 3 16-block, not per stage 3 entry */
                        st3>>=4;
                        /*
                         * 4 stage 2 entries point to 4 consecutive stage 3 16-blocks which are
                         * allocated together as a single 64-block for access from the mbcsIndex
                         */
                        stage2[st2++]=st3++;
                        stage2[st2++]=st3++;
                        stage2[st2++]=st3++;
                        stage2[st2++]=st3;
                    } else {
                        /* no stage 3 block, skip */
                        st2+=4;
                    }
                }
            } else {
                /* no stage 2 block, skip */
                stageUTF8Index+=16;
            }
        }
    }

    /* reconstitute fromUnicodeBytes with roundtrips from toUnicode data */
    ucnv_MBCSEnumToUnicode(mbcsTable, writeStage3Roundtrip, mbcsTable, pErrorCode);
}

/* MBCS setup functions ----------------------------------------------------- */

/*
 * Initializes sharedData->mbcs from the raw converter data at `raw`.
 *
 * Accepts _MBCSHeader version 4.x, and 5.x with minor version >=3 and no
 * unknown incompatible option bits; anything else sets U_INVALID_TABLE_FORMAT.
 *
 * Two paths:
 * - Extension-only file (MBCS_OUTPUT_EXT_ONLY): loads the base converter
 *   named after the header, copies its table, and overrides the
 *   extension-specific fields; may build a modified state table for a
 *   DBCS-only variant.
 * - File with a base table: points the table fields into `raw`, derives
 *   unicodeMask, utf8Friendly data and asciiRoundtrips, and reconstitutes
 *   from-Unicode data when MBCS_OPT_NO_FROM_U is set.
 *
 * When pArgs->onlyTestIsLoadable is set, returns as soon as the format has
 * been validated, without filling in the table pointers.
 */
static void U_CALLCONV
ucnv_MBCSLoad(UConverterSharedData *sharedData,
          UConverterLoadArgs *pArgs,
          const uint8_t *raw,
          UErrorCode *pErrorCode) {
    UDataInfo info;
    UConverterMBCSTable *mbcsTable=&sharedData->mbcs;
    _MBCSHeader *header=(_MBCSHeader *)raw;
    uint32_t offset;
    uint32_t headerLength;
    UBool noFromU=false;

    if(header->version[0]==4) {
        headerLength=MBCS_HEADER_V4_LENGTH;
    } else if(header->version[0]==5 && header->version[1]>=3 &&
              (header->options&MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK)==0) {
        headerLength=header->options&MBCS_OPT_LENGTH_MASK;
        noFromU = static_cast<UBool>((header->options & MBCS_OPT_NO_FROM_U) != 0);
    } else {
        *pErrorCode=U_INVALID_TABLE_FORMAT;
        return;
    }

    /* the low byte of header->flags is the output type; the upper bits are the extension offset */
    mbcsTable->outputType = static_cast<uint8_t>(header->flags);
    if(noFromU && mbcsTable->outputType==MBCS_OUTPUT_1) {
        /* omitted from-Unicode data is not accepted for single-byte output */
        *pErrorCode=U_INVALID_TABLE_FORMAT;
        return;
    }

    /* extension data, header version 4.2 and higher */
    offset=header->flags>>8;
    if(offset!=0) {
        mbcsTable->extIndexes = reinterpret_cast<const int32_t*>(raw + offset);
    }

    if(mbcsTable->outputType==MBCS_OUTPUT_EXT_ONLY) {
        UConverterLoadArgs args=UCNV_LOAD_ARGS_INITIALIZER;
        UConverterSharedData *baseSharedData;
        const int32_t *extIndexes;
        const char *baseName;

        /* extension-only file, load the base table and set values appropriately */
        if((extIndexes=mbcsTable->extIndexes)==nullptr) {
            /* extension-only file without extension */
            *pErrorCode=U_INVALID_TABLE_FORMAT;
            return;
        }

        if(pArgs->nestedLoads!=1) {
            /* an extension table must not be loaded as a base table */
            *pErrorCode=U_INVALID_TABLE_FILE;
            return;
        }

        /* load the base table */
        /* the base converter name is stored right after the header (headerLength counts 4-byte units) */
        baseName = reinterpret_cast<const char*>(header) + headerLength * 4;
        if(0==uprv_strcmp(baseName, sharedData->staticData->name)) {
            /* forbid loading this same extension-only file */
            *pErrorCode=U_INVALID_TABLE_FORMAT;
            return;
        }

        /* TODO parse package name out of the prefix of the base name in the extension .cnv file? */
        args.size=sizeof(UConverterLoadArgs);
        args.nestedLoads=2;
        args.onlyTestIsLoadable=pArgs->onlyTestIsLoadable;
        args.reserved=pArgs->reserved;
        args.options=pArgs->options;
        args.pkg=pArgs->pkg;
        args.name=baseName;
        baseSharedData=ucnv_load(&args, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            return;
        }
        /* the base must be an MBCS converter that is not itself an extension-only table */
        if( baseSharedData->staticData->conversionType!=UCNV_MBCS ||
            baseSharedData->mbcs.baseSharedData!=nullptr
        ) {
            ucnv_unload(baseSharedData);
            *pErrorCode=U_INVALID_TABLE_FORMAT;
            return;
        }
        if(pArgs->onlyTestIsLoadable) {
            /*
             * Exit as soon as we know that we can load the converter
             * and the format is valid and supported.
             * The worst that can happen in the following code is a memory
             * allocation error.
             */
            ucnv_unload(baseSharedData);
            return;
        }

        /* copy the base table data */
        uprv_memcpy(mbcsTable, &baseSharedData->mbcs, sizeof(UConverterMBCSTable));

        /* overwrite values with relevant ones for the extension converter */
        mbcsTable->baseSharedData=baseSharedData;
        mbcsTable->extIndexes=extIndexes;

        /*
         * It would be possible to share the swapLFNL data with a base converter,
         * but the generated name would have to be different, and the memory
         * would have to be freed only once.
         * It is easier to just create the data for the extension converter
         * separately when it is requested.
         */
        mbcsTable->swapLFNLStateTable=nullptr;
        mbcsTable->swapLFNLFromUnicodeBytes=nullptr;
        mbcsTable->swapLFNLName=nullptr;

        /*
         * The reconstitutedData must be deleted only when the base converter
         * is unloaded.
         */
        mbcsTable->reconstitutedData=nullptr;

        /*
         * Set a special, runtime-only outputType if the extension converter
         * is a DBCS version of a base converter that also maps single bytes.
         */
        if( sharedData->staticData->conversionType==UCNV_DBCS ||
                (sharedData->staticData->conversionType==UCNV_MBCS &&
                 sharedData->staticData->minBytesPerChar>=2)
        ) {
            if(baseSharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO) {
                /* the base converter is SI/SO-stateful */
                int32_t entry;

                /* get the dbcs state from the state table entry for SO=0x0e */
                entry=mbcsTable->stateTable[0][0xe];
                if( MBCS_ENTRY_IS_FINAL(entry) &&
                    MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_CHANGE_ONLY &&
                    MBCS_ENTRY_FINAL_STATE(entry)!=0
                ) {
                    mbcsTable->dbcsOnlyState = static_cast<uint8_t>(MBCS_ENTRY_FINAL_STATE(entry));

                    mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY;
                }
            } else if(
                baseSharedData->staticData->conversionType==UCNV_MBCS &&
                baseSharedData->staticData->minBytesPerChar==1 &&
                baseSharedData->staticData->maxBytesPerChar==2 &&
                mbcsTable->countStates<=127
            ) {
                /* non-stateful base converter, need to modify the state table */
                int32_t (*newStateTable)[256];
                int32_t *state;
                int32_t i, count;

                /* allocate a new state table and copy the base state table contents */
                /* one extra 1024-byte row for the all-illegal state appended below */
                count=mbcsTable->countStates;
                newStateTable = static_cast<int32_t(*)[256]>(uprv_malloc((count + 1) * 1024));
                if(newStateTable==nullptr) {
                    ucnv_unload(baseSharedData);
                    *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
                    return;
                }

                uprv_memcpy(newStateTable, mbcsTable->stateTable, count*1024);

                /* change all final single-byte entries to go to a new all-illegal state */
                state=newStateTable[0];
                for(i=0; i<256; ++i) {
                    if(MBCS_ENTRY_IS_FINAL(state[i])) {
                        state[i]=MBCS_ENTRY_TRANSITION(count, 0);
                    }
                }

                /* build the new all-illegal state */
                state=newStateTable[count];
                for(i=0; i<256; ++i) {
                    state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0);
                }
                mbcsTable->stateTable=(const int32_t (*)[256])newStateTable;
                mbcsTable->countStates = static_cast<uint8_t>(count + 1);
                /* stateTableOwned makes ucnv_MBCSUnload() free this table */
                mbcsTable->stateTableOwned=true;

                mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY;
            }
        }

        /*
         * unlike below for files with base tables, do not get the unicodeMask
         * from the sharedData; instead, use the base table's unicodeMask,
         * which we copied in the memcpy above;
         * this is necessary because the static data unicodeMask, especially
         * the UCNV_HAS_SUPPLEMENTARY flag, is part of the base table data
         */
    } else {
        /* conversion file with a base table; an additional extension table is optional */
        /* make sure that the output type is known */
        switch(mbcsTable->outputType) {
        case MBCS_OUTPUT_1:
        case MBCS_OUTPUT_2:
        case MBCS_OUTPUT_3:
        case MBCS_OUTPUT_4:
        case MBCS_OUTPUT_3_EUC:
        case MBCS_OUTPUT_4_EUC:
        case MBCS_OUTPUT_2_SISO:
            /* OK */
            break;
        default:
            *pErrorCode=U_INVALID_TABLE_FORMAT;
            return;
        }
        if(pArgs->onlyTestIsLoadable) {
            /*
             * Exit as soon as we know that we can load the converter
             * and the format is valid and supported.
             * The worst that can happen in the following code is a memory
             * allocation error.
             */
            return;
        }

        /* the state table starts right after the header; the to-Unicode fallbacks follow the state table */
        mbcsTable->countStates = static_cast<uint8_t>(header->countStates);
        mbcsTable->countToUFallbacks=header->countToUFallbacks;
        mbcsTable->stateTable = reinterpret_cast<const int32_t(*)[256]>(raw + headerLength * 4);
        mbcsTable->toUFallbacks = reinterpret_cast<const _MBCSToUFallback*>(mbcsTable->stateTable + header->countStates);
        mbcsTable->unicodeCodeUnits = reinterpret_cast<const uint16_t*>(raw + header->offsetToUCodeUnits);

        mbcsTable->fromUnicodeTable = reinterpret_cast<const uint16_t*>(raw + header->offsetFromUTable);
        mbcsTable->fromUnicodeBytes = raw + header->offsetFromUBytes;
        mbcsTable->fromUBytesLength=header->fromUBytesLength;

        /*
         * converter versions 6.1 and up contain a unicodeMask that is
         * used here to select the most efficient function implementations
         */
        info.size=sizeof(UDataInfo);
        udata_getInfo((UDataMemory *)sharedData->dataMemory, &info);
        if(info.formatVersion[0]>6 || (info.formatVersion[0]==6 && info.formatVersion[1]>=1)) {
            /* mask off possible future extensions to be safe */
            mbcsTable->unicodeMask = static_cast<uint8_t>(sharedData->staticData->unicodeMask & 3);
        } else {
            /* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */
            mbcsTable->unicodeMask=UCNV_HAS_SUPPLEMENTARY|UCNV_HAS_SURROGATES;
        }

        /*
         * _MBCSHeader.version 4.3 adds utf8Friendly data structures.
         * Check for the header version, SBCS vs. MBCS, and for whether the
         * data structures are optimized for code points as high as what the
         * runtime code is designed for.
         * The implementation does not handle mapping tables with entries for
         * unpaired surrogates.
         */
        if( header->version[1]>=3 &&
            (mbcsTable->unicodeMask&UCNV_HAS_SURROGATES)==0 &&
            (mbcsTable->countStates==1 ?
                (header->version[2]>=(SBCS_FAST_MAX>>8)) :
                (header->version[2]>=(MBCS_FAST_MAX>>8))
            )
        ) {
            mbcsTable->utf8Friendly=true;

            if(mbcsTable->countStates==1) {
                /*
                 * SBCS: Stage 3 is allocated in 64-entry blocks for U+0000..SBCS_FAST_MAX or higher.
                 * Build a table with indexes to each block, to be used instead of
                 * the regular stage 1/2 table.
                 */
                int32_t i;
                /* i counts 64-code-point blocks: i>>4 is the stage 1 index, (i<<2)&0x3c the stage 2 offset */
                for(i=0; i<(SBCS_FAST_LIMIT>>6); ++i) {
                    mbcsTable->sbcsIndex[i]=mbcsTable->fromUnicodeTable[mbcsTable->fromUnicodeTable[i>>4]+((i<<2)&0x3c)];
                }
                /* set SBCS_FAST_MAX to reflect the reach of sbcsIndex[] even if header->version[2]>(SBCS_FAST_MAX>>8) */
                mbcsTable->maxFastUChar=SBCS_FAST_MAX;
            } else {
                /*
                 * MBCS: Stage 3 is allocated in 64-entry blocks for U+0000..MBCS_FAST_MAX or higher.
                 * The .cnv file is prebuilt with an additional stage table with indexes
                 * to each block.
                 */
                /* with noFromU the from-Unicode bytes are absent, so the index starts at fromUnicodeBytes itself */
                mbcsTable->mbcsIndex = reinterpret_cast<const uint16_t*>(
                    mbcsTable->fromUnicodeBytes +
                    (noFromU ? 0 : mbcsTable->fromUBytesLength));
                mbcsTable->maxFastUChar = (static_cast<char16_t>(header->version[2]) << 8) | 0xff;
            }
        }

        /* calculate a bit set of 4 ASCII characters per bit that round-trip to ASCII bytes */
        {
            uint32_t asciiRoundtrips=0xffffffff;
            int32_t i;

            /* clear bit i>>2 if byte i does not map directly to code point i in state 0 */
            for(i=0; i<0x80; ++i) {
                if(mbcsTable->stateTable[0][i]!=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, i)) {
                    asciiRoundtrips &= ~(static_cast<uint32_t>(1) << (i >> 2));
                }
            }
            mbcsTable->asciiRoundtrips=asciiRoundtrips;
        }

        if(noFromU) {
            uint32_t stage1Length=
                mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY ?
                    0x440 : 0x40;
            /* stored stage 2 length in 32-bit units: table size in bytes /4, minus stage 1 (16-bit units, hence /2) */
            uint32_t stage2Length=
                (header->offsetFromUBytes-header->offsetFromUTable)/4-
                stage1Length/2;
            reconstituteData(mbcsTable, stage1Length, stage2Length, header->fullStage2Length, pErrorCode);
        }
    }

    /* Set the impl pointer here so that it is set for both extension-only and base tables. */
    if(mbcsTable->utf8Friendly) {
        if(mbcsTable->countStates==1) {
            sharedData->impl=&_SBCSUTF8Impl;
        } else {
            if(mbcsTable->outputType==MBCS_OUTPUT_2) {
                sharedData->impl=&_DBCSUTF8Impl;
            }
        }
    }

    if(mbcsTable->outputType==MBCS_OUTPUT_DBCS_ONLY || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) {
        /*
         * MBCS_OUTPUT_DBCS_ONLY: No SBCS mappings, therefore ASCII does not roundtrip.
         * MBCS_OUTPUT_2_SISO: Bypass the ASCII fastpath to handle prevLength correctly.
         */
        mbcsTable->asciiRoundtrips=0;
    }
}

/*
 * Releases what this table owns: the swapLFNL allocation (its state table
 * pointer is the start of the single block that _EBCDICSwapLFNL() allocates),
 * an owned state table, the reference to the base converter of an
 * extension-only table, and any reconstituted from-Unicode data.
 */
static void U_CALLCONV
ucnv_MBCSUnload(UConverterSharedData *sharedData) {
    UConverterMBCSTable *mbcsTable=&sharedData->mbcs;

    if(mbcsTable->swapLFNLStateTable!=nullptr) {
        uprv_free(mbcsTable->swapLFNLStateTable);
    }
    if(mbcsTable->stateTableOwned) {
        uprv_free((void *)mbcsTable->stateTable);
    }
    if(mbcsTable->baseSharedData!=nullptr) {
        ucnv_unload(mbcsTable->baseSharedData);
    }
    if(mbcsTable->reconstitutedData!=nullptr) {
        uprv_free(mbcsTable->reconstitutedData);
    }
}

static void U_CALLCONV
ucnv_MBCSOpen(UConverter *cnv,
              UConverterLoadArgs *pArgs,
              UErrorCode *pErrorCode) {
    UConverterMBCSTable *mbcsTable;
    const int32_t *extIndexes;
    uint8_t outputType;
    int8_t maxBytesPerUChar;

    if(pArgs->onlyTestIsLoadable) {
        return;
    }

    mbcsTable=&cnv->sharedData->mbcs;
    outputType=mbcsTable->outputType;

    if(outputType==MBCS_OUTPUT_DBCS_ONLY) {
        /* the swaplfnl option does not apply, remove it */
cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL; 1916 } 1917 1918 if((pArgs->options&UCNV_OPTION_SWAP_LFNL)!=0) { 1919 /* do this because double-checked locking is broken */ 1920 UBool isCached; 1921 1922 icu::umtx_lock(nullptr); 1923 isCached=mbcsTable->swapLFNLStateTable!=nullptr; 1924 icu::umtx_unlock(nullptr); 1925 1926 if(!isCached) { 1927 if(!_EBCDICSwapLFNL(cnv->sharedData, pErrorCode)) { 1928 if(U_FAILURE(*pErrorCode)) { 1929 return; /* something went wrong */ 1930 } 1931 1932 /* the option does not apply, remove it */ 1933 cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL; 1934 } 1935 } 1936 } 1937 1938 if(uprv_strstr(pArgs->name, "18030")!=nullptr) { 1939 if(uprv_strstr(pArgs->name, "gb18030")!=nullptr || uprv_strstr(pArgs->name, "GB18030")!=nullptr) { 1940 /* set a flag for GB 18030 mode, which changes the callback behavior */ 1941 cnv->options|=_MBCS_OPTION_GB18030; 1942 } 1943 } else if((uprv_strstr(pArgs->name, "KEIS")!=nullptr) || (uprv_strstr(pArgs->name, "keis")!=nullptr)) { 1944 /* set a flag for KEIS converter, which changes the SI/SO character sequence */ 1945 cnv->options|=_MBCS_OPTION_KEIS; 1946 } else if((uprv_strstr(pArgs->name, "JEF")!=nullptr) || (uprv_strstr(pArgs->name, "jef")!=nullptr)) { 1947 /* set a flag for JEF converter, which changes the SI/SO character sequence */ 1948 cnv->options|=_MBCS_OPTION_JEF; 1949 } else if((uprv_strstr(pArgs->name, "JIPS")!=nullptr) || (uprv_strstr(pArgs->name, "jips")!=nullptr)) { 1950 /* set a flag for JIPS converter, which changes the SI/SO character sequence */ 1951 cnv->options|=_MBCS_OPTION_JIPS; 1952 } 1953 1954 /* fix maxBytesPerUChar depending on outputType and options etc. 
*/ 1955 if(outputType==MBCS_OUTPUT_2_SISO) { 1956 cnv->maxBytesPerUChar=3; /* SO+DBCS */ 1957 } 1958 1959 extIndexes=mbcsTable->extIndexes; 1960 if(extIndexes!=nullptr) { 1961 maxBytesPerUChar = static_cast<int8_t>(UCNV_GET_MAX_BYTES_PER_UCHAR(extIndexes)); 1962 if(outputType==MBCS_OUTPUT_2_SISO) { 1963 ++maxBytesPerUChar; /* SO + multiple DBCS */ 1964 } 1965 1966 if(maxBytesPerUChar>cnv->maxBytesPerUChar) { 1967 cnv->maxBytesPerUChar=maxBytesPerUChar; 1968 } 1969 } 1970 1971 #if 0 1972 /* 1973 * documentation of UConverter fields used for status 1974 * all of these fields are (re)set to 0 by ucnv_bld.c and ucnv_reset() 1975 */ 1976 1977 /* toUnicode */ 1978 cnv->toUnicodeStatus=0; /* offset */ 1979 cnv->mode=0; /* state */ 1980 cnv->toULength=0; /* byteIndex */ 1981 1982 /* fromUnicode */ 1983 cnv->fromUChar32=0; 1984 cnv->fromUnicodeStatus=1; /* prevLength */ 1985 #endif 1986 } 1987 1988 U_CDECL_BEGIN 1989 1990 static const char* U_CALLCONV 1991 ucnv_MBCSGetName(const UConverter *cnv) { 1992 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0 && cnv->sharedData->mbcs.swapLFNLName!=nullptr) { 1993 return cnv->sharedData->mbcs.swapLFNLName; 1994 } else { 1995 return cnv->sharedData->staticData->name; 1996 } 1997 } 1998 U_CDECL_END 1999 2000 2001 /* MBCS-to-Unicode conversion functions ------------------------------------- */ 2002 2003 static UChar32 U_CALLCONV 2004 ucnv_MBCSGetFallback(UConverterMBCSTable *mbcsTable, uint32_t offset) { 2005 const _MBCSToUFallback *toUFallbacks; 2006 uint32_t i, start, limit; 2007 2008 limit=mbcsTable->countToUFallbacks; 2009 if(limit>0) { 2010 /* do a binary search for the fallback mapping */ 2011 toUFallbacks=mbcsTable->toUFallbacks; 2012 start=0; 2013 while(start<limit-1) { 2014 i=(start+limit)/2; 2015 if(offset<toUFallbacks[i].offset) { 2016 limit=i; 2017 } else { 2018 start=i; 2019 } 2020 } 2021 2022 /* did we really find it? 
*/ 2023 if(offset==toUFallbacks[start].offset) { 2024 return toUFallbacks[start].codePoint; 2025 } 2026 } 2027 2028 return 0xfffe; 2029 } 2030 2031 /* This version of ucnv_MBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages. */ 2032 static void 2033 ucnv_MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 2034 UErrorCode *pErrorCode) { 2035 UConverter *cnv; 2036 const uint8_t *source, *sourceLimit; 2037 char16_t *target; 2038 const char16_t *targetLimit; 2039 int32_t *offsets; 2040 2041 const int32_t (*stateTable)[256]; 2042 2043 int32_t sourceIndex; 2044 2045 int32_t entry; 2046 char16_t c; 2047 uint8_t action; 2048 2049 /* set up the local pointers */ 2050 cnv=pArgs->converter; 2051 source = reinterpret_cast<const uint8_t*>(pArgs->source); 2052 sourceLimit = reinterpret_cast<const uint8_t*>(pArgs->sourceLimit); 2053 target=pArgs->target; 2054 targetLimit=pArgs->targetLimit; 2055 offsets=pArgs->offsets; 2056 2057 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 2058 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable; 2059 } else { 2060 stateTable=cnv->sharedData->mbcs.stateTable; 2061 } 2062 2063 /* sourceIndex=-1 if the current character began in the previous buffer */ 2064 sourceIndex=0; 2065 2066 /* conversion loop */ 2067 while(source<sourceLimit) { 2068 /* 2069 * This following test is to see if available input would overflow the output. 2070 * It does not catch output of more than one code unit that 2071 * overflows as a result of a surrogate pair or callback output 2072 * from the last source byte. 2073 * Therefore, those situations also test for overflows and will 2074 * then break the loop, too. 
2075 */ 2076 if(target>=targetLimit) { 2077 /* target is full */ 2078 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 2079 break; 2080 } 2081 2082 entry=stateTable[0][*source++]; 2083 /* MBCS_ENTRY_IS_FINAL(entry) */ 2084 2085 /* test the most common case first */ 2086 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 2087 /* output BMP code point */ 2088 *target++ = static_cast<char16_t>(MBCS_ENTRY_FINAL_VALUE_16(entry)); 2089 if(offsets!=nullptr) { 2090 *offsets++=sourceIndex; 2091 } 2092 2093 /* normal end of action codes: prepare for a new character */ 2094 ++sourceIndex; 2095 continue; 2096 } 2097 2098 /* 2099 * An if-else-if chain provides more reliable performance for 2100 * the most common cases compared to a switch. 2101 */ 2102 action = static_cast<uint8_t>(MBCS_ENTRY_FINAL_ACTION(entry)); 2103 if(action==MBCS_STATE_VALID_DIRECT_20 || 2104 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv)) 2105 ) { 2106 entry=MBCS_ENTRY_FINAL_VALUE(entry); 2107 /* output surrogate pair */ 2108 *target++ = static_cast<char16_t>(0xd800 | static_cast<char16_t>(entry >> 10)); 2109 if(offsets!=nullptr) { 2110 *offsets++=sourceIndex; 2111 } 2112 c = static_cast<char16_t>(0xdc00 | static_cast<char16_t>(entry & 0x3ff)); 2113 if(target<targetLimit) { 2114 *target++=c; 2115 if(offsets!=nullptr) { 2116 *offsets++=sourceIndex; 2117 } 2118 } else { 2119 /* target overflow */ 2120 cnv->UCharErrorBuffer[0]=c; 2121 cnv->UCharErrorBufferLength=1; 2122 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 2123 break; 2124 } 2125 2126 ++sourceIndex; 2127 continue; 2128 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 2129 if(UCNV_TO_U_USE_FALLBACK(cnv)) { 2130 /* output BMP code point */ 2131 *target++ = static_cast<char16_t>(MBCS_ENTRY_FINAL_VALUE_16(entry)); 2132 if(offsets!=nullptr) { 2133 *offsets++=sourceIndex; 2134 } 2135 2136 ++sourceIndex; 2137 continue; 2138 } 2139 } else if(action==MBCS_STATE_UNASSIGNED) { 2140 /* just fall through */ 2141 } else if(action==MBCS_STATE_ILLEGAL) { 2142 /* 
callback(illegal) */ 2143 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2144 } else { 2145 /* reserved, must never occur */ 2146 ++sourceIndex; 2147 continue; 2148 } 2149 2150 if(U_FAILURE(*pErrorCode)) { 2151 /* callback(illegal) */ 2152 break; 2153 } else /* unassigned sequences indicated with byteIndex>0 */ { 2154 /* try an extension mapping */ 2155 pArgs->source = reinterpret_cast<const char*>(source); 2156 cnv->toUBytes[0]=*(source-1); 2157 cnv->toULength=_extToU(cnv, cnv->sharedData, 2158 1, &source, sourceLimit, 2159 &target, targetLimit, 2160 &offsets, sourceIndex, 2161 pArgs->flush, 2162 pErrorCode); 2163 sourceIndex += 1 + static_cast<int32_t>(source - reinterpret_cast<const uint8_t*>(pArgs->source)); 2164 2165 if(U_FAILURE(*pErrorCode)) { 2166 /* not mappable or buffer overflow */ 2167 break; 2168 } 2169 } 2170 } 2171 2172 /* write back the updated pointers */ 2173 pArgs->source = reinterpret_cast<const char*>(source); 2174 pArgs->target=target; 2175 pArgs->offsets=offsets; 2176 } 2177 2178 /* 2179 * This version of ucnv_MBCSSingleToUnicodeWithOffsets() is optimized for single-byte, single-state codepages 2180 * that only map to and from the BMP. 2181 * In addition to single-byte optimizations, the offset calculations 2182 * become much easier. 
2183 */ 2184 static void 2185 ucnv_MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs *pArgs, 2186 UErrorCode *pErrorCode) { 2187 UConverter *cnv; 2188 const uint8_t *source, *sourceLimit, *lastSource; 2189 char16_t *target; 2190 int32_t targetCapacity, length; 2191 int32_t *offsets; 2192 2193 const int32_t (*stateTable)[256]; 2194 2195 int32_t sourceIndex; 2196 2197 int32_t entry; 2198 uint8_t action; 2199 2200 /* set up the local pointers */ 2201 cnv=pArgs->converter; 2202 source = reinterpret_cast<const uint8_t*>(pArgs->source); 2203 sourceLimit = reinterpret_cast<const uint8_t*>(pArgs->sourceLimit); 2204 target=pArgs->target; 2205 targetCapacity = static_cast<int32_t>(pArgs->targetLimit - pArgs->target); 2206 offsets=pArgs->offsets; 2207 2208 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 2209 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable; 2210 } else { 2211 stateTable=cnv->sharedData->mbcs.stateTable; 2212 } 2213 2214 /* sourceIndex=-1 if the current character began in the previous buffer */ 2215 sourceIndex=0; 2216 lastSource=source; 2217 2218 /* 2219 * since the conversion here is 1:1 char16_t:uint8_t, we need only one counter 2220 * for the minimum of the sourceLength and targetCapacity 2221 */ 2222 length = static_cast<int32_t>(sourceLimit - source); 2223 if(length<targetCapacity) { 2224 targetCapacity=length; 2225 } 2226 2227 #if MBCS_UNROLL_SINGLE_TO_BMP 2228 /* unrolling makes it faster on Pentium III/Windows 2000 */ 2229 /* unroll the loop with the most common case */ 2230 unrolled: 2231 if(targetCapacity>=16) { 2232 int32_t count, loops, oredEntries; 2233 2234 loops=count=targetCapacity>>4; 2235 do { 2236 oredEntries=entry=stateTable[0][*source++]; 2237 *target++ = static_cast<char16_t>(MBCS_ENTRY_FINAL_VALUE_16(entry)); 2238 oredEntries|=entry=stateTable[0][*source++]; 2239 *target++ = static_cast<char16_t>(MBCS_ENTRY_FINAL_VALUE_16(entry)); 2240 oredEntries|=entry=stateTable[0][*source++]; 2241 *target++ = 
static_cast<char16_t>(MBCS_ENTRY_FINAL_VALUE_16(entry)); 2242 oredEntries|=entry=stateTable[0][*source++]; 2243 *target++ = static_cast<char16_t>(MBCS_ENTRY_FINAL_VALUE_16(entry)); 2244 oredEntries|=entry=stateTable[0][*source++]; 2245 *target++ = static_cast<char16_t>(MBCS_ENTRY_FINAL_VALUE_16(entry)); 2246 oredEntries|=entry=stateTable[0][*source++]; 2247 *target++ = static_cast<char16_t>(MBCS_ENTRY_FINAL_VALUE_16(entry)); 2248 oredEntries|=entry=stateTable[0][*source++]; 2249 *target++ = static_cast<char16_t>(MBCS_ENTRY_FINAL_VALUE_16(entry)); 2250 oredEntries|=entry=stateTable[0][*source++]; 2251 *target++ = static_cast<char16_t>(MBCS_ENTRY_FINAL_VALUE_16(entry)); 2252 oredEntries|=entry=stateTable[0][*source++]; 2253 *target++ = static_cast<char16_t>(MBCS_ENTRY_FINAL_VALUE_16(entry)); 2254 oredEntries|=entry=stateTable[0][*source++]; 2255 *target++ = static_cast<char16_t>(MBCS_ENTRY_FINAL_VALUE_16(entry)); 2256 oredEntries|=entry=stateTable[0][*source++]; 2257 *target++ = static_cast<char16_t>(MBCS_ENTRY_FINAL_VALUE_16(entry)); 2258 oredEntries|=entry=stateTable[0][*source++]; 2259 *target++ = static_cast<char16_t>(MBCS_ENTRY_FINAL_VALUE_16(entry)); 2260 oredEntries|=entry=stateTable[0][*source++]; 2261 *target++ = static_cast<char16_t>(MBCS_ENTRY_FINAL_VALUE_16(entry)); 2262 oredEntries|=entry=stateTable[0][*source++]; 2263 *target++ = static_cast<char16_t>(MBCS_ENTRY_FINAL_VALUE_16(entry)); 2264 oredEntries|=entry=stateTable[0][*source++]; 2265 *target++ = static_cast<char16_t>(MBCS_ENTRY_FINAL_VALUE_16(entry)); 2266 oredEntries|=entry=stateTable[0][*source++]; 2267 *target++ = static_cast<char16_t>(MBCS_ENTRY_FINAL_VALUE_16(entry)); 2268 2269 /* were all 16 entries really valid? 
*/ 2270 if(!MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(oredEntries)) { 2271 /* no, return to the first of these 16 */ 2272 source-=16; 2273 target-=16; 2274 break; 2275 } 2276 } while(--count>0); 2277 count=loops-count; 2278 targetCapacity-=16*count; 2279 2280 if(offsets!=nullptr) { 2281 lastSource+=16*count; 2282 while(count>0) { 2283 *offsets++=sourceIndex++; 2284 *offsets++=sourceIndex++; 2285 *offsets++=sourceIndex++; 2286 *offsets++=sourceIndex++; 2287 *offsets++=sourceIndex++; 2288 *offsets++=sourceIndex++; 2289 *offsets++=sourceIndex++; 2290 *offsets++=sourceIndex++; 2291 *offsets++=sourceIndex++; 2292 *offsets++=sourceIndex++; 2293 *offsets++=sourceIndex++; 2294 *offsets++=sourceIndex++; 2295 *offsets++=sourceIndex++; 2296 *offsets++=sourceIndex++; 2297 *offsets++=sourceIndex++; 2298 *offsets++=sourceIndex++; 2299 --count; 2300 } 2301 } 2302 } 2303 #endif 2304 2305 /* conversion loop */ 2306 while(targetCapacity > 0 && source < sourceLimit) { 2307 entry=stateTable[0][*source++]; 2308 /* MBCS_ENTRY_IS_FINAL(entry) */ 2309 2310 /* test the most common case first */ 2311 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 2312 /* output BMP code point */ 2313 *target++ = static_cast<char16_t>(MBCS_ENTRY_FINAL_VALUE_16(entry)); 2314 --targetCapacity; 2315 continue; 2316 } 2317 2318 /* 2319 * An if-else-if chain provides more reliable performance for 2320 * the most common cases compared to a switch. 
2321 */ 2322 action = static_cast<uint8_t>(MBCS_ENTRY_FINAL_ACTION(entry)); 2323 if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 2324 if(UCNV_TO_U_USE_FALLBACK(cnv)) { 2325 /* output BMP code point */ 2326 *target++ = static_cast<char16_t>(MBCS_ENTRY_FINAL_VALUE_16(entry)); 2327 --targetCapacity; 2328 continue; 2329 } 2330 } else if(action==MBCS_STATE_UNASSIGNED) { 2331 /* just fall through */ 2332 } else if(action==MBCS_STATE_ILLEGAL) { 2333 /* callback(illegal) */ 2334 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2335 } else { 2336 /* reserved, must never occur */ 2337 continue; 2338 } 2339 2340 /* set offsets since the start or the last extension */ 2341 if(offsets!=nullptr) { 2342 int32_t count = static_cast<int32_t>(source - lastSource); 2343 2344 /* predecrement: do not set the offset for the callback-causing character */ 2345 while(--count>0) { 2346 *offsets++=sourceIndex++; 2347 } 2348 /* offset and sourceIndex are now set for the current character */ 2349 } 2350 2351 if(U_FAILURE(*pErrorCode)) { 2352 /* callback(illegal) */ 2353 break; 2354 } else /* unassigned sequences indicated with byteIndex>0 */ { 2355 /* try an extension mapping */ 2356 lastSource=source; 2357 cnv->toUBytes[0]=*(source-1); 2358 cnv->toULength=_extToU(cnv, cnv->sharedData, 2359 1, &source, sourceLimit, 2360 &target, pArgs->targetLimit, 2361 &offsets, sourceIndex, 2362 pArgs->flush, 2363 pErrorCode); 2364 sourceIndex += 1 + static_cast<int32_t>(source - lastSource); 2365 2366 if(U_FAILURE(*pErrorCode)) { 2367 /* not mappable or buffer overflow */ 2368 break; 2369 } 2370 2371 /* recalculate the targetCapacity after an extension mapping */ 2372 targetCapacity = static_cast<int32_t>(pArgs->targetLimit - target); 2373 length = static_cast<int32_t>(sourceLimit - source); 2374 if(length<targetCapacity) { 2375 targetCapacity=length; 2376 } 2377 } 2378 2379 #if MBCS_UNROLL_SINGLE_TO_BMP 2380 /* unrolling makes it faster on Pentium III/Windows 2000 */ 2381 goto unrolled; 2382 #endif 2383 } 2384 2385 
if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=pArgs->targetLimit) { 2386 /* target is full */ 2387 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 2388 } 2389 2390 /* set offsets since the start or the last callback */ 2391 if(offsets!=nullptr) { 2392 size_t count=source-lastSource; 2393 while(count>0) { 2394 *offsets++=sourceIndex++; 2395 --count; 2396 } 2397 } 2398 2399 /* write back the updated pointers */ 2400 pArgs->source = reinterpret_cast<const char*>(source); 2401 pArgs->target=target; 2402 pArgs->offsets=offsets; 2403 } 2404 2405 static UBool 2406 hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) { 2407 const int32_t *row=stateTable[state]; 2408 int32_t b, entry; 2409 /* First test for final entries in this state for some commonly valid byte values. */ 2410 entry=row[0xa1]; 2411 if( !MBCS_ENTRY_IS_TRANSITION(entry) && 2412 MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL 2413 ) { 2414 return true; 2415 } 2416 entry=row[0x41]; 2417 if( !MBCS_ENTRY_IS_TRANSITION(entry) && 2418 MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL 2419 ) { 2420 return true; 2421 } 2422 /* Then test for final entries in this state. */ 2423 for(b=0; b<=0xff; ++b) { 2424 entry=row[b]; 2425 if( !MBCS_ENTRY_IS_TRANSITION(entry) && 2426 MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL 2427 ) { 2428 return true; 2429 } 2430 } 2431 /* Then recurse for transition entries. */ 2432 for(b=0; b<=0xff; ++b) { 2433 entry=row[b]; 2434 if( MBCS_ENTRY_IS_TRANSITION(entry) && 2435 hasValidTrailBytes(stateTable, static_cast<uint8_t>(MBCS_ENTRY_TRANSITION_STATE(entry))) 2436 ) { 2437 return true; 2438 } 2439 } 2440 return false; 2441 } 2442 2443 /* 2444 * Is byte b a single/lead byte in this state? 2445 * Recurse for transition states, because here we don't want to say that 2446 * b is a lead byte if all byte sequences that start with b are illegal. 
2447 */ 2448 static UBool 2449 isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnly, uint8_t b) { 2450 const int32_t *row=stateTable[state]; 2451 int32_t entry=row[b]; 2452 if(MBCS_ENTRY_IS_TRANSITION(entry)) { /* lead byte */ 2453 return hasValidTrailBytes(stateTable, static_cast<uint8_t>(MBCS_ENTRY_TRANSITION_STATE(entry))); 2454 } else { 2455 uint8_t action = static_cast<uint8_t>(MBCS_ENTRY_FINAL_ACTION(entry)); 2456 if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) { 2457 return false; /* SI/SO are illegal for DBCS-only conversion */ 2458 } else { 2459 return action!=MBCS_STATE_ILLEGAL; 2460 } 2461 } 2462 } 2463 2464 U_CFUNC void 2465 ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 2466 UErrorCode *pErrorCode) { 2467 UConverter *cnv; 2468 const uint8_t *source, *sourceLimit; 2469 char16_t *target; 2470 const char16_t *targetLimit; 2471 int32_t *offsets; 2472 2473 const int32_t (*stateTable)[256]; 2474 const uint16_t *unicodeCodeUnits; 2475 2476 uint32_t offset; 2477 uint8_t state; 2478 int8_t byteIndex; 2479 uint8_t *bytes; 2480 2481 int32_t sourceIndex, nextSourceIndex; 2482 2483 int32_t entry; 2484 char16_t c; 2485 uint8_t action; 2486 2487 /* use optimized function if possible */ 2488 cnv=pArgs->converter; 2489 2490 if(cnv->preToULength>0) { 2491 /* 2492 * pass sourceIndex=-1 because we continue from an earlier buffer 2493 * in the future, this may change with continuous offsets 2494 */ 2495 ucnv_extContinueMatchToU(cnv, pArgs, -1, pErrorCode); 2496 2497 if(U_FAILURE(*pErrorCode) || cnv->preToULength<0) { 2498 return; 2499 } 2500 } 2501 2502 if(cnv->sharedData->mbcs.countStates==1) { 2503 if(!(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 2504 ucnv_MBCSSingleToBMPWithOffsets(pArgs, pErrorCode); 2505 } else { 2506 ucnv_MBCSSingleToUnicodeWithOffsets(pArgs, pErrorCode); 2507 } 2508 return; 2509 } 2510 2511 /* set up the local pointers */ 2512 source=(const uint8_t *)pArgs->source; 2513 sourceLimit=(const 
uint8_t *)pArgs->sourceLimit; 2514 target=pArgs->target; 2515 targetLimit=pArgs->targetLimit; 2516 offsets=pArgs->offsets; 2517 2518 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 2519 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable; 2520 } else { 2521 stateTable=cnv->sharedData->mbcs.stateTable; 2522 } 2523 unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits; 2524 2525 /* get the converter state from UConverter */ 2526 offset=cnv->toUnicodeStatus; 2527 byteIndex=cnv->toULength; 2528 bytes=cnv->toUBytes; 2529 2530 /* 2531 * if we are in the SBCS state for a DBCS-only converter, 2532 * then load the DBCS state from the MBCS data 2533 * (dbcsOnlyState==0 if it is not a DBCS-only converter) 2534 */ 2535 if((state=(uint8_t)(cnv->mode))==0) { 2536 state=cnv->sharedData->mbcs.dbcsOnlyState; 2537 } 2538 2539 /* sourceIndex=-1 if the current character began in the previous buffer */ 2540 sourceIndex=byteIndex==0 ? 0 : -1; 2541 nextSourceIndex=0; 2542 2543 /* conversion loop */ 2544 while(source<sourceLimit) { 2545 /* 2546 * This following test is to see if available input would overflow the output. 2547 * It does not catch output of more than one code unit that 2548 * overflows as a result of a surrogate pair or callback output 2549 * from the last source byte. 2550 * Therefore, those situations also test for overflows and will 2551 * then break the loop, too. 
2552 */ 2553 if(target>=targetLimit) { 2554 /* target is full */ 2555 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 2556 break; 2557 } 2558 2559 if(byteIndex==0) { 2560 /* optimized loop for 1/2-byte input and BMP output */ 2561 if(offsets==nullptr) { 2562 do { 2563 entry=stateTable[state][*source]; 2564 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 2565 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 2566 offset=MBCS_ENTRY_TRANSITION_OFFSET(entry); 2567 2568 ++source; 2569 if( source<sourceLimit && 2570 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) && 2571 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 && 2572 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe 2573 ) { 2574 ++source; 2575 *target++=c; 2576 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 2577 offset=0; 2578 } else { 2579 /* set the state and leave the optimized loop */ 2580 bytes[0]=*(source-1); 2581 byteIndex=1; 2582 break; 2583 } 2584 } else { 2585 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 2586 /* output BMP code point */ 2587 ++source; 2588 *target++=(char16_t)MBCS_ENTRY_FINAL_VALUE_16(entry); 2589 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 2590 } else { 2591 /* leave the optimized loop */ 2592 break; 2593 } 2594 } 2595 } while(source<sourceLimit && target<targetLimit); 2596 } else /* offsets!=nullptr */ { 2597 do { 2598 entry=stateTable[state][*source]; 2599 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 2600 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 2601 offset=MBCS_ENTRY_TRANSITION_OFFSET(entry); 2602 2603 ++source; 2604 if( source<sourceLimit && 2605 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) && 2606 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 && 2607 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe 2608 ) { 2609 ++source; 2610 *target++=c; 2611 if(offsets!=nullptr) { 2612 *offsets++=sourceIndex; 2613 sourceIndex=(nextSourceIndex+=2); 2614 } 2615 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); 
/* typically 0 */ 2616 offset=0; 2617 } else { 2618 /* set the state and leave the optimized loop */ 2619 ++nextSourceIndex; 2620 bytes[0]=*(source-1); 2621 byteIndex=1; 2622 break; 2623 } 2624 } else { 2625 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 2626 /* output BMP code point */ 2627 ++source; 2628 *target++=(char16_t)MBCS_ENTRY_FINAL_VALUE_16(entry); 2629 if(offsets!=nullptr) { 2630 *offsets++=sourceIndex; 2631 sourceIndex=++nextSourceIndex; 2632 } 2633 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 2634 } else { 2635 /* leave the optimized loop */ 2636 break; 2637 } 2638 } 2639 } while(source<sourceLimit && target<targetLimit); 2640 } 2641 2642 /* 2643 * these tests and break statements could be put inside the loop 2644 * if C had "break outerLoop" like Java 2645 */ 2646 if(source>=sourceLimit) { 2647 break; 2648 } 2649 if(target>=targetLimit) { 2650 /* target is full */ 2651 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 2652 break; 2653 } 2654 2655 ++nextSourceIndex; 2656 bytes[byteIndex++]=*source++; 2657 } else /* byteIndex>0 */ { 2658 ++nextSourceIndex; 2659 entry=stateTable[state][bytes[byteIndex++]=*source++]; 2660 } 2661 2662 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 2663 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 2664 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry); 2665 continue; 2666 } 2667 2668 /* save the previous state for proper extension mapping with SI/SO-stateful converters */ 2669 cnv->mode=state; 2670 2671 /* set the next state early so that we can reuse the entry variable */ 2672 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 2673 2674 /* 2675 * An if-else-if chain provides more reliable performance for 2676 * the most common cases compared to a switch. 
2677 */ 2678 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 2679 if(action==MBCS_STATE_VALID_16) { 2680 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 2681 c=unicodeCodeUnits[offset]; 2682 if(c<0xfffe) { 2683 /* output BMP code point */ 2684 *target++=c; 2685 if(offsets!=nullptr) { 2686 *offsets++=sourceIndex; 2687 } 2688 byteIndex=0; 2689 } else if(c==0xfffe) { 2690 if(UCNV_TO_U_USE_FALLBACK(cnv) && (entry=(int32_t)ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) { 2691 /* output fallback BMP code point */ 2692 *target++=(char16_t)entry; 2693 if(offsets!=nullptr) { 2694 *offsets++=sourceIndex; 2695 } 2696 byteIndex=0; 2697 } 2698 } else { 2699 /* callback(illegal) */ 2700 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2701 } 2702 } else if(action==MBCS_STATE_VALID_DIRECT_16) { 2703 /* output BMP code point */ 2704 *target++=(char16_t)MBCS_ENTRY_FINAL_VALUE_16(entry); 2705 if(offsets!=nullptr) { 2706 *offsets++=sourceIndex; 2707 } 2708 byteIndex=0; 2709 } else if(action==MBCS_STATE_VALID_16_PAIR) { 2710 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 2711 c=unicodeCodeUnits[offset++]; 2712 if(c<0xd800) { 2713 /* output BMP code point below 0xd800 */ 2714 *target++=c; 2715 if(offsets!=nullptr) { 2716 *offsets++=sourceIndex; 2717 } 2718 byteIndex=0; 2719 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) { 2720 /* output roundtrip or fallback surrogate pair */ 2721 *target++=(char16_t)(c&0xdbff); 2722 if(offsets!=nullptr) { 2723 *offsets++=sourceIndex; 2724 } 2725 byteIndex=0; 2726 if(target<targetLimit) { 2727 *target++=unicodeCodeUnits[offset]; 2728 if(offsets!=nullptr) { 2729 *offsets++=sourceIndex; 2730 } 2731 } else { 2732 /* target overflow */ 2733 cnv->UCharErrorBuffer[0]=unicodeCodeUnits[offset]; 2734 cnv->UCharErrorBufferLength=1; 2735 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 2736 2737 offset=0; 2738 break; 2739 } 2740 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? 
(c&0xfffe)==0xe000 : c==0xe000) { 2741 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 2742 *target++=unicodeCodeUnits[offset]; 2743 if(offsets!=nullptr) { 2744 *offsets++=sourceIndex; 2745 } 2746 byteIndex=0; 2747 } else if(c==0xffff) { 2748 /* callback(illegal) */ 2749 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2750 } 2751 } else if(action==MBCS_STATE_VALID_DIRECT_20 || 2752 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv)) 2753 ) { 2754 entry=MBCS_ENTRY_FINAL_VALUE(entry); 2755 /* output surrogate pair */ 2756 *target++=(char16_t)(0xd800|(char16_t)(entry>>10)); 2757 if(offsets!=nullptr) { 2758 *offsets++=sourceIndex; 2759 } 2760 byteIndex=0; 2761 c=(char16_t)(0xdc00|(char16_t)(entry&0x3ff)); 2762 if(target<targetLimit) { 2763 *target++=c; 2764 if(offsets!=nullptr) { 2765 *offsets++=sourceIndex; 2766 } 2767 } else { 2768 /* target overflow */ 2769 cnv->UCharErrorBuffer[0]=c; 2770 cnv->UCharErrorBufferLength=1; 2771 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 2772 2773 offset=0; 2774 break; 2775 } 2776 } else if(action==MBCS_STATE_CHANGE_ONLY) { 2777 /* 2778 * This serves as a state change without any output. 2779 * It is useful for reading simple stateful encodings, 2780 * for example using just Shift-In/Shift-Out codes. 2781 * The 21 unused bits may later be used for more sophisticated 2782 * state transitions. 
2783 */ 2784 if(cnv->sharedData->mbcs.dbcsOnlyState==0) { 2785 byteIndex=0; 2786 } else { 2787 /* SI/SO are illegal for DBCS-only conversion */ 2788 state=(uint8_t)(cnv->mode); /* restore the previous state */ 2789 2790 /* callback(illegal) */ 2791 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2792 } 2793 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 2794 if(UCNV_TO_U_USE_FALLBACK(cnv)) { 2795 /* output BMP code point */ 2796 *target++=(char16_t)MBCS_ENTRY_FINAL_VALUE_16(entry); 2797 if(offsets!=nullptr) { 2798 *offsets++=sourceIndex; 2799 } 2800 byteIndex=0; 2801 } 2802 } else if(action==MBCS_STATE_UNASSIGNED) { 2803 /* just fall through */ 2804 } else if(action==MBCS_STATE_ILLEGAL) { 2805 /* callback(illegal) */ 2806 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2807 } else { 2808 /* reserved, must never occur */ 2809 byteIndex=0; 2810 } 2811 2812 /* end of action codes: prepare for a new character */ 2813 offset=0; 2814 2815 if(byteIndex==0) { 2816 sourceIndex=nextSourceIndex; 2817 } else if(U_FAILURE(*pErrorCode)) { 2818 /* callback(illegal) */ 2819 if(byteIndex>1) { 2820 /* 2821 * Ticket 5691: consistent illegal sequences: 2822 * - We include at least the first byte in the illegal sequence. 2823 * - If any of the non-initial bytes could be the start of a character, 2824 * we stop the illegal sequence before the first one of those. 2825 */ 2826 UBool isDBCSOnly = cnv->sharedData->mbcs.dbcsOnlyState != 0; 2827 int8_t i; 2828 for(i=1; 2829 i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, bytes[i]); 2830 ++i) {} 2831 if(i<byteIndex) { 2832 /* Back out some bytes. */ 2833 int8_t backOutDistance=byteIndex-i; 2834 int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source); 2835 byteIndex=i; /* length of reported illegal byte sequence */ 2836 if(backOutDistance<=bytesFromThisBuffer) { 2837 source-=backOutDistance; 2838 } else { 2839 /* Back out bytes from the previous buffer: Need to replay them. 
*/ 2840 cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance); 2841 /* preToULength is negative! */ 2842 uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength); 2843 source=(const uint8_t *)pArgs->source; 2844 } 2845 } 2846 } 2847 break; 2848 } else /* unassigned sequences indicated with byteIndex>0 */ { 2849 /* try an extension mapping */ 2850 pArgs->source=(const char *)source; 2851 byteIndex=_extToU(cnv, cnv->sharedData, 2852 byteIndex, &source, sourceLimit, 2853 &target, targetLimit, 2854 &offsets, sourceIndex, 2855 pArgs->flush, 2856 pErrorCode); 2857 sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArgs->source); 2858 2859 if(U_FAILURE(*pErrorCode)) { 2860 /* not mappable or buffer overflow */ 2861 break; 2862 } 2863 } 2864 } 2865 2866 /* set the converter state back into UConverter */ 2867 cnv->toUnicodeStatus=offset; 2868 cnv->mode=state; 2869 cnv->toULength=byteIndex; 2870 2871 /* write back the updated pointers */ 2872 pArgs->source=(const char *)source; 2873 pArgs->target=target; 2874 pArgs->offsets=offsets; 2875 } 2876 2877 /* 2878 * This version of ucnv_MBCSGetNextUChar() is optimized for single-byte, single-state codepages. 2879 * We still need a conversion loop in case we find reserved action codes, which are to be ignored. 
2880 */ 2881 static UChar32 2882 ucnv_MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs, 2883 UErrorCode *pErrorCode) { 2884 UConverter *cnv; 2885 const int32_t (*stateTable)[256]; 2886 const uint8_t *source, *sourceLimit; 2887 2888 int32_t entry; 2889 uint8_t action; 2890 2891 /* set up the local pointers */ 2892 cnv=pArgs->converter; 2893 source = reinterpret_cast<const uint8_t*>(pArgs->source); 2894 sourceLimit = reinterpret_cast<const uint8_t*>(pArgs->sourceLimit); 2895 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 2896 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable; 2897 } else { 2898 stateTable=cnv->sharedData->mbcs.stateTable; 2899 } 2900 2901 /* conversion loop */ 2902 while(source<sourceLimit) { 2903 entry=stateTable[0][*source++]; 2904 /* MBCS_ENTRY_IS_FINAL(entry) */ 2905 2906 /* write back the updated pointer early so that we can return directly */ 2907 pArgs->source = reinterpret_cast<const char*>(source); 2908 2909 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 2910 /* output BMP code point */ 2911 return static_cast<char16_t>(MBCS_ENTRY_FINAL_VALUE_16(entry)); 2912 } 2913 2914 /* 2915 * An if-else-if chain provides more reliable performance for 2916 * the most common cases compared to a switch. 
2917 */ 2918 action = static_cast<uint8_t>(MBCS_ENTRY_FINAL_ACTION(entry)); 2919 if( action==MBCS_STATE_VALID_DIRECT_20 || 2920 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv)) 2921 ) { 2922 /* output supplementary code point */ 2923 return static_cast<UChar32>(MBCS_ENTRY_FINAL_VALUE(entry) + 0x10000); 2924 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 2925 if(UCNV_TO_U_USE_FALLBACK(cnv)) { 2926 /* output BMP code point */ 2927 return static_cast<char16_t>(MBCS_ENTRY_FINAL_VALUE_16(entry)); 2928 } 2929 } else if(action==MBCS_STATE_UNASSIGNED) { 2930 /* just fall through */ 2931 } else if(action==MBCS_STATE_ILLEGAL) { 2932 /* callback(illegal) */ 2933 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2934 } else { 2935 /* reserved, must never occur */ 2936 continue; 2937 } 2938 2939 if(U_FAILURE(*pErrorCode)) { 2940 /* callback(illegal) */ 2941 break; 2942 } else /* unassigned sequence */ { 2943 /* defer to the generic implementation */ 2944 pArgs->source = reinterpret_cast<const char*>(source) - 1; 2945 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 2946 } 2947 } 2948 2949 /* no output because of empty input or only state changes */ 2950 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 2951 return 0xffff; 2952 } 2953 2954 /* 2955 * Version of _MBCSToUnicodeWithOffsets() optimized for single-character 2956 * conversion without offset handling. 2957 * 2958 * When a character does not have a mapping to Unicode, then we return to the 2959 * generic ucnv_getNextUChar() code for extension/GB 18030 and error/callback 2960 * handling. 2961 * We also defer to the generic code in other complicated cases and have them 2962 * ultimately handled by _MBCSToUnicodeWithOffsets() itself. 2963 * 2964 * All normal mappings and errors are handled here. 
2965 */ 2966 static UChar32 U_CALLCONV 2967 ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs, 2968 UErrorCode *pErrorCode) { 2969 UConverter *cnv; 2970 const uint8_t *source, *sourceLimit, *lastSource; 2971 2972 const int32_t (*stateTable)[256]; 2973 const uint16_t *unicodeCodeUnits; 2974 2975 uint32_t offset; 2976 uint8_t state; 2977 2978 int32_t entry; 2979 UChar32 c; 2980 uint8_t action; 2981 2982 /* use optimized function if possible */ 2983 cnv=pArgs->converter; 2984 2985 if(cnv->preToULength>0) { 2986 /* use the generic code in ucnv_getNextUChar() to continue with a partial match */ 2987 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 2988 } 2989 2990 if(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SURROGATES) { 2991 /* 2992 * Using the generic ucnv_getNextUChar() code lets us deal correctly 2993 * with the rare case of a codepage that maps single surrogates 2994 * without adding the complexity to this already complicated function here. 2995 */ 2996 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 2997 } else if(cnv->sharedData->mbcs.countStates==1) { 2998 return ucnv_MBCSSingleGetNextUChar(pArgs, pErrorCode); 2999 } 3000 3001 /* set up the local pointers */ 3002 source = lastSource = reinterpret_cast<const uint8_t*>(pArgs->source); 3003 sourceLimit = reinterpret_cast<const uint8_t*>(pArgs->sourceLimit); 3004 3005 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 3006 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable; 3007 } else { 3008 stateTable=cnv->sharedData->mbcs.stateTable; 3009 } 3010 unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits; 3011 3012 /* get the converter state from UConverter */ 3013 offset=cnv->toUnicodeStatus; 3014 3015 /* 3016 * if we are in the SBCS state for a DBCS-only converter, 3017 * then load the DBCS state from the MBCS data 3018 * (dbcsOnlyState==0 if it is not a DBCS-only converter) 3019 */ 3020 if ((state = static_cast<uint8_t>(cnv->mode)) == 0) { 3021 state=cnv->sharedData->mbcs.dbcsOnlyState; 3022 } 3023 3024 /* 
conversion loop */ 3025 c=U_SENTINEL; 3026 while(source<sourceLimit) { 3027 entry=stateTable[state][*source++]; 3028 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 3029 state = static_cast<uint8_t>(MBCS_ENTRY_TRANSITION_STATE(entry)); 3030 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry); 3031 3032 /* optimization for 1/2-byte input and BMP output */ 3033 if( source<sourceLimit && 3034 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) && 3035 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 && 3036 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe 3037 ) { 3038 ++source; 3039 state = static_cast<uint8_t>(MBCS_ENTRY_FINAL_STATE(entry)); /* typically 0 */ 3040 /* output BMP code point */ 3041 break; 3042 } 3043 } else { 3044 /* save the previous state for proper extension mapping with SI/SO-stateful converters */ 3045 cnv->mode=state; 3046 3047 /* set the next state early so that we can reuse the entry variable */ 3048 state = static_cast<uint8_t>(MBCS_ENTRY_FINAL_STATE(entry)); /* typically 0 */ 3049 3050 /* 3051 * An if-else-if chain provides more reliable performance for 3052 * the most common cases compared to a switch. 
3053 */ 3054 action = static_cast<uint8_t>(MBCS_ENTRY_FINAL_ACTION(entry)); 3055 if(action==MBCS_STATE_VALID_DIRECT_16) { 3056 /* output BMP code point */ 3057 c = static_cast<char16_t>(MBCS_ENTRY_FINAL_VALUE_16(entry)); 3058 break; 3059 } else if(action==MBCS_STATE_VALID_16) { 3060 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 3061 c=unicodeCodeUnits[offset]; 3062 if(c<0xfffe) { 3063 /* output BMP code point */ 3064 break; 3065 } else if(c==0xfffe) { 3066 if(UCNV_TO_U_USE_FALLBACK(cnv) && (c=ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) { 3067 break; 3068 } 3069 } else { 3070 /* callback(illegal) */ 3071 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3072 } 3073 } else if(action==MBCS_STATE_VALID_16_PAIR) { 3074 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 3075 c=unicodeCodeUnits[offset++]; 3076 if(c<0xd800) { 3077 /* output BMP code point below 0xd800 */ 3078 break; 3079 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) { 3080 /* output roundtrip or fallback supplementary code point */ 3081 c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00); 3082 break; 3083 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) { 3084 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 3085 c=unicodeCodeUnits[offset]; 3086 break; 3087 } else if(c==0xffff) { 3088 /* callback(illegal) */ 3089 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3090 } 3091 } else if(action==MBCS_STATE_VALID_DIRECT_20 || 3092 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv)) 3093 ) { 3094 /* output supplementary code point */ 3095 c = static_cast<UChar32>(MBCS_ENTRY_FINAL_VALUE(entry) + 0x10000); 3096 break; 3097 } else if(action==MBCS_STATE_CHANGE_ONLY) { 3098 /* 3099 * This serves as a state change without any output. 3100 * It is useful for reading simple stateful encodings, 3101 * for example using just Shift-In/Shift-Out codes. 3102 * The 21 unused bits may later be used for more sophisticated 3103 * state transitions. 
3104 */ 3105 if(cnv->sharedData->mbcs.dbcsOnlyState!=0) { 3106 /* SI/SO are illegal for DBCS-only conversion */ 3107 state = static_cast<uint8_t>(cnv->mode); /* restore the previous state */ 3108 3109 /* callback(illegal) */ 3110 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3111 } 3112 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 3113 if(UCNV_TO_U_USE_FALLBACK(cnv)) { 3114 /* output BMP code point */ 3115 c = static_cast<char16_t>(MBCS_ENTRY_FINAL_VALUE_16(entry)); 3116 break; 3117 } 3118 } else if(action==MBCS_STATE_UNASSIGNED) { 3119 /* just fall through */ 3120 } else if(action==MBCS_STATE_ILLEGAL) { 3121 /* callback(illegal) */ 3122 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3123 } else { 3124 /* reserved (must never occur), or only state change */ 3125 offset=0; 3126 lastSource=source; 3127 continue; 3128 } 3129 3130 /* end of action codes: prepare for a new character */ 3131 offset=0; 3132 3133 if(U_FAILURE(*pErrorCode)) { 3134 /* callback(illegal) */ 3135 break; 3136 } else /* unassigned sequence */ { 3137 /* defer to the generic implementation */ 3138 cnv->toUnicodeStatus=0; 3139 cnv->mode=state; 3140 pArgs->source = reinterpret_cast<const char*>(lastSource); 3141 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 3142 } 3143 } 3144 } 3145 3146 if(c<0) { 3147 if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) { 3148 /* incomplete character byte sequence */ 3149 cnv->toULength = static_cast<int8_t>(source - lastSource); 3150 uprv_memcpy(cnv->toUBytes, lastSource, cnv->toULength); 3151 *pErrorCode=U_TRUNCATED_CHAR_FOUND; 3152 } else if(U_FAILURE(*pErrorCode)) { 3153 /* callback(illegal) */ 3154 /* 3155 * Ticket 5691: consistent illegal sequences: 3156 * - We include at least the first byte in the illegal sequence. 3157 * - If any of the non-initial bytes could be the start of a character, 3158 * we stop the illegal sequence before the first one of those. 
3159 */ 3160 UBool isDBCSOnly = static_cast<UBool>(cnv->sharedData->mbcs.dbcsOnlyState != 0); 3161 uint8_t *bytes=cnv->toUBytes; 3162 *bytes++=*lastSource++; /* first byte */ 3163 if(lastSource==source) { 3164 cnv->toULength=1; 3165 } else /* lastSource<source: multi-byte character */ { 3166 int8_t i; 3167 for(i=1; 3168 lastSource<source && !isSingleOrLead(stateTable, state, isDBCSOnly, *lastSource); 3169 ++i 3170 ) { 3171 *bytes++=*lastSource++; 3172 } 3173 cnv->toULength=i; 3174 source=lastSource; 3175 } 3176 } else { 3177 /* no output because of empty input or only state changes */ 3178 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 3179 } 3180 c=0xffff; 3181 } 3182 3183 /* set the converter state back into UConverter, ready for a new character */ 3184 cnv->toUnicodeStatus=0; 3185 cnv->mode=state; 3186 3187 /* write back the updated pointer */ 3188 pArgs->source = reinterpret_cast<const char*>(source); 3189 return c; 3190 } 3191 3192 #if 0 3193 /* 3194 * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus 3195 * Removal improves code coverage. 3196 */ 3197 /** 3198 * This version of ucnv_MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages. 3199 * It does not handle the EBCDIC swaplfnl option (set in UConverter). 3200 * It does not handle conversion extensions (_extToU()). 3201 */ 3202 U_CFUNC UChar32 3203 ucnv_MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData, 3204 uint8_t b, UBool useFallback) { 3205 int32_t entry; 3206 uint8_t action; 3207 3208 entry=sharedData->mbcs.stateTable[0][b]; 3209 /* MBCS_ENTRY_IS_FINAL(entry) */ 3210 3211 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 3212 /* output BMP code point */ 3213 return (char16_t)MBCS_ENTRY_FINAL_VALUE_16(entry); 3214 } 3215 3216 /* 3217 * An if-else-if chain provides more reliable performance for 3218 * the most common cases compared to a switch. 
3219 */ 3220 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 3221 if(action==MBCS_STATE_VALID_DIRECT_20) { 3222 /* output supplementary code point */ 3223 return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry); 3224 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 3225 if(!TO_U_USE_FALLBACK(useFallback)) { 3226 return 0xfffe; 3227 } 3228 /* output BMP code point */ 3229 return (char16_t)MBCS_ENTRY_FINAL_VALUE_16(entry); 3230 } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) { 3231 if(!TO_U_USE_FALLBACK(useFallback)) { 3232 return 0xfffe; 3233 } 3234 /* output supplementary code point */ 3235 return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry); 3236 } else if(action==MBCS_STATE_UNASSIGNED) { 3237 return 0xfffe; 3238 } else if(action==MBCS_STATE_ILLEGAL) { 3239 return 0xffff; 3240 } else { 3241 /* reserved, must never occur */ 3242 return 0xffff; 3243 } 3244 } 3245 #endif 3246 3247 /* 3248 * This is a simple version of _MBCSGetNextUChar() that is used 3249 * by other converter implementations. 3250 * It only returns an "assigned" result if it consumes the entire input. 3251 * It does not use state from the converter, nor error codes. 3252 * It does not handle the EBCDIC swaplfnl option (set in UConverter). 3253 * It handles conversion extensions but not GB 18030. 3254 * 3255 * Return value: 3256 * U+fffe unassigned 3257 * U+ffff illegal 3258 * otherwise the Unicode code point 3259 */ 3260 U_CFUNC UChar32 3261 ucnv_MBCSSimpleGetNextUChar(UConverterSharedData *sharedData, 3262 const char *source, int32_t length, 3263 UBool useFallback) { 3264 const int32_t (*stateTable)[256]; 3265 const uint16_t *unicodeCodeUnits; 3266 3267 uint32_t offset; 3268 uint8_t state, action; 3269 3270 UChar32 c; 3271 int32_t i, entry; 3272 3273 if(length<=0) { 3274 /* no input at all: "illegal" */ 3275 return 0xffff; 3276 } 3277 3278 #if 0 3279 /* 3280 * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. 
markus 3281 * TODO In future releases, verify that this function is never called for SBCS 3282 * conversions, i.e., that sharedData->mbcs.countStates==1 is still true. 3283 * Removal improves code coverage. 3284 */ 3285 /* use optimized function if possible */ 3286 if(sharedData->mbcs.countStates==1) { 3287 if(length==1) { 3288 return ucnv_MBCSSingleSimpleGetNextUChar(sharedData, (uint8_t)*source, useFallback); 3289 } else { 3290 return 0xffff; /* illegal: more than a single byte for an SBCS converter */ 3291 } 3292 } 3293 #endif 3294 3295 /* set up the local pointers */ 3296 stateTable=sharedData->mbcs.stateTable; 3297 unicodeCodeUnits=sharedData->mbcs.unicodeCodeUnits; 3298 3299 /* converter state */ 3300 offset=0; 3301 state=sharedData->mbcs.dbcsOnlyState; 3302 3303 /* conversion loop */ 3304 for(i=0;;) { 3305 entry=stateTable[state][(uint8_t)source[i++]]; 3306 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 3307 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 3308 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry); 3309 3310 if(i==length) { 3311 return 0xffff; /* truncated character */ 3312 } 3313 } else { 3314 /* 3315 * An if-else-if chain provides more reliable performance for 3316 * the most common cases compared to a switch. 
3317 */ 3318 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 3319 if(action==MBCS_STATE_VALID_16) { 3320 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 3321 c=unicodeCodeUnits[offset]; 3322 if(c!=0xfffe) { 3323 /* done */ 3324 } else if(UCNV_TO_U_USE_FALLBACK(cnv)) { 3325 c=ucnv_MBCSGetFallback(&sharedData->mbcs, offset); 3326 /* else done with 0xfffe */ 3327 } 3328 break; 3329 } else if(action==MBCS_STATE_VALID_DIRECT_16) { 3330 /* output BMP code point */ 3331 c=(char16_t)MBCS_ENTRY_FINAL_VALUE_16(entry); 3332 break; 3333 } else if(action==MBCS_STATE_VALID_16_PAIR) { 3334 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 3335 c=unicodeCodeUnits[offset++]; 3336 if(c<0xd800) { 3337 /* output BMP code point below 0xd800 */ 3338 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) { 3339 /* output roundtrip or fallback supplementary code point */ 3340 c=(UChar32)(((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00)); 3341 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) { 3342 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 3343 c=unicodeCodeUnits[offset]; 3344 } else if(c==0xffff) { 3345 return 0xffff; 3346 } else { 3347 c=0xfffe; 3348 } 3349 break; 3350 } else if(action==MBCS_STATE_VALID_DIRECT_20) { 3351 /* output supplementary code point */ 3352 c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry); 3353 break; 3354 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 3355 if(!TO_U_USE_FALLBACK(useFallback)) { 3356 c=0xfffe; 3357 break; 3358 } 3359 /* output BMP code point */ 3360 c=(char16_t)MBCS_ENTRY_FINAL_VALUE_16(entry); 3361 break; 3362 } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) { 3363 if(!TO_U_USE_FALLBACK(useFallback)) { 3364 c=0xfffe; 3365 break; 3366 } 3367 /* output supplementary code point */ 3368 c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry); 3369 break; 3370 } else if(action==MBCS_STATE_UNASSIGNED) { 3371 c=0xfffe; 3372 break; 3373 } 3374 3375 /* 3376 * forbid MBCS_STATE_CHANGE_ONLY for this function, 3377 * 
and MBCS_STATE_ILLEGAL and reserved action codes 3378 */ 3379 return 0xffff; 3380 } 3381 } 3382 3383 if(i!=length) { 3384 /* illegal for this function: not all input consumed */ 3385 return 0xffff; 3386 } 3387 3388 if(c==0xfffe) { 3389 /* try an extension mapping */ 3390 const int32_t *cx=sharedData->mbcs.extIndexes; 3391 if(cx!=nullptr) { 3392 return ucnv_extSimpleMatchToU(cx, source, length, useFallback); 3393 } 3394 } 3395 3396 return c; 3397 } 3398 3399 /* MBCS-from-Unicode conversion functions ----------------------------------- */ 3400 3401 /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages. */ 3402 static void 3403 ucnv_MBCSDoubleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 3404 UErrorCode *pErrorCode) { 3405 UConverter *cnv; 3406 const char16_t *source, *sourceLimit; 3407 uint8_t *target; 3408 int32_t targetCapacity; 3409 int32_t *offsets; 3410 3411 const uint16_t *table; 3412 const uint16_t *mbcsIndex; 3413 const uint8_t *bytes; 3414 3415 UChar32 c; 3416 3417 int32_t sourceIndex, nextSourceIndex; 3418 3419 uint32_t stage2Entry; 3420 uint32_t asciiRoundtrips; 3421 uint32_t value; 3422 uint8_t unicodeMask; 3423 3424 /* use optimized function if possible */ 3425 cnv=pArgs->converter; 3426 unicodeMask=cnv->sharedData->mbcs.unicodeMask; 3427 3428 /* set up the local pointers */ 3429 source=pArgs->source; 3430 sourceLimit=pArgs->sourceLimit; 3431 target = reinterpret_cast<uint8_t*>(pArgs->target); 3432 targetCapacity = static_cast<int32_t>(pArgs->targetLimit - pArgs->target); 3433 offsets=pArgs->offsets; 3434 3435 table=cnv->sharedData->mbcs.fromUnicodeTable; 3436 mbcsIndex=cnv->sharedData->mbcs.mbcsIndex; 3437 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 3438 bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 3439 } else { 3440 bytes=cnv->sharedData->mbcs.fromUnicodeBytes; 3441 } 3442 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips; 3443 3444 /* get the converter state from UConverter */ 3445 
c=cnv->fromUChar32; 3446 3447 /* sourceIndex=-1 if the current character began in the previous buffer */ 3448 sourceIndex= c==0 ? 0 : -1; 3449 nextSourceIndex=0; 3450 3451 /* conversion loop */ 3452 if(c!=0 && targetCapacity>0) { 3453 goto getTrail; 3454 } 3455 3456 while(source<sourceLimit) { 3457 /* 3458 * This following test is to see if available input would overflow the output. 3459 * It does not catch output of more than one byte that 3460 * overflows as a result of a multi-byte character or callback output 3461 * from the last source character. 3462 * Therefore, those situations also test for overflows and will 3463 * then break the loop, too. 3464 */ 3465 if(targetCapacity>0) { 3466 /* 3467 * Get a correct Unicode code point: 3468 * a single char16_t for a BMP code point or 3469 * a matched surrogate pair for a "supplementary code point". 3470 */ 3471 c=*source++; 3472 ++nextSourceIndex; 3473 if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) { 3474 *target++ = static_cast<uint8_t>(c); 3475 if(offsets!=nullptr) { 3476 *offsets++=sourceIndex; 3477 sourceIndex=nextSourceIndex; 3478 } 3479 --targetCapacity; 3480 c=0; 3481 continue; 3482 } 3483 /* 3484 * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX 3485 * to avoid dealing with surrogates. 3486 * MBCS_FAST_MAX must be >=0xd7ff. 3487 */ 3488 if(c<=0xd7ff) { 3489 value=DBCS_RESULT_FROM_MOST_BMP(mbcsIndex, (const uint16_t *)bytes, c); 3490 /* There are only roundtrips (!=0) and no-mapping (==0) entries. */ 3491 if(value==0) { 3492 goto unassigned; 3493 } 3494 /* output the value */ 3495 } else { 3496 /* 3497 * This also tests if the codepage maps single surrogates. 3498 * If it does, then surrogates are not paired but mapped separately. 3499 * Note that in this case unmatched surrogates are not detected. 
3500 */ 3501 if(U16_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) { 3502 if(U16_IS_SURROGATE_LEAD(c)) { 3503 getTrail: 3504 if(source<sourceLimit) { 3505 /* test the following code unit */ 3506 char16_t trail=*source; 3507 if(U16_IS_TRAIL(trail)) { 3508 ++source; 3509 ++nextSourceIndex; 3510 c=U16_GET_SUPPLEMENTARY(c, trail); 3511 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 3512 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 3513 /* callback(unassigned) */ 3514 goto unassigned; 3515 } 3516 /* convert this supplementary code point */ 3517 /* exit this condition tree */ 3518 } else { 3519 /* this is an unmatched lead code unit (1st surrogate) */ 3520 /* callback(illegal) */ 3521 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3522 break; 3523 } 3524 } else { 3525 /* no more input */ 3526 break; 3527 } 3528 } else { 3529 /* this is an unmatched trail code unit (2nd surrogate) */ 3530 /* callback(illegal) */ 3531 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3532 break; 3533 } 3534 } 3535 3536 /* convert the Unicode code point in c into codepage bytes */ 3537 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 3538 3539 /* get the bytes and the length for the output */ 3540 /* MBCS_OUTPUT_2 */ 3541 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 3542 3543 /* is this code point assigned, or do we use fallbacks? */ 3544 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) || 3545 (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0)) 3546 ) { 3547 /* 3548 * We allow a 0 byte output if the "assigned" bit is set for this entry. 3549 * There is no way with this data structure for fallback output 3550 * to be a zero byte. 
3551 */ 3552 3553 unassigned: 3554 /* try an extension mapping */ 3555 pArgs->source=source; 3556 c=_extFromU(cnv, cnv->sharedData, 3557 c, &source, sourceLimit, 3558 &target, target+targetCapacity, 3559 &offsets, sourceIndex, 3560 pArgs->flush, 3561 pErrorCode); 3562 nextSourceIndex += static_cast<int32_t>(source - pArgs->source); 3563 3564 if(U_FAILURE(*pErrorCode)) { 3565 /* not mappable or buffer overflow */ 3566 break; 3567 } else { 3568 /* a mapping was written to the target, continue */ 3569 3570 /* recalculate the targetCapacity after an extension mapping */ 3571 targetCapacity = static_cast<int32_t>(pArgs->targetLimit - reinterpret_cast<char*>(target)); 3572 3573 /* normal end of conversion: prepare for a new character */ 3574 sourceIndex=nextSourceIndex; 3575 continue; 3576 } 3577 } 3578 } 3579 3580 /* write the output character bytes from value and length */ 3581 /* from the first if in the loop we know that targetCapacity>0 */ 3582 if(value<=0xff) { 3583 /* this is easy because we know that there is enough space */ 3584 *target++ = static_cast<uint8_t>(value); 3585 if(offsets!=nullptr) { 3586 *offsets++=sourceIndex; 3587 } 3588 --targetCapacity; 3589 } else /* length==2 */ { 3590 *target++ = static_cast<uint8_t>(value >> 8); 3591 if(2<=targetCapacity) { 3592 *target++ = static_cast<uint8_t>(value); 3593 if(offsets!=nullptr) { 3594 *offsets++=sourceIndex; 3595 *offsets++=sourceIndex; 3596 } 3597 targetCapacity-=2; 3598 } else { 3599 if(offsets!=nullptr) { 3600 *offsets++=sourceIndex; 3601 } 3602 cnv->charErrorBuffer[0] = static_cast<char>(value); 3603 cnv->charErrorBufferLength=1; 3604 3605 /* target overflow */ 3606 targetCapacity=0; 3607 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 3608 c=0; 3609 break; 3610 } 3611 } 3612 3613 /* normal end of conversion: prepare for a new character */ 3614 c=0; 3615 sourceIndex=nextSourceIndex; 3616 continue; 3617 } else { 3618 /* target is full */ 3619 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 3620 break; 3621 } 3622 } 3623 3624 
/* set the converter state back into UConverter */ 3625 cnv->fromUChar32=c; 3626 3627 /* write back the updated pointers */ 3628 pArgs->source=source; 3629 pArgs->target = reinterpret_cast<char*>(target); 3630 pArgs->offsets=offsets; 3631 } 3632 3633 /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for single-byte codepages. */ 3634 static void 3635 ucnv_MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 3636 UErrorCode *pErrorCode) { 3637 UConverter *cnv; 3638 const char16_t *source, *sourceLimit; 3639 uint8_t *target; 3640 int32_t targetCapacity; 3641 int32_t *offsets; 3642 3643 const uint16_t *table; 3644 const uint16_t *results; 3645 3646 UChar32 c; 3647 3648 int32_t sourceIndex, nextSourceIndex; 3649 3650 uint16_t value, minValue; 3651 UBool hasSupplementary; 3652 3653 /* set up the local pointers */ 3654 cnv=pArgs->converter; 3655 source=pArgs->source; 3656 sourceLimit=pArgs->sourceLimit; 3657 target = reinterpret_cast<uint8_t*>(pArgs->target); 3658 targetCapacity = static_cast<int32_t>(pArgs->targetLimit - pArgs->target); 3659 offsets=pArgs->offsets; 3660 3661 table=cnv->sharedData->mbcs.fromUnicodeTable; 3662 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 3663 results = reinterpret_cast<uint16_t*>(cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes); 3664 } else { 3665 results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes; 3666 } 3667 3668 if(cnv->useFallback) { 3669 /* use all roundtrip and fallback results */ 3670 minValue=0x800; 3671 } else { 3672 /* use only roundtrips and fallbacks from private-use characters */ 3673 minValue=0xc00; 3674 } 3675 hasSupplementary = static_cast<UBool>(cnv->sharedData->mbcs.unicodeMask & UCNV_HAS_SUPPLEMENTARY); 3676 3677 /* get the converter state from UConverter */ 3678 c=cnv->fromUChar32; 3679 3680 /* sourceIndex=-1 if the current character began in the previous buffer */ 3681 sourceIndex= c==0 ? 
0 : -1; 3682 nextSourceIndex=0; 3683 3684 /* conversion loop */ 3685 if(c!=0 && targetCapacity>0) { 3686 goto getTrail; 3687 } 3688 3689 while(source<sourceLimit) { 3690 /* 3691 * This following test is to see if available input would overflow the output. 3692 * It does not catch output of more than one byte that 3693 * overflows as a result of a multi-byte character or callback output 3694 * from the last source character. 3695 * Therefore, those situations also test for overflows and will 3696 * then break the loop, too. 3697 */ 3698 if(targetCapacity>0) { 3699 /* 3700 * Get a correct Unicode code point: 3701 * a single char16_t for a BMP code point or 3702 * a matched surrogate pair for a "supplementary code point". 3703 */ 3704 c=*source++; 3705 ++nextSourceIndex; 3706 if(U16_IS_SURROGATE(c)) { 3707 if(U16_IS_SURROGATE_LEAD(c)) { 3708 getTrail: 3709 if(source<sourceLimit) { 3710 /* test the following code unit */ 3711 char16_t trail=*source; 3712 if(U16_IS_TRAIL(trail)) { 3713 ++source; 3714 ++nextSourceIndex; 3715 c=U16_GET_SUPPLEMENTARY(c, trail); 3716 if(!hasSupplementary) { 3717 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 3718 /* callback(unassigned) */ 3719 goto unassigned; 3720 } 3721 /* convert this supplementary code point */ 3722 /* exit this condition tree */ 3723 } else { 3724 /* this is an unmatched lead code unit (1st surrogate) */ 3725 /* callback(illegal) */ 3726 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3727 break; 3728 } 3729 } else { 3730 /* no more input */ 3731 break; 3732 } 3733 } else { 3734 /* this is an unmatched trail code unit (2nd surrogate) */ 3735 /* callback(illegal) */ 3736 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3737 break; 3738 } 3739 } 3740 3741 /* convert the Unicode code point in c into codepage bytes */ 3742 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 3743 3744 /* is this code point assigned, or do we use fallbacks? 
*/ 3745 if(value>=minValue) { 3746 /* assigned, write the output character bytes from value and length */ 3747 /* length==1 */ 3748 /* this is easy because we know that there is enough space */ 3749 *target++ = static_cast<uint8_t>(value); 3750 if(offsets!=nullptr) { 3751 *offsets++=sourceIndex; 3752 } 3753 --targetCapacity; 3754 3755 /* normal end of conversion: prepare for a new character */ 3756 c=0; 3757 sourceIndex=nextSourceIndex; 3758 } else { /* unassigned */ 3759 unassigned: 3760 /* try an extension mapping */ 3761 pArgs->source=source; 3762 c=_extFromU(cnv, cnv->sharedData, 3763 c, &source, sourceLimit, 3764 &target, target+targetCapacity, 3765 &offsets, sourceIndex, 3766 pArgs->flush, 3767 pErrorCode); 3768 nextSourceIndex += static_cast<int32_t>(source - pArgs->source); 3769 3770 if(U_FAILURE(*pErrorCode)) { 3771 /* not mappable or buffer overflow */ 3772 break; 3773 } else { 3774 /* a mapping was written to the target, continue */ 3775 3776 /* recalculate the targetCapacity after an extension mapping */ 3777 targetCapacity = static_cast<int32_t>(pArgs->targetLimit - reinterpret_cast<char*>(target)); 3778 3779 /* normal end of conversion: prepare for a new character */ 3780 sourceIndex=nextSourceIndex; 3781 } 3782 } 3783 } else { 3784 /* target is full */ 3785 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 3786 break; 3787 } 3788 } 3789 3790 /* set the converter state back into UConverter */ 3791 cnv->fromUChar32=c; 3792 3793 /* write back the updated pointers */ 3794 pArgs->source=source; 3795 pArgs->target = reinterpret_cast<char*>(target); 3796 pArgs->offsets=offsets; 3797 } 3798 3799 /* 3800 * This version of ucnv_MBCSFromUnicode() is optimized for single-byte codepages 3801 * that map only to and from the BMP. 3802 * In addition to single-byte/state optimizations, the offset calculations 3803 * become much easier. 
* It would be possible to use the sbcsIndex for UTF-8-friendly tables,
 * but measurements have shown that this diminishes performance
 * in more cases than it improves it.
 * See SVN revision 21013 (2007-feb-06) for the last version with #if switches
 * for various MBCS and SBCS optimizations.
 *
 * Input and output:
 * - pArgs supplies converter, source/sourceLimit, target/targetLimit, offsets
 *   and flush; source, target and offsets are written back advanced.
 * - A lead surrogate whose trail unit has not been seen yet is carried across
 *   calls in cnv->fromUChar32.
 * - *pErrorCode is set to U_ILLEGAL_CHAR_FOUND for unmatched surrogates,
 *   U_TRUNCATED_CHAR_FOUND for a lone lead surrogate at the end of input
 *   when pArgs->flush is set, and U_BUFFER_OVERFLOW_ERROR when input remains
 *   but the target is full. _extFromU() may set further error codes.
 */
static void
ucnv_MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs,
                                  UErrorCode *pErrorCode) {
    UConverter *cnv;
    const char16_t *source, *sourceLimit, *lastSource;
    uint8_t *target;
    int32_t targetCapacity, length;
    int32_t *offsets;

    const uint16_t *table;
    const uint16_t *results;

    UChar32 c;

    int32_t sourceIndex;

    uint32_t asciiRoundtrips;
    uint16_t value, minValue;

    /* set up the local pointers */
    cnv=pArgs->converter;
    source=pArgs->source;
    sourceLimit=pArgs->sourceLimit;
    target = reinterpret_cast<uint8_t*>(pArgs->target);
    targetCapacity = static_cast<int32_t>(pArgs->targetLimit - pArgs->target);
    offsets=pArgs->offsets;

    /* select the result table; the LF/NL-swapping variant is used if the option is set */
    table=cnv->sharedData->mbcs.fromUnicodeTable;
    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
        results = reinterpret_cast<uint16_t*>(cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes);
    } else {
        results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
    }
    asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;

    /* minValue is the threshold that a table result must reach to be output */
    if(cnv->useFallback) {
        /* use all roundtrip and fallback results */
        minValue=0x800;
    } else {
        /* use only roundtrips and fallbacks from private-use characters */
        minValue=0xc00;
    }

    /* get the converter state from UConverter */
    c=cnv->fromUChar32;

    /* sourceIndex=-1 if the current character began in the previous buffer */
    sourceIndex= c==0 ? 0 : -1;
    /* lastSource marks the start of the input run whose offsets have not been written yet */
    lastSource=source;

    /*
     * since the conversion here is 1:1 char16_t:uint8_t, we need only one counter
     * for the minimum of the sourceLength and targetCapacity
     */
    length = static_cast<int32_t>(sourceLimit - source);
    if(length<targetCapacity) {
        targetCapacity=length;
    }

    /* conversion loop */
    /* a lead surrogate is pending from the previous call: resume at the trail-unit test */
    if(c!=0 && targetCapacity>0) {
        goto getTrail;
    }

#if MBCS_UNROLL_SINGLE_FROM_BMP
    /* unrolling makes it slower on Pentium III/Windows 2000?! */
    /* unroll the loop with the most common case */
unrolled:
    if(targetCapacity>=4) {
        int32_t count, loops;
        uint16_t andedValues;

        loops=count=targetCapacity>>2;
        do {
            c=*source++;
            andedValues=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
            *target++=(uint8_t)value;
            c=*source++;
            andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
            *target++=(uint8_t)value;
            c=*source++;
            andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
            *target++=(uint8_t)value;
            c=*source++;
            andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
            *target++=(uint8_t)value;

            /* were all 4 entries really valid? */
            if(andedValues<minValue) {
                /* no, return to the first of these 4 */
                source-=4;
                target-=4;
                break;
            }
        } while(--count>0);
        /* count becomes the number of groups of 4 that were fully converted */
        count=loops-count;
        targetCapacity-=4*count;

        if(offsets!=nullptr) {
            lastSource+=4*count;
            while(count>0) {
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                *offsets++=sourceIndex++;
                --count;
            }
        }

        c=0;
    }
#endif

    while(targetCapacity>0) {
        /*
         * Get a correct Unicode code point:
         * a single char16_t for a BMP code point or
         * a matched surrogate pair for a "supplementary code point".
         */
        c=*source++;
        /*
         * Do not immediately check for single surrogates:
         * Assume that they are unassigned and check for them in that case.
         * This speeds up the conversion of assigned characters.
         */
        /* convert the Unicode code point in c into codepage bytes */
        if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
            /* fast path: the ASCII character maps to the same byte value */
            *target++ = static_cast<uint8_t>(c);
            --targetCapacity;
            c=0;
            continue;
        }
        value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
        /* is this code point assigned, or do we use fallbacks? */
        if(value>=minValue) {
            /* assigned, write the output character bytes from value and length */
            /* length==1 */
            /* this is easy because we know that there is enough space */
            *target++ = static_cast<uint8_t>(value);
            --targetCapacity;

            /* normal end of conversion: prepare for a new character */
            c=0;
            continue;
        } else if(!U16_IS_SURROGATE(c)) {
            /* normal, unassigned BMP character */
        } else if(U16_IS_SURROGATE_LEAD(c)) {
            /* getTrail is also the entry point for a lead surrogate saved by a previous call */
getTrail:
            if(source<sourceLimit) {
                /* test the following code unit */
                char16_t trail=*source;
                if(U16_IS_TRAIL(trail)) {
                    ++source;
                    c=U16_GET_SUPPLEMENTARY(c, trail);
                    /* this codepage does not map supplementary code points */
                    /* callback(unassigned) */
                } else {
                    /* this is an unmatched lead code unit (1st surrogate) */
                    /* callback(illegal) */
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                    break;
                }
            } else {
                /* no more input */
                if (pArgs->flush) {
                    *pErrorCode=U_TRUNCATED_CHAR_FOUND;
                }
                break;
            }
        } else {
            /* this is an unmatched trail code unit (2nd surrogate) */
            /* callback(illegal) */
            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
            break;
        }

        /* c does not have a mapping */

        /* get the number of code units for c to correctly advance sourceIndex */
        length=U16_LENGTH(c);

        /* set offsets since the start or the last extension */
        if(offsets!=nullptr) {
            int32_t count = static_cast<int32_t>(source - lastSource);

            /* do not set the offset for this character */
            count-=length;

            while(count>0) {
                *offsets++=sourceIndex++;
                --count;
            }
            /* offsets and sourceIndex are now set for the current character */
        }

        /* try an extension mapping */
        lastSource=source;
        c=_extFromU(cnv, cnv->sharedData,
                    c, &source, sourceLimit,
                    &target, reinterpret_cast<const uint8_t*>(pArgs->targetLimit),
                    &offsets, sourceIndex,
                    pArgs->flush,
                    pErrorCode);
        /* skip the current character plus whatever further input _extFromU() consumed */
        sourceIndex += length + static_cast<int32_t>(source - lastSource);
        lastSource=source;

        if(U_FAILURE(*pErrorCode)) {
            /* not mappable or buffer overflow */
            break;
        } else {
            /* a mapping was written to the target, continue */

            /* recalculate the targetCapacity after an extension mapping */
            targetCapacity = static_cast<int32_t>(pArgs->targetLimit - reinterpret_cast<char*>(target));
            length = static_cast<int32_t>(sourceLimit - source);
            if(length<targetCapacity) {
                targetCapacity=length;
            }
        }

#if MBCS_UNROLL_SINGLE_FROM_BMP
        /* unrolling makes it slower on Pentium III/Windows 2000?! */
        goto unrolled;
#endif
    }

    /*
     * targetCapacity was clamped to the input length above, so the loop ending
     * does not by itself mean that the target is full: check the real limit.
     */
    if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) {
        /* target is full */
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    }

    /* set offsets since the start or the last callback */
    if(offsets!=nullptr) {
        size_t count=source-lastSource;
        if (count > 0 && *pErrorCode == U_TRUNCATED_CHAR_FOUND) {
            /*
            Caller gave us a partial supplementary character,
            which this function couldn't convert in any case.
            The callback will handle the offset.
            */
            count--;
        }
        while(count>0) {
            *offsets++=sourceIndex++;
            --count;
        }
    }

    /* set the converter state back into UConverter */
    cnv->fromUChar32=c;

    /* write back the updated pointers */
    pArgs->source=source;
    pArgs->target = reinterpret_cast<char*>(target);
    pArgs->offsets=offsets;
}

/*
 * General MBCS fromUnicode conversion, with optional offsets.
 *
 * Steps, in order:
 * - If an extension match is pending (cnv->preFromUFirstCP>=0), continue it
 *   first via ucnv_extContinueMatchFromU(); return if that fails or if
 *   cnv->preFromULength<0 afterwards.
 * - Delegate to the single-byte functions (MBCS_OUTPUT_1 without
 *   UCNV_HAS_SURROGATES) or to the double-byte function
 *   (MBCS_OUTPUT_2 with utf8Friendly data) when possible.
 * - Otherwise convert each code point with the mbcsIndex lookup
 *   (utf8Friendly data, c<=0xd7ff) or the stage 1/2/3 lookup, and hand
 *   code points without a usable mapping to _extFromU().
 * - For MBCS_OUTPUT_2_SISO, prevLength tracks the current mode
 *   (1=single-byte, 2=double-byte); SI/SO bytes are prepended to the output
 *   on a mode change, and an SI is emitted at the end of flushed input that
 *   ends in double-byte mode.
 *
 * On return, cnv->fromUChar32 holds a pending lead surrogate (or 0),
 * cnv->fromUnicodeStatus holds prevLength, and the source, target and
 * offsets pointers in pArgs are updated. Output bytes that do not fit into
 * the target are stored in cnv->charErrorBuffer with U_BUFFER_OVERFLOW_ERROR.
 */
U_CFUNC void
ucnv_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
                                UErrorCode *pErrorCode) {
    UConverter *cnv;
    const char16_t *source, *sourceLimit;
    uint8_t *target;
    int32_t targetCapacity;
    int32_t *offsets;

    const uint16_t *table;
    const uint16_t *mbcsIndex;
    const uint8_t *p, *bytes;
    uint8_t outputType;

    UChar32 c;

    int32_t prevSourceIndex, sourceIndex, nextSourceIndex;

    uint32_t stage2Entry;
    uint32_t asciiRoundtrips;
    uint32_t value;
    /* Shift-In and Shift-Out byte sequences differ by encoding scheme. */
    uint8_t siBytes[2] = {0, 0};
    uint8_t soBytes[2] = {0, 0};
    uint8_t siLength, soLength;
    int32_t length = 0, prevLength;
    uint8_t unicodeMask;

    cnv=pArgs->converter;

    if(cnv->preFromUFirstCP>=0) {
        /*
         * pass sourceIndex=-1 because we continue from an earlier buffer
         * in the future, this may change with continuous offsets
         */
        ucnv_extContinueMatchFromU(cnv, pArgs, -1, pErrorCode);

        if(U_FAILURE(*pErrorCode) || cnv->preFromULength<0) {
            return;
        }
    }

    /* use optimized function if possible */
    outputType=cnv->sharedData->mbcs.outputType;
    unicodeMask=cnv->sharedData->mbcs.unicodeMask;
    if(outputType==MBCS_OUTPUT_1 && !(unicodeMask&UCNV_HAS_SURROGATES)) {
        if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
            ucnv_MBCSSingleFromBMPWithOffsets(pArgs, pErrorCode);
        } else {
            ucnv_MBCSSingleFromUnicodeWithOffsets(pArgs, pErrorCode);
        }
        return;
    } else if(outputType==MBCS_OUTPUT_2 && cnv->sharedData->mbcs.utf8Friendly) {
        ucnv_MBCSDoubleFromUnicodeWithOffsets(pArgs, pErrorCode);
        return;
    }

    /* set up the local pointers */
    source=pArgs->source;
    sourceLimit=pArgs->sourceLimit;
    target=(uint8_t *)pArgs->target;
    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
    offsets=pArgs->offsets;

    table=cnv->sharedData->mbcs.fromUnicodeTable;
    /* mbcsIndex!=nullptr selects the utf8Friendly fast path in the loop below */
    if(cnv->sharedData->mbcs.utf8Friendly) {
        mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
    } else {
        mbcsIndex=nullptr;
    }
    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
        bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
    } else {
        bytes=cnv->sharedData->mbcs.fromUnicodeBytes;
    }
    asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;

    /* get the converter state from UConverter */
    c=cnv->fromUChar32;

    if(outputType==MBCS_OUTPUT_2_SISO) {
        prevLength=cnv->fromUnicodeStatus;
        if(prevLength==0) {
            /* set the real value */
            prevLength=1;
        }
    } else {
        /* prevent fromUnicodeStatus from being set to something non-0 */
        prevLength=0;
    }

    /* sourceIndex=-1 if the current character began in the previous buffer */
    prevSourceIndex=-1;
    sourceIndex= c==0 ? 0 : -1;
    nextSourceIndex=0;

    /* Get the SI/SO character for the converter */
    siLength = static_cast<uint8_t>(getSISOBytes(SI, cnv->options, siBytes));
    soLength = static_cast<uint8_t>(getSISOBytes(SO, cnv->options, soBytes));

    /* conversion loop */
    /*
     * This is another piece of ugly code:
     * A goto into the loop if the converter state contains a first surrogate
     * from the previous function call.
     * It saves me to check in each loop iteration a check of if(c==0)
     * and duplicating the trail-surrogate-handling code in the else
     * branch of that check.
     * I could not find any other way to get around this other than
     * using a function call for the conversion and callback, which would
     * be even more inefficient.
     *
     * Markus Scherer 2000-jul-19
     */
    if(c!=0 && targetCapacity>0) {
        goto getTrail;
    }

    while(source<sourceLimit) {
        /*
         * This following test is to see if available input would overflow the output.
         * It does not catch output of more than one byte that
         * overflows as a result of a multi-byte character or callback output
         * from the last source character.
         * Therefore, those situations also test for overflows and will
         * then break the loop, too.
         */
        if(targetCapacity>0) {
            /*
             * Get a correct Unicode code point:
             * a single char16_t for a BMP code point or
             * a matched surrogate pair for a "supplementary code point".
             */
            c=*source++;
            ++nextSourceIndex;
            if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
                /* fast path: the ASCII character maps to the same byte value */
                *target++=(uint8_t)c;
                if(offsets!=nullptr) {
                    *offsets++=sourceIndex;
                    prevSourceIndex=sourceIndex;
                    sourceIndex=nextSourceIndex;
                }
                --targetCapacity;
                c=0;
                continue;
            }
            /*
             * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX
             * to avoid dealing with surrogates.
             * MBCS_FAST_MAX must be >=0xd7ff.
             */
            if(c<=0xd7ff && mbcsIndex!=nullptr) {
                /* value is first the stage 3 block index for c, then the result bytes */
                value=mbcsIndex[c>>6];

                /* get the bytes and the length for the output (copied from below and adapted for utf8Friendly data) */
                /* There are only roundtrips (!=0) and no-mapping (==0) entries. */
                switch(outputType) {
                case MBCS_OUTPUT_2:
                    value=((const uint16_t *)bytes)[value +(c&0x3f)];
                    if(value<=0xff) {
                        if(value==0) {
                            goto unassigned;
                        } else {
                            length=1;
                        }
                    } else {
                        length=2;
                    }
                    break;
                case MBCS_OUTPUT_2_SISO:
                    /* 1/2-byte stateful with Shift-In/Shift-Out */
                    /*
                     * Save the old state in the converter object
                     * right here, then change the local prevLength state variable if necessary.
                     * Then, if this character turns out to be unassigned or a fallback that
                     * is not taken, the callback code must not save the new state in the converter
                     * because the new state is for a character that is not output.
                     * However, the callback must still restore the state from the converter
                     * in case the callback function changed it for its output.
                     */
                    cnv->fromUnicodeStatus=prevLength; /* save the old state */
                    value=((const uint16_t *)bytes)[value +(c&0x3f)];
                    if(value<=0xff) {
                        if(value==0) {
                            goto unassigned;
                        } else if(prevLength<=1) {
                            length=1;
                        } else {
                            /* change from double-byte mode to single-byte */
                            /* the SI bytes are placed above the character byte in value */
                            if (siLength == 1) {
                                value|=(uint32_t)siBytes[0]<<8;
                                length = 2;
                            } else if (siLength == 2) {
                                value|=(uint32_t)siBytes[1]<<8;
                                value|=(uint32_t)siBytes[0]<<16;
                                length = 3;
                            }
                            prevLength=1;
                        }
                    } else {
                        if(prevLength==2) {
                            length=2;
                        } else {
                            /* change from single-byte mode to double-byte */
                            /* the SO bytes are placed above the two character bytes in value */
                            if (soLength == 1) {
                                value|=(uint32_t)soBytes[0]<<16;
                                length = 3;
                            } else if (soLength == 2) {
                                value|=(uint32_t)soBytes[1]<<16;
                                value|=(uint32_t)soBytes[0]<<24;
                                length = 4;
                            }
                            prevLength=2;
                        }
                    }
                    break;
                case MBCS_OUTPUT_DBCS_ONLY:
                    /* table with single-byte results, but only DBCS mappings used */
                    value=((const uint16_t *)bytes)[value +(c&0x3f)];
                    if(value<=0xff) {
                        /* no mapping or SBCS result, not taken for DBCS-only */
                        goto unassigned;
                    } else {
                        length=2;
                    }
                    break;
                case MBCS_OUTPUT_3:
                    p=bytes+(value+(c&0x3f))*3;
                    value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
                    if(value<=0xff) {
                        if(value==0) {
                            goto unassigned;
                        } else {
                            length=1;
                        }
                    } else if(value<=0xffff) {
                        length=2;
                    } else {
                        length=3;
                    }
                    break;
                case MBCS_OUTPUT_4:
                    value=((const uint32_t *)bytes)[value +(c&0x3f)];
                    if(value<=0xff) {
                        if(value==0) {
                            goto unassigned;
                        } else {
                            length=1;
                        }
                    } else if(value<=0xffff) {
                        length=2;
                    } else if(value<=0xffffff) {
                        length=3;
                    } else {
                        length=4;
                    }
                    break;
                case MBCS_OUTPUT_3_EUC:
                    value=((const uint16_t *)bytes)[value +(c&0x3f)];
                    /* EUC 16-bit fixed-length representation */
                    if(value<=0xff) {
                        if(value==0) {
                            goto unassigned;
                        } else {
                            length=1;
                        }
                    } else if((value&0x8000)==0) {
                        value|=0x8e8000;
                        length=3;
                    } else if((value&0x80)==0) {
                        value|=0x8f0080;
                        length=3;
                    } else {
                        length=2;
                    }
                    break;
                case MBCS_OUTPUT_4_EUC:
                    p=bytes+(value+(c&0x3f))*3;
                    value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
                    /* EUC 16-bit fixed-length representation applied to the first two bytes */
                    if(value<=0xff) {
                        if(value==0) {
                            goto unassigned;
                        } else {
                            length=1;
                        }
                    } else if(value<=0xffff) {
                        length=2;
                    } else if((value&0x800000)==0) {
                        value|=0x8e800000;
                        length=4;
                    } else if((value&0x8000)==0) {
                        value|=0x8f008000;
                        length=4;
                    } else {
                        length=3;
                    }
                    break;
                default:
                    /* must not occur */
                    /*
                     * To avoid compiler warnings that value & length may be
                     * used without having been initialized, we set them here.
                     * In reality, this is unreachable code.
                     * Not having a default branch also causes warnings with
                     * some compilers.
                     */
                    value=0;
                    length=0;
                    break;
                }
                /* output the value */
            } else {
                /*
                 * This also tests if the codepage maps single surrogates.
                 * If it does, then surrogates are not paired but mapped separately.
                 * Note that in this case unmatched surrogates are not detected.
                 */
                if(U16_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
                    if(U16_IS_SURROGATE_LEAD(c)) {
                        /* getTrail is also the entry point for a lead surrogate saved by a previous call */
getTrail:
                        if(source<sourceLimit) {
                            /* test the following code unit */
                            char16_t trail=*source;
                            if(U16_IS_TRAIL(trail)) {
                                ++source;
                                ++nextSourceIndex;
                                c=U16_GET_SUPPLEMENTARY(c, trail);
                                if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
                                    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
                                    cnv->fromUnicodeStatus=prevLength; /* save the old state */
                                    /* callback(unassigned) */
                                    goto unassigned;
                                }
                                /* convert this supplementary code point */
                                /* exit this condition tree */
                            } else {
                                /* this is an unmatched lead code unit (1st surrogate) */
                                /* callback(illegal) */
                                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                                break;
                            }
                        } else {
                            /* no more input */
                            break;
                        }
                    } else {
                        /* this is an unmatched trail code unit (2nd surrogate) */
                        /* callback(illegal) */
                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                        break;
                    }
                }

                /* convert the Unicode code point in c into codepage bytes */

                /*
                 * The basic lookup is a triple-stage compact array (trie) lookup.
                 * For details see the beginning of this file.
                 *
                 * Single-byte codepages are handled with a different data structure
                 * by _MBCSSingle... functions.
                 *
                 * The result consists of a 32-bit value from stage 2 and
                 * a pointer to as many bytes as are stored per character.
                 * The pointer points to the character's bytes in stage 3.
                 * Bits 15..0 of the stage 2 entry contain the stage 3 index
                 * for that pointer, while bits 31..16 are flags for which of
                 * the 16 characters in the block are roundtrip-assigned.
                 *
                 * For 2-byte and 4-byte codepages, the bytes are stored as uint16_t
                 * respectively as uint32_t, in the platform encoding.
                 * For 3-byte codepages, the bytes are always stored in big-endian order.
                 *
                 * For EUC encodings that use only either 0x8e or 0x8f as the first
                 * byte of their longest byte sequences, the first two bytes in
                 * this third stage indicate with their 7th bits whether these bytes
                 * are to be written directly or actually need to be preceded by
                 * one of the two Single-Shift codes. With this, the third stage
                 * stores one byte fewer per character than the actual maximum length of
                 * EUC byte sequences.
                 *
                 * Other than that, leading zero bytes are removed and the other
                 * bytes output. A single zero byte may be output if the "assigned"
                 * bit in stage 2 was on.
                 * The data structure does not support zero byte output as a fallback,
                 * and also does not allow output of leading zeros.
                 */
                stage2Entry=MBCS_STAGE_2_FROM_U(table, c);

                /* get the bytes and the length for the output */
                switch(outputType) {
                case MBCS_OUTPUT_2:
                    value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
                    if(value<=0xff) {
                        length=1;
                    } else {
                        length=2;
                    }
                    break;
                case MBCS_OUTPUT_2_SISO:
                    /* 1/2-byte stateful with Shift-In/Shift-Out */
                    /*
                     * Save the old state in the converter object
                     * right here, then change the local prevLength state variable if necessary.
                     * Then, if this character turns out to be unassigned or a fallback that
                     * is not taken, the callback code must not save the new state in the converter
                     * because the new state is for a character that is not output.
                     * However, the callback must still restore the state from the converter
                     * in case the callback function changed it for its output.
                     */
                    cnv->fromUnicodeStatus=prevLength; /* save the old state */
                    value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
                    if(value<=0xff) {
                        if(value==0 && MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)==0) {
                            /* no mapping, leave value==0 */
                            length=0;
                        } else if(prevLength<=1) {
                            length=1;
                        } else {
                            /* change from double-byte mode to single-byte */
                            if (siLength == 1) {
                                value|=(uint32_t)siBytes[0]<<8;
                                length = 2;
                            } else if (siLength == 2) {
                                value|=(uint32_t)siBytes[1]<<8;
                                value|=(uint32_t)siBytes[0]<<16;
                                length = 3;
                            }
                            prevLength=1;
                        }
                    } else {
                        if(prevLength==2) {
                            length=2;
                        } else {
                            /* change from single-byte mode to double-byte */
                            if (soLength == 1) {
                                value|=(uint32_t)soBytes[0]<<16;
                                length = 3;
                            } else if (soLength == 2) {
                                value|=(uint32_t)soBytes[1]<<16;
                                value|=(uint32_t)soBytes[0]<<24;
                                length = 4;
                            }
                            prevLength=2;
                        }
                    }
                    break;
                case MBCS_OUTPUT_DBCS_ONLY:
                    /* table with single-byte results, but only DBCS mappings used */
                    value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
                    if(value<=0xff) {
                        /* no mapping or SBCS result, not taken for DBCS-only */
                        value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
                        length=0;
                    } else {
                        length=2;
                    }
                    break;
                case MBCS_OUTPUT_3:
                    p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
                    value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
                    if(value<=0xff) {
                        length=1;
                    } else if(value<=0xffff) {
                        length=2;
                    } else {
                        length=3;
                    }
                    break;
                case MBCS_OUTPUT_4:
                    value=MBCS_VALUE_4_FROM_STAGE_2(bytes, stage2Entry, c);
                    if(value<=0xff) {
                        length=1;
                    } else if(value<=0xffff) {
                        length=2;
                    } else if(value<=0xffffff) {
                        length=3;
                    } else {
                        length=4;
                    }
                    break;
                case MBCS_OUTPUT_3_EUC:
                    value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
                    /* EUC 16-bit fixed-length representation */
                    if(value<=0xff) {
                        length=1;
                    } else if((value&0x8000)==0) {
                        value|=0x8e8000;
                        length=3;
                    } else if((value&0x80)==0) {
                        value|=0x8f0080;
                        length=3;
                    } else {
                        length=2;
                    }
                    break;
                case MBCS_OUTPUT_4_EUC:
                    p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
                    value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
                    /* EUC 16-bit fixed-length representation applied to the first two bytes */
                    if(value<=0xff) {
                        length=1;
                    } else if(value<=0xffff) {
                        length=2;
                    } else if((value&0x800000)==0) {
                        value|=0x8e800000;
                        length=4;
                    } else if((value&0x8000)==0) {
                        value|=0x8f008000;
                        length=4;
                    } else {
                        length=3;
                    }
                    break;
                default:
                    /* must not occur */
                    /*
                     * To avoid compiler warnings that value & length may be
                     * used without having been initialized, we set them here.
                     * In reality, this is unreachable code.
                     * Not having a default branch also causes warnings with
                     * some compilers.
                     */
                    value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
                    length=0;
                    break;
                }

                /* is this code point assigned, or do we use fallbacks? */
                if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)!=0 ||
                     (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
                ) {
                    /*
                     * We allow a 0 byte output if the "assigned" bit is set for this entry.
                     * There is no way with this data structure for fallback output
                     * to be a zero byte.
                     */

                    /* unassigned is also the jump target of the utf8Friendly lookup and of getTrail above */
unassigned:
                    /* try an extension mapping */
                    /* pArgs->source remembers the start so that the input consumed by _extFromU() can be counted */
                    pArgs->source=source;
                    c=_extFromU(cnv, cnv->sharedData,
                                c, &source, sourceLimit,
                                &target, target+targetCapacity,
                                &offsets, sourceIndex,
                                pArgs->flush,
                                pErrorCode);
                    nextSourceIndex+=(int32_t)(source-pArgs->source);
                    prevLength=cnv->fromUnicodeStatus; /* restore SISO state */

                    if(U_FAILURE(*pErrorCode)) {
                        /* not mappable or buffer overflow */
                        break;
                    } else {
                        /* a mapping was written to the target, continue */

                        /* recalculate the targetCapacity after an extension mapping */
                        targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);

                        /* normal end of conversion: prepare for a new character */
                        if(offsets!=nullptr) {
                            prevSourceIndex=sourceIndex;
                            sourceIndex=nextSourceIndex;
                        }
                        continue;
                    }
                }
            }

            /* write the output character bytes from value and length */
            /* from the first if in the loop we know that targetCapacity>0 */
            if(length<=targetCapacity) {
                if(offsets==nullptr) {
                    switch(length) {
                        /* each branch falls through to the next one */
                    case 4:
                        *target++=(uint8_t)(value>>24);
                        U_FALLTHROUGH;
                    case 3:
                        *target++=(uint8_t)(value>>16);
                        U_FALLTHROUGH;
                    case 2:
                        *target++=(uint8_t)(value>>8);
                        U_FALLTHROUGH;
                    case 1:
                        *target++=(uint8_t)value;
                        U_FALLTHROUGH;
                    default:
                        /* will never occur */
                        break;
                    }
                } else {
                    /* same as above, with one offset per output byte */
                    switch(length) {
                        /* each branch falls through to the next one */
                    case 4:
                        *target++=(uint8_t)(value>>24);
                        *offsets++=sourceIndex;
                        U_FALLTHROUGH;
                    case 3:
                        *target++=(uint8_t)(value>>16);
                        *offsets++=sourceIndex;
                        U_FALLTHROUGH;
                    case 2:
                        *target++=(uint8_t)(value>>8);
                        *offsets++=sourceIndex;
                        U_FALLTHROUGH;
                    case 1:
                        *target++=(uint8_t)value;
                        *offsets++=sourceIndex;
                        U_FALLTHROUGH;
                    default:
                        /* will never occur */
                        break;
                    }
                }
                targetCapacity-=length;
            } else {
                uint8_t *charErrorBuffer;

                /*
                 * We actually do this backwards here:
                 * In order to save an intermediate variable, we output
                 * first to the overflow buffer what does not fit into the
                 * regular target.
                 */
                /* we know that 1<=targetCapacity<length<=4 */
                length-=targetCapacity;
                charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
                switch(length) {
                    /* each branch falls through to the next one */
                case 3:
                    *charErrorBuffer++=(uint8_t)(value>>16);
                    U_FALLTHROUGH;
                case 2:
                    *charErrorBuffer++=(uint8_t)(value>>8);
                    U_FALLTHROUGH;
                case 1:
                    *charErrorBuffer=(uint8_t)value;
                    U_FALLTHROUGH;
                default:
                    /* will never occur */
                    break;
                }
                cnv->charErrorBufferLength=(int8_t)length;

                /* now output what fits into the regular target */
                value>>=8*length; /* length was reduced by targetCapacity */
                switch(targetCapacity) {
                    /* each branch falls through to the next one */
                case 3:
                    *target++=(uint8_t)(value>>16);
                    if(offsets!=nullptr) {
                        *offsets++=sourceIndex;
                    }
                    U_FALLTHROUGH;
                case 2:
                    *target++=(uint8_t)(value>>8);
                    if(offsets!=nullptr) {
                        *offsets++=sourceIndex;
                    }
                    U_FALLTHROUGH;
                case 1:
                    *target++=(uint8_t)value;
                    if(offsets!=nullptr) {
                        *offsets++=sourceIndex;
                    }
                    U_FALLTHROUGH;
                default:
                    /* will never occur */
                    break;
                }

                /* target overflow */
                targetCapacity=0;
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                c=0;
                break;
            }

            /* normal end of conversion: prepare for a new character */
            c=0;
            if(offsets!=nullptr) {
                prevSourceIndex=sourceIndex;
                sourceIndex=nextSourceIndex;
            }
            continue;
        } else {
            /* target is full */
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }

    /*
     * the end of the input stream and detection of truncated input
     * are handled by the framework, but for EBCDIC_STATEFUL conversion
     * we need to emit an SI at the very end
     *
     * conditions:
     *   successful
     *   EBCDIC_STATEFUL in DBCS mode
     *   end of input and no truncated input
     */
    if( U_SUCCESS(*pErrorCode) &&
        outputType==MBCS_OUTPUT_2_SISO && prevLength==2 &&
        pArgs->flush && source>=sourceLimit && c==0
    ) {
        /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */
        if(targetCapacity>0) {
            *target++ = siBytes[0];
            if (siLength == 2) {
                if (targetCapacity<2) {
                    /* only the first SI byte fits: the second one goes to the overflow buffer */
                    cnv->charErrorBuffer[0] = siBytes[1];
                    cnv->charErrorBufferLength=1;
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                } else {
                    *target++ = siBytes[1];
                }
            }
            /*
             * NOTE(review): a single offset is written here even when two SI bytes
             * were written to the target above, unlike the main loop which writes
             * one offset per output byte -- confirm whether this is intended.
             */
            if(offsets!=nullptr) {
                /* set the last source character's index (sourceIndex points at sourceLimit now) */
                *offsets++=prevSourceIndex;
            }
        } else {
            /* target is full */
            cnv->charErrorBuffer[0] = siBytes[0];
            if (siLength == 2) {
                cnv->charErrorBuffer[1] = siBytes[1];
            }
            cnv->charErrorBufferLength=siLength;
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
        }
        prevLength=1; /* we switched into SBCS */
    }

    /* set the converter state back into UConverter */
    cnv->fromUChar32=c;
    cnv->fromUnicodeStatus=prevLength;

    /* write back the updated pointers */
    pArgs->source=source;
    pArgs->target=(char *)target;
    pArgs->offsets=offsets;
}

/*
 * This is another simple conversion function for internal use by other
 * conversion implementations.
 * It does not use the converter state nor call callbacks.
 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
 * It handles conversion extensions but not GB 18030.
4819 * 4820 * It converts one single Unicode code point into codepage bytes, encoded 4821 * as one 32-bit value. The function returns the number of bytes in *pValue: 4822 * 1..4 the number of bytes in *pValue 4823 * 0 unassigned (*pValue undefined) 4824 * -1 illegal (currently not used, *pValue undefined) 4825 * 4826 * *pValue will contain the resulting bytes with the last byte in bits 7..0, 4827 * the second to last byte in bits 15..8, etc. 4828 * Currently, the function assumes but does not check that 0<=c<=0x10ffff. 4829 */ 4830 U_CFUNC int32_t 4831 ucnv_MBCSFromUChar32(UConverterSharedData *sharedData, 4832 UChar32 c, uint32_t *pValue, 4833 UBool useFallback) { 4834 const int32_t *cx; 4835 const uint16_t *table; 4836 #if 0 4837 /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */ 4838 const uint8_t *p; 4839 #endif 4840 uint32_t stage2Entry; 4841 uint32_t value; 4842 int32_t length; 4843 4844 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 4845 if(c<=0xffff || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 4846 table=sharedData->mbcs.fromUnicodeTable; 4847 4848 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ 4849 if(sharedData->mbcs.outputType==MBCS_OUTPUT_1) { 4850 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); 4851 /* is this code point assigned, or do we use fallbacks? */ 4852 if(useFallback ? 
value>=0x800 : value>=0xc00) { 4853 *pValue=value&0xff; 4854 return 1; 4855 } 4856 } else /* outputType!=MBCS_OUTPUT_1 */ { 4857 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 4858 4859 /* get the bytes and the length for the output */ 4860 switch(sharedData->mbcs.outputType) { 4861 case MBCS_OUTPUT_2: 4862 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4863 if(value<=0xff) { 4864 length=1; 4865 } else { 4866 length=2; 4867 } 4868 break; 4869 #if 0 4870 /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */ 4871 case MBCS_OUTPUT_DBCS_ONLY: 4872 /* table with single-byte results, but only DBCS mappings used */ 4873 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4874 if(value<=0xff) { 4875 /* no mapping or SBCS result, not taken for DBCS-only */ 4876 value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */ 4877 length=0; 4878 } else { 4879 length=2; 4880 } 4881 break; 4882 case MBCS_OUTPUT_3: 4883 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4884 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4885 if(value<=0xff) { 4886 length=1; 4887 } else if(value<=0xffff) { 4888 length=2; 4889 } else { 4890 length=3; 4891 } 4892 break; 4893 case MBCS_OUTPUT_4: 4894 value=MBCS_VALUE_4_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4895 if(value<=0xff) { 4896 length=1; 4897 } else if(value<=0xffff) { 4898 length=2; 4899 } else if(value<=0xffffff) { 4900 length=3; 4901 } else { 4902 length=4; 4903 } 4904 break; 4905 case MBCS_OUTPUT_3_EUC: 4906 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4907 /* EUC 16-bit fixed-length representation */ 4908 if(value<=0xff) { 4909 length=1; 4910 } else if((value&0x8000)==0) { 4911 value|=0x8e8000; 4912 length=3; 4913 } else if((value&0x80)==0) { 4914 value|=0x8f0080; 4915 length=3; 4916 } else { 4917 length=2; 4918 } 4919 break; 4920 case 
MBCS_OUTPUT_4_EUC: 4921 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4922 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4923 /* EUC 16-bit fixed-length representation applied to the first two bytes */ 4924 if(value<=0xff) { 4925 length=1; 4926 } else if(value<=0xffff) { 4927 length=2; 4928 } else if((value&0x800000)==0) { 4929 value|=0x8e800000; 4930 length=4; 4931 } else if((value&0x8000)==0) { 4932 value|=0x8f008000; 4933 length=4; 4934 } else { 4935 length=3; 4936 } 4937 break; 4938 #endif 4939 default: 4940 /* must not occur */ 4941 return -1; 4942 } 4943 4944 /* is this code point assigned, or do we use fallbacks? */ 4945 if( MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) || 4946 (FROM_U_USE_FALLBACK(useFallback, c) && value!=0) 4947 ) { 4948 /* 4949 * We allow a 0 byte output if the "assigned" bit is set for this entry. 4950 * There is no way with this data structure for fallback output 4951 * to be a zero byte. 4952 */ 4953 /* assigned */ 4954 *pValue=value; 4955 return length; 4956 } 4957 } 4958 } 4959 4960 cx=sharedData->mbcs.extIndexes; 4961 if(cx!=nullptr) { 4962 length=ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback); 4963 return length>=0 ? length : -length; /* return abs(length); */ 4964 } 4965 4966 /* unassigned */ 4967 return 0; 4968 } 4969 4970 4971 #if 0 4972 /* 4973 * This function has been moved to ucnv2022.c for inlining. 4974 * This implementation is here only for documentation purposes 4975 */ 4976 4977 /** 4978 * This version of ucnv_MBCSFromUChar32() is optimized for single-byte codepages. 4979 * It does not handle the EBCDIC swaplfnl option (set in UConverter). 4980 * It does not handle conversion extensions (_extFromU()). 4981 * 4982 * It returns the codepage byte for the code point, or -1 if it is unassigned. 
4983 */ 4984 U_CFUNC int32_t 4985 ucnv_MBCSSingleFromUChar32(UConverterSharedData *sharedData, 4986 UChar32 c, 4987 UBool useFallback) { 4988 const uint16_t *table; 4989 int32_t value; 4990 4991 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 4992 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 4993 return -1; 4994 } 4995 4996 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ 4997 table=sharedData->mbcs.fromUnicodeTable; 4998 4999 /* get the byte for the output */ 5000 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); 5001 /* is this code point assigned, or do we use fallbacks? */ 5002 if(useFallback ? value>=0x800 : value>=0xc00) { 5003 return value&0xff; 5004 } else { 5005 return -1; 5006 } 5007 } 5008 #endif 5009 5010 /* MBCS-from-UTF-8 conversion functions ------------------------------------- */ 5011 5012 /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... 
*/
/*
 * The table is indexed by the number of bytes in the UTF-8 sequence
 * (see c-=utf8_offsets[toULength] below).
 * The conversion functions accumulate every byte of a sequence, lead byte
 * included, with c=(c<<6)+b and no masking; subtracting the entry for the
 * sequence length then removes all lead/trail marker bits at once.
 * For example, for 2 bytes: (0xC0<<6)+0x80 = 0x3080.
 */
static const UChar32
utf8_offsets[5]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };

/*
 * Converts UTF-8 input directly to a single-byte charset.
 *
 * pFromUArgs: supplies the target converter (cnv), the output buffer
 *             (target/targetLimit) and the flush flag.
 * pToUArgs:   supplies the UTF-8 converter (utf8) and the input buffer
 *             (source/sourceLimit). The state of a UTF-8 sequence that is split
 *             across buffers is read from and stored into utf8->toUBytes,
 *             utf8->toULength, utf8->toUnicodeStatus and utf8->mode.
 * pErrorCode: set to
 *             - U_ILLEGAL_CHAR_FOUND for an illegal UTF-8 byte sequence
 *               (the offending bytes are stored in utf8->toUBytes),
 *             - U_BUFFER_OVERFLOW_ERROR when the target is full while input remains,
 *             - U_USING_DEFAULT_WARNING when an extension lookup yields a partial
 *               match, which tells the caller to revert to pivoting,
 *             - or whatever _extFromU() reports; in that case the code point is
 *               stored in cnv->fromUChar32.
 *
 * On every exit path pToUArgs->source and pFromUArgs->target are updated
 * to reflect the bytes consumed and written.
 */
static void U_CALLCONV
ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
                  UConverterToUnicodeArgs *pToUArgs,
                  UErrorCode *pErrorCode) {
    UConverter *utf8, *cnv;
    const uint8_t *source, *sourceLimit;
    uint8_t *target;
    int32_t targetCapacity;

    const uint16_t *table, *sbcsIndex;
    const uint16_t *results;

    /*
     * oldToULength: number of bytes of the current sequence that came from a previous buffer
     * toULength:    number of bytes of the current sequence collected so far
     * toULimit:     expected total number of bytes of the current sequence
     */
    int8_t oldToULength, toULength, toULimit;

    UChar32 c;
    uint8_t b, t1, t2;

    uint32_t asciiRoundtrips;
    uint16_t value, minValue = 0;
    UBool hasSupplementary;

    /* set up the local pointers */
    utf8=pToUArgs->converter;
    cnv=pFromUArgs->converter;
    source=(uint8_t *)pToUArgs->source;
    sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
    target = reinterpret_cast<uint8_t*>(pFromUArgs->target);
    targetCapacity = static_cast<int32_t>(pFromUArgs->targetLimit - pFromUArgs->target);

    table=cnv->sharedData->mbcs.fromUnicodeTable;
    sbcsIndex=cnv->sharedData->mbcs.sbcsIndex;
    /* the swap-LF/NL option selects an alternate result table */
    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
        results = reinterpret_cast<uint16_t*>(cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes);
    } else {
        results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
    }
    asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;

    /* a result word below minValue is treated as unassigned (see value>=minValue below) */
    if(cnv->useFallback) {
        /* use all roundtrip and fallback results */
        minValue=0x800;
    } else {
        /* use only roundtrips and fallbacks from private-use characters */
        minValue=0xc00;
    }
    hasSupplementary = static_cast<UBool>(cnv->sharedData->mbcs.unicodeMask & UCNV_HAS_SUPPLEMENTARY);

    /* get the converter state from the UTF-8 UConverter */
    if(utf8->toULength > 0) {
        /* a partial sequence was saved: utf8->mode holds its expected length, toUnicodeStatus the accumulated bytes */
        toULength=oldToULength=utf8->toULength;
        toULimit = static_cast<int8_t>(utf8->mode);
        c = static_cast<UChar32>(utf8->toUnicodeStatus);
    } else {
        toULength=oldToULength=toULimit=0;
        c = 0;
    }

    // The conversion loop checks source<sourceLimit only once per 1/2/3-byte character.
    // If the buffer ends with a truncated 2- or 3-byte sequence,
    // then we reduce the sourceLimit to before that,
    // and collect the remaining bytes after the conversion loop.
    {
        // Do not go back into the bytes that will be read for finishing a partial
        // sequence from the previous buffer.
        int32_t length = static_cast<int32_t>(sourceLimit - source) - (toULimit - oldToULength);
        if(length>0) {
            uint8_t b1=*(sourceLimit-1);
            if(U8_IS_SINGLE(b1)) {
                // common ASCII character
            } else if(U8_IS_TRAIL(b1) && length>=2) {
                uint8_t b2=*(sourceLimit-2);
                if(0xe0<=b2 && b2<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
                    // truncated 3-byte sequence
                    sourceLimit-=2;
                }
            } else if(0xc2<=b1 && b1<0xf0) {
                // truncated 2- or 3-byte sequence
                --sourceLimit;
            }
        }
    }

    /*
     * Resume a partial sequence from the previous buffer:
     * clear the saved state and jump into the byte-collecting loop.
     */
    if(c!=0 && targetCapacity>0) {
        utf8->toUnicodeStatus=0;
        utf8->toULength=0;
        goto moreBytes;
        /*
         * Note: We could avoid the goto by duplicating some of the moreBytes
         * code, but only up to the point of collecting a complete UTF-8
         * sequence; then recurse for the toUBytes[toULength]
         * and then continue with normal conversion.
         *
         * If so, move this code to just after initializing the minimum
         * set of local variables for reading the UTF-8 input
         * (utf8, source, target, limits but not cnv, table, minValue, etc.).
         *
         * Potential advantages:
         * - avoid the goto
         * - oldToULength could become a local variable in just those code blocks
         *   that deal with buffer boundaries
         * - possibly faster if the goto prevents some compiler optimizations
         *   (this would need measuring to confirm)
         * Disadvantage:
         * - code duplication
         */
    }

    /* conversion loop */
    while(source<sourceLimit) {
        if(targetCapacity>0) {
            b=*source++;
            if(U8_IS_SINGLE(b)) {
                /* convert ASCII */
                if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
                    *target++ = b;
                    --targetCapacity;
                    continue;
                } else {
                    c=b;
                    value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, 0, c);
                }
            } else {
                if(b<0xe0) {
                    if( /* handle U+0080..U+07FF inline */
                        b>=0xc2 &&
                        (t1 = static_cast<uint8_t>(*source - 0x80)) <= 0x3f
                    ) {
                        c=b&0x1f;
                        ++source;
                        value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t1);
                        if(value>=minValue) {
                            *target++ = static_cast<uint8_t>(value);
                            --targetCapacity;
                            continue;
                        } else {
                            /* assemble the full code point for the extension lookup below */
                            c=(c<<6)|t1;
                        }
                    } else {
                        /* c<0 routes this byte to the general path below */
                        c=-1;
                    }
                } else if(b==0xe0) {
                    if( /* handle U+0800..U+0FFF inline */
                        (t1 = static_cast<uint8_t>(source[0] - 0x80)) <= 0x3f && t1 >= 0x20 &&
                        (t2 = static_cast<uint8_t>(source[1] - 0x80)) <= 0x3f
                    ) {
                        c=t1;
                        source+=2;
                        value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t2);
                        if(value>=minValue) {
                            *target++ = static_cast<uint8_t>(value);
                            --targetCapacity;
                            continue;
                        } else {
                            /* assemble the full code point for the extension lookup below */
                            c=(c<<6)|t2;
                        }
                    } else {
                        c=-1;
                    }
                } else {
                    c=-1;
                }

                if(c<0) {
                    /* handle "complicated" and error cases, and continuing partial characters */
                    oldToULength=0;
                    toULength=1;
                    toULimit=U8_COUNT_BYTES_NON_ASCII(b);
                    c=b;
moreBytes:
                    while(toULength<toULimit) {
                        /*
                         * The sourceLimit may have been adjusted before the conversion loop
                         * to stop before a truncated sequence.
                         * Here we need to use the real limit in case we have two truncated
                         * sequences at the end.
                         * See ticket #7492.
                         */
                        if(source<(uint8_t *)pToUArgs->sourceLimit) {
                            b=*source;
                            if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) {
                                ++source;
                                ++toULength;
                                c=(c<<6)+b;
                            } else {
                                break; /* sequence too short, stop with toULength<toULimit */
                            }
                        } else {
                            /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
                            /* rewind to the first byte of this sequence that was read from the current buffer */
                            source-=(toULength-oldToULength);
                            while(oldToULength<toULength) {
                                utf8->toUBytes[oldToULength++]=*source++;
                            }
                            utf8->toUnicodeStatus=c;
                            utf8->toULength=toULength;
                            utf8->mode=toULimit;
                            pToUArgs->source=(char *)source;
                            pFromUArgs->target = reinterpret_cast<char*>(target);
                            return;
                        }
                    }

                    if(toULength==toULimit) {
                        /* complete sequence: strip the UTF-8 marker bits to get the code point */
                        c-=utf8_offsets[toULength];
                        if(toULength<=3) {  /* BMP */
                            value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
                        } else {
                            /* supplementary code point */
                            if(!hasSupplementary) {
                                /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
                                value=0;
                            } else {
                                value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
                            }
                        }
                    } else {
                        /* error handling: illegal UTF-8 byte sequence */
                        source-=(toULength-oldToULength);
                        while(oldToULength<toULength) {
                            utf8->toUBytes[oldToULength++]=*source++;
                        }
                        utf8->toULength=toULength;
                        pToUArgs->source=(char *)source;
                        pFromUArgs->target = reinterpret_cast<char*>(target);
                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                        return;
                    }
                }
            }

            if(value>=minValue) {
                /* output the mapping for c */
                *target++ = static_cast<uint8_t>(value);
                --targetCapacity;
            } else {
                /* value<minValue means c is unassigned (unmappable) */
                /*
                 * Try an extension mapping.
                 * Pass in no source because we don't have UTF-16 input.
                 * If we have a partial match on c, we will return and revert
                 * to UTF-8->UTF-16->charset conversion.
                 */
                static const char16_t nul=0;
                const char16_t *noSource=&nul;
                c=_extFromU(cnv, cnv->sharedData,
                            c, &noSource, noSource,
                            &target, target+targetCapacity,
                            nullptr, -1,
                            pFromUArgs->flush,
                            pErrorCode);

                if(U_FAILURE(*pErrorCode)) {
                    /* not mappable or buffer overflow */
                    cnv->fromUChar32=c;
                    break;
                } else if(cnv->preFromUFirstCP>=0) {
                    /*
                     * Partial match, return and revert to pivoting.
                     * In normal from-UTF-16 conversion, we would just continue
                     * but then exit the loop because the extension match would
                     * have consumed the source.
                     */
                    *pErrorCode=U_USING_DEFAULT_WARNING;
                    break;
                } else {
                    /* a mapping was written to the target, continue */

                    /* recalculate the targetCapacity after an extension mapping */
                    targetCapacity = static_cast<int32_t>(pFromUArgs->targetLimit - reinterpret_cast<char*>(target));
                }
            }
        } else {
            /* target is full */
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }

    /*
     * The sourceLimit may have been adjusted before the conversion loop
     * to stop before a truncated sequence.
     * If so, then collect the truncated sequence now.
     */
    /*
     * The bytes are accumulated unmasked with c=(c<<6)+b, the same way as at
     * moreBytes, and saved in the UTF-8 converter so that the next call can resume.
     */
    if(U_SUCCESS(*pErrorCode) &&
            cnv->preFromUFirstCP<0 &&
            source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
        c=utf8->toUBytes[0]=b=*source++;
        toULength=1;
        toULimit=U8_COUNT_BYTES(b);
        while(source<sourceLimit) {
            utf8->toUBytes[toULength++]=b=*source++;
            c=(c<<6)+b;
        }
        utf8->toUnicodeStatus=c;
        utf8->toULength=toULength;
        utf8->mode=toULimit;
    }

    /* write back the updated pointers */
    pToUArgs->source=(char *)source;
    pFromUArgs->target = reinterpret_cast<char*>(target);
}

/*
 * Converts UTF-8 input directly to a charset whose results are 16-bit values
 * that are written as one byte (value<=0xff) or two bytes.
 *
 * pFromUArgs: supplies the target converter (cnv), the output buffer
 *             (target/targetLimit) and the flush flag.
 * pToUArgs:   supplies the UTF-8 converter (utf8) and the input buffer
 *             (source/sourceLimit). The state of a UTF-8 sequence that is split
 *             across buffers is read from and stored into utf8->toUBytes,
 *             utf8->toULength, utf8->toUnicodeStatus and utf8->mode.
 * pErrorCode: set to
 *             - U_ILLEGAL_CHAR_FOUND for an illegal UTF-8 byte sequence
 *               (the offending bytes are stored in utf8->toUBytes),
 *             - U_BUFFER_OVERFLOW_ERROR when the target is full while input remains,
 *               or when only the first of two output bytes fits
 *               (the second byte is then stored in cnv->charErrorBuffer),
 *             - U_USING_DEFAULT_WARNING when an extension lookup yields a partial
 *               match, which tells the caller to revert to pivoting,
 *             - or whatever _extFromU() reports; in that case the code point is
 *               stored in cnv->fromUChar32.
 *
 * On every exit path pToUArgs->source and pFromUArgs->target are updated
 * to reflect the bytes consumed and written.
 */
static void U_CALLCONV
ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
                  UConverterToUnicodeArgs *pToUArgs,
                  UErrorCode *pErrorCode) {
    UConverter *utf8, *cnv;
    const uint8_t *source, *sourceLimit;
    uint8_t *target;
    int32_t targetCapacity;

    const uint16_t *table, *mbcsIndex;
    const uint16_t *results;

    /*
     * oldToULength: number of bytes of the current sequence that came from a previous buffer
     * toULength:    number of bytes of the current sequence collected so far
     * toULimit:     expected total number of bytes of the current sequence
     */
    int8_t oldToULength, toULength, toULimit;

    UChar32 c;
    uint8_t b, t1, t2;

    uint32_t stage2Entry;
    uint32_t asciiRoundtrips;
    uint16_t value = 0;
    UBool hasSupplementary;

    /* set up the local pointers */
    utf8=pToUArgs->converter;
    cnv=pFromUArgs->converter;
    source=(uint8_t *)pToUArgs->source;
    sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
    target = reinterpret_cast<uint8_t*>(pFromUArgs->target);
    targetCapacity = static_cast<int32_t>(pFromUArgs->targetLimit - pFromUArgs->target);

    table=cnv->sharedData->mbcs.fromUnicodeTable;
    mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
    /* the swap-LF/NL option selects an alternate result table */
    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
        results = reinterpret_cast<uint16_t*>(cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes);
    } else {
        results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
    }
    asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;

    hasSupplementary = static_cast<UBool>(cnv->sharedData->mbcs.unicodeMask & UCNV_HAS_SUPPLEMENTARY);

    /* get the converter state from the UTF-8 UConverter */
    if(utf8->toULength > 0) {
        /* a partial sequence was saved: utf8->mode holds its expected length, toUnicodeStatus the accumulated bytes */
        toULength=oldToULength=utf8->toULength;
        toULimit = static_cast<int8_t>(utf8->mode);
        c = static_cast<UChar32>(utf8->toUnicodeStatus);
    } else {
        toULength=oldToULength=toULimit=0;
        c = 0;
    }

    // The conversion loop checks source<sourceLimit only once per 1/2/3-byte character.
    // If the buffer ends with a truncated 2- or 3-byte sequence,
    // then we reduce the sourceLimit to before that,
    // and collect the remaining bytes after the conversion loop.
    {
        // Do not go back into the bytes that will be read for finishing a partial
        // sequence from the previous buffer.
        int32_t length = static_cast<int32_t>(sourceLimit - source) - (toULimit - oldToULength);
        if(length>0) {
            uint8_t b1=*(sourceLimit-1);
            if(U8_IS_SINGLE(b1)) {
                // common ASCII character
            } else if(U8_IS_TRAIL(b1) && length>=2) {
                uint8_t b2=*(sourceLimit-2);
                if(0xe0<=b2 && b2<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
                    // truncated 3-byte sequence
                    sourceLimit-=2;
                }
            } else if(0xc2<=b1 && b1<0xf0) {
                // truncated 2- or 3-byte sequence
                --sourceLimit;
            }
        }
    }

    /*
     * Resume a partial sequence from the previous buffer:
     * clear the saved state and jump into the byte-collecting loop.
     */
    if(c!=0 && targetCapacity>0) {
        utf8->toUnicodeStatus=0;
        utf8->toULength=0;
        goto moreBytes;
        /* See note in ucnv_SBCSFromUTF8() about this goto. */
    }

    /* conversion loop */
    while(source<sourceLimit) {
        if(targetCapacity>0) {
            b=*source++;
            if(U8_IS_SINGLE(b)) {
                /* convert ASCII */
                if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
                    *target++=b;
                    --targetCapacity;
                    continue;
                } else {
                    value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, 0, b);
                    if(value==0) {
                        c=b;
                        goto unassigned;
                    }
                }
            } else {
                if(b>=0xe0) {
                    if( /* handle U+0800..U+D7FF inline */
                        b<=0xed &&  // do not assume maxFastUChar>0xd7ff
                        U8_IS_VALID_LEAD3_AND_T1(b, t1=source[0]) &&
                        (t2 = static_cast<uint8_t>(source[1] - 0x80)) <= 0x3f
                    ) {
                        c=((b&0xf)<<6)|(t1&0x3f);
                        source+=2;
                        value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t2);
                        if(value==0) {
                            /* assemble the full code point for the extension lookup */
                            c=(c<<6)|t2;
                            goto unassigned;
                        }
                    } else {
                        /* c<0 routes this byte to the general path below */
                        c=-1;
                    }
                } else {
                    if( /* handle U+0080..U+07FF inline */
                        b>=0xc2 &&
                        (t1 = static_cast<uint8_t>(*source - 0x80)) <= 0x3f
                    ) {
                        c=b&0x1f;
                        ++source;
                        value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t1);
                        if(value==0) {
                            /* assemble the full code point for the extension lookup */
                            c=(c<<6)|t1;
                            goto unassigned;
                        }
                    } else {
                        c=-1;
                    }
                }

                if(c<0) {
                    /* handle "complicated" and error cases, and continuing partial characters */
                    oldToULength=0;
                    toULength=1;
                    toULimit=U8_COUNT_BYTES_NON_ASCII(b);
                    c=b;
moreBytes:
                    while(toULength<toULimit) {
                        /*
                         * The sourceLimit may have been adjusted before the conversion loop
                         * to stop before a truncated sequence.
                         * Here we need to use the real limit in case we have two truncated
                         * sequences at the end.
                         * See ticket #7492.
                         */
                        if(source<(uint8_t *)pToUArgs->sourceLimit) {
                            b=*source;
                            if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) {
                                ++source;
                                ++toULength;
                                c=(c<<6)+b;
                            } else {
                                break; /* sequence too short, stop with toULength<toULimit */
                            }
                        } else {
                            /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
                            /* rewind to the first byte of this sequence that was read from the current buffer */
                            source-=(toULength-oldToULength);
                            while(oldToULength<toULength) {
                                utf8->toUBytes[oldToULength++]=*source++;
                            }
                            utf8->toUnicodeStatus=c;
                            utf8->toULength=toULength;
                            utf8->mode=toULimit;
                            pToUArgs->source=(char *)source;
                            pFromUArgs->target = reinterpret_cast<char*>(target);
                            return;
                        }
                    }

                    if(toULength==toULimit) {
                        /* complete sequence: strip the UTF-8 marker bits to get the code point */
                        c-=utf8_offsets[toULength];
                        if(toULength<=3) {  /* BMP */
                            stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
                        } else {
                            /* supplementary code point */
                            if(!hasSupplementary) {
                                /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
                                stage2Entry=0;
                            } else {
                                stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
                            }
                        }
                    } else {
                        /* error handling: illegal UTF-8 byte sequence */
                        source-=(toULength-oldToULength);
                        while(oldToULength<toULength) {
                            utf8->toUBytes[oldToULength++]=*source++;
                        }
                        utf8->toULength=toULength;
                        pToUArgs->source=(char *)source;
                        pFromUArgs->target = reinterpret_cast<char*>(target);
                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                        return;
                    }

                    /* get the bytes and the length for the output */
                    /* MBCS_OUTPUT_2 */
                    value=MBCS_VALUE_2_FROM_STAGE_2(results, stage2Entry, c);

                    /* is this code point assigned, or do we use fallbacks? */
                    if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
                         (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
                    ) {
                        goto unassigned;
                    }
                }
            }

            /* write the output character bytes from value and length */
            /* from the first if in the loop we know that targetCapacity>0 */
            if(value<=0xff) {
                /* this is easy because we know that there is enough space */
                *target++ = static_cast<uint8_t>(value);
                --targetCapacity;
            } else /* length==2 */ {
                *target++ = static_cast<uint8_t>(value >> 8);
                /* targetCapacity has not been decremented yet for the lead byte just written */
                if(2<=targetCapacity) {
                    *target++ = static_cast<uint8_t>(value);
                    targetCapacity-=2;
                } else {
                    /* only the lead byte fit: park the trail byte in the converter's overflow buffer */
                    cnv->charErrorBuffer[0] = static_cast<char>(value);
                    cnv->charErrorBufferLength=1;

                    /* target overflow */
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                    break;
                }
            }
            continue;

unassigned:
            {
                /*
                 * Try an extension mapping.
                 * Pass in no source because we don't have UTF-16 input.
                 * If we have a partial match on c, we will return and revert
                 * to UTF-8->UTF-16->charset conversion.
                 */
                static const char16_t nul=0;
                const char16_t *noSource=&nul;
                c=_extFromU(cnv, cnv->sharedData,
                            c, &noSource, noSource,
                            &target, target+targetCapacity,
                            nullptr, -1,
                            pFromUArgs->flush,
                            pErrorCode);

                if(U_FAILURE(*pErrorCode)) {
                    /* not mappable or buffer overflow */
                    cnv->fromUChar32=c;
                    break;
                } else if(cnv->preFromUFirstCP>=0) {
                    /*
                     * Partial match, return and revert to pivoting.
                     * In normal from-UTF-16 conversion, we would just continue
                     * but then exit the loop because the extension match would
                     * have consumed the source.
                     */
                    *pErrorCode=U_USING_DEFAULT_WARNING;
                    break;
                } else {
                    /* a mapping was written to the target, continue */

                    /* recalculate the targetCapacity after an extension mapping */
                    targetCapacity = static_cast<int32_t>(pFromUArgs->targetLimit - reinterpret_cast<char*>(target));
                    continue;
                }
            }
        } else {
            /* target is full */
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }

    /*
     * The sourceLimit may have been adjusted before the conversion loop
     * to stop before a truncated sequence.
     * If so, then collect the truncated sequence now.
     */
    /*
     * The bytes are accumulated unmasked with c=(c<<6)+b, the same way as at
     * moreBytes, and saved in the UTF-8 converter so that the next call can resume.
     */
    if(U_SUCCESS(*pErrorCode) &&
            cnv->preFromUFirstCP<0 &&
            source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
        c=utf8->toUBytes[0]=b=*source++;
        toULength=1;
        toULimit=U8_COUNT_BYTES(b);
        while(source<sourceLimit) {
            utf8->toUBytes[toULength++]=b=*source++;
            c=(c<<6)+b;
        }
        utf8->toUnicodeStatus=c;
        utf8->toULength=toULength;
        utf8->mode=toULimit;
    }

    /* write back the updated pointers */
    pToUArgs->source=(char *)source;
    pFromUArgs->target = reinterpret_cast<char*>(target);
}

/* miscellaneous ------------------------------------------------------------ */

/*
 * Fills starters[0..255]: starters[i] is set according to whether byte i
 * causes a state transition from the state selected by mbcs.dbcsOnlyState.
 * The UErrorCode parameter is unnamed and not used.
 */
static void U_CALLCONV
ucnv_MBCSGetStarters(const UConverter* cnv,
                 UBool starters[256],
                 UErrorCode *) {
    const int32_t *state0;
    int i;

    state0=cnv->sharedData->mbcs.stateTable[cnv->sharedData->mbcs.dbcsOnlyState];
    for(i=0; i<256; ++i) {
        /* all bytes that cause a state transition from state 0 are lead bytes */
        starters[i] = static_cast<UBool>(MBCS_ENTRY_IS_TRANSITION(state0[i]));
    }
}

/*
 * This is an internal function that allows other converter implementations
 * to check whether a byte is a lead byte.
5641 */ 5642 U_CFUNC UBool 5643 ucnv_MBCSIsLeadByte(UConverterSharedData *sharedData, char byte) { 5644 return MBCS_ENTRY_IS_TRANSITION(sharedData->mbcs.stateTable[0][(uint8_t)byte]); 5645 } 5646 5647 static void U_CALLCONV 5648 ucnv_MBCSWriteSub(UConverterFromUnicodeArgs *pArgs, 5649 int32_t offsetIndex, 5650 UErrorCode *pErrorCode) { 5651 UConverter *cnv=pArgs->converter; 5652 char *p, *subchar; 5653 char buffer[4]; 5654 int32_t length; 5655 5656 /* first, select between subChar and subChar1 */ 5657 if( cnv->subChar1!=0 && 5658 (cnv->sharedData->mbcs.extIndexes!=nullptr ? 5659 cnv->useSubChar1 : 5660 (cnv->invalidUCharBuffer[0]<=0xff)) 5661 ) { 5662 /* select subChar1 if it is set (not 0) and the unmappable Unicode code point is up to U+00ff (IBM MBCS behavior) */ 5663 subchar = reinterpret_cast<char*>(&cnv->subChar1); 5664 length=1; 5665 } else { 5666 /* select subChar in all other cases */ 5667 subchar = reinterpret_cast<char*>(cnv->subChars); 5668 length=cnv->subCharLen; 5669 } 5670 5671 /* reset the selector for the next code point */ 5672 cnv->useSubChar1=false; 5673 5674 if (cnv->sharedData->mbcs.outputType == MBCS_OUTPUT_2_SISO) { 5675 p=buffer; 5676 5677 /* fromUnicodeStatus contains prevLength */ 5678 switch(length) { 5679 case 1: 5680 if(cnv->fromUnicodeStatus==2) { 5681 /* DBCS mode and SBCS sub char: change to SBCS */ 5682 cnv->fromUnicodeStatus=1; 5683 *p++=UCNV_SI; 5684 } 5685 *p++=subchar[0]; 5686 break; 5687 case 2: 5688 if(cnv->fromUnicodeStatus<=1) { 5689 /* SBCS mode and DBCS sub char: change to DBCS */ 5690 cnv->fromUnicodeStatus=2; 5691 *p++=UCNV_SO; 5692 } 5693 *p++=subchar[0]; 5694 *p++=subchar[1]; 5695 break; 5696 default: 5697 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 5698 return; 5699 } 5700 subchar=buffer; 5701 length = static_cast<int32_t>(p - buffer); 5702 } 5703 5704 ucnv_cbFromUWriteBytes(pArgs, subchar, length, offsetIndex, pErrorCode); 5705 } 5706 5707 U_CFUNC UConverterType 5708 ucnv_MBCSGetType(const UConverter* converter) { 5709 /* 
SBCS, DBCS, and EBCDIC_STATEFUL are replaced by MBCS, but here we cheat a little */ 5710 if(converter->sharedData->mbcs.countStates==1) { 5711 return (UConverterType)UCNV_SBCS; 5712 } else if((converter->sharedData->mbcs.outputType&0xff)==MBCS_OUTPUT_2_SISO) { 5713 return (UConverterType)UCNV_EBCDIC_STATEFUL; 5714 } else if(converter->sharedData->staticData->minBytesPerChar==2 && converter->sharedData->staticData->maxBytesPerChar==2) { 5715 return (UConverterType)UCNV_DBCS; 5716 } 5717 return (UConverterType)UCNV_MBCS; 5718 } 5719 5720 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */