ucnv2022.cpp (159706B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (C) 2000-2016, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 * file name: ucnv2022.cpp 9 * encoding: UTF-8 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2000feb03 14 * created by: Markus W. Scherer 15 * 16 * Change history: 17 * 18 * 06/29/2000 helena Major rewrite of the callback APIs. 19 * 08/08/2000 Ram Included support for ISO-2022-JP-2 20 * Changed implementation of toUnicode 21 * function 22 * 08/21/2000 Ram Added support for ISO-2022-KR 23 * 08/29/2000 Ram Seperated implementation of EBCDIC to 24 * ucnvebdc.c 25 * 09/20/2000 Ram Added support for ISO-2022-CN 26 * Added implementations for getNextUChar() 27 * for specific 2022 country variants. 28 * 10/31/2000 Ram Implemented offsets logic functions 29 */ 30 31 #include "unicode/utypes.h" 32 33 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION 34 35 #include "unicode/ucnv.h" 36 #include "unicode/uset.h" 37 #include "unicode/ucnv_err.h" 38 #include "unicode/ucnv_cb.h" 39 #include "unicode/utf16.h" 40 #include "ucnv_imp.h" 41 #include "ucnv_bld.h" 42 #include "ucnv_cnv.h" 43 #include "ucnvmbcs.h" 44 #include "cstring.h" 45 #include "cmemory.h" 46 #include "uassert.h" 47 48 #ifdef U_ENABLE_GENERIC_ISO_2022 49 /* 50 * I am disabling the generic ISO-2022 converter after proposing to do so on 51 * the icu mailing list two days ago. 52 * 53 * Reasons: 54 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of 55 * its designation sequences, single shifts with return to the previous state, 56 * switch-with-no-return to UTF-16BE or similar, etc. 
 * This is unlike the language-specific variants like ISO-2022-JP which
 * require a much smaller repertoire of ISO-2022 features.
 * These variants continue to be supported.
 * 2. I believe that no one is really using the generic ISO-2022 converter
 * but rather always one of the language-specific variants.
 * Note that ICU's generic ISO-2022 converter has always output one escape
 * sequence followed by UTF-8 for the whole stream.
 * 3. Switching between subcharsets is extremely slow, because each time
 * the previous converter is closed and a new one opened,
 * without any kind of caching, least-recently-used list, etc.
 * 4. The code is currently buggy, and given the above it does not seem
 * reasonable to spend the time on maintenance.
 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
 * This means, for example, that when ISO-8859-7 is designated, the following
 * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
 * The ICU ISO-2022 converter does not handle this - and has no information
 * about which subconverter would have to be shifted vs. which is designed
 * for 7-bit ISO-2022.
 *
 * Markus Scherer 2003-dec-03
 */
#endif

#if !UCONFIG_ONLY_HTML_CONVERSION
/* The SI (Shift In) control byte 0x0F as a NUL-terminated byte string. */
static const char SHIFT_IN_STR[] = "\x0F";
// static const char SHIFT_OUT_STR[] = "\x0E";
#endif

/* C0 control and whitespace code values, named after the characters they stand for. */
#define CR 0x0D
#define LF 0x0A
#define H_TAB 0x09
#define V_TAB 0x0B
#define SPACE 0x20

/* First and last code point of the halfwidth Katakana range (U+FF61..U+FF9F). */
enum {
    HWKANA_START=0xff61,
    HWKANA_END=0xff9f
};

/*
 * 94-character sets with native byte values A1..FE are encoded in ISO 2022
 * as bytes 21..7E. (Subtract 0x80.)
 * 96-character sets with native byte values A0..FF are encoded in ISO 2022
 * as bytes 20..7F. (Subtract 0x80.)
 * Do not encode C1 control codes with native bytes 80..9F
 * as bytes 00..1F (C0 control codes).
 */
enum {
    /* native (8-bit) byte range of a 94-character set */
    GR94_START=0xa1,
    GR94_END=0xfe,
    /* native (8-bit) byte range of a 96-character set */
    GR96_START=0xa0,
    GR96_END=0xff
};

/*
 * ISO 2022 control codes must not be converted from Unicode
 * because they would mess up the byte stream.
 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
 * corresponding to SO, SI, and ESC.
 *
 * Note: the macro evaluates its argument twice, so do not pass an
 * expression with side effects.
 */
#define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)

/*
 * for ISO-2022-JP and -CN implementations
 *
 * The JP and CN constants deliberately share numeric values
 * (for example ISO8859_1 and GB2312_1 are both 1), so a StateEnum value
 * is only meaningful together with the variant it is used for.
 */
typedef enum {
    /* shared values */
    INVALID_STATE=-1,
    ASCII = 0,

    SS2_STATE=0x10,
    SS3_STATE,

    /* JP */
    ISO8859_1 = 1 ,
    ISO8859_7 = 2 ,
    JISX201 = 3,
    JISX208 = 4,
    JISX212 = 5,
    GB2312 =6,
    KSC5601 =7,
    HWKANA_7BIT=8, /* Halfwidth Katakana 7 bit */

    /* CN */
    /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
    GB2312_1=1,
    ISO_IR_165=2,
    CNS_11643=3,

    /*
     * these are used in StateEnum and ISO2022State variables,
     * but CNS_11643 must be used to index into myConverterArray[]
     */
    CNS_11643_0=0x20,
    CNS_11643_1,
    CNS_11643_2,
    CNS_11643_3,
    CNS_11643_4,
    CNS_11643_5,
    CNS_11643_6,
    CNS_11643_7
} StateEnum;

/* is the StateEnum charset value for a DBCS charset? */
#if UCONFIG_ONLY_HTML_CONVERSION
#define IS_JP_DBCS(cs) (JISX208==(cs))
#else
#define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
#endif

/*
 * Charset mask: a single bit at position cs.
 * Used with the JP charset values (0..8) to build and test jpCharsetMasks[].
 */
#define CSM(cs) ((uint16_t)1<<(cs))

/*
 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
 *
 * Note: The converter uses some leniency:
 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
 *   all versions, not just JIS7 and JIS8.
 * - ICU does not distinguish between different versions of JIS X 0208.
 */
#if UCONFIG_ONLY_HTML_CONVERSION
enum { MAX_JA_VERSION=0 };
#else
enum { MAX_JA_VERSION=4 };
#endif
/* Indexed by the converter version (0..MAX_JA_VERSION); each entry is an OR of CSM() bits. */
static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
#if !UCONFIG_ONLY_HTML_CONVERSION
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
#endif
};

typedef enum {
    ASCII1=0,
    LATIN1,
    SBCS,
    DBCS,
    MBCS,
    HWKANA
}Cnv2022Type;

typedef struct ISO2022State {
    int8_t cs[4];       /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
    int8_t g;           /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t prevG;       /* g before single shift (SS2 or SS3) */
} ISO2022State;

/* The low 4 bits of the open options select the converter version. */
#define UCNV_OPTIONS_VERSION_MASK 0xf
/* Number of slots in UConverterDataISO2022::myConverterArray. */
#define UCNV_2022_MAX_CONVERTERS 10

/* Per-converter state, allocated by _ISO2022Open() and stored in UConverter::extraInfo. */
typedef struct{
    /* shared data of the sub-converters; loaded in _ISO2022Open(), unloaded in _ISO2022Close() */
    UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
    /* converter opened with ucnv_open() for the ko variant; closed in _ISO2022Close() */
    UConverter *currentConverter;
    Cnv2022Type currentType;
    ISO2022State toU2022State, fromU2022State;
    uint32_t key;       /* escape sequence key collected so far; 0 when no sequence is pending */
    uint32_t version;   /* open options masked with UCNV_OPTIONS_VERSION_MASK */
#ifdef U_ENABLE_GENERIC_ISO_2022
    UBool isFirstBuffer;
#endif
    UBool isEmptySegment;
    char name[30];      /* converter name, returned by _ISO2022getName() */
    char locale[3];     /* "ja", "ko" or "cn", set by _ISO2022Open() */
}UConverterDataISO2022;

/* Protos */
/* ISO-2022 ----------------------------------------------------------------- */

/*Forward declaration */
U_CFUNC void U_CALLCONV
ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
                      UErrorCode * err);
U_CFUNC void U_CALLCONV
ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
                                    UErrorCode * err);

#define ESC_2022 0x1B /*ESC*/

/* Result of looking up a (partial) escape sequence with getKey_2022(). */
typedef enum
{
    INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/
    VALID_NON_TERMINAL_2022 = 0, /*so far corresponds to a valid iso 2022 escape sequence*/
    VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/
    VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/
} UCNV_TableStates_2022;

/*
 * The way these state transition arrays work is:
 * ex : ESC$B is the sequence for JISX208
 *      a) First Iteration: char is ESC
 *          i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
 *             int x = normalize_esq_chars_2022[27] which is equal to 1
 *         ii) Search for this value in escSeqStateTable_Key_2022[]
 *             value of x is stored at escSeqStateTable_Key_2022[0]
 *        iii) Save this index as offset
 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
 *             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
 *     b) Switch on this state and continue to next char
 *          i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
 *             which is normalize_esq_chars_2022[36] == 4
 *         ii) x is currently 1(from above)
 *               x<<=5 -- x is now 32
 *               x+=normalize_esq_chars_2022[36]
 *               now x is 36
 *        iii) Search for this value in escSeqStateTable_Key_2022[]
 *             value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
 *             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
 *     c) Switch on this state and continue to next char
 *          i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
 *         ii) x is currently 36 (from above)
 *               x<<=5 -- x is now 1152
 *               x+=normalize_esq_chars_2022[66]
 *               now x is 1161
 *        iii) Search for this value in escSeqStateTable_Key_2022[]
 *             value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
 *             escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
 *          v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208
 */


/*Below are the 3 arrays depicting a state transition table*/
/*
 * Maps a byte value to a small code (1..29) if the byte can occur in one of
 * the recognized escape sequences, or to 0 if it cannot.
 * getKey_2022() packs these codes 5 bits at a time into the search key.
 */
static const int8_t normalize_esq_chars_2022[256] = {
/*       0      1       2       3       4      5       6        7       8       9           */

         0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,1      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,4      ,7      ,29      ,0
        ,2     ,24     ,26     ,27     ,0      ,3      ,23     ,6      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,5      ,8      ,9      ,10     ,11     ,12
        ,13    ,14     ,15     ,16     ,17     ,18     ,19     ,20     ,25     ,28
        ,0     ,0      ,21     ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,22    ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
        ,0     ,0      ,0      ,0      ,0      ,0
};

#ifdef U_ENABLE_GENERIC_ISO_2022
/*
 * When the generic ISO-2022 converter is completely removed, not just disabled
 * per #ifdef, then the following state table and the associated tables that are
 * dimensioned with MAX_STATES_2022 should be trimmed.
 *
 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
 * the associated escape sequences starting with ESC ( B should be removed.
 * This includes the ones with key values 1097 and all of the ones above 1000000.
 *
 * For the latter, the tables can simply be truncated.
 * For the former, since the tables must be kept parallel, it is probably best
 * to simply duplicate an adjacent table cell, parallel in all tables.
 *
 * It may make sense to restructure the tables, especially by using small search
 * tables for the variants instead of indexing them parallel to the table here.
 */
#endif

#define MAX_STATES_2022 74
/*
 * Keys of all recognized (partial) escape sequences.
 * The values must stay in ascending order because getKey_2022() does a
 * binary search on this array; the index found there is used as the offset
 * into the parallel tables below.
 */
static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
/*   0           1           2           3           4           5           6           7           8           9           */

     1          ,34         ,36         ,39         ,55         ,57         ,60         ,61         ,1093       ,1096
    ,1097       ,1098       ,1099       ,1100       ,1101       ,1102       ,1103       ,1104       ,1105       ,1106
    ,1109       ,1154       ,1157       ,1160       ,1161       ,1176       ,1178       ,1179       ,1254       ,1257
    ,1768       ,1773       ,1957       ,35105      ,36933      ,36936      ,36937      ,36938      ,36939      ,36940
    ,36942      ,36943      ,36944      ,36945      ,36946      ,36947      ,36948      ,37640      ,37642      ,37644
    ,37646      ,37711      ,37744      ,37745      ,37746      ,37747      ,37748      ,40133      ,40136      ,40138
    ,40139      ,40140      ,40141      ,1123363    ,35947624   ,35947625   ,35947626   ,35947627   ,35947629   ,35947630
    ,35947631   ,35947635   ,35947636   ,35947638
};

#ifdef U_ENABLE_GENERIC_ISO_2022

/* Converter name per key (generic ISO-2022 only); nullptr where the key names no converter. */
static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
 /*  0                      1                        2                      3                   4                   5                        6                      7                       8                       9    */

     nullptr                ,nullptr                 ,nullptr               ,nullptr            ,nullptr            ,nullptr                 ,nullptr               ,nullptr                ,"latin1"               ,"latin1"
    ,"latin1"               ,"ibm-865"               ,"ibm-865"             ,"ibm-865"          ,"ibm-865"          ,"ibm-865"               ,"ibm-865"             ,"JISX0201"             ,"JISX0201"             ,"latin1"
    ,"latin1"               ,nullptr                 ,"JISX-208"            ,"ibm-5478"         ,"JISX-208"         ,nullptr                 ,nullptr               ,nullptr                ,nullptr                ,"UTF8"
    ,"ISO-8859-1"           ,"ISO-8859-7"            ,"JIS-X-208"           ,nullptr            ,"ibm-955"          ,"ibm-367"               ,"ibm-952"             ,"ibm-949"              ,"JISX-212"             ,"ibm-1383"
    ,"ibm-952"              ,"ibm-964"               ,"ibm-964"             ,"ibm-964"          ,"ibm-964"          ,"ibm-964"               ,"ibm-964"             ,"ibm-5478"             ,"ibm-949"              ,"ISO-IR-165"
    ,"CNS-11643-1992,1"     ,"CNS-11643-1992,2"      ,"CNS-11643-1992,3"    ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6"      ,"CNS-11643-1992,7"    ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
    ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"  ,"UTF16_PlatformEndian" ,nullptr           ,"latin1"           ,"ibm-912"               ,"ibm-913"             ,"ibm-914"              ,"ibm-813"              ,"ibm-1089"
    ,"ibm-920"              ,"ibm-915"               ,"ibm-915"             ,"latin1"
};

#endif

/* UCNV_TableStates_2022 value per key, parallel to escSeqStateTable_Key_2022[]. */
static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
/*          0                           1                         2                             3                           4                           5                               6                        7                          8                           9       */
     VALID_NON_TERMINAL_2022   ,VALID_NON_TERMINAL_2022   ,VALID_NON_TERMINAL_2022   ,VALID_NON_TERMINAL_2022   ,VALID_NON_TERMINAL_2022   ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022       ,VALID_NON_TERMINAL_2022   ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022
    ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022       ,VALID_NON_TERMINAL_2022   ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022       ,VALID_NON_TERMINAL_2022   ,VALID_NON_TERMINAL_2022   ,VALID_NON_TERMINAL_2022   ,VALID_NON_TERMINAL_2022   ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022       ,VALID_NON_TERMINAL_2022   ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022       ,VALID_NON_TERMINAL_2022   ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022       ,VALID_TERMINAL_2022
};

/* Type def for refactoring changeState_2022 code*/
typedef enum{
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022=0,
#endif
    ISO_2022_JP=1,
#if !UCONFIG_ONLY_HTML_CONVERSION
    ISO_2022_KR=2,
    ISO_2022_CN=3
#endif
} Variant2022;

/*********** ISO 2022 Converter Protos ***********/
static void U_CALLCONV
_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);

static void U_CALLCONV
_ISO2022Close(UConverter *converter);

static void U_CALLCONV
_ISO2022Reset(UConverter *converter, UConverterResetChoice choice);

U_CDECL_BEGIN
static const char * U_CALLCONV
_ISO2022getName(const UConverter* cnv);
U_CDECL_END

static void U_CALLCONV
_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);

U_CDECL_BEGIN
static UConverter * U_CALLCONV
_ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);

U_CDECL_END

#ifdef U_ENABLE_GENERIC_ISO_2022
static void U_CALLCONV
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
#endif

namespace {

/*const UConverterSharedData _ISO2022Data;*/
extern const UConverterSharedData _ISO2022JPData;

#if !UCONFIG_ONLY_HTML_CONVERSION
extern const UConverterSharedData _ISO2022KRData;
extern const UConverterSharedData _ISO2022CNData;
#endif

}  // namespace

/*************** Converter implementations ******************/

/* The purpose of this function is to get around gcc compiler warnings.
*/ 431 static inline void 432 fromUWriteUInt8(UConverter *cnv, 433 const char *bytes, int32_t length, 434 uint8_t **target, const char *targetLimit, 435 int32_t **offsets, 436 int32_t sourceIndex, 437 UErrorCode *pErrorCode) 438 { 439 char* targetChars = reinterpret_cast<char*>(*target); 440 ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit, 441 offsets, sourceIndex, pErrorCode); 442 *target = reinterpret_cast<uint8_t*>(targetChars); 443 444 } 445 446 static inline void 447 setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){ 448 if(myConverterData->version == 1) { 449 UConverter *cnv = myConverterData->currentConverter; 450 451 cnv->toUnicodeStatus=0; /* offset */ 452 cnv->mode=0; /* state */ 453 cnv->toULength=0; /* byteIndex */ 454 } 455 } 456 457 static inline void 458 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){ 459 /* in ISO-2022-KR the designator sequence appears only once 460 * in a file so we append it only once 461 */ 462 if( converter->charErrorBufferLength==0){ 463 464 converter->charErrorBufferLength = 4; 465 converter->charErrorBuffer[0] = 0x1b; 466 converter->charErrorBuffer[1] = 0x24; 467 converter->charErrorBuffer[2] = 0x29; 468 converter->charErrorBuffer[3] = 0x43; 469 } 470 if(myConverterData->version == 1) { 471 UConverter *cnv = myConverterData->currentConverter; 472 473 cnv->fromUChar32=0; 474 cnv->fromUnicodeStatus=1; /* prevLength */ 475 } 476 } 477 478 static void U_CALLCONV 479 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){ 480 481 char myLocale[7]={' ',' ',' ',' ',' ',' ', '\0'}; 482 483 cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022)); 484 if(cnv->extraInfo != nullptr) { 485 UConverterNamePieces stackPieces; 486 UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER; 487 UConverterDataISO2022* myConverterData = static_cast<UConverterDataISO2022*>(cnv->extraInfo); 488 uint32_t version; 
489 490 stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable; 491 492 uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022)); 493 myConverterData->currentType = ASCII1; 494 cnv->fromUnicodeStatus =false; 495 if(pArgs->locale){ 496 uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale)-1); 497 } 498 version = pArgs->options & UCNV_OPTIONS_VERSION_MASK; 499 myConverterData->version = version; 500 if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') && 501 (myLocale[2]=='_' || myLocale[2]=='\0')) 502 { 503 /* open the required converters and cache them */ 504 if(version>MAX_JA_VERSION) { 505 // ICU 55 fails to open a converter for an unsupported version. 506 // Previously, it fell back to version 0, but that would yield 507 // unexpected behavior. 508 *errorCode = U_MISSING_RESOURCE_ERROR; 509 return; 510 } 511 if(jpCharsetMasks[version]&CSM(ISO8859_7)) { 512 myConverterData->myConverterArray[ISO8859_7] = 513 ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode); 514 } 515 myConverterData->myConverterArray[JISX208] = 516 ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode); 517 if(jpCharsetMasks[version]&CSM(JISX212)) { 518 myConverterData->myConverterArray[JISX212] = 519 ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode); 520 } 521 if(jpCharsetMasks[version]&CSM(GB2312)) { 522 myConverterData->myConverterArray[GB2312] = 523 ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */ 524 } 525 if(jpCharsetMasks[version]&CSM(KSC5601)) { 526 myConverterData->myConverterArray[KSC5601] = 527 ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode); 528 } 529 530 /* set the function pointers to appropriate functions */ 531 cnv->sharedData = const_cast<UConverterSharedData*>(&_ISO2022JPData); 532 uprv_strcpy(myConverterData->locale,"ja"); 533 534 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version="); 535 size_t len = 
uprv_strlen(myConverterData->name); 536 myConverterData->name[len] = static_cast<char>(myConverterData->version + static_cast<int>('0')); 537 myConverterData->name[len+1]='\0'; 538 } 539 #if !UCONFIG_ONLY_HTML_CONVERSION 540 else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') && 541 (myLocale[2]=='_' || myLocale[2]=='\0')) 542 { 543 if(version>1) { 544 // ICU 55 fails to open a converter for an unsupported version. 545 // Previously, it fell back to version 0, but that would yield 546 // unexpected behavior. 547 *errorCode = U_MISSING_RESOURCE_ERROR; 548 return; 549 } 550 const char *cnvName; 551 if(version==1) { 552 cnvName="icu-internal-25546"; 553 } else { 554 cnvName="ibm-949"; 555 myConverterData->version=version=0; 556 } 557 if(pArgs->onlyTestIsLoadable) { 558 ucnv_canCreateConverter(cnvName, errorCode); /* errorCode carries result */ 559 uprv_free(cnv->extraInfo); 560 cnv->extraInfo=nullptr; 561 return; 562 } else { 563 myConverterData->currentConverter=ucnv_open(cnvName, errorCode); 564 if (U_FAILURE(*errorCode)) { 565 _ISO2022Close(cnv); 566 return; 567 } 568 569 if(version==1) { 570 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1"); 571 uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4); 572 cnv->subCharLen = myConverterData->currentConverter->subCharLen; 573 }else{ 574 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0"); 575 } 576 577 /* initialize the state variables */ 578 setInitialStateToUnicodeKR(cnv, myConverterData); 579 setInitialStateFromUnicodeKR(cnv, myConverterData); 580 581 /* set the function pointers to appropriate functions */ 582 cnv->sharedData = const_cast<UConverterSharedData*>(&_ISO2022KRData); 583 uprv_strcpy(myConverterData->locale,"ko"); 584 } 585 } 586 else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&& 587 (myLocale[2]=='_' || myLocale[2]=='\0')) 588 { 589 if(version>2) { 590 // ICU 55 fails to open a converter 
for an unsupported version. 591 // Previously, it fell back to version 0, but that would yield 592 // unexpected behavior. 593 *errorCode = U_MISSING_RESOURCE_ERROR; 594 return; 595 } 596 597 /* open the required converters and cache them */ 598 myConverterData->myConverterArray[GB2312_1] = 599 ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode); 600 if(version>=1) { 601 myConverterData->myConverterArray[ISO_IR_165] = 602 ucnv_loadSharedData("iso-ir-165", &stackPieces, &stackArgs, errorCode); 603 } 604 myConverterData->myConverterArray[CNS_11643] = 605 ucnv_loadSharedData("cns-11643-1992", &stackPieces, &stackArgs, errorCode); 606 607 608 /* set the function pointers to appropriate functions */ 609 cnv->sharedData = const_cast<UConverterSharedData*>(&_ISO2022CNData); 610 uprv_strcpy(myConverterData->locale,"cn"); 611 612 if (version==0){ 613 myConverterData->version = 0; 614 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0"); 615 }else if (version==1){ 616 myConverterData->version = 1; 617 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1"); 618 }else { 619 myConverterData->version = 2; 620 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2"); 621 } 622 } 623 #endif // !UCONFIG_ONLY_HTML_CONVERSION 624 else{ 625 #ifdef U_ENABLE_GENERIC_ISO_2022 626 myConverterData->isFirstBuffer = true; 627 628 /* append the UTF-8 escape sequence */ 629 cnv->charErrorBufferLength = 3; 630 cnv->charErrorBuffer[0] = 0x1b; 631 cnv->charErrorBuffer[1] = 0x25; 632 cnv->charErrorBuffer[2] = 0x42; 633 634 cnv->sharedData=(UConverterSharedData*)&_ISO2022Data; 635 /* initialize the state variables */ 636 uprv_strcpy(myConverterData->name,"ISO_2022"); 637 #else 638 *errorCode = U_MISSING_RESOURCE_ERROR; 639 // Was U_UNSUPPORTED_ERROR but changed in ICU 55 to a more standard 640 // data loading error code. 
641 return; 642 #endif 643 } 644 645 cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar; 646 647 if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) { 648 _ISO2022Close(cnv); 649 } 650 } else { 651 *errorCode = U_MEMORY_ALLOCATION_ERROR; 652 } 653 } 654 655 656 static void U_CALLCONV 657 _ISO2022Close(UConverter *converter) { 658 UConverterDataISO2022* myData = static_cast<UConverterDataISO2022*>(converter->extraInfo); 659 UConverterSharedData **array = myData->myConverterArray; 660 int32_t i; 661 662 if (converter->extraInfo != nullptr) { 663 /*close the array of converter pointers and free the memory*/ 664 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { 665 if(array[i]!=nullptr) { 666 ucnv_unloadSharedDataIfReady(array[i]); 667 } 668 } 669 670 ucnv_close(myData->currentConverter); 671 672 if(!converter->isExtraLocal){ 673 uprv_free (converter->extraInfo); 674 converter->extraInfo = nullptr; 675 } 676 } 677 } 678 679 static void U_CALLCONV 680 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) { 681 UConverterDataISO2022* myConverterData = static_cast<UConverterDataISO2022*>(converter->extraInfo); 682 if(choice<=UCNV_RESET_TO_UNICODE) { 683 uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State)); 684 myConverterData->key = 0; 685 myConverterData->isEmptySegment = false; 686 } 687 if(choice!=UCNV_RESET_TO_UNICODE) { 688 uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State)); 689 } 690 #ifdef U_ENABLE_GENERIC_ISO_2022 691 if(myConverterData->locale[0] == 0){ 692 if(choice<=UCNV_RESET_TO_UNICODE) { 693 myConverterData->isFirstBuffer = true; 694 myConverterData->key = 0; 695 if (converter->mode == UCNV_SO){ 696 ucnv_close (myConverterData->currentConverter); 697 myConverterData->currentConverter=nullptr; 698 } 699 converter->mode = UCNV_SI; 700 } 701 if(choice!=UCNV_RESET_TO_UNICODE) { 702 /* re-append UTF-8 escape sequence */ 703 converter->charErrorBufferLength = 3; 704 converter->charErrorBuffer[0] = 
0x1b; 705 converter->charErrorBuffer[1] = 0x28; 706 converter->charErrorBuffer[2] = 0x42; 707 } 708 } 709 else 710 #endif 711 { 712 /* reset the state variables */ 713 if(myConverterData->locale[0] == 'k'){ 714 if(choice<=UCNV_RESET_TO_UNICODE) { 715 setInitialStateToUnicodeKR(converter, myConverterData); 716 } 717 if(choice!=UCNV_RESET_TO_UNICODE) { 718 setInitialStateFromUnicodeKR(converter, myConverterData); 719 } 720 } 721 } 722 } 723 724 U_CDECL_BEGIN 725 726 static const char * U_CALLCONV 727 _ISO2022getName(const UConverter* cnv){ 728 if(cnv->extraInfo){ 729 UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo; 730 return myData->name; 731 } 732 return nullptr; 733 } 734 735 U_CDECL_END 736 737 738 /*************** to unicode *******************/ 739 /**************************************************************************** 740 * Recognized escape sequences are 741 * <ESC>(B ASCII 742 * <ESC>.A ISO-8859-1 743 * <ESC>.F ISO-8859-7 744 * <ESC>(J JISX-201 745 * <ESC>(I JISX-201 746 * <ESC>$B JISX-208 747 * <ESC>$@ JISX-208 748 * <ESC>$(D JISX-212 749 * <ESC>$A GB2312 750 * <ESC>$(C KSC5601 751 */ 752 static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= { 753 /* 0 1 2 3 4 5 6 7 8 9 */ 754 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 755 ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE 756 ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 757 ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE 758 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 759 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE 
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
};

#if !UCONFIG_ONLY_HTML_CONVERSION
/*************** to unicode *******************/
/*
 * ISO-2022-CN: maps the table offset that getKey_2022() reports for a complete
 * escape sequence to the state/charset that changeState_2022() applies.
 * INVALID_STATE entries make changeState_2022() report
 * U_UNSUPPORTED_ESCAPE_SEQUENCE.
 */
static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
/*      0                1               2               3               4               5               6               7               8               9    */
     INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165
    ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
};
#endif


/*
 * Advances the escape-sequence matcher by one byte.
 *
 * The byte c is normalized through normalize_esq_chars_2022[] and folded into
 * the running key as (*key << 5) + normalized value; the new key is then looked
 * up in escSeqStateTable_Key_2022[] with a binary search.
 *
 * @param c      next byte of the escape sequence
 * @param key    in: key accumulated so far; out: the new key on a match,
 *               0 if the byte or the resulting key is not valid
 * @param offset out: index of the matching table entry, 0 if there is no match
 * @return the escSeqStateTable_Value_2022[] entry for the matched key,
 *         or INVALID_2022 if there is no match
 */
static UCNV_TableStates_2022
getKey_2022(char c,int32_t* key,int32_t* offset){
    int32_t togo;
    int32_t low = 0;
    int32_t hi = MAX_STATES_2022;
    int32_t oldmid=0;

    togo = normalize_esq_chars_2022[static_cast<uint8_t>(c)];
    if(togo == 0) {
        /* not a valid character anywhere in an escape sequence */
        *key = 0;
        *offset = 0;
        return INVALID_2022;
    }
    togo = (*key << 5) + togo;

    while (hi != low)  /*binary search*/{

        int32_t mid = (hi+low) >> 1; /*Finds median*/

        /* low is set to mid (not mid+1), so a repeated mid means the search is exhausted */
        if (mid == oldmid)
            break;

        if (escSeqStateTable_Key_2022[mid] > togo){
            hi = mid;
        }
        else if (escSeqStateTable_Key_2022[mid] < togo){
            low = mid;
        }
        else /*we found it*/{
            *key = togo;
            *offset = mid;
            return static_cast<UCNV_TableStates_2022>(escSeqStateTable_Value_2022[mid]);
        }
        oldmid = mid;

    }

    *key = 0;
    *offset = 0;
    return INVALID_2022;
}

/*
 * Runs through a state machine to determine the escape sequence - codepage correspondence.
 *
 * Consumes bytes from *source (appending each one to _this->toUBytes[]) until
 * getKey_2022() reports a terminal or invalid sequence or the input runs out,
 * then applies the recognized sequence to the toUnicode state for the variant.
 *
 * @param _this       converter; extraInfo is read as UConverterDataISO2022
 * @param source      in/out pointer to the next input byte
 * @param sourceLimit end of the input
 * @param var         ISO-2022 variant that selects how a complete sequence is applied
 * @param err         set to U_ILLEGAL_ESCAPE_SEQUENCE or U_UNSUPPORTED_ESCAPE_SEQUENCE
 *                    on failure; left untouched if the sequence is still incomplete
 *                    (then myData2022->key stays non-zero)
 */
static void
changeState_2022(UConverter* _this,
                const char** source,
                const char* sourceLimit,
                Variant2022 var,
                UErrorCode* err){
    UCNV_TableStates_2022 value;
    UConverterDataISO2022* myData2022 = static_cast<UConverterDataISO2022*>(_this->extraInfo);
    uint32_t key = myData2022->key;
    int32_t offset = 0;
    /* number of escape-sequence bytes that were already stored before this call */
    int8_t initialToULength = _this->toULength;
    char c;

    value = VALID_NON_TERMINAL_2022;
    while (*source < sourceLimit) {
        c = *(*source)++;
        _this->toUBytes[_this->toULength++] = static_cast<uint8_t>(c);
        value = getKey_2022(c, reinterpret_cast<int32_t*>(&key), &offset);

        switch (value){

        case VALID_NON_TERMINAL_2022 :
            /* continue with the loop */
            break;

        case VALID_TERMINAL_2022:
            key = 0;
            goto DONE;

        case INVALID_2022:
            goto DONE;

        case VALID_MAYBE_TERMINAL_2022:
#ifdef U_ENABLE_GENERIC_ISO_2022
            /* ESC ( B is ambiguous only for ISO_2022 itself */
            if(var == ISO_2022) {
                /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
                _this->toULength = 0;

                /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */

                /* continue with the loop */
                value = VALID_NON_TERMINAL_2022;
                break;
            } else
#endif
            {
                /* not ISO_2022 itself, finish here */
                value = VALID_TERMINAL_2022;
                key = 0;
                goto DONE;
            }
        }
    }

DONE:
    myData2022->key = key;

    if (value == VALID_NON_TERMINAL_2022) {
        /* indicate that the escape sequence is incomplete: key!=0 */
        return;
    } else if (value == INVALID_2022 ) {
        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
    } else /* value == VALID_TERMINAL_2022 */ {
        switch(var){
#ifdef U_ENABLE_GENERIC_ISO_2022
        case ISO_2022:
        {
            const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
            if(chosenConverterName == nullptr) {
                /* SS2 or SS3 */
                *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                _this->toUCallbackReason = UCNV_UNASSIGNED;
                return;
            }

            _this->mode = UCNV_SI;
            ucnv_close(myData2022->currentConverter);
            /* NOTE(review): myUConverter is not declared in the visible part of this function - confirm before enabling U_ENABLE_GENERIC_ISO_2022 */
            myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
            if(U_SUCCESS(*err)) {
                myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
                _this->mode = UCNV_SO;
            }
            break;
        }
#endif
        case ISO_2022_JP:
            {
                StateEnum tempState = static_cast<StateEnum>(nextStateToUnicodeJP[offset]);
                switch(tempState) {
                case INVALID_STATE:
                    *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    break;
                case SS2_STATE:
                    if(myData2022->toU2022State.cs[2]!=0) {
                        /* remember the current G0/G1 state, but do not overwrite it while already in a single shift */
                        if(myData2022->toU2022State.g<2) {
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
                        }
                        myData2022->toU2022State.g=2;
                    } else {
                        /* illegal to have SS2 before a matching designator */
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    }
                    break;
                /* case SS3_STATE: not used in ISO-2022-JP-x */
                case ISO8859_1:
                case ISO8859_7:
                    if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    } else {
                        /* G2 charset for SS2 */
                        myData2022->toU2022State.cs[2] = static_cast<int8_t>(tempState);
                    }
                    break;
                default:
                    if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    } else {
                        /* G0 charset */
                        myData2022->toU2022State.cs[0] = static_cast<int8_t>(tempState);
                    }
                    break;
                }
            }
            break;
#if !UCONFIG_ONLY_HTML_CONVERSION
        case ISO_2022_CN:
            {
                StateEnum tempState = static_cast<StateEnum>(nextStateToUnicodeCN[offset]);
                switch(tempState) {
                case INVALID_STATE:
                    *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    break;
                case SS2_STATE:
                    if(myData2022->toU2022State.cs[2]!=0) {
                        if(myData2022->toU2022State.g<2) {
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
                        }
                        myData2022->toU2022State.g=2;
                    } else {
                        /* illegal to have SS2 before a matching designator */
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    }
                    break;
                case SS3_STATE:
                    if(myData2022->toU2022State.cs[3]!=0) {
                        if(myData2022->toU2022State.g<2) {
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
                        }
                        myData2022->toU2022State.g=3;
                    } else {
                        /* illegal to have SS3 before a matching designator */
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    }
                    break;
                case ISO_IR_165:
                    /* ISO-IR-165 is rejected for version 0 */
                    if(myData2022->version==0) {
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                        break;
                    }
                    U_FALLTHROUGH;
                case GB2312_1:
                    U_FALLTHROUGH;
                case CNS_11643_1:
                    /* these three charsets are designated to G1 */
                    myData2022->toU2022State.cs[1] = static_cast<int8_t>(tempState);
                    break;
                case CNS_11643_2:
                    /* designated to G2 */
                    myData2022->toU2022State.cs[2] = static_cast<int8_t>(tempState);
                    break;
                default:
                    /* other CNS 11643 planes */
                    if(myData2022->version==0) {
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    } else {
                        myData2022->toU2022State.cs[3] = static_cast<int8_t>(tempState);
                    }
                    break;
                }
            }
            break;
        case ISO_2022_KR:
            if(offset==0x30){
                /* nothing to be done, just accept this one escape sequence */
            } else {
                *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
            }
            break;
#endif  // !UCONFIG_ONLY_HTML_CONVERSION

        default:
            *err = U_ILLEGAL_ESCAPE_SEQUENCE;
            break;
        }
    }
    if(U_SUCCESS(*err)) {
        /* the escape sequence was consumed successfully: drop the saved bytes */
        _this->toULength = 0;
    } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
        if(_this->toULength>1) {
            /*
             * Ticket 5691: consistent illegal sequences:
             * - We include at least the first byte (ESC) in the illegal sequence.
             * - If any of the non-initial bytes could be the start of a character,
             *   we stop the illegal sequence before the first one of those.
             *   In escape sequences, all following bytes are "printable", that is,
             *   unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
             *   they are valid single/lead bytes.
             *   For simplicity, we always only report the initial ESC byte as the
             *   illegal sequence and back out all other bytes we looked at.
             */
            /* Back out some bytes. */
            int8_t backOutDistance=_this->toULength-1;
            int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
            if(backOutDistance<=bytesFromThisBuffer) {
                /* same as initialToULength<=1 */
                *source-=backOutDistance;
            } else {
                /* Back out bytes from the previous buffer: Need to replay them. */
                _this->preToULength = static_cast<int8_t>(bytesFromThisBuffer - backOutDistance);
                /* same as -(initialToULength-1) */
                /* preToULength is negative! */
                uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
                *source-=bytesFromThisBuffer;
            }
            _this->toULength=1;
        }
    } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
        _this->toUCallbackReason = UCNV_UNASSIGNED;
    }
}

#if !UCONFIG_ONLY_HTML_CONVERSION
/*Checks the characters of the buffer against valid 2022 escape sequences
*if they match we return a pointer to the initial start of the sequence otherwise
*we return sourceLimit
*/
/*for 2022 looks ahead in the stream
 *to determine the longest possible convertible
 *data stream
 */
/*
 * Without U_ENABLE_GENERIC_ISO_2022 this simply returns a pointer to the first
 * ESC_2022 byte in [*source, sourceLimit), or sourceLimit if there is none.
 * *source itself is not modified.
 *
 * NOTE(review): the U_ENABLE_GENERIC_ISO_2022 branch reads `flush`, but the
 * parameter name is commented out in the signature - the branch looks like it
 * cannot compile as written; confirm before enabling it.
 */
static inline const char*
getEndOfBuffer_2022(const char** source,
                   const char* sourceLimit,
                   UBool /*flush*/){

    const char* mySource = *source;

#ifdef U_ENABLE_GENERIC_ISO_2022
    if (*source >= sourceLimit)
        return sourceLimit;

    do{

        if (*mySource == ESC_2022){
            int8_t i;
            int32_t key = 0;
            int32_t offset;
            UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;

            /* Kludge: I could not
            * figure out the reason for validating an escape sequence
            * twice - once here and once in changeState_2022().
            * is it possible to have an ESC character in a ISO2022
            * byte stream which is valid in a code page? Is it legal?
            */
            for (i=0;
            (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
            i++) {
                value =  getKey_2022(*(mySource+i), &key, &offset);
            }
            if (value > 0 || *mySource==ESC_2022)
                return mySource;

            if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
                return sourceLimit;
        }
    }while (++mySource < sourceLimit);

    return sourceLimit;
#else
    while(mySource < sourceLimit && *mySource != ESC_2022) {
        ++mySource;
    }
    return mySource;
#endif
}
#endif

/* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
 * any future change in _MBCSFromUChar32() function should be reflected here.
 *
 * Looks up c in the fromUnicode tables of sharedData and, if that yields
 * nothing, in the extension table (sharedData->mbcs.extIndexes) when present.
 *
 * @param sharedData  converter data whose mbcs tables are consulted
 * @param c           code point to convert
 * @param value       out: the result bytes packed into one integer
 * @param useFallback passed to FROM_U_USE_FALLBACK() and the extension lookup
 * @param outputType  MBCS_OUTPUT_2 selects 2-byte table entries; any other
 *                    value is treated as MBCS_OUTPUT_3 (3-byte entries)
 * @return number of bytes in *value; negative number if fallback; 0 if no mapping
 */
static inline int32_t
MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
                                         UChar32 c,
                                         uint32_t* value,
                                         UBool useFallback,
                                         int outputType)
{
    const int32_t *cx;
    const uint16_t *table;
    uint32_t stage2Entry;
    uint32_t myValue;
    int32_t length;
    const uint8_t *p;
    /*
     * TODO(markus): Use and require new, faster MBCS conversion table structures.
     * Use internal version of ucnv_open() that verifies that the new structures are available,
     * else U_INTERNAL_PROGRAM_ERROR.
     */
    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
    if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
        table=sharedData->mbcs.fromUnicodeTable;
        stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
        /* get the bytes and the length for the output */
        if(outputType==MBCS_OUTPUT_2){
            myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
            if(myValue<=0xff) {
                length=1;
            } else {
                length=2;
            }
        } else /* outputType==MBCS_OUTPUT_3 */ {
            p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
            myValue = (static_cast<uint32_t>(*p) << 16) | (static_cast<uint32_t>(p[1]) << 8) | p[2];
            if(myValue<=0xff) {
                length=1;
            } else if(myValue<=0xffff) {
                length=2;
            } else {
                length=3;
            }
        }
        /* is this code point assigned, or do we use fallbacks? */
        if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
            /* assigned */
            *value=myValue;
            return length;
        } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
            /*
             * We allow a 0 byte output if the "assigned" bit is set for this entry.
             * There is no way with this data structure for fallback output
             * to be a zero byte.
             */
            *value=myValue;
            return -length;
        }
    }

    cx=sharedData->mbcs.extIndexes;
    if(cx!=nullptr) {
        return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
    }

    /* unassigned */
    return 0;
}

/* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
 * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1184 * @param retval pointer to output byte 1185 * @return 1 roundtrip byte 0 no mapping -1 fallback byte 1186 */ 1187 static inline int32_t 1188 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData, 1189 UChar32 c, 1190 uint32_t* retval, 1191 UBool useFallback) 1192 { 1193 const uint16_t *table; 1194 int32_t value; 1195 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 1196 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 1197 return 0; 1198 } 1199 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ 1200 table=sharedData->mbcs.fromUnicodeTable; 1201 /* get the byte for the output */ 1202 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); 1203 /* is this code point assigned, or do we use fallbacks? */ 1204 *retval = static_cast<uint32_t>(value & 0xff); 1205 if(value>=0xf00) { 1206 return 1; /* roundtrip */ 1207 } else if(useFallback ? value>=0x800 : value>=0xc00) { 1208 return -1; /* fallback taken */ 1209 } else { 1210 return 0; /* no mapping */ 1211 } 1212 } 1213 1214 /* 1215 * Check that the result is a 2-byte value with each byte in the range A1..FE 1216 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte 1217 * to move it to the ISO 2022 range 21..7E. 1218 * Return 0 if out of range. 1219 */ 1220 static inline uint32_t 1221 _2022FromGR94DBCS(uint32_t value) { 1222 if (static_cast<uint16_t>(value - 0xa1a1) <= (0xfefe - 0xa1a1) && 1223 static_cast<uint8_t>(value - 0xa1) <= (0xfe - 0xa1) 1224 ) { 1225 return value - 0x8080; /* shift down to 21..7e byte range */ 1226 } else { 1227 return 0; /* not valid for ISO 2022 */ 1228 } 1229 } 1230 1231 #if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */ 1232 /* 1233 * This method does the reverse of _2022FromGR94DBCS(). 
Given the 2022 code point, it returns the
 * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
 * unchanged.
 */
static inline uint32_t
_2022ToGR94DBCS(uint32_t value) {
    uint32_t returnValue = value + 0x8080;
    if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) &&
        (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) {
        return returnValue;
    } else {
        return value;
    }
}
#endif

#ifdef U_ENABLE_GENERIC_ISO_2022

/**********************************************************************************
*  ISO-2022 Converter
*
*
*/

/*
 * toUnicode for the generic ISO-2022 converter (compiled only with
 * U_ENABLE_GENERIC_ISO_2022).
 *
 * Alternates between two steps until the input is used up or an early return
 * is taken: (1) convert the bytes up to the next ESC (or the end of the buffer)
 * with myData->currentConverter, opening an "ASCII" converter first if none is
 * set; (2) let changeState_2022() consume the escape sequence.
 * Overflow output, error bytes and partial input of the sub-converter are
 * copied into the outer converter before returning.
 */
static void U_CALLCONV
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
                                           UErrorCode* err){
    const char* mySourceLimit, *realSourceLimit;
    const char* sourceStart;
    const char16_t* myTargetStart;
    UConverter* saveThis;
    UConverterDataISO2022* myData;
    int8_t length;

    saveThis = args->converter;
    myData=((UConverterDataISO2022*)(saveThis->extraInfo));

    realSourceLimit = args->sourceLimit;
    while (args->source < realSourceLimit) {
        if(myData->key == 0) { /* are we in the middle of an escape sequence? */
            /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
            mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);

            if(args->source < mySourceLimit) {
                if(myData->currentConverter==nullptr) {
                    myData->currentConverter = ucnv_open("ASCII",err);
                    if(U_FAILURE(*err)){
                        return;
                    }

                    myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
                    saveThis->mode = UCNV_SO;
                }

                /* convert to before the ESC or until the end of the buffer */
                myData->isFirstBuffer=false;
                sourceStart = args->source;
                myTargetStart = args->target;
                /* temporarily swap in the sub-converter; restored right after the call */
                args->converter = myData->currentConverter;
                ucnv_toUnicode(args->converter,
                    &args->target,
                    args->targetLimit,
                    &args->source,
                    mySourceLimit,
                    args->offsets,
                    (UBool)(args->flush && mySourceLimit == realSourceLimit),
                    err);
                args->converter = saveThis;

                if (*err == U_BUFFER_OVERFLOW_ERROR) {
                    /* move the overflow buffer */
                    length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
                    myData->currentConverter->UCharErrorBufferLength = 0;
                    if(length > 0) {
                        uprv_memcpy(saveThis->UCharErrorBuffer,
                                    myData->currentConverter->UCharErrorBuffer,
                                    length*U_SIZEOF_UCHAR);
                    }
                    return;
                }

                /*
                 * At least one of:
                 * -Error while converting
                 * -Done with entire buffer
                 * -Need to write offsets or update the current offset
                 *  (leave that up to the code in ucnv.c)
                 *
                 * or else we just stopped at an ESC byte and continue with changeState_2022()
                 */
                /* NOTE(review): && binds tighter than ||, so the offsets test and the toULength test below are alternatives - confirm this grouping is intended */
                if (U_FAILURE(*err) ||
                    (args->source == realSourceLimit) ||
                    (args->offsets != nullptr && (args->target != myTargetStart || args->source != sourceStart) ||
                    (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
                ) {
                    /* copy partial or error input for truncated detection and error handling */
                    if(U_FAILURE(*err)) {
                        length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
                        }
                    } else {
                        length = saveThis->toULength = myData->currentConverter->toULength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
                            if(args->source < mySourceLimit) {
                                *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
                            }
                        }
                    }
                    return;
                }
            }
        }

        sourceStart = args->source;
        changeState_2022(args->converter,
               &(args->source),
               realSourceLimit,
               ISO_2022,
               err);
        if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != nullptr)) {
            /* let the ucnv.c code update its current offset */
            return;
        }
    }
}

#endif

/*
 * To Unicode Callback helper function
 *
 * Stores the offending input in cnv->toUBytes[] (two bytes, high byte first,
 * if sourceChar > 0xff, otherwise one byte) and sets the error code:
 * U_INVALID_CHAR_FOUND if targetUniChar is missingCharMarker-1 (0xfffe),
 * U_ILLEGAL_CHAR_FOUND for any other value.
 */
static void
toUnicodeCallback(UConverter *cnv,
                  const uint32_t sourceChar, const uint32_t targetUniChar,
                  UErrorCode* err){
    if(sourceChar>0xff){
        cnv->toUBytes[0] = static_cast<uint8_t>(sourceChar >> 8);
        cnv->toUBytes[1] = static_cast<uint8_t>(sourceChar);
        cnv->toULength = 2;
    }
    else{
        cnv->toUBytes[0] = static_cast<char>(sourceChar);
        cnv->toULength = 1;
    }

    if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
        *err = U_INVALID_CHAR_FOUND;
    }
    else{
        *err = U_ILLEGAL_CHAR_FOUND;
    }
}

/**************************************ISO-2022-JP*************************************************/

/************************************** IMPORTANT **************************************************
* The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
* MBCS; instead, the values are obtained
directly by calling _MBCSFromUChar32().
* The converter iterates over each Unicode code point
* to obtain the equivalent code points from the supported codepages. Since the source buffer is
* processed one char at a time, it makes sense to avoid, as far as possible, the extra processing
* that a canned converter would do.
*
* If the implementation of these macros or the structure of the sharedData struct changes in the
* future, make sure that ISO-2022 is also changed.
***************************************************************************************************
*/

/***************************************************************************************************
* Rules for ISO-2022-JP encoding
* (i)   Escape sequences must be fully contained within a line; they should not
*       span new lines or CRs
* (ii)  If the last character on a line is represented by two bytes then an ASCII or
*       JIS-Roman character escape sequence should follow before the line terminates
* (iii) If the first character on the line is represented by two bytes then a two
*       byte character escape sequence should precede it
* (iv)  If no escape sequence is encountered then the characters are ASCII
* (v)   Latin (ISO-8859-1) and Greek (ISO-8859-7) characters must be designated to G2,
*       and invoked with SS2 (ESC N).
* (vi)  If there is any G0 designation in text, there must be a switch to
*       ASCII or to JIS X 0201-Roman before a space character (but not
*       necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
*       characters such as tab or CRLF.
* (vii) Supported encodings:
*          ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
*
*  source : RFC-1554
*
*          JISX201, JISX208,JISX212 : new .cnv data files created
*          KSC5601 : alias to ibm-949 mapping table
*          GB2312 : alias to ibm-1386 mapping table
*          ISO-8859-1 : Algorithmic implemented as LATIN1 case
*          ISO-8859-7 : alias to ibm-9409 mapping table
*/

/*
 * preference order of JP charsets
 * (the order in which charsets that are not currently designated are tried
 * when converting from Unicode)
 */
static const StateEnum jpCharsetPref[]={
    ASCII,
    JISX201,
    ISO8859_1,
    JISX208,
    ISO8859_7,
    JISX212,
    GB2312,
    KSC5601,
    HWKANA_7BIT
};

/*
 * Designation escape sequences, indexed by charset (StateEnum value).
 * The escape sequences must be in order of the enum constants like JISX201  = 3,
 * not in order of jpCharsetPref[]!
 */
static const char escSeqChars[][6] ={
    "\x1B\x28\x42",         /* <ESC>(B  ASCII       */
    "\x1B\x2E\x41",         /* <ESC>.A  ISO-8859-1  */
    "\x1B\x2E\x46",         /* <ESC>.F  ISO-8859-7  */
    "\x1B\x28\x4A",         /* <ESC>(J  JISX-201    */
    "\x1B\x24\x42",         /* <ESC>$B  JISX-208    */
    "\x1B\x24\x28\x44",     /* <ESC>$(D JISX-212    */
    "\x1B\x24\x41",         /* <ESC>$A  GB2312      */
    "\x1B\x24\x28\x43",     /* <ESC>$(C KSC5601     */
    "\x1B\x28\x49"          /* <ESC>(I  HWKANA_7BIT */

};
/* Byte lengths of the entries in escSeqChars[], in the same order. */
static  const int8_t escSeqCharsLen[] ={
    3, /* length of <ESC>(B  ASCII       */
    3, /* length of <ESC>.A  ISO-8859-1  */
    3, /* length of <ESC>.F  ISO-8859-7  */
    3, /* length of <ESC>(J  JISX-201    */
    3, /* length of <ESC>$B  JISX-208    */
    4, /* length of <ESC>$(D JISX-212    */
    3, /* length of <ESC>$A  GB2312      */
    4, /* length of <ESC>$(C KSC5601     */
    3  /* length of <ESC>(I  HWKANA_7BIT */
};

/*
* The iteration over various code pages works this way:
* i)   Get the currentState from myConverterData->currentState
* ii)  Check if the character is mapped to a valid character in the currentState
*      Yes ->  a) set the initIterState to currentState
*       b) remain in this
state until an invalid character is found
*      No  ->  a) go to the next code page and find the character
* iii) Before changing the state increment the current state check if the current state
*      is equal to the initIterState
*      Yes ->  A character that cannot be represented in any of the supported encodings
*          break and return a U_INVALID_CHARACTER error
*      No  ->  Continue and find the character in next code page
*
*
* TODO: Implement a priority technique where the users are allowed to set the priority of code pages
*/

/*
 * JIS X 0201 (Roman) to Unicode for byte values 00..7F.
 * Only two positions differ from ASCII: 5C maps to U+00A5 and 7E maps to
 * U+203E. Every other value is returned unchanged.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        return value;
    }
}

/*
 * Unicode to JIS X 0201 (Roman), result in 00..7F.
 * U+00A5 and U+203E map to 5C and 7E; U+005C and U+007E themselves have no
 * mapping. Returns U+FFFE for anything that cannot be mapped.
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    switch(value) {
    case 0xa5:
        return 0x5c;
    case 0x203e:
        return 0x7e;
    case 0x5c:
    case 0x7e:
        /* these two positions are taken by the mappings above */
        return 0xfffe;
    default:
        return value<=0x7f ? value : 0xfffe;
    }
}

/*
 * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
 * to JIS X 0208, and convert it to a pair of 21..7E bytes.
 * Return 0 if the byte pair is out of range.
1521 */ 1522 static inline uint32_t 1523 _2022FromSJIS(uint32_t value) { 1524 uint8_t trail; 1525 1526 if(value > 0xEFFC) { 1527 return 0; /* beyond JIS X 0208 */ 1528 } 1529 1530 trail = static_cast<uint8_t>(value); 1531 1532 value &= 0xff00; /* lead byte */ 1533 if(value <= 0x9f00) { 1534 value -= 0x7000; 1535 } else /* 0xe000 <= value <= 0xef00 */ { 1536 value -= 0xb000; 1537 } 1538 value <<= 1; 1539 1540 if(trail <= 0x9e) { 1541 value -= 0x100; 1542 if(trail <= 0x7e) { 1543 value |= trail - 0x1f; 1544 } else { 1545 value |= trail - 0x20; 1546 } 1547 } else /* trail <= 0xfc */ { 1548 value |= trail - 0x7e; 1549 } 1550 return value; 1551 } 1552 1553 /* 1554 * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS. 1555 * If either byte is outside 21..7E make sure that the result is not valid 1556 * for Shift-JIS so that the converter catches it. 1557 * Some invalid byte values already turn into equally invalid Shift-JIS 1558 * byte values and need not be tested explicitly. 1559 */ 1560 static inline void 1561 _2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) { 1562 if(c1&1) { 1563 ++c1; 1564 if(c2 <= 0x5f) { 1565 c2 += 0x1f; 1566 } else if(c2 <= 0x7e) { 1567 c2 += 0x20; 1568 } else { 1569 c2 = 0; /* invalid */ 1570 } 1571 } else { 1572 if (static_cast<uint8_t>(c2 - 0x21) <= ((0x7e) - 0x21)) { 1573 c2 += 0x7e; 1574 } else { 1575 c2 = 0; /* invalid */ 1576 } 1577 } 1578 c1 >>= 1; 1579 if(c1 <= 0x2f) { 1580 c1 += 0x70; 1581 } else if(c1 <= 0x3f) { 1582 c1 += 0xb0; 1583 } else { 1584 c1 = 0; /* invalid */ 1585 } 1586 bytes[0] = static_cast<char>(c1); 1587 bytes[1] = static_cast<char>(c2); 1588 } 1589 1590 /* 1591 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS) 1592 * Katakana. 1593 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks 1594 * because Shift-JIS roundtrips half-width Katakana to single bytes. 1595 * These were the only fallbacks in ICU's jisx-208.ucm file. 
1596 */ 1597 static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = { 1598 0x2123, /* U+FF61 */ 1599 0x2156, 1600 0x2157, 1601 0x2122, 1602 0x2126, 1603 0x2572, 1604 0x2521, 1605 0x2523, 1606 0x2525, 1607 0x2527, 1608 0x2529, 1609 0x2563, 1610 0x2565, 1611 0x2567, 1612 0x2543, 1613 0x213C, /* U+FF70 */ 1614 0x2522, 1615 0x2524, 1616 0x2526, 1617 0x2528, 1618 0x252A, 1619 0x252B, 1620 0x252D, 1621 0x252F, 1622 0x2531, 1623 0x2533, 1624 0x2535, 1625 0x2537, 1626 0x2539, 1627 0x253B, 1628 0x253D, 1629 0x253F, /* U+FF80 */ 1630 0x2541, 1631 0x2544, 1632 0x2546, 1633 0x2548, 1634 0x254A, 1635 0x254B, 1636 0x254C, 1637 0x254D, 1638 0x254E, 1639 0x254F, 1640 0x2552, 1641 0x2555, 1642 0x2558, 1643 0x255B, 1644 0x255E, 1645 0x255F, /* U+FF90 */ 1646 0x2560, 1647 0x2561, 1648 0x2562, 1649 0x2564, 1650 0x2566, 1651 0x2568, 1652 0x2569, 1653 0x256A, 1654 0x256B, 1655 0x256C, 1656 0x256D, 1657 0x256F, 1658 0x2573, 1659 0x212B, 1660 0x212C /* U+FF9F */ 1661 }; 1662 1663 static void U_CALLCONV 1664 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) { 1665 UConverter *cnv = args->converter; 1666 UConverterDataISO2022 *converterData; 1667 ISO2022State *pFromU2022State; 1668 uint8_t* target = reinterpret_cast<uint8_t*>(args->target); 1669 const uint8_t* targetLimit = reinterpret_cast<const uint8_t*>(args->targetLimit); 1670 const char16_t* source = args->source; 1671 const char16_t* sourceLimit = args->sourceLimit; 1672 int32_t* offsets = args->offsets; 1673 UChar32 sourceChar; 1674 char buffer[8]; 1675 int32_t len, outLen; 1676 int8_t choices[10]; 1677 int32_t choiceCount; 1678 uint32_t targetValue = 0; 1679 UBool useFallback; 1680 1681 int32_t i; 1682 int8_t cs, g; 1683 1684 /* set up the state */ 1685 converterData = static_cast<UConverterDataISO2022*>(cnv->extraInfo); 1686 pFromU2022State = &converterData->fromU2022State; 1687 1688 choiceCount = 0; 1689 1690 /* check if the last codepoint of previous buffer was a lead 
surrogate*/ 1691 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) { 1692 goto getTrail; 1693 } 1694 1695 while(source < sourceLimit) { 1696 if(target < targetLimit) { 1697 1698 sourceChar = *(source++); 1699 /*check if the char is a First surrogate*/ 1700 if(U16_IS_SURROGATE(sourceChar)) { 1701 if(U16_IS_SURROGATE_LEAD(sourceChar)) { 1702 getTrail: 1703 /*look ahead to find the trail surrogate*/ 1704 if(source < sourceLimit) { 1705 /* test the following code unit */ 1706 char16_t trail = *source; 1707 if(U16_IS_TRAIL(trail)) { 1708 source++; 1709 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); 1710 cnv->fromUChar32=0x00; 1711 /* convert this supplementary code point */ 1712 /* exit this condition tree */ 1713 } else { 1714 /* this is an unmatched lead code unit (1st surrogate) */ 1715 /* callback(illegal) */ 1716 *err=U_ILLEGAL_CHAR_FOUND; 1717 cnv->fromUChar32=sourceChar; 1718 break; 1719 } 1720 } else { 1721 /* no more input */ 1722 cnv->fromUChar32=sourceChar; 1723 break; 1724 } 1725 } else { 1726 /* this is an unmatched trail code unit (2nd surrogate) */ 1727 /* callback(illegal) */ 1728 *err=U_ILLEGAL_CHAR_FOUND; 1729 cnv->fromUChar32=sourceChar; 1730 break; 1731 } 1732 } 1733 1734 /* do not convert SO/SI/ESC */ 1735 if(IS_2022_CONTROL(sourceChar)) { 1736 /* callback(illegal) */ 1737 *err=U_ILLEGAL_CHAR_FOUND; 1738 cnv->fromUChar32=sourceChar; 1739 break; 1740 } 1741 1742 /* do the conversion */ 1743 1744 if(choiceCount == 0) { 1745 uint16_t csm; 1746 1747 /* 1748 * The csm variable keeps track of which charsets are allowed 1749 * and not used yet while building the choices[]. 1750 */ 1751 csm = jpCharsetMasks[converterData->version]; 1752 choiceCount = 0; 1753 1754 /* JIS7/8: try single-byte half-width Katakana before JISX208 */ 1755 if(converterData->version == 3 || converterData->version == 4) { 1756 choices[choiceCount++] = static_cast<int8_t>(HWKANA_7BIT); 1757 } 1758 /* Do not try single-byte half-width Katakana for other versions. 
*/ 1759 csm &= ~CSM(HWKANA_7BIT); 1760 1761 /* try the current G0 charset */ 1762 choices[choiceCount++] = cs = pFromU2022State->cs[0]; 1763 csm &= ~CSM(cs); 1764 1765 /* try the current G2 charset */ 1766 if((cs = pFromU2022State->cs[2]) != 0) { 1767 choices[choiceCount++] = cs; 1768 csm &= ~CSM(cs); 1769 } 1770 1771 /* try all the other possible charsets */ 1772 for(i = 0; i < UPRV_LENGTHOF(jpCharsetPref); ++i) { 1773 cs = static_cast<int8_t>(jpCharsetPref[i]); 1774 if(CSM(cs) & csm) { 1775 choices[choiceCount++] = cs; 1776 csm &= ~CSM(cs); 1777 } 1778 } 1779 } 1780 1781 cs = g = 0; 1782 /* 1783 * len==0: no mapping found yet 1784 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks 1785 * len>0: found a roundtrip result, done 1786 */ 1787 len = 0; 1788 /* 1789 * We will turn off useFallback after finding a fallback, 1790 * but we still get fallbacks from PUA code points as usual. 1791 * Therefore, we will also need to check that we don't overwrite 1792 * an early fallback with a later one. 1793 */ 1794 useFallback = cnv->useFallback; 1795 1796 for(i = 0; i < choiceCount && len <= 0; ++i) { 1797 uint32_t value; 1798 int32_t len2; 1799 int8_t cs0 = choices[i]; 1800 switch(cs0) { 1801 case ASCII: 1802 if(sourceChar <= 0x7f) { 1803 targetValue = static_cast<uint32_t>(sourceChar); 1804 len = 1; 1805 cs = cs0; 1806 g = 0; 1807 } 1808 break; 1809 case ISO8859_1: 1810 if(GR96_START <= sourceChar && sourceChar <= GR96_END) { 1811 targetValue = static_cast<uint32_t>(sourceChar) - 0x80; 1812 len = 1; 1813 cs = cs0; 1814 g = 2; 1815 } 1816 break; 1817 case HWKANA_7BIT: 1818 if (static_cast<uint32_t>(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { 1819 if(converterData->version==3) { 1820 /* JIS7: use G1 (SO) */ 1821 /* Shift U+FF61..U+FF9F to bytes 21..5F. 
*/ 1822 targetValue = static_cast<uint32_t>(sourceChar - (HWKANA_START - 0x21)); 1823 len = 1; 1824 pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */ 1825 g = 1; 1826 } else if(converterData->version==4) { 1827 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */ 1828 /* Shift U+FF61..U+FF9F to bytes A1..DF. */ 1829 targetValue = static_cast<uint32_t>(sourceChar - (HWKANA_START - 0xa1)); 1830 len = 1; 1831 1832 cs = pFromU2022State->cs[0]; 1833 if(IS_JP_DBCS(cs)) { 1834 /* switch from a DBCS charset to JISX201 */ 1835 cs = static_cast<int8_t>(JISX201); 1836 } 1837 /* else stay in the current G0 charset */ 1838 g = 0; 1839 } 1840 /* else do not use HWKANA_7BIT with other versions */ 1841 } 1842 break; 1843 case JISX201: 1844 /* G0 SBCS */ 1845 value = jisx201FromU(sourceChar); 1846 if(value <= 0x7f) { 1847 targetValue = value; 1848 len = 1; 1849 cs = cs0; 1850 g = 0; 1851 useFallback = false; 1852 } 1853 break; 1854 case JISX208: 1855 /* G0 DBCS from Shift-JIS table */ 1856 len2 = MBCS_FROM_UCHAR32_ISO2022( 1857 converterData->myConverterArray[cs0], 1858 sourceChar, &value, 1859 useFallback, MBCS_OUTPUT_2); 1860 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ 1861 value = _2022FromSJIS(value); 1862 if(value != 0) { 1863 targetValue = value; 1864 len = len2; 1865 cs = cs0; 1866 g = 0; 1867 useFallback = false; 1868 } 1869 } else if(len == 0 && useFallback && 1870 static_cast<uint32_t>(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { 1871 targetValue = hwkana_fb[sourceChar - HWKANA_START]; 1872 len = -2; 1873 cs = cs0; 1874 g = 0; 1875 useFallback = false; 1876 } 1877 break; 1878 case ISO8859_7: 1879 /* G0 SBCS forced to 7-bit output */ 1880 len2 = MBCS_SINGLE_FROM_UCHAR32( 1881 converterData->myConverterArray[cs0], 1882 sourceChar, &value, 1883 useFallback); 1884 if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) { 1885 
targetValue = value - 0x80; 1886 len = len2; 1887 cs = cs0; 1888 g = 2; 1889 useFallback = false; 1890 } 1891 break; 1892 default: 1893 /* G0 DBCS */ 1894 len2 = MBCS_FROM_UCHAR32_ISO2022( 1895 converterData->myConverterArray[cs0], 1896 sourceChar, &value, 1897 useFallback, MBCS_OUTPUT_2); 1898 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ 1899 if(cs0 == KSC5601) { 1900 /* 1901 * Check for valid bytes for the encoding scheme. 1902 * This is necessary because the sub-converter (windows-949) 1903 * has a broader encoding scheme than is valid for 2022. 1904 */ 1905 value = _2022FromGR94DBCS(value); 1906 if(value == 0) { 1907 break; 1908 } 1909 } 1910 targetValue = value; 1911 len = len2; 1912 cs = cs0; 1913 g = 0; 1914 useFallback = false; 1915 } 1916 break; 1917 } 1918 } 1919 1920 if(len != 0) { 1921 if(len < 0) { 1922 len = -len; /* fallback */ 1923 } 1924 outLen = 0; /* count output bytes */ 1925 1926 /* write SI if necessary (only for JIS7) */ 1927 if(pFromU2022State->g == 1 && g == 0) { 1928 buffer[outLen++] = UCNV_SI; 1929 pFromU2022State->g = 0; 1930 } 1931 1932 /* write the designation sequence if necessary */ 1933 if(cs != pFromU2022State->cs[g]) { 1934 int32_t escLen = escSeqCharsLen[cs]; 1935 uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen); 1936 outLen += escLen; 1937 pFromU2022State->cs[g] = cs; 1938 1939 /* invalidate the choices[] */ 1940 choiceCount = 0; 1941 } 1942 1943 /* write the shift sequence if necessary */ 1944 if(g != pFromU2022State->g) { 1945 switch(g) { 1946 /* case 0 handled before writing escapes */ 1947 case 1: 1948 buffer[outLen++] = UCNV_SO; 1949 pFromU2022State->g = 1; 1950 break; 1951 default: /* case 2 */ 1952 buffer[outLen++] = 0x1b; 1953 buffer[outLen++] = 0x4e; 1954 break; 1955 /* no case 3: no SS3 in ISO-2022-JP-x */ 1956 } 1957 } 1958 1959 /* write the output bytes */ 1960 if(len == 1) { 1961 buffer[outLen++] = static_cast<char>(targetValue); 1962 } else /* len == 2 */ { 1963 
buffer[outLen++] = static_cast<char>(targetValue >> 8); 1964 buffer[outLen++] = static_cast<char>(targetValue); 1965 } 1966 } else { 1967 /* 1968 * if we cannot find the character after checking all codepages 1969 * then this is an error 1970 */ 1971 *err = U_INVALID_CHAR_FOUND; 1972 cnv->fromUChar32=sourceChar; 1973 break; 1974 } 1975 1976 if(sourceChar == CR || sourceChar == LF) { 1977 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */ 1978 pFromU2022State->cs[2] = 0; 1979 choiceCount = 0; 1980 } 1981 1982 /* output outLen>0 bytes in buffer[] */ 1983 if(outLen == 1) { 1984 *target++ = buffer[0]; 1985 if(offsets) { 1986 *offsets++ = static_cast<int32_t>(source - args->source - 1); /* -1: known to be ASCII */ 1987 } 1988 } else if(outLen == 2 && (target + 2) <= targetLimit) { 1989 *target++ = buffer[0]; 1990 *target++ = buffer[1]; 1991 if(offsets) { 1992 int32_t sourceIndex = static_cast<int32_t>(source - args->source - U16_LENGTH(sourceChar)); 1993 *offsets++ = sourceIndex; 1994 *offsets++ = sourceIndex; 1995 } 1996 } else { 1997 fromUWriteUInt8( 1998 cnv, 1999 buffer, outLen, 2000 &target, reinterpret_cast<const char*>(targetLimit), 2001 &offsets, static_cast<int32_t>(source - args->source - U16_LENGTH(sourceChar)), 2002 err); 2003 if(U_FAILURE(*err)) { 2004 break; 2005 } 2006 } 2007 } /* end if(myTargetIndex<myTargetLength) */ 2008 else{ 2009 *err =U_BUFFER_OVERFLOW_ERROR; 2010 break; 2011 } 2012 2013 }/* end while(mySourceIndex<mySourceLength) */ 2014 2015 /* 2016 * the end of the input stream and detection of truncated input 2017 * are handled by the framework, but for ISO-2022-JP conversion 2018 * we need to be in ASCII mode at the very end 2019 * 2020 * conditions: 2021 * successful 2022 * in SO mode or not in ASCII mode 2023 * end of input and no truncated input 2024 */ 2025 if( U_SUCCESS(*err) && 2026 (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) && 2027 args->flush && source>=sourceLimit && 
cnv->fromUChar32==0 2028 ) { 2029 int32_t sourceIndex; 2030 2031 outLen = 0; 2032 2033 if(pFromU2022State->g != 0) { 2034 buffer[outLen++] = UCNV_SI; 2035 pFromU2022State->g = 0; 2036 } 2037 2038 if(pFromU2022State->cs[0] != ASCII) { 2039 int32_t escLen = escSeqCharsLen[ASCII]; 2040 uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen); 2041 outLen += escLen; 2042 pFromU2022State->cs[0] = static_cast<int8_t>(ASCII); 2043 } 2044 2045 /* get the source index of the last input character */ 2046 /* 2047 * TODO this would be simpler and more reliable if we used a pair 2048 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c 2049 * so that we could simply use the prevSourceIndex here; 2050 * this code gives an incorrect result for the rare case of an unmatched 2051 * trail surrogate that is alone in the last buffer of the text stream 2052 */ 2053 sourceIndex = static_cast<int32_t>(source - args->source); 2054 if(sourceIndex>0) { 2055 --sourceIndex; 2056 if( U16_IS_TRAIL(args->source[sourceIndex]) && 2057 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) 2058 ) { 2059 --sourceIndex; 2060 } 2061 } else { 2062 sourceIndex=-1; 2063 } 2064 2065 fromUWriteUInt8( 2066 cnv, 2067 buffer, outLen, 2068 &target, reinterpret_cast<const char*>(targetLimit), 2069 &offsets, sourceIndex, 2070 err); 2071 } 2072 2073 /*save the state and return */ 2074 args->source = source; 2075 args->target = reinterpret_cast<char*>(target); 2076 } 2077 2078 /*************** to unicode *******************/ 2079 2080 /* ISO-2022-JP family to-Unicode conversion with optional offsets. Reads bytes from args->source and writes UTF-16 to args->target, tracking designations and shifts in the toU2022State of the UConverterDataISO2022 stored in args->converter->extraInfo. A call first resumes a partially read escape sequence (myData->key != 0) or a double-byte character whose lead byte was saved in toUBytes. SI/SO are honored only for version 3 (JIS7); 8-bit halfwidth Katakana bytes A1..DF only for version 4 (JIS8) while the current charset is not a JP DBCS charset. For version 0, completing an escape sequence while the previous segment is still empty is reported as U_ILLEGAL_ESCAPE_SEQUENCE with reason UCNV_IRREGULAR. On return args->source and args->target are advanced; *err is U_BUFFER_OVERFLOW_ERROR when the target is full, or whatever changeState_2022() or toUnicodeCallback() set for bad input. */ static void U_CALLCONV 2081 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, 2082 UErrorCode* err){ 2083 char tempBuf[2]; 2084 const char* mySource = const_cast<char*>(args->source); 2085 char16_t *myTarget = args->target; 2086 const char *mySourceLimit = args->sourceLimit; 2087 uint32_t targetUniChar = 0x0000; 2088 uint32_t mySourceChar = 0x0000; 2089 uint32_t tmpSourceChar = 0x0000; 2090 UConverterDataISO2022* myData; 2091 ISO2022State 
*pToU2022State; 2092 StateEnum cs; 2093 2094 myData = static_cast<UConverterDataISO2022*>(args->converter->extraInfo); 2095 pToU2022State = &myData->toU2022State; 2096 2097 if(myData->key != 0) { 2098 /* continue with a partial escape sequence */ 2099 goto escape; 2100 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { 2101 /* continue with a partial double-byte character */ 2102 mySourceChar = args->converter->toUBytes[0]; 2103 args->converter->toULength = 0; 2104 cs = static_cast<StateEnum>(pToU2022State->cs[pToU2022State->g]); 2105 targetUniChar = missingCharMarker; 2106 goto getTrailByte; 2107 } 2108 2109 /* main loop: each iteration consumes one source byte, or a byte pair in a DBCS charset */ while(mySource < mySourceLimit){ 2110 2111 targetUniChar =missingCharMarker; 2112 2113 if(myTarget < args->targetLimit){ 2114 2115 mySourceChar = static_cast<unsigned char>(*mySource++); 2116 2117 switch(mySourceChar) { 2118 case UCNV_SI: 2119 if(myData->version==3) { 2120 pToU2022State->g=0; 2121 continue; 2122 } else { 2123 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */ 2124 myData->isEmptySegment = false; /* reset this, we have a different error */ 2125 /* targetUniChar is still missingCharMarker, so this byte ends up in toUnicodeCallback() below */ break; 2126 } 2127 2128 case UCNV_SO: 2129 if(myData->version==3) { 2130 /* JIS7: switch to G1 half-width Katakana */ 2131 pToU2022State->cs[1] = static_cast<int8_t>(HWKANA_7BIT); 2132 pToU2022State->g=1; 2133 continue; 2134 } else { 2135 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */ 2136 myData->isEmptySegment = false; /* reset this, we have a different error */ 2137 break; 2138 } 2139 2140 case ESC_2022: 2141 mySource--; 2142 escape: 2143 { 2144 const char * mySourceBefore = mySource; 2145 int8_t toULengthBefore = args->converter->toULength; 2146 2147 changeState_2022(args->converter,&(mySource), 2148 mySourceLimit, ISO_2022_JP,err); 2149 2150 /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */ 2151 if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && 
myData->isEmptySegment) { 2152 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 2153 args->converter->toUCallbackReason = UCNV_IRREGULAR; 2154 /* toULength: the length pending before the call plus the bytes changeState_2022() just consumed */ args->converter->toULength = static_cast<int8_t>(toULengthBefore + (mySource - mySourceBefore)); 2155 } 2156 } 2157 2158 /* invalid or illegal escape sequence */ 2159 if(U_FAILURE(*err)){ 2160 args->target = myTarget; 2161 args->source = mySource; 2162 myData->isEmptySegment = false; /* Reset to avoid future spurious errors */ 2163 return; 2164 } 2165 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */ 2166 if(myData->key==0) { 2167 myData->isEmptySegment = true; 2168 } 2169 continue; 2170 2171 /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */ 2172 2173 case CR: 2174 case LF: 2175 /* automatically reset to single-byte mode */ 2176 if (static_cast<StateEnum>(pToU2022State->cs[0]) != ASCII && 2177 static_cast<StateEnum>(pToU2022State->cs[0]) != JISX201) { 2178 pToU2022State->cs[0] = static_cast<int8_t>(ASCII); 2179 } 2180 pToU2022State->cs[2] = 0; 2181 pToU2022State->g = 0; 2182 U_FALLTHROUGH; 2183 default: 2184 /* convert one or two bytes */ 2185 myData->isEmptySegment = false; 2186 cs = static_cast<StateEnum>(pToU2022State->cs[pToU2022State->g]); 2187 if (static_cast<uint8_t>(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version == 4 && 2188 !IS_JP_DBCS(cs) 2189 ) { 2190 /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */ 2191 targetUniChar = mySourceChar + (HWKANA_START - 0xa1); 2192 2193 /* return from a single-shift state to the previous one */ 2194 if(pToU2022State->g >= 2) { 2195 pToU2022State->g=pToU2022State->prevG; 2196 } 2197 } else switch(cs) { 2198 case ASCII: 2199 if(mySourceChar <= 0x7f) { 2200 targetUniChar = mySourceChar; 2201 } 2202 break; 2203 case ISO8859_1: 2204 if(mySourceChar <= 0x7f) { 2205 targetUniChar = mySourceChar + 0x80; 2206 } 2207 /* return from a single-shift state to the previous one */ 2208 pToU2022State->g=pToU2022State->prevG; 2209 break; 
2210 case ISO8859_7: 2211 if(mySourceChar <= 0x7f) { 2212 /* convert mySourceChar+0x80 to use a normal 8-bit table */ 2213 targetUniChar = 2214 _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP( 2215 myData->myConverterArray[cs], 2216 mySourceChar + 0x80); 2217 } 2218 /* return from a single-shift state to the previous one */ 2219 pToU2022State->g=pToU2022State->prevG; 2220 break; 2221 case JISX201: 2222 if(mySourceChar <= 0x7f) { 2223 targetUniChar = jisx201ToU(mySourceChar); 2224 } 2225 break; 2226 case HWKANA_7BIT: 2227 if (static_cast<uint8_t>(mySourceChar - 0x21) <= (0x5f - 0x21)) { 2228 /* 7-bit halfwidth Katakana */ 2229 targetUniChar = mySourceChar + (HWKANA_START - 0x21); 2230 } 2231 break; 2232 default: 2233 /* G0 DBCS */ 2234 if(mySource < mySourceLimit) { 2235 int leadIsOk, trailIsOk; 2236 uint8_t trailByte; 2237 getTrailByte: /* also entered directly from the top of the function to finish a pair whose lead byte was saved in toUBytes */ 2238 trailByte = static_cast<uint8_t>(*mySource); 2239 /* 2240 * Ticket 5691: consistent illegal sequences: 2241 * - We include at least the first byte in the illegal sequence. 2242 * - If any of the non-initial bytes could be the start of a character, 2243 * we stop the illegal sequence before the first one of those. 2244 * 2245 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is 2246 * an ESC/SO/SI, we report only the first byte as the illegal sequence. 2247 * Otherwise we convert or report the pair of bytes. 2248 */ 2249 leadIsOk = static_cast<uint8_t>(mySourceChar - 0x21) <= (0x7e - 0x21); 2250 trailIsOk = static_cast<uint8_t>(trailByte - 0x21) <= (0x7e - 0x21); 2251 if (leadIsOk && trailIsOk) { 2252 ++mySource; 2253 tmpSourceChar = (mySourceChar << 8) | trailByte; 2254 if(cs == JISX208) { 2255 _2022ToSJIS(static_cast<uint8_t>(mySourceChar), trailByte, tempBuf); 2256 mySourceChar = tmpSourceChar; 2257 } else { 2258 /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. 
*/ 2259 mySourceChar = tmpSourceChar; 2260 if (cs == KSC5601) { 2261 tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */ 2262 } 2263 tempBuf[0] = static_cast<char>(tmpSourceChar >> 8); 2264 tempBuf[1] = static_cast<char>(tmpSourceChar); 2265 } 2266 targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, false); 2267 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { 2268 /* report a pair of illegal bytes if the second byte is not a DBCS starter */ 2269 ++mySource; 2270 /* add another bit so that the code below writes 2 bytes in case of error */ 2271 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; 2272 } 2273 } else { 2274 /* the input ended right after a lead byte: save it in toUBytes so that the next call can resume at getTrailByte */ args->converter->toUBytes[0] = static_cast<uint8_t>(mySourceChar); 2275 args->converter->toULength = 1; 2276 goto endloop; 2277 } 2278 } /* End of inner switch */ 2279 break; 2280 } /* End of outer switch */ 2281 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){ 2282 if(args->offsets){ 2283 /* offset of the first source byte of this character: 1 or 2 bytes before mySource */ args->offsets[myTarget - args->target] = static_cast<int32_t>(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); 2284 } 2285 *(myTarget++) = static_cast<char16_t>(targetUniChar); 2286 } 2287 else if(targetUniChar > missingCharMarker){ 2288 /* disassemble the surrogate pair and write to output*/ 2289 targetUniChar-=0x0010000; 2290 *myTarget = static_cast<char16_t>(0xd800 + static_cast<char16_t>(targetUniChar >> 10)); 2291 if(args->offsets){ 2292 args->offsets[myTarget - args->target] = static_cast<int32_t>(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); 2293 } 2294 ++myTarget; 2295 if(myTarget< args->targetLimit){ 2296 *myTarget = static_cast<char16_t>(0xdc00 + static_cast<char16_t>(targetUniChar & 0x3ff)); 2297 if(args->offsets){ 2298 args->offsets[myTarget - args->target] = static_cast<int32_t>(mySource - args->source - (mySourceChar <= 0xff ? 
1 : 2)); 2299 } 2300 ++myTarget; 2301 }else{ 2302 /* the target is full: keep the trail surrogate in the converter's UCharErrorBuffer */ args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= 2303 static_cast<char16_t>(0xdc00 + static_cast<char16_t>(targetUniChar & 0x3ff)); 2304 } 2305 2306 } 2307 else{ 2308 /* Call the callback function*/ 2309 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); 2310 break; 2311 } 2312 } 2313 else{ /* goes with "if(myTarget < args->targetLimit)" way up near top of function */ 2314 *err =U_BUFFER_OVERFLOW_ERROR; 2315 break; 2316 } 2317 } 2318 endloop: /* hand the updated read and write positions back to the caller */ 2319 args->target = myTarget; 2320 args->source = mySource; 2321 } 2322 2323 2324 #if !UCONFIG_ONLY_HTML_CONVERSION 2325 /*************************************************************** 2326 * Rules for ISO-2022-KR encoding 2327 * i) The KSC5601 designator sequence should appear only once in a file, 2328 * at the beginning of a line before any KSC5601 characters. This usually 2329 * means that it appears by itself on the first line of the file 2330 * ii) There are only 2 shifting sequences SO to shift into double byte mode 2331 * and SI to shift into single byte mode 2332 */ /* From-Unicode path used by the ISO-2022-KR converter for version 1 (ibm-25546): temporarily points args->converter at the MBCS sub-converter in currentConverter, carries fromUChar32 across in both directions around ucnv_MBCSFromUnicodeWithOffsets(), moves overflow bytes from the sub-converter's charErrorBuffer to the public converter on U_BUFFER_OVERFLOW_ERROR, then restores args->converter. */ 2333 static void U_CALLCONV 2334 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){ 2335 2336 UConverter* saveConv = args->converter; 2337 UConverterDataISO2022* myConverterData = static_cast<UConverterDataISO2022*>(saveConv->extraInfo); 2338 args->converter=myConverterData->currentConverter; 2339 2340 myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32; 2341 ucnv_MBCSFromUnicodeWithOffsets(args,err); 2342 saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32; 2343 2344 if(*err == U_BUFFER_OVERFLOW_ERROR) { 2345 if(myConverterData->currentConverter->charErrorBufferLength > 0) { 2346 uprv_memcpy( 2347 saveConv->charErrorBuffer, 2348 myConverterData->currentConverter->charErrorBuffer, 2349 myConverterData->currentConverter->charErrorBufferLength); 2350 } 2351 
saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength; 2352 myConverterData->currentConverter->charErrorBufferLength = 0; 2353 } 2354 args->converter=saveConv; 2355 } 2356
/*
 * ISO-2022-KR from-Unicode conversion with optional offsets.
 *
 * Reads UTF-16 from args->source and writes ISO-2022-KR bytes to args->target.
 * Version 1 (ibm-25546) is handed to the _IBM variant unchanged. For all other
 * versions each code unit is looked up in the sub-converter's shared data:
 *   - single-byte results must be <= 0x7f;
 *   - double-byte results must have both bytes in A1..FE and are written with
 *     0x80 subtracted from each byte;
 *   - SO is written before the first double-byte character after single-byte
 *     output, SI before the first single-byte character after double-byte output.
 * SO, SI and ESC in the input are rejected with U_ILLEGAL_CHAR_FOUND.
 * Surrogates are never converted: a complete pair is reported as
 * U_INVALID_CHAR_FOUND, an unmatched surrogate as U_ILLEGAL_CHAR_FOUND, and a
 * lead surrogate at the end of the buffer is kept in fromUChar32 for the next call.
 *
 * The current single/double-byte mode is stored in converter->fromUnicodeStatus
 * between calls. When flushing at the end of the input while in double-byte
 * mode, a final SI is written so that the stream ends in ASCII.
 *
 * Bytes that do not fit into the target go to converter->charErrorBuffer and
 * *err is set to U_BUFFER_OVERFLOW_ERROR.
 *
 * @param args  conversion arguments; source, target and offsets are advanced
 * @param err   in/out error code
 */
static void U_CALLCONV
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){

    const char16_t *source = args->source;
    const char16_t *sourceLimit = args->sourceLimit;
    unsigned char *target = reinterpret_cast<unsigned char*>(args->target);
    unsigned char *targetLimit = reinterpret_cast<unsigned char*>(const_cast<char*>(args->targetLimit));
    int32_t* offsets = args->offsets;
    uint32_t targetByteUnit = 0x0000;
    UChar32 sourceChar = 0x0000;
    UBool isTargetByteDBCS;
    UBool oldIsTargetByteDBCS;
    UConverterDataISO2022 *converterData;
    UConverterSharedData* sharedData;
    UBool useFallback;
    int32_t length =0;

    converterData = static_cast<UConverterDataISO2022*>(args->converter->extraInfo);
    /* if the version is 1 then the user is requesting
     * conversion with ibm-25546 pass the arguments to
     * MBCS converter and return
     */
    if(converterData->version==1){
        UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
        return;
    }

    /* initialize data */
    sharedData = converterData->currentConverter->sharedData;
    useFallback = args->converter->useFallback;
    /* fromUnicodeStatus carries the single/double-byte mode across calls; read it once */
    isTargetByteDBCS = static_cast<UBool>(args->converter->fromUnicodeStatus);
    oldIsTargetByteDBCS = isTargetByteDBCS;

    /* check if the last code unit of the previous buffer was a lead surrogate */
    if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
        goto getTrail;
    }
    while(source < sourceLimit){

        targetByteUnit = missingCharMarker;

        if(target < targetLimit){
            sourceChar = *source++;

            /* do not convert SO/SI/ESC */
            if(IS_2022_CONTROL(sourceChar)) {
                /* callback(illegal) */
                *err=U_ILLEGAL_CHAR_FOUND;
                args->converter->fromUChar32=sourceChar;
                break;
            }

            length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
            if(length < 0) {
                length = -length; /* fallback */
            }
            /* only DBCS or SBCS characters are expected*/
            /* DB characters with high bit set to 1 are expected */
            if( length > 2 || length==0 ||
                (length == 1 && targetByteUnit > 0x7f) ||
                (length == 2 &&
                    (static_cast<uint16_t>(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
                    static_cast<uint8_t>(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
            ) {
                targetByteUnit=missingCharMarker;
            }
            if (targetByteUnit != missingCharMarker){

                oldIsTargetByteDBCS = isTargetByteDBCS;
                isTargetByteDBCS = static_cast<UBool>(targetByteUnit > 0x00FF);
                /* append the shift sequence; the enclosing check guarantees room for this one byte */
                if (oldIsTargetByteDBCS != isTargetByteDBCS ){

                    if (isTargetByteDBCS) {
                        *target++ = UCNV_SO;
                    } else {
                        *target++ = UCNV_SI;
                    }
                    if(offsets) {
                        *(offsets++) = static_cast<int32_t>(source - args->source - 1);
                    }
                }
                /* write the targetUniChar to target */
                if(targetByteUnit <= 0x00FF){
                    if( target < targetLimit){
                        *(target++) = static_cast<unsigned char>(targetByteUnit);
                        if(offsets){
                            *(offsets++) = static_cast<int32_t>(source - args->source - 1);
                        }

                    }else{
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = static_cast<unsigned char>(targetByteUnit);
                        *err = U_BUFFER_OVERFLOW_ERROR;
                    }
                }else{
                    /* double-byte: strip the high bit of each byte for 7-bit ISO-2022 output */
                    if(target < targetLimit){
                        *(target++) = static_cast<unsigned char>((targetByteUnit >> 8) - 0x80);
                        if(offsets){
                            *(offsets++) = static_cast<int32_t>(source - args->source - 1);
                        }
                        if(target < targetLimit){
                            *(target++) = static_cast<unsigned char>(targetByteUnit - 0x80);
                            if(offsets){
                                *(offsets++) = static_cast<int32_t>(source - args->source - 1);
                            }
                        }else{
                            args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = static_cast<unsigned char>(targetByteUnit - 0x80);
                            *err = U_BUFFER_OVERFLOW_ERROR;
                        }
                    }else{
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = static_cast<unsigned char>((targetByteUnit >> 8) - 0x80);
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = static_cast<unsigned char>(targetByteUnit - 0x80);
                        *err = U_BUFFER_OVERFLOW_ERROR;
                    }
                }

            }
            else{
                /* oops.. the code point is unassigned
                 * set the error and reason
                 */

                /*check if the char is a First surrogate*/
                if(U16_IS_SURROGATE(sourceChar)) {
                    if(U16_IS_SURROGATE_LEAD(sourceChar)) {
getTrail:
                        /*look ahead to find the trail surrogate*/
                        if(source < sourceLimit) {
                            /* test the following code unit */
                            char16_t trail = *source;
                            if(U16_IS_TRAIL(trail)) {
                                source++;
                                sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
                                *err = U_INVALID_CHAR_FOUND;
                                /* convert this surrogate code point */
                                /* exit this condition tree */
                            } else {
                                /* this is an unmatched lead code unit (1st surrogate) */
                                /* callback(illegal) */
                                *err=U_ILLEGAL_CHAR_FOUND;
                            }
                        } else {
                            /* no more input: the lead surrogate stays in fromUChar32 for the next call */
                            *err = U_ZERO_ERROR;
                        }
                    } else {
                        /* this is an unmatched trail code unit (2nd surrogate) */
                        /* callback(illegal) */
                        *err=U_ILLEGAL_CHAR_FOUND;
                    }
                } else {
                    /* callback(unassigned) for a BMP code point */
                    *err = U_INVALID_CHAR_FOUND;
                }

                args->converter->fromUChar32=sourceChar;
                break;
            }
        } /* end if(myTargetIndex<myTargetLength) */
        else{
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }

    }/* end while(mySourceIndex<mySourceLength) */

    /*
     * the end of the input stream and detection of truncated input
     * are handled by the framework, but for ISO-2022-KR conversion
     * we need to be in ASCII mode at the very end
     *
     * conditions:
     *   successful
     *   not in ASCII mode
     *   end of input and no truncated input
     */
    if( U_SUCCESS(*err) &&
        isTargetByteDBCS &&
        args->flush && source>=sourceLimit && args->converter->fromUChar32==0
    ) {
        int32_t sourceIndex;

        /* we are switching to ASCII */
        isTargetByteDBCS=false;

        /* get the source index of the last input character */
        /*
         * TODO this would be simpler and more reliable if we used a pair
         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
         * so that we could simply use the prevSourceIndex here;
         * this code gives an incorrect result for the rare case of an unmatched
         * trail surrogate that is alone in the last buffer of the text stream
         */
        sourceIndex = static_cast<int32_t>(source - args->source);
        if(sourceIndex>0) {
            --sourceIndex;
            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
            ) {
                --sourceIndex;
            }
        } else {
            sourceIndex=-1;
        }

        fromUWriteUInt8(
            args->converter,
            SHIFT_IN_STR, 1,
            &target, reinterpret_cast<const char*>(targetLimit),
            &offsets, sourceIndex,
            err);
    }

    /*save the state and return */
    args->source = source;
    args->target = reinterpret_cast<char*>(target);
    args->converter->fromUnicodeStatus = static_cast<uint32_t>(isTargetByteDBCS);
}
2573 2574 /************************ To Unicode ***************************************/ 2575 2576 static void U_CALLCONV 2577 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args, 2578 UErrorCode* err){ 2579 char const* sourceStart; 2580 UConverterDataISO2022* myData = static_cast<UConverterDataISO2022*>(args->converter->extraInfo); 2581 2582
UConverterToUnicodeArgs subArgs; 2583 int32_t minArgsSize; 2584 2585 /* set up the subconverter arguments */ 2586 if(args->size<sizeof(UConverterToUnicodeArgs)) { 2587 minArgsSize = args->size; 2588 } else { 2589 minArgsSize = static_cast<int32_t>(sizeof(UConverterToUnicodeArgs)); 2590 } 2591 2592 uprv_memcpy(&subArgs, args, minArgsSize); 2593 subArgs.size = static_cast<uint16_t>(minArgsSize); 2594 subArgs.converter = myData->currentConverter; 2595 2596 /* remember the original start of the input for offsets */ 2597 sourceStart = args->source; 2598 2599 if(myData->key != 0) { 2600 /* continue with a partial escape sequence */ 2601 goto escape; 2602 } 2603 2604 while(U_SUCCESS(*err) && args->source < args->sourceLimit) { 2605 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/ 2606 subArgs.source = args->source; 2607 subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush); 2608 if(subArgs.source != subArgs.sourceLimit) { 2609 /* 2610 * get the current partial byte sequence 2611 * 2612 * it needs to be moved between the public and the subconverter 2613 * so that the conversion framework, which only sees the public 2614 * converter, can handle truncated and illegal input etc. 2615 */ 2616 if(args->converter->toULength > 0) { 2617 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength); 2618 } 2619 subArgs.converter->toULength = args->converter->toULength; 2620 2621 /* 2622 * Convert up to the end of the input, or to before the next escape character. 2623 * Does not handle conversion extensions because the preToU[] state etc. 2624 * is not copied. 
2625 */ 2626 ucnv_MBCSToUnicodeWithOffsets(&subArgs, err); 2627 2628 if(args->offsets != nullptr && sourceStart != args->source) { 2629 /* update offsets to base them on the actual start of the input */ 2630 int32_t *offsets = args->offsets; 2631 char16_t *target = args->target; 2632 int32_t delta = static_cast<int32_t>(args->source - sourceStart); 2633 while(target < subArgs.target) { 2634 if(*offsets >= 0) { 2635 *offsets += delta; 2636 } 2637 ++offsets; 2638 ++target; 2639 } 2640 } 2641 args->source = subArgs.source; 2642 args->target = subArgs.target; 2643 args->offsets = subArgs.offsets; 2644 2645 /* copy input/error/overflow buffers */ 2646 if(subArgs.converter->toULength > 0) { 2647 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength); 2648 } 2649 args->converter->toULength = subArgs.converter->toULength; 2650 2651 if(*err == U_BUFFER_OVERFLOW_ERROR) { 2652 if(subArgs.converter->UCharErrorBufferLength > 0) { 2653 uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer, 2654 subArgs.converter->UCharErrorBufferLength); 2655 } 2656 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength; 2657 subArgs.converter->UCharErrorBufferLength = 0; 2658 } 2659 } 2660 2661 if (U_FAILURE(*err) || (args->source == args->sourceLimit)) { 2662 return; 2663 } 2664 2665 escape: 2666 changeState_2022(args->converter, 2667 &(args->source), 2668 args->sourceLimit, 2669 ISO_2022_KR, 2670 err); 2671 } 2672 } 2673 2674 static void U_CALLCONV 2675 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, 2676 UErrorCode* err){ 2677 char tempBuf[2]; 2678 const char* mySource = const_cast<char*>(args->source); 2679 char16_t *myTarget = args->target; 2680 const char *mySourceLimit = args->sourceLimit; 2681 UChar32 targetUniChar = 0x0000; 2682 char16_t mySourceChar = 0x0000; 2683 UConverterDataISO2022* myData; 2684 UConverterSharedData* sharedData ; 2685 UBool useFallback; 
2686 2687 myData = static_cast<UConverterDataISO2022*>(args->converter->extraInfo); 2688 if(myData->version==1){ 2689 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err); 2690 return; 2691 } 2692 2693 /* initialize state */ 2694 sharedData = myData->currentConverter->sharedData; 2695 useFallback = args->converter->useFallback; 2696 2697 if(myData->key != 0) { 2698 /* continue with a partial escape sequence */ 2699 goto escape; 2700 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { 2701 /* continue with a partial double-byte character */ 2702 mySourceChar = args->converter->toUBytes[0]; 2703 args->converter->toULength = 0; 2704 goto getTrailByte; 2705 } 2706 2707 while(mySource< mySourceLimit){ 2708 2709 if(myTarget < args->targetLimit){ 2710 2711 mySourceChar = static_cast<unsigned char>(*mySource++); 2712 2713 if(mySourceChar==UCNV_SI){ 2714 myData->toU2022State.g = 0; 2715 if (myData->isEmptySegment) { 2716 myData->isEmptySegment = false; /* we are handling it, reset to avoid future spurious errors */ 2717 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 2718 args->converter->toUCallbackReason = UCNV_IRREGULAR; 2719 args->converter->toUBytes[0] = static_cast<uint8_t>(mySourceChar); 2720 args->converter->toULength = 1; 2721 args->target = myTarget; 2722 args->source = mySource; 2723 return; 2724 } 2725 /*consume the source */ 2726 continue; 2727 }else if(mySourceChar==UCNV_SO){ 2728 myData->toU2022State.g = 1; 2729 myData->isEmptySegment = true; /* Begin a new segment, empty so far */ 2730 /*consume the source */ 2731 continue; 2732 }else if(mySourceChar==ESC_2022){ 2733 mySource--; 2734 escape: 2735 myData->isEmptySegment = false; /* Any invalid ESC sequences will be detected separately, so just reset this */ 2736 changeState_2022(args->converter,&(mySource), 2737 mySourceLimit, ISO_2022_KR, err); 2738 if(U_FAILURE(*err)){ 2739 args->target = myTarget; 2740 args->source = mySource; 2741 return; 2742 } 2743 
continue; 2744 } 2745 2746 myData->isEmptySegment = false; /* Any invalid char errors will be detected separately, so just reset this */ 2747 if(myData->toU2022State.g == 1) { 2748 if(mySource < mySourceLimit) { 2749 int leadIsOk, trailIsOk; 2750 uint8_t trailByte; 2751 getTrailByte: 2752 targetUniChar = missingCharMarker; 2753 trailByte = static_cast<uint8_t>(*mySource); 2754 /* 2755 * Ticket 5691: consistent illegal sequences: 2756 * - We include at least the first byte in the illegal sequence. 2757 * - If any of the non-initial bytes could be the start of a character, 2758 * we stop the illegal sequence before the first one of those. 2759 * 2760 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is 2761 * an ESC/SO/SI, we report only the first byte as the illegal sequence. 2762 * Otherwise we convert or report the pair of bytes. 2763 */ 2764 leadIsOk = static_cast<uint8_t>(mySourceChar - 0x21) <= (0x7e - 0x21); 2765 trailIsOk = static_cast<uint8_t>(trailByte - 0x21) <= (0x7e - 0x21); 2766 if (leadIsOk && trailIsOk) { 2767 ++mySource; 2768 tempBuf[0] = static_cast<char>(mySourceChar + 0x80); 2769 tempBuf[1] = static_cast<char>(trailByte + 0x80); 2770 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback); 2771 mySourceChar = (mySourceChar << 8) | trailByte; 2772 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { 2773 /* report a pair of illegal bytes if the second byte is not a DBCS starter */ 2774 ++mySource; 2775 /* add another bit so that the code below writes 2 bytes in case of error */ 2776 mySourceChar = static_cast<char16_t>(0x10000 | (mySourceChar << 8) | trailByte); 2777 } 2778 } else { 2779 args->converter->toUBytes[0] = static_cast<uint8_t>(mySourceChar); 2780 args->converter->toULength = 1; 2781 break; 2782 } 2783 } 2784 else if(mySourceChar <= 0x7f) { 2785 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback); 2786 } else { 2787 targetUniChar = 0xffff; 2788 } 2789 
if(targetUniChar < 0xfffe){ 2790 if(args->offsets) { 2791 args->offsets[myTarget - args->target] = static_cast<int32_t>(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); 2792 } 2793 *(myTarget++) = static_cast<char16_t>(targetUniChar); 2794 } 2795 else { 2796 /* Call the callback function*/ 2797 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); 2798 break; 2799 } 2800 } 2801 else{ 2802 *err =U_BUFFER_OVERFLOW_ERROR; 2803 break; 2804 } 2805 } 2806 args->target = myTarget; 2807 args->source = mySource; 2808 } 2809 2810 /*************************** END ISO2022-KR *********************************/ 2811 2812 /*************************** ISO-2022-CN ********************************* 2813 * 2814 * Rules for ISO-2022-CN Encoding: 2815 * i) The designator sequence must appear once on a line before any instance 2816 * of character set it designates. 2817 * ii) If two lines contain characters from the same character set, both lines 2818 * must include the designator sequence. 2819 * iii) Once the designator sequence is known, a shifting sequence has to be found 2820 * to invoke the shifting 2821 * iv) All lines start in ASCII and end in ASCII. 
2822 * v) Four shifting sequences are employed for this purpose: 2823 * 2824 * Sequence ASCII Eq Charsets 2825 * ---------- ------- --------- 2826 * SI <SI> US-ASCII 2827 * SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165 2828 * SS2 <ESC>N CNS-11643-1992 Plane 2 2829 * SS3 <ESC>O CNS-11643-1992 Planes 3-7 2830 * 2831 * vi) 2832 * SOdesignator : ESC "$" ")" finalchar_for_SO 2833 * SS2designator : ESC "$" "*" finalchar_for_SS2 2834 * SS3designator : ESC "$" "+" finalchar_for_SS3 2835 * 2836 * ESC $ ) A Indicates the bytes following SO are Chinese 2837 * characters as defined in GB 2312-80, until 2838 * another SOdesignation appears 2839 * 2840 * 2841 * ESC $ ) E Indicates the bytes following SO are as defined 2842 * in ISO-IR-165 (for details, see section 2.1), 2843 * until another SOdesignation appears 2844 * 2845 * ESC $ ) G Indicates the bytes following SO are as defined 2846 * in CNS 11643-plane-1, until another 2847 * SOdesignation appears 2848 * 2849 * ESC $ * H Indicates the two bytes immediately following 2850 * SS2 is a Chinese character as defined in CNS 2851 * 11643-plane-2, until another SS2designation 2852 * appears 2853 * (Meaning <ESC>N must precede every 2 byte 2854 * sequence.) 2855 * 2856 * ESC $ + I Indicates the immediate two bytes following SS3 2857 * is a Chinese character as defined in CNS 2858 * 11643-plane-3, until another SS3designation 2859 * appears 2860 * (Meaning <ESC>O must precede every 2 byte 2861 * sequence.) 2862 * 2863 * ESC $ + J Indicates the immediate two bytes following SS3 2864 * is a Chinese character as defined in CNS 2865 * 11643-plane-4, until another SS3designation 2866 * appears 2867 * (In English: <ESC>O must precede every 2 byte 2868 * sequence.) 
2869 * 2870 * ESC $ + K Indicates the immediate two bytes following SS3 2871 * is a Chinese character as defined in CNS 2872 * 11643-plane-5, until another SS3designation 2873 * appears 2874 * 2875 * ESC $ + L Indicates the immediate two bytes following SS3 2876 * is a Chinese character as defined in CNS 2877 * 11643-plane-6, until another SS3designation 2878 * appears 2879 * 2880 * ESC $ + M Indicates the immediate two bytes following SS3 2881 * is a Chinese character as defined in CNS 2882 * 11643-plane-7, until another SS3designation 2883 * appears 2884 * 2885 * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and 2886 * has its own designation information before any Chinese characters 2887 * appear 2888 * 2889 */ 2890 2891 /* The following are defined this way to make the strings truly readonly */ 2892 static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41"; 2893 static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45"; 2894 static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47"; 2895 static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48"; 2896 static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49"; 2897 static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A"; 2898 static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B"; 2899 static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C"; 2900 static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D"; 2901 2902 /********************** ISO2022-CN Data **************************/ 2903 static const char* const escSeqCharsCN[10] ={ 2904 SHIFT_IN_STR, /* 0 ASCII */ 2905 GB_2312_80_STR, /* 1 GB2312_1 */ 2906 ISO_IR_165_STR, /* 2 ISO_IR_165 */ 2907 CNS_11643_1992_Plane_1_STR, 2908 CNS_11643_1992_Plane_2_STR, 2909 CNS_11643_1992_Plane_3_STR, 2910 CNS_11643_1992_Plane_4_STR, 2911 CNS_11643_1992_Plane_5_STR, 2912 CNS_11643_1992_Plane_6_STR, 2913 CNS_11643_1992_Plane_7_STR 2914 }; 2915 2916 static void U_CALLCONV 2917 
UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){ 2918 UConverter *cnv = args->converter; 2919 UConverterDataISO2022 *converterData; 2920 ISO2022State *pFromU2022State; 2921 uint8_t* target = reinterpret_cast<uint8_t*>(args->target); 2922 const uint8_t* targetLimit = reinterpret_cast<const uint8_t*>(args->targetLimit); 2923 const char16_t* source = args->source; 2924 const char16_t* sourceLimit = args->sourceLimit; 2925 int32_t* offsets = args->offsets; 2926 UChar32 sourceChar; 2927 char buffer[8]; 2928 int32_t len; 2929 int8_t choices[3]; 2930 int32_t choiceCount; 2931 uint32_t targetValue = 0; 2932 UBool useFallback; 2933 2934 /* set up the state */ 2935 converterData = static_cast<UConverterDataISO2022*>(cnv->extraInfo); 2936 pFromU2022State = &converterData->fromU2022State; 2937 2938 choiceCount = 0; 2939 2940 /* check if the last codepoint of previous buffer was a lead surrogate*/ 2941 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) { 2942 goto getTrail; 2943 } 2944 2945 while( source < sourceLimit){ 2946 if(target < targetLimit){ 2947 2948 sourceChar = *(source++); 2949 /*check if the char is a First surrogate*/ 2950 if(U16_IS_SURROGATE(sourceChar)) { 2951 if(U16_IS_SURROGATE_LEAD(sourceChar)) { 2952 getTrail: 2953 /*look ahead to find the trail surrogate*/ 2954 if(source < sourceLimit) { 2955 /* test the following code unit */ 2956 char16_t trail = *source; 2957 if(U16_IS_TRAIL(trail)) { 2958 source++; 2959 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); 2960 cnv->fromUChar32=0x00; 2961 /* convert this supplementary code point */ 2962 /* exit this condition tree */ 2963 } else { 2964 /* this is an unmatched lead code unit (1st surrogate) */ 2965 /* callback(illegal) */ 2966 *err=U_ILLEGAL_CHAR_FOUND; 2967 cnv->fromUChar32=sourceChar; 2968 break; 2969 } 2970 } else { 2971 /* no more input */ 2972 cnv->fromUChar32=sourceChar; 2973 break; 2974 } 2975 } else { 2976 /* this is an unmatched trail 
code unit (2nd surrogate) */ 2977 /* callback(illegal) */ 2978 *err=U_ILLEGAL_CHAR_FOUND; 2979 cnv->fromUChar32=sourceChar; 2980 break; 2981 } 2982 } 2983 2984 /* do the conversion */ 2985 if(sourceChar <= 0x007f ){ 2986 /* do not convert SO/SI/ESC */ 2987 if(IS_2022_CONTROL(sourceChar)) { 2988 /* callback(illegal) */ 2989 *err=U_ILLEGAL_CHAR_FOUND; 2990 cnv->fromUChar32=sourceChar; 2991 break; 2992 } 2993 2994 /* US-ASCII */ 2995 if(pFromU2022State->g == 0) { 2996 buffer[0] = static_cast<char>(sourceChar); 2997 len = 1; 2998 } else { 2999 buffer[0] = UCNV_SI; 3000 buffer[1] = static_cast<char>(sourceChar); 3001 len = 2; 3002 pFromU2022State->g = 0; 3003 choiceCount = 0; 3004 } 3005 if(sourceChar == CR || sourceChar == LF) { 3006 /* reset the state at the end of a line */ 3007 uprv_memset(pFromU2022State, 0, sizeof(ISO2022State)); 3008 choiceCount = 0; 3009 } 3010 } 3011 else{ 3012 /* convert U+0080..U+10ffff */ 3013 int32_t i; 3014 int8_t cs, g; 3015 3016 if(choiceCount == 0) { 3017 /* try the current SO/G1 converter first */ 3018 choices[0] = pFromU2022State->cs[1]; 3019 3020 /* default to GB2312_1 if none is designated yet */ 3021 if(choices[0] == 0) { 3022 choices[0] = GB2312_1; 3023 } 3024 3025 if(converterData->version == 0) { 3026 /* ISO-2022-CN */ 3027 3028 /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */ 3029 if(choices[0] == GB2312_1) { 3030 choices[1] = static_cast<int8_t>(CNS_11643_1); 3031 } else { 3032 choices[1] = static_cast<int8_t>(GB2312_1); 3033 } 3034 3035 choiceCount = 2; 3036 } else if (converterData->version == 1) { 3037 /* ISO-2022-CN-EXT */ 3038 3039 /* try one of the other converters */ 3040 switch(choices[0]) { 3041 case GB2312_1: 3042 choices[1] = static_cast<int8_t>(CNS_11643_1); 3043 choices[2] = static_cast<int8_t>(ISO_IR_165); 3044 break; 3045 case ISO_IR_165: 3046 choices[1] = static_cast<int8_t>(GB2312_1); 3047 choices[2] = static_cast<int8_t>(CNS_11643_1); 3048 break; 3049 default: /* CNS_11643_x 
*/ 3050 choices[1] = static_cast<int8_t>(GB2312_1); 3051 choices[2] = static_cast<int8_t>(ISO_IR_165); 3052 break; 3053 } 3054 3055 choiceCount = 3; 3056 } else { 3057 choices[0] = static_cast<int8_t>(CNS_11643_1); 3058 choices[1] = static_cast<int8_t>(GB2312_1); 3059 } 3060 } 3061 3062 cs = g = 0; 3063 /* 3064 * len==0: no mapping found yet 3065 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks 3066 * len>0: found a roundtrip result, done 3067 */ 3068 len = 0; 3069 /* 3070 * We will turn off useFallback after finding a fallback, 3071 * but we still get fallbacks from PUA code points as usual. 3072 * Therefore, we will also need to check that we don't overwrite 3073 * an early fallback with a later one. 3074 */ 3075 useFallback = cnv->useFallback; 3076 3077 for(i = 0; i < choiceCount && len <= 0; ++i) { 3078 int8_t cs0 = choices[i]; 3079 if(cs0 > 0) { 3080 uint32_t value; 3081 int32_t len2; 3082 if(cs0 >= CNS_11643_0) { 3083 len2 = MBCS_FROM_UCHAR32_ISO2022( 3084 converterData->myConverterArray[CNS_11643], 3085 sourceChar, 3086 &value, 3087 useFallback, 3088 MBCS_OUTPUT_3); 3089 if(len2 == 3 || (len2 == -3 && len == 0)) { 3090 targetValue = value; 3091 cs = static_cast<int8_t>(CNS_11643_0 + (value >> 16) - 0x80); 3092 if(len2 >= 0) { 3093 len = 2; 3094 } else { 3095 len = -2; 3096 useFallback = false; 3097 } 3098 if(cs == CNS_11643_1) { 3099 g = 1; 3100 } else if(cs == CNS_11643_2) { 3101 g = 2; 3102 } else /* plane 3..7 */ if(converterData->version == 1) { 3103 g = 3; 3104 } else { 3105 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */ 3106 len = 0; 3107 } 3108 } 3109 } else { 3110 /* GB2312_1 or ISO-IR-165 */ 3111 U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS); 3112 len2 = MBCS_FROM_UCHAR32_ISO2022( 3113 converterData->myConverterArray[cs0], 3114 sourceChar, 3115 &value, 3116 useFallback, 3117 MBCS_OUTPUT_2); 3118 if(len2 == 2 || (len2 == -2 && len == 0)) { 3119 targetValue = value; 3120 len = len2; 3121 cs = cs0; 3122 
g = 1; 3123 useFallback = false; 3124 } 3125 } 3126 } 3127 } 3128 3129 if(len != 0) { 3130 len = 0; /* count output bytes; it must have been abs(len) == 2 */ 3131 3132 /* write the designation sequence if necessary */ 3133 if(cs != pFromU2022State->cs[g]) { 3134 if(cs < CNS_11643) { 3135 uprv_memcpy(buffer, escSeqCharsCN[cs], 4); 3136 } else { 3137 U_ASSERT(cs >= CNS_11643_1); 3138 uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4); 3139 } 3140 len = 4; 3141 pFromU2022State->cs[g] = cs; 3142 if(g == 1) { 3143 /* changing the SO/G1 charset invalidates the choices[] */ 3144 choiceCount = 0; 3145 } 3146 } 3147 3148 /* write the shift sequence if necessary */ 3149 if(g != pFromU2022State->g) { 3150 switch(g) { 3151 case 1: 3152 buffer[len++] = UCNV_SO; 3153 3154 /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */ 3155 pFromU2022State->g = 1; 3156 break; 3157 case 2: 3158 buffer[len++] = 0x1b; 3159 buffer[len++] = 0x4e; 3160 break; 3161 default: /* case 3 */ 3162 buffer[len++] = 0x1b; 3163 buffer[len++] = 0x4f; 3164 break; 3165 } 3166 } 3167 3168 /* write the two output bytes */ 3169 buffer[len++] = static_cast<char>(targetValue >> 8); 3170 buffer[len++] = static_cast<char>(targetValue); 3171 } else { 3172 /* if we cannot find the character after checking all codepages 3173 * then this is an error 3174 */ 3175 *err = U_INVALID_CHAR_FOUND; 3176 cnv->fromUChar32=sourceChar; 3177 break; 3178 } 3179 } 3180 3181 /* output len>0 bytes in buffer[] */ 3182 if(len == 1) { 3183 *target++ = buffer[0]; 3184 if(offsets) { 3185 *offsets++ = static_cast<int32_t>(source - args->source - 1); /* -1: known to be ASCII */ 3186 } 3187 } else if(len == 2 && (target + 2) <= targetLimit) { 3188 *target++ = buffer[0]; 3189 *target++ = buffer[1]; 3190 if(offsets) { 3191 int32_t sourceIndex = static_cast<int32_t>(source - args->source - U16_LENGTH(sourceChar)); 3192 *offsets++ = sourceIndex; 3193 *offsets++ = sourceIndex; 3194 } 3195 } else { 3196 
fromUWriteUInt8( 3197 cnv, 3198 buffer, len, 3199 &target, reinterpret_cast<const char*>(targetLimit), 3200 &offsets, static_cast<int32_t>(source - args->source - U16_LENGTH(sourceChar)), 3201 err); 3202 if(U_FAILURE(*err)) { 3203 break; 3204 } 3205 } 3206 } /* end if(myTargetIndex<myTargetLength) */ 3207 else{ 3208 *err =U_BUFFER_OVERFLOW_ERROR; 3209 break; 3210 } 3211 3212 }/* end while(mySourceIndex<mySourceLength) */ 3213 3214 /* 3215 * the end of the input stream and detection of truncated input 3216 * are handled by the framework, but for ISO-2022-CN conversion 3217 * we need to be in ASCII mode at the very end 3218 * 3219 * conditions: 3220 * successful 3221 * not in ASCII mode 3222 * end of input and no truncated input 3223 */ 3224 if( U_SUCCESS(*err) && 3225 pFromU2022State->g!=0 && 3226 args->flush && source>=sourceLimit && cnv->fromUChar32==0 3227 ) { 3228 int32_t sourceIndex; 3229 3230 /* we are switching to ASCII */ 3231 pFromU2022State->g=0; 3232 3233 /* get the source index of the last input character */ 3234 /* 3235 * TODO this would be simpler and more reliable if we used a pair 3236 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c 3237 * so that we could simply use the prevSourceIndex here; 3238 * this code gives an incorrect result for the rare case of an unmatched 3239 * trail surrogate that is alone in the last buffer of the text stream 3240 */ 3241 sourceIndex = static_cast<int32_t>(source - args->source); 3242 if(sourceIndex>0) { 3243 --sourceIndex; 3244 if( U16_IS_TRAIL(args->source[sourceIndex]) && 3245 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) 3246 ) { 3247 --sourceIndex; 3248 } 3249 } else { 3250 sourceIndex=-1; 3251 } 3252 3253 fromUWriteUInt8( 3254 cnv, 3255 SHIFT_IN_STR, 1, 3256 &target, reinterpret_cast<const char*>(targetLimit), 3257 &offsets, sourceIndex, 3258 err); 3259 } 3260 3261 /*save the state and return */ 3262 args->source = source; 3263 args->target = reinterpret_cast<char*>(target); 3264 } 3265 3266 
3267 static void U_CALLCONV 3268 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, 3269 UErrorCode* err){ 3270 char tempBuf[3]; 3271 const char* mySource = const_cast<char*>(args->source); 3272 char16_t *myTarget = args->target; 3273 const char *mySourceLimit = args->sourceLimit; 3274 uint32_t targetUniChar = 0x0000; 3275 uint32_t mySourceChar = 0x0000; 3276 UConverterDataISO2022* myData; 3277 ISO2022State *pToU2022State; 3278 3279 myData = static_cast<UConverterDataISO2022*>(args->converter->extraInfo); 3280 pToU2022State = &myData->toU2022State; 3281 3282 if(myData->key != 0) { 3283 /* continue with a partial escape sequence */ 3284 goto escape; 3285 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { 3286 /* continue with a partial double-byte character */ 3287 mySourceChar = args->converter->toUBytes[0]; 3288 args->converter->toULength = 0; 3289 targetUniChar = missingCharMarker; 3290 goto getTrailByte; 3291 } 3292 3293 while(mySource < mySourceLimit){ 3294 3295 targetUniChar =missingCharMarker; 3296 3297 if(myTarget < args->targetLimit){ 3298 3299 mySourceChar = static_cast<unsigned char>(*mySource++); 3300 3301 switch(mySourceChar){ 3302 case UCNV_SI: 3303 pToU2022State->g=0; 3304 if (myData->isEmptySegment) { 3305 myData->isEmptySegment = false; /* we are handling it, reset to avoid future spurious errors */ 3306 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 3307 args->converter->toUCallbackReason = UCNV_IRREGULAR; 3308 args->converter->toUBytes[0] = static_cast<uint8_t>(mySourceChar); 3309 args->converter->toULength = 1; 3310 args->target = myTarget; 3311 args->source = mySource; 3312 return; 3313 } 3314 continue; 3315 3316 case UCNV_SO: 3317 if(pToU2022State->cs[1] != 0) { 3318 pToU2022State->g=1; 3319 myData->isEmptySegment = true; /* Begin a new segment, empty so far */ 3320 continue; 3321 } else { 3322 /* illegal to have SO before a matching designator */ 3323 myData->isEmptySegment = 
false; /* Handling a different error, reset this to avoid future spurious errs */ 3324 break; 3325 } 3326 3327 case ESC_2022: 3328 mySource--; 3329 escape: 3330 { 3331 const char * mySourceBefore = mySource; 3332 int8_t toULengthBefore = args->converter->toULength; 3333 3334 changeState_2022(args->converter,&(mySource), 3335 mySourceLimit, ISO_2022_CN,err); 3336 3337 /* After SO there must be at least one character before a designator (designator error handled separately) */ 3338 if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) { 3339 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 3340 args->converter->toUCallbackReason = UCNV_IRREGULAR; 3341 args->converter->toULength = static_cast<int8_t>(toULengthBefore + (mySource - mySourceBefore)); 3342 } 3343 } 3344 3345 /* invalid or illegal escape sequence */ 3346 if(U_FAILURE(*err)){ 3347 args->target = myTarget; 3348 args->source = mySource; 3349 myData->isEmptySegment = false; /* Reset to avoid future spurious errors */ 3350 return; 3351 } 3352 continue; 3353 3354 /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */ 3355 3356 case CR: 3357 case LF: 3358 uprv_memset(pToU2022State, 0, sizeof(ISO2022State)); 3359 U_FALLTHROUGH; 3360 default: 3361 /* convert one or two bytes */ 3362 myData->isEmptySegment = false; 3363 if(pToU2022State->g != 0) { 3364 if(mySource < mySourceLimit) { 3365 UConverterSharedData *cnv; 3366 StateEnum tempState; 3367 int32_t tempBufLen; 3368 int leadIsOk, trailIsOk; 3369 uint8_t trailByte; 3370 getTrailByte: 3371 trailByte = static_cast<uint8_t>(*mySource); 3372 /* 3373 * Ticket 5691: consistent illegal sequences: 3374 * - We include at least the first byte in the illegal sequence. 3375 * - If any of the non-initial bytes could be the start of a character, 3376 * we stop the illegal sequence before the first one of those. 3377 * 3378 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is 3379 * an ESC/SO/SI, we report only the first byte as the illegal sequence. 
3380 * Otherwise we convert or report the pair of bytes. 3381 */ 3382 leadIsOk = static_cast<uint8_t>(mySourceChar - 0x21) <= (0x7e - 0x21); 3383 trailIsOk = static_cast<uint8_t>(trailByte - 0x21) <= (0x7e - 0x21); 3384 if (leadIsOk && trailIsOk) { 3385 ++mySource; 3386 tempState = static_cast<StateEnum>(pToU2022State->cs[pToU2022State->g]); 3387 if(tempState >= CNS_11643_0) { 3388 cnv = myData->myConverterArray[CNS_11643]; 3389 tempBuf[0] = static_cast<char>(0x80 + (tempState - CNS_11643_0)); 3390 tempBuf[1] = static_cast<char>(mySourceChar); 3391 tempBuf[2] = static_cast<char>(trailByte); 3392 tempBufLen = 3; 3393 3394 }else{ 3395 U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS); 3396 cnv = myData->myConverterArray[tempState]; 3397 tempBuf[0] = static_cast<char>(mySourceChar); 3398 tempBuf[1] = static_cast<char>(trailByte); 3399 tempBufLen = 2; 3400 } 3401 targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, false); 3402 mySourceChar = (mySourceChar << 8) | trailByte; 3403 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { 3404 /* report a pair of illegal bytes if the second byte is not a DBCS starter */ 3405 ++mySource; 3406 /* add another bit so that the code below writes 2 bytes in case of error */ 3407 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; 3408 } 3409 if(pToU2022State->g>=2) { 3410 /* return from a single-shift state to the previous one */ 3411 pToU2022State->g=pToU2022State->prevG; 3412 } 3413 } else { 3414 args->converter->toUBytes[0] = static_cast<uint8_t>(mySourceChar); 3415 args->converter->toULength = 1; 3416 goto endloop; 3417 } 3418 } 3419 else{ 3420 if(mySourceChar <= 0x7f) { 3421 targetUniChar = static_cast<char16_t>(mySourceChar); 3422 } 3423 } 3424 break; 3425 } 3426 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){ 3427 if(args->offsets){ 3428 args->offsets[myTarget - args->target] = static_cast<int32_t>(mySource - args->source - (mySourceChar <= 0xff ? 
1 : 2)); 3429 } 3430 *(myTarget++) = static_cast<char16_t>(targetUniChar); 3431 } 3432 else if(targetUniChar > missingCharMarker){ 3433 /* disassemble the surrogate pair and write to output*/ 3434 targetUniChar-=0x0010000; 3435 *myTarget = static_cast<char16_t>(0xd800 + static_cast<char16_t>(targetUniChar >> 10)); 3436 if(args->offsets){ 3437 args->offsets[myTarget - args->target] = static_cast<int32_t>(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); 3438 } 3439 ++myTarget; 3440 if(myTarget< args->targetLimit){ 3441 *myTarget = static_cast<char16_t>(0xdc00 + static_cast<char16_t>(targetUniChar & 0x3ff)); 3442 if(args->offsets){ 3443 args->offsets[myTarget - args->target] = static_cast<int32_t>(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); 3444 } 3445 ++myTarget; 3446 }else{ 3447 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= 3448 static_cast<char16_t>(0xdc00 + static_cast<char16_t>(targetUniChar & 0x3ff)); 3449 } 3450 3451 } 3452 else{ 3453 /* Call the callback function*/ 3454 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); 3455 break; 3456 } 3457 } 3458 else{ 3459 *err =U_BUFFER_OVERFLOW_ERROR; 3460 break; 3461 } 3462 } 3463 endloop: 3464 args->target = myTarget; 3465 args->source = mySource; 3466 } 3467 #endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */ 3468 3469 static void U_CALLCONV 3470 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) { 3471 UConverter *cnv = args->converter; 3472 UConverterDataISO2022* myConverterData = static_cast<UConverterDataISO2022*>(cnv->extraInfo); 3473 ISO2022State *pFromU2022State=&myConverterData->fromU2022State; 3474 char *p, *subchar; 3475 char buffer[8]; 3476 int32_t length; 3477 3478 subchar = reinterpret_cast<char*>(cnv->subChars); 3479 length=cnv->subCharLen; /* assume length==1 for most variants */ 3480 3481 p = buffer; 3482 switch(myConverterData->locale[0]){ 3483 case 'j': 3484 { 3485 int8_t cs; 3486 3487 
if(pFromU2022State->g == 1) { 3488 /* JIS7: switch from G1 to G0 */ 3489 pFromU2022State->g = 0; 3490 *p++ = UCNV_SI; 3491 } 3492 3493 cs = pFromU2022State->cs[0]; 3494 if(cs != ASCII && cs != JISX201) { 3495 /* not in ASCII or JIS X 0201: switch to ASCII */ 3496 pFromU2022State->cs[0] = static_cast<int8_t>(ASCII); 3497 *p++ = '\x1b'; 3498 *p++ = '\x28'; 3499 *p++ = '\x42'; 3500 } 3501 3502 *p++ = subchar[0]; 3503 break; 3504 } 3505 case 'c': 3506 if(pFromU2022State->g != 0) { 3507 /* not in ASCII mode: switch to ASCII */ 3508 pFromU2022State->g = 0; 3509 *p++ = UCNV_SI; 3510 } 3511 *p++ = subchar[0]; 3512 break; 3513 case 'k': 3514 if(myConverterData->version == 0) { 3515 if(length == 1) { 3516 if(args->converter->fromUnicodeStatus) { 3517 /* in DBCS mode: switch to SBCS */ 3518 args->converter->fromUnicodeStatus = 0; 3519 *p++ = UCNV_SI; 3520 } 3521 *p++ = subchar[0]; 3522 } else /* length == 2*/ { 3523 if(!args->converter->fromUnicodeStatus) { 3524 /* in SBCS mode: switch to DBCS */ 3525 args->converter->fromUnicodeStatus = 1; 3526 *p++ = UCNV_SO; 3527 } 3528 *p++ = subchar[0]; 3529 *p++ = subchar[1]; 3530 } 3531 break; 3532 } else { 3533 /* save the subconverter's substitution string */ 3534 uint8_t *currentSubChars = myConverterData->currentConverter->subChars; 3535 int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen; 3536 3537 /* set our substitution string into the subconverter */ 3538 myConverterData->currentConverter->subChars = reinterpret_cast<uint8_t*>(subchar); 3539 myConverterData->currentConverter->subCharLen = static_cast<int8_t>(length); 3540 3541 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */ 3542 args->converter = myConverterData->currentConverter; 3543 myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32; 3544 ucnv_cbFromUWriteSub(args, 0, err); 3545 cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32; 3546 args->converter = cnv; 3547 3548 /* restore the subconverter's 
substitution string */ 3549 myConverterData->currentConverter->subChars = currentSubChars; 3550 myConverterData->currentConverter->subCharLen = currentSubCharLen; 3551 3552 if(*err == U_BUFFER_OVERFLOW_ERROR) { 3553 if(myConverterData->currentConverter->charErrorBufferLength > 0) { 3554 uprv_memcpy( 3555 cnv->charErrorBuffer, 3556 myConverterData->currentConverter->charErrorBuffer, 3557 myConverterData->currentConverter->charErrorBufferLength); 3558 } 3559 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength; 3560 myConverterData->currentConverter->charErrorBufferLength = 0; 3561 } 3562 return; 3563 } 3564 default: 3565 /* not expected */ 3566 break; 3567 } 3568 ucnv_cbFromUWriteBytes(args, 3569 buffer, static_cast<int32_t>(p - buffer), 3570 offsetIndex, err); 3571 } 3572 3573 /* 3574 * Structure for cloning an ISO 2022 converter into a single memory block. 3575 */ 3576 struct cloneStruct 3577 { 3578 UConverter cnv; 3579 UConverter currentConverter; 3580 UConverterDataISO2022 mydata; 3581 }; 3582 3583 3584 U_CDECL_BEGIN 3585 3586 static UConverter * U_CALLCONV 3587 _ISO_2022_SafeClone( 3588 const UConverter *cnv, 3589 void *stackBuffer, 3590 int32_t *pBufferSize, 3591 UErrorCode *status) 3592 { 3593 struct cloneStruct * localClone; 3594 UConverterDataISO2022 *cnvData; 3595 int32_t i, size; 3596 3597 if (U_FAILURE(*status)){ 3598 return nullptr; 3599 } 3600 3601 if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */ 3602 *pBufferSize = (int32_t)sizeof(struct cloneStruct); 3603 return nullptr; 3604 } 3605 3606 cnvData = (UConverterDataISO2022 *)cnv->extraInfo; 3607 localClone = (struct cloneStruct *)stackBuffer; 3608 3609 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */ 3610 3611 uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022)); 3612 localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */ 3613 localClone->cnv.isExtraLocal = true; 3614 
3615 /* share the subconverters */ 3616 3617 if(cnvData->currentConverter != nullptr) { 3618 size = (int32_t)sizeof(UConverter); 3619 localClone->mydata.currentConverter = 3620 ucnv_safeClone(cnvData->currentConverter, 3621 &localClone->currentConverter, 3622 &size, status); 3623 if(U_FAILURE(*status)) { 3624 return nullptr; 3625 } 3626 } 3627 3628 for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) { 3629 if(cnvData->myConverterArray[i] != nullptr) { 3630 ucnv_incrementRefCount(cnvData->myConverterArray[i]); 3631 } 3632 } 3633 3634 return &localClone->cnv; 3635 } 3636 3637 U_CDECL_END 3638 3639 static void U_CALLCONV 3640 _ISO_2022_GetUnicodeSet(const UConverter *cnv, 3641 const USetAdder *sa, 3642 UConverterUnicodeSet which, 3643 UErrorCode *pErrorCode) 3644 { 3645 int32_t i; 3646 UConverterDataISO2022* cnvData; 3647 3648 if (U_FAILURE(*pErrorCode)) { 3649 return; 3650 } 3651 #ifdef U_ENABLE_GENERIC_ISO_2022 3652 if (cnv->sharedData == &_ISO2022Data) { 3653 /* We use UTF-8 in this case */ 3654 sa->addRange(sa->set, 0, 0xd7FF); 3655 sa->addRange(sa->set, 0xE000, 0x10FFFF); 3656 return; 3657 } 3658 #endif 3659 3660 cnvData = static_cast<UConverterDataISO2022*>(cnv->extraInfo); 3661 3662 /* open a set and initialize it with code points that are algorithmically round-tripped */ 3663 switch(cnvData->locale[0]){ 3664 case 'j': 3665 /* include JIS X 0201 which is hardcoded */ 3666 sa->add(sa->set, 0xa5); 3667 sa->add(sa->set, 0x203e); 3668 if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) { 3669 /* include Latin-1 for some variants of JP */ 3670 sa->addRange(sa->set, 0, 0xff); 3671 } else { 3672 /* include ASCII for JP */ 3673 sa->addRange(sa->set, 0, 0x7f); 3674 } 3675 if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) { 3676 /* 3677 * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0 3678 * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8) 3679 * use half-width Katakana. 
3680 * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode) 3681 * half-width Katakana via the ESC ( I sequence. 3682 * However, we only emit (fromUnicode) half-width Katakana according to the 3683 * definition of each variant. 3684 * 3685 * When including fallbacks, 3686 * we need to include half-width Katakana Unicode code points for all JP variants because 3687 * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana). 3688 */ 3689 /* include half-width Katakana for JP */ 3690 sa->addRange(sa->set, HWKANA_START, HWKANA_END); 3691 } 3692 break; 3693 #if !UCONFIG_ONLY_HTML_CONVERSION 3694 case 'c': 3695 case 'z': 3696 /* include ASCII for CN */ 3697 sa->addRange(sa->set, 0, 0x7f); 3698 break; 3699 case 'k': 3700 /* there is only one converter for KR, and it is not in the myConverterArray[] */ 3701 cnvData->currentConverter->sharedData->impl->getUnicodeSet( 3702 cnvData->currentConverter, sa, which, pErrorCode); 3703 /* the loop over myConverterArray[] will simply not find another converter */ 3704 break; 3705 #endif 3706 default: 3707 break; 3708 } 3709 3710 #if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ 3711 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && 3712 cnvData->version==0 && i==CNS_11643 3713 ) { 3714 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */ 3715 ucnv_MBCSGetUnicodeSetForBytes( 3716 cnvData->myConverterArray[i], 3717 sa, UCNV_ROUNDTRIP_SET, 3718 0, 0x81, 0x82, 3719 pErrorCode); 3720 } 3721 #endif 3722 3723 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { 3724 UConverterSetFilter filter; 3725 if(cnvData->myConverterArray[i]!=nullptr) { 3726 if(cnvData->locale[0]=='j' && i==JISX208) { 3727 /* 3728 * Only add code points that map to Shift-JIS codes 3729 * corresponding to JIS X 0208. 
3730 */ 3731 filter=UCNV_SET_FILTER_SJIS; 3732 #if !UCONFIG_ONLY_HTML_CONVERSION 3733 } else if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && 3734 cnvData->version==0 && i==CNS_11643) { 3735 /* 3736 * Version-specific for CN: 3737 * CN version 0 does not map CNS planes 3..7 although 3738 * they are all available in the CNS conversion table; 3739 * CN version 1 (-EXT) does map them all. 3740 * The two versions create different Unicode sets. 3741 */ 3742 filter=UCNV_SET_FILTER_2022_CN; 3743 } else if(i==KSC5601) { 3744 /* 3745 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables) 3746 * are broader than GR94. 3747 */ 3748 filter=UCNV_SET_FILTER_GR94DBCS; 3749 #endif 3750 } else { 3751 filter=UCNV_SET_FILTER_NONE; 3752 } 3753 ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode); 3754 } 3755 } 3756 3757 /* 3758 * ISO 2022 converters must not convert SO/SI/ESC despite what 3759 * sub-converters do by themselves. 3760 * Remove these characters from the set. 
3761 */ 3762 sa->remove(sa->set, 0x0e); 3763 sa->remove(sa->set, 0x0f); 3764 sa->remove(sa->set, 0x1b); 3765 3766 /* ISO 2022 converters do not convert C1 controls either */ 3767 sa->removeRange(sa->set, 0x80, 0x9f); 3768 } 3769 3770 static const UConverterImpl _ISO2022Impl={ 3771 UCNV_ISO_2022, 3772 3773 nullptr, 3774 nullptr, 3775 3776 _ISO2022Open, 3777 _ISO2022Close, 3778 _ISO2022Reset, 3779 3780 #ifdef U_ENABLE_GENERIC_ISO_2022 3781 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, 3782 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, 3783 ucnv_fromUnicode_UTF8, 3784 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, 3785 #else 3786 nullptr, 3787 nullptr, 3788 nullptr, 3789 nullptr, 3790 #endif 3791 nullptr, 3792 3793 nullptr, 3794 _ISO2022getName, 3795 _ISO_2022_WriteSub, 3796 _ISO_2022_SafeClone, 3797 _ISO_2022_GetUnicodeSet, 3798 3799 nullptr, 3800 nullptr 3801 }; 3802 static const UConverterStaticData _ISO2022StaticData={ 3803 sizeof(UConverterStaticData), 3804 "ISO_2022", 3805 2022, 3806 UCNV_IBM, 3807 UCNV_ISO_2022, 3808 1, 3809 3, /* max 3 bytes per char16_t from UTF-8 (4 bytes from surrogate _pair_) */ 3810 { 0x1a, 0, 0, 0 }, 3811 1, 3812 false, 3813 false, 3814 0, 3815 0, 3816 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 3817 }; 3818 const UConverterSharedData _ISO2022Data= 3819 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022StaticData, &_ISO2022Impl); 3820 3821 /*************JP****************/ 3822 static const UConverterImpl _ISO2022JPImpl={ 3823 UCNV_ISO_2022, 3824 3825 nullptr, 3826 nullptr, 3827 3828 _ISO2022Open, 3829 _ISO2022Close, 3830 _ISO2022Reset, 3831 3832 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, 3833 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, 3834 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, 3835 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, 3836 nullptr, 3837 3838 nullptr, 3839 _ISO2022getName, 3840 _ISO_2022_WriteSub, 3841 _ISO_2022_SafeClone, 3842 _ISO_2022_GetUnicodeSet, 3843 3844 nullptr, 3845 nullptr 3846 }; 3847 
static const UConverterStaticData _ISO2022JPStaticData={ 3848 sizeof(UConverterStaticData), 3849 "ISO_2022_JP", 3850 0, 3851 UCNV_IBM, 3852 UCNV_ISO_2022, 3853 1, 3854 6, /* max 6 bytes per char16_t: 4-byte escape sequence + DBCS */ 3855 { 0x1a, 0, 0, 0 }, 3856 1, 3857 false, 3858 false, 3859 0, 3860 0, 3861 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 3862 }; 3863 3864 namespace { 3865 3866 const UConverterSharedData _ISO2022JPData= 3867 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022JPStaticData, &_ISO2022JPImpl); 3868 3869 } // namespace 3870 3871 #if !UCONFIG_ONLY_HTML_CONVERSION 3872 /************* KR ***************/ 3873 static const UConverterImpl _ISO2022KRImpl={ 3874 UCNV_ISO_2022, 3875 3876 nullptr, 3877 nullptr, 3878 3879 _ISO2022Open, 3880 _ISO2022Close, 3881 _ISO2022Reset, 3882 3883 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, 3884 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, 3885 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, 3886 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, 3887 nullptr, 3888 3889 nullptr, 3890 _ISO2022getName, 3891 _ISO_2022_WriteSub, 3892 _ISO_2022_SafeClone, 3893 _ISO_2022_GetUnicodeSet, 3894 3895 nullptr, 3896 nullptr 3897 }; 3898 static const UConverterStaticData _ISO2022KRStaticData={ 3899 sizeof(UConverterStaticData), 3900 "ISO_2022_KR", 3901 0, 3902 UCNV_IBM, 3903 UCNV_ISO_2022, 3904 1, 3905 8, /* max 8 bytes per char16_t */ 3906 { 0x1a, 0, 0, 0 }, 3907 1, 3908 false, 3909 false, 3910 0, 3911 0, 3912 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 3913 }; 3914 3915 namespace { 3916 3917 const UConverterSharedData _ISO2022KRData= 3918 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022KRStaticData, &_ISO2022KRImpl); 3919 3920 } // namespace 3921 3922 /*************** CN ***************/ 3923 static const UConverterImpl _ISO2022CNImpl={ 3924 3925 UCNV_ISO_2022, 3926 3927 nullptr, 3928 nullptr, 3929 3930 _ISO2022Open, 3931 _ISO2022Close, 3932 _ISO2022Reset, 3933 3934 
UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, 3935 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, 3936 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, 3937 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, 3938 nullptr, 3939 3940 nullptr, 3941 _ISO2022getName, 3942 _ISO_2022_WriteSub, 3943 _ISO_2022_SafeClone, 3944 _ISO_2022_GetUnicodeSet, 3945 3946 nullptr, 3947 nullptr 3948 }; 3949 static const UConverterStaticData _ISO2022CNStaticData={ 3950 sizeof(UConverterStaticData), 3951 "ISO_2022_CN", 3952 0, 3953 UCNV_IBM, 3954 UCNV_ISO_2022, 3955 1, 3956 8, /* max 8 bytes per char16_t: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */ 3957 { 0x1a, 0, 0, 0 }, 3958 1, 3959 false, 3960 false, 3961 0, 3962 0, 3963 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 3964 }; 3965 3966 namespace { 3967 3968 const UConverterSharedData _ISO2022CNData= 3969 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022CNStaticData, &_ISO2022CNImpl); 3970 3971 } // namespace 3972 #endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */ 3973 3974 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */