ucnv_err.cpp (18401B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ***************************************************************************** 5 * 6 * Copyright (C) 1998-2016, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ***************************************************************************** 10 * 11 * ucnv_err.c 12 * Implements error behaviour functions called by T_UConverter_{from,to}Unicode 13 * 14 * 15 * Change history: 16 * 17 * 06/29/2000 helena Major rewrite of the callback APIs. 18 */ 19 20 #include "unicode/utypes.h" 21 22 #if !UCONFIG_NO_CONVERSION 23 24 #include "unicode/ucnv_err.h" 25 #include "unicode/ucnv_cb.h" 26 #include "ucnv_cnv.h" 27 #include "cmemory.h" 28 #include "unicode/ucnv.h" 29 #include "ustrfmt.h" 30 31 #define VALUE_STRING_LENGTH 48 32 /*Magic # 32 = 4(number of char in value string) * 8(max number of bytes per char for any converter) */ 33 #define UNICODE_PERCENT_SIGN_CODEPOINT 0x0025 34 #define UNICODE_U_CODEPOINT 0x0055 35 #define UNICODE_X_CODEPOINT 0x0058 36 #define UNICODE_RS_CODEPOINT 0x005C 37 #define UNICODE_U_LOW_CODEPOINT 0x0075 38 #define UNICODE_X_LOW_CODEPOINT 0x0078 39 #define UNICODE_AMP_CODEPOINT 0x0026 40 #define UNICODE_HASH_CODEPOINT 0x0023 41 #define UNICODE_SEMICOLON_CODEPOINT 0x003B 42 #define UNICODE_PLUS_CODEPOINT 0x002B 43 #define UNICODE_LEFT_CURLY_CODEPOINT 0x007B 44 #define UNICODE_RIGHT_CURLY_CODEPOINT 0x007D 45 #define UNICODE_SPACE_CODEPOINT 0x0020 46 #define UCNV_PRV_ESCAPE_ICU 0 47 #define UCNV_PRV_ESCAPE_C 'C' 48 #define UCNV_PRV_ESCAPE_XML_DEC 'D' 49 #define UCNV_PRV_ESCAPE_XML_HEX 'X' 50 #define UCNV_PRV_ESCAPE_JAVA 'J' 51 #define UCNV_PRV_ESCAPE_UNICODE 'U' 52 #define UCNV_PRV_ESCAPE_CSS2 'S' 53 #define UCNV_PRV_STOP_ON_ILLEGAL 'i' 54 55 /* 56 * IS_DEFAULT_IGNORABLE_CODE_POINT 57 * This is to check if a code point has the default ignorable unicode property. 58 * As such, this list needs to be updated if the ignorable code point list ever 59 * changes. 60 * To avoid dependency on other code, this list is hard coded here. 61 * When an ignorable code point is found and is unmappable, the default callbacks 62 * will ignore them. 63 * For a list of the default ignorable code points, use this link: 64 * https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%3ADI%3A%5D&abb=on&g=&i= 65 * 66 * This list should be sync with the one in CharsetCallback.java 67 */ 68 #define IS_DEFAULT_IGNORABLE_CODE_POINT(c) ( \ 69 (c == 0x00AD) || \ 70 (c == 0x034F) || \ 71 (c == 0x061C) || \ 72 (c == 0x115F) || \ 73 (c == 0x1160) || \ 74 (0x17B4 <= c && c <= 0x17B5) || \ 75 (0x180B <= c && c <= 0x180F) || \ 76 (0x200B <= c && c <= 0x200F) || \ 77 (0x202A <= c && c <= 0x202E) || \ 78 (0x2060 <= c && c <= 0x206F) || \ 79 (c == 0x3164) || \ 80 (0xFE00 <= c && c <= 0xFE0F) || \ 81 (c == 0xFEFF) || \ 82 (c == 0xFFA0) || \ 83 (0xFFF0 <= c && c <= 0xFFF8) || \ 84 (0x1BCA0 <= c && c <= 0x1BCA3) || \ 85 (0x1D173 <= c && c <= 0x1D17A) || \ 86 (0xE0000 <= c && c <= 0xE0FFF)) 87 88 89 /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */ 90 U_CAPI void U_EXPORT2 91 UCNV_FROM_U_CALLBACK_STOP ( 92 const void *context, 93 UConverterFromUnicodeArgs *fromUArgs, 94 const char16_t* codeUnits, 95 int32_t length, 96 UChar32 codePoint, 97 UConverterCallbackReason reason, 98 UErrorCode * err) 99 { 100 (void)context; 101 (void)fromUArgs; 102 (void)codeUnits; 103 (void)length; 104 if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint)) 105 { 106 /* 107 * Skip if the codepoint has unicode property of default ignorable. 108 */ 109 *err = U_ZERO_ERROR; 110 } 111 /* the caller must have set the error code accordingly */ 112 } 113 114 115 /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */ 116 U_CAPI void U_EXPORT2 117 UCNV_TO_U_CALLBACK_STOP ( 118 const void *context, 119 UConverterToUnicodeArgs *toUArgs, 120 const char* codePoints, 121 int32_t length, 122 UConverterCallbackReason reason, 123 UErrorCode * err) 124 { 125 /* the caller must have set the error code accordingly */ 126 (void)context; (void)toUArgs; (void)codePoints; (void)length; (void)reason; (void)err; 127 } 128 129 U_CAPI void U_EXPORT2 130 UCNV_FROM_U_CALLBACK_SKIP ( 131 const void *context, 132 UConverterFromUnicodeArgs *fromUArgs, 133 const char16_t* codeUnits, 134 int32_t length, 135 UChar32 codePoint, 136 UConverterCallbackReason reason, 137 UErrorCode * err) 138 { 139 (void)fromUArgs; 140 (void)codeUnits; 141 (void)length; 142 if (reason <= UCNV_IRREGULAR) 143 { 144 if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint)) 145 { 146 /* 147 * Skip if the codepoint has unicode property of default ignorable. 148 */ 149 *err = U_ZERO_ERROR; 150 } 151 else if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED)) 152 { 153 *err = U_ZERO_ERROR; 154 } 155 /* else the caller must have set the error code accordingly. */ 156 } 157 /* else ignore the reset, close and clone calls. */ 158 } 159 160 U_CAPI void U_EXPORT2 161 UCNV_FROM_U_CALLBACK_SUBSTITUTE ( 162 const void *context, 163 UConverterFromUnicodeArgs *fromArgs, 164 const char16_t* codeUnits, 165 int32_t length, 166 UChar32 codePoint, 167 UConverterCallbackReason reason, 168 UErrorCode * err) 169 { 170 (void)codeUnits; 171 (void)length; 172 if (reason <= UCNV_IRREGULAR) 173 { 174 if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint)) 175 { 176 /* 177 * Skip if the codepoint has unicode property of default ignorable. 178 */ 179 *err = U_ZERO_ERROR; 180 } 181 else if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED)) 182 { 183 *err = U_ZERO_ERROR; 184 ucnv_cbFromUWriteSub(fromArgs, 0, err); 185 } 186 /* else the caller must have set the error code accordingly. */ 187 } 188 /* else ignore the reset, close and clone calls. */ 189 } 190 191 /*uses uprv_itou to get a unicode escape sequence of the offensive sequence, 192 *uses a clean copy (resetted) of the converter, to convert that unicode 193 *escape sequence to the target codepage (if conversion failure happens then 194 *we revert to substituting with subchar) 195 */ 196 U_CAPI void U_EXPORT2 197 UCNV_FROM_U_CALLBACK_ESCAPE ( 198 const void *context, 199 UConverterFromUnicodeArgs *fromArgs, 200 const char16_t *codeUnits, 201 int32_t length, 202 UChar32 codePoint, 203 UConverterCallbackReason reason, 204 UErrorCode * err) 205 { 206 207 char16_t valueString[VALUE_STRING_LENGTH]; 208 int32_t valueStringLength = 0; 209 int32_t i = 0; 210 211 const char16_t *myValueSource = nullptr; 212 UErrorCode err2 = U_ZERO_ERROR; 213 UConverterFromUCallback original = nullptr; 214 const void *originalContext; 215 216 UConverterFromUCallback ignoredCallback = nullptr; 217 const void *ignoredContext; 218 219 if (reason > UCNV_IRREGULAR) 220 { 221 return; 222 } 223 else if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint)) 224 { 225 /* 226 * Skip if the codepoint has unicode property of default ignorable. 227 */ 228 *err = U_ZERO_ERROR; 229 return; 230 } 231 232 ucnv_setFromUCallBack (fromArgs->converter, 233 (UConverterFromUCallback) UCNV_FROM_U_CALLBACK_SUBSTITUTE, 234 nullptr, 235 &original, 236 &originalContext, 237 &err2); 238 239 if (U_FAILURE (err2)) 240 { 241 *err = err2; 242 return; 243 } 244 if(context==nullptr) 245 { 246 while (i < length) 247 { 248 valueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */ 249 valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT; /* adding U */ 250 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4); 251 } 252 } 253 else 254 { 255 switch(*((char*)context)) 256 { 257 case UCNV_PRV_ESCAPE_JAVA: 258 while (i < length) 259 { 260 valueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT; /* adding \ */ 261 valueString[valueStringLength++] = (char16_t) UNICODE_U_LOW_CODEPOINT; /* adding u */ 262 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4); 263 } 264 break; 265 266 case UCNV_PRV_ESCAPE_C: 267 valueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT; /* adding \ */ 268 269 if(length==2){ 270 valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT; /* adding U */ 271 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 8); 272 273 } 274 else{ 275 valueString[valueStringLength++] = (char16_t) UNICODE_U_LOW_CODEPOINT; /* adding u */ 276 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4); 277 } 278 break; 279 280 case UCNV_PRV_ESCAPE_XML_DEC: 281 282 valueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT; /* adding & */ 283 valueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT; /* adding # */ 284 if(length==2){ 285 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 10, 0); 286 } 287 else{ 288 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 10, 0); 289 } 290 valueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */ 291 break; 292 293 case UCNV_PRV_ESCAPE_XML_HEX: 294 295 valueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT; /* adding & */ 296 valueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT; /* adding # */ 297 valueString[valueStringLength++] = (char16_t) UNICODE_X_LOW_CODEPOINT; /* adding x */ 298 if(length==2){ 299 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0); 300 } 301 else{ 302 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 0); 303 } 304 valueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */ 305 break; 306 307 case UCNV_PRV_ESCAPE_UNICODE: 308 valueString[valueStringLength++] = (char16_t) UNICODE_LEFT_CURLY_CODEPOINT; /* adding { */ 309 valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT; /* adding U */ 310 valueString[valueStringLength++] = (char16_t) UNICODE_PLUS_CODEPOINT; /* adding + */ 311 if (length == 2) { 312 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 4); 313 } else { 314 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4); 315 } 316 valueString[valueStringLength++] = (char16_t) UNICODE_RIGHT_CURLY_CODEPOINT; /* adding } */ 317 break; 318 319 case UCNV_PRV_ESCAPE_CSS2: 320 valueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT; /* adding \ */ 321 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0); 322 /* Always add space character, because the next character might be whitespace, 323 which would erroneously be considered the termination of the escape sequence. */ 324 valueString[valueStringLength++] = (char16_t) UNICODE_SPACE_CODEPOINT; 325 break; 326 327 default: 328 while (i < length) 329 { 330 valueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */ 331 valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT; /* adding U */ 332 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4); 333 } 334 } 335 } 336 myValueSource = valueString; 337 338 /* reset the error */ 339 *err = U_ZERO_ERROR; 340 341 ucnv_cbFromUWriteUChars(fromArgs, &myValueSource, myValueSource+valueStringLength, 0, err); 342 343 ucnv_setFromUCallBack (fromArgs->converter, 344 original, 345 originalContext, 346 &ignoredCallback, 347 &ignoredContext, 348 &err2); 349 if (U_FAILURE (err2)) 350 { 351 *err = err2; 352 return; 353 } 354 } 355 356 357 358 U_CAPI void U_EXPORT2 359 UCNV_TO_U_CALLBACK_SKIP ( 360 const void *context, 361 UConverterToUnicodeArgs *toArgs, 362 const char* codeUnits, 363 int32_t length, 364 UConverterCallbackReason reason, 365 UErrorCode * err) 366 { 367 (void)toArgs; 368 (void)codeUnits; 369 (void)length; 370 if (reason <= UCNV_IRREGULAR) 371 { 372 if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED)) 373 { 374 *err = U_ZERO_ERROR; 375 } 376 /* else the caller must have set the error code accordingly. */ 377 } 378 /* else ignore the reset, close and clone calls. */ 379 } 380 381 U_CAPI void U_EXPORT2 382 UCNV_TO_U_CALLBACK_SUBSTITUTE ( 383 const void *context, 384 UConverterToUnicodeArgs *toArgs, 385 const char* codeUnits, 386 int32_t length, 387 UConverterCallbackReason reason, 388 UErrorCode * err) 389 { 390 (void)codeUnits; 391 (void)length; 392 if (reason <= UCNV_IRREGULAR) 393 { 394 if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED)) 395 { 396 *err = U_ZERO_ERROR; 397 ucnv_cbToUWriteSub(toArgs,0,err); 398 } 399 /* else the caller must have set the error code accordingly. */ 400 } 401 /* else ignore the reset, close and clone calls. */ 402 } 403 404 /*uses uprv_itou to get a unicode escape sequence of the offensive sequence, 405 *and uses that as the substitution sequence 406 */ 407 U_CAPI void U_EXPORT2 408 UCNV_TO_U_CALLBACK_ESCAPE ( 409 const void *context, 410 UConverterToUnicodeArgs *toArgs, 411 const char* codeUnits, 412 int32_t length, 413 UConverterCallbackReason reason, 414 UErrorCode * err) 415 { 416 char16_t uniValueString[VALUE_STRING_LENGTH]; 417 int32_t valueStringLength = 0; 418 int32_t i = 0; 419 420 if (reason > UCNV_IRREGULAR) 421 { 422 return; 423 } 424 425 if(context==nullptr) 426 { 427 while (i < length) 428 { 429 uniValueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */ 430 uniValueString[valueStringLength++] = (char16_t) UNICODE_X_CODEPOINT; /* adding X */ 431 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2); 432 } 433 } 434 else 435 { 436 switch(*((char*)context)) 437 { 438 case UCNV_PRV_ESCAPE_XML_DEC: 439 while (i < length) 440 { 441 uniValueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT; /* adding & */ 442 uniValueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT; /* adding # */ 443 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 10, 0); 444 uniValueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */ 445 } 446 break; 447 448 case UCNV_PRV_ESCAPE_XML_HEX: 449 while (i < length) 450 { 451 uniValueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT; /* adding & */ 452 uniValueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT; /* adding # */ 453 uniValueString[valueStringLength++] = (char16_t) UNICODE_X_LOW_CODEPOINT; /* adding x */ 454 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 0); 455 uniValueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */ 456 } 457 break; 458 case UCNV_PRV_ESCAPE_C: 459 while (i < length) 460 { 461 uniValueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT; /* adding \ */ 462 uniValueString[valueStringLength++] = (char16_t) UNICODE_X_LOW_CODEPOINT; /* adding x */ 463 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 2); 464 } 465 break; 466 default: 467 while (i < length) 468 { 469 uniValueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */ 470 uniValueString[valueStringLength++] = (char16_t) UNICODE_X_CODEPOINT; /* adding X */ 471 uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2); 472 valueStringLength += 2; 473 } 474 } 475 } 476 /* reset the error */ 477 *err = U_ZERO_ERROR; 478 479 ucnv_cbToUWriteUChars(toArgs, uniValueString, valueStringLength, 0, err); 480 } 481 482 #endif