ucnvbocu.cpp (47492B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ****************************************************************************** 5 * 6 * Copyright (C) 2002-2016, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ****************************************************************************** 10 * file name: ucnvbocu.cpp 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2002mar27 16 * created by: Markus W. Scherer 17 * 18 * This is an implementation of the Binary Ordered Compression for Unicode, 19 * in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/ 20 */ 21 22 #include "unicode/utypes.h" 23 24 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION 25 26 #include "unicode/ucnv.h" 27 #include "unicode/ucnv_cb.h" 28 #include "unicode/utf16.h" 29 #include "putilimp.h" 30 #include "ucnv_bld.h" 31 #include "ucnv_cnv.h" 32 #include "uassert.h" 33 34 /* BOCU-1 constants and macros ---------------------------------------------- */ 35 36 /* 37 * BOCU-1 encodes the code points of a Unicode string as 38 * a sequence of byte-encoded differences (slope detection), 39 * preserving lexical order. 40 * 41 * Optimize the difference-taking for runs of Unicode text within 42 * small scripts: 43 * 44 * Most small scripts are allocated within aligned 128-blocks of Unicode 45 * code points. Lexical order is preserved if the "previous code point" state 46 * is always moved into the middle of such a block. 47 * 48 * Additionally, "prev" is moved from anywhere in the Unihan and Hangul 49 * areas into the middle of those areas. 50 * 51 * C0 control codes and space are encoded with their US-ASCII bytes. 52 * "prev" is reset for C0 controls but not for space. 
*/

/*
 * Reading aid: the resolved numeric value of each derived constant is given
 * in a comment next to it. They follow arithmetically from the definitions
 * in this section.
 */

/* initial value for "prev": middle of the ASCII range */
#define BOCU1_ASCII_PREV        0x40

/* bounding byte values for differences */
#define BOCU1_MIN               0x21
#define BOCU1_MIDDLE            0x90
#define BOCU1_MAX_LEAD          0xfe
#define BOCU1_MAX_TRAIL         0xff
#define BOCU1_RESET             0xff

/* number of lead bytes (0xfe-0x21+1 = 222) */
#define BOCU1_COUNT             (BOCU1_MAX_LEAD-BOCU1_MIN+1)

/* adjust trail byte counts for the use of some C0 control byte values */
#define BOCU1_TRAIL_CONTROLS_COUNT  20
/* 0x21-20 = 13: added to a trail value >=20 to get its external byte value */
#define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

/* number of trail bytes ((0xff-0x21+1)+20 = 243) */
#define BOCU1_TRAIL_COUNT       ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)

/*
 * number of positive and negative single-byte codes
 * (counting 0==BOCU1_MIDDLE among the positive ones)
 */
#define BOCU1_SINGLE            64

/* number of lead bytes for positive and negative 2/3/4-byte sequences */
#define BOCU1_LEAD_2            43
#define BOCU1_LEAD_3            3
#define BOCU1_LEAD_4            1

/* The difference value range for single-byters. (-64..63) */
#define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1)
#define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE)

/* The difference value range for double-byters. (-10513..10512) */
#define BOCU1_REACH_POS_2   (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_2   (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

/* The difference value range for 3-byters. (-187660..187659) */
#define BOCU1_REACH_POS_3   \
    (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

#define BOCU1_REACH_NEG_3   (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

/* The lead byte start values. (positive side: 0xd0, 0xfb, 0xfe) */
#define BOCU1_START_POS_2   (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
#define BOCU1_START_POS_3   (BOCU1_START_POS_2+BOCU1_LEAD_2)
#define BOCU1_START_POS_4   (BOCU1_START_POS_3+BOCU1_LEAD_3)
     /* ==BOCU1_MAX_LEAD */

/* (negative side: 0x50, 0x25, 0x22) */
#define BOCU1_START_NEG_2   (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
#define BOCU1_START_NEG_3   (BOCU1_START_NEG_2-BOCU1_LEAD_2)
#define BOCU1_START_NEG_4   (BOCU1_START_NEG_3-BOCU1_LEAD_3)
     /* ==BOCU1_MIN+1 */

/*
 * The length of a byte sequence, according to the lead byte (!=BOCU1_RESET).
 * The tests go from the innermost lead byte range (single bytes around
 * BOCU1_MIDDLE) outwards; whatever is outside all three ranges is a
 * four-byte lead. The argument is evaluated more than once.
 */
#define BOCU1_LENGTH_FROM_LEAD(lead) \
    ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
     (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
     (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)

/*
 * The length of a byte sequence, according to its packed form.
 * Packed 1..3-byte sequences carry their length in the top byte (01..03);
 * anything that is not below 0x04000000 as an unsigned value is treated
 * as a four-byte sequence.
 */
#define BOCU1_LENGTH_FROM_PACKED(packed) \
    ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)

/*
 * 12 commonly used C0 control codes (and space) are only used to encode
 * themselves directly,
 * which makes BOCU-1 MIME-usable and reasonably safe for
 * ASCII-oriented software.
 *
 * These controls are
 *  0   NUL
 *
 *  7   BEL
 *  8   BS
 *
 *  9   TAB
 *  a   LF
 *  b   VT
 *  c   FF
 *  d   CR
 *
 *  e   SO
 *  f   SI
 *
 * 1a   SUB
 * 1b   ESC
 *
 * The other 20 C0 controls are also encoded directly (to preserve order)
 * but are also used as trail bytes in difference encoding
 * (for better compression).
 */
/*
 * Map a trail value 0..BOCU1_TRAIL_COUNT-1 to its external byte value:
 * values below BOCU1_TRAIL_CONTROLS_COUNT go through the bocu1TrailToByte[]
 * table, all others are shifted by BOCU1_TRAIL_BYTE_OFFSET.
 * The argument is evaluated more than once.
 */
#define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])

/*
 * Byte value map for control codes,
 * from external byte values 0x00..0x20
 * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
 * External byte values that are illegal as trail bytes are mapped to -1.
*/
/*
 * The table has BOCU1_MIN (0x21) entries, one per byte value 0x00..0x20.
 * The 13 entries that are -1 are NUL, BEL..SI, SUB, ESC and space.
 */
static const int8_t
bocu1ByteToTrail[BOCU1_MIN]={
/*  0     1     2     3     4     5     6     7    */
    -1,   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,

/*  8     9     a     b     c     d     e     f    */
    -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,

/*  10    11    12    13    14    15    16    17   */
    0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,

/*  18    19    1a    1b    1c    1d    1e    1f   */
    0x0e, 0x0f, -1,   -1,   0x10, 0x11, 0x12, 0x13,

/*  20   */
    -1
};

/*
 * Byte value map for control codes,
 * from trail byte values 0..19 (0..0x13) as used in the difference calculation
 * to external byte values 0x00..0x20.
 *
 * This is the inverse of the non-negative entries of bocu1ByteToTrail[].
 */
static const int8_t
bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
/*  0     1     2     3     4     5     6     7    */
    0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,

/*  8     9     a     b     c     d     e     f    */
    0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,

/*  10    11    12    13   */
    0x1c, 0x1d, 0x1e, 0x1f
};

/**
 * Integer division and modulo with negative numerators
 * yields negative modulo results and quotients that are one more than
 * what we need here.
 * This macro adjusts the results so that the modulo-value m is always >=0.
 *
 * For positive n, the if() condition is always false.
 *
 * n and m are assigned to and must be modifiable lvalues;
 * every argument is evaluated more than once.
 *
 * @param n Number to be split into quotient and rest.
 *          Will be modified to contain the quotient.
 * @param d Divisor.
 * @param m Output variable for the rest (modulo result).
 */
#define NEGDIVMOD(n, d, m) UPRV_BLOCK_MACRO_BEGIN { \
    (m)=(n)%(d); \
    (n)/=(d); \
    if((m)<0) { \
        --(n); \
        (m)+=(d); \
    } \
} UPRV_BLOCK_MACRO_END

/* Faster versions of packDiff() for single-byte-encoded diff values. */

/** Is a diff value encodable in a single byte? */
#define DIFF_IS_SINGLE(diff) (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1)

/** Encode a diff value in a single byte. */
#define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff))

/** Is a diff value encodable in two bytes?
*/ 223 #define DIFF_IS_DOUBLE(diff) (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2) 224 225 /* BOCU-1 implementation functions ------------------------------------------ */ 226 227 #define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV) 228 229 /** 230 * Compute the next "previous" value for differencing 231 * from the current code point. 232 * 233 * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below) 234 * @return "previous code point" state value 235 */ 236 static inline int32_t 237 bocu1Prev(int32_t c) { 238 /* compute new prev */ 239 if(/* 0x3040<=c && */ c<=0x309f) { 240 /* Hiragana is not 128-aligned */ 241 return 0x3070; 242 } else if(0x4e00<=c && c<=0x9fa5) { 243 /* CJK Unihan */ 244 return 0x4e00-BOCU1_REACH_NEG_2; 245 } else if(0xac00<=c /* && c<=0xd7a3 */) { 246 /* Korean Hangul */ 247 return (0xd7a3+0xac00)/2; 248 } else { 249 /* mostly small scripts */ 250 return BOCU1_SIMPLE_PREV(c); 251 } 252 } 253 254 /** Fast version of bocu1Prev() for most scripts. */ 255 #define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c)) 256 257 /* 258 * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c. 259 * The UConverter fields are used as follows: 260 * 261 * fromUnicodeStatus encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV) 262 * 263 * toUnicodeStatus decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV) 264 * mode decoder's incomplete (diff<<2)|count (ignored when toULength==0) 265 */ 266 267 /* BOCU-1-from-Unicode conversion functions --------------------------------- */ 268 269 /** 270 * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes 271 * and return a packed integer with them. 272 * 273 * The encoding favors small absolute differences with short encodings 274 * to compress runs of same-script characters. 275 * 276 * Optimized version with unrolled loops and fewer floating-point operations 277 * than the standard packDiff(). 
*
 * @param diff difference value -0x10ffff..0x10ffff
 * @return
 *   0x010000zz for 1-byte sequence zz
 *   0x0200yyzz for 2-byte sequence yy zz
 *   0x03xxyyzz for 3-byte sequence xx yy zz
 *   0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
 *
 * The 1-byte form is never produced by this function: the U_ASSERT below
 * rejects single-byte differences and the code for them is compiled out
 * with "#if 0". The callers in this file test DIFF_IS_SINGLE() first and
 * use PACK_SINGLE_DIFF() instead.
 *
 * In the 4-byte form the lead byte itself (BOCU1_START_POS_4 or BOCU1_MIN)
 * sits in the top 8 bits where the shorter forms store their length.
 */
static int32_t
packDiff(int32_t diff) {
    int32_t result, m;

    U_ASSERT(!DIFF_IS_SINGLE(diff)); /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */
    if(diff>=BOCU1_REACH_NEG_1) {
        /* mostly positive differences, and single-byte negative ones */
#if 0   /* single-byte case handled in macros, see below */
        if(diff<=BOCU1_REACH_POS_1) {
            /* single byte */
            return 0x01000000|(BOCU1_MIDDLE+diff);
        } else
#endif
        if(diff<=BOCU1_REACH_POS_2) {
            /* two bytes */
            /* make diff relative to the start of the two-byte range */
            diff-=BOCU1_REACH_POS_1+1;
            result=0x02000000;

            /* the trail byte is the remainder, the lead byte gets the quotient */
            m=diff%BOCU1_TRAIL_COUNT;
            diff/=BOCU1_TRAIL_COUNT;
            result|=BOCU1_TRAIL_TO_BYTE(m);

            result|=(BOCU1_START_POS_2+diff)<<8;
        } else if(diff<=BOCU1_REACH_POS_3) {
            /* three bytes */
            diff-=BOCU1_REACH_POS_2+1;
            result=0x03000000;

            /* trail bytes are produced from the last one to the first one */
            m=diff%BOCU1_TRAIL_COUNT;
            diff/=BOCU1_TRAIL_COUNT;
            result|=BOCU1_TRAIL_TO_BYTE(m);

            m=diff%BOCU1_TRAIL_COUNT;
            diff/=BOCU1_TRAIL_COUNT;
            result|=BOCU1_TRAIL_TO_BYTE(m)<<8;

            result|=(BOCU1_START_POS_3+diff)<<16;
        } else {
            /* four bytes */
            diff-=BOCU1_REACH_POS_3+1;

            m=diff%BOCU1_TRAIL_COUNT;
            diff/=BOCU1_TRAIL_COUNT;
            result=BOCU1_TRAIL_TO_BYTE(m);

            m=diff%BOCU1_TRAIL_COUNT;
            diff/=BOCU1_TRAIL_COUNT;
            result|=BOCU1_TRAIL_TO_BYTE(m)<<8;

            /*
             * We know that / and % would deliver quotient 0 and rest=diff.
             * Avoid division and modulo for performance.
             */
            result|=BOCU1_TRAIL_TO_BYTE(diff)<<16;

            /* the cast keeps the shift of the lead byte into the top byte unsigned */
            result |= static_cast<uint32_t>(BOCU1_START_POS_4) << 24;
        }
    } else {
        /* two- to four-byte negative differences */
        /*
         * Same scheme as above, mirrored: NEGDIVMOD keeps the remainders
         * non-negative and leaves a quotient <=0 that is added to the
         * negative lead byte start value.
         */
        if(diff>=BOCU1_REACH_NEG_2) {
            /* two bytes */
            diff-=BOCU1_REACH_NEG_1;
            result=0x02000000;

            NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
            result|=BOCU1_TRAIL_TO_BYTE(m);

            result|=(BOCU1_START_NEG_2+diff)<<8;
        } else if(diff>=BOCU1_REACH_NEG_3) {
            /* three bytes */
            diff-=BOCU1_REACH_NEG_2;
            result=0x03000000;

            NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
            result|=BOCU1_TRAIL_TO_BYTE(m);

            NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
            result|=BOCU1_TRAIL_TO_BYTE(m)<<8;

            result|=(BOCU1_START_NEG_3+diff)<<16;
        } else {
            /* four bytes */
            diff-=BOCU1_REACH_NEG_3;

            NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
            result=BOCU1_TRAIL_TO_BYTE(m);

            NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
            result|=BOCU1_TRAIL_TO_BYTE(m)<<8;

            /*
             * We know that NEGDIVMOD would deliver
             * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
             * Avoid division and modulo for performance.
             */
            m=diff+BOCU1_TRAIL_COUNT;
            result|=BOCU1_TRAIL_TO_BYTE(m)<<16;

            /* the only negative four-byte lead byte is BOCU1_MIN */
            result|=BOCU1_MIN<<24;
        }
    }
    return result;
}


/**
 * Convert UTF-16 text from pArgs->source to BOCU-1 bytes in pArgs->target,
 * writing one source index to pArgs->offsets for every byte that goes
 * to the target.
 *
 * State read from and written back to pArgs->converter:
 * - fromUChar32: a lead surrogate that was the last unit of the previous
 *   input, or 0
 * - fromUnicodeStatus: the encoder's "prev" (0 means BOCU1_ASCII_PREV)
 *
 * A lead surrogate that is not followed by a trail surrogate is encoded
 * like any other code point.
 *
 * When a multi-byte sequence does not fit, the bytes that fit are written
 * to the target, the rest goes to charErrorBuffer (without offsets), and
 * *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR. *pErrorCode is also set to
 * U_BUFFER_OVERFLOW_ERROR when input remains and the target is already full.
 *
 * pArgs->offsets is written through unconditionally in this function.
 * NOTE(review): presumably the framework only selects this variant when
 * offsets is non-null - confirm against the caller.
 *
 * @param pArgs      conversion arguments; source, target and offsets are
 *                   advanced on return
 * @param pErrorCode receives U_BUFFER_OVERFLOW_ERROR as described above
 */
static void U_CALLCONV
_Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
                             UErrorCode *pErrorCode) {
    /*
     * All locals that are live at the labels below are declared here without
     * initializers because the function jumps to getTrail with goto.
     */
    UConverter *cnv;
    const char16_t *source, *sourceLimit;
    uint8_t *target;
    int32_t targetCapacity;
    int32_t *offsets;

    int32_t prev, c, diff;

    int32_t sourceIndex, nextSourceIndex;

    /* set up the local pointers */
    cnv=pArgs->converter;
    source=pArgs->source;
    sourceLimit=pArgs->sourceLimit;
    target = reinterpret_cast<uint8_t*>(pArgs->target);
    targetCapacity = static_cast<int32_t>(pArgs->targetLimit - pArgs->target);
    offsets=pArgs->offsets;

    /* get the converter state from UConverter */
    c=cnv->fromUChar32;
    prev = static_cast<int32_t>(cnv->fromUnicodeStatus);
    if(prev==0) {
        prev=BOCU1_ASCII_PREV;
    }

    /* sourceIndex=-1 if the current character began in the previous buffer */
    sourceIndex= c==0 ? 0 : -1;
    nextSourceIndex=0;

    /* conversion loop */
    /*
     * A pending lead surrogate resumes in the middle of the regular loop.
     * NOTE(review): when c!=0 but targetCapacity==0 this jump is skipped, c
     * stays positive and fromUChar32 is stored as 0 at the end, so the
     * pending lead surrogate appears to be dropped - confirm whether the
     * caller can enter with a full target.
     */
    if(c!=0 && targetCapacity>0) {
        goto getTrail;
    }

fastSingle:
    /* fast loop for single-byte differences */
    /* use only one loop counter variable, targetCapacity, not also source */
    /* (diff is only a temporary here: the number of remaining source units) */
    diff = static_cast<int32_t>(sourceLimit - source);
    if(targetCapacity>diff) {
        targetCapacity=diff;
    }
    while(targetCapacity>0 && (c=*source)<0x3000) {
        if(c<=0x20) {
            /* C0 control or space: written as is; only controls reset prev */
            if(c!=0x20) {
                prev=BOCU1_ASCII_PREV;
            }
            *target++ = static_cast<uint8_t>(c);
            *offsets++=nextSourceIndex++;
            ++source;
            --targetCapacity;
        } else {
            diff=c-prev;
            if(DIFF_IS_SINGLE(diff)) {
                prev=BOCU1_SIMPLE_PREV(c);
                *target++ = static_cast<uint8_t>(PACK_SINGLE_DIFF(diff));
                *offsets++=nextSourceIndex++;
                ++source;
                --targetCapacity;
            } else {
                /* needs more than one byte: source is not advanced, the regular loop takes over */
                break;
            }
        }
    }
    /* restore real values */
    targetCapacity = static_cast<int32_t>(reinterpret_cast<const uint8_t*>(pArgs->targetLimit) - target);
    sourceIndex=nextSourceIndex; /* wrong if offsets==nullptr but does not matter */

    /* regular loop for all cases */
    while(source<sourceLimit) {
        if(targetCapacity>0) {
            c=*source++;
            ++nextSourceIndex;

            if(c<=0x20) {
                /*
                 * ISO C0 control & space:
                 * Encode directly for MIME compatibility,
                 * and reset state except for space, to not disrupt compression.
                 */
                if(c!=0x20) {
                    prev=BOCU1_ASCII_PREV;
                }
                *target++ = static_cast<uint8_t>(c);
                *offsets++=sourceIndex;
                --targetCapacity;

                sourceIndex=nextSourceIndex;
                continue;
            }

            if(U16_IS_LEAD(c)) {
getTrail:
                if(source<sourceLimit) {
                    /* test the following code unit */
                    char16_t trail=*source;
                    if(U16_IS_TRAIL(trail)) {
                        ++source;
                        ++nextSourceIndex;
                        c=U16_GET_SUPPLEMENTARY(c, trail);
                    }
                    /* otherwise c stays the unpaired lead surrogate and is encoded as such */
                } else {
                    /* no more input */
                    c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
                    break;
                }
            }

            /*
             * all other Unicode code points c==U+0021..U+10ffff
             * are encoded with the difference c-prev
             *
             * a new prev is computed from c,
             * placed in the middle of a 0x80-block (for most small scripts) or
             * in the middle of the Unihan and Hangul blocks
             * to statistically minimize the following difference
             */
            diff=c-prev;
            prev=BOCU1_PREV(c);
            if(DIFF_IS_SINGLE(diff)) {
                *target++ = static_cast<uint8_t>(PACK_SINGLE_DIFF(diff));
                *offsets++=sourceIndex;
                --targetCapacity;
                sourceIndex=nextSourceIndex;
                if(c<0x3000) {
                    /* back in the range that the fast loop handles */
                    goto fastSingle;
                }
            } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
                /* optimize 2-byte case */
                /* (same arithmetic as the two-byte branches of packDiff(), without packing) */
                int32_t m;

                if(diff>=0) {
                    diff-=BOCU1_REACH_POS_1+1;
                    m=diff%BOCU1_TRAIL_COUNT;
                    diff/=BOCU1_TRAIL_COUNT;
                    diff+=BOCU1_START_POS_2;
                } else {
                    diff-=BOCU1_REACH_NEG_1;
                    NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
                    diff+=BOCU1_START_NEG_2;
                }
                /* diff now holds the lead byte, m the trail value */
                *target++ = static_cast<uint8_t>(diff);
                *target++ = static_cast<uint8_t>(BOCU1_TRAIL_TO_BYTE(m));
                *offsets++=sourceIndex;
                *offsets++=sourceIndex;
                targetCapacity-=2;
                sourceIndex=nextSourceIndex;
            } else {
                int32_t length; /* will be 2..4 */

                /* from here on diff holds the packed bytes, not the difference */
                diff=packDiff(diff);
                length=BOCU1_LENGTH_FROM_PACKED(diff);

                /* write the output character bytes from diff and length */
                /* from the first if in the loop we know that targetCapacity>0 */
                if(length<=targetCapacity) {
                    switch(length) {
                        /* each branch falls through to the next one */
                    case 4:
                        *target++ = static_cast<uint8_t>(diff >> 24);
                        *offsets++=sourceIndex;
                        U_FALLTHROUGH;
                    case 3:
                        *target++ = static_cast<uint8_t>(diff >> 16);
                        *offsets++=sourceIndex;
                        U_FALLTHROUGH;
                    case 2:
                        *target++ = static_cast<uint8_t>(diff >> 8);
                        *offsets++=sourceIndex;
                    /* case 1: handled above */
                        *target++ = static_cast<uint8_t>(diff);
                        *offsets++=sourceIndex;
                        U_FALLTHROUGH;
                    default:
                        /* will never occur */
                        break;
                    }
                    targetCapacity-=length;
                    sourceIndex=nextSourceIndex;
                } else {
                    uint8_t *charErrorBuffer;

                    /*
                     * We actually do this backwards here:
                     * In order to save an intermediate variable, we output
                     * first to the overflow buffer what does not fit into the
                     * regular target.
                     */
                    /* we know that 1<=targetCapacity<length<=4 */
                    length-=targetCapacity;
                    /* length is now the number of bytes for the overflow buffer, 1..3 */
                    charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
                    switch(length) {
                        /* each branch falls through to the next one */
                    case 3:
                        *charErrorBuffer++ = static_cast<uint8_t>(diff >> 16);
                        U_FALLTHROUGH;
                    case 2:
                        *charErrorBuffer++ = static_cast<uint8_t>(diff >> 8);
                        U_FALLTHROUGH;
                    case 1:
                        *charErrorBuffer = static_cast<uint8_t>(diff);
                        U_FALLTHROUGH;
                    default:
                        /* will never occur */
                        break;
                    }
                    cnv->charErrorBufferLength = static_cast<int8_t>(length);

                    /* now output what fits into the regular target */
                    diff>>=8*length; /* length was reduced by targetCapacity */
                    switch(targetCapacity) {
                        /* each branch falls through to the next one */
                    case 3:
                        *target++ = static_cast<uint8_t>(diff >> 16);
                        *offsets++=sourceIndex;
                        U_FALLTHROUGH;
                    case 2:
                        *target++ = static_cast<uint8_t>(diff >> 8);
                        *offsets++=sourceIndex;
                        U_FALLTHROUGH;
                    case 1:
                        *target++ = static_cast<uint8_t>(diff);
                        *offsets++=sourceIndex;
                        U_FALLTHROUGH;
                    default:
                        /* will never occur */
                        break;
                    }

                    /* target overflow */
                    targetCapacity=0;
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                    break;
                }
            }
        } else {
            /* target is full */
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }

    /* set the converter state back into UConverter */
    /* c<0 only after the "no more input" break above: keep that lead surrogate for the next call */
    cnv->fromUChar32= c<0 ? -c : 0;
    cnv->fromUnicodeStatus = static_cast<uint32_t>(prev);

    /* write back the updated pointers */
    pArgs->source=source;
    pArgs->target = reinterpret_cast<char*>(target);
    pArgs->offsets=offsets;
}

/*
 * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.
 * If a change is made in the original function, then either
 * change this function the same way or
 * re-copy the original function and remove the variables
 * offsets, sourceIndex, and nextSourceIndex.
 */
/**
 * Convert UTF-16 text from pArgs->source to BOCU-1 bytes in pArgs->target.
 *
 * Uses the same converter state (fromUChar32, fromUnicodeStatus) and the
 * same overflow handling (charErrorBuffer, U_BUFFER_OVERFLOW_ERROR) as
 * _Bocu1FromUnicodeWithOffsets(); pArgs->offsets is not touched.
 *
 * @param pArgs      conversion arguments; source and target are advanced
 *                   on return
 * @param pErrorCode receives U_BUFFER_OVERFLOW_ERROR when the output does
 *                   not fit into the target
 */
static void U_CALLCONV
_Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,
                  UErrorCode *pErrorCode) {
    UConverter *cnv;
    const char16_t *source, *sourceLimit;
    uint8_t *target;
    int32_t targetCapacity;

    int32_t prev, c, diff;

    /* set up the local pointers */
    cnv=pArgs->converter;
    source=pArgs->source;
    sourceLimit=pArgs->sourceLimit;
    target = reinterpret_cast<uint8_t*>(pArgs->target);
    targetCapacity = static_cast<int32_t>(pArgs->targetLimit - pArgs->target);

    /* get the converter state from UConverter */
    c=cnv->fromUChar32;
    prev = static_cast<int32_t>(cnv->fromUnicodeStatus);
    if(prev==0) {
        prev=BOCU1_ASCII_PREV;
    }

    /* conversion loop */
    /*
     * A pending lead surrogate resumes in the middle of the regular loop.
     * NOTE(review): as in the WithOffsets variant, a pending lead surrogate
     * appears to be dropped when this is entered with targetCapacity==0 -
     * confirm whether the caller can do that.
     */
    if(c!=0 && targetCapacity>0) {
        goto getTrail;
    }

fastSingle:
    /* fast loop for single-byte differences */
    /* use only one loop counter variable, targetCapacity, not also source */
    diff = static_cast<int32_t>(sourceLimit - source);
    if(targetCapacity>diff) {
        targetCapacity=diff;
    }
    while(targetCapacity>0 && (c=*source)<0x3000) {
        if(c<=0x20) {
            if(c!=0x20) {
                prev=BOCU1_ASCII_PREV;
            }
            *target++ = static_cast<uint8_t>(c);
        } else {
            diff=c-prev;
            if(DIFF_IS_SINGLE(diff)) {
                prev=BOCU1_SIMPLE_PREV(c);
                *target++ = static_cast<uint8_t>(PACK_SINGLE_DIFF(diff));
            } else {
                /* needs more than one byte: source is not advanced, the regular loop takes over */
                break;
            }
        }
        /* reached only after one byte was written for one source unit */
        ++source;
        --targetCapacity;
    }
    /* restore real values */
    targetCapacity = static_cast<int32_t>(reinterpret_cast<const uint8_t*>(pArgs->targetLimit) - target);

    /* regular loop for all cases */
    while(source<sourceLimit) {
        if(targetCapacity>0) {
            c=*source++;

            if(c<=0x20) {
                /*
                 * ISO C0 control & space:
                 * Encode directly for MIME compatibility,
                 * and reset state except for space, to not disrupt compression.
                 */
                if(c!=0x20) {
                    prev=BOCU1_ASCII_PREV;
                }
                *target++ = static_cast<uint8_t>(c);
                --targetCapacity;
                continue;
            }

            if(U16_IS_LEAD(c)) {
getTrail:
                if(source<sourceLimit) {
                    /* test the following code unit */
                    char16_t trail=*source;
                    if(U16_IS_TRAIL(trail)) {
                        ++source;
                        c=U16_GET_SUPPLEMENTARY(c, trail);
                    }
                } else {
                    /* no more input */
                    c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
                    break;
                }
            }

            /*
             * all other Unicode code points c==U+0021..U+10ffff
             * are encoded with the difference c-prev
             *
             * a new prev is computed from c,
             * placed in the middle of a 0x80-block (for most small scripts) or
             * in the middle of the Unihan and Hangul blocks
             * to statistically minimize the following difference
             */
            diff=c-prev;
            prev=BOCU1_PREV(c);
            if(DIFF_IS_SINGLE(diff)) {
                *target++ = static_cast<uint8_t>(PACK_SINGLE_DIFF(diff));
                --targetCapacity;
                if(c<0x3000) {
                    goto fastSingle;
                }
            } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
                /* optimize 2-byte case */
                int32_t m;

                if(diff>=0) {
                    diff-=BOCU1_REACH_POS_1+1;
                    m=diff%BOCU1_TRAIL_COUNT;
                    diff/=BOCU1_TRAIL_COUNT;
                    diff+=BOCU1_START_POS_2;
                } else {
                    diff-=BOCU1_REACH_NEG_1;
                    NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
                    diff+=BOCU1_START_NEG_2;
                }
                *target++ = static_cast<uint8_t>(diff);
                *target++ = static_cast<uint8_t>(BOCU1_TRAIL_TO_BYTE(m));
                targetCapacity-=2;
            } else {
                int32_t length; /* will be 2..4 */

                diff=packDiff(diff);
                length=BOCU1_LENGTH_FROM_PACKED(diff);

                /* write the output character bytes from diff and length */
                /* from the first if in the loop we know that targetCapacity>0 */
                if(length<=targetCapacity) {
                    /*
                     * length is 3 or 4 here: a two-byte sequence only gets to
                     * this else-branch when targetCapacity<2, and then it
                     * does not satisfy length<=targetCapacity.
                     */
                    switch(length) {
                        /* each branch falls through to the next one */
                    case 4:
                        *target++ = static_cast<uint8_t>(diff >> 24);
                        U_FALLTHROUGH;
                    case 3:
                        *target++ = static_cast<uint8_t>(diff >> 16);
                    /* case 2: handled above */
                        *target++ = static_cast<uint8_t>(diff >> 8);
                    /* case 1: handled above */
                        *target++ = static_cast<uint8_t>(diff);
                        U_FALLTHROUGH;
                    default:
                        /* will never occur */
                        break;
                    }
                    targetCapacity-=length;
                } else {
                    uint8_t *charErrorBuffer;

                    /*
                     * We actually do this backwards here:
                     * In order to save an intermediate variable, we output
                     * first to the overflow buffer what does not fit into the
                     * regular target.
                     */
                    /* we know that 1<=targetCapacity<length<=4 */
                    length-=targetCapacity;
                    charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
                    switch(length) {
                        /* each branch falls through to the next one */
                    case 3:
                        *charErrorBuffer++ = static_cast<uint8_t>(diff >> 16);
                        U_FALLTHROUGH;
                    case 2:
                        *charErrorBuffer++ = static_cast<uint8_t>(diff >> 8);
                        U_FALLTHROUGH;
                    case 1:
                        *charErrorBuffer = static_cast<uint8_t>(diff);
                        U_FALLTHROUGH;
                    default:
                        /* will never occur */
                        break;
                    }
                    cnv->charErrorBufferLength = static_cast<int8_t>(length);

                    /* now output what fits into the regular target */
                    diff>>=8*length; /* length was reduced by targetCapacity */
                    switch(targetCapacity) {
                        /* each branch falls through to the next one */
                    case 3:
                        *target++ = static_cast<uint8_t>(diff >> 16);
                        U_FALLTHROUGH;
                    case 2:
                        *target++ = static_cast<uint8_t>(diff >> 8);
                        U_FALLTHROUGH;
                    case 1:
                        *target++ = static_cast<uint8_t>(diff);
                        U_FALLTHROUGH;
                    default:
                        /* will never occur */
                        break;
                    }

                    /* target overflow */
                    targetCapacity=0;
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                    break;
                }
            }
        } else {
            /* target is full */
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }

    /* set the converter state back into UConverter */
    cnv->fromUChar32= c<0 ? -c : 0;
    cnv->fromUnicodeStatus = static_cast<uint32_t>(prev);

    /* write back the updated pointers */
    pArgs->source=source;
    pArgs->target = reinterpret_cast<char*>(target);
}

/* BOCU-1-to-Unicode conversion functions ----------------------------------- */

/**
 * Function for BOCU-1 decoder; handles multi-byte lead bytes.
877 * 878 * @param b lead byte; 879 * BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD 880 * @return (diff<<2)|count 881 */ 882 static inline int32_t 883 decodeBocu1LeadByte(int32_t b) { 884 int32_t diff, count; 885 886 if(b>=BOCU1_START_NEG_2) { 887 /* positive difference */ 888 if(b<BOCU1_START_POS_3) { 889 /* two bytes */ 890 diff = (b - BOCU1_START_POS_2) * BOCU1_TRAIL_COUNT + BOCU1_REACH_POS_1 + 1; 891 count=1; 892 } else if(b<BOCU1_START_POS_4) { 893 /* three bytes */ 894 diff = (b - BOCU1_START_POS_3) * BOCU1_TRAIL_COUNT * BOCU1_TRAIL_COUNT + BOCU1_REACH_POS_2 + 1; 895 count=2; 896 } else { 897 /* four bytes */ 898 diff=BOCU1_REACH_POS_3+1; 899 count=3; 900 } 901 } else { 902 /* negative difference */ 903 if(b>=BOCU1_START_NEG_3) { 904 /* two bytes */ 905 diff = (b - BOCU1_START_NEG_2) * BOCU1_TRAIL_COUNT + BOCU1_REACH_NEG_1; 906 count=1; 907 } else if(b>BOCU1_MIN) { 908 /* three bytes */ 909 diff = (b - BOCU1_START_NEG_3) * BOCU1_TRAIL_COUNT * BOCU1_TRAIL_COUNT + BOCU1_REACH_NEG_2; 910 count=2; 911 } else { 912 /* four bytes */ 913 diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3; 914 count=3; 915 } 916 } 917 918 /* return the state for decoding the trail byte(s) */ 919 return (static_cast<uint32_t>(diff) << 2) | count; 920 } 921 922 /** 923 * Function for BOCU-1 decoder; handles multi-byte trail bytes. 
*
 * The byte is mapped to its trail value and multiplied with the weight of
 * its position: 1, BOCU1_TRAIL_COUNT or BOCU1_TRAIL_COUNT squared.
 * The caller adds the result to diff and decrements count.
 *
 * @param count number of remaining trail bytes including this one
 * @param b trail byte; the callers in this file pass a byte read from the
 *          source, which is used as a table index when it is <=0x20
 * @return new delta for diff including b - <0 indicates an error
 *
 * @see decodeBocu1
 */
static inline int32_t
decodeBocu1TrailByte(int32_t count, int32_t b) {
    if(b<=0x20) {
        /* skip some C0 controls and make the trail byte range contiguous */
        b=bocu1ByteToTrail[b];
        /* b<0 for an illegal trail byte value will result in return<0 below */
#if BOCU1_MAX_TRAIL<0xff
    /* this branch is only compiled when BOCU1_MAX_TRAIL is defined lower than 0xff */
    } else if(b>BOCU1_MAX_TRAIL) {
        return -99;
#endif
    } else {
        b-=BOCU1_TRAIL_BYTE_OFFSET;
    }

    /* scale the trail value according to its position in the sequence */
    if(count==1) {
        return b;
    } else if(count==2) {
        return b*BOCU1_TRAIL_COUNT;
    } else /* count==3 */ {
        return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
    }
}

/**
 * Convert BOCU-1 bytes from pArgs->source to UTF-16 in pArgs->target,
 * writing one source index to pArgs->offsets for every code unit that goes
 * to the target.
 *
 * State read from and written back to pArgs->converter:
 * - toUnicodeStatus: the decoder's "prev" (0 means BOCU1_ASCII_PREV)
 * - mode: (diff<<2)|count of an incomplete multi-byte sequence
 * - toUBytes/toULength: the bytes of that sequence seen so far
 *
 * Errors reported through *pErrorCode:
 * - U_ILLEGAL_CHAR_FOUND for an illegal trail byte or a decoded value
 *   outside 0..0x10ffff; prev and mode are then reset
 * - U_BUFFER_OVERFLOW_ERROR when the target is full; a trail surrogate that
 *   does not fit is stored in UCharErrorBuffer
 *
 * pArgs->offsets is written through unconditionally in this function.
 * NOTE(review): presumably the framework only selects this variant when
 * offsets is non-null - confirm against the caller.
 *
 * @param pArgs      conversion arguments; source, target and offsets are
 *                   advanced on return
 * @param pErrorCode receives the error codes described above
 */
static void U_CALLCONV
_Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
                           UErrorCode *pErrorCode) {
    /*
     * All locals are declared here without initializers because the
     * function jumps to getTrail and endloop with goto.
     */
    UConverter *cnv;
    const uint8_t *source, *sourceLimit;
    char16_t *target;
    const char16_t *targetLimit;
    int32_t *offsets;

    int32_t prev, count, diff, c;

    int8_t byteIndex;
    uint8_t *bytes;

    int32_t sourceIndex, nextSourceIndex;

    /* set up the local pointers */
    cnv=pArgs->converter;
    source = reinterpret_cast<const uint8_t*>(pArgs->source);
    sourceLimit = reinterpret_cast<const uint8_t*>(pArgs->sourceLimit);
    target=pArgs->target;
    targetLimit=pArgs->targetLimit;
    offsets=pArgs->offsets;

    /* get the converter state from UConverter */
    prev = static_cast<int32_t>(cnv->toUnicodeStatus);
    if(prev==0) {
        prev=BOCU1_ASCII_PREV;
    }
    diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
    count=diff&3;
    diff>>=2;

    byteIndex=cnv->toULength;
    bytes=cnv->toUBytes;

    /* sourceIndex=-1 if the current character began in the previous buffer */
    sourceIndex=byteIndex==0 ? 0 : -1;
    nextSourceIndex=0;

    /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
    /*
     * An incomplete sequence from the previous call resumes at its next
     * trail byte.
     * NOTE(review): when a sequence is pending but target>=targetLimit this
     * jump is skipped, and the code at fastSingle overwrites diff and count,
     * so the pending state appears to be lost - confirm whether the caller
     * can enter with a full target.
     */
    if(count>0 && byteIndex>0 && target<targetLimit) {
        goto getTrail;
    }

fastSingle:
    /* fast loop for single-byte differences */
    /* use count as the only loop counter variable */
    /* (diff and count are only temporaries here: remaining source bytes and target units) */
    diff = static_cast<int32_t>(sourceLimit - source);
    count = static_cast<int32_t>(pArgs->targetLimit - target);
    if(count>diff) {
        count=diff;
    }
    while(count>0) {
        if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
            /* single-byte difference */
            c=prev+(c-BOCU1_MIDDLE);
            if(c<0x3000) {
                *target++ = static_cast<char16_t>(c);
                *offsets++=nextSourceIndex++;
                prev=BOCU1_SIMPLE_PREV(c);
            } else {
                /* source is not advanced: the regular loop decodes this byte again */
                break;
            }
        } else if(c<=0x20) {
            /* direct-encoded C0 control or space; only controls reset prev */
            if(c!=0x20) {
                prev=BOCU1_ASCII_PREV;
            }
            *target++ = static_cast<char16_t>(c);
            *offsets++=nextSourceIndex++;
        } else {
            /* lead byte of a multi-byte sequence, or BOCU1_RESET */
            break;
        }
        ++source;
        --count;
    }
    sourceIndex=nextSourceIndex; /* wrong if offsets==nullptr but does not matter */

    /* decode a sequence of single and lead bytes */
    while(source<sourceLimit) {
        if(target>=targetLimit) {
            /* target is full */
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            break;
        }

        ++nextSourceIndex;
        c=*source++;
        if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
            /* Write a code point directly from a single-byte difference. */
            c=prev+(c-BOCU1_MIDDLE);
            if(c<0x3000) {
                *target++ = static_cast<char16_t>(c);
                *offsets++=sourceIndex;
                prev=BOCU1_SIMPLE_PREV(c);
                sourceIndex=nextSourceIndex;
                goto fastSingle;
            }
            /* c>=0x3000: falls through to the common output code below */
        } else if(c<=0x20) {
            /*
             * Direct-encoded C0 control code or space.
             * Reset prev for C0 control codes but not for space.
             */
            if(c!=0x20) {
                prev=BOCU1_ASCII_PREV;
            }
            *target++ = static_cast<char16_t>(c);
            *offsets++=sourceIndex;
            sourceIndex=nextSourceIndex;
            continue;
        } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
            /* Optimize two-byte case. */
            /* (only taken when the trail byte is available in this buffer) */
            if(c>=BOCU1_MIDDLE) {
                diff = (c - BOCU1_START_POS_2) * BOCU1_TRAIL_COUNT + BOCU1_REACH_POS_1 + 1;
            } else {
                diff = (c - BOCU1_START_NEG_2) * BOCU1_TRAIL_COUNT + BOCU1_REACH_NEG_1;
            }

            /* trail byte */
            ++nextSourceIndex;
            c=decodeBocu1TrailByte(1, *source++);
            if (c < 0 || static_cast<uint32_t>(c = prev + diff + c) > 0x10ffff) {
                /* report both bytes of the sequence as the illegal input */
                bytes[0]=source[-2];
                bytes[1]=source[-1];
                byteIndex=2;
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                break;
            }
        } else if(c==BOCU1_RESET) {
            /* only reset the state, no code point */
            prev=BOCU1_ASCII_PREV;
            sourceIndex=nextSourceIndex;
            continue;
        } else {
            /*
             * For multi-byte difference lead bytes, set the decoder state
             * with the partial difference value from the lead byte and
             * with the number of trail bytes.
             */
            bytes[0] = static_cast<uint8_t>(c);
            byteIndex=1;

            diff=decodeBocu1LeadByte(c);
            count=diff&3;
            diff>>=2;
getTrail:
            for(;;) {
                if(source>=sourceLimit) {
                    /* input ends inside the sequence: diff, count and bytes are saved at endloop */
                    goto endloop;
                }
                ++nextSourceIndex;
                c=bytes[byteIndex++]=*source++;

                /* trail byte in any position */
                c=decodeBocu1TrailByte(count, c);
                if(c<0) {
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                    goto endloop;
                }

                diff+=c;
                if(--count==0) {
                    /* final trail byte, deliver a code point */
                    byteIndex=0;
                    c=prev+diff;
                    if (static_cast<uint32_t>(c) > 0x10ffff) {
                        /*
                         * NOTE(review): byteIndex was already set to 0 above,
                         * so toULength is 0 when this error is reported,
                         * unlike in the two-byte path - confirm this is
                         * intended.
                         */
                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                        goto endloop;
                    }
                    break;
                }
            }
        }

        /* calculate the next prev and output c */
        prev=BOCU1_PREV(c);
        if(c<=0xffff) {
            *target++ = static_cast<char16_t>(c);
            *offsets++=sourceIndex;
        } else {
            /* output surrogate pair */
            *target++=U16_LEAD(c);
            if(target<targetLimit) {
                *target++=U16_TRAIL(c);
                *offsets++=sourceIndex;
                *offsets++=sourceIndex;
            } else {
                /* target overflow */
                /* one offset for the lead surrogate; the trail surrogate goes to the overflow buffer */
                *offsets++=sourceIndex;
                cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
                cnv->UCharErrorBufferLength=1;
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                break;
            }
        }
        sourceIndex=nextSourceIndex;
    }
endloop:

    if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
        /* set the converter state in UConverter to deal with the next character */
        cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
        cnv->mode=0;
    } else {
        /* set the converter state back into UConverter */
        cnv->toUnicodeStatus = static_cast<uint32_t>(prev);
        cnv->mode = static_cast<int32_t>(static_cast<uint32_t>(diff) << 2) | count;
    }
    cnv->toULength=byteIndex;

    /* write back the updated pointers */
    pArgs->source = reinterpret_cast<const char*>(source);
    pArgs->target=target;
    pArgs->offsets=offsets;
}

/*
 * Identical
to _Bocu1ToUnicodeWithOffsets but without offset handling. 1172 * If a change is made in the original function, then either 1173 * change this function the same way or 1174 * re-copy the original function and remove the variables 1175 * offsets, sourceIndex, and nextSourceIndex. 1176 */ 1177 static void U_CALLCONV 1178 _Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs, 1179 UErrorCode *pErrorCode) { 1180 UConverter *cnv; 1181 const uint8_t *source, *sourceLimit; 1182 char16_t *target; 1183 const char16_t *targetLimit; 1184 1185 int32_t prev, count, diff, c; 1186 1187 int8_t byteIndex; 1188 uint8_t *bytes; 1189 1190 /* set up the local pointers */ 1191 cnv=pArgs->converter; 1192 source = reinterpret_cast<const uint8_t*>(pArgs->source); 1193 sourceLimit = reinterpret_cast<const uint8_t*>(pArgs->sourceLimit); 1194 target=pArgs->target; 1195 targetLimit=pArgs->targetLimit; 1196 1197 /* get the converter state from UConverter */ 1198 prev = static_cast<int32_t>(cnv->toUnicodeStatus); 1199 if(prev==0) { 1200 prev=BOCU1_ASCII_PREV; 1201 } 1202 diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */ 1203 count=diff&3; 1204 diff>>=2; 1205 1206 byteIndex=cnv->toULength; 1207 bytes=cnv->toUBytes; 1208 1209 /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */ 1210 if(count>0 && byteIndex>0 && target<targetLimit) { 1211 goto getTrail; 1212 } 1213 1214 fastSingle: 1215 /* fast loop for single-byte differences */ 1216 /* use count as the only loop counter variable */ 1217 diff = static_cast<int32_t>(sourceLimit - source); 1218 count = static_cast<int32_t>(pArgs->targetLimit - target); 1219 if(count>diff) { 1220 count=diff; 1221 } 1222 while(count>0) { 1223 if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) { 1224 c=prev+(c-BOCU1_MIDDLE); 1225 if(c<0x3000) { 1226 *target++ = static_cast<char16_t>(c); 1227 prev=BOCU1_SIMPLE_PREV(c); 1228 } else { 1229 break; 1230 } 1231 } else if(c<=0x20) { 1232 if(c!=0x20) { 1233 prev=BOCU1_ASCII_PREV; 
1234 } 1235 *target++ = static_cast<char16_t>(c); 1236 } else { 1237 break; 1238 } 1239 ++source; 1240 --count; 1241 } 1242 1243 /* decode a sequence of single and lead bytes */ 1244 while(source<sourceLimit) { 1245 if(target>=targetLimit) { 1246 /* target is full */ 1247 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1248 break; 1249 } 1250 1251 c=*source++; 1252 if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) { 1253 /* Write a code point directly from a single-byte difference. */ 1254 c=prev+(c-BOCU1_MIDDLE); 1255 if(c<0x3000) { 1256 *target++ = static_cast<char16_t>(c); 1257 prev=BOCU1_SIMPLE_PREV(c); 1258 goto fastSingle; 1259 } 1260 } else if(c<=0x20) { 1261 /* 1262 * Direct-encoded C0 control code or space. 1263 * Reset prev for C0 control codes but not for space. 1264 */ 1265 if(c!=0x20) { 1266 prev=BOCU1_ASCII_PREV; 1267 } 1268 *target++ = static_cast<char16_t>(c); 1269 continue; 1270 } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) { 1271 /* Optimize two-byte case. */ 1272 if(c>=BOCU1_MIDDLE) { 1273 diff = (c - BOCU1_START_POS_2) * BOCU1_TRAIL_COUNT + BOCU1_REACH_POS_1 + 1; 1274 } else { 1275 diff = (c - BOCU1_START_NEG_2) * BOCU1_TRAIL_COUNT + BOCU1_REACH_NEG_1; 1276 } 1277 1278 /* trail byte */ 1279 c=decodeBocu1TrailByte(1, *source++); 1280 if (c < 0 || static_cast<uint32_t>(c = prev + diff + c) > 0x10ffff) { 1281 bytes[0]=source[-2]; 1282 bytes[1]=source[-1]; 1283 byteIndex=2; 1284 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1285 break; 1286 } 1287 } else if(c==BOCU1_RESET) { 1288 /* only reset the state, no code point */ 1289 prev=BOCU1_ASCII_PREV; 1290 continue; 1291 } else { 1292 /* 1293 * For multi-byte difference lead bytes, set the decoder state 1294 * with the partial difference value from the lead byte and 1295 * with the number of trail bytes. 
1296 */ 1297 bytes[0] = static_cast<uint8_t>(c); 1298 byteIndex=1; 1299 1300 diff=decodeBocu1LeadByte(c); 1301 count=diff&3; 1302 diff>>=2; 1303 getTrail: 1304 for(;;) { 1305 if(source>=sourceLimit) { 1306 goto endloop; 1307 } 1308 c=bytes[byteIndex++]=*source++; 1309 1310 /* trail byte in any position */ 1311 c=decodeBocu1TrailByte(count, c); 1312 if(c<0) { 1313 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1314 goto endloop; 1315 } 1316 1317 diff+=c; 1318 if(--count==0) { 1319 /* final trail byte, deliver a code point */ 1320 byteIndex=0; 1321 c=prev+diff; 1322 if (static_cast<uint32_t>(c) > 0x10ffff) { 1323 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1324 goto endloop; 1325 } 1326 break; 1327 } 1328 } 1329 } 1330 1331 /* calculate the next prev and output c */ 1332 prev=BOCU1_PREV(c); 1333 if(c<=0xffff) { 1334 *target++ = static_cast<char16_t>(c); 1335 } else { 1336 /* output surrogate pair */ 1337 *target++=U16_LEAD(c); 1338 if(target<targetLimit) { 1339 *target++=U16_TRAIL(c); 1340 } else { 1341 /* target overflow */ 1342 cnv->UCharErrorBuffer[0]=U16_TRAIL(c); 1343 cnv->UCharErrorBufferLength=1; 1344 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1345 break; 1346 } 1347 } 1348 } 1349 endloop: 1350 1351 if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) { 1352 /* set the converter state in UConverter to deal with the next character */ 1353 cnv->toUnicodeStatus=BOCU1_ASCII_PREV; 1354 cnv->mode=0; 1355 } else { 1356 /* set the converter state back into UConverter */ 1357 cnv->toUnicodeStatus = static_cast<uint32_t>(prev); 1358 cnv->mode = (static_cast<uint32_t>(diff) << 2) | count; 1359 } 1360 cnv->toULength=byteIndex; 1361 1362 /* write back the updated pointers */ 1363 pArgs->source = reinterpret_cast<const char*>(source); 1364 pArgs->target=target; 1365 } 1366 1367 /* miscellaneous ------------------------------------------------------------ */ 1368 1369 static const UConverterImpl _Bocu1Impl={ 1370 UCNV_BOCU1, 1371 1372 nullptr, 1373 nullptr, 1374 1375 nullptr, 1376 nullptr, 1377 nullptr, 1378 1379 
_Bocu1ToUnicode, 1380 _Bocu1ToUnicodeWithOffsets, 1381 _Bocu1FromUnicode, 1382 _Bocu1FromUnicodeWithOffsets, 1383 nullptr, 1384 1385 nullptr, 1386 nullptr, 1387 nullptr, 1388 nullptr, 1389 ucnv_getCompleteUnicodeSet, 1390 1391 nullptr, 1392 nullptr 1393 }; 1394 1395 static const UConverterStaticData _Bocu1StaticData={ 1396 sizeof(UConverterStaticData), 1397 "BOCU-1", 1398 1214, /* CCSID for BOCU-1 */ 1399 UCNV_IBM, UCNV_BOCU1, 1400 1, 4, /* one char16_t generates at least 1 byte and at most 4 bytes */ 1401 { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */ 1402 false, false, 1403 0, 1404 0, 1405 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 1406 }; 1407 1408 const UConverterSharedData _Bocu1Data= 1409 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_Bocu1StaticData, &_Bocu1Impl); 1410 1411 #endif