ucnvscsu.cpp (76217B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ****************************************************************************** 5 * 6 * Copyright (C) 2000-2016, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ****************************************************************************** 10 * file name: ucnvscsu.c 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2000nov18 16 * created by: Markus W. Scherer 17 * 18 * This is an implementation of the Standard Compression Scheme for Unicode 19 * as defined in https://www.unicode.org/reports/tr6/ . 20 * Reserved commands and window settings are treated as illegal sequences and 21 * will result in callback calls. 22 */ 23 24 #include "unicode/utypes.h" 25 26 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION 27 28 #include "unicode/ucnv.h" 29 #include "unicode/ucnv_cb.h" 30 #include "unicode/utf16.h" 31 #include "ucnv_bld.h" 32 #include "ucnv_cnv.h" 33 #include "cmemory.h" 34 35 /* SCSU definitions --------------------------------------------------------- */ 36 37 /* SCSU command byte values */ 38 enum { 39 SQ0=0x01, /* Quote from window pair 0 */ 40 SQ7=0x08, /* Quote from window pair 7 */ 41 SDX=0x0B, /* Define a window as extended */ 42 Srs=0x0C, /* reserved */ 43 SQU=0x0E, /* Quote a single Unicode character */ 44 SCU=0x0F, /* Change to Unicode mode */ 45 SC0=0x10, /* Select window 0 */ 46 SC7=0x17, /* Select window 7 */ 47 SD0=0x18, /* Define and select window 0 */ 48 SD7=0x1F, /* Define and select window 7 */ 49 50 UC0=0xE0, /* Select window 0 */ 51 UC7=0xE7, /* Select window 7 */ 52 UD0=0xE8, /* Define and select window 0 */ 53 UD7=0xEF, /* Define and select window 7 */ 54 UQU=0xF0, /* Quote a single Unicode character */ 55 UDX=0xF1, /* Define a Window as extended */ 56 Urs=0xF2 /* reserved */ 57 }; 58 59 enum { 60 /* 61 * Unicode code points from 3400 
to E000 are not adressible by 62 * dynamic window, since in these areas no short run alphabets are 63 * found. Therefore add gapOffset to all values from gapThreshold. 64 */ 65 gapThreshold=0x68, 66 gapOffset=0xAC00, 67 68 /* values between reservedStart and fixedThreshold are reserved */ 69 reservedStart=0xA8, 70 71 /* use table of predefined fixed offsets for values from fixedThreshold */ 72 fixedThreshold=0xF9 73 }; 74 75 /* constant offsets for the 8 static windows */ 76 static const uint32_t staticOffsets[8]={ 77 0x0000, /* ASCII for quoted tags */ 78 0x0080, /* Latin - 1 Supplement (for access to punctuation) */ 79 0x0100, /* Latin Extended-A */ 80 0x0300, /* Combining Diacritical Marks */ 81 0x2000, /* General Punctuation */ 82 0x2080, /* Currency Symbols */ 83 0x2100, /* Letterlike Symbols and Number Forms */ 84 0x3000 /* CJK Symbols and punctuation */ 85 }; 86 87 /* initial offsets for the 8 dynamic (sliding) windows */ 88 static const uint32_t initialDynamicOffsets[8]={ 89 0x0080, /* Latin-1 */ 90 0x00C0, /* Latin Extended A */ 91 0x0400, /* Cyrillic */ 92 0x0600, /* Arabic */ 93 0x0900, /* Devanagari */ 94 0x3040, /* Hiragana */ 95 0x30A0, /* Katakana */ 96 0xFF00 /* Fullwidth ASCII */ 97 }; 98 99 /* Table of fixed predefined Offsets */ 100 static const uint32_t fixedOffsets[]={ 101 /* 0xF9 */ 0x00C0, /* Latin-1 Letters + half of Latin Extended A */ 102 /* 0xFA */ 0x0250, /* IPA extensions */ 103 /* 0xFB */ 0x0370, /* Greek */ 104 /* 0xFC */ 0x0530, /* Armenian */ 105 /* 0xFD */ 0x3040, /* Hiragana */ 106 /* 0xFE */ 0x30A0, /* Katakana */ 107 /* 0xFF */ 0xFF60 /* Halfwidth Katakana */ 108 }; 109 110 /* state values */ 111 enum { 112 readCommand, 113 quotePairOne, 114 quotePairTwo, 115 quoteOne, 116 definePairOne, 117 definePairTwo, 118 defineOne 119 }; 120 121 typedef struct SCSUData { 122 /* dynamic window offsets, initialize to default values from initialDynamicOffsets */ 123 uint32_t toUDynamicOffsets[8]; 124 uint32_t fromUDynamicOffsets[8]; 125 126 
/* state machine state - toUnicode */ 127 UBool toUIsSingleByteMode; 128 uint8_t toUState; 129 int8_t toUQuoteWindow, toUDynamicWindow; 130 uint8_t toUByteOne; 131 uint8_t toUPadding[3]; 132 133 /* state machine state - fromUnicode */ 134 UBool fromUIsSingleByteMode; 135 int8_t fromUDynamicWindow; 136 137 /* 138 * windowUse[] keeps track of the use of the dynamic windows: 139 * At nextWindowUseIndex there is the least recently used window, 140 * and the following windows (in a wrapping manner) are more and more 141 * recently used. 142 * At nextWindowUseIndex-1 there is the most recently used window. 143 */ 144 uint8_t locale; 145 int8_t nextWindowUseIndex; 146 int8_t windowUse[8]; 147 } SCSUData; 148 149 static const int8_t initialWindowUse[8]={ 7, 0, 3, 2, 4, 5, 6, 1 }; 150 static const int8_t initialWindowUse_ja[8]={ 3, 2, 4, 1, 0, 7, 5, 6 }; 151 152 enum { 153 lGeneric, l_ja 154 }; 155 156 /* SCSU setup functions ----------------------------------------------------- */ 157 U_CDECL_BEGIN 158 static void U_CALLCONV 159 _SCSUReset(UConverter *cnv, UConverterResetChoice choice) { 160 SCSUData *scsu=(SCSUData *)cnv->extraInfo; 161 162 if(choice<=UCNV_RESET_TO_UNICODE) { 163 /* reset toUnicode */ 164 uprv_memcpy(scsu->toUDynamicOffsets, initialDynamicOffsets, 32); 165 166 scsu->toUIsSingleByteMode=true; 167 scsu->toUState=readCommand; 168 scsu->toUQuoteWindow=scsu->toUDynamicWindow=0; 169 scsu->toUByteOne=0; 170 171 cnv->toULength=0; 172 } 173 if(choice!=UCNV_RESET_TO_UNICODE) { 174 /* reset fromUnicode */ 175 uprv_memcpy(scsu->fromUDynamicOffsets, initialDynamicOffsets, 32); 176 177 scsu->fromUIsSingleByteMode=true; 178 scsu->fromUDynamicWindow=0; 179 180 scsu->nextWindowUseIndex=0; 181 switch(scsu->locale) { 182 case l_ja: 183 uprv_memcpy(scsu->windowUse, initialWindowUse_ja, 8); 184 break; 185 default: 186 uprv_memcpy(scsu->windowUse, initialWindowUse, 8); 187 break; 188 } 189 190 cnv->fromUChar32=0; 191 } 192 } 193 194 static void U_CALLCONV 195 
_SCSUOpen(UConverter *cnv, 196 UConverterLoadArgs *pArgs, 197 UErrorCode *pErrorCode) { 198 const char *locale=pArgs->locale; 199 if(pArgs->onlyTestIsLoadable) { 200 return; 201 } 202 cnv->extraInfo=uprv_malloc(sizeof(SCSUData)); 203 if(cnv->extraInfo!=nullptr) { 204 if(locale!=nullptr && locale[0]=='j' && locale[1]=='a' && (locale[2]==0 || locale[2]=='_')) { 205 ((SCSUData *)cnv->extraInfo)->locale=l_ja; 206 } else { 207 ((SCSUData *)cnv->extraInfo)->locale=lGeneric; 208 } 209 _SCSUReset(cnv, UCNV_RESET_BOTH); 210 } else { 211 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 212 } 213 214 /* Set the substitution character U+fffd as a Unicode string. */ 215 cnv->subUChars[0]=0xfffd; 216 cnv->subCharLen=-1; 217 } 218 219 static void U_CALLCONV 220 _SCSUClose(UConverter *cnv) { 221 if(cnv->extraInfo!=nullptr) { 222 if(!cnv->isExtraLocal) { 223 uprv_free(cnv->extraInfo); 224 } 225 cnv->extraInfo=nullptr; 226 } 227 } 228 229 /* SCSU-to-Unicode conversion functions ------------------------------------- */ 230 231 static void U_CALLCONV 232 _SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 233 UErrorCode *pErrorCode) { 234 UConverter *cnv; 235 SCSUData *scsu; 236 const uint8_t *source, *sourceLimit; 237 char16_t *target; 238 const char16_t *targetLimit; 239 int32_t *offsets; 240 UBool isSingleByteMode; 241 uint8_t state, byteOne; 242 int8_t quoteWindow, dynamicWindow; 243 244 int32_t sourceIndex, nextSourceIndex; 245 246 uint8_t b; 247 248 /* set up the local pointers */ 249 cnv=pArgs->converter; 250 scsu=(SCSUData *)cnv->extraInfo; 251 252 source=(const uint8_t *)pArgs->source; 253 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 254 target=pArgs->target; 255 targetLimit=pArgs->targetLimit; 256 offsets=pArgs->offsets; 257 258 /* get the state machine state */ 259 isSingleByteMode=scsu->toUIsSingleByteMode; 260 state=scsu->toUState; 261 quoteWindow=scsu->toUQuoteWindow; 262 dynamicWindow=scsu->toUDynamicWindow; 263 byteOne=scsu->toUByteOne; 264 265 /* sourceIndex=-1 if the 
current character began in the previous buffer */ 266 sourceIndex=state==readCommand ? 0 : -1; 267 nextSourceIndex=0; 268 269 /* 270 * conversion "loop" 271 * 272 * For performance, this is not a normal C loop. 273 * Instead, there are two code blocks for the two SCSU modes. 274 * The function branches to either one, and a change of the mode is done with a goto to 275 * the other branch. 276 * 277 * Each branch has two conventional loops: 278 * - a fast-path loop for the most common codes in the mode 279 * - a loop for all other codes in the mode 280 * When the fast-path runs into a code that it cannot handle, its loop ends and it 281 * runs into the following loop to handle the other codes. 282 * The end of the input or output buffer is also handled by the slower loop. 283 * The slow loop jumps (goto) to the fast-path loop again as soon as possible. 284 * 285 * The callback handling is done by returning with an error code. 286 * The conversion framework actually calls the callback function. 
287 */ 288 if(isSingleByteMode) { 289 /* fast path for single-byte mode */ 290 if(state==readCommand) { 291 fastSingle: 292 while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) { 293 ++source; 294 ++nextSourceIndex; 295 if(b<=0x7f) { 296 /* write US-ASCII graphic character or DEL */ 297 *target++=(char16_t)b; 298 if(offsets!=nullptr) { 299 *offsets++=sourceIndex; 300 } 301 } else { 302 /* write from dynamic window */ 303 uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f); 304 if(c<=0xffff) { 305 *target++=(char16_t)c; 306 if(offsets!=nullptr) { 307 *offsets++=sourceIndex; 308 } 309 } else { 310 /* output surrogate pair */ 311 *target++=(char16_t)(0xd7c0+(c>>10)); 312 if(target<targetLimit) { 313 *target++=(char16_t)(0xdc00|(c&0x3ff)); 314 if(offsets!=nullptr) { 315 *offsets++=sourceIndex; 316 *offsets++=sourceIndex; 317 } 318 } else { 319 /* target overflow */ 320 if(offsets!=nullptr) { 321 *offsets++=sourceIndex; 322 } 323 cnv->UCharErrorBuffer[0]=(char16_t)(0xdc00|(c&0x3ff)); 324 cnv->UCharErrorBufferLength=1; 325 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 326 goto endloop; 327 } 328 } 329 } 330 sourceIndex=nextSourceIndex; 331 } 332 } 333 334 /* normal state machine for single-byte mode, minus handling for what fastSingle covers */ 335 singleByteMode: 336 while(source<sourceLimit) { 337 if(target>=targetLimit) { 338 /* target is full */ 339 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 340 break; 341 } 342 b=*source++; 343 ++nextSourceIndex; 344 switch(state) { 345 case readCommand: 346 /* redundant conditions are commented out */ 347 /* here: b<0x20 because otherwise we would be in fastSingle */ 348 if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) { 349 /* CR/LF/TAB/NUL */ 350 *target++=(char16_t)b; 351 if(offsets!=nullptr) { 352 *offsets++=sourceIndex; 353 } 354 sourceIndex=nextSourceIndex; 355 goto fastSingle; 356 } else if(SC0<=b) { 357 if(b<=SC7) { 358 dynamicWindow=(int8_t)(b-SC0); 359 
sourceIndex=nextSourceIndex; 360 goto fastSingle; 361 } else /* if(SD0<=b && b<=SD7) */ { 362 dynamicWindow=(int8_t)(b-SD0); 363 state=defineOne; 364 } 365 } else if(/* SQ0<=b && */ b<=SQ7) { 366 quoteWindow=(int8_t)(b-SQ0); 367 state=quoteOne; 368 } else if(b==SDX) { 369 state=definePairOne; 370 } else if(b==SQU) { 371 state=quotePairOne; 372 } else if(b==SCU) { 373 sourceIndex=nextSourceIndex; 374 isSingleByteMode=false; 375 goto fastUnicode; 376 } else /* Srs */ { 377 /* callback(illegal) */ 378 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 379 cnv->toUBytes[0]=b; 380 cnv->toULength=1; 381 goto endloop; 382 } 383 384 /* store the first byte of a multibyte sequence in toUBytes[] */ 385 cnv->toUBytes[0]=b; 386 cnv->toULength=1; 387 break; 388 case quotePairOne: 389 byteOne=b; 390 cnv->toUBytes[1]=b; 391 cnv->toULength=2; 392 state=quotePairTwo; 393 break; 394 case quotePairTwo: 395 *target++=(char16_t)((byteOne<<8)|b); 396 if(offsets!=nullptr) { 397 *offsets++=sourceIndex; 398 } 399 sourceIndex=nextSourceIndex; 400 state=readCommand; 401 goto fastSingle; 402 case quoteOne: 403 if(b<0x80) { 404 /* all static offsets are in the BMP */ 405 *target++=(char16_t)(staticOffsets[quoteWindow]+b); 406 if(offsets!=nullptr) { 407 *offsets++=sourceIndex; 408 } 409 } else { 410 /* write from dynamic window */ 411 uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f); 412 if(c<=0xffff) { 413 *target++=(char16_t)c; 414 if(offsets!=nullptr) { 415 *offsets++=sourceIndex; 416 } 417 } else { 418 /* output surrogate pair */ 419 *target++=(char16_t)(0xd7c0+(c>>10)); 420 if(target<targetLimit) { 421 *target++=(char16_t)(0xdc00|(c&0x3ff)); 422 if(offsets!=nullptr) { 423 *offsets++=sourceIndex; 424 *offsets++=sourceIndex; 425 } 426 } else { 427 /* target overflow */ 428 if(offsets!=nullptr) { 429 *offsets++=sourceIndex; 430 } 431 cnv->UCharErrorBuffer[0]=(char16_t)(0xdc00|(c&0x3ff)); 432 cnv->UCharErrorBufferLength=1; 433 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 434 goto endloop; 435 } 436 } 437 } 438 
sourceIndex=nextSourceIndex; 439 state=readCommand; 440 goto fastSingle; 441 case definePairOne: 442 dynamicWindow=(int8_t)((b>>5)&7); 443 byteOne=(uint8_t)(b&0x1f); 444 cnv->toUBytes[1]=b; 445 cnv->toULength=2; 446 state=definePairTwo; 447 break; 448 case definePairTwo: 449 scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL); 450 sourceIndex=nextSourceIndex; 451 state=readCommand; 452 goto fastSingle; 453 case defineOne: 454 if(b==0) { 455 /* callback(illegal): Reserved window offset value 0 */ 456 cnv->toUBytes[1]=b; 457 cnv->toULength=2; 458 goto endloop; 459 } else if(b<gapThreshold) { 460 scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL; 461 } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) { 462 scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset; 463 } else if(b>=fixedThreshold) { 464 scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold]; 465 } else { 466 /* callback(illegal): Reserved window offset value 0xa8..0xf8 */ 467 cnv->toUBytes[1]=b; 468 cnv->toULength=2; 469 goto endloop; 470 } 471 sourceIndex=nextSourceIndex; 472 state=readCommand; 473 goto fastSingle; 474 } 475 } 476 } else { 477 /* fast path for Unicode mode */ 478 if(state==readCommand) { 479 fastUnicode: 480 while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) { 481 *target++=(char16_t)((b<<8)|source[1]); 482 if(offsets!=nullptr) { 483 *offsets++=sourceIndex; 484 } 485 sourceIndex=nextSourceIndex; 486 nextSourceIndex+=2; 487 source+=2; 488 } 489 } 490 491 /* normal state machine for Unicode mode */ 492 /* unicodeByteMode: */ 493 while(source<sourceLimit) { 494 if(target>=targetLimit) { 495 /* target is full */ 496 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 497 break; 498 } 499 b=*source++; 500 ++nextSourceIndex; 501 switch(state) { 502 case readCommand: 503 if((uint8_t)(b-UC0)>(Urs-UC0)) { 504 byteOne=b; 505 cnv->toUBytes[0]=b; 506 cnv->toULength=1; 507 state=quotePairTwo; 508 } else if(/* UC0<=b && */ 
b<=UC7) { 509 dynamicWindow=(int8_t)(b-UC0); 510 sourceIndex=nextSourceIndex; 511 isSingleByteMode=true; 512 goto fastSingle; 513 } else if(/* UD0<=b && */ b<=UD7) { 514 dynamicWindow=(int8_t)(b-UD0); 515 isSingleByteMode=true; 516 cnv->toUBytes[0]=b; 517 cnv->toULength=1; 518 state=defineOne; 519 goto singleByteMode; 520 } else if(b==UDX) { 521 isSingleByteMode=true; 522 cnv->toUBytes[0]=b; 523 cnv->toULength=1; 524 state=definePairOne; 525 goto singleByteMode; 526 } else if(b==UQU) { 527 cnv->toUBytes[0]=b; 528 cnv->toULength=1; 529 state=quotePairOne; 530 } else /* Urs */ { 531 /* callback(illegal) */ 532 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 533 cnv->toUBytes[0]=b; 534 cnv->toULength=1; 535 goto endloop; 536 } 537 break; 538 case quotePairOne: 539 byteOne=b; 540 cnv->toUBytes[1]=b; 541 cnv->toULength=2; 542 state=quotePairTwo; 543 break; 544 case quotePairTwo: 545 *target++=(char16_t)((byteOne<<8)|b); 546 if(offsets!=nullptr) { 547 *offsets++=sourceIndex; 548 } 549 sourceIndex=nextSourceIndex; 550 state=readCommand; 551 goto fastUnicode; 552 } 553 } 554 } 555 endloop: 556 557 /* set the converter state back into UConverter */ 558 if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) { 559 /* reset to deal with the next character */ 560 state=readCommand; 561 } else if(state==readCommand) { 562 /* not in a multi-byte sequence, reset toULength */ 563 cnv->toULength=0; 564 } 565 scsu->toUIsSingleByteMode=isSingleByteMode; 566 scsu->toUState=state; 567 scsu->toUQuoteWindow=quoteWindow; 568 scsu->toUDynamicWindow=dynamicWindow; 569 scsu->toUByteOne=byteOne; 570 571 /* write back the updated pointers */ 572 pArgs->source=(const char *)source; 573 pArgs->target=target; 574 pArgs->offsets=offsets; 575 } 576 577 /* 578 * Identical to _SCSUToUnicodeWithOffsets but without offset handling. 
579 * If a change is made in the original function, then either 580 * change this function the same way or 581 * re-copy the original function and remove the variables 582 * offsets, sourceIndex, and nextSourceIndex. 583 */ 584 static void U_CALLCONV 585 _SCSUToUnicode(UConverterToUnicodeArgs *pArgs, 586 UErrorCode *pErrorCode) { 587 UConverter *cnv; 588 SCSUData *scsu; 589 const uint8_t *source, *sourceLimit; 590 char16_t *target; 591 const char16_t *targetLimit; 592 UBool isSingleByteMode; 593 uint8_t state, byteOne; 594 int8_t quoteWindow, dynamicWindow; 595 596 uint8_t b; 597 598 /* set up the local pointers */ 599 cnv=pArgs->converter; 600 scsu=(SCSUData *)cnv->extraInfo; 601 602 source=(const uint8_t *)pArgs->source; 603 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 604 target=pArgs->target; 605 targetLimit=pArgs->targetLimit; 606 607 /* get the state machine state */ 608 isSingleByteMode=scsu->toUIsSingleByteMode; 609 state=scsu->toUState; 610 quoteWindow=scsu->toUQuoteWindow; 611 dynamicWindow=scsu->toUDynamicWindow; 612 byteOne=scsu->toUByteOne; 613 614 /* 615 * conversion "loop" 616 * 617 * For performance, this is not a normal C loop. 618 * Instead, there are two code blocks for the two SCSU modes. 619 * The function branches to either one, and a change of the mode is done with a goto to 620 * the other branch. 621 * 622 * Each branch has two conventional loops: 623 * - a fast-path loop for the most common codes in the mode 624 * - a loop for all other codes in the mode 625 * When the fast-path runs into a code that it cannot handle, its loop ends and it 626 * runs into the following loop to handle the other codes. 627 * The end of the input or output buffer is also handled by the slower loop. 628 * The slow loop jumps (goto) to the fast-path loop again as soon as possible. 629 * 630 * The callback handling is done by returning with an error code. 631 * The conversion framework actually calls the callback function. 
632 */ 633 if(isSingleByteMode) { 634 /* fast path for single-byte mode */ 635 if(state==readCommand) { 636 fastSingle: 637 while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) { 638 ++source; 639 if(b<=0x7f) { 640 /* write US-ASCII graphic character or DEL */ 641 *target++=(char16_t)b; 642 } else { 643 /* write from dynamic window */ 644 uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f); 645 if(c<=0xffff) { 646 *target++=(char16_t)c; 647 } else { 648 /* output surrogate pair */ 649 *target++=(char16_t)(0xd7c0+(c>>10)); 650 if(target<targetLimit) { 651 *target++=(char16_t)(0xdc00|(c&0x3ff)); 652 } else { 653 /* target overflow */ 654 cnv->UCharErrorBuffer[0]=(char16_t)(0xdc00|(c&0x3ff)); 655 cnv->UCharErrorBufferLength=1; 656 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 657 goto endloop; 658 } 659 } 660 } 661 } 662 } 663 664 /* normal state machine for single-byte mode, minus handling for what fastSingle covers */ 665 singleByteMode: 666 while(source<sourceLimit) { 667 if(target>=targetLimit) { 668 /* target is full */ 669 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 670 break; 671 } 672 b=*source++; 673 switch(state) { 674 case readCommand: 675 /* redundant conditions are commented out */ 676 /* here: b<0x20 because otherwise we would be in fastSingle */ 677 if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) { 678 /* CR/LF/TAB/NUL */ 679 *target++=(char16_t)b; 680 goto fastSingle; 681 } else if(SC0<=b) { 682 if(b<=SC7) { 683 dynamicWindow=(int8_t)(b-SC0); 684 goto fastSingle; 685 } else /* if(SD0<=b && b<=SD7) */ { 686 dynamicWindow=(int8_t)(b-SD0); 687 state=defineOne; 688 } 689 } else if(/* SQ0<=b && */ b<=SQ7) { 690 quoteWindow=(int8_t)(b-SQ0); 691 state=quoteOne; 692 } else if(b==SDX) { 693 state=definePairOne; 694 } else if(b==SQU) { 695 state=quotePairOne; 696 } else if(b==SCU) { 697 isSingleByteMode=false; 698 goto fastUnicode; 699 } else /* Srs */ { 700 /* callback(illegal) */ 701 
*pErrorCode=U_ILLEGAL_CHAR_FOUND; 702 cnv->toUBytes[0]=b; 703 cnv->toULength=1; 704 goto endloop; 705 } 706 707 /* store the first byte of a multibyte sequence in toUBytes[] */ 708 cnv->toUBytes[0]=b; 709 cnv->toULength=1; 710 break; 711 case quotePairOne: 712 byteOne=b; 713 cnv->toUBytes[1]=b; 714 cnv->toULength=2; 715 state=quotePairTwo; 716 break; 717 case quotePairTwo: 718 *target++=(char16_t)((byteOne<<8)|b); 719 state=readCommand; 720 goto fastSingle; 721 case quoteOne: 722 if(b<0x80) { 723 /* all static offsets are in the BMP */ 724 *target++=(char16_t)(staticOffsets[quoteWindow]+b); 725 } else { 726 /* write from dynamic window */ 727 uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f); 728 if(c<=0xffff) { 729 *target++=(char16_t)c; 730 } else { 731 /* output surrogate pair */ 732 *target++=(char16_t)(0xd7c0+(c>>10)); 733 if(target<targetLimit) { 734 *target++=(char16_t)(0xdc00|(c&0x3ff)); 735 } else { 736 /* target overflow */ 737 cnv->UCharErrorBuffer[0]=(char16_t)(0xdc00|(c&0x3ff)); 738 cnv->UCharErrorBufferLength=1; 739 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 740 goto endloop; 741 } 742 } 743 } 744 state=readCommand; 745 goto fastSingle; 746 case definePairOne: 747 dynamicWindow=(int8_t)((b>>5)&7); 748 byteOne=(uint8_t)(b&0x1f); 749 cnv->toUBytes[1]=b; 750 cnv->toULength=2; 751 state=definePairTwo; 752 break; 753 case definePairTwo: 754 scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL); 755 state=readCommand; 756 goto fastSingle; 757 case defineOne: 758 if(b==0) { 759 /* callback(illegal): Reserved window offset value 0 */ 760 cnv->toUBytes[1]=b; 761 cnv->toULength=2; 762 goto endloop; 763 } else if(b<gapThreshold) { 764 scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL; 765 } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) { 766 scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset; 767 } else if(b>=fixedThreshold) { 768 scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold]; 769 } else { 770 /* 
callback(illegal): Reserved window offset value 0xa8..0xf8 */ 771 cnv->toUBytes[1]=b; 772 cnv->toULength=2; 773 goto endloop; 774 } 775 state=readCommand; 776 goto fastSingle; 777 } 778 } 779 } else { 780 /* fast path for Unicode mode */ 781 if(state==readCommand) { 782 fastUnicode: 783 while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) { 784 *target++=(char16_t)((b<<8)|source[1]); 785 source+=2; 786 } 787 } 788 789 /* normal state machine for Unicode mode */ 790 /* unicodeByteMode: */ 791 while(source<sourceLimit) { 792 if(target>=targetLimit) { 793 /* target is full */ 794 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 795 break; 796 } 797 b=*source++; 798 switch(state) { 799 case readCommand: 800 if((uint8_t)(b-UC0)>(Urs-UC0)) { 801 byteOne=b; 802 cnv->toUBytes[0]=b; 803 cnv->toULength=1; 804 state=quotePairTwo; 805 } else if(/* UC0<=b && */ b<=UC7) { 806 dynamicWindow=(int8_t)(b-UC0); 807 isSingleByteMode=true; 808 goto fastSingle; 809 } else if(/* UD0<=b && */ b<=UD7) { 810 dynamicWindow=(int8_t)(b-UD0); 811 isSingleByteMode=true; 812 cnv->toUBytes[0]=b; 813 cnv->toULength=1; 814 state=defineOne; 815 goto singleByteMode; 816 } else if(b==UDX) { 817 isSingleByteMode=true; 818 cnv->toUBytes[0]=b; 819 cnv->toULength=1; 820 state=definePairOne; 821 goto singleByteMode; 822 } else if(b==UQU) { 823 cnv->toUBytes[0]=b; 824 cnv->toULength=1; 825 state=quotePairOne; 826 } else /* Urs */ { 827 /* callback(illegal) */ 828 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 829 cnv->toUBytes[0]=b; 830 cnv->toULength=1; 831 goto endloop; 832 } 833 break; 834 case quotePairOne: 835 byteOne=b; 836 cnv->toUBytes[1]=b; 837 cnv->toULength=2; 838 state=quotePairTwo; 839 break; 840 case quotePairTwo: 841 *target++=(char16_t)((byteOne<<8)|b); 842 state=readCommand; 843 goto fastUnicode; 844 } 845 } 846 } 847 endloop: 848 849 /* set the converter state back into UConverter */ 850 if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) { 851 /* reset to deal 
with the next character */ 852 state=readCommand; 853 } else if(state==readCommand) { 854 /* not in a multi-byte sequence, reset toULength */ 855 cnv->toULength=0; 856 } 857 scsu->toUIsSingleByteMode=isSingleByteMode; 858 scsu->toUState=state; 859 scsu->toUQuoteWindow=quoteWindow; 860 scsu->toUDynamicWindow=dynamicWindow; 861 scsu->toUByteOne=byteOne; 862 863 /* write back the updated pointers */ 864 pArgs->source=(const char *)source; 865 pArgs->target=target; 866 } 867 U_CDECL_END 868 /* SCSU-from-Unicode conversion functions ----------------------------------- */ 869 870 /* 871 * This SCSU Encoder is fairly simple but uses all SCSU commands to achieve 872 * reasonable results. The lookahead is minimal. 873 * Many cases are simple: 874 * A character fits directly into the current mode, a dynamic or static window, 875 * or is not compressible. These cases are tested first. 876 * Real compression heuristics are applied to the rest, in code branches for 877 * single/Unicode mode and BMP/supplementary code points. 878 * The heuristics used here are extremely simple. 879 */ 880 881 /* get the number of the window that this character is in, or -1 */ 882 static int8_t 883 getWindow(const uint32_t offsets[8], uint32_t c) { 884 int i; 885 for(i=0; i<8; ++i) { 886 if (c - offsets[i] <= 0x7f) { 887 return static_cast<int8_t>(i); 888 } 889 } 890 return -1; 891 } 892 893 /* is the character in the dynamic window starting at the offset, or in the direct-encoded range? 
*/ 894 static UBool 895 isInOffsetWindowOrDirect(uint32_t offset, uint32_t c) { 896 return c<=offset+0x7f && 897 (c>=offset || (c<=0x7f && 898 (c>=0x20 || (1UL<<c)&0x2601))); 899 /* binary 0010 0110 0000 0001, 900 check for b==0xd || b==0xa || b==9 || b==0 */ 901 } 902 903 /* 904 * getNextDynamicWindow returns the next dynamic window to be redefined 905 */ 906 static int8_t 907 getNextDynamicWindow(SCSUData *scsu) { 908 int8_t window=scsu->windowUse[scsu->nextWindowUseIndex]; 909 if(++scsu->nextWindowUseIndex==8) { 910 scsu->nextWindowUseIndex=0; 911 } 912 return window; 913 } 914 915 /* 916 * useDynamicWindow() adjusts 917 * windowUse[] and nextWindowUseIndex for the algorithm to choose 918 * the next dynamic window to be defined; 919 * a subclass may override it and provide its own algorithm. 920 */ 921 static void 922 useDynamicWindow(SCSUData *scsu, int8_t window) { 923 /* 924 * move the existing window, which just became the most recently used one, 925 * up in windowUse[] to nextWindowUseIndex-1 926 */ 927 928 /* first, find the index of the window - backwards to favor the more recently used windows */ 929 int i, j; 930 931 i=scsu->nextWindowUseIndex; 932 do { 933 if(--i<0) { 934 i=7; 935 } 936 } while(scsu->windowUse[i]!=window); 937 938 /* now copy each windowUse[i+1] to [i] */ 939 j=i+1; 940 if(j==8) { 941 j=0; 942 } 943 while(j!=scsu->nextWindowUseIndex) { 944 scsu->windowUse[i]=scsu->windowUse[j]; 945 i=j; 946 if(++j==8) { j=0; } 947 } 948 949 /* finally, set the window into the most recently used index */ 950 scsu->windowUse[i]=window; 951 } 952 953 /* 954 * calculate the offset and the code for a dynamic window that contains the character 955 * takes fixed offsets into account 956 * the offset of the window is stored in the offset variable, 957 * the code is returned 958 * 959 * return offset code: -1 none <=0xff code for SDn/UDn else code for SDX/UDX, subtract 0x200 to get the true code 960 */ 961 static int 962 getDynamicOffset(uint32_t c, uint32_t 
*pOffset) { 963 int i; 964 965 for(i=0; i<7; ++i) { 966 if (c - fixedOffsets[i] <= 0x7f) { 967 *pOffset=fixedOffsets[i]; 968 return 0xf9+i; 969 } 970 } 971 972 if(c<0x80) { 973 /* No dynamic window for US-ASCII. */ 974 return -1; 975 } else if(c<0x3400 || 976 c - 0x10000 < 0x14000 - 0x10000 || 977 c - 0x1d000 <= 0x1ffff - 0x1d000 978 ) { 979 /* This character is in a code range for a "small", i.e., reasonably windowable, script. */ 980 *pOffset=c&0x7fffff80; 981 return static_cast<int>(c >> 7); 982 } else if(0xe000<=c && c!=0xfeff && c<0xfff0) { 983 /* For these characters we need to take the gapOffset into account. */ 984 *pOffset=c&0x7fffff80; 985 return static_cast<int>((c - gapOffset) >> 7); 986 } else { 987 return -1; 988 } 989 } 990 U_CDECL_BEGIN 991 /* 992 * Idea for compression: 993 * - save SCSUData and other state before really starting work 994 * - at endloop, see if compression could be better with just unicode mode 995 * - don't do this if a callback has been called 996 * - if unicode mode would be smaller, then override the results with it - may need SCU at the beginning 997 * - different buffer handling! 998 * 999 * Drawback or need for corrective handling: 1000 * it is desirable to encode U+feff as SQU fe ff for the SCSU signature, and 1001 * it is desirable to start a document in US-ASCII/Latin-1 for as long as possible 1002 * not only for compression but also for HTML/XML documents with following charset/encoding announcers. 1003 * 1004 * How to achieve both? 1005 * - Only replace the result after an SDX or SCU? 
1006 */ 1007 1008 static void U_CALLCONV 1009 _SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 1010 UErrorCode *pErrorCode) { 1011 UConverter *cnv; 1012 SCSUData *scsu; 1013 const char16_t *source, *sourceLimit; 1014 uint8_t *target; 1015 int32_t targetCapacity; 1016 int32_t *offsets; 1017 1018 UBool isSingleByteMode; 1019 uint8_t dynamicWindow; 1020 uint32_t currentOffset; 1021 1022 uint32_t c, delta; 1023 1024 int32_t sourceIndex, nextSourceIndex; 1025 1026 int32_t length; 1027 1028 /* variables for compression heuristics */ 1029 uint32_t offset; 1030 char16_t lead, trail; 1031 int code; 1032 int8_t window; 1033 1034 /* set up the local pointers */ 1035 cnv=pArgs->converter; 1036 scsu=(SCSUData *)cnv->extraInfo; 1037 1038 /* set up the local pointers */ 1039 source=pArgs->source; 1040 sourceLimit=pArgs->sourceLimit; 1041 target=(uint8_t *)pArgs->target; 1042 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 1043 offsets=pArgs->offsets; 1044 1045 /* get the state machine state */ 1046 isSingleByteMode=scsu->fromUIsSingleByteMode; 1047 dynamicWindow=scsu->fromUDynamicWindow; 1048 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1049 1050 c=cnv->fromUChar32; 1051 1052 /* sourceIndex=-1 if the current character began in the previous buffer */ 1053 sourceIndex= c==0 ? 
0 : -1; 1054 nextSourceIndex=0; 1055 1056 /* similar conversion "loop" as in toUnicode */ 1057 loop: 1058 if(isSingleByteMode) { 1059 if(c!=0 && targetCapacity>0) { 1060 goto getTrailSingle; 1061 } 1062 1063 /* state machine for single-byte mode */ 1064 /* singleByteMode: */ 1065 while(source<sourceLimit) { 1066 if(targetCapacity<=0) { 1067 /* target is full */ 1068 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1069 break; 1070 } 1071 c=*source++; 1072 ++nextSourceIndex; 1073 1074 if((c-0x20)<=0x5f) { 1075 /* pass US-ASCII graphic character through */ 1076 *target++=(uint8_t)c; 1077 if(offsets!=nullptr) { 1078 *offsets++=sourceIndex; 1079 } 1080 --targetCapacity; 1081 } else if(c<0x20) { 1082 if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) { 1083 /* CR/LF/TAB/NUL */ 1084 *target++=(uint8_t)c; 1085 if(offsets!=nullptr) { 1086 *offsets++=sourceIndex; 1087 } 1088 --targetCapacity; 1089 } else { 1090 /* quote C0 control character */ 1091 c|=SQ0<<8; 1092 length=2; 1093 goto outputBytes; 1094 } 1095 } else if((delta=c-currentOffset)<=0x7f) { 1096 /* use the current dynamic window */ 1097 *target++=(uint8_t)(delta|0x80); 1098 if(offsets!=nullptr) { 1099 *offsets++=sourceIndex; 1100 } 1101 --targetCapacity; 1102 } else if(U16_IS_SURROGATE(c)) { 1103 if(U16_IS_SURROGATE_LEAD(c)) { 1104 getTrailSingle: 1105 lead=(char16_t)c; 1106 if(source<sourceLimit) { 1107 /* test the following code unit */ 1108 trail=*source; 1109 if(U16_IS_TRAIL(trail)) { 1110 ++source; 1111 ++nextSourceIndex; 1112 c=U16_GET_SUPPLEMENTARY(c, trail); 1113 /* convert this surrogate code point */ 1114 /* exit this condition tree */ 1115 } else { 1116 /* this is an unmatched lead code unit (1st surrogate) */ 1117 /* callback(illegal) */ 1118 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1119 goto endloop; 1120 } 1121 } else { 1122 /* no more input */ 1123 break; 1124 } 1125 } else { 1126 /* this is an unmatched trail code unit (2nd surrogate) */ 1127 /* callback(illegal) */ 1128 
*pErrorCode=U_ILLEGAL_CHAR_FOUND; 1129 goto endloop; 1130 } 1131 1132 /* compress supplementary character U+10000..U+10ffff */ 1133 if((delta=c-currentOffset)<=0x7f) { 1134 /* use the current dynamic window */ 1135 *target++=(uint8_t)(delta|0x80); 1136 if(offsets!=nullptr) { 1137 *offsets++=sourceIndex; 1138 } 1139 --targetCapacity; 1140 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { 1141 /* there is a dynamic window that contains this character, change to it */ 1142 dynamicWindow=window; 1143 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1144 useDynamicWindow(scsu, dynamicWindow); 1145 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; 1146 length=2; 1147 goto outputBytes; 1148 } else if((code=getDynamicOffset(c, &offset))>=0) { 1149 /* might check if there are more characters in this window to come */ 1150 /* define an extended window with this character */ 1151 code-=0x200; 1152 dynamicWindow=getNextDynamicWindow(scsu); 1153 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; 1154 useDynamicWindow(scsu, dynamicWindow); 1155 c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; 1156 length=4; 1157 goto outputBytes; 1158 } else { 1159 /* change to Unicode mode and output this (lead, trail) pair */ 1160 isSingleByteMode=false; 1161 *target++=(uint8_t)SCU; 1162 if(offsets!=nullptr) { 1163 *offsets++=sourceIndex; 1164 } 1165 --targetCapacity; 1166 c=((uint32_t)lead<<16)|trail; 1167 length=4; 1168 goto outputBytes; 1169 } 1170 } else if(c<0xa0) { 1171 /* quote C1 control character */ 1172 c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */ 1173 length=2; 1174 goto outputBytes; 1175 } else if(c==0xfeff || c>=0xfff0) { 1176 /* quote signature character=byte order mark and specials */ 1177 c|=SQU<<16; 1178 length=3; 1179 goto outputBytes; 1180 } else { 1181 /* compress all other BMP characters */ 1182 if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { 1183 /* there is a window defined 
that contains this character - switch to it or quote from it? */ 1184 if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) { 1185 /* change to dynamic window */ 1186 dynamicWindow=window; 1187 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1188 useDynamicWindow(scsu, dynamicWindow); 1189 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; 1190 length=2; 1191 goto outputBytes; 1192 } else { 1193 /* quote from dynamic window */ 1194 c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80; 1195 length=2; 1196 goto outputBytes; 1197 } 1198 } else if((window=getWindow(staticOffsets, c))>=0) { 1199 /* quote from static window */ 1200 c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]); 1201 length=2; 1202 goto outputBytes; 1203 } else if((code=getDynamicOffset(c, &offset))>=0) { 1204 /* define a dynamic window with this character */ 1205 dynamicWindow=getNextDynamicWindow(scsu); 1206 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; 1207 useDynamicWindow(scsu, dynamicWindow); 1208 c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; 1209 length=3; 1210 goto outputBytes; 1211 } else if ((c - 0x3400) < (0xd800 - 0x3400) && 1212 (source >= sourceLimit || (uint32_t)(*source - 0x3400) < (0xd800 - 0x3400)) 1213 ) { 1214 /* 1215 * this character is not compressible (a BMP ideograph or similar); 1216 * switch to Unicode mode if this is the last character in the block 1217 * or there is at least one more ideograph following immediately 1218 */ 1219 isSingleByteMode=false; 1220 c|=SCU<<16; 1221 length=3; 1222 goto outputBytes; 1223 } else { 1224 /* quote Unicode */ 1225 c|=SQU<<16; 1226 length=3; 1227 goto outputBytes; 1228 } 1229 } 1230 1231 /* normal end of conversion: prepare for a new character */ 1232 c=0; 1233 sourceIndex=nextSourceIndex; 1234 } 1235 } else { 1236 if(c!=0 && targetCapacity>0) { 1237 goto getTrailUnicode; 1238 } 1239 1240 /* state 
machine for Unicode mode */ 1241 /* unicodeByteMode: */ 1242 while(source<sourceLimit) { 1243 if(targetCapacity<=0) { 1244 /* target is full */ 1245 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1246 break; 1247 } 1248 c=*source++; 1249 ++nextSourceIndex; 1250 1251 if ((c - 0x3400) < (0xd800 - 0x3400)) { 1252 /* not compressible, write character directly */ 1253 if(targetCapacity>=2) { 1254 *target++=(uint8_t)(c>>8); 1255 *target++=(uint8_t)c; 1256 if(offsets!=nullptr) { 1257 *offsets++=sourceIndex; 1258 *offsets++=sourceIndex; 1259 } 1260 targetCapacity-=2; 1261 } else { 1262 length=2; 1263 goto outputBytes; 1264 } 1265 } else if (c - 0x3400 >= 0xf300 - 0x3400 /* c<0x3400 || c>=0xf300 */) { 1266 /* compress BMP character if the following one is not an uncompressible ideograph */ 1267 if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) { 1268 if (c - 0x30 < 10 || c - 0x61 < 26 || c - 0x41 < 26) { 1269 /* ASCII digit or letter */ 1270 isSingleByteMode=true; 1271 c|=((uint32_t)(UC0+dynamicWindow)<<8)|c; 1272 length=2; 1273 goto outputBytes; 1274 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { 1275 /* there is a dynamic window that contains this character, change to it */ 1276 isSingleByteMode=true; 1277 dynamicWindow=window; 1278 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1279 useDynamicWindow(scsu, dynamicWindow); 1280 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; 1281 length=2; 1282 goto outputBytes; 1283 } else if((code=getDynamicOffset(c, &offset))>=0) { 1284 /* define a dynamic window with this character */ 1285 isSingleByteMode=true; 1286 dynamicWindow=getNextDynamicWindow(scsu); 1287 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; 1288 useDynamicWindow(scsu, dynamicWindow); 1289 c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; 1290 length=3; 1291 goto outputBytes; 1292 } 1293 } 1294 1295 /* don't know how to compress this character, just write it directly */ 
1296 length=2; 1297 goto outputBytes; 1298 } else if(c<0xe000) { 1299 /* c is a surrogate */ 1300 if(U16_IS_SURROGATE_LEAD(c)) { 1301 getTrailUnicode: 1302 lead=(char16_t)c; 1303 if(source<sourceLimit) { 1304 /* test the following code unit */ 1305 trail=*source; 1306 if(U16_IS_TRAIL(trail)) { 1307 ++source; 1308 ++nextSourceIndex; 1309 c=U16_GET_SUPPLEMENTARY(c, trail); 1310 /* convert this surrogate code point */ 1311 /* exit this condition tree */ 1312 } else { 1313 /* this is an unmatched lead code unit (1st surrogate) */ 1314 /* callback(illegal) */ 1315 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1316 goto endloop; 1317 } 1318 } else { 1319 /* no more input */ 1320 break; 1321 } 1322 } else { 1323 /* this is an unmatched trail code unit (2nd surrogate) */ 1324 /* callback(illegal) */ 1325 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1326 goto endloop; 1327 } 1328 1329 /* compress supplementary character */ 1330 if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 && 1331 !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400)) 1332 ) { 1333 /* 1334 * there is a dynamic window that contains this character and 1335 * the following character is not uncompressible, 1336 * change to the window 1337 */ 1338 isSingleByteMode=true; 1339 dynamicWindow=window; 1340 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; 1341 useDynamicWindow(scsu, dynamicWindow); 1342 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; 1343 length=2; 1344 goto outputBytes; 1345 } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */ 1346 (code=getDynamicOffset(c, &offset))>=0 1347 ) { 1348 /* two supplementary characters in (probably) the same window - define an extended one */ 1349 isSingleByteMode=true; 1350 code-=0x200; 1351 dynamicWindow=getNextDynamicWindow(scsu); 1352 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; 1353 useDynamicWindow(scsu, dynamicWindow); 1354 
c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; 1355 length=4; 1356 goto outputBytes; 1357 } else { 1358 /* don't know how to compress this character, just write it directly */ 1359 c=((uint32_t)lead<<16)|trail; 1360 length=4; 1361 goto outputBytes; 1362 } 1363 } else /* 0xe000<=c<0xf300 */ { 1364 /* quote to avoid SCSU tags */ 1365 c|=UQU<<16; 1366 length=3; 1367 goto outputBytes; 1368 } 1369 1370 /* normal end of conversion: prepare for a new character */ 1371 c=0; 1372 sourceIndex=nextSourceIndex; 1373 } 1374 } 1375 endloop: 1376 1377 /* set the converter state back into UConverter */ 1378 scsu->fromUIsSingleByteMode=isSingleByteMode; 1379 scsu->fromUDynamicWindow=dynamicWindow; 1380 1381 cnv->fromUChar32=c; 1382 1383 /* write back the updated pointers */ 1384 pArgs->source=source; 1385 pArgs->target=(char *)target; 1386 pArgs->offsets=offsets; 1387 return; 1388 1389 outputBytes: 1390 /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */ 1391 /* from the first if in the loop we know that targetCapacity>0 */ 1392 if(length<=targetCapacity) { 1393 if(offsets==nullptr) { 1394 switch(length) { 1395 /* each branch falls through to the next one */ 1396 case 4: 1397 *target++=(uint8_t)(c>>24); 1398 U_FALLTHROUGH; 1399 case 3: 1400 *target++=(uint8_t)(c>>16); 1401 U_FALLTHROUGH; 1402 case 2: 1403 *target++=(uint8_t)(c>>8); 1404 U_FALLTHROUGH; 1405 case 1: 1406 *target++=(uint8_t)c; 1407 U_FALLTHROUGH; 1408 default: 1409 /* will never occur */ 1410 break; 1411 } 1412 } else { 1413 switch(length) { 1414 /* each branch falls through to the next one */ 1415 case 4: 1416 *target++=(uint8_t)(c>>24); 1417 *offsets++=sourceIndex; 1418 U_FALLTHROUGH; 1419 case 3: 1420 *target++=(uint8_t)(c>>16); 1421 *offsets++=sourceIndex; 1422 U_FALLTHROUGH; 1423 case 2: 1424 *target++=(uint8_t)(c>>8); 1425 *offsets++=sourceIndex; 1426 U_FALLTHROUGH; 1427 case 1: 1428 *target++=(uint8_t)c; 1429 
*offsets++=sourceIndex; 1430 U_FALLTHROUGH; 1431 default: 1432 /* will never occur */ 1433 break; 1434 } 1435 } 1436 targetCapacity-=length; 1437 1438 /* normal end of conversion: prepare for a new character */ 1439 c=0; 1440 sourceIndex=nextSourceIndex; 1441 goto loop; 1442 } else { 1443 uint8_t *p; 1444 1445 /* 1446 * We actually do this backwards here: 1447 * In order to save an intermediate variable, we output 1448 * first to the overflow buffer what does not fit into the 1449 * regular target. 1450 */ 1451 /* we know that 0<=targetCapacity<length<=4 */ 1452 /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */ 1453 length-=targetCapacity; 1454 p=(uint8_t *)cnv->charErrorBuffer; 1455 switch(length) { 1456 /* each branch falls through to the next one */ 1457 case 4: 1458 *p++=(uint8_t)(c>>24); 1459 U_FALLTHROUGH; 1460 case 3: 1461 *p++=(uint8_t)(c>>16); 1462 U_FALLTHROUGH; 1463 case 2: 1464 *p++=(uint8_t)(c>>8); 1465 U_FALLTHROUGH; 1466 case 1: 1467 *p=(uint8_t)c; 1468 U_FALLTHROUGH; 1469 default: 1470 /* will never occur */ 1471 break; 1472 } 1473 cnv->charErrorBufferLength=(int8_t)length; 1474 1475 /* now output what fits into the regular target */ 1476 c>>=8*length; /* length was reduced by targetCapacity */ 1477 switch(targetCapacity) { 1478 /* each branch falls through to the next one */ 1479 case 3: 1480 *target++=(uint8_t)(c>>16); 1481 if(offsets!=nullptr) { 1482 *offsets++=sourceIndex; 1483 } 1484 U_FALLTHROUGH; 1485 case 2: 1486 *target++=(uint8_t)(c>>8); 1487 if(offsets!=nullptr) { 1488 *offsets++=sourceIndex; 1489 } 1490 U_FALLTHROUGH; 1491 case 1: 1492 *target++=(uint8_t)c; 1493 if(offsets!=nullptr) { 1494 *offsets++=sourceIndex; 1495 } 1496 U_FALLTHROUGH; 1497 default: 1498 break; 1499 } 1500 1501 /* target overflow */ 1502 targetCapacity=0; 1503 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1504 c=0; 1505 goto endloop; 1506 } 1507 } 1508 1509 /* 1510 * Identical to _SCSUFromUnicodeWithOffsets but without offset handling. 
* If a change is made in the original function, then either
* change this function the same way or
* re-copy the original function and remove the variables
* offsets, sourceIndex, and nextSourceIndex.
*/
/*
 * Convert UTF-16 text to SCSU bytes without recording offsets.
 *
 * Input is read from pArgs->source up to pArgs->sourceLimit; output goes to
 * pArgs->target up to pArgs->targetLimit. The encoder state is loaded from
 * the converter on entry and stored back at endloop:
 *   - scsu->fromUIsSingleByteMode  (single-byte mode vs. Unicode mode)
 *   - scsu->fromUDynamicWindow     (index of the active dynamic window)
 *   - cnv->fromUChar32             (pending lead surrogate, or 0)
 * so a later call can resume where this one stopped.
 *
 * On return pArgs->source and pArgs->target have been advanced.
 * *pErrorCode is set to
 *   - U_BUFFER_OVERFLOW_ERROR when the target is full; bytes of the current
 *     character that did not fit are stored in cnv->charErrorBuffer and
 *     counted in cnv->charErrorBufferLength;
 *   - U_ILLEGAL_CHAR_FOUND for an unpaired surrogate; the offending code
 *     unit is left in cnv->fromUChar32.
 *
 * Control flow: every branch that needs more than one output byte packs the
 * bytes into c (most significant byte first), sets length to the byte count
 * and jumps to outputBytes, which writes them and jumps back to loop.
 * Branches that emit a single byte write it directly and fall through to
 * the "normal end of conversion" at the bottom of the while loop.
 */
static void U_CALLCONV
_SCSUFromUnicode(UConverterFromUnicodeArgs *pArgs,
                 UErrorCode *pErrorCode) {
    UConverter *cnv;
    SCSUData *scsu;
    const char16_t *source, *sourceLimit;
    uint8_t *target;
    int32_t targetCapacity;

    UBool isSingleByteMode;
    uint8_t dynamicWindow;
    uint32_t currentOffset;

    /* c is unsigned: the range tests below rely on wrap-around of c-constant */
    uint32_t c, delta;

    int32_t length;

    /* variables for compression heuristics */
    uint32_t offset;
    char16_t lead, trail;
    int code;
    int8_t window;

    /* set up the local pointers */
    cnv=pArgs->converter;
    scsu=(SCSUData *)cnv->extraInfo;

    /* set up the local pointers */
    source=pArgs->source;
    sourceLimit=pArgs->sourceLimit;
    target=(uint8_t *)pArgs->target;
    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);

    /* get the state machine state */
    isSingleByteMode=scsu->fromUIsSingleByteMode;
    dynamicWindow=scsu->fromUDynamicWindow;
    currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];

    /* non-zero only if an earlier call ended right after a lead surrogate */
    c=cnv->fromUChar32;

    /* similar conversion "loop" as in toUnicode */
loop:
    if(isSingleByteMode) {
        if(c!=0 && targetCapacity>0) {
            /* resume with the pending lead surrogate: jumps into the loop body */
            goto getTrailSingle;
        }

        /* state machine for single-byte mode */
/* singleByteMode: */
        while(source<sourceLimit) {
            if(targetCapacity<=0) {
                /* target is full */
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                break;
            }
            c=*source++;

            /* unsigned compare: true exactly for 0x20<=c<=0x7f */
            if((c-0x20)<=0x5f) {
                /* pass US-ASCII graphic character through */
                *target++=(uint8_t)c;
                --targetCapacity;
            } else if(c<0x20) {
                if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
                    /* CR/LF/TAB/NUL */
                    *target++=(uint8_t)c;
                    --targetCapacity;
                } else {
                    /* quote C0 control character */
                    c|=SQ0<<8;
                    length=2;
                    goto outputBytes;
                }
            } else if((delta=c-currentOffset)<=0x7f) {
                /* use the current dynamic window */
                *target++=(uint8_t)(delta|0x80);
                --targetCapacity;
            } else if(U16_IS_SURROGATE(c)) {
                if(U16_IS_SURROGATE_LEAD(c)) {
getTrailSingle:
                    lead=(char16_t)c;
                    if(source<sourceLimit) {
                        /* test the following code unit */
                        trail=*source;
                        if(U16_IS_TRAIL(trail)) {
                            ++source;
                            c=U16_GET_SUPPLEMENTARY(c, trail);
                            /* convert this surrogate code point */
                            /* exit this condition tree */
                        } else {
                            /* this is an unmatched lead code unit (1st surrogate) */
                            /* callback(illegal) */
                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                            goto endloop;
                        }
                    } else {
                        /* no more input */
                        /* c still holds the lead; endloop saves it in cnv->fromUChar32 */
                        break;
                    }
                } else {
                    /* this is an unmatched trail code unit (2nd surrogate) */
                    /* callback(illegal) */
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                    goto endloop;
                }

                /* compress supplementary character U+10000..U+10ffff */
                if((delta=c-currentOffset)<=0x7f) {
                    /* use the current dynamic window */
                    *target++=(uint8_t)(delta|0x80);
                    --targetCapacity;
                } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
                    /* there is a dynamic window that contains this character, change to it */
                    dynamicWindow=window;
                    currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
                    useDynamicWindow(scsu, dynamicWindow);
                    c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
                    length=2;
                    goto outputBytes;
                } else if((code=getDynamicOffset(c, &offset))>=0) {
                    /* might check if there are more characters in this window to come */
                    /* define an extended window with this character */
                    code-=0x200;
                    dynamicWindow=getNextDynamicWindow(scsu);
                    currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
                    useDynamicWindow(scsu, dynamicWindow);
                    /* 4 bytes: SDX, then window number in the top 3 bits above the offset code, then the character byte */
                    c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
                    length=4;
                    goto outputBytes;
                } else {
                    /* change to Unicode mode and output this (lead, trail) pair */
                    isSingleByteMode=false;
                    /* SCU is written right here; outputBytes may then see targetCapacity==0 */
                    *target++=(uint8_t)SCU;
                    --targetCapacity;
                    c=((uint32_t)lead<<16)|trail;
                    length=4;
                    goto outputBytes;
                }
            } else if(c<0xa0) {
                /* quote C1 control character */
                c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */
                length=2;
                goto outputBytes;
            } else if(c==0xfeff || c>=0xfff0) {
                /* quote signature character=byte order mark and specials */
                c|=SQU<<16;
                length=3;
                goto outputBytes;
            } else {
                /* compress all other BMP characters */
                if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
                    /* there is a window defined that contains this character - switch to it or quote from it? */
                    /* switch if this is the last unit of the buffer or the next unit also suits that window */
                    if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) {
                        /* change to dynamic window */
                        dynamicWindow=window;
                        currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
                        useDynamicWindow(scsu, dynamicWindow);
                        c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
                        length=2;
                        goto outputBytes;
                    } else {
                        /* quote from dynamic window */
                        c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80;
                        length=2;
                        goto outputBytes;
                    }
                } else if((window=getWindow(staticOffsets, c))>=0) {
                    /* quote from static window */
                    c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]);
                    length=2;
                    goto outputBytes;
                } else if((code=getDynamicOffset(c, &offset))>=0) {
                    /* define a dynamic window with this character */
                    dynamicWindow=getNextDynamicWindow(scsu);
                    currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
                    useDynamicWindow(scsu, dynamicWindow);
                    c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
                    length=3;
                    goto outputBytes;
                } else if (c - 0x3400 < 0xd800 - 0x3400 &&
                           (source >= sourceLimit || static_cast<uint32_t>(*source - 0x3400) < 0xd800 - 0x3400)
                ) {
                    /*
                     * this character is not compressible (a BMP ideograph or similar);
                     * switch to Unicode mode if this is the last character in the block
                     * or there is at least one more ideograph following immediately
                     */
                    isSingleByteMode=false;
                    c|=SCU<<16;
                    length=3;
                    goto outputBytes;
                } else {
                    /* quote Unicode */
                    c|=SQU<<16;
                    length=3;
                    goto outputBytes;
                }
            }

            /* normal end of conversion: prepare for a new character */
            c=0;
        }
    } else {
        if(c!=0 && targetCapacity>0) {
            /* resume with the pending lead surrogate: jumps into the loop body */
            goto getTrailUnicode;
        }

        /* state machine for Unicode mode */
/* unicodeByteMode: */
        while(source<sourceLimit) {
            if(targetCapacity<=0) {
                /* target is full */
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                break;
            }
            c=*source++;

            /* unsigned compare: true exactly for 0x3400<=c<0xd800 */
            if (c - 0x3400 < 0xd800 - 0x3400) {
                /* not compressible, write character directly */
                if(targetCapacity>=2) {
                    *target++=(uint8_t)(c>>8);
                    *target++=(uint8_t)c;
                    targetCapacity-=2;
                } else {
                    /* only one byte of room: let outputBytes split the pair into the overflow buffer */
                    length=2;
                    goto outputBytes;
                }
            } else if (c - 0x3400 >= 0xf300 - 0x3400 /* c<0x3400 || c>=0xf300 */) {
                /* compress BMP character if the following one is not an uncompressible ideograph */
                if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) {
                    if (c - 0x30 < 10 || c - 0x61 < 26 || c - 0x41 < 26) {
                        /* ASCII digit or letter */
                        /* return to single-byte mode, keeping the current dynamic window */
                        isSingleByteMode=true;
                        c|=((uint32_t)(UC0+dynamicWindow)<<8)|c;
                        length=2;
                        goto outputBytes;
                    } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
                        /* there is a dynamic window that contains this character, change to it */
                        isSingleByteMode=true;
                        dynamicWindow=window;
                        currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
                        useDynamicWindow(scsu, dynamicWindow);
                        c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
                        length=2;
                        goto outputBytes;
                    } else if((code=getDynamicOffset(c, &offset))>=0) {
                        /* define a dynamic window with this character */
                        isSingleByteMode=true;
                        dynamicWindow=getNextDynamicWindow(scsu);
                        currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
                        useDynamicWindow(scsu, dynamicWindow);
                        c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
                        length=3;
                        goto outputBytes;
                    }
                }

                /* don't know how to compress this character, just write it directly */
                length=2;
                goto outputBytes;
            } else if(c<0xe000) {
                /* c is a surrogate */
                if(U16_IS_SURROGATE_LEAD(c)) {
getTrailUnicode:
                    lead=(char16_t)c;
                    if(source<sourceLimit) {
                        /* test the following code unit */
                        trail=*source;
                        if(U16_IS_TRAIL(trail)) {
                            ++source;
                            c=U16_GET_SUPPLEMENTARY(c, trail);
                            /* convert this surrogate code point */
                            /* exit this condition tree */
                        } else {
                            /* this is an unmatched lead code unit (1st surrogate) */
                            /* callback(illegal) */
                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                            goto endloop;
                        }
                    } else {
                        /* no more input */
                        /* c still holds the lead; endloop saves it in cnv->fromUChar32 */
                        break;
                    }
                } else {
                    /* this is an unmatched trail code unit (2nd surrogate) */
                    /* callback(illegal) */
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                    goto endloop;
                }

                /* compress supplementary character */
                if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 &&
                    !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))
                ) {
                    /*
                     * there is a dynamic window that contains this character and
                     * the following character is not uncompressible,
                     * change to the window
                     */
                    isSingleByteMode=true;
                    dynamicWindow=window;
                    currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
                    useDynamicWindow(scsu, dynamicWindow);
                    c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
                    length=2;
                    goto outputBytes;
                } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */
                          (code=getDynamicOffset(c, &offset))>=0
                ) {
                    /* two supplementary characters in (probably) the same window - define an extended one */
                    isSingleByteMode=true;
                    code-=0x200;
                    dynamicWindow=getNextDynamicWindow(scsu);
                    currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
                    useDynamicWindow(scsu, dynamicWindow);
                    c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
                    length=4;
                    goto outputBytes;
                } else {
                    /* don't know how to compress this character, just write it directly */
                    c=((uint32_t)lead<<16)|trail;
                    length=4;
                    goto outputBytes;
                }
            } else /* 0xe000<=c<0xf300 */ {
                /* quote to avoid SCSU tags */
                c|=UQU<<16;
                length=3;
                goto outputBytes;
            }

            /* normal end of conversion: prepare for a new character */
            c=0;
        }
    }
endloop:

    /* set the converter state back into UConverter */
    scsu->fromUIsSingleByteMode=isSingleByteMode;
    scsu->fromUDynamicWindow=dynamicWindow;

    /* 0, a pending lead surrogate, or the code unit that caused U_ILLEGAL_CHAR_FOUND */
    cnv->fromUChar32=c;

    /* write back the updated pointers */
    pArgs->source=source;
    pArgs->target=(char *)target;
    return;

outputBytes:
    /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
    /* from the first if in the loop we know that targetCapacity>0 */
    if(length<=targetCapacity) {
        switch(length) {
            /* each branch falls through to the next one */
        case 4:
            *target++=(uint8_t)(c>>24);
            U_FALLTHROUGH;
        case 3:
            *target++=(uint8_t)(c>>16);
            U_FALLTHROUGH;
        case 2:
            *target++=(uint8_t)(c>>8);
            U_FALLTHROUGH;
        case 1:
            *target++=(uint8_t)c;
            U_FALLTHROUGH;
        default:
            /* will never occur */
            break;
        }
        targetCapacity-=length;

        /* normal end of conversion: prepare for a new character */
        c=0;
        /* re-enter at the top so a mode change made by the branch takes effect */
        goto loop;
    } else {
        uint8_t *p;

        /*
         * We actually do this backwards here:
         * In order to save an intermediate variable, we output
         * first to the overflow buffer what does not fit into the
         * regular target.
         */
        /* we know that 0<=targetCapacity<length<=4 */
        /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
        length-=targetCapacity;
        p=(uint8_t *)cnv->charErrorBuffer;
        switch(length) {
            /* each branch falls through to the next one */
        case 4:
            *p++=(uint8_t)(c>>24);
            U_FALLTHROUGH;
        case 3:
            *p++=(uint8_t)(c>>16);
            U_FALLTHROUGH;
        case 2:
            *p++=(uint8_t)(c>>8);
            U_FALLTHROUGH;
        case 1:
            *p=(uint8_t)c;
            U_FALLTHROUGH;
        default:
            /* will never occur */
            break;
        }
        cnv->charErrorBufferLength=(int8_t)length;

        /* now output what fits into the regular target */
        /*
         * length==4 here implies targetCapacity==0, so the switch below writes
         * nothing; the guard avoids shifting the 32-bit c by 32 bits.
         * NOTE(review): _SCSUFromUnicodeWithOffsets does this step as an
         * unguarded "c>>=8*length" - it should get the same guard.
         */
        c = (length == 4) ? 0 : c >> 8*length; /* length was reduced by targetCapacity */
        switch(targetCapacity) {
            /* each branch falls through to the next one */
        case 3:
            *target++=(uint8_t)(c>>16);
            U_FALLTHROUGH;
        case 2:
            *target++=(uint8_t)(c>>8);
            U_FALLTHROUGH;
        case 1:
            *target++=(uint8_t)c;
            U_FALLTHROUGH;
        default:
            break;
        }

        /* target overflow */
        targetCapacity=0;
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
        c=0;
        goto endloop;
    }
}

/* miscellaneous ------------------------------------------------------------ */

/*
 * Return the converter name. A converter whose SCSUData locale is l_ja
 * reports the name with the locale option appended; every other locale
 * value reports the plain name.
 */
static const char * U_CALLCONV
_SCSUGetName(const UConverter *cnv) {
    SCSUData *scsu=(SCSUData *)cnv->extraInfo;

    switch(scsu->locale) {
    case l_ja:
        return "SCSU,locale=ja";
    default:
        return "SCSU";
    }
}

/* structure for SafeClone calculations */
/* a clone is one contiguous object: the UConverter followed by its SCSUData */
struct cloneSCSUStruct
{
    UConverter cnv;
    SCSUData mydata;
};

/*
 * Clone hook for SCSU converters.
 *
 * If *status is already a failure, returns nullptr and does nothing.
 * If *pBufferSize is 0 (preflighting), stores sizeof(struct cloneSCSUStruct)
 * in *pBufferSize and returns nullptr.
 * Otherwise stackBuffer is used as a cloneSCSUStruct: the SCSUData of cnv is
 * copied into its mydata member, the clone's extraInfo is pointed at that
 * copy and isExtraLocal is set, and the address of the clone's UConverter
 * is returned.
 *
 * This function does not compare *pBufferSize with the needed size and does
 * not check stackBuffer for null - presumably the caller guarantees both
 * (NOTE(review): confirm against ucnv_safeClone()).
 */
static UConverter * U_CALLCONV
_SCSUSafeClone(const UConverter *cnv,
               void *stackBuffer,
               int32_t *pBufferSize,
               UErrorCode *status)
{
    struct cloneSCSUStruct * localClone;
    int32_t bufferSizeNeeded = sizeof(struct cloneSCSUStruct);

    if (U_FAILURE(*status)){
        return nullptr;
    }

    if (*pBufferSize == 0){ /* 'preflighting' request - set needed size into *pBufferSize */
        *pBufferSize = bufferSizeNeeded;
        return nullptr;
    }

    localClone = (struct cloneSCSUStruct *)stackBuffer;
    /* ucnv.c/ucnv_safeClone() copied the main UConverter already */

    uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(SCSUData));
    localClone->cnv.extraInfo = &localClone->mydata;
    localClone->cnv.isExtraLocal = true;

    return &localClone->cnv;
}
U_CDECL_END

/*
 * Function table for the SCSU converter. The entries are positional.
 * NOTE(review): the meaning of each slot (including the nullptr ones) is
 * given by the UConverterImpl declaration, which is not visible here.
 */
static const UConverterImpl _SCSUImpl={
    UCNV_SCSU,

    nullptr,
    nullptr,

    _SCSUOpen,
    _SCSUClose,
    _SCSUReset,

    _SCSUToUnicode,
    _SCSUToUnicodeWithOffsets,
    _SCSUFromUnicode,
    _SCSUFromUnicodeWithOffsets,
    nullptr,

    nullptr,
    _SCSUGetName,
    nullptr,
    _SCSUSafeClone,
    ucnv_getCompleteUnicodeSet,
    nullptr,
    nullptr
};

/* static descriptor of the SCSU converter: name, CCSID, byte bounds, nominal subchar */
static const UConverterStaticData _SCSUStaticData={
    sizeof(UConverterStaticData),
    "SCSU",
    1212, /* CCSID for SCSU */
    UCNV_IBM, UCNV_SCSU,
    1, 3, /* one char16_t generates at least 1 byte and at most 3 bytes */
    /*
     * The subchar here is ignored because _SCSUOpen() sets U+fffd as a Unicode
     * substitution string.
     */
    { 0x0e, 0xff, 0xfd, 0 }, 3,
    false, false,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};

/* shared-data object for SCSU, built from the static data and the function table above */
const UConverterSharedData _SCSUData=
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_SCSUStaticData, &_SCSUImpl);

#endif