tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

ucnv_err.cpp (18401B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *****************************************************************************
      5 *
      6 *   Copyright (C) 1998-2016, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *****************************************************************************
     10 *
     11 *  ucnv_err.c
     12 *  Implements error behaviour functions called by T_UConverter_{from,to}Unicode
     13 *
     14 *
     15 *   Change history:
     16 *
     17 *   06/29/2000  helena      Major rewrite of the callback APIs.
     18 */
     19 
     20 #include "unicode/utypes.h"
     21 
     22 #if !UCONFIG_NO_CONVERSION
     23 
     24 #include "unicode/ucnv_err.h"
     25 #include "unicode/ucnv_cb.h"
     26 #include "ucnv_cnv.h"
     27 #include "cmemory.h"
     28 #include "unicode/ucnv.h"
     29 #include "ustrfmt.h"
     30 
     31 #define VALUE_STRING_LENGTH 48
     32 /* 48 = 6 (longest escape written for one input byte, e.g. "&#255;" or "&#xFF;") * 8 (max number of bytes per char for any converter) */
     33 #define UNICODE_PERCENT_SIGN_CODEPOINT  0x0025 /* '%' */
     34 #define UNICODE_U_CODEPOINT             0x0055 /* 'U' */
     35 #define UNICODE_X_CODEPOINT             0x0058 /* 'X' */
     36 #define UNICODE_RS_CODEPOINT            0x005C /* reverse solidus (backslash) */
     37 #define UNICODE_U_LOW_CODEPOINT         0x0075 /* 'u' */
     38 #define UNICODE_X_LOW_CODEPOINT         0x0078 /* 'x' */
     39 #define UNICODE_AMP_CODEPOINT           0x0026 /* '&' */
     40 #define UNICODE_HASH_CODEPOINT          0x0023 /* '#' */
     41 #define UNICODE_SEMICOLON_CODEPOINT     0x003B /* ';' */
     42 #define UNICODE_PLUS_CODEPOINT          0x002B /* '+' */
     43 #define UNICODE_LEFT_CURLY_CODEPOINT    0x007B /* '{' */
     44 #define UNICODE_RIGHT_CURLY_CODEPOINT   0x007D /* '}' */
     45 #define UNICODE_SPACE_CODEPOINT         0x0020 /* ' ' */
     46 #define UCNV_PRV_ESCAPE_ICU         0 /* not referenced by the code in this file; presumably names the default %U / %X form - TODO confirm */
     47 #define UCNV_PRV_ESCAPE_C           'C' /* context byte: C-style escapes (backslash + u/U from Unicode, backslash + x to Unicode) */
     48 #define UCNV_PRV_ESCAPE_XML_DEC     'D' /* context byte: decimal XML character reference, "&#DDDD;" */
     49 #define UCNV_PRV_ESCAPE_XML_HEX     'X' /* context byte: hexadecimal XML character reference, "&#xHHHH;" */
     50 #define UCNV_PRV_ESCAPE_JAVA        'J' /* context byte: backslash + "uXXXX" for every UTF-16 code unit */
     51 #define UCNV_PRV_ESCAPE_UNICODE     'U' /* context byte: "{U+XXXX}" */
     52 #define UCNV_PRV_ESCAPE_CSS2        'S' /* context byte: backslash + hex digits + one trailing space */
     53 #define UCNV_PRV_STOP_ON_ILLEGAL    'i' /* context byte: SKIP/SUBSTITUTE callbacks then act on unassigned input only */
     54 
     55 /*
     56 * IS_DEFAULT_IGNORABLE_CODE_POINT
     57 * This is to check if a code point has the default ignorable unicode property.
     58 * As such, this list needs to be updated if the ignorable code point list ever
     59 * changes.
     60 * To avoid dependency on other code, this list is hard coded here.
     61 * When an ignorable code point is found and is unmappable, the default callbacks
     62 * will ignore them.
     63 * For a list of the default ignorable code points, use this link:
     64 * https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%3ADI%3A%5D&abb=on&g=&i=
     65 *
     66 * This list should be sync with the one in CharsetCallback.java
     67 */
     68 #define IS_DEFAULT_IGNORABLE_CODE_POINT(c) ( \
     69    (c == 0x00AD) || \
     70    (c == 0x034F) || \
     71    (c == 0x061C) || \
     72    (c == 0x115F) || \
     73    (c == 0x1160) || \
     74    (0x17B4 <= c && c <= 0x17B5) || \
     75    (0x180B <= c && c <= 0x180F) || \
     76    (0x200B <= c && c <= 0x200F) || \
     77    (0x202A <= c && c <= 0x202E) || \
     78    (0x2060 <= c && c <= 0x206F) || \
     79    (c == 0x3164) || \
     80    (0xFE00 <= c && c <= 0xFE0F) || \
     81    (c == 0xFEFF) || \
     82    (c == 0xFFA0) || \
     83    (0xFFF0 <= c && c <= 0xFFF8) || \
     84    (0x1BCA0 <= c && c <= 0x1BCA3) || \
     85    (0x1D173 <= c && c <= 0x1D17A) || \
     86    (0xE0000 <= c && c <= 0xE0FFF))
     87 
     88 
     89 /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
     90 U_CAPI void    U_EXPORT2
     91 UCNV_FROM_U_CALLBACK_STOP (
     92                  const void *context,
     93                  UConverterFromUnicodeArgs *fromUArgs,
     94                  const char16_t* codeUnits,
     95                  int32_t length,
     96                  UChar32 codePoint,
     97                  UConverterCallbackReason reason,
     98                  UErrorCode * err)
     99 {
    100    (void)context;
    101    (void)fromUArgs;
    102    (void)codeUnits;
    103    (void)length;
    104    if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
    105    {
    106        /*
    107         * Skip if the codepoint has unicode property of default ignorable.
    108         */
    109        *err = U_ZERO_ERROR;
    110    }
    111    /* the caller must have set the error code accordingly */
    112 }
    113 
    114 
    115 /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
    116 U_CAPI void    U_EXPORT2
    117 UCNV_TO_U_CALLBACK_STOP (
    118                   const void *context,
    119                   UConverterToUnicodeArgs *toUArgs,
    120                   const char* codePoints,
    121                   int32_t length,
    122                   UConverterCallbackReason reason,
    123                   UErrorCode * err)
    124 {
    125    /* the caller must have set the error code accordingly */
    126    (void)context; (void)toUArgs; (void)codePoints; (void)length; (void)reason; (void)err;
    127 }
    128 
    129 U_CAPI void    U_EXPORT2
    130 UCNV_FROM_U_CALLBACK_SKIP (                  
    131                  const void *context,
    132                  UConverterFromUnicodeArgs *fromUArgs,
    133                  const char16_t* codeUnits,
    134                  int32_t length,
    135                  UChar32 codePoint,
    136                  UConverterCallbackReason reason,
    137                  UErrorCode * err)
    138 {
    139    (void)fromUArgs;
    140    (void)codeUnits;
    141    (void)length;
    142    if (reason <= UCNV_IRREGULAR)
    143    {
    144        if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
    145        {
    146            /*
    147             * Skip if the codepoint has unicode property of default ignorable.
    148             */
    149            *err = U_ZERO_ERROR;
    150        }
    151        else if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
    152        {
    153            *err = U_ZERO_ERROR;
    154        }
    155        /* else the caller must have set the error code accordingly. */
    156    }
    157    /* else ignore the reset, close and clone calls. */
    158 }
    159 
    160 U_CAPI void    U_EXPORT2
    161 UCNV_FROM_U_CALLBACK_SUBSTITUTE (
    162                  const void *context,
    163                  UConverterFromUnicodeArgs *fromArgs,
    164                  const char16_t* codeUnits,
    165                  int32_t length,
    166                  UChar32 codePoint,
    167                  UConverterCallbackReason reason,
    168                  UErrorCode * err)
    169 {
    170    (void)codeUnits;
    171    (void)length;
    172    if (reason <= UCNV_IRREGULAR)
    173    {
    174        if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
    175        {
    176            /*
    177             * Skip if the codepoint has unicode property of default ignorable.
    178             */
    179            *err = U_ZERO_ERROR;
    180        }
    181        else if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
    182        {
    183            *err = U_ZERO_ERROR;
    184            ucnv_cbFromUWriteSub(fromArgs, 0, err);
    185        }
    186        /* else the caller must have set the error code accordingly. */
    187    }
    188    /* else ignore the reset, close and clone calls. */
    189 }
    190 
    191 /*uses uprv_itou to get a unicode escape sequence of the offensive sequence,
    192 *uses a clean copy (resetted) of the converter, to convert that unicode
    193 *escape sequence to the target codepage (if conversion failure happens then
    194 *we revert to substituting with subchar)
    195 */
    196 U_CAPI void    U_EXPORT2
    197 UCNV_FROM_U_CALLBACK_ESCAPE (
    198                         const void *context,
    199                         UConverterFromUnicodeArgs *fromArgs,
    200                         const char16_t *codeUnits,
    201                         int32_t length,
    202                         UChar32 codePoint,
    203                         UConverterCallbackReason reason,
    204                         UErrorCode * err)
    205 {
    206 
    207  char16_t valueString[VALUE_STRING_LENGTH];
    208  int32_t valueStringLength = 0;
    209  int32_t i = 0;
    210 
    211  const char16_t *myValueSource = nullptr;
    212  UErrorCode err2 = U_ZERO_ERROR;
    213  UConverterFromUCallback original = nullptr;
    214  const void *originalContext;
    215 
    216  UConverterFromUCallback ignoredCallback = nullptr;
    217  const void *ignoredContext;
    218  
    219  if (reason > UCNV_IRREGULAR)
    220  {
    221      return;
    222  }
    223  else if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
    224  {
    225      /*
    226       * Skip if the codepoint has unicode property of default ignorable.
    227       */
    228      *err = U_ZERO_ERROR;
    229      return;
    230  }
    231 
    232  ucnv_setFromUCallBack (fromArgs->converter,
    233                     (UConverterFromUCallback) UCNV_FROM_U_CALLBACK_SUBSTITUTE,
    234                     nullptr,
    235                     &original,
    236                     &originalContext,
    237                     &err2);
    238  
    239  if (U_FAILURE (err2))
    240  {
    241    *err = err2;
    242    return;
    243  } 
    244  if(context==nullptr)
    245  { 
    246      while (i < length)
    247      {
    248        valueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT;  /* adding % */
    249        valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT; /* adding U */
    250        valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
    251      }
    252  }
    253  else
    254  {
    255      switch(*((char*)context))
    256      {
    257      case UCNV_PRV_ESCAPE_JAVA:
    258          while (i < length)
    259          {
    260              valueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT;    /* adding \ */
    261              valueString[valueStringLength++] = (char16_t) UNICODE_U_LOW_CODEPOINT; /* adding u */
    262              valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
    263          }
    264          break;
    265 
    266      case UCNV_PRV_ESCAPE_C:
    267          valueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT;    /* adding \ */
    268 
    269          if(length==2){
    270              valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT; /* adding U */
    271              valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 8);
    272 
    273          }
    274          else{
    275              valueString[valueStringLength++] = (char16_t) UNICODE_U_LOW_CODEPOINT; /* adding u */
    276              valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4);
    277          }
    278          break;
    279 
    280      case UCNV_PRV_ESCAPE_XML_DEC:
    281 
    282          valueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT;   /* adding & */
    283          valueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT;  /* adding # */
    284          if(length==2){
    285              valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 10, 0);
    286          }
    287          else{
    288              valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 10, 0);
    289          }
    290          valueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
    291          break;
    292 
    293      case UCNV_PRV_ESCAPE_XML_HEX:
    294 
    295          valueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT;   /* adding & */
    296          valueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT;  /* adding # */
    297          valueString[valueStringLength++] = (char16_t) UNICODE_X_LOW_CODEPOINT; /* adding x */
    298          if(length==2){
    299              valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0);
    300          }
    301          else{
    302              valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 0);
    303          }
    304          valueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
    305          break;
    306 
    307      case UCNV_PRV_ESCAPE_UNICODE:
    308          valueString[valueStringLength++] = (char16_t) UNICODE_LEFT_CURLY_CODEPOINT;    /* adding { */
    309          valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT;    /* adding U */
    310          valueString[valueStringLength++] = (char16_t) UNICODE_PLUS_CODEPOINT; /* adding + */
    311          if (length == 2) {
    312              valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 4);
    313          } else {
    314              valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4);
    315          }
    316          valueString[valueStringLength++] = (char16_t) UNICODE_RIGHT_CURLY_CODEPOINT;    /* adding } */
    317          break;
    318 
    319      case UCNV_PRV_ESCAPE_CSS2:
    320          valueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT;    /* adding \ */
    321          valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0);
    322          /* Always add space character, because the next character might be whitespace,
    323             which would erroneously be considered the termination of the escape sequence. */
    324          valueString[valueStringLength++] = (char16_t) UNICODE_SPACE_CODEPOINT;
    325          break;
    326 
    327      default:
    328          while (i < length)
    329          {
    330              valueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT;  /* adding % */
    331              valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT;             /* adding U */
    332              valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
    333          }
    334      }
    335  }  
    336  myValueSource = valueString;
    337 
    338  /* reset the error */
    339  *err = U_ZERO_ERROR;
    340 
    341  ucnv_cbFromUWriteUChars(fromArgs, &myValueSource, myValueSource+valueStringLength, 0, err);
    342 
    343  ucnv_setFromUCallBack (fromArgs->converter,
    344                         original,
    345                         originalContext,
    346                         &ignoredCallback,
    347                         &ignoredContext,
    348                         &err2);
    349  if (U_FAILURE (err2))
    350  {
    351      *err = err2;
    352      return;
    353  }
    354 }
    355 
    356 
    357 
    358 U_CAPI void  U_EXPORT2
    359 UCNV_TO_U_CALLBACK_SKIP (
    360                 const void *context,
    361                 UConverterToUnicodeArgs *toArgs,
    362                 const char* codeUnits,
    363                 int32_t length,
    364                 UConverterCallbackReason reason,
    365                 UErrorCode * err)
    366 {
    367    (void)toArgs;
    368    (void)codeUnits;
    369    (void)length;
    370    if (reason <= UCNV_IRREGULAR)
    371    {
    372        if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
    373        {
    374            *err = U_ZERO_ERROR;
    375        }
    376        /* else the caller must have set the error code accordingly. */
    377    }
    378    /* else ignore the reset, close and clone calls. */
    379 }
    380 
    381 U_CAPI void    U_EXPORT2
    382 UCNV_TO_U_CALLBACK_SUBSTITUTE (
    383                 const void *context,
    384                 UConverterToUnicodeArgs *toArgs,
    385                 const char* codeUnits,
    386                 int32_t length,
    387                 UConverterCallbackReason reason,
    388                 UErrorCode * err)
    389 {
    390    (void)codeUnits;
    391    (void)length;
    392    if (reason <= UCNV_IRREGULAR)
    393    {
    394        if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
    395        {
    396            *err = U_ZERO_ERROR;
    397            ucnv_cbToUWriteSub(toArgs,0,err);
    398        }
    399        /* else the caller must have set the error code accordingly. */
    400    }
    401    /* else ignore the reset, close and clone calls. */
    402 }
    403 
    404 /*uses uprv_itou to get a unicode escape sequence of the offensive sequence,
    405 *and uses that as the substitution sequence
    406 */
    407 U_CAPI void   U_EXPORT2
    408 UCNV_TO_U_CALLBACK_ESCAPE (
    409                 const void *context,
    410                 UConverterToUnicodeArgs *toArgs,
    411                 const char* codeUnits,
    412                 int32_t length,
    413                 UConverterCallbackReason reason,
    414                 UErrorCode * err)
    415 {
    416    char16_t uniValueString[VALUE_STRING_LENGTH];
    417    int32_t valueStringLength = 0;
    418    int32_t i = 0;
    419 
    420    if (reason > UCNV_IRREGULAR)
    421    {
    422        return;
    423    }
    424 
    425    if(context==nullptr)
    426    {    
    427        while (i < length)
    428        {
    429            uniValueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
    430            uniValueString[valueStringLength++] = (char16_t) UNICODE_X_CODEPOINT;    /* adding X */
    431            valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2);
    432        }
    433    }
    434    else
    435    {
    436        switch(*((char*)context))
    437        {
    438        case UCNV_PRV_ESCAPE_XML_DEC:
    439            while (i < length)
    440            {
    441                uniValueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT;   /* adding & */
    442                uniValueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT;  /* adding # */
    443                valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 10, 0);
    444                uniValueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
    445            }
    446            break;
    447 
    448        case UCNV_PRV_ESCAPE_XML_HEX:
    449            while (i < length)
    450            {
    451                uniValueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT;   /* adding & */
    452                uniValueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT;  /* adding # */
    453                uniValueString[valueStringLength++] = (char16_t) UNICODE_X_LOW_CODEPOINT; /* adding x */
    454                valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 0);
    455                uniValueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
    456            }
    457            break;
    458        case UCNV_PRV_ESCAPE_C:
    459            while (i < length)
    460            {
    461                uniValueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT;    /* adding \ */
    462                uniValueString[valueStringLength++] = (char16_t) UNICODE_X_LOW_CODEPOINT; /* adding x */
    463                valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 2);
    464            }
    465            break;
    466        default:
    467            while (i < length)
    468            {
    469                uniValueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
    470                uniValueString[valueStringLength++] = (char16_t) UNICODE_X_CODEPOINT;    /* adding X */
    471                uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2);
    472                valueStringLength += 2;
    473            }
    474        }
    475    }
    476    /* reset the error */
    477    *err = U_ZERO_ERROR;
    478 
    479    ucnv_cbToUWriteUChars(toArgs, uniValueString, valueStringLength, 0, err);
    480 }
    481 
    482 #endif