tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

UTFConvert.cpp (6301B)


      1 // UTFConvert.cpp
      2 
      3 #include "StdAfx.h"
      4 
      5 #include "MyTypes.h"
      6 #include "UTFConvert.h"
      7 
      8 #ifdef _WIN32
      9 #define _WCHART_IS_16BIT 1
     10 #endif
     11 
     12 /*
     13  _UTF8_START(n) - is a base value for start byte (head), if there are (n) additional bytes after start byte
     14  
     15  n : _UTF8_START(n) : Bits of code point
     16 
     17  0 : 0x80 :    : unused
     18  1 : 0xC0 : 11 :
     19  2 : 0xE0 : 16 : Basic Multilingual Plane
     20  3 : 0xF0 : 21 : Unicode space
     21  4 : 0xF8 : 26 :
     22  5 : 0xFC : 31 : UCS-4
     23  6 : 0xFE : 36 : We can use it, if we want to encode any 32-bit value
     24  7 : 0xFF :
     25 */
     26 
     27 #define _UTF8_START(n) (0x100 - (1 << (7 - (n))))
     28 
     29 #define _UTF8_HEAD_PARSE2(n) if (c < _UTF8_START((n) + 1)) { numBytes = (n); c -= _UTF8_START(n); }
     30 
     31 #define _UTF8_HEAD_PARSE \
     32         _UTF8_HEAD_PARSE2(1) \
     33    else _UTF8_HEAD_PARSE2(2) \
     34    else _UTF8_HEAD_PARSE2(3) \
     35    else _UTF8_HEAD_PARSE2(4) \
     36    else _UTF8_HEAD_PARSE2(5) \
     37 
     38    // else _UTF8_HEAD_PARSE2(6)
     39 
     40 bool CheckUTF8(const char *src, bool allowReduced) throw()
     41 {
     42  for (;;)
     43  {
     44    Byte c = *src++;
     45    if (c == 0)
     46      return true;
     47 
     48    if (c < 0x80)
     49      continue;
     50    if (c < 0xC0)   // (c < 0xC0 + 2) // if we support only optimal encoding chars
     51      return false;
     52    
     53    unsigned numBytes;
     54    _UTF8_HEAD_PARSE
     55    else
     56      return false;
     57    
     58    UInt32 val = c;
     59 
     60    do
     61    {
     62      Byte c2 = *src++;
     63      if (c2 < 0x80 || c2 >= 0xC0)
     64        return allowReduced && c2 == 0;
     65      val <<= 6;
     66      val |= (c2 - 0x80);
     67    }
     68    while (--numBytes);
     69    
     70    if (val >= 0x110000)
     71      return false;
     72  }
     73 }
     74 
     75 
     76 #define _ERROR_UTF8 \
     77  { if (dest) dest[destPos] = (wchar_t)0xFFFD; destPos++; ok = false; continue; }
     78 
     79 static bool Utf8_To_Utf16(wchar_t *dest, size_t *destLen, const char *src, const char *srcLim) throw()
     80 {
     81  size_t destPos = 0;
     82  bool ok = true;
     83 
     84  for (;;)
     85  {
     86    Byte c;
     87    if (src == srcLim)
     88    {
     89      *destLen = destPos;
     90      return ok;
     91    }
     92    c = *src++;
     93 
     94    if (c < 0x80)
     95    {
     96      if (dest)
     97        dest[destPos] = (wchar_t)c;
     98      destPos++;
     99      continue;
    100    }
    101    if (c < 0xC0)
    102      _ERROR_UTF8
    103 
    104    unsigned numBytes;
    105    _UTF8_HEAD_PARSE
    106    else
    107      _ERROR_UTF8
    108    
    109    UInt32 val = c;
    110 
    111    do
    112    {
    113      Byte c2;
    114      if (src == srcLim)
    115        break;
    116      c2 = *src;
    117      if (c2 < 0x80 || c2 >= 0xC0)
    118        break;
    119      src++;
    120      val <<= 6;
    121      val |= (c2 - 0x80);
    122    }
    123    while (--numBytes);
    124 
    125    if (numBytes != 0)
    126      _ERROR_UTF8
    127 
    128    if (val < 0x10000)
    129    {
    130      if (dest)
    131        dest[destPos] = (wchar_t)val;
    132      destPos++;
    133    }
    134    else
    135    {
    136      val -= 0x10000;
    137      if (val >= 0x100000)
    138        _ERROR_UTF8
    139      if (dest)
    140      {
    141        dest[destPos + 0] = (wchar_t)(0xD800 + (val >> 10));
    142        dest[destPos + 1] = (wchar_t)(0xDC00 + (val & 0x3FF));
    143      }
    144      destPos += 2;
    145    }
    146  }
    147 }
    148 
    149 #define _UTF8_RANGE(n) (((UInt32)1) << ((n) * 5 + 6))
    150 
    151 #define _UTF8_HEAD(n, val) ((char)(_UTF8_START(n) + (val >> (6 * (n)))))
    152 #define _UTF8_CHAR(n, val) ((char)(0x80 + (((val) >> (6 * (n))) & 0x3F)))
    153 
    154 static size_t Utf16_To_Utf8_Calc(const wchar_t *src, const wchar_t *srcLim)
    155 {
    156  size_t size = srcLim - src;
    157  for (;;)
    158  {
    159    if (src == srcLim)
    160      return size;
    161    
    162    UInt32 val = *src++;
    163   
    164    if (val < 0x80)
    165      continue;
    166 
    167    if (val < _UTF8_RANGE(1))
    168    {
    169      size++;
    170      continue;
    171    }
    172 
    173    if (val >= 0xD800 && val < 0xDC00 && src != srcLim)
    174    {
    175      UInt32 c2 = *src;
    176      if (c2 >= 0xDC00 && c2 < 0xE000)
    177      {
    178        src++;
    179        size += 2;
    180        continue;
    181      }
    182    }
    183 
    184    #ifdef _WCHART_IS_16BIT
    185    
    186    size += 2;
    187    
    188    #else
    189 
    190         if (val < _UTF8_RANGE(2)) size += 2;
    191    else if (val < _UTF8_RANGE(3)) size += 3;
    192    else if (val < _UTF8_RANGE(4)) size += 4;
    193    else if (val < _UTF8_RANGE(5)) size += 5;
    194    else                           size += 6;
    195    
    196    #endif
    197  }
    198 }
    199 
    200 static char *Utf16_To_Utf8(char *dest, const wchar_t *src, const wchar_t *srcLim)
    201 {
    202  for (;;)
    203  {
    204    if (src == srcLim)
    205      return dest;
    206    
    207    UInt32 val = *src++;
    208    
    209    if (val < 0x80)
    210    {
    211      *dest++ = (char)val;
    212      continue;
    213    }
    214 
    215    if (val < _UTF8_RANGE(1))
    216    {
    217      dest[0] = _UTF8_HEAD(1, val);
    218      dest[1] = _UTF8_CHAR(0, val);
    219      dest += 2;
    220      continue;
    221    }
    222 
    223    if (val >= 0xD800 && val < 0xDC00 && src != srcLim)
    224    {
    225      UInt32 c2 = *src;
    226      if (c2 >= 0xDC00 && c2 < 0xE000)
    227      {
    228        src++;
    229        val = (((val - 0xD800) << 10) | (c2 - 0xDC00)) + 0x10000;
    230        dest[0] = _UTF8_HEAD(3, val);
    231        dest[1] = _UTF8_CHAR(2, val);
    232        dest[2] = _UTF8_CHAR(1, val);
    233        dest[3] = _UTF8_CHAR(0, val);
    234        dest += 4;
    235        continue;
    236      }
    237    }
    238    
    239    #ifndef _WCHART_IS_16BIT
    240    if (val < _UTF8_RANGE(2))
    241    #endif
    242    {
    243      dest[0] = _UTF8_HEAD(2, val);
    244      dest[1] = _UTF8_CHAR(1, val);
    245      dest[2] = _UTF8_CHAR(0, val);
    246      dest += 3;
    247      continue;
    248    }
    249    
    250    #ifndef _WCHART_IS_16BIT
    251 
    252    UInt32 b;
    253    unsigned numBits;
    254         if (val < _UTF8_RANGE(3)) { numBits = 6 * 3; b = _UTF8_HEAD(3, val); }
    255    else if (val < _UTF8_RANGE(4)) { numBits = 6 * 4; b = _UTF8_HEAD(4, val); }
    256    else if (val < _UTF8_RANGE(5)) { numBits = 6 * 5; b = _UTF8_HEAD(5, val); }
    257    else                           { numBits = 6 * 6; b = _UTF8_START(6); }
    258    
    259    *dest++ = (Byte)b;
    260    
    261    do
    262    {
    263      numBits -= 6;
    264      *dest++ = (char)(0x80 + ((val >> numBits) & 0x3F));
    265    }
    266    while (numBits != 0);
    267 
    268    #endif
    269  }
    270 }
    271 
    272 bool ConvertUTF8ToUnicode(const AString &src, UString &dest)
    273 {
    274  dest.Empty();
    275  size_t destLen = 0;
    276  Utf8_To_Utf16(NULL, &destLen, src, src.Ptr(src.Len()));
    277  bool res = Utf8_To_Utf16(dest.GetBuf((unsigned)destLen), &destLen, src, src.Ptr(src.Len()));
    278  dest.ReleaseBuf_SetEnd((unsigned)destLen);
    279  return res;
    280 }
    281 
    282 void ConvertUnicodeToUTF8(const UString &src, AString &dest)
    283 {
    284  dest.Empty();
    285  size_t destLen = Utf16_To_Utf8_Calc(src, src.Ptr(src.Len()));
    286  Utf16_To_Utf8(dest.GetBuf((unsigned)destLen), src, src.Ptr(src.Len()));
    287  dest.ReleaseBuf_SetEnd((unsigned)destLen);
    288 }