tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

UtfCodec.h (7588B)


      1 /*  GRAPHITE2 LICENSING
      2 
      3    Copyright 2011, SIL International
      4    All rights reserved.
      5 
      6    This library is free software; you can redistribute it and/or modify
      7    it under the terms of the GNU Lesser General Public License as published
      8    by the Free Software Foundation; either version 2.1 of License, or
      9    (at your option) any later version.
     10 
     11    This program is distributed in the hope that it will be useful,
     12    but WITHOUT ANY WARRANTY; without even the implied warranty of
     13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     14    Lesser General Public License for more details.
     15 
     16    You should also have received a copy of the GNU Lesser General Public
     17    License along with this library in the file named "LICENSE".
     18    If not, write to the Free Software Foundation, 51 Franklin Street,
     19    Suite 500, Boston, MA 02110-1335, USA or visit their web page on the
     20    internet at http://www.fsf.org/licenses/lgpl.html.
     21 
     22 Alternatively, the contents of this file may be used under the terms of the
     23 Mozilla Public License (http://mozilla.org/MPL) or the GNU General Public
     24 License, as published by the Free Software Foundation, either version 2
     25 of the License or (at your option) any later version.
     26 */
     27 #pragma once
     28 
     29 #include <cstdlib>
     30 #include "inc/Main.h"
     31 
     32 namespace graphite2 {
     33 
     34 typedef uint32  uchar_t;
     35 
     36 template <int N>
     37 struct _utf_codec
     38 {
     39    typedef uchar_t codeunit_t;
     40 
     41    static void     put(codeunit_t * cp, const uchar_t , int8 & len) throw();
     42    static uchar_t  get(const codeunit_t * cp, int8 & len) throw();
     43    static bool     validate(const codeunit_t * s, const codeunit_t * const e) throw();
     44 };
     45 
     46 
     47 template <>
     48 struct _utf_codec<32>
     49 {
     50 private:
     51    static const uchar_t    limit = 0x110000;
     52 public:
     53    typedef uint32  codeunit_t;
     54 
     55    inline
     56    static void put(codeunit_t * cp, const uchar_t usv, int8 & l) throw()
     57    {
     58        *cp = usv; l = 1;
     59    }
     60 
     61    inline
     62    static uchar_t get(const codeunit_t * cp, int8 & l) throw()
     63    {
     64        if (cp[0] < limit)  { l = 1;  return cp[0]; }
     65        else                { l = -1; return 0xFFFD; }
     66    }
     67 
     68    inline
     69    static bool validate(const codeunit_t * s, const codeunit_t * const e) throw()
     70    {
     71        return s <= e;
     72    }
     73 };
     74 
     75 
     76 template <>
     77 struct _utf_codec<16>
     78 {
     79 private:
     80    static const int32  lead_offset      = 0xD800 - (0x10000 >> 10);
     81    static const int32  surrogate_offset = 0x10000 - (0xD800 << 10) - 0xDC00;
     82 public:
     83    typedef uint16  codeunit_t;
     84 
     85    inline
     86    static void put(codeunit_t * cp, const uchar_t usv, int8 & l) throw()
     87    {
     88        if (usv < 0x10000)  { l = 1; cp[0] = codeunit_t(usv); }
     89        else
     90        {
     91            cp[0] = codeunit_t(lead_offset + (usv >> 10));
     92            cp[1] = codeunit_t(0xDC00 + (usv & 0x3FF));
     93            l = 2;
     94        }
     95    }
     96 
     97    inline
     98    static uchar_t get(const codeunit_t * cp, int8 & l) throw()
     99    {
    100        const uint32    uh = cp[0];
    101        l = 1;
    102 
    103        if (uh < 0xD800|| uh > 0xDFFF) { return uh; }
    104        if (uh > 0xDBFF) { l = -1; return 0xFFFD; }
    105        const uint32 ul = cp[1];
    106        if (ul < 0xDC00 || ul > 0xDFFF) { l = -1; return 0xFFFD; }
    107        ++l;
    108        return (uh<<10) + ul + surrogate_offset;
    109    }
    110 
    111    inline
    112    static bool validate(const codeunit_t * s, const codeunit_t * const e) throw()
    113    {
    114        const ptrdiff_t n = e-s;
    115        if (n <= 0) return n == 0;
    116        const uint32 u = *(e-1); // Get the last codepoint
    117        return (u < 0xD800 || u > 0xDBFF);
    118    }
    119 };
    120 
    121 
    122 template <>
    123 struct _utf_codec<8>
    124 {
    125 private:
    126    static const int8 sz_lut[16];
    127    static const byte mask_lut[5];
    128    static const uchar_t    limit = 0x110000;
    129 
    130 public:
    131    typedef uint8   codeunit_t;
    132 
    133    inline
    134    static void put(codeunit_t * cp, const uchar_t usv, int8 & l) throw()
    135    {
    136        if (usv < 0x80)     {l = 1; cp[0] = usv; return; }
    137        if (usv < 0x0800)   {l = 2; cp[0] = 0xC0 + (usv >> 6);  cp[1] = 0x80 + (usv & 0x3F); return; }
    138        if (usv < 0x10000)  {l = 3; cp[0] = 0xE0 + (usv >> 12); cp[1] = 0x80 + ((usv >> 6) & 0x3F);  cp[2] = 0x80 + (usv & 0x3F); return; }
    139        else                {l = 4; cp[0] = 0xF0 + (usv >> 18); cp[1] = 0x80 + ((usv >> 12) & 0x3F); cp[2] = 0x80 + ((usv >> 6) & 0x3F); cp[3] = 0x80 + (usv & 0x3F); return; }
    140    }
    141 
    142    inline
    143    static uchar_t get(const codeunit_t * cp, int8 & l) throw()
    144    {
    145        const int8 seq_sz = sz_lut[*cp >> 4];
    146        uchar_t u = *cp & mask_lut[seq_sz];
    147        l = 1;
    148        bool toolong = false;
    149 
    150        switch(seq_sz) {
    151            case 4:     u <<= 6; u |= *++cp & 0x3F; if (*cp >> 6 != 2) break; ++l; toolong  = (u < 0x10); GR_FALLTHROUGH;
    152                // no break
    153            case 3:     u <<= 6; u |= *++cp & 0x3F; if (*cp >> 6 != 2) break; ++l; toolong |= (u < 0x20); GR_FALLTHROUGH;
    154                // no break
    155            case 2:     u <<= 6; u |= *++cp & 0x3F; if (*cp >> 6 != 2) break; ++l; toolong |= (u < 0x80); GR_FALLTHROUGH;
    156                // no break
    157            case 1:     break;
    158            case 0:     l = -1; return 0xFFFD;
    159        }
    160 
    161        if (l != seq_sz || toolong  || u >= limit)
    162        {
    163            l = -l;
    164            return 0xFFFD;
    165        }
    166        return u;
    167    }
    168 
    169    inline
    170    static bool validate(const codeunit_t * s, const codeunit_t * const e) throw()
    171    {
    172        const ptrdiff_t n = e-s;
    173        if (n <= 0) return n == 0;
    174        s += (n-1);
    175        if (*s < 0x80) return true;
    176        if (*s >= 0xC0) return false;
    177        if (n == 1) return true;
    178        if (*--s < 0x80) return true;
    179        if (*s >= 0xE0) return false;
    180        if (n == 2 || *s >= 0xC0) return true;
    181        if (*--s < 0x80) return true;
    182        if (*s >= 0xF0) return false;
    183        return true;
    184    }
    185 
    186 };
    187 
    188 
    189 template <typename C>
    190 class _utf_iterator
    191 {
    192    typedef _utf_codec<sizeof(C)*8> codec;
    193 
    194    C             * cp;
    195    mutable int8    sl;
    196 
    197 public:
    198    typedef C           codeunit_type;
    199    typedef uchar_t     value_type;
    200    typedef uchar_t   * pointer;
    201 
    202    class reference
    203    {
    204        const _utf_iterator & _i;
    205 
    206        reference(const _utf_iterator & i): _i(i) {}
    207    public:
    208        operator value_type () const throw ()                   { return codec::get(_i.cp, _i.sl); }
    209        reference & operator = (const value_type usv) throw()   { codec::put(_i.cp, usv, _i.sl); return *this; }
    210 
    211        friend class _utf_iterator;
    212    };
    213 
    214 
    215    _utf_iterator(const void * us=0)    : cp(reinterpret_cast<C *>(const_cast<void *>(us))), sl(1) { }
    216 
    217    _utf_iterator   & operator ++ ()    { cp += abs(sl); return *this; }
    218    _utf_iterator   operator ++ (int)   { _utf_iterator tmp(*this); operator++(); return tmp; }
    219 
    220    bool operator == (const _utf_iterator & rhs) const throw() { return cp >= rhs.cp; }
    221    bool operator != (const _utf_iterator & rhs) const throw() { return !operator==(rhs); }
    222 
    223    reference   operator * () const throw() { return *this; }
    224    pointer     operator ->() const throw() { return &operator *(); }
    225 
    226    operator codeunit_type * () const throw() { return cp; }
    227 
    228    bool error() const throw()  { return sl < 1; }
    229    bool validate(const _utf_iterator & e)  { return codec::validate(cp, e.cp); }
    230 };
    231 
    232 template <typename C>
    233 struct utf
    234 {
    235    typedef typename _utf_codec<sizeof(C)*8>::codeunit_t codeunit_t;
    236 
    237    typedef _utf_iterator<C>        iterator;
    238    typedef _utf_iterator<const C>  const_iterator;
    239 
    240    inline
    241    static bool validate(codeunit_t * s, codeunit_t * e) throw() {
    242        return _utf_codec<sizeof(C)*8>::validate(s,e);
    243    }
    244 };
    245 
    246 
    247 typedef utf<uint32> utf32;
    248 typedef utf<uint16> utf16;
    249 typedef utf<uint8>  utf8;
    250 
    251 } // namespace graphite2