tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

IrishCasing.cpp (10006B)


      1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
      2 /* This Source Code Form is subject to the terms of the Mozilla Public
      3 * License, v. 2.0. If a copy of the MPL was not distributed with this
      4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      5 
      6 /******************************************************************************
      7 
      8 This file provides a finite state machine to support Irish Gaelic uppercasing
      9 rules.
     10 
     11 The caller will need to iterate through a string, passing a State variable
     12 along with the current character to each UpperCase call and checking the flags
     13 that are returned:
     14 
     15  If aMarkPos is true, caller must remember the current index in the string as
     16  a possible target for a future action.
     17 
     18  If aAction is non-zero, then one or more characters from the marked index are
     19  to be modified:
     20    1  lowercase the marked letter
     21    2  lowercase the marked letter and its successor
     22    3  lowercase the marked letter, and delete its successor
     23 
     24 
     25 ### Rules from https://bugzilla.mozilla.org/show_bug.cgi?id=1014639,
     26 ### comments 1 and 4:
     27 
     28 v = [a,á,e,é,i,í,o,ó,u,ú]
     29 V = [A,Á,E,É,I,Í,O,Ó,U,Ú]
     30 
     31 bhf -> bhF
     32 bhF -> bhF
     33 bp  -> bP
     34 bP  -> bP
     35 dt  -> dT
     36 dT  -> dT
     37 gc  -> gC
     38 gC  -> gC
     39 h{V}  -> h{V}
     40 mb  -> mB
     41 mB  -> mB
     42 n-{v} -> n{V}
     43 n{V} -> n{V}
     44 nd  -> nD
     45 nD  -> nD
     46 ng  -> nG
     47 nG  -> nG
     48 t-{v} -> t{V}
     49 t{V} -> t{V}
     50 ts{v} -> tS{V}
     51 tS{v} -> tS{V}
     52 tS{V} -> tS{V}
     53 tsl  -> tSL
     54 tSl  -> tSL
     55 tSL  -> tSL
     56 tsn  -> tSN
     57 tSn  -> tSN
     58 tSN  -> tSN
     59 tsr  -> tSR
     60 tSr  -> tSR
     61 tSR  -> tSR
     62 
     63 ### Create table of states and actions for each input class.
     64 
     65 Start (non-word) state is #; generic in-word state is _, once we know there's
     66 no special action to do in this word.
     67 
     68         #   _   b   bh  d   g   h   m   n   n-  t   t-  ts
     69 input\state
     70 b        b'  _   _   _   _   _   _   1   _   _   _   _   _
     71 B        _   _   _   _   _   _   _   1   _   _   _   _   _
     72 c        _   _   _   _   _   1   _   _   _   _   _   _   _
     73 C        _   _   _   _   _   1   _   _   _   _   _   _   _
     74 d        d'  _   _   _   _   _   _   _   1   _   _   _   _
     75 D        _   _   _   _   _   _   _   _   1   _   _   _   _
     76 f        _   _   _   2   _   _   _   _   _   _   _   _   _
     77 F        _   _   _   2   _   _   _   _   _   _   _   _   _
     78 g        g'  _   _   _   _   _   _   _   1   _   _   _   _
     79 G        _   _   _   _   _   _   _   _   1   _   _   _   _
     80 h        h'  _   bh  _   _   _   _   _   _   _   _   _   _
     81 l        _   _   _   _   _   _   _   _   _   _   _   _   1
     82 L        _   _   _   _   _   _   _   _   _   _   _   _   1
     83 m        m'  _   _   _   _   _   _   _   _   _   _   _   _
     84 n        n'  _   _   _   _   _   _   _   _   _   _   _   1
     85 N        _   _   _   _   _   _   _   _   _   _   _   _   1
     86 p        _   _   1   _   _   _   _   _   _   _   _   _   _
     87 P        _   _   1   _   _   _   _   _   _   _   _   _   _
     88 r        _   _   _   _   _   _   _   _   _   _   _   _   1
     89 R        _   _   _   _   _   _   _   _   _   _   _   _   1
     90 s        _   _   _   _   _   _   _   _   _   _   ts  _   _
     91 S        _   _   _   _   _   _   _   _   _   _   ts  _   _
     92 t        t'  _   _   _   1   _   _   _   _   _   _   _   _
     93 T        _   _   _   _   1   _   _   _   _   _   _   _   _
     94 vowel    _   _   _   _   _   _   _   _   _   1d  _   1d  1
     95 Vowel    _   _   _   _   _   _   1   _   1   _   1   _   1
     96 hyph     _   _   _   _   _   _   _   _   n-  _   t-  _   _
     97 letter   _   _   _   _   _   _   _   _   _   _   _   _   _
     98 other    #   #   #   #   #   #   #   #   #   #   #   #   #
     99 
    100 Actions:
    101  1            lowercase one letter at start of word
    102  2            lowercase two letters at start of word
    103  1d           lowercase one letter at start of word, and delete next
    104               (and then go to state _, nothing further to do in this word)
    105 
    106 else just go to the given state; suffix ' indicates mark start-of-word.
    107 
    108 ### Consolidate identical states and classes:
    109 
    110         0   1   2   3   4   5   6   7   8   9   A   B
    111         #   _   b   bh  d   g   h   m   n [nt]- t   ts
    112 input\state
    113 b        b'  _   _   _   _   _   _   1   _   _   _   _
    114 B        _   _   _   _   _   _   _   1   _   _   _   _
    115 [cC]     _   _   _   _   _   1   _   _   _   _   _   _
    116 d        d'  _   _   _   _   _   _   _   1   _   _   _
    117 [DG]     _   _   _   _   _   _   _   _   1   _   _   _
    118 [fF]     _   _   _   2   _   _   _   _   _   _   _   _
    119 g        g'  _   _   _   _   _   _   _   1   _   _   _
    120 h        h'  _   bh  _   _   _   _   _   _   _   _   _
    121 [lLNrR]  _   _   _   _   _   _   _   _   _   _   _   1
    122 m        m'  _   _   _   _   _   _   _   _   _   _   _
    123 n        n'  _   _   _   _   _   _   _   _   _   _   1
    124 [pP]     _   _   1   _   _   _   _   _   _   _   _   _
    125 [sS]     _   _   _   _   _   _   _   _   _   _   ts  _
    126 t        t'  _   _   _   1   _   _   _   _   _   _   _
    127 T        _   _   _   _   1   _   _   _   _   _   _   _
    128 vowel    _   _   _   _   _   _   _   _   _   1d  _   1
    129 Vowel    _   _   _   _   _   _   1   _   1   _   1   1
    130 hyph     _   _   _   _   _   _   _   _ [nt-] _ [nt-] _
    131 letter   _   _   _   _   _   _   _   _   _   _   _   _
    132 other    #   #   #   #   #   #   #   #   #   #   #   #
    133 
    134 So we have 20 input classes, and 12 states.
    135 
    136 State table array will contain bytes that encode action and new state:
    137 
    138  0x80  -  bit flag: mark start-of-word position
    139  0x40  -  currently unused
    140  0x30  -  action mask: 4 values
    141           0x00  -  do nothing
    142           0x10  -  lowercase one letter
    143           0x20  -  lowercase two letters
    144           0x30  -  lowercase one, delete one
    145  0x0F  -  next-state mask
    146 ******************************************************************************/
    147 
    148 #include "IrishCasing.h"
    149 
    150 #include "nsUnicodeProperties.h"
    151 #include "nsUnicharUtils.h"
    152 
    153 namespace mozilla {
    154 
    155 const uint8_t IrishCasing::sUppercaseStateTable[kNumClasses][kNumStates] = {
    156    //  #     _     b     bh    d     g     h     m     n     [nt]- t     ts
    157    {0x82, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01,
    158     0x01},  // b
    159    {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01,
    160     0x01},  // B
    161    {0x01, 0x01, 0x01, 0x01, 0x01, 0x10, 0x01, 0x01, 0x01, 0x01, 0x01,
    162     0x01},  // [cC]
    163    {0x84, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01,
    164     0x01},  // d
    165    {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01,
    166     0x01},  // [DG]
    167    {0x01, 0x01, 0x01, 0x21, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
    168     0x01},  // [fF]
    169    {0x85, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01,
    170     0x01},  // g
    171    {0x86, 0x01, 0x03, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
    172     0x01},  // h
    173    {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
    174     0x11},  // [lLNrR]
    175    {0x87, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
    176     0x01},  // m
    177    {0x88, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
    178     0x11},  // n
    179    {0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
    180     0x01},  // [pP]
    181    {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x0B,
    182     0x01},  // [sS]
    183    {0x8A, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
    184     0x01},  // t
    185    {0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
    186     0x01},  // T
    187    {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x31, 0x01,
    188     0x11},  // vowel
    189    {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x11, 0x01, 0x11,
    190     0x11},  // Vowel
    191    {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x09, 0x01, 0x09,
    192     0x01},  // hyph
    193    {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
    194     0x01},  // letter
    195    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    196     0x00}  // other
    197 };
    198 
    199 #define HYPHEN 0x2010
    200 #define NO_BREAK_HYPHEN 0x2011
    201 #define a_ACUTE 0x00e1
    202 #define e_ACUTE 0x00e9
    203 #define i_ACUTE 0x00ed
    204 #define o_ACUTE 0x00f3
    205 #define u_ACUTE 0x00fa
    206 #define A_ACUTE 0x00c1
    207 #define E_ACUTE 0x00c9
    208 #define I_ACUTE 0x00cd
    209 #define O_ACUTE 0x00d3
    210 #define U_ACUTE 0x00da
    211 
    212 const uint8_t IrishCasing::sLcClasses[26] = {
    213    kClass_vowel,  kClass_b,      kClass_cC,     kClass_d,      kClass_vowel,
    214    kClass_fF,     kClass_g,      kClass_h,      kClass_vowel,  kClass_letter,
    215    kClass_letter, kClass_lLNrR,  kClass_m,      kClass_n,      kClass_vowel,
    216    kClass_pP,     kClass_letter, kClass_lLNrR,  kClass_sS,     kClass_t,
    217    kClass_vowel,  kClass_letter, kClass_letter, kClass_letter, kClass_letter,
    218    kClass_letter};
    219 
    220 const uint8_t IrishCasing::sUcClasses[26] = {
    221    kClass_Vowel,  kClass_B,      kClass_cC,     kClass_DG,     kClass_Vowel,
    222    kClass_fF,     kClass_DG,     kClass_letter, kClass_Vowel,  kClass_letter,
    223    kClass_letter, kClass_lLNrR,  kClass_letter, kClass_lLNrR,  kClass_Vowel,
    224    kClass_pP,     kClass_letter, kClass_lLNrR,  kClass_sS,     kClass_T,
    225    kClass_Vowel,  kClass_letter, kClass_letter, kClass_letter, kClass_letter,
    226    kClass_letter};
    227 
    228 uint8_t IrishCasing::GetClass(uint32_t aCh) {
    229  using mozilla::unicode::GetGenCategory;
    230  if (aCh >= 'a' && aCh <= 'z') {
    231    return sLcClasses[aCh - 'a'];
    232  }
    233 
    234  if (aCh >= 'A' && aCh <= 'Z') {
    235    return sUcClasses[aCh - 'A'];
    236  }
    237 
    238  if (GetGenCategory(aCh) == nsUGenCategory::kLetter) {
    239    if (aCh == a_ACUTE || aCh == e_ACUTE || aCh == i_ACUTE || aCh == o_ACUTE ||
    240        aCh == u_ACUTE) {
    241      return kClass_vowel;
    242    }
    243 
    244    if (aCh == A_ACUTE || aCh == E_ACUTE || aCh == I_ACUTE || aCh == O_ACUTE ||
    245        aCh == U_ACUTE) {
    246      return kClass_Vowel;
    247    }
    248 
    249    return kClass_letter;
    250  }
    251 
    252  if (aCh == '-' || aCh == HYPHEN || aCh == NO_BREAK_HYPHEN) {
    253    return kClass_hyph;
    254  }
    255 
    256  return kClass_other;
    257 }
    258 
    259 uint32_t IrishCasing::UpperCase(uint32_t aCh, State& aState, bool& aMarkPos,
    260                                uint8_t& aAction) {
    261  uint8_t cls = GetClass(aCh);
    262  uint8_t stateEntry = sUppercaseStateTable[cls][aState];
    263  aMarkPos = !!(stateEntry & kMarkPositionFlag);
    264  aAction = (stateEntry & kActionMask) >> kActionShift;
    265  aState = State(stateEntry & kNextStateMask);
    266 
    267  return ToUpperCase(aCh);
    268 }
    269 
    270 }  // namespace mozilla