IrishCasing.cpp (10006B)
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 /* This Source Code Form is subject to the terms of the Mozilla Public 3 * License, v. 2.0. If a copy of the MPL was not distributed with this 4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 5 6 /****************************************************************************** 7 8 This file provides a finite state machine to support Irish Gaelic uppercasing 9 rules. 10 11 The caller will need to iterate through a string, passing a State variable 12 along with the current character to each UpperCase call and checking the flags 13 that are returned: 14 15 If aMarkPos is true, caller must remember the current index in the string as 16 a possible target for a future action. 17 18 If aAction is non-zero, then one or more characters from the marked index are 19 to be modified: 20 1 lowercase the marked letter 21 2 lowercase the marked letter and its successor 22 3 lowercase the marked letter, and delete its successor 23 24 25 ### Rules from https://bugzilla.mozilla.org/show_bug.cgi?id=1014639, 26 ### comments 1 and 4: 27 28 v = [a,á,e,é,i,í,o,ó,u,ú] 29 V = [A,Á,E,É,I,Í,O,Ó,U,Ú] 30 31 bhf -> bhF 32 bhF -> bhF 33 bp -> bP 34 bP -> bP 35 dt -> dT 36 dT -> dT 37 gc -> gC 38 gC -> gC 39 h{V} -> h{V} 40 mb -> mB 41 mB -> mB 42 n-{v} -> n{V} 43 n{V} -> n{V} 44 nd -> nD 45 nD -> nD 46 ng -> nG 47 nG -> nG 48 t-{v} -> t{V} 49 t{V} -> t{V} 50 ts{v} -> tS{V} 51 tS{v} -> tS{V} 52 tS{V} -> tS{V} 53 tsl -> tSL 54 tSl -> tSL 55 tSL -> tSL 56 tsn -> tSN 57 tSn -> tSN 58 tSN -> tSN 59 tsr -> tSR 60 tSr -> tSR 61 tSR -> tSR 62 63 ### Create table of states and actions for each input class. 64 65 Start (non-word) state is #; generic in-word state is _, once we know there's 66 no special action to do in this word. 67 68 # _ b bh d g h m n n- t t- ts 69 input\state 70 b b' _ _ _ _ _ _ 1 _ _ _ _ _ 71 B _ _ _ _ _ _ _ 1 _ _ _ _ _ 72 c _ _ _ _ _ 1 _ _ _ _ _ _ _ 73 C _ _ _ _ _ 1 _ _ _ _ _ _ _ 74 d d' _ _ _ _ _ _ _ 1 _ _ _ _ 75 D _ _ _ _ _ _ _ _ 1 _ _ _ _ 76 f _ _ _ 2 _ _ _ _ _ _ _ _ _ 77 F _ _ _ 2 _ _ _ _ _ _ _ _ _ 78 g g' _ _ _ _ _ _ _ 1 _ _ _ _ 79 G _ _ _ _ _ _ _ _ 1 _ _ _ _ 80 h h' _ bh _ _ _ _ _ _ _ _ _ _ 81 l _ _ _ _ _ _ _ _ _ _ _ _ 1 82 L _ _ _ _ _ _ _ _ _ _ _ _ 1 83 m m' _ _ _ _ _ _ _ _ _ _ _ _ 84 n n' _ _ _ _ _ _ _ _ _ _ _ 1 85 N _ _ _ _ _ _ _ _ _ _ _ _ 1 86 p _ _ 1 _ _ _ _ _ _ _ _ _ _ 87 P _ _ 1 _ _ _ _ _ _ _ _ _ _ 88 r _ _ _ _ _ _ _ _ _ _ _ _ 1 89 R _ _ _ _ _ _ _ _ _ _ _ _ 1 90 s _ _ _ _ _ _ _ _ _ _ ts _ _ 91 S _ _ _ _ _ _ _ _ _ _ ts _ _ 92 t t' _ _ _ 1 _ _ _ _ _ _ _ _ 93 T _ _ _ _ 1 _ _ _ _ _ _ _ _ 94 vowel _ _ _ _ _ _ _ _ _ 1d _ 1d 1 95 Vowel _ _ _ _ _ _ 1 _ 1 _ 1 _ 1 96 hyph _ _ _ _ _ _ _ _ n- _ t- _ _ 97 letter _ _ _ _ _ _ _ _ _ _ _ _ _ 98 other # # # # # # # # # # # # # 99 100 Actions: 101 1 lowercase one letter at start of word 102 2 lowercase two letters at start of word 103 1d lowercase one letter at start of word, and delete next 104 (and then go to state _, nothing further to do in this word) 105 106 else just go to the given state; suffix ' indicates mark start-of-word. 107 108 ### Consolidate identical states and classes: 109 110 0 1 2 3 4 5 6 7 8 9 A B 111 # _ b bh d g h m n [nt]- t ts 112 input\state 113 b b' _ _ _ _ _ _ 1 _ _ _ _ 114 B _ _ _ _ _ _ _ 1 _ _ _ _ 115 [cC] _ _ _ _ _ 1 _ _ _ _ _ _ 116 d d' _ _ _ _ _ _ _ 1 _ _ _ 117 [DG] _ _ _ _ _ _ _ _ 1 _ _ _ 118 [fF] _ _ _ 2 _ _ _ _ _ _ _ _ 119 g g' _ _ _ _ _ _ _ 1 _ _ _ 120 h h' _ bh _ _ _ _ _ _ _ _ _ 121 [lLNrR] _ _ _ _ _ _ _ _ _ _ _ 1 122 m m' _ _ _ _ _ _ _ _ _ _ _ 123 n n' _ _ _ _ _ _ _ _ _ _ 1 124 [pP] _ _ 1 _ _ _ _ _ _ _ _ _ 125 [sS] _ _ _ _ _ _ _ _ _ _ ts _ 126 t t' _ _ _ 1 _ _ _ _ _ _ _ 127 T _ _ _ _ 1 _ _ _ _ _ _ _ 128 vowel _ _ _ _ _ _ _ _ _ 1d _ 1 129 Vowel _ _ _ _ _ _ 1 _ 1 _ 1 1 130 hyph _ _ _ _ _ _ _ _ [nt-] _ [nt-] _ 131 letter _ _ _ _ _ _ _ _ _ _ _ _ 132 other # # # # # # # # # # # # 133 134 So we have 20 input classes, and 12 states. 135 136 State table array will contain bytes that encode action and new state: 137 138 0x80 - bit flag: mark start-of-word position 139 0x40 - currently unused 140 0x30 - action mask: 4 values 141 0x00 - do nothing 142 0x10 - lowercase one letter 143 0x20 - lowercase two letters 144 0x30 - lowercase one, delete one 145 0x0F - next-state mask 146 ******************************************************************************/ 147 148 #include "IrishCasing.h" 149 150 #include "nsUnicodeProperties.h" 151 #include "nsUnicharUtils.h" 152 153 namespace mozilla { 154 155 const uint8_t IrishCasing::sUppercaseStateTable[kNumClasses][kNumStates] = { 156 // # _ b bh d g h m n [nt]- t ts 157 {0x82, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 158 0x01}, // b 159 {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 160 0x01}, // B 161 {0x01, 0x01, 0x01, 0x01, 0x01, 0x10, 0x01, 0x01, 0x01, 0x01, 0x01, 162 0x01}, // [cC] 163 {0x84, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 164 0x01}, // d 165 {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 166 0x01}, // [DG] 167 {0x01, 0x01, 0x01, 0x21, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 168 0x01}, // [fF] 169 {0x85, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 170 0x01}, // g 171 {0x86, 0x01, 0x03, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 172 0x01}, // h 173 {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 174 0x11}, // [lLNrR] 175 {0x87, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 176 0x01}, // m 177 {0x88, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 178 0x11}, // n 179 {0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 180 0x01}, // [pP] 181 {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x0B, 182 0x01}, // [sS] 183 {0x8A, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 184 0x01}, // t 185 {0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 186 0x01}, // T 187 {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x31, 0x01, 188 0x11}, // vowel 189 {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x11, 0x01, 0x11, 190 0x11}, // Vowel 191 {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x09, 0x01, 0x09, 192 0x01}, // hyph 193 {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 194 0x01}, // letter 195 {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 196 0x00} // other 197 }; 198 199 #define HYPHEN 0x2010 200 #define NO_BREAK_HYPHEN 0x2011 201 #define a_ACUTE 0x00e1 202 #define e_ACUTE 0x00e9 203 #define i_ACUTE 0x00ed 204 #define o_ACUTE 0x00f3 205 #define u_ACUTE 0x00fa 206 #define A_ACUTE 0x00c1 207 #define E_ACUTE 0x00c9 208 #define I_ACUTE 0x00cd 209 #define O_ACUTE 0x00d3 210 #define U_ACUTE 0x00da 211 212 const uint8_t IrishCasing::sLcClasses[26] = { 213 kClass_vowel, kClass_b, kClass_cC, kClass_d, kClass_vowel, 214 kClass_fF, kClass_g, kClass_h, kClass_vowel, kClass_letter, 215 kClass_letter, kClass_lLNrR, kClass_m, kClass_n, kClass_vowel, 216 kClass_pP, kClass_letter, kClass_lLNrR, kClass_sS, kClass_t, 217 kClass_vowel, kClass_letter, kClass_letter, kClass_letter, kClass_letter, 218 kClass_letter}; 219 220 const uint8_t IrishCasing::sUcClasses[26] = { 221 kClass_Vowel, kClass_B, kClass_cC, kClass_DG, kClass_Vowel, 222 kClass_fF, kClass_DG, kClass_letter, kClass_Vowel, kClass_letter, 223 kClass_letter, kClass_lLNrR, kClass_letter, kClass_lLNrR, kClass_Vowel, 224 kClass_pP, kClass_letter, kClass_lLNrR, kClass_sS, kClass_T, 225 kClass_Vowel, kClass_letter, kClass_letter, kClass_letter, kClass_letter, 226 kClass_letter}; 227 228 uint8_t IrishCasing::GetClass(uint32_t aCh) { 229 using mozilla::unicode::GetGenCategory; 230 if (aCh >= 'a' && aCh <= 'z') { 231 return sLcClasses[aCh - 'a']; 232 } 233 234 if (aCh >= 'A' && aCh <= 'Z') { 235 return sUcClasses[aCh - 'A']; 236 } 237 238 if (GetGenCategory(aCh) == nsUGenCategory::kLetter) { 239 if (aCh == a_ACUTE || aCh == e_ACUTE || aCh == i_ACUTE || aCh == o_ACUTE || 240 aCh == u_ACUTE) { 241 return kClass_vowel; 242 } 243 244 if (aCh == A_ACUTE || aCh == E_ACUTE || aCh == I_ACUTE || aCh == O_ACUTE || 245 aCh == U_ACUTE) { 246 return kClass_Vowel; 247 } 248 249 return kClass_letter; 250 } 251 252 if (aCh == '-' || aCh == HYPHEN || aCh == NO_BREAK_HYPHEN) { 253 return kClass_hyph; 254 } 255 256 return kClass_other; 257 } 258 259 uint32_t IrishCasing::UpperCase(uint32_t aCh, State& aState, bool& aMarkPos, 260 uint8_t& aAction) { 261 uint8_t cls = GetClass(aCh); 262 uint8_t stateEntry = sUppercaseStateTable[cls][aState]; 263 aMarkPos = !!(stateEntry & kMarkPositionFlag); 264 aAction = (stateEntry & kActionMask) >> kActionShift; 265 aState = State(stateEntry & kNextStateMask); 266 267 return ToUpperCase(aCh); 268 } 269 270 } // namespace mozilla