GreekCasing.cpp (9691B)
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 /* This Source Code Form is subject to the terms of the Mozilla Public 3 * License, v. 2.0. If a copy of the MPL was not distributed with this 4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 5 6 #include "GreekCasing.h" 7 #include "nsUnicharUtils.h" 8 #include "nsUnicodeProperties.h" 9 10 // Custom uppercase mapping for Greek; see bug 307039 for details 11 #define GREEK_LOWER_ALPHA 0x03B1 12 #define GREEK_LOWER_ALPHA_TONOS 0x03AC 13 #define GREEK_LOWER_ALPHA_OXIA 0x1F71 14 #define GREEK_LOWER_EPSILON 0x03B5 15 #define GREEK_LOWER_EPSILON_TONOS 0x03AD 16 #define GREEK_LOWER_EPSILON_OXIA 0x1F73 17 #define GREEK_LOWER_ETA 0x03B7 18 #define GREEK_LOWER_ETA_TONOS 0x03AE 19 #define GREEK_LOWER_ETA_OXIA 0x1F75 20 #define GREEK_LOWER_IOTA 0x03B9 21 #define GREEK_LOWER_IOTA_TONOS 0x03AF 22 #define GREEK_LOWER_IOTA_OXIA 0x1F77 23 #define GREEK_LOWER_IOTA_DIALYTIKA 0x03CA 24 #define GREEK_LOWER_IOTA_DIALYTIKA_TONOS 0x0390 25 #define GREEK_LOWER_IOTA_DIALYTIKA_OXIA 0x1FD3 26 #define GREEK_LOWER_OMICRON 0x03BF 27 #define GREEK_LOWER_OMICRON_TONOS 0x03CC 28 #define GREEK_LOWER_OMICRON_OXIA 0x1F79 29 #define GREEK_LOWER_UPSILON 0x03C5 30 #define GREEK_LOWER_UPSILON_TONOS 0x03CD 31 #define GREEK_LOWER_UPSILON_OXIA 0x1F7B 32 #define GREEK_LOWER_UPSILON_DIALYTIKA 0x03CB 33 #define GREEK_LOWER_UPSILON_DIALYTIKA_TONOS 0x03B0 34 #define GREEK_LOWER_UPSILON_DIALYTIKA_OXIA 0x1FE3 35 #define GREEK_LOWER_OMEGA 0x03C9 36 #define GREEK_LOWER_OMEGA_TONOS 0x03CE 37 #define GREEK_LOWER_OMEGA_OXIA 0x1F7D 38 #define GREEK_UPPER_ALPHA 0x0391 39 #define GREEK_UPPER_EPSILON 0x0395 40 #define GREEK_UPPER_ETA 0x0397 41 #define GREEK_UPPER_IOTA 0x0399 42 #define GREEK_UPPER_IOTA_DIALYTIKA 0x03AA 43 #define GREEK_UPPER_OMICRON 0x039F 44 #define GREEK_UPPER_UPSILON 0x03A5 45 #define GREEK_UPPER_UPSILON_DIALYTIKA 0x03AB 46 #define GREEK_UPPER_OMEGA 0x03A9 47 #define GREEK_UPPER_ALPHA_TONOS 0x0386 48 #define GREEK_UPPER_ALPHA_OXIA 0x1FBB 49 #define GREEK_UPPER_EPSILON_TONOS 0x0388 50 #define GREEK_UPPER_EPSILON_OXIA 0x1FC9 51 #define GREEK_UPPER_ETA_TONOS 0x0389 52 #define GREEK_UPPER_ETA_OXIA 0x1FCB 53 #define GREEK_UPPER_IOTA_TONOS 0x038A 54 #define GREEK_UPPER_IOTA_OXIA 0x1FDB 55 #define GREEK_UPPER_OMICRON_TONOS 0x038C 56 #define GREEK_UPPER_OMICRON_OXIA 0x1FF9 57 #define GREEK_UPPER_UPSILON_TONOS 0x038E 58 #define GREEK_UPPER_UPSILON_OXIA 0x1FEB 59 #define GREEK_UPPER_OMEGA_TONOS 0x038F 60 #define GREEK_UPPER_OMEGA_OXIA 0x1FFB 61 #define COMBINING_ACUTE_ACCENT 0x0301 62 #define COMBINING_DIAERESIS 0x0308 63 #define COMBINING_ACUTE_TONE_MARK 0x0341 64 #define COMBINING_GREEK_DIALYTIKA_TONOS 0x0344 65 66 namespace mozilla { 67 68 uint32_t GreekCasing::UpperCase(uint32_t aCh, GreekCasing::State& aState, 69 bool& aMarkEtaPos, bool& aUpdateMarkedEta) { 70 aMarkEtaPos = false; 71 aUpdateMarkedEta = false; 72 73 uint8_t category = unicode::GetGeneralCategory(aCh); 74 75 if (aState == kEtaAccMarked) { 76 switch (category) { 77 case HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER: 78 case HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER: 79 case HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER: 80 case HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER: 81 case HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER: 82 case HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK: 83 case HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK: 84 case HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK: 85 aUpdateMarkedEta = true; 86 break; 87 default: 88 break; 89 } 90 aState = kEtaAcc; 91 } 92 93 switch (aCh) { 94 case GREEK_UPPER_ALPHA: 95 case GREEK_LOWER_ALPHA: 96 aState = kAlpha; 97 return GREEK_UPPER_ALPHA; 98 99 case GREEK_UPPER_EPSILON: 100 case GREEK_LOWER_EPSILON: 101 aState = kEpsilon; 102 return GREEK_UPPER_EPSILON; 103 104 case GREEK_UPPER_ETA: 105 case GREEK_LOWER_ETA: 106 aState = kEta; 107 return GREEK_UPPER_ETA; 108 109 case GREEK_UPPER_IOTA: 110 aState = kIota; 111 return GREEK_UPPER_IOTA; 112 113 case GREEK_UPPER_OMICRON: 114 case GREEK_LOWER_OMICRON: 115 aState = kOmicron; 116 return GREEK_UPPER_OMICRON; 117 118 case GREEK_UPPER_UPSILON: 119 switch (aState) { 120 case kOmicron: 121 aState = kOmicronUpsilon; 122 break; 123 default: 124 aState = kUpsilon; 125 break; 126 } 127 return GREEK_UPPER_UPSILON; 128 129 case GREEK_UPPER_OMEGA: 130 case GREEK_LOWER_OMEGA: 131 aState = kOmega; 132 return GREEK_UPPER_OMEGA; 133 134 // iota and upsilon may be the second vowel of a diphthong 135 case GREEK_LOWER_IOTA: 136 switch (aState) { 137 case kAlphaAcc: 138 case kEpsilonAcc: 139 case kOmicronAcc: 140 case kUpsilonAcc: 141 aState = kInWord; 142 return GREEK_UPPER_IOTA_DIALYTIKA; 143 default: 144 break; 145 } 146 aState = kIota; 147 return GREEK_UPPER_IOTA; 148 149 case GREEK_LOWER_UPSILON: 150 switch (aState) { 151 case kAlphaAcc: 152 case kEpsilonAcc: 153 case kEtaAcc: 154 case kOmicronAcc: 155 aState = kInWord; 156 return GREEK_UPPER_UPSILON_DIALYTIKA; 157 case kOmicron: 158 aState = kOmicronUpsilon; 159 break; 160 default: 161 aState = kUpsilon; 162 break; 163 } 164 return GREEK_UPPER_UPSILON; 165 166 case GREEK_UPPER_IOTA_DIALYTIKA: 167 case GREEK_LOWER_IOTA_DIALYTIKA: 168 case GREEK_UPPER_UPSILON_DIALYTIKA: 169 case GREEK_LOWER_UPSILON_DIALYTIKA: 170 case COMBINING_DIAERESIS: 171 aState = kDiaeresis; 172 return ToUpperCase(aCh); 173 174 // remove accent if it follows a vowel or diaeresis, 175 // and set appropriate state for diphthong detection 176 case COMBINING_ACUTE_ACCENT: 177 case COMBINING_ACUTE_TONE_MARK: 178 switch (aState) { 179 case kAlpha: 180 aState = kAlphaAcc; 181 return uint32_t(-1); // omit this char from result string 182 case kEpsilon: 183 aState = kEpsilonAcc; 184 return uint32_t(-1); 185 case kEta: 186 aState = kEtaAcc; 187 return uint32_t(-1); 188 case kIota: 189 aState = kIotaAcc; 190 return uint32_t(-1); 191 case kOmicron: 192 aState = kOmicronAcc; 193 return uint32_t(-1); 194 case kUpsilon: 195 aState = kUpsilonAcc; 196 return uint32_t(-1); 197 case kOmicronUpsilon: 198 aState = kInWord; // this completed a diphthong 199 return uint32_t(-1); 200 case kOmega: 201 aState = kOmegaAcc; 202 return uint32_t(-1); 203 case kDiaeresis: 204 aState = kInWord; 205 return uint32_t(-1); 206 default: 207 break; 208 } 209 break; 210 211 // combinations with dieresis+accent just strip the accent, 212 // and reset to start state (don't form diphthong with following vowel) 213 case GREEK_LOWER_IOTA_DIALYTIKA_TONOS: 214 case GREEK_LOWER_IOTA_DIALYTIKA_OXIA: 215 aState = kInWord; 216 return GREEK_UPPER_IOTA_DIALYTIKA; 217 218 case GREEK_LOWER_UPSILON_DIALYTIKA_TONOS: 219 case GREEK_LOWER_UPSILON_DIALYTIKA_OXIA: 220 aState = kInWord; 221 return GREEK_UPPER_UPSILON_DIALYTIKA; 222 223 case COMBINING_GREEK_DIALYTIKA_TONOS: 224 aState = kInWord; 225 return COMBINING_DIAERESIS; 226 227 // strip accents from vowels, and note the vowel seen so that we can detect 228 // diphthongs where diaeresis needs to be added 229 case GREEK_LOWER_ALPHA_TONOS: 230 case GREEK_LOWER_ALPHA_OXIA: 231 case GREEK_UPPER_ALPHA_TONOS: 232 case GREEK_UPPER_ALPHA_OXIA: 233 aState = kAlphaAcc; 234 return GREEK_UPPER_ALPHA; 235 236 case GREEK_LOWER_EPSILON_TONOS: 237 case GREEK_LOWER_EPSILON_OXIA: 238 case GREEK_UPPER_EPSILON_TONOS: 239 case GREEK_UPPER_EPSILON_OXIA: 240 aState = kEpsilonAcc; 241 return GREEK_UPPER_EPSILON; 242 243 case GREEK_LOWER_ETA_TONOS: 244 case GREEK_UPPER_ETA_TONOS: 245 if (aState == kStart) { 246 aState = kEtaAccMarked; 247 aMarkEtaPos = true; // mark in case we need to remove the tonos later 248 return GREEK_UPPER_ETA_TONOS; // treat as disjunctive eta for now 249 } 250 // if not in initial state, fall through to strip the accent 251 [[fallthrough]]; 252 253 case GREEK_LOWER_ETA_OXIA: 254 case GREEK_UPPER_ETA_OXIA: 255 aState = kEtaAcc; 256 return GREEK_UPPER_ETA; 257 258 case GREEK_LOWER_IOTA_TONOS: 259 case GREEK_LOWER_IOTA_OXIA: 260 case GREEK_UPPER_IOTA_TONOS: 261 case GREEK_UPPER_IOTA_OXIA: 262 aState = kIotaAcc; 263 return GREEK_UPPER_IOTA; 264 265 case GREEK_LOWER_OMICRON_TONOS: 266 case GREEK_LOWER_OMICRON_OXIA: 267 case GREEK_UPPER_OMICRON_TONOS: 268 case GREEK_UPPER_OMICRON_OXIA: 269 aState = kOmicronAcc; 270 return GREEK_UPPER_OMICRON; 271 272 case GREEK_LOWER_UPSILON_TONOS: 273 case GREEK_LOWER_UPSILON_OXIA: 274 case GREEK_UPPER_UPSILON_TONOS: 275 case GREEK_UPPER_UPSILON_OXIA: 276 switch (aState) { 277 case kOmicron: 278 aState = kInWord; // this completed a diphthong 279 break; 280 default: 281 aState = kUpsilonAcc; 282 break; 283 } 284 return GREEK_UPPER_UPSILON; 285 286 case GREEK_LOWER_OMEGA_TONOS: 287 case GREEK_LOWER_OMEGA_OXIA: 288 case GREEK_UPPER_OMEGA_TONOS: 289 case GREEK_UPPER_OMEGA_OXIA: 290 aState = kOmegaAcc; 291 return GREEK_UPPER_OMEGA; 292 } 293 294 // all other characters just reset the state to either kStart or kInWord, 295 // and use standard mappings 296 switch (category) { 297 case HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER: 298 case HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER: 299 case HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER: 300 case HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER: 301 case HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER: 302 case HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK: 303 case HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK: 304 case HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK: 305 aState = kInWord; 306 break; 307 default: 308 aState = kStart; 309 break; 310 } 311 312 return ToUpperCase(aCh); 313 } 314 315 } // namespace mozilla