tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

GreekCasing.cpp (9691B)


      1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
      2 /* This Source Code Form is subject to the terms of the Mozilla Public
      3 * License, v. 2.0. If a copy of the MPL was not distributed with this
      4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      5 
      6 #include "GreekCasing.h"
      7 #include "nsUnicharUtils.h"
      8 #include "nsUnicodeProperties.h"
      9 
     10 // Custom uppercase mapping for Greek; see bug 307039 for details
     11 #define GREEK_LOWER_ALPHA 0x03B1
     12 #define GREEK_LOWER_ALPHA_TONOS 0x03AC
     13 #define GREEK_LOWER_ALPHA_OXIA 0x1F71
     14 #define GREEK_LOWER_EPSILON 0x03B5
     15 #define GREEK_LOWER_EPSILON_TONOS 0x03AD
     16 #define GREEK_LOWER_EPSILON_OXIA 0x1F73
     17 #define GREEK_LOWER_ETA 0x03B7
     18 #define GREEK_LOWER_ETA_TONOS 0x03AE
     19 #define GREEK_LOWER_ETA_OXIA 0x1F75
     20 #define GREEK_LOWER_IOTA 0x03B9
     21 #define GREEK_LOWER_IOTA_TONOS 0x03AF
     22 #define GREEK_LOWER_IOTA_OXIA 0x1F77
     23 #define GREEK_LOWER_IOTA_DIALYTIKA 0x03CA
     24 #define GREEK_LOWER_IOTA_DIALYTIKA_TONOS 0x0390
     25 #define GREEK_LOWER_IOTA_DIALYTIKA_OXIA 0x1FD3
     26 #define GREEK_LOWER_OMICRON 0x03BF
     27 #define GREEK_LOWER_OMICRON_TONOS 0x03CC
     28 #define GREEK_LOWER_OMICRON_OXIA 0x1F79
     29 #define GREEK_LOWER_UPSILON 0x03C5
     30 #define GREEK_LOWER_UPSILON_TONOS 0x03CD
     31 #define GREEK_LOWER_UPSILON_OXIA 0x1F7B
     32 #define GREEK_LOWER_UPSILON_DIALYTIKA 0x03CB
     33 #define GREEK_LOWER_UPSILON_DIALYTIKA_TONOS 0x03B0
     34 #define GREEK_LOWER_UPSILON_DIALYTIKA_OXIA 0x1FE3
     35 #define GREEK_LOWER_OMEGA 0x03C9
     36 #define GREEK_LOWER_OMEGA_TONOS 0x03CE
     37 #define GREEK_LOWER_OMEGA_OXIA 0x1F7D
     38 #define GREEK_UPPER_ALPHA 0x0391
     39 #define GREEK_UPPER_EPSILON 0x0395
     40 #define GREEK_UPPER_ETA 0x0397
     41 #define GREEK_UPPER_IOTA 0x0399
     42 #define GREEK_UPPER_IOTA_DIALYTIKA 0x03AA
     43 #define GREEK_UPPER_OMICRON 0x039F
     44 #define GREEK_UPPER_UPSILON 0x03A5
     45 #define GREEK_UPPER_UPSILON_DIALYTIKA 0x03AB
     46 #define GREEK_UPPER_OMEGA 0x03A9
     47 #define GREEK_UPPER_ALPHA_TONOS 0x0386
     48 #define GREEK_UPPER_ALPHA_OXIA 0x1FBB
     49 #define GREEK_UPPER_EPSILON_TONOS 0x0388
     50 #define GREEK_UPPER_EPSILON_OXIA 0x1FC9
     51 #define GREEK_UPPER_ETA_TONOS 0x0389
     52 #define GREEK_UPPER_ETA_OXIA 0x1FCB
     53 #define GREEK_UPPER_IOTA_TONOS 0x038A
     54 #define GREEK_UPPER_IOTA_OXIA 0x1FDB
     55 #define GREEK_UPPER_OMICRON_TONOS 0x038C
     56 #define GREEK_UPPER_OMICRON_OXIA 0x1FF9
     57 #define GREEK_UPPER_UPSILON_TONOS 0x038E
     58 #define GREEK_UPPER_UPSILON_OXIA 0x1FEB
     59 #define GREEK_UPPER_OMEGA_TONOS 0x038F
     60 #define GREEK_UPPER_OMEGA_OXIA 0x1FFB
     61 #define COMBINING_ACUTE_ACCENT 0x0301
     62 #define COMBINING_DIAERESIS 0x0308
     63 #define COMBINING_ACUTE_TONE_MARK 0x0341
     64 #define COMBINING_GREEK_DIALYTIKA_TONOS 0x0344
     65 
     66 namespace mozilla {
     67 
     68 uint32_t GreekCasing::UpperCase(uint32_t aCh, GreekCasing::State& aState,
     69                                bool& aMarkEtaPos, bool& aUpdateMarkedEta) {
     70  aMarkEtaPos = false;
     71  aUpdateMarkedEta = false;
     72 
     73  uint8_t category = unicode::GetGeneralCategory(aCh);
     74 
     75  if (aState == kEtaAccMarked) {
     76    switch (category) {
     77      case HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER:
     78      case HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER:
     79      case HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER:
     80      case HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER:
     81      case HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER:
     82      case HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK:
     83      case HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK:
     84      case HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK:
     85        aUpdateMarkedEta = true;
     86        break;
     87      default:
     88        break;
     89    }
     90    aState = kEtaAcc;
     91  }
     92 
     93  switch (aCh) {
     94    case GREEK_UPPER_ALPHA:
     95    case GREEK_LOWER_ALPHA:
     96      aState = kAlpha;
     97      return GREEK_UPPER_ALPHA;
     98 
     99    case GREEK_UPPER_EPSILON:
    100    case GREEK_LOWER_EPSILON:
    101      aState = kEpsilon;
    102      return GREEK_UPPER_EPSILON;
    103 
    104    case GREEK_UPPER_ETA:
    105    case GREEK_LOWER_ETA:
    106      aState = kEta;
    107      return GREEK_UPPER_ETA;
    108 
    109    case GREEK_UPPER_IOTA:
    110      aState = kIota;
    111      return GREEK_UPPER_IOTA;
    112 
    113    case GREEK_UPPER_OMICRON:
    114    case GREEK_LOWER_OMICRON:
    115      aState = kOmicron;
    116      return GREEK_UPPER_OMICRON;
    117 
    118    case GREEK_UPPER_UPSILON:
    119      switch (aState) {
    120        case kOmicron:
    121          aState = kOmicronUpsilon;
    122          break;
    123        default:
    124          aState = kUpsilon;
    125          break;
    126      }
    127      return GREEK_UPPER_UPSILON;
    128 
    129    case GREEK_UPPER_OMEGA:
    130    case GREEK_LOWER_OMEGA:
    131      aState = kOmega;
    132      return GREEK_UPPER_OMEGA;
    133 
    134    // iota and upsilon may be the second vowel of a diphthong
    135    case GREEK_LOWER_IOTA:
    136      switch (aState) {
    137        case kAlphaAcc:
    138        case kEpsilonAcc:
    139        case kOmicronAcc:
    140        case kUpsilonAcc:
    141          aState = kInWord;
    142          return GREEK_UPPER_IOTA_DIALYTIKA;
    143        default:
    144          break;
    145      }
    146      aState = kIota;
    147      return GREEK_UPPER_IOTA;
    148 
    149    case GREEK_LOWER_UPSILON:
    150      switch (aState) {
    151        case kAlphaAcc:
    152        case kEpsilonAcc:
    153        case kEtaAcc:
    154        case kOmicronAcc:
    155          aState = kInWord;
    156          return GREEK_UPPER_UPSILON_DIALYTIKA;
    157        case kOmicron:
    158          aState = kOmicronUpsilon;
    159          break;
    160        default:
    161          aState = kUpsilon;
    162          break;
    163      }
    164      return GREEK_UPPER_UPSILON;
    165 
    166    case GREEK_UPPER_IOTA_DIALYTIKA:
    167    case GREEK_LOWER_IOTA_DIALYTIKA:
    168    case GREEK_UPPER_UPSILON_DIALYTIKA:
    169    case GREEK_LOWER_UPSILON_DIALYTIKA:
    170    case COMBINING_DIAERESIS:
    171      aState = kDiaeresis;
    172      return ToUpperCase(aCh);
    173 
    174    // remove accent if it follows a vowel or diaeresis,
    175    // and set appropriate state for diphthong detection
    176    case COMBINING_ACUTE_ACCENT:
    177    case COMBINING_ACUTE_TONE_MARK:
    178      switch (aState) {
    179        case kAlpha:
    180          aState = kAlphaAcc;
    181          return uint32_t(-1);  // omit this char from result string
    182        case kEpsilon:
    183          aState = kEpsilonAcc;
    184          return uint32_t(-1);
    185        case kEta:
    186          aState = kEtaAcc;
    187          return uint32_t(-1);
    188        case kIota:
    189          aState = kIotaAcc;
    190          return uint32_t(-1);
    191        case kOmicron:
    192          aState = kOmicronAcc;
    193          return uint32_t(-1);
    194        case kUpsilon:
    195          aState = kUpsilonAcc;
    196          return uint32_t(-1);
    197        case kOmicronUpsilon:
    198          aState = kInWord;  // this completed a diphthong
    199          return uint32_t(-1);
    200        case kOmega:
    201          aState = kOmegaAcc;
    202          return uint32_t(-1);
    203        case kDiaeresis:
    204          aState = kInWord;
    205          return uint32_t(-1);
    206        default:
    207          break;
    208      }
    209      break;
    210 
    211    // combinations with dieresis+accent just strip the accent,
    212    // and reset to start state (don't form diphthong with following vowel)
    213    case GREEK_LOWER_IOTA_DIALYTIKA_TONOS:
    214    case GREEK_LOWER_IOTA_DIALYTIKA_OXIA:
    215      aState = kInWord;
    216      return GREEK_UPPER_IOTA_DIALYTIKA;
    217 
    218    case GREEK_LOWER_UPSILON_DIALYTIKA_TONOS:
    219    case GREEK_LOWER_UPSILON_DIALYTIKA_OXIA:
    220      aState = kInWord;
    221      return GREEK_UPPER_UPSILON_DIALYTIKA;
    222 
    223    case COMBINING_GREEK_DIALYTIKA_TONOS:
    224      aState = kInWord;
    225      return COMBINING_DIAERESIS;
    226 
    227    // strip accents from vowels, and note the vowel seen so that we can detect
    228    // diphthongs where diaeresis needs to be added
    229    case GREEK_LOWER_ALPHA_TONOS:
    230    case GREEK_LOWER_ALPHA_OXIA:
    231    case GREEK_UPPER_ALPHA_TONOS:
    232    case GREEK_UPPER_ALPHA_OXIA:
    233      aState = kAlphaAcc;
    234      return GREEK_UPPER_ALPHA;
    235 
    236    case GREEK_LOWER_EPSILON_TONOS:
    237    case GREEK_LOWER_EPSILON_OXIA:
    238    case GREEK_UPPER_EPSILON_TONOS:
    239    case GREEK_UPPER_EPSILON_OXIA:
    240      aState = kEpsilonAcc;
    241      return GREEK_UPPER_EPSILON;
    242 
    243    case GREEK_LOWER_ETA_TONOS:
    244    case GREEK_UPPER_ETA_TONOS:
    245      if (aState == kStart) {
    246        aState = kEtaAccMarked;
    247        aMarkEtaPos = true;  // mark in case we need to remove the tonos later
    248        return GREEK_UPPER_ETA_TONOS;  // treat as disjunctive eta for now
    249      }
    250      // if not in initial state, fall through to strip the accent
    251      [[fallthrough]];
    252 
    253    case GREEK_LOWER_ETA_OXIA:
    254    case GREEK_UPPER_ETA_OXIA:
    255      aState = kEtaAcc;
    256      return GREEK_UPPER_ETA;
    257 
    258    case GREEK_LOWER_IOTA_TONOS:
    259    case GREEK_LOWER_IOTA_OXIA:
    260    case GREEK_UPPER_IOTA_TONOS:
    261    case GREEK_UPPER_IOTA_OXIA:
    262      aState = kIotaAcc;
    263      return GREEK_UPPER_IOTA;
    264 
    265    case GREEK_LOWER_OMICRON_TONOS:
    266    case GREEK_LOWER_OMICRON_OXIA:
    267    case GREEK_UPPER_OMICRON_TONOS:
    268    case GREEK_UPPER_OMICRON_OXIA:
    269      aState = kOmicronAcc;
    270      return GREEK_UPPER_OMICRON;
    271 
    272    case GREEK_LOWER_UPSILON_TONOS:
    273    case GREEK_LOWER_UPSILON_OXIA:
    274    case GREEK_UPPER_UPSILON_TONOS:
    275    case GREEK_UPPER_UPSILON_OXIA:
    276      switch (aState) {
    277        case kOmicron:
    278          aState = kInWord;  // this completed a diphthong
    279          break;
    280        default:
    281          aState = kUpsilonAcc;
    282          break;
    283      }
    284      return GREEK_UPPER_UPSILON;
    285 
    286    case GREEK_LOWER_OMEGA_TONOS:
    287    case GREEK_LOWER_OMEGA_OXIA:
    288    case GREEK_UPPER_OMEGA_TONOS:
    289    case GREEK_UPPER_OMEGA_OXIA:
    290      aState = kOmegaAcc;
    291      return GREEK_UPPER_OMEGA;
    292  }
    293 
    294  // all other characters just reset the state to either kStart or kInWord,
    295  // and use standard mappings
    296  switch (category) {
    297    case HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER:
    298    case HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER:
    299    case HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER:
    300    case HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER:
    301    case HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER:
    302    case HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK:
    303    case HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK:
    304    case HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK:
    305      aState = kInWord;
    306      break;
    307    default:
    308      aState = kStart;
    309      break;
    310  }
    311 
    312  return ToUpperCase(aCh);
    313 }
    314 
    315 }  // namespace mozilla