[ tor-browser ].git.dasho

Zawgyi_my.txt (8334B)
      1 # © 2016 and later: Unicode, Inc. and others.
      2 # License & terms of use: http://www.unicode.org/copyright.html
      3 # Generated using tools/cldr/cldr-to-icu/
      4 #
      5 # File: Zawgyi_my.txt
      6 # Generated from CLDR
      7 #
      8 
      9 # This transform converts Zawgyi "encoded" Burmese into proper
     10 # unicode. Zawgyi is a popular encoding scheme in Myanmar. It uses
     11 # the Myanmar unicode range but assigns different characters or
     12 # glyphs to some codepoints. In addition to the character mapping,
     13 # there is reordering of codepoints needed to match the expected
     14 # unicode order. This reordering is context-based.
     15 #
     16 # This transform is done in two main stages:
     17 # (1) Map all Zawgyi codepoints to their Unicode counterpart.
     18 # (2) Perform reordering.
     19 # Modern Burmese digits & Unicode code points. Sets and sequences shared by the rules below.
     20 $nondigits = [^\u1040-\u1049];  # Anything outside the digit range \u1040-\u1049
     21 $consonant = [\u1000-\u1021];
     22 $vowelsign = [\u102B-\u1030\u1032];  # Unicode vowel signs except E (1031)
     23 $vowelsAndConsonants = [\u1000-\u102a];
     24 $umedial = [\u103B-\u103E];    # Medial codepoints in Unicode
     25 $vowelmedial = [\u102B-\u1030\u1032\u1036\u1037\u103A-\u103F];  # Union of vowel signs and medials, plus \u1036, \u1037, \u103A and \u103F
     26 $ukinzi = \u1004\u103A\u1039;  # Codepoints representing kinzi in Unicode
     27 # Zawgyi medial ra has multiple representations
     28 $zmedialra = [\u103B\u107E-\u1084];
     29 $wspace = [\u0020\u00a0\u1680\u2000-\u200d\u2060\u202f\u205f\u3000\ufeff];  # Space-like code points; the \u2000-\u200d range includes ZWSP \u200b
     30 ####
     31 #### STAGE 1: CODEPOINT MAPPING FROM ZAWGYI TO UNICODE
     32 ####
     33 # Kinzi (predefined ligatures): Zawgyi \u1064 becomes the $ukinzi sequence
     34 # Move base character to the right (longer patterns come first; bare \u1064 is the fallback)
     35 ($consonant) \u103A \u1064 → $ukinzi $1 \u103B;  # Zawgyi \u103A after the base is emitted as \u103B
     36 ($consonant) \u1064 → $ukinzi $1;
     37 \u1064 → $ukinzi;
     38 # Special cases moving base character to right before vowel signs
     39 ($consonant) \u108B → $ukinzi $1 \u102D;
     40 ($consonant) \u108C → $ukinzi $1 \u102E;
     41 ($consonant) \u108D → $ukinzi $1 \u1036;
     42 # Special cases moving Kinzi block to left
     43 ($consonant) \u103A \u1033 \u108B → $ukinzi $1 \u103B \u102D \u102F;
     44 ($consonant) \u103A \u108b → $ukinzi $1 \u103B \u102D ;
     45 ($consonant) \u103A \u108C → $ukinzi $1 \u103B \u102E ;
     46 ($consonant) \u103A \u108D → $ukinzi $1 \u103B \u1036 ;
     47 ($consonant) \u103A \u108e → $1 \u103B \u102D \u1036 ;  # No $ukinzi here: \u108E alone maps to \u102D \u1036
     48 \u108B → $ukinzi \u102D ;
     49 \u108C → $ukinzi \u102E ;
     50 \u108D → $ukinzi \u1036 ;
     51 # Consonants (only the ones that have to change)
     52 \u106A → \u1009 ;  # NYA
     53 \u106B → \u100A ;
     54 \u108F → \u1014 ;
     55 \u1090 → \u101B ;
     56 \u1086 → \u103F ;
     57 # yapin
     58 [\u103A\u107d] → \u103B ;
     59 # yayit: one or more Zawgyi medial-ra forms collapse to a single \u103C
     60 ($zmedialra)+ → \u103C ;
     61 # wasway
     62 \u103C* \u108A → \u103D \u103E;  # To avoid duplicate medials
     63 \u103C → \u103D ;
     64 # hatoh
     65 [\u103D\u1087] → \u103E ;
     66 \u1088 → \u103E \u102F ;
     67 \u1089 → \u103E \u1030 ;
     68 # Vowels
     69 \u1033 → \u102F ;
     70 \u1034 → \u1030 ;
     71 # asat
     72 \u1039 → \u103A ;
     73 # lower dot
     74 [\u1094\u1095] → \u1037 ;
     75 # Special cases for 1025 vs 1009: \u1025 followed by \u1039 or by one of these stacked forms becomes \u1009
     76 \u1025 \u1039 → \u1009 \u103a;
     77 \u1025 \u1061 → \u1009 \u1039 \u1001;
     78 \u1025 \u1062 → \u1009 \u1039 \u1002;
     79 \u1025 \u1065 → \u1009 \u1039 \u1005;
     80 \u1025 \u1068 → \u1009 \u1039 \u1007;
     81 \u1025 \u1076 → \u1009 \u1039 \u1013;
     82 \u1025 \u1078 → \u1009 \u1039 \u1015;
     83 \u1025 \u107A → \u1009 \u1039 \u1017;
     84 \u1025 \u1079 → \u1009 \u1039 \u1016;
     85 # Stacked Consonants: each Zawgyi code point becomes \u1039 followed by the stacked letter
     86 \u105A → \u102B \u103A ;  # Not a stack: expands to \u102B followed by \u103A
     87 \u1060 → \u1039 \u1000 ;
     88 \u1061 → \u1039 \u1001 ;
     89 \u1062 → \u1039 \u1002 ;
     90 \u1063 → \u1039 \u1003 ;
     91 \u1065 → \u1039 \u1005 ;
     92 [\u1066\u1067] → \u1039 \u1006 ;
     93 \u1068 → \u1039 \u1007 ;
     94 \u1069 → \u1039 \u1008 ;
     95 \u106C → \u1039 \u100B ;
     96 \u106D → \u1039 \u100C ;
     97 \u1070 → \u1039 \u100F ;
     98 [\u1071\u1072] → \u1039 \u1010 ;
     99 \u1096 → \u1039 \u1010 \u103D;  # Stacked \u1010 followed by \u103D
    100 [\u1073\u1074] → \u1039 \u1011 ;
    101 \u1075 → \u1039 \u1012 ;
    102 \u1076 → \u1039 \u1013 ;
    103 \u1077 → \u1039 \u1014 ;
    104 \u1078 → \u1039 \u1015 ;
    105 \u1079 → \u1039 \u1016 ;
    106 \u107A → \u1039 \u1017 ;
    107 [\u107B\u1093] → \u1039 \u1018 ;
    108 \u107C → \u1039 \u1019 ;
    109 \u1085 → \u1039 \u101C ;
    110 \u108E → \u102D \u1036 ;  # Not a stack: expands to \u102D followed by \u1036
    111 # Pre-defined ligatures
    112 \u106E → \u100D\u1039\u100D ;
    113 \u106F → \u100D\u1039\u100E ;
    114 \u1091 → \u100F\u1039\u100D ;
    115 \u1092 → \u100B\u1039\u100C ;
    116 \u1097 → \u100B\u1039\u100B ;
    117 \u104E → \u104E\u1004\u103A\u1038 ;  # Keeps \u104E and appends \u1004\u103A\u1038
    118 ####
    119 #### STAGE 1.01: Digits 0 and 4 used instead of letters
    120 # Case of MYANMAR digit being used instead of a letter: \u1040 becomes \u101D, \u1044 becomes \u104E
    121 # Lone digit zero and four at start
    122 ::Null;  # Pass boundary: the rules below run as a separate pass
    123 ^ \u1040 ($nondigits) → \u101D $1;
    124 ^ \u1044 ($nondigits) → | \u104E $1 ;  # '|' leaves the cursor in front of the replacement text
    125 # Lone digit zero or four at end
    126 ($nondigits) \u1040 $ → $1 \u101D;
    127 ($nondigits) \u1044 $ → $1 \u104e;
    128 # Evowel and dependent vowel signs before 0 or 4 only (any code point in \u102B-\u103F), then a non-digit
    129 #   -> convert to the consonant.
    130 ([\u102b-\u103f]) \u1040 ($nondigits) → $1 \u101d $2;
    131 ([\u102b-\u103f]) \u1044 ($nondigits) → $1 \u104E $2;
    132 ####
    133 #### STAGE 1.1: Strip spaces immediately before combining characters.
    134 ####   Move e-vowel after consonants and medials
    135 ####   Now every codepoint is Unicode. This starts conversion
    136 ####   from semi-visual order to logical order.
    137 ####
    138 ::Null;  # Pass boundary: the rules below run as a separate pass
    139 # Don't remove spaces before E vowel or medial Ra at this stage
    140 ($wspace) \u1037 > \u1037 $1;  # A single space before \u1037 is moved after it, not dropped
    141 ($wspace+) ([\u102b-\u1030\u1032-\u103b\u103d\u103e]) → $2;  # The class skips \u1031 and \u103C
    142 # Remove a duplicate early
    143 \u1037+ → \u1037;
    144 # Move e-vowel after medials and consonants. \u1031+ also reduces a run of \u1031 to one.
    145 \u1031+ $ukinzi ($consonant) > $ukinzi $1 \u1031;
    146 \u1031+ \u1037+ ($consonant) > $1 \u1031 \u1037 ;
    147 \u1031+ \u103c ($consonant) > $1 \u103c \u1031;
    148 # Move medials other than 103c before the 1031. Leave 103c for
    149 # the next consonant.
    150 \u1031+ ($consonant) ([\u103b\u103d\u103e]+) > $1 $2 \u1031;
    151 \u1031+ ($vowelsAndConsonants) > $1 \u1031;  # Fallback: \u1031 before any code point in \u1000-\u102A
    152 ####
    153 #### STAGE 2: POST REORDERING RULES FOR UNICODE RENDERING
    154 ####
    155 ::Null;  # Pass boundary: the rules below run as a separate pass
    156 \u103b \u103a > \u103a \u103b;  # Swap so \u103A precedes \u103B
    157 # Simpler replacements for Zawgyi 1025
    158 \u1025 \u102E → \u1026;
    159 # Asat and dot below reordering, to Unicode NFC.
    160 \u103A\u1037 → \u1037\u103A;
    161 # Reorder some vowel signs
    162 \u1036 ($umedial*) ($vowelsign+) → $1 $2 \u1036 ;  # \u1036 moves behind any medials and the vowel-sign run
    163 ([\u102B\u102C\u102F\u1030]) ([\u102D\u102E\u1032]) → $2 $1;  # \u102D/\u102E/\u1032 go before \u102B/\u102C/\u102F/\u1030
    164 # Move ra medial which precedes consonant, but not other medials.
    165 \u103C ($consonant) → $1 \u103C;
    166 ####
    167 #### Stage 3
    168 #### Move \u1036, and \u103C after consonants.
    169 ::Null;  # Pass boundary: the rules below run as a separate pass
    170 ($umedial) \u1039 ($consonant) > \u1039 $2 $1;  # A medial moves behind the following \u1039 + consonant
    171 \u103C \u103A \u1039 ($consonant) → \u103A \u1039 $1 \u103C;  # \u103C moves behind \u103A \u1039 + consonant
    172 \u1036 ($umedial+) → $1 \u1036;  # \u1036 moves behind a run of medials
    173 ####
    174 #### Stage 4
    175 #### Reordering medials, dot below, contractions, E sign, and asat.
    176 ::Null;  # Pass boundary: the rules below run as a separate pass
    177 # Reorder the medials towards \u103B \u103C \u103D \u103E order
    178 ([\u103C\u103D\u103E]+) \u103B → \u103B $1;
    179 ([\u103D\u103E]+) \u103C → \u103C $1;
    180 \u103E\u103D → \u103D\u103E ;
    181 # Contractions with vowel signs: \u1039 + consonant moves in front of the preceding signs
    182 ([\u1031]+) ($vowelsign*) \u1039 ($consonant) → \u1039 $3 $1 $2;
    183 ($vowelsign+) \u1039 ($consonant) → \u1039 $2 $1;
    184 # Move vowel sign E \u1031 after medials, but not across consonants
    185 ($umedial*) ([\u1031]+) ($umedial*) → $1 $3 $2;
    186 # Reorder dot below after medials and vowel diacritics
    187 \u1037 ([\u102D-\u1030\u1032\u1036\u103b-\u103e]+) → $1 \u1037;
    188 # Move vowel signs after medials
    189 ($vowelsign+) ($umedial+) → $2 $1;
    190 # Reorder modifiers and asat: \u103A moves in front of the single sign between the two consonants
    191 ($consonant) ([\u102B-\u1032\u1036\u103B-\u103E]) \u103A ($consonant) → $1 \u103A $2 $3;
    192 ####
    193 #### Stage 5.  More reorderings
    194 #### Vowel signs after medials, sort medials, move visarga.
    195 ####
    196 ::Null;  # Pass boundary: the rules below run as a separate pass
    197 # Replace CA + YA with JHA after moving other things beyond the medials.
    198 \u1005 \u103b → \u1008;
    199 # More moving vowel signs after medials
    200 ([\u102b-\u1032]) ($umedial) → $2 $1;
    201 # Sort the medials (one preceding medial at a time, unlike the run-based rules in Stage 4)
    202 ([\u103C\u103D\u103E]) \u103B → \u103B $1;
    203 ([\u103D\u103E]) \u103C → \u103C $1;
    204 \u103E\u103D → \u103D\u103E ;
    205 # Move visarga after other signs
    206 \u1038 ($vowelmedial) → $1 \u1038;  # Swaps \u1038 with one following member of $vowelmedial
    207 # Reorder: \u102F goes before \u1036
    208 \u1036 \u102f → \u102f \u1036;
    209 ###
    210 ### Stage 6
    211 ### Finish conflicting and extra diacritics. Remove some white space
    212 ###
    213 ::Null;  # Pass boundary: the rules below run as a separate pass
    214 # Fix duplicate combiners: each run of a repeated sign collapses to a single one
    215 \u102D \u102D+ → \u102D;
    216 \u102E \u102E+ → \u102E;
    217 \u102F \u102F+ → \u102F;
    218 \u1030 \u1030+ → \u1030;
    219 \u1032 \u1032+ → \u1032;
    220 \u1036 \u1036+ → \u1036;
    221 \u1037 \u1037+ → \u1037;
    222 \u1039 \u1039+ → \u1039;
    223 \u103a \u103a+ → \u103a;
    224 \u103b \u103b+ → \u103b;
    225 \u103c \u103c+ → \u103c;
    226 \u103d \u103d+ → \u103d;
    227 \u103e \u103e+ → \u103e; # http://unicode.org/cldr/trac/ticket/10386
    228 # Fix overlapping signs
    229 \u102F [\u1030\u103a] → \u102F;  # Keep \u102F, drop the \u1030 or \u103A that follows it
    230 \u102D \u102E → \u102E;  # Keep \u102E, drop the \u102D in front of it
    231 # Remove space directly before diacritics. Unlike Stage 1.1, this class also covers \u1031 and \u103C.
    232 ($wspace)+ ([\u102b-\u1032\u1036-\u103e]) → $2;
    233 # Remove ZWSP at start and end
    234 ^ \u200b+ → ;
    235 \u200b+ $ → ;
    236 # Fix multiple spaces around ZWSP to single ZWSP.
    237 $wspace* \u200b $wspace* → \u200b;  # NOTE(review): $wspace contains \u200b (via \u2000-\u200d); confirm the leading $wspace* cannot consume the \u200b this rule needs
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE