Zawgyi_my.txt (8334B)
# © 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Generated using tools/cldr/cldr-to-icu/
#
# File: Zawgyi_my.txt
# Generated from CLDR
#

# This transform converts Zawgyi "encoded" Burmese into proper
# Unicode. Zawgyi is a popular encoding scheme in Myanmar. It uses
# the Myanmar Unicode range but assigns different characters or
# glyphs to some codepoints. In addition to the character mapping,
# there is reordering of codepoints needed to match the expected
# Unicode order. This reordering is context-based.
#
# This transform is done in two main stages:
# (1) Map all Zawgyi codepoints to their Unicode counterpart.
# (2) Perform reordering.
#
# The stages are split into several passes. Each "::Null;" line ends the
# current rule set, so the rules after it run as a separate pass over the
# output of the preceding pass. Within a pass, rule order is significant.

# Modern Burmese digits & Unicode code points.
$nondigits = [^\u1040-\u1049];
$consonant = [\u1000-\u1021];
$vowelsign = [\u102B-\u1030\u1032];  # Unicode vowel signs except E (1031)
$vowelsAndConsonants = [\u1000-\u102A];
$umedial = [\u103B-\u103E];  # Medial codepoints in Unicode
# Union of vowel signs, \u1036, \u1037 and medials. The later
# "Move visarga after other signs" rule swaps \u1038 past any member of this set.
$vowelmedial = [\u102B-\u1030\u1032\u1036\u1037\u103A-\u103F];
$ukinzi = \u1004\u103A\u1039;  # Codepoints representing kinzi in Unicode
# Zawgyi medial ra has multiple representations
$zmedialra = [\u103B\u107E-\u1084];
$wspace = [\u0020\u00A0\u1680\u2000-\u200D\u2060\u202F\u205F\u3000\uFEFF];

####
#### STAGE 1: CODEPOINT MAPPING FROM ZAWGYI TO UNICODE
####
# Kinzi (predefined ligatures)
# Move base character to the right
# (in Zawgyi \u103A is ya-pin, hence the \u103B emitted after the base).
($consonant) \u103A \u1064 → $ukinzi $1 \u103B;
($consonant) \u1064 → $ukinzi $1;
\u1064 → $ukinzi;
# Special cases moving base character to right before vowel signs
($consonant) \u108B → $ukinzi $1 \u102D;
($consonant) \u108C → $ukinzi $1 \u102E;
($consonant) \u108D → $ukinzi $1 \u1036;
# Special cases moving Kinzi block to left
($consonant) \u103A \u1033 \u108B → $ukinzi $1 \u103B \u102D \u102F;
($consonant) \u103A \u108B → $ukinzi $1 \u103B \u102D;
($consonant) \u103A \u108C → $ukinzi $1 \u103B \u102E;
($consonant) \u103A \u108D → $ukinzi $1 \u103B \u1036;
# \u108E carries no kinzi: it expands to \u102D \u1036 only.
($consonant) \u103A \u108E → $1 \u103B \u102D \u1036;
\u108B → $ukinzi \u102D;
\u108C → $ukinzi \u102E;
\u108D → $ukinzi \u1036;
# Consonants (only the ones that have to change)
\u106A → \u1009;  # NYA
\u106B → \u100A;
\u108F → \u1014;
\u1090 → \u101B;
\u1086 → \u103F;
# yapin
[\u103A\u107D] → \u103B;
# yayit (a run of Zawgyi medial-ra variants collapses to a single \u103C)
($zmedialra)+ → \u103C;
# wasway
\u103C* \u108A → \u103D \u103E;  # To avoid duplicate medials
\u103C → \u103D;
# hatoh
[\u103D\u1087] → \u103E;
\u1088 → \u103E \u102F;
\u1089 → \u103E \u1030;
# Vowels
\u1033 → \u102F;
\u1034 → \u1030;
# asat
\u1039 → \u103A;
# lower dot
[\u1094\u1095] → \u1037;
# Special cases for 1025 vs 1009: Zawgyi 1025 followed by asat or by a
# stacked-consonant codepoint becomes 1009 plus the Unicode sequence.
\u1025 \u1039 → \u1009 \u103A;
\u1025 \u1061 → \u1009 \u1039 \u1001;
\u1025 \u1062 → \u1009 \u1039 \u1002;
\u1025 \u1065 → \u1009 \u1039 \u1005;
\u1025 \u1068 → \u1009 \u1039 \u1007;
\u1025 \u1076 → \u1009 \u1039 \u1013;
\u1025 \u1078 → \u1009 \u1039 \u1015;
\u1025 \u107A → \u1009 \u1039 \u1017;
\u1025 \u1079 → \u1009 \u1039 \u1016;
# Stacked Consonants
\u105A → \u102B \u103A;
\u1060 → \u1039 \u1000;
\u1061 → \u1039 \u1001;
\u1062 → \u1039 \u1002;
\u1063 → \u1039 \u1003;
\u1065 → \u1039 \u1005;
[\u1066\u1067] → \u1039 \u1006;
\u1068 → \u1039 \u1007;
\u1069 → \u1039 \u1008;
\u106C → \u1039 \u100B;
\u106D → \u1039 \u100C;
\u1070 → \u1039 \u100F;
[\u1071\u1072] → \u1039 \u1010;
\u1096 → \u1039 \u1010 \u103D;
[\u1073\u1074] → \u1039 \u1011;
\u1075 → \u1039 \u1012;
\u1076 → \u1039 \u1013;
\u1077 → \u1039 \u1014;
\u1078 → \u1039 \u1015;
\u1079 → \u1039 \u1016;
\u107A → \u1039 \u1017;
[\u107B\u1093] → \u1039 \u1018;
\u107C → \u1039 \u1019;
\u1085 → \u1039 \u101C;
\u108E → \u102D \u1036;
# Pre-defined ligatures
\u106E → \u100D\u1039\u100D;
\u106F → \u100D\u1039\u100E;
\u1091 → \u100F\u1039\u100D;
\u1092 → \u100B\u1039\u100C;
\u1097 → \u100B\u1039\u100B;
\u104E → \u104E\u1004\u103A\u1038;

####
#### STAGE 1.01: Digits 0 and 4 used instead of letters
# Case of MYANMAR digit being used instead of a letter
::Null;
# Lone digit zero and four at start
^ \u1040 ($nondigits) → \u101D $1;
^ \u1044 ($nondigits) → | \u104E $1;
# Lone digit zero or four at end
($nondigits) \u1040 $ → $1 \u101D;
($nondigits) \u1044 $ → $1 \u104E;
# Evowel and dependent vowel signs before 0 or 4 only
# -> convert to the consonant.
([\u102B-\u103F]) \u1040 ($nondigits) → $1 \u101D $2;
([\u102B-\u103F]) \u1044 ($nondigits) → $1 \u104E $2;

####
#### STAGE 1.1: Strip spaces immediately before combining characters.
#### Move e-vowel after consonants and medials
#### Now every codepoint is Unicode. This starts conversion
#### from semi-visual order to logical order.
####
::Null;
# Don't remove spaces before E vowel or medial Ra at this stage
# (\u1031 and \u103C are absent from the set in the second rule).
($wspace) \u1037 → \u1037 $1;
($wspace+) ([\u102B-\u1030\u1032-\u103B\u103D\u103E]) → $2;
# Remove a duplicate early
\u1037+ → \u1037;
# Move e-vowel after medials and consonants.
\u1031+ $ukinzi ($consonant) → $ukinzi $1 \u1031;
\u1031+ \u1037+ ($consonant) → $1 \u1031 \u1037;
\u1031+ \u103C ($consonant) → $1 \u103C \u1031;
# Move medials other than 103c before the 1031. Leave 103c for
# the next consonant.
\u1031+ ($consonant) ([\u103B\u103D\u103E]+) → $1 $2 \u1031;
# Otherwise the e-vowel run hops over the single $vowelsAndConsonants
# character that follows it; the run is reduced to one \u1031.
\u1031+ ($vowelsAndConsonants) → $1 \u1031;

####
#### STAGE 2: POST REORDERING RULES FOR UNICODE RENDERING
####
::Null;
# Swap \u103B \u103A so that \u103A comes first.
\u103B \u103A → \u103A \u103B;
# Simpler replacements for Zawgyi 1025
\u1025 \u102E → \u1026;
# Asat and dot below reordering, to Unicode NFC.
\u103A \u1037 → \u1037 \u103A;
# Reorder some vowel signs: \u1036 goes after any medials and vowel signs
# that follow it, and upper vowel signs go before the others listed.
\u1036 ($umedial*) ($vowelsign+) → $1 $2 \u1036;
([\u102B\u102C\u102F\u1030]) ([\u102D\u102E\u1032]) → $2 $1;
# Move ra medial which precedes consonant, but not other medials.
\u103C ($consonant) → $1 \u103C;

####
#### Stage 3
#### Move \u1036, and \u103C after consonants.
::Null;
# A medial sitting before a stacked consonant (\u1039 + consonant) moves behind it.
($umedial) \u1039 ($consonant) → \u1039 $2 $1;
# Same for medial ra in front of the \u103A \u1039 sequence.
\u103C \u103A \u1039 ($consonant) → \u103A \u1039 $1 \u103C;
\u1036 ($umedial+) → $1 \u1036;

####
#### Stage 4
#### Reordering medials, dot below, contractions, E sign, and asat.
::Null;
# Reorder the medials
([\u103C\u103D\u103E]+) \u103B → \u103B $1;
([\u103D\u103E]+) \u103C → \u103C $1;
\u103E \u103D → \u103D \u103E;
# Contractions with vowel signs
([\u1031]+) ($vowelsign*) \u1039 ($consonant) → \u1039 $3 $1 $2;
($vowelsign+) \u1039 ($consonant) → \u1039 $2 $1;
# Move vowel sign E \u1031 after medials, but not across consonants
($umedial*) ([\u1031]+) ($umedial*) → $1 $3 $2;
# Reorder dot below after medials and vowel diacritics
\u1037 ([\u102D-\u1030\u1032\u1036\u103B-\u103E]+) → $1 \u1037;
# Move vowel signs after medials
($vowelsign+) ($umedial+) → $2 $1;
# Reorder modifiers and asat
($consonant) ([\u102B-\u1032\u1036\u103B-\u103E]) \u103A ($consonant) → $1 \u103A $2 $3;

####
#### Stage 5. More reorderings
#### Vowel signs after medials, sort medials,
####
::Null;
# Replace CA + YA with JHA after moving other things beyond the medials.
\u1005 \u103B → \u1008;
# More moving vowel signs after medials
([\u102B-\u1032]) ($umedial) → $2 $1;
# Sort the medials (one medial at a time in this pass)
([\u103C\u103D\u103E]) \u103B → \u103B $1;
([\u103D\u103E]) \u103C → \u103C $1;
\u103E \u103D → \u103D \u103E;
# Move visarga after other signs
\u1038 ($vowelmedial) → $1 \u1038;
# Reorder
\u1036 \u102F → \u102F \u1036;

###
### Stage 6
### Finish conflicting and extra diacritics. Remove some white space
###
::Null;
# Fix duplicate combiners: a run of the same sign is reduced to one.
\u102D \u102D+ → \u102D;
\u102E \u102E+ → \u102E;
\u102F \u102F+ → \u102F;
\u1030 \u1030+ → \u1030;
\u1032 \u1032+ → \u1032;
\u1036 \u1036+ → \u1036;
\u1037 \u1037+ → \u1037;
\u1039 \u1039+ → \u1039;
\u103A \u103A+ → \u103A;
\u103B \u103B+ → \u103B;
\u103C \u103C+ → \u103C;
\u103D \u103D+ → \u103D;
\u103E \u103E+ → \u103E;  # http://unicode.org/cldr/trac/ticket/10386
# Fix overlapping signs
\u102F [\u1030\u103A] → \u102F;
\u102D \u102E → \u102E;
# Remove space directly before diacritics.
($wspace)+ ([\u102B-\u1032\u1036-\u103E]) → $2;
# Remove ZWSP at start and end
^ \u200B+ → ;
\u200B+ $ → ;
# Fix multiple spaces around ZWSP to single ZWSP.
# NOTE(review): $wspace (defined at the top of the file) spans \u2000-\u200D,
# which includes \u200B itself - confirm that the leading $wspace* still leaves
# a \u200B for this rule to anchor on.
$wspace* \u200B $wspace* → \u200B;