Latn_Kana.txt (10275B)
# © 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Generated using tools/cldr/cldr-to-icu/
#
# File: Latn_Kana.txt
# Generated from CLDR
#
# Transliteration rules between Latin (romaji) and Katakana.
# This first part holds the forward global filter, the normalization
# steps that run before the conversion rules, and the romanization table.

# note: a global filter is more efficient, but MUST include all source chars
#:: [\u0000-\u007E 、。 \u3099-゜ ァ-ー 。-゚ [:Latin:][:Katakana:] [:Nonspacing_Mark:]] ;
# MINIMAL FILTER GENERATED FOR: Latin-Katakana
### WARNING -- must add width filter, both here and below!!! ###
# Forward global filter: only characters in this set are touched when
# converting Latin to Katakana.
:: [[ᄀ-ᄒᄚᄡ\u1160-ᅵᆪᆬ-ᆭᆰ-ᆵ←-↓│■○\u3000-。「-」\u3099-\u309Aァ-ロワヲ-ヴヷヺ-ー!-~¢-₩][',.A-Za-z~À-ÖØ-öø-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǭǰǴ-ǵǸ-țȞ-ȟȦ-ȳ\u0304Ӣ-ӣӮ-ӯḀ-ẙẠ-ỹᾱᾹῑῙῡῩK-Å]] ;
# Pre-processing steps. In ":: A (B) ;" A is the step used in the forward
# direction and B the step used in the reverse direction; empty parentheses
# mean that nothing is done in the reverse direction.
:: [:Latin:] fullwidth-halfwidth ();
:: NFD (NFC);
:: Lower (); # whenever transliterating from cased to uncased script, include this
# :: NFD () ; # this would catch the odd cases where a lowercase is not in NFD, but none are important for Japanese
# Uses modified Hepburn. Small changes to make unambiguous.
# Left column: the spelling decomposed into a base syllable plus a small
# kana (written with a leading ~). Right column: the romaji used here.
# | Kunrei-shiki: Hepburn/MHepburn
# | ------------------------------
# | si: shi
# | si ~ya: sha
# | si ~yu: shu
# | si ~yo: sho
# | zi: ji
# | zi ~ya: ja
# | zi ~yu: ju
# | zi ~yo: jo
# | ti: chi
# | ti ~ya: cha
# | ti ~yu: chu
# | ti ~yo: cho
# | tu: tsu
# | di: ji/dji
# | du: zu/dzu
# | hu: fu
# | For foreign words:
# | -----------------
# | se ~i si
# | si ~e she
# |
# | ze ~i zi
# | zi ~e je
# |
# | te ~i ti
# | ti ~e che
# | te ~u tu
# |
# | de ~i di
# | de ~u du
# |
# | he ~u: hu
# | hu ~a fa
# | hu ~i fi
# | hu ~e fe
# | hu ~o fo
# Most small forms are generated, but if necessary
# explicit small forms are given with ~a, ~ya, etc.
#------------------------------------------------------
# Variables
#
# Rule notation used throughout this section:
#   a ↔ b    rule applies in both directions
#   a → b    forward only (Latin to Katakana)
#   a ← b    reverse only (Katakana to Latin)
#   x } y    y is trailing context: it must follow x but is not consumed
#   y { x    y is preceding context: it must precede x but is not consumed
#   |        cursor position in the output; text after it is processed again
#   '~'      prefix marking the romaji of a small kana ('~ya' is ャ)
$vowel = [aeiou] ;
$consonant = [bcdfghjklmnpqrstvwxyz] ;
$macron = \u0304 ;
# Variables used for doubled-consonants with tsu
# NOTE(review): $kana is a hiragana range and is referenced only by the
# commented-out TODO rule in the "Iteration Mark" section below.
$kana = [ぁ-ゔ] ;
$voice = [\u3099゛];
$semivoice = [\u309A゜];
$k_start = [カキクケコかきくけこ] ;
$s_start = [サシスセソさしすせそ] ;
$j_start = [シし] $voice ;
$t_start = [タチツテトたちつてと] ;
$n_start = [ナニヌネノンなにぬねの] ;
$h_start = [ハヒヘホはひへほ] ;
$f_start = [フふ] ;
$m_start = [マミムメモまみむめも] ;
$y_start = [ヤユヨやゆよ] ;
$r_start = [ラリルレロらりるれろ] ;
$w_start = [ワヰヱヲわゐゑを] ;
$v_start = [ワヰヱヲ]\u3099 ;
$voweled_basekana = [ァ-オカキクケコサシスセソタチッツテトナ-ノハヒフヘホマ-ヲヵヶ] ;
# if ン is followed by $n_quoter, then it needs an
# apostrophe after its romaji form to disambiguate it.
# e.g., ン ア ! = ナ, so represent as "n'a", not "na".
$n_quoter = [ア イ ウ エ オ ナ ニ ヌ ネ ノ ヤ ユ ヨ ン] ;
$small_y = [ャィュェョ] ;
# NOTE(review): $iteration holds the hiragana mark ゝ and is referenced only
# by the commented-out TODO rule below; the live iteration rules further
# down match the katakana mark ヽ literally.
$iteration = ゝ ;
#------------------------------------------------------
# katakana rules
# Punctuation
'.' ↔ 。;
',' ↔ 、;
# ' ' } [a-z] → ; # delete spaces before latin
# ' ' ← [^' '゠-ヿ] {} ['゠-ヿ] ; #insert spaces before hiragana
# Iteration Mark
# Copy previous letter § marks
# TODO
# | $1 $1 ← ($kana [[:M:]$voice$semivoice]?) $iteration
# Specials for katakana -- not shared with hiragana
va ↔ ワ\u3099 ;
vi ↔ ヰ\u3099 ;
ve ↔ ヱ\u3099 ;
vo ↔ ヲ\u3099 ;
'~ka' ↔ ヵ ;
'~ke' ↔ ヶ ;
# ~~~ begin shared rules ~~~
#special
# Reverse only: a small y-kana that a consonant rule has prefixed with '~'
# is written as plain ya/yi/yu/ye/yo.
ya ← '~'ャ;
yi ← '~'ィ ;
yu ← '~'ュ;
ye ← '~'ェ;
yo ← '~'ョ;
#normal
# Pattern shared by the b/g/k/m/n/p/h/r rows below:
#   reverse: an i-row kana followed by a small y-kana yields the bare
#            consonant plus '~', which the "special" rules above finish;
#   forward: consonant + y before a vowel yields the i-row kana, and '~y'
#            is left after the cursor so the small-form rules consume it.
a ↔ ア ;
b | '~' ← ヒ \u3099} $small_y ;
by } $vowel → ヒ\u3099 | '~y' ;
ba ↔ ハ\u3099 ;
bi ↔ ヒ\u3099 ;
bu ↔ フ\u3099 ;
be ↔ ヘ\u3099 ;
bo ↔ ホ\u3099 ;
# c before i/e is rewritten as s and processed again
c } i → | s ;
c } e → | s ;
da ↔ タ\u3099 ;
di ↔ テ\u3099ィ ;
du ↔ テ\u3099ゥ ;
de ↔ テ\u3099 ;
do ↔ ト\u3099 ;
dzu ↔ ツ\u3099 ;
dja ← チ\u3099ャ ;
dji'~i' ← チ\u3099ィ ; # liu
dju ← チ\u3099ュ ;
dje ← チ\u3099ェ ;
djo ← チ\u3099ョ ;
dji ↔ チ\u3099 ;
dj } $vowel → チ\u3099 | '~y' ;
# TODO: QUESTION: use ĵĴżŻ instead of dj, dz
cha ← チャ ;
chi'~i' ← チィ ; # liu
chu ← チュ ;
che ← チェ ;
cho ← チョ ;
chi ↔ チ ;
ch } $vowel → チ | '~y' ;
e ↔ エ ;
g | '~' ← キ\u3099} $small_y ;
gy } $vowel → キ\u3099 | '~y' ;
ga ↔ カ\u3099 ;
gi ↔ キ\u3099 ;
gu ↔ ク\u3099 ;
ge ↔ ケ\u3099 ;
go ↔ コ\u3099 ;
i ↔ イ ;
# j } $vowel → シ\u3099 | '~y' ;
ja ↔ シ\u3099ャ ;
ji'~i' ← シ\u3099ィ ; # liu
ju ↔ シ\u3099ュ ;
je ↔ シ\u3099ェ ;
jo ↔ シ\u3099ョ ;
ji ↔ シ\u3099 ;
k | '~' ← キ} $small_y ;
ky } $vowel → キ | '~y' ;
ka ↔ カ ;
ki ↔ キ ;
ku ↔ ク ;
ke ↔ ケ ;
ko ↔ コ ;
m | '~' ← ミ} $small_y ;
my } $vowel → ミ | '~y' ;
ma ↔ マ ;
mi ↔ ミ ;
mu ↔ ム ;
me ↔ メ ;
mo ↔ モ ;
# m directly before p/b/f/v is written as the syllabic nasal
m } [pbfv] → ン ;
n | '~' ← ニ } $small_y ;
ny } $vowel → ニ | '~y' ;
na ↔ ナ ;
ni ↔ ニ ;
nu ↔ ヌ ;
ne ↔ ネ ;
no ↔ ノ ;
o ↔ オ ;
p | '~' ← ヒ\u309A } $small_y ;
py } $vowel → ヒ\u309A | '~y' ;
pa ↔ ハ\u309A ;
pi ↔ ヒ\u309A ;
pu ↔ フ\u309A ;
pe ↔ ヘ\u309A ;
po ↔ ホ\u309A ;
h | '~' ← ヒ } $small_y ;
hy } $vowel → ヒ | '~y' ;
ha ↔ ハ ;
hi ↔ ヒ ;
hu ↔ ヘゥ ;
he ↔ ヘ ;
ho ↔ ホ ;
# f | '~' ← フ } $small_y ;
# f } $vowel → フ | '~' ;
fa ↔ ファ ;
fi ↔ フィ ;
fe ↔ フェ ;
fo ↔ フォ ;
fu ↔ フ ;
r | '~' ← リ } $small_y ;
ry } $vowel → リ | '~y' ;
ra ↔ ラ ;
ri ↔ リ ;
ru ↔ ル ;
re ↔ レ ;
ro ↔ ロ ;
za ↔ サ\u3099 ;
zi ↔ セ\u3099ィ ;
zu ↔ ス\u3099 ;
ze ↔ セ\u3099 ;
zo ↔ ソ\u3099 ;
sa ↔ サ ;
si ↔ セィ ;
su ↔ ス ;
se ↔ セ ;
so ↔ ソ ;
sha ← シャ ;
shi'~i' ← シィ ; # liu
shu ← シュ ;
she ← シェ ;
sho ← ショ ;
shi ↔ シ ;
sh } $vowel → シ | '~y' ;
ta ↔ タ ;
ti ↔ ティ ;
tu ↔ テゥ ;
te ↔ テ ;
to ↔ ト ;
tsu ↔ ツ ;
# v } $vowel → ウ\u3099 | '~' ;
#'v~a' ← ウ\u3099ァ ; # liu
#'v~i' ← ウ\u3099ィ ; # liu
#'v~e' ← ウ\u3099ェ ; # liu
#'v~o' ← ウ\u3099ォ ; # liu
vu ↔ ウ\u3099 ;
u ↔ ウ ;
# w } $vowel → ウ | '~' ;
wa ↔ ワ ;
wi ↔ ヰ ;
wu → ウ ;
we ↔ ヱ ;
wo ↔ ヲ ;
ya ↔ ヤ ;
yi → イ ;
yu ↔ ユ ;
ye → エ ;
yo ↔ ヨ ;
# double consonants
# Forward: the first letter of a doubled consonant becomes small tsu.
# Reverse: small tsu becomes the consonant that starts the following kana.
#specials
s } sh → ッ ;
t } ch → ッ ;
#voiced
j } j ↔ ッ } $j_start ;
b } b ↔ ッ } [$h_start$f_start] $voice;
d } d ↔ ッ } $t_start $voice;
g } g ↔ ッ } $k_start $voice;
p } p ↔ ッ } [$h_start$f_start] $semivoice;
# v } v ↔ ッ } [ワヰウヱヲう] $voice ;
z } z ↔ ッ } $s_start $voice;
v } v ↔ ッ } $v_start;
# normal
k } k ↔ ッ } $k_start ;
m } m ↔ ッ } $m_start ;
n } n ↔ ッ } $n_start ;
h } h ↔ ッ } $h_start ;
f } f ↔ ッ } $f_start ;
r } r ↔ ッ } $r_start ;
t } t ↔ ッ } $t_start ;
s } s ↔ ッ } $s_start ;
w } w ↔ ッ } $w_start;
y } y ↔ ッ } $y_start;
# completeness
x } x → ッ ;
c } k → ッ ;
c } c → ッ ;
c } q → ッ ;
l } l → ッ ;
q } q → ッ ;
# y } y → ッ ;
# w } w → ッ ;
# prolonged vowel mark. this indicates a doubling of
# the preceding vowel sound
#a ← a { ー ; # liu
#e ← e { ー ; # liu
#i ← i { ー ; # liu
#o ← o { ー ; # liu
#u ← u { ー ; # liu
$macron ↔ ー ;
# small forms
'~a' ↔ ァ ;
'~i' ↔ ィ ;
'~u' ↔ ゥ ;
'~e' ↔ ェ ;
'~o' ↔ ォ ;
'~tsu' ↔ ッ ;
'~wa' ↔ ヮ ;
'~ya' ↔ ャ ;
'~yi' → ィ ;
'~yu' ↔ ュ ;
'~ye' → ェ ;
'~yo' ↔ ョ ;
# iteration marks
# TODO: make more accurate
# Reverse only. The preceding context is romaji that has already been
# produced; $1 is the (y* vowel) tail of the previous syllable, which is
# repeated after the chosen consonant. Rules are tried in order, so the
# digraph rules must stay ahead of the single-letter ones.
#
# Voiced mark after an unvoiced syllable: repeat it with the voiced consonant.
j $1 ← sh (y* $vowel) {ヽ$voice ;
dj $1 ← ch (y* $vowel) {ヽ$voice ;
dz $1 ← ts (y* $vowel) {ヽ$voice ;
g $1 ← k (y* $vowel) {ヽ$voice ;
z $1 ← s (y* $vowel) {ヽ$voice ;
d $1 ← t (y* $vowel) {ヽ$voice ;
# h-row voices to b-row (ha is ハ, ba is ハ + voicing mark)
b $1 ← h (y* $vowel) {ヽ$voice ;
v $1 ← w (y* $vowel) {ヽ$voice ;
# Digraph consonants: the generic rule further down captures only a single
# consonant letter, so sh/ch/ts need their own rules for the plain mark,
# and j/dj/dz (already voiced) keep their consonant under the voiced mark.
sh $1 ← sh (y* $vowel) {ヽ ;
j $1 ← j (y* $vowel) {ヽ$voice ;
ch $1 ← ch (y* $vowel) {ヽ ;
dj $1 ← dj(y* $vowel) {ヽ$voice ;
ts $1 ← ts (y* $vowel) {ヽ ;
dz $1 ← dz (y* $vowel) {ヽ$voice ;
# Generic case: repeat the previous consonant + vowel unchanged
$1 ← ($consonant y* $vowel) {ヽ$voice? ;
$1 ← (.) {ヽ $voice? ; # otherwise repeat last character
← ヽ $voice? ; # delete if no characters found
# h- rule: lengthens vowel if not followed by a vowel.
# At the point this is applied, latin [cons]?vowel sequences
# have been converted to katakana in NFD form.
$voweled_basekana [\u3099 \u309A]? { h → ー ;
# one-way latin- → kana rules. these do not occur in
# well-formed romaji representing actual japanese text.
# their purpose is to make all romaji map to kana of
# some sort.
# the following are not really necessary, but produce
# slightly more natural results.
cy → セィ ;
dy → テ\u3099ィ ;
hy → ヒ ;
sy → セィ ;
ty → ティ ;
zy → セ\u3099ィ ;
h → ヘ ;
# isolated consonants listed here so as not to mask
# longer rules above.
# Forward-only fallbacks: a consonant (or consonant digraph) that no
# earlier rule consumed is mapped to a single kana, so that every Latin
# letter produces some kana.
ch → チ;
sh → シ ;
dz → ツ\u3099 ;
dj → チ\u3099;
b → フ\u3099 ;
d → テ\u3099 ;
g → ク\u3099 ;
k → ク ;
m → ム ;
# Reverse only: ン followed by a kana in $n_quoter is written as n plus an
# apostrophe ('' is the escaped form of a literal single quote).
n'' ← ン } $n_quoter ;
n ↔ ン ;
p → フ\u309A ;
r → ル ;
s → ス ;
t → テ ;
y → イ ;
z → ス\u3099 ;
v → ウ\u3099 ;
f → フ;
j → シ\u3099;
w → ウ;
# Non-ASCII Latin letters are replaced by ASCII letters; the cursor (|) is
# placed before the replacement so that it is processed again by the rules.
ß → | ss ;
æ → | e ;
ð → | d ;
ø → | u ;
þ → | th ;
# simple substitutions using backup
c → | k ;
l → | r ;
q → | k ;
x → | ks ;
# ~~~ END shared rules ~~~
#------------------------------------------------------
# Final cleanup
'~' → ; # delete stray tildes between letters
[:Katakana:] { '' } [:Latin:] → ; # delete stray quotes between letters
# [ʾ[:Nonspacing_Mark:]-[\u3099-゜]] → ; # delete any non-spacing marks that we didn't use
# In ":: A (B) ;" A is the step used in the forward direction and B the step
# used in the reverse direction; a step written only inside parentheses is
# used in the reverse direction only.
:: NFC (NFD) ;
:: ([[:Katakana:][\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]] halfwidth-fullwidth);
# note: a global filter is more efficient, but MUST include all source chars!!
#:: ([\u0000-\u007E 、。 \u3099-゜ ァ-ー 。-゚ [:Latin:][:Katakana:] [:Nonspacing_Mark:]]);
# MINIMAL FILTER GENERATED FOR: Latin-Katakana BACKWARD
# Reverse global filter: only characters in this set are touched when
# converting Katakana to Latin.
:: ( [[\ -~¢-£¥-¦¬\u0304₩。-하-ᅦᅧ-ᅬᅭ-ᅲᅳ-ᅵ│-○][~、-。がぎぐげござじずぜぞだぢづでどば-ぱび-ぴぶ-ぷべ-ぺぼ-ぽゔ\u3099-゛ゞァ-ヺー-ヾ][\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]] ) ;
# eof