char.txt (2249B)
#
#   Copyright (C) 2016 and later: Unicode, Inc. and others.
#   License & terms of use: http://www.unicode.org/copyright.html
#   Copyright (C) 2002-2016, International Business Machines Corporation and others.
#   All Rights Reserved.
#
#   file:  char.txt
#
#   ICU Character Break Rules
#      These rules are based on the Extended Grapheme Cluster rules from
#      Unicode UAX #29 Revision 34 for Unicode Version 12.0
#
#      NOTE(review): the GB 9c rule and the InCB classes below are not part of
#      that revision; they appear to come from a later UAX #29 revision
#      (Unicode 15.1) - confirm and update the revision named above.
#
#   Layout of this file:
#      1. Rule-compiler options.
#      2. Character class definitions ($Name = [set];).
#      3. Break rules. Each rule describes a sequence that must NOT be split;
#         the final '.' rule matches any single code point.

# Literal characters in rules must be quoted.
!!quoted_literals_only;

#
#  Character Class Definitions.
#     One class per Grapheme_Cluster_Break property value used by the rules.
#
$CR                 = [\p{Grapheme_Cluster_Break = CR}];
$LF                 = [\p{Grapheme_Cluster_Break = LF}];
$Control            = [\p{Grapheme_Cluster_Break = Control}];
$Extend             = [\p{Grapheme_Cluster_Break = Extend}];
$ZWJ                = [\p{Grapheme_Cluster_Break = ZWJ}];
$Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}];
$Prepend            = [\p{Grapheme_Cluster_Break = Prepend}];
$SpacingMark        = [\p{Grapheme_Cluster_Break = SpacingMark}];

#
#  Indic_Conjunct_Break (InCB) classes, used only by the GB 9c rule.
#
$InCBConsonant = [\p{InCB=Consonant}];
$InCBExtend    = [\p{InCB=Extend}];
$InCBLinker    = [\p{InCB=Linker}];

#
#  Korean Syllable Definitions
#
$L   = [\p{Grapheme_Cluster_Break = L}];
$V   = [\p{Grapheme_Cluster_Break = V}];
$T   = [\p{Grapheme_Cluster_Break = T}];

$LV  = [\p{Grapheme_Cluster_Break = LV}];
$LVT = [\p{Grapheme_Cluster_Break = LVT}];

#
#  Emoji definitions
#
$Extended_Pict = [:ExtPict:];

## -------------------------------------------------
#  Break rules.
#     !!chain               enables rule chaining.
#     !!lookAheadHardBreak  selects the "hard break" treatment of the '/'
#                           look-ahead operator used by the GB 12-13 rule.
!!chain;
!!lookAheadHardBreak;

# GB 3   Do not break between a CR and a LF.
$CR $LF;

# GB 6-8 Do not break Hangul syllable sequences.
$L ($L | $V | $LV | $LVT);
($LV | $V) ($V | $T);
($LVT | $T) $T;

# GB 9   Do not break before extending characters or ZWJ.
[^$Control $CR $LF] ($Extend | $ZWJ);

# GB 9a  Do not break before SpacingMarks.
[^$Control $CR $LF] $SpacingMark;

# GB 9b  Do not break after Prepend characters.
$Prepend [^$Control $CR $LF];

# GB 9c  Do not break within a consonant ... linker ... consonant sequence;
#        at least one InCB Linker is required between the two consonants.
$InCBConsonant [ $InCBExtend $InCBLinker ]* $InCBLinker [ $InCBExtend $InCBLinker ]* $InCBConsonant;

# GB 11  Do not break within emoji modifier sequences or emoji zwj sequences.
$Extended_Pict $Extend* $ZWJ $Extended_Pict;

# GB 12-13. Keep pairs of regional indicators together.
#           Both rules are anchored with '^', so they are not chained into
#           from a preceding rule match.
#           Note that the hard break '/' rule triggers only if there are
#           three or more initial RIs: it places the boundary after the first
#           pair when a third RI follows. The second rule handles a pair that
#           is not followed by another RI.
^$Prepend* $Regional_Indicator $Regional_Indicator / $Regional_Indicator;
^$Prepend* $Regional_Indicator $Regional_Indicator;

# GB 999 Match a single code point if no other rule applies.
.;