word.txt (3435B)
#
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.

# file: word.txt
#
# Reference Word Break rules for intltest rbbi/RBBIMonkeyTest
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
#       They are expected to change with review and the addition of support for rule tailoring.


type = word;      # one of grapheme | word | line | sentence
locale = en;

# Han is defined ahead of the Word_Break sets because Extend (below) subtracts it.
Han                 = [:Han:];

# One character set per Word_Break property value used by the rules.
CR                  = [\p{Word_Break = CR}];
LF                  = [\p{Word_Break = LF}];
Newline             = [\p{Word_Break = Newline}];
Extend              = [\p{Word_Break = Extend}-Han];    # Extend, minus any Han characters
ZWJ                 = [\p{Word_Break = ZWJ}];
Regional_Indicator  = [\p{Word_Break = Regional_Indicator}];
Format              = [\p{Word_Break = Format}];
Katakana            = [\p{Word_Break = Katakana}];
Hebrew_Letter       = [\p{Word_Break = Hebrew_Letter}];
ALetter             = [\p{Word_Break = ALetter}];       # narrowed later in the file (dictionary scripts removed)
Single_Quote        = [\p{Word_Break = Single_Quote}];
Double_Quote        = [\p{Word_Break = Double_Quote}];
MidNumLet           = [\p{Word_Break = MidNumLet}];
MidLetter           = [\p{Word_Break = MidLetter}];
MidNum              = [\p{Word_Break = MidNum}];
Numeric             = [\p{Word_Break = Numeric}];
ExtendNumLet        = [\p{Word_Break = ExtendNumLet}];
WSegSpace           = [\p{Word_Break = WSegSpace}];
Extended_Pict       = [:ExtPict:];

# Define dictionary, with the effect being that those characters don't appear in test data.
# Component sets for `dictionary`; the dictionary scripts are removed from ALetter below.
Hiragana       = [:Hiragana:];

# NOTE(review): Control is not referenced by any set or rule in this file - confirm whether it is still needed.
Control        = [\p{Grapheme_Cluster_Break = Control}];
HangulSyllable = [\uac00-\ud7a3];
ComplexContext = [:LineBreak = Complex_Context:];
KanaKanji      = [Han Hiragana Katakana];
dictionaryCJK  = [KanaKanji HangulSyllable];
dictionary     = [ComplexContext dictionaryCJK];

# leave dictionary scripts out of ALetter

ALetter        = [ALetter - dictionary];

# Convenience unions used by the rules.
AHLetter       = [ALetter Hebrew_Letter];
MidNumLetQ     = [MidNumLet Single_Quote];
ExtFmt         = [Extend Format ZWJ];      # appears as ExtFmt* between rule elements, so these characters are skipped over

# Word break rules, kept in their original order (rule order is significant - do not reorder).
# A rule containing ÷ marks a break position; the other rules describe sequences that stay unbroken.

WB3:   CR LF;
WB3a:  (Newline | CR | LF) ÷;
WB3b:  . ÷ (Newline | CR | LF);   # actually redundant? No other rule combines.
                                  # (but needed with UAX treat-as scheme.)
WB3c:  ZWJ Extended_Pict;
WB3d:  WSegSpace WSegSpace;

WB5:   AHLetter ExtFmt* AHLetter;

# includes both WB6 and WB7
WB6:   AHLetter ExtFmt* (MidLetter | MidNumLetQ) ExtFmt* AHLetter;

WB7a:  Hebrew_Letter ExtFmt* Single_Quote;
WB7b:  Hebrew_Letter ExtFmt* Double_Quote ExtFmt* Hebrew_Letter;   # Include WB7c

WB8:   Numeric ExtFmt* Numeric;
WB9:   AHLetter ExtFmt* Numeric;
WB10:  Numeric ExtFmt* AHLetter;

WB11:  Numeric ExtFmt* (MidNum | MidNumLetQ) ExtFmt* Numeric;     # includes WB12
WB13:  Katakana ExtFmt* Katakana;

WB13a: (AHLetter | Numeric | Katakana | ExtendNumLet) ExtFmt* ExtendNumLet;
WB13b: ExtendNumLet ExtFmt* (AHLetter | Numeric | Katakana);

# WB rule 15 - 17, pairs of Regional Indicators stay unbroken.
#                  Interacts with WB3c.
WB15:  Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ZWJ Extended_Pict;
WB17:  Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ÷;

# Rule WB 999   Any ÷ Any
#               Interacts with WB3c, do not break between ZWJ and (Extended_Pict | EBG).
#               NOTE(review): no EBG set is defined in this file; the rule below only names Extended_Pict.
WB999.1: . ExtFmt* ZWJ Extended_Pict;
WB999.2: . ExtFmt* ÷;