word-latin1.js (4894B)
// |reftest| skip-if(!this.hasOwnProperty('Intl')||!this.Intl.Segmenter)

// Word segmentation tests for strings that only contain Latin-1 characters.
//
// https://www.unicode.org/reports/tr29/#Word_Boundary_Rules

// Maps each input string to the list of word segments expected for it.
const strings = {
  // WB1, WB2
  "": [],

  // WB3
  "\r\n": ["\r\n"],

  // WB3a, WB3b
  "\n": ["\n"],
  "\r": ["\r"],
  "\v": ["\v"],
  "\f": ["\f"],
  "\x85": ["\x85"],

  // WB3d
  //
  // The two-space case is spelled with escapes so that the adjacent spaces
  // can't be silently collapsed into a duplicate of the single-space key.
  " ": [" "],
  "\x20\x20": ["\x20\x20"],

  // WB4
  "\xAD": ["\xAD"],
  "\xAD\xAD": ["\xAD\xAD"],

  // WB5
  "a": ["a"],
  "ab": ["ab"],

  // WB6, WB7
  // Colon might be different rules per locale. (https://unicode-org.atlassian.net/browse/ICU-22112)
  // "a:b": ["a:b"],
  "a·b": ["a·b"],
  "a.b": ["a.b"],
  "a'b": ["a'b"],

  // WB8
  "1": ["1"],
  "12": ["12"],

  // WB9
  "a1": ["a1"],

  // WB10
  "1a": ["1a"],

  // WB11, WB12
  "1,2": ["1,2"],
  "1;2": ["1;2"],
  "1.2": ["1.2"],
  "1'2": ["1'2"],

  // WB13a
  "a_": ["a_"],
  "1_": ["1_"],
  "__": ["__"],

  // WB13b
  "_a": ["_a"],
  "_1": ["_1"],

  // WB999
  "\0": ["\0"],
  "?": ["?"],
  "??": ["?", "?"],
};

// Word segmenter shared by all |assertSegments| calls.
const segmenter = new Intl.Segmenter("en", {granularity: "word"});

/**
 * Segment |string| with the shared word segmenter and check the result.
 *
 * Two things are verified:
 *  - iterating the segments yields exactly the strings in |words|, in order;
 *  - for every index covered by a segment, |containing()| returns an object
 *    deep-equal to the segment data produced by iteration.
 *
 * @param {string} string - The input string to segment.
 * @param {string[]} words - The expected segments.
 */
function assertSegments(string, words) {
  const seg = segmenter.segment(string);
  const segments = [...seg];

  // The computed segments match the expected value.
  assertEqArray(segments.map(({segment}) => segment), words);

  // |containing()| should return the same result.
  for (const expected of segments) {
    const {segment, index} = expected;
    for (let i = index; i < index + segment.length; ++i) {
      const actual = seg.containing(i);
      assertDeepEq(actual, expected);
    }
  }
}

for (const [string, words] of Object.entries(strings)) {
  assertSegments(string, words);
}

// WB3, WB3a, WB3b and WB4
for (const string of ["\r\n", "\n", "\r", "\v", "\f", "\x85"]) {
  assertSegments(string + "\xAD", [string, "\xAD"]);
  assertSegments("\xAD" + string, ["\xAD", string]);
}

// WB3d and WB4
for (const string of [" ", "\x20\x20"]) {
  assertSegments(string + "\xAD", [string + "\xAD"]);
  assertSegments("\xAD" + string, ["\xAD", string]);
}
assertSegments(" \xAD ", [" \xAD", " "]);
assertSegments(" \xAD\xAD ", [" \xAD\xAD", " "]);

// WB5-WB13 and WB4
for (const string of [
  // WB5
  "a",
  "ab",

  // WB6, WB7
  // Colon might be different rules per locale. (https://unicode-org.atlassian.net/browse/ICU-22112)
  // "a:b",
  "a·b",
  "a.b",
  "a'b",

  // WB8
  "1",
  "12",

  // WB9
  "a1",

  // WB10
  "1a",

  // WB11, WB12
  "1,2",
  "1;2",
  "1.2",
  "1'2",

  // WB13a
  "a_",
  "1_",
  "__",

  // WB13b
  "_a",
  "_1",

  // WB999
  "?",
]) {
  assertSegments(string + "\xAD", [string + "\xAD"]);
  assertSegments("\xAD" + string, ["\xAD", string]);

  // U+00AD inserted between the characters must not introduce a break.
  assertSegments(string.split("").join("\xAD"), [string.split("").join("\xAD")]);
  assertSegments(string.split("").join("\xAD\xAD"), [string.split("").join("\xAD\xAD")]);
}

assertSegments("?\xAD?", ["?\xAD", "?"]);

// Dropping the last or the first character leaves the middle character
// without a neighbour on one side, so every character is its own segment.
for (const string of [
  // WB6, WB7
  "a:b",
  "a·b",
  "a.b",
  "a'b",

  // WB11, WB12
  "1,2",
  "1;2",
  "1.2",
  "1'2",
]) {
  const prefix = string.slice(0, -1);
  const suffix = string.slice(1);

  assertSegments(prefix, prefix.split(""));
  assertSegments(suffix, suffix.split(""));
}

// MidNum with ALetter
assertSegments("a,b", ["a", ",", "b"]);
assertSegments("a;b", ["a", ";", "b"]);

// MidLetter with Numeric
assertSegments("1:2", ["1", ":", "2"]);
assertSegments("1·2", ["1", "·", "2"]);

// MidNumLet with mixed ALetter and Numeric
assertSegments("a.2", ["a", ".", "2"]);
assertSegments("1.b", ["1", ".", "b"]);
assertSegments("a'2", ["a", "'", "2"]);
assertSegments("1'b", ["1", "'", "b"]);

// MidNum with ExtendNumLet
assertSegments("_,_", ["_", ",", "_"]);
assertSegments("_;_", ["_", ";", "_"]);

// MidLetter with ExtendNumLet
assertSegments("_:_", ["_", ":", "_"]);
assertSegments("_·_", ["_", "·", "_"]);

// MidNumLet with ExtendNumLet
assertSegments("_._", ["_", ".", "_"]);
assertSegments("_'_", ["_", "'", "_"]);

// CLDR has locale-dependent word segmentation for the "en-posix" locale. This
// locale is currently not selectable, so the Latin-1 fast-paths don't need to
// implement it. If one of the two below assertions ever fail, please update
// the Latin-1 fast-paths for word segmentation to implement the "en-posix"
// changes.
assertEq(new Intl.Segmenter("en-posix").resolvedOptions().locale, "en");
assertEq(new Intl.Segmenter("en-u-va-posix").resolvedOptions().locale, "en");

// Locale-dependent word segmentation.
{
  // https://en.wikipedia.org/wiki/Colon_(punctuation)#Abbreviation_mark
  const string = "Word:with:colon";

  const english = new Intl.Segmenter("en", {granularity: "word"});
  const svenska = new Intl.Segmenter("sv", {granularity: "word"});

  // Three words with two separators in English.
  assertEq([...english.segment(string)].length, 5);

  // A single word in Swedish.
  assertEq([...svenska.segment(string)].length, 1);
}

if (typeof reportCompare === "function")
  reportCompare(0, 0);