tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

word-latin1.js (4894B)


      1 // |reftest| skip-if(!this.hasOwnProperty('Intl')||!this.Intl.Segmenter)
      2 
      3 // https://www.unicode.org/reports/tr29/#Word_Boundary_Rules
      4 
      5 const strings = {
      6  // WB1, WB2
      7  "": [],
      8 
      9  // WB3
     10  "\r\n": ["\r\n"],
     11 
     12  // WB3a, WB3b
     13  "\n": ["\n"],
     14  "\r": ["\r"],
     15  "\v": ["\v"],
     16  "\f": ["\f"],
     17  "\x85": ["\x85"],
     18 
     19  // WB3d
     20  " ": [" "],
     21  "  ": ["  "],
     22 
     23  // WB4
     24  "\xAD": ["\xAD"],
     25  "\xAD\xAD": ["\xAD\xAD"],
     26 
     27  // WB5
     28  "a": ["a"],
     29  "ab": ["ab"],
     30 
     31  // WB6, WB7
     32  // Colon might be different rules per locale. (https://unicode-org.atlassian.net/browse/ICU-22112)
     33  // "a:b": ["a:b"],
     34  "a·b": ["a·b"],
     35  "a.b": ["a.b"],
     36  "a'b": ["a'b"],
     37 
     38  // WB8
     39  "1": ["1"],
     40  "12": ["12"],
     41 
     42  // WB9
     43  "a1": ["a1"],
     44 
     45  // WB10
     46  "1a": ["1a"],
     47 
     48  // WB11, WB12
     49  "1,2": ["1,2"],
     50  "1;2": ["1;2"],
     51  "1.2": ["1.2"],
     52  "1'2": ["1'2"],
     53 
     54  // WB13a
     55  "a_": ["a_"],
     56  "1_": ["1_"],
     57  "__": ["__"],
     58 
     59  // WB13b
     60  "_a": ["_a"],
     61  "_1": ["_1"],
     62 
     63  // WB999
     64  "\0": ["\0"],
     65  "?": ["?"],
     66  "??": ["?", "?"],
     67 };
     68 
     69 function assertSegments(string, words) {
     70  let seg = segmenter.segment(string);
     71  let segments = [...seg];
     72 
     73  // The computed segments match the expected value.
     74  assertEqArray(segments.map(({segment}) => segment), words);
     75 
     76  // |containing()| should return the same result.
     77  for (let expected of segments) {
     78    let {segment, index} = expected;
     79    for (let i = index; i < index + segment.length; ++i) {
     80      let actual = seg.containing(i);
     81      assertDeepEq(actual, expected);
     82    }
     83  }
     84 }
     85 
     86 let segmenter = new Intl.Segmenter("en", {granularity: "word"});
     87 
     88 for (let [string, words] of Object.entries(strings)) {
     89  assertSegments(string, words);
     90 }
     91 
     92 // WB3, WB3a, WB3b and WB4
     93 for (let string of ["\r\n", "\n", "\r", "\v", "\f", "\x85"]) {
     94  assertSegments(string + "\xAD", [string, "\xAD"]);
     95  assertSegments("\xAD" + string, ["\xAD", string]);
     96 }
     97 
     98 // WB3d and WB4
     99 for (let string of [" ", "  "]) {
    100  assertSegments(string + "\xAD", [string + "\xAD"]);
    101  assertSegments("\xAD" + string, ["\xAD", string]);
    102 }
    103 assertSegments(" \xAD ", [" \xAD", " "]);
    104 assertSegments(" \xAD\xAD ", [" \xAD\xAD", " "]);
    105 
    106 // WB5-WB13 and WB4
    107 for (let string of [
    108  // WB5
    109  "a", "ab",
    110 
    111  // WB6, WB7
    112  // Colon might be different rules per locale. (https://unicode-org.atlassian.net/browse/ICU-22112)
    113  // "a:b",
    114  "a·b",
    115  "a.b",
    116  "a'b",
    117 
    118  // WB8
    119  "1",
    120  "12",
    121 
    122  // WB9
    123  "a1",
    124 
    125  // WB10
    126  "1a",
    127 
    128  // WB11, WB12
    129  "1,2",
    130  "1;2",
    131  "1.2",
    132  "1'2",
    133 
    134  // WB13a
    135  "a_",
    136  "1_",
    137  "__",
    138 
    139  // WB13b
    140  "_a",
    141  "_1",
    142 
    143  // WB999
    144  "?",
    145 ]) {
    146  assertSegments(string + "\xAD", [string + "\xAD"]);
    147  assertSegments("\xAD" + string, ["\xAD", string]);
    148 
    149  assertSegments(string.split("").join("\xAD"), [string.split("").join("\xAD")]);
    150  assertSegments(string.split("").join("\xAD\xAD"), [string.split("").join("\xAD\xAD")]);
    151 }
    152 
    153 assertSegments("?\xAD?", ["?\xAD", "?"]);
    154 
    155 for (let string of [
    156  // WB6, WB7
    157  "a:b",
    158  "a·b",
    159  "a.b",
    160  "a'b",
    161 
    162  // WB11, WB12
    163  "1,2",
    164  "1;2",
    165  "1.2",
    166  "1'2",
    167 ]) {
    168  let prefix = string.slice(0, -1);
    169  let suffix = string.slice(1);
    170 
    171  assertSegments(prefix, prefix.split(""));
    172  assertSegments(suffix, suffix.split(""));
    173 }
    174 
    175 // MidNum with ALetter
    176 assertSegments("a,b", ["a", ",", "b"]);
    177 assertSegments("a;b", ["a", ";", "b"]);
    178 
    179 // MidLetter with Numeric
    180 assertSegments("1:2", ["1", ":", "2"]);
    181 assertSegments("1·2", ["1", "·", "2"]);
    182 
    183 // MidNumLet with mixed ALetter and Numeric
    184 assertSegments("a.2", ["a", ".", "2"]);
    185 assertSegments("1.b", ["1", ".", "b"]);
    186 assertSegments("a'2", ["a", "'", "2"]);
    187 assertSegments("1'b", ["1", "'", "b"]);
    188 
    189 // MidNum with ExtendNumLet
    190 assertSegments("_,_", ["_", ",", "_"]);
    191 assertSegments("_;_", ["_", ";", "_"]);
    192 
    193 // MidLetter with ExtendNumLet
    194 assertSegments("_:_", ["_", ":", "_"]);
    195 assertSegments("_·_", ["_", "·", "_"]);
    196 
    197 // MidNumLet with ExtendNumLet
    198 assertSegments("_._", ["_", ".", "_"]);
    199 assertSegments("_'_", ["_", "'", "_"]);
    200 
    201 // CLDR has locale-dependent word segmentation for the "en-posix" locale. This
    202 // locale is currently not selectable, so the Latin-1 fast-paths don't need to
    203 // implement it. If one of the two below assertions ever fail, please update
    204 // the Latin-1 fast-paths for word segmentation to implement the "en-posix"
    205 // changes.
    206 assertEq(new Intl.Segmenter("en-posix").resolvedOptions().locale, "en");
    207 assertEq(new Intl.Segmenter("en-u-va-posix").resolvedOptions().locale, "en");
    208 
    209 // Locale-dependent word segmentation.
    210 {
    211  // https://en.wikipedia.org/wiki/Colon_(punctuation)#Abbreviation_mark
    212  let string = "Word:with:colon";
    213 
    214  let english = new Intl.Segmenter("en", {granularity: "word"});
    215  let svenska = new Intl.Segmenter("sv", {granularity: "word"});
    216 
    217  // Three words with two separators in English.
    218  assertEq([...english.segment(string)].length, 5);
    219 
    220  // A single word in Swedish.
    221  assertEq([...svenska.segment(string)].length, 1);
    222 }
    223 
    224 if (typeof reportCompare === "function")
    225  reportCompare(0, 0);