tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

word.js (5298B)


      1 // |reftest| skip-if(!this.hasOwnProperty('Intl')||!this.Intl.Segmenter)
      2 
      3 // Word boundaries are locale independent. Test with various locales to ensure
      4 // we get the same results.
      5 const locales = [
      6  "en", "de", "fr", "ar", "ja", "zh", "th",
      7 ];
      8 
      9 let strings = {
     10  // Empty string
     11  "": [],
     12 
     13  // Ascii
     14  "This is an English sentence.": [
     15    "This", " ", "is", " ", "an", " ", "English", " ", "sentence", "."
     16  ],
     17  "Moi?  N'est-ce pas.": [
     18    "Moi", "?", "  ", "N'est", "-", "ce", " ", "pas", "."
     19  ],
     20 
     21  // Latin-1
     22  "Unnötig umständlich Wörter überlegen.": [
     23    "Unnötig", " ", "umständlich", " ", "Wörter", " ", "überlegen", "."
     24  ],
     25 
     26  // Two-Byte
     27  // Source: https://en.wikipedia.org/wiki/Japanese_writing_system#Examples
     28  "ラドクリフ、マラソン五輪代表に 1万メートル出場にも含み。": [
     29    "ラドクリフ", "、", "マラソン", "五輪", "代表", "に", " ", "1", "万", "メートル", "出場", "に", "も", "含み", "。"
     30  ],
     31 
     32  // From: Language Sense and Ambiguity in Thai
     33  // Source: https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.98.118
     34  "ขนบนอก": [
     35    // According to the paper this should instead be separated into ขน|บน|อก.
     36    "ขนบ", "นอก"
     37  ],
     38  "พนักงานนําโคลงเรือสามตัว": [
     39    // Expected segmentation is พนักงาน|นํา|โค|ลง|เรือ|สาม|ตัว.
     40 
     41    // ICU4C segmentation:
     42    // "พนัก", "งาน", "นํา", "โคลง", "เรือ", "สาม", "ตัว"
     43 
     44    // ICU4X segmentation:
     45    "พ", "นัก", "งานนํา", "โคลง", "เรือ", "สาม", "ตัว"
     46  ],
     47 
     48  "หมอหุงขาวสวยด": [
     49    // Has three possible segmentations:
     50    // หมอหงขาว|สวย|ด
     51    // หมอ|หง|ขาวสวย|ด
     52    // หมอ|หง|ขาว|สวย|ด
     53 
     54    // ICU4C segmentation:
     55    // "หมอ", "หุง", "ขาว", "สวย", "ด"
     56 
     57    // ICU4X segmentation:
     58    "หมอ", "หุง", "ขาว", "สวยด"
     59  ],
     60 
     61  // From: Thoughts on Word and Sentence Segmentation in Thai
     62  // Source: https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.63.7038
     63  "หนังสือรวมบทความทางวิชาการในการประชุมสัมมนา": [
     64    "หนังสือ", "รวม", "บทความ", "ทาง", "วิชาการ", "ใน", "การ", "ประชุม", "สัมมนา"
     65  ],
     66 };
     67 
     68 function assertIsSegmentDataObject(obj) {
     69  // The prototype is %Object.prototype%.
     70  assertEq(Object.getPrototypeOf(obj), Object.prototype);
     71 
     72  // The Segment Data object has exactly four own properties.
     73  let keys = Reflect.ownKeys(obj);
     74  assertEq(keys.length, 4);
     75  assertEq(keys[0], "segment");
     76  assertEq(keys[1], "index");
     77  assertEq(keys[2], "input");
     78  assertEq(keys[3], "isWordLike");
     79 
     80  // Ensure each property has the correct value type.
     81  assertEq(typeof obj.segment, "string");
     82  assertEq(typeof obj.index, "number");
     83  assertEq(typeof obj.input, "string");
     84  assertEq(typeof obj.isWordLike, "boolean");
     85 
     86  // |index| is an integer index into |string|.
     87  assertEq(Number.isInteger(obj.index), true);
     88  assertEq(obj.index >= 0, true);
     89  assertEq(obj.index < obj.input.length, true);
     90 
     91  // Segments are non-empty.
     92  assertEq(obj.segment.length > 0, true);
     93 
     94  // Ensure the segment is present in the input at the correct position.
     95  assertEq(obj.input.substr(obj.index, obj.segment.length), obj.segment);
     96 
     97  // The non-word parts in the samples are either punctuators or space separators.
     98  let expectedWordLike = !/^(\p{gc=P}|\p{gc=Zs})+$/u.test(obj.segment);
     99 
    100  assertEq(obj.isWordLike, expectedWordLike, obj.segment);
    101 }
    102 
    103 function segmentsFromContaining(segmenter, string) {
    104  let segments = segmenter.segment(string);
    105 
    106  let result = [];
    107  for (let index = 0, data; (data = segments.containing(index)); index += data.segment.length) {
    108    result.push(data);
    109  }
    110  return result;
    111 }
    112 
    113 for (let locale of locales) {
    114  let segmenter = new Intl.Segmenter(locale, {granularity: "word"});
    115 
    116  let resolved = segmenter.resolvedOptions();
    117  assertEq(resolved.locale, locale);
    118  assertEq(resolved.granularity, "word");
    119 
    120  for (let [string, words] of Object.entries(strings)) {
    121    let segments = [...segmenter.segment(string)];
    122 
    123    // Assert each segment is a valid Segment Data object.
    124    segments.forEach(assertIsSegmentDataObject);
    125 
    126    // Concatenating all segments should return the input.
    127    assertEq(segments.reduce((acc, {segment}) => acc + segment, ""), string);
    128 
    129    // The "input" property matches the original input string.
    130    assertEq(segments.every(({input}) => input === string), true);
    131 
    132    // The indices are sorted in ascending order.
    133    assertEq(isNaN(segments.reduce((acc, {index}) => index > acc ? index : NaN, -Infinity)), false);
    134 
    135    // The computed segments match the expected value.
    136    assertEqArray(segments.map(({segment}) => segment), words);
    137 
    138    // Segment iteration and %Segments.prototype%.containing return the same results.
    139    assertDeepEq(segmentsFromContaining(segmenter, string), segments);
    140  }
    141 }
    142 
    143 if (typeof reportCompare === "function")
    144  reportCompare(0, 0);