word.js (5298B)
// |reftest| skip-if(!this.hasOwnProperty('Intl')||!this.Intl.Segmenter)

// Word boundaries are locale independent. Test with various locales to ensure
// we get the same results.
const locales = [
  "en", "de", "fr", "ar", "ja", "zh", "th",
];

// Maps each input string to the list of word segments expected for it.
const strings = {
  // Empty string
  "": [],

  // Ascii
  "This is an English sentence.": [
    "This", " ", "is", " ", "an", " ", "English", " ", "sentence", "."
  ],
  "Moi? N'est-ce pas.": [
    "Moi", "?", " ", "N'est", "-", "ce", " ", "pas", "."
  ],

  // Latin-1
  "Unnötig umständlich Wörter überlegen.": [
    "Unnötig", " ", "umständlich", " ", "Wörter", " ", "überlegen", "."
  ],

  // Two-Byte
  // Source: https://en.wikipedia.org/wiki/Japanese_writing_system#Examples
  "ラドクリフ、マラソン五輪代表に 1万メートル出場にも含み。": [
    "ラドクリフ", "、", "マラソン", "五輪", "代表", "に", " ", "1", "万", "メートル", "出場", "に", "も", "含み", "。"
  ],

  // From: Language Sense and Ambiguity in Thai
  // Source: https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.98.118
  "ขนบนอก": [
    // According to the paper this should instead be separated into ขน|บน|อก.
    "ขนบ", "นอก"
  ],
  "พนักงานนําโคลงเรือสามตัว": [
    // Expected segmentation is พนักงาน|นํา|โค|ลง|เรือ|สาม|ตัว.

    // ICU4C segmentation:
    // "พนัก", "งาน", "นํา", "โคลง", "เรือ", "สาม", "ตัว"

    // ICU4X segmentation:
    "พ", "นัก", "งานนํา", "โคลง", "เรือ", "สาม", "ตัว"
  ],

  "หมอหุงขาวสวยด": [
    // Has three possible segmentations:
    // หมอหุงขาว|สวย|ด
    // หมอ|หุง|ขาวสวย|ด
    // หมอ|หุง|ขาว|สวย|ด

    // ICU4C segmentation:
    // "หมอ", "หุง", "ขาว", "สวย", "ด"

    // ICU4X segmentation:
    "หมอ", "หุง", "ขาว", "สวยด"
  ],

  // From: Thoughts on Word and Sentence Segmentation in Thai
  // Source: https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.63.7038
  "หนังสือรวมบทความทางวิชาการในการประชุมสัมมนา": [
    "หนังสือ", "รวม", "บทความ", "ทาง", "วิชาการ", "ใน", "การ", "ประชุม", "สัมมนา"
  ],
};

/**
 * Asserts that |obj| has the shape and contents of a Segment Data object
 * produced by a word-granularity segmenter: prototype, own keys and their
 * order, value types, index bounds, segment position, and |isWordLike|.
 *
 * @param {object} obj - The candidate Segment Data object.
 */
function assertIsSegmentDataObject(obj) {
  // The prototype is %Object.prototype%.
  assertEq(Object.getPrototypeOf(obj), Object.prototype);

  // The Segment Data object has exactly four own properties.
  const keys = Reflect.ownKeys(obj);
  assertEq(keys.length, 4);
  assertEq(keys[0], "segment");
  assertEq(keys[1], "index");
  assertEq(keys[2], "input");
  assertEq(keys[3], "isWordLike");

  // Ensure each property has the correct value type.
  assertEq(typeof obj.segment, "string");
  assertEq(typeof obj.index, "number");
  assertEq(typeof obj.input, "string");
  assertEq(typeof obj.isWordLike, "boolean");

  // |index| is an integer index into |input|.
  assertEq(Number.isInteger(obj.index), true);
  assertEq(obj.index >= 0, true);
  assertEq(obj.index < obj.input.length, true);

  // Segments are non-empty.
  assertEq(obj.segment.length > 0, true);

  // Ensure the segment is present in the input at the correct position.
  // |index| was asserted to be a non-negative integer above, so |slice| with
  // an explicit end offset selects exactly |segment.length| code units.
  assertEq(obj.input.slice(obj.index, obj.index + obj.segment.length), obj.segment);

  // The non-word parts in the samples are either punctuators or space separators.
  const expectedWordLike = !/^(\p{gc=P}|\p{gc=Zs})+$/u.test(obj.segment);

  assertEq(obj.isWordLike, expectedWordLike, obj.segment);
}

/**
 * Collects all segments of |string| by repeatedly calling
 * %Segments.prototype%.containing, starting at index 0 and advancing by the
 * length of each returned segment.
 *
 * @param {Intl.Segmenter} segmenter - The segmenter used to split |string|.
 * @param {string} string - The string to segment.
 * @returns {object[]} The Segment Data objects in the order they were found.
 */
function segmentsFromContaining(segmenter, string) {
  const segments = segmenter.segment(string);

  const result = [];
  let index = 0;
  let data;

  // Stop once |containing| returns a falsy value, which is expected when
  // |index| reaches the end of the string.
  while ((data = segments.containing(index))) {
    result.push(data);
    index += data.segment.length;
  }
  return result;
}

for (const locale of locales) {
  const segmenter = new Intl.Segmenter(locale, {granularity: "word"});

  const resolved = segmenter.resolvedOptions();
  assertEq(resolved.locale, locale);
  assertEq(resolved.granularity, "word");

  for (const [string, words] of Object.entries(strings)) {
    const segments = [...segmenter.segment(string)];

    // Assert each segment is a valid Segment Data object.
    segments.forEach(assertIsSegmentDataObject);

    // Concatenating all segments should return the input.
    assertEq(segments.reduce((acc, {segment}) => acc + segment, ""), string);

    // The "input" property matches the original input string.
    assertEq(segments.every(({input}) => input === string), true);

    // The indices are sorted in strictly ascending order.
    assertEq(segments.every(({index}, i) => i === 0 || index > segments[i - 1].index), true);

    // The computed segments match the expected value.
    assertEqArray(segments.map(({segment}) => segment), words);

    // Segment iteration and %Segments.prototype%.containing return the same results.
    assertDeepEq(segmentsFromContaining(segmenter, string), segments);
  }
}

if (typeof reportCompare === "function")
  reportCompare(0, 0);