sentence.js (5170B)
// |reftest| skip-if(!this.hasOwnProperty('Intl')||!this.Intl.Segmenter)

// Sentence boundaries can be locale dependent. The following locales don't use
// any custom tailoring, so they should give the same results.
const locales = [
  "en", "de", "fr", "ar", "ja", "zh", "th",
];

// Maps each input string to the list of sentence segments expected for it.
const strings = {
  // Empty string
  "": [],

  // ASCII
  "This is an English sentence. And this is another one.": [
    "This is an English sentence. ",
    "And this is another one."
  ],
  "The colon: it doesn't start a new sentence.": [
    "The colon: it doesn't start a new sentence."
  ],

  // Latin-1
  "Unnötig umständlich Wörter überlegen. Und dann lästigerweise zu längeren Sätzen überarbeiten!": [
    "Unnötig umständlich Wörter überlegen. ",
    "Und dann lästigerweise zu längeren Sätzen überarbeiten!"
  ],

  // Two-Byte
  // Source: https://ja.wikipedia.org/wiki/Unicode
  "Unicode(ユニコード)は、符号化文字集合や文字符号化方式などを定めた、文字コードの業界規格。文字集合(文字セット)が単一の大規模文字セットであること(「Uni」という名はそれに由来する)などが特徴である。": [
    "Unicode(ユニコード)は、符号化文字集合や文字符号化方式などを定めた、文字コードの業界規格。",
    "文字集合(文字セット)が単一の大規模文字セットであること(「Uni」という名はそれに由来する)などが特徴である。"
  ],
};

/**
 * Asserts that |obj| has the shape of a Segment Data object produced with
 * sentence granularity: a plain object whose own keys are exactly "segment",
 * "index" and "input" (in that order), with a non-empty segment located at
 * |index| inside |input|.
 *
 * @param {object} obj - The candidate Segment Data object.
 * @throws Whatever |assertEq| throws when one of the checks fails.
 */
function assertIsSegmentDataObject(obj) {
  // The prototype is %Object.prototype%.
  assertEq(Object.getPrototypeOf(obj), Object.prototype);

  // The Segment Data object has exactly three own properties.
  const keys = Reflect.ownKeys(obj);
  assertEq(keys.length, 3);
  assertEq(keys[0], "segment");
  assertEq(keys[1], "index");
  assertEq(keys[2], "input");

  // Ensure each property has the correct value type.
  assertEq(typeof obj.segment, "string");
  assertEq(typeof obj.index, "number");
  assertEq(typeof obj.input, "string");

  // |index| is an integer index into |input|.
  assertEq(Number.isInteger(obj.index), true);
  assertEq(obj.index >= 0, true);
  assertEq(obj.index < obj.input.length, true);

  // Segments are non-empty.
  assertEq(obj.segment.length > 0, true);

  // Ensure the segment is present in the input at the correct position.
  // |index| was validated above as a non-negative integer, so |slice| yields
  // the same substring as the legacy (Annex B) |substr| would.
  assertEq(obj.input.slice(obj.index, obj.index + obj.segment.length), obj.segment);
}

/**
 * Collects all segments of |string| by repeatedly calling
 * %Segments.prototype%.containing, starting at index 0 and advancing by the
 * length of each returned segment until |containing| returns a falsy value.
 *
 * @param {Intl.Segmenter} segmenter - The segmenter used to split |string|.
 * @param {string} string - The string to segment.
 * @returns {object[]} The Segment Data objects in input order.
 */
function segmentsFromContaining(segmenter, string) {
  const segments = segmenter.segment(string);

  const result = [];
  for (let index = 0, data; (data = segments.containing(index)); index += data.segment.length) {
    result.push(data);
  }
  return result;
}

for (const locale of locales) {
  const segmenter = new Intl.Segmenter(locale, {granularity: "sentence"});

  const resolved = segmenter.resolvedOptions();
  assertEq(resolved.locale, locale);
  assertEq(resolved.granularity, "sentence");

  for (const [string, sentences] of Object.entries(strings)) {
    const segments = [...segmenter.segment(string)];

    // Assert each segment is a valid Segment Data object.
    segments.forEach(assertIsSegmentDataObject);

    // Concatenating all segments should return the input.
    assertEq(segments.map(({segment}) => segment).join(""), string);

    // The "input" property matches the original input string.
    assertEq(segments.every(({input}) => input === string), true);

    // The indices are sorted in strictly ascending order.
    assertEq(segments.every(({index}, i) => i === 0 || index > segments[i - 1].index), true);

    // The computed segments match the expected value.
    assertEqArray(segments.map(({segment}) => segment), sentences);

    // Segment iteration and %Segments.prototype%.containing return the same results.
    assertDeepEq(segmentsFromContaining(segmenter, string), segments);
  }
}

// Sentence break suppressions through the "ss" Unicode extension key aren't supported.
{
  const segmenter = new Intl.Segmenter("en-u-ss-standard", {granularity: "sentence"});
  assertEq(segmenter.resolvedOptions().locale, "en");

  const segments = [...segmenter.segment("Dr. Strange is a fictional character.")];
  assertEqArray(segments.map(({segment}) => segment),
                ["Dr. ", "Strange is a fictional character."]);
}

// Locale-dependent sentence segmentation.
{
  // https://en.wikipedia.org/wiki/Greek_question_mark#Greek_question_mark
  const string1 = "Από πού είσαι; Τί κάνεις;";
  const string2 = string1.replaceAll(";", "\u037E"); // U+037E GREEK QUESTION MARK
  assertEq(string1 !== string2, true);

  // The segmenters don't depend on the tested string, so create them once.
  const english = new Intl.Segmenter("en", {granularity: "sentence"});
  // Only referenced by the disabled assertion below; still constructed so that
  // creating a Greek sentence segmenter keeps being exercised.
  const greek = new Intl.Segmenter("el", {granularity: "sentence"});

  for (const string of [string1, string2]) {
    // A single sentence in English.
    assertEq([...english.segment(string)].length, 1);

    // But two sentences in Greek.
    //
    // ICU4X doesn't support locale-specific tailoring:
    // https://github.com/unicode-org/icu4x/issues/3284
    // assertEq([...greek.segment(string)].length, 2);
  }
}

if (typeof reportCompare === "function")
  reportCompare(0, 0);