tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

sentence.js (5170B)


      1 // |reftest| skip-if(!this.hasOwnProperty('Intl')||!this.Intl.Segmenter)
      2 
      // Sentence boundaries can be locale dependent. None of the locales listed
      // here uses custom sentence-break tailoring, so all of them are expected to
      // produce identical results for the test strings.
      const locales = [
        "en", "de", "fr", "ar", "ja", "zh", "th",
      ];
      8 
      // Test inputs mapped to their expected sentence segments. Each key is the
      // string handed to the segmenter; its value lists the segments in order, so
      // joining a value always reproduces its key.
      const strings = {
        // Empty string
        "": [],

        // Ascii
        "This is an English sentence. And this is another one.": [
          "This is an English sentence. ",
          "And this is another one."
        ],
        "The colon: it doesn't start a new sentence.": [
          "The colon: it doesn't start a new sentence."
        ],

        // Latin-1
        "Unnötig umständlich Wörter überlegen. Und dann lästigerweise zu längeren Sätzen überarbeiten!": [
          "Unnötig umständlich Wörter überlegen. ",
          "Und dann lästigerweise zu längeren Sätzen überarbeiten!"
        ],

        // Two-Byte
        // Source: https://ja.wikipedia.org/wiki/Unicode
        "Unicode(ユニコード)は、符号化文字集合や文字符号化方式などを定めた、文字コードの業界規格。文字集合(文字セット)が単一の大規模文字セットであること(「Uni」という名はそれに由来する)などが特徴である。": [
          "Unicode(ユニコード)は、符号化文字集合や文字符号化方式などを定めた、文字コードの業界規格。",
          "文字集合(文字セット)が単一の大規模文字セットであること(「Uni」という名はそれに由来する)などが特徴である。"
        ],
      };
     35 
     /**
      * Asserts that |obj| has the shape of a Segment Data object: a plain object
      * with exactly the own properties "segment", "index" and "input" (in that
      * order), whose segment text occurs in the input at the given index.
      *
      * @param {object} obj - The candidate Segment Data object.
      */
     function assertIsSegmentDataObject(obj) {
       // The prototype is %Object.prototype%.
       assertEq(Object.getPrototypeOf(obj), Object.prototype);

       // The Segment Data object has exactly three own properties, in this order.
       const keys = Reflect.ownKeys(obj);
       assertEq(keys.length, 3);
       assertEq(keys[0], "segment");
       assertEq(keys[1], "index");
       assertEq(keys[2], "input");

       const {segment, index, input} = obj;

       // Ensure each property has the correct value type.
       assertEq(typeof segment, "string");
       assertEq(typeof index, "number");
       assertEq(typeof input, "string");

       // |index| is an integer index into |input|.
       assertEq(Number.isInteger(index), true);
       assertEq(index >= 0, true);
       assertEq(index < input.length, true);

       // Segments are non-empty.
       assertEq(segment.length > 0, true);

       // Ensure the segment is present in the input at the correct position.
       // |slice| is used instead of the deprecated Annex B |substr|; for the
       // non-negative integer index verified above both select the same range.
       assertEq(input.slice(index, index + segment.length), segment);
     }
     63 
     /**
      * Collects all segments of |string| by repeatedly calling
      * %Segments.prototype%.containing, starting at index 0 and advancing by the
      * length of each returned segment until |containing| returns a falsy value.
      *
      * @param {Intl.Segmenter} segmenter - The segmenter used to split |string|.
      * @param {string} string - The input to segment.
      * @returns {object[]} The Segment Data objects in input order.
      */
     function segmentsFromContaining(segmenter, string) {
       const segments = segmenter.segment(string);

       const result = [];
       let index = 0;
       for (;;) {
         const data = segments.containing(index);
         if (!data) {
           // No segment covers |index|, i.e. the end of the input was reached.
           break;
         }
         result.push(data);

         // The next segment starts right after the current one.
         index += data.segment.length;
       }
       return result;
     }
     73 
     // For every locale in |locales|, segment each test string at sentence
     // granularity and verify the produced segments.
     for (const locale of locales) {
       const segmenter = new Intl.Segmenter(locale, {granularity: "sentence"});

       // The requested locale and granularity are reflected in the resolved options.
       const resolved = segmenter.resolvedOptions();
       assertEq(resolved.locale, locale);
       assertEq(resolved.granularity, "sentence");

       for (const [string, sentences] of Object.entries(strings)) {
         const segments = [...segmenter.segment(string)];

         // Assert each segment is a valid Segment Data object.
         for (const data of segments) {
           assertIsSegmentDataObject(data);
         }

         // Concatenating all segments should return the input.
         assertEq(segments.map(({segment}) => segment).join(""), string);

         // The "input" property matches the original input string.
         assertEq(segments.every(({input}) => input === string), true);

         // The indices are sorted in strictly ascending order.
         assertEq(
           segments.every(({index}, i) => i === 0 || index > segments[i - 1].index),
           true
         );

         // The computed segments match the expected value.
         assertEqArray(segments.map(({segment}) => segment), sentences);

         // Segment iteration and %Segments.prototype%.containing return the same results.
         assertDeepEq(segmentsFromContaining(segmenter, string), segments);
       }
     }
    103 
    // Sentence break suppressions through the "ss" Unicode extension key aren't supported.
    {
      const segmenter = new Intl.Segmenter("en-u-ss-standard", {granularity: "sentence"});

      // The unsupported "ss" extension must not appear in the resolved locale.
      assertEq(segmenter.resolvedOptions().locale, "en");

      // Without suppressions, the period of the abbreviation "Dr." still ends a
      // sentence, so the input is split into two segments.
      const segments = [...segmenter.segment("Dr. Strange is a fictional character.")];
      assertEqArray(segments.map(({segment}) => segment),
                    ["Dr. ", "Strange is a fictional character."]);
    }
    113 
    // Locale-dependent sentence segmentation.
    {
      // https://en.wikipedia.org/wiki/Greek_question_mark#Greek_question_mark
      const string1 = "Από πού είσαι; Τί κάνεις;";
      const string2 = string1.replaceAll(";", "\u037E"); // U+037E GREEK QUESTION MARK
      assertEq(string1 !== string2, true);

      // The segmenters don't depend on the input, so create them once up front.
      const english = new Intl.Segmenter("en", {granularity: "sentence"});
      // |greek| is only referenced by the disabled assertion below; it is kept so
      // that the assertion can be re-enabled without further changes.
      const greek = new Intl.Segmenter("el", {granularity: "sentence"});

      for (const string of [string1, string2]) {
        // A single sentence in English.
        assertEq([...english.segment(string)].length, 1);

        // But two sentences in Greek.
        //
        // TODO: Re-enable once ICU4X supports locale-specific tailoring:
        // https://github.com/unicode-org/icu4x/issues/3284
        // assertEq([...greek.segment(string)].length, 2);
      }
    }
    135 
    // Signal successful completion when a |reportCompare| function is available.
    if (typeof reportCompare === "function") {
      reportCompare(0, 0);
    }