tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

Tokenize.test.js (3781B)


      1 import {
      2  tokenize,
      3  toksToTfIdfVector,
      4 } from "lib/PersonalityProvider/Tokenize.mjs";
      5 
      6 const EPSILON = 0.00001;
      7 
      8 describe("TF-IDF Term Vectorizer", () => {
      9  describe("#tokenize", () => {
     10    let testCases = [
     11      { input: "HELLO there", expected: ["hello", "there"] },
     12      { input: "blah,,,blah,blah", expected: ["blah", "blah", "blah"] },
     13      {
     14        input: "Call Jenny: 967-5309",
     15        expected: ["call", "jenny", "967", "5309"],
     16      },
     17      {
     18        input: "Yo(what)[[hello]]{{jim}}}bob{1:2:1+2=$3",
     19        expected: [
     20          "yo",
     21          "what",
     22          "hello",
     23          "jim",
     24          "bob",
     25          "1",
     26          "2",
     27          "1",
     28          "2",
     29          "3",
     30        ],
     31      },
     32      { input: "čÄfė 80's", expected: ["čäfė", "80", "s"] },
     33      { input: "我知道很多东西。", expected: ["我知道很多东西"] },
     34    ];
     35    let checkTokenization = tc => {
     36      it(`${tc.input} should tokenize to ${tc.expected}`, () => {
     37        assert.deepEqual(tc.expected, tokenize(tc.input));
     38      });
     39    };
     40 
     41    for (let i = 0; i < testCases.length; i++) {
     42      checkTokenization(testCases[i]);
     43    }
     44  });
     45 
     46  describe("#tfidf", () => {
     47    let vocab_idfs = {
     48      deal: [221, 5.5058519847862275],
     49      easy: [269, 5.5058519847862275],
     50      tanks: [867, 5.601162164590552],
     51      sites: [792, 5.957837108529285],
     52      care: [153, 5.957837108529285],
     53      needs: [596, 5.824305715904762],
     54      finally: [334, 5.706522680248379],
     55    };
     56    let testCases = [
     57      {
     58        input: "Finally! Easy care for your tanks!",
     59        expected: {
     60          finally: [334, 0.5009816295853761],
     61          easy: [269, 0.48336453811728713],
     62          care: [153, 0.5230447876368227],
     63          tanks: [867, 0.49173191907236774],
     64        },
     65      },
     66      {
     67        input: "Easy easy EASY",
     68        expected: { easy: [269, 1.0] },
     69      },
     70      {
     71        input: "Easy easy care",
     72        expected: {
     73          easy: [269, 0.8795205218806832],
     74          care: [153, 0.4758609582543317],
     75        },
     76      },
     77      {
     78        input: "easy care",
     79        expected: {
     80          easy: [269, 0.6786999710383944],
     81          care: [153, 0.7344156515982504],
     82        },
     83      },
     84      {
     85        input: "这个空间故意留空。",
     86        expected: {
     87          /* This space is left intentionally blank. */
     88        },
     89      },
     90    ];
     91    let checkTokenGeneration = tc => {
     92      describe(`${tc.input} should have only vocabulary tokens`, () => {
     93        let actual = toksToTfIdfVector(tokenize(tc.input), vocab_idfs);
     94 
     95        it(`${tc.input} should generate exactly ${Object.keys(
     96          tc.expected
     97        )}`, () => {
     98          let seen = {};
     99          Object.keys(actual).forEach(actualTok => {
    100            assert.isTrue(actualTok in tc.expected);
    101            seen[actualTok] = true;
    102          });
    103          Object.keys(tc.expected).forEach(expectedTok => {
    104            assert.isTrue(expectedTok in seen);
    105          });
    106        });
    107 
    108        it(`${tc.input} should have the correct token ids`, () => {
    109          Object.keys(actual).forEach(actualTok => {
    110            assert.equal(tc.expected[actualTok][0], actual[actualTok][0]);
    111          });
    112        });
    113      });
    114    };
    115 
    116    let checkTfIdfVector = tc => {
    117      let actual = toksToTfIdfVector(tokenize(tc.input), vocab_idfs);
    118      it(`${tc.input} should have the correct tf-idf`, () => {
    119        Object.keys(actual).forEach(actualTok => {
    120          let delta = Math.abs(
    121            tc.expected[actualTok][1] - actual[actualTok][1]
    122          );
    123          assert.isTrue(delta <= EPSILON);
    124        });
    125      });
    126    };
    127 
    128    // run the tests
    129    for (let i = 0; i < testCases.length; i++) {
    130      checkTokenGeneration(testCases[i]);
    131      checkTfIdfVector(testCases[i]);
    132    }
    133  });
    134 });