Tokenize.test.js (3781B)
import {
  tokenize,
  toksToTfIdfVector,
} from "lib/PersonalityProvider/Tokenize.mjs";

// Tolerance for comparing floating-point tf-idf weights.
const EPSILON = 0.00001;

describe("TF-IDF Term Vectorizer", () => {
  describe("#tokenize", () => {
    // Each case pairs a raw input string with the token list tokenize()
    // is expected to produce (lowercased, split on punctuation/symbols).
    const testCases = [
      { input: "HELLO there", expected: ["hello", "there"] },
      { input: "blah,,,blah,blah", expected: ["blah", "blah", "blah"] },
      {
        input: "Call Jenny: 967-5309",
        expected: ["call", "jenny", "967", "5309"],
      },
      {
        input: "Yo(what)[[hello]]{{jim}}}bob{1:2:1+2=$3",
        expected: [
          "yo",
          "what",
          "hello",
          "jim",
          "bob",
          "1",
          "2",
          "1",
          "2",
          "3",
        ],
      },
      { input: "čÄfė 80's", expected: ["čäfė", "80", "s"] },
      { input: "我知道很多东西。", expected: ["我知道很多东西"] },
    ];

    const checkTokenization = tc => {
      it(`${tc.input} should tokenize to ${tc.expected}`, () => {
        // Chai's assert.deepEqual signature is (actual, expected); pass
        // the computed tokens first so failure messages read correctly.
        assert.deepEqual(tokenize(tc.input), tc.expected);
      });
    };

    for (const tc of testCases) {
      checkTokenization(tc);
    }
  });

  describe("#tfidf", () => {
    // token -> [vocabulary index, inverse document frequency]
    const vocab_idfs = {
      deal: [221, 5.5058519847862275],
      easy: [269, 5.5058519847862275],
      tanks: [867, 5.601162164590552],
      sites: [792, 5.957837108529285],
      care: [153, 5.957837108529285],
      needs: [596, 5.824305715904762],
      finally: [334, 5.706522680248379],
    };

    // Each case pairs an input string with the expected sparse tf-idf
    // vector: token -> [vocabulary index, normalized weight].
    const testCases = [
      {
        input: "Finally! Easy care for your tanks!",
        expected: {
          finally: [334, 0.5009816295853761],
          easy: [269, 0.48336453811728713],
          care: [153, 0.5230447876368227],
          tanks: [867, 0.49173191907236774],
        },
      },
      {
        input: "Easy easy EASY",
        expected: { easy: [269, 1.0] },
      },
      {
        input: "Easy easy care",
        expected: {
          easy: [269, 0.8795205218806832],
          care: [153, 0.4758609582543317],
        },
      },
      {
        input: "easy care",
        expected: {
          easy: [269, 0.6786999710383944],
          care: [153, 0.7344156515982504],
        },
      },
      {
        // None of these tokens appear in the vocabulary, so the
        // resulting vector is empty.
        input: "这个空间故意留空。",
        expected: {},
      },
    ];

    const checkTokenGeneration = tc => {
      describe(`${tc.input} should have only vocabulary tokens`, () => {
        it(`${tc.input} should generate exactly ${Object.keys(
          tc.expected
        )}`, () => {
          // Compute inside the it() so a failure surfaces as a test
          // failure, not an error thrown while mocha collects the suite.
          const actual = toksToTfIdfVector(tokenize(tc.input), vocab_idfs);
          const seen = {};
          Object.keys(actual).forEach(actualTok => {
            assert.isTrue(actualTok in tc.expected);
            seen[actualTok] = true;
          });
          Object.keys(tc.expected).forEach(expectedTok => {
            assert.isTrue(expectedTok in seen);
          });
        });

        it(`${tc.input} should have the correct token ids`, () => {
          const actual = toksToTfIdfVector(tokenize(tc.input), vocab_idfs);
          Object.keys(actual).forEach(actualTok => {
            // (actual, expected) argument order for readable failures.
            assert.equal(actual[actualTok][0], tc.expected[actualTok][0]);
          });
        });
      });
    };

    const checkTfIdfVector = tc => {
      it(`${tc.input} should have the correct tf-idf`, () => {
        const actual = toksToTfIdfVector(tokenize(tc.input), vocab_idfs);
        Object.keys(actual).forEach(actualTok => {
          // Floating-point weights: compare within EPSILON rather than
          // demanding exact equality.
          const delta = Math.abs(
            actual[actualTok][1] - tc.expected[actualTok][1]
          );
          assert.isTrue(delta <= EPSILON);
        });
      });
    };

    // Register both the token-membership and the weight checks for
    // every test case.
    for (const tc of testCases) {
      checkTokenGeneration(tc);
      checkTfIdfVector(tc);
    }
  });
});