LlamaRunner.webidl (8334B)
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at https://mozilla.org/MPL/2.0/. */

/**
 * Represents a single message exchanged in a chat with a language model.
 *
 * This dictionary is used to define the role and content of each message in the
 * prompt passed to a llama.cpp-based LLM (e.g., user or assistant messages).
 */
dictionary LlamaChatMessage {
  /**
   * The role of the speaker in the chat conversation.
   * Common values include "system", "user", or "assistant".
   */
  required UTF8String role;

  /**
   * The textual content of the message associated with the given role.
   */
  required UTF8String content;
};

/**
 * The phase of a chat turn a streamed response chunk belongs to:
 * "prompt" while the input prompt is being processed, "generation"
 * once new tokens are being produced.
 */
enum LlamaChatPhase {
  "prompt",
  "generation"
};

/**
 * One chunk of streamed output from createGenerationStream().
 */
[GenerateConversionToJS]
dictionary LlamaChatResponse {
  /**
   * The decoded text fragment for this chunk.
   */
  required UTF8String piece;

  /**
   * The raw token IDs corresponding to `piece`.
   */
  required sequence<long> tokens;

  /**
   * The phase (prompt processing or generation) this chunk was produced in.
   */
  required LlamaChatPhase phase;

  /**
   * True when this chunk is the last one of the given phase.
   */
  required boolean isPhaseCompleted;
};

dictionary LlamaFormatChatOptions {
  /**
   * List of roles and their content to be formatted into a string prompt.
   */
  required sequence<LlamaChatMessage> messages;

  /**
   * If true, the assistant role will be added to the generated prompt.
   */
  boolean addAssistant = true;
};

/**
 * Data types available for the KV cache. Values map directly to the
 * llama.cpp / GGML type names (f32/f16 floats, q* quantized formats).
 */
enum LlamaKVCacheDtype {
  "f32",
  "f16",
  "q8_0",
  "q5_1",
  "q5_0",
  "q4_1",
  "q4_0"
};

/**
 * Configuration options for creating a LLaMA context.
 * Names are chosen to map directly to the ones from llama.cpp.
 */
dictionary LlamaContextOptions {
  /**
   * Maximum combined length of input & generated tokens.
   */
  unsigned long nCtx = 2048;

  /**
   * Logical max batch size submitted to llama_decode.
   */
  unsigned long nBatch = 2048;

  /**
   * Physical max batch size for actual processing.
   */
  unsigned long nUbatch = 512;

  /**
   * Maximum number of concurrent sequences (distinct KV states).
   */
  unsigned long nSeqMax = 1;

  /**
   * Number of threads for generation.
   */
  required long nThreads;

  /**
   * Number of threads for batch/prompt processing.
   */
  required long nThreadsBatch;

  /**
   * Data type for the K (key) cache [EXPERIMENTAL].
   */
  LlamaKVCacheDtype kCacheDtype = "f16";

  /**
   * Data type for the V (value) cache [EXPERIMENTAL].
   */
  LlamaKVCacheDtype vCacheDtype = "f16";

  /**
   * If true, offload K, Q, V ops (including KV cache) to the GPU.
   * Only active if GPU is used with nGpuLayers != 0 in LlamaModelOptions.
   */
  boolean offloadKqv = true;

  /**
   * If true, use FlashAttention (experimental).
   */
  boolean flashAttn = false;

  /**
   * If true, disable performance measurement (no timing output).
   */
  boolean noPerf = false;

  /**
   * If true, offload host-side tensor operations to the device.
   * Only active if GPU is used with nGpuLayers != 0 in LlamaModelOptions.
   */
  boolean opOffload = true;

  /**
   * If true, use a full-size SWA (sliding window attention) cache.
   */
  boolean swaFull = true;
};

/**
 * Configuration options for loading a LLaMA model.
 * See comments here https://github.com/ggml-org/llama.cpp/blob/b5774/include/llama.h#L298
 * for more description on each field.
 */
dictionary LlamaModelOptions {
  /**
   * If true, use `mmap` for loading the model, if supported.
   */
  boolean useMmap = true;

  /**
   * If true, attempt to lock the model in RAM using `mlock`.
   */
  boolean useMlock = false;

  /**
   * If true, perform extra validation on model tensor data.
   */
  boolean checkTensors = false;

  /**
   * Number of model layers to offload to GPU.
   * A value of 0 disables GPU offloading.
   */
  long nGpuLayers = 0;

  /**
   * Context configuration (e.g. nCtx, threads).
   */
  LlamaContextOptions context = {};
};

/**
 * The sampler algorithms supported by llama.cpp. A LlamaSamplerConfig
 * selects one of these; the relevant parameters for each algorithm are
 * the similarly-named members of LlamaSamplerConfig.
 */
enum LlamaSamplerType {
  "logit-bias",
  "dry",
  "top-k",
  "top-p",
  "top-n-sigma",
  "min-p",
  "xtc",
  "typical",
  "temperature",
  "temperature-ext",
  "infill",
  "penalties",
  "mirostat",
  "dist",
};

/**
 * A per-token logit adjustment applied by the "logit-bias" sampler.
 */
dictionary LlamaLogitBias {
  /**
   * The token ID whose logit is adjusted.
   */
  required long token;

  /**
   * The bias added to that token's logit.
   */
  required float bias;
};

/**
 * Represents a configured sampler.
 */
dictionary LlamaSamplerConfig {
  /**
   * The sampler algorithm to use.
   */
  required LlamaSamplerType type;

  /**
   * Minimum number of tokens to keep (0 = disabled).
   */
  long minKeep = 0;

  /**
   * Top-K cutoff. If <= 0, uses full vocabulary.
   */
  long topK = 40;

  /**
   * Top-P (nucleus) sampling threshold.
   */
  float topP = 0.95;

  /**
   * Minimum P cutoff.
   */
  float minP = 0.05;

  /**
   * XTC sampling probability (0.0 = disabled).
   */
  float xtcProbability = 0.0;

  /**
   * XTC threshold (values > 0.5 disable XTC).
   */
  float xtcThreshold = 0.10;

  /**
   * Typical sampling cutoff (1.0 = disabled).
   */
  float typP = 1.0;

  /**
   * Sampling temperature (0.0 or below = greedy decoding).
   */
  float temp = 0.80;

  /**
   * Dynamic temperature range (0.0 = disabled).
   */
  float dynatempRange = 0.0;

  /**
   * Dynamic temperature exponent (entropy-to-temp mapping).
   */
  float dynatempExponent = 1.0;

  /**
   * Repetition penalty: number of tokens to track (-1 = context size).
   */
  long penaltyLastN = 64;

  /**
   * Repetition penalty multiplier (1.0 = disabled).
   */
  float penaltyRepeat = 1.0;

  /**
   * Frequency penalty (0.0 = disabled).
   */
  float penaltyFreq = 0.0;

  /**
   * Presence penalty (0.0 = disabled).
   */
  float penaltyPresent = 0.0;

  /**
   * DRY multiplier (0.0 = disabled).
   */
  float dryMultiplier = 0.0;

  /**
   * DRY base exponent (0.0 = disabled).
   */
  float dryBase = 1.75;

  /**
   * DRY allowed repetition length before penalization starts.
   */
  long dryAllowedLength = 2;

  /**
   * DRY lookback window (0 = disable, -1 = context size).
   */
  long dryPenaltyLastN = -1;

  /**
   * Mirostat mode (0 = disabled, 1 = v1, 2 = v2).
   */
  long mirostat = 0;

  /**
   * Top-n sigma sampling cutoff (-1.0 = disabled).
   */
  float topNSigma = -1.0;

  /**
   * Mirostat target entropy (tau).
   */
  float mirostatTau = 5.0;

  /**
   * Mirostat learning rate (eta).
   */
  float mirostatEta = 0.1;

  /**
   * List of token-specific logit biases.
   */
  sequence<LlamaLogitBias> logitBias = [];

  /**
   * If true, disables performance metrics.
   */
  boolean noPerf = false;

  /**
   * Random number seed for sampling.
   * No default: behavior when absent is decided by the implementation.
   */
  unsigned long seed;
};

dictionary LlamaDeTokenizationOptions {
  /**
   * Maximum number of UTF-8 characters that may be contained in a single model token.
   * This is used to reserve enough space during detokenization.
   */
  long maxCharsPerToken = 256;

  /**
   * Whether to render special tokens such as <BOS>, <EOS>, or <UNK> in the output.
   * If false, these tokens will be omitted from the detokenized result.
   */
  boolean renderSpecialTokens = true;
};

dictionary LlamaTokenizationOptions {
  /**
   * Add BOS and EOS tokens if the model is configured to do so.
   */
  boolean addBosAndEos = true;

  /**
   * Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
   * as plaintext. Does not insert a leading space.
   *
   * NOTE(review): "Specil" is a typo for "Special", but the member name is
   * JS-visible API surface — renaming it would silently break existing callers,
   * so it must be fixed in a coordinated change with all call sites.
   */
  boolean parseSpecilControlTokens = true;
};

dictionary LlamaChatOptions {
  /**
   * Sampler stack to apply during decoding.
   */
  sequence<LlamaSamplerConfig> samplers = [];

  /**
   * Input prompt text to process.
   */
  required UTF8String prompt;

  /**
   * Optional output buffer size (0 = no preallocation).
   */
  long minOutputBufferSize = 1;

  /**
   * Maximum number of generation steps (tokens).
   */
  long maxGeneratedTokens = 512;

  /**
   * If true, stop when encountering known model end-of-generation tokens.
   */
  boolean stopOnEndOfGenerationTokens = true;

  /**
   * List of token IDs that should stop generation.
   */
  sequence<long> stopTokens = [];

  /**
   * Options controlling how the prompt is tokenized.
   */
  LlamaTokenizationOptions tokenizationOptions = {};

  /**
   * Options controlling how generated tokens are converted back to text.
   */
  LlamaDeTokenizationOptions deTokenizationOptions = {};
};

/**
 * Runs inference against a llama.cpp-based model.
 *
 * Only exposed in the inference process (gated by
 * LlamaRunner::InInferenceProcess); available on Window and
 * dedicated workers.
 */
[Func="LlamaRunner::InInferenceProcess", Exposed=(DedicatedWorker,Window)]
interface LlamaRunner {
  [Throws] constructor();

  /**
   * Loads the model from `modelBlob` using the given options.
   * The returned promise resolves once initialization completes.
   */
  [Throws] Promise<undefined> initialize(LlamaModelOptions options, Blob modelBlob);

  /**
   * Formats a list of chat messages into a single string prompt
   * (using the model's chat template — presumably; behavior lives in the
   * C++ implementation).
   */
  [Throws] Promise<UTF8String> formatChat(LlamaFormatChatOptions options);

  /**
   * Starts generation for the given prompt and returns a stream of
   * LlamaChatResponse chunks.
   */
  [NewObject, Throws] ReadableStream createGenerationStream(LlamaChatOptions options);
};