tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

LlamaRunner.webidl (8334B)


      1 /* This Source Code Form is subject to the terms of the Mozilla Public
      2 * License, v. 2.0. If a copy of the MPL was not distributed with this
      3 * file, You can obtain one at https://mozilla.org/MPL/2.0/. */
      4 
      5 
/**
 * Represents a single message exchanged in a chat with a language model.
 *
 * This dictionary is used to define the role and content of each message in the
 * prompt passed to a llama.cpp-based LLM (e.g., user or assistant messages).
 * A sequence of these is formatted into a single prompt string via
 * LlamaRunner.formatChat (see LlamaFormatChatOptions.messages).
 */
dictionary LlamaChatMessage {
 /**
  * The role of the speaker in the chat conversation.
  * Common values include "system", "user", or "assistant".
  */
 required UTF8String role;
 /**
  * The textual content of the message associated with the given role.
  */
 required UTF8String content;
};
     23 
/**
 * The processing phase a streamed generation chunk belongs to
 * (see LlamaChatResponse.phase).
 */
enum LlamaChatPhase {
 // The input prompt is being processed.
 "prompt",
 // New output tokens are being generated.
 "generation"
};
     28 
/**
 * A single chunk of streamed output produced during chat generation
 * (the per-chunk payload of the stream returned by
 * LlamaRunner.createGenerationStream).
 */
[GenerateConversionToJS]
dictionary LlamaChatResponse {
 /**
  * The decoded text for this chunk.
  */
 required UTF8String piece;
 /**
  * The model token IDs corresponding to this chunk.
  */
 required sequence<long> tokens;
 /**
  * Which phase ("prompt" or "generation") this chunk belongs to.
  */
 required LlamaChatPhase phase;
 /**
  * True when this chunk completes the phase given in `phase`.
  */
 required boolean isPhaseCompleted;
};
     36 
/**
 * Options for LlamaRunner.formatChat, which renders a list of chat messages
 * into a single string prompt.
 */
dictionary LlamaFormatChatOptions {

 /**
  * List of roles and their contents to be formatted into a string prompt.
  */
 required sequence<LlamaChatMessage> messages;
 /**
  * If true, the assistant role will be added to the generated prompt.
  */
 boolean addAssistant = true;

};
     49 
/**
 * Data types available for the KV cache (see LlamaContextOptions.kCacheDtype
 * and vCacheDtype). Values name llama.cpp/GGML types: full/half-precision
 * floats and quantized formats — smaller types reduce memory at the cost of
 * precision.
 */
enum LlamaKVCacheDtype {
 "f32",
 "f16",
 "q8_0",
 "q5_1",
 "q5_0",
 "q4_1",
 "q4_0"
};
     59 
/**
 * Configuration options for creating a LLaMA context.
 * Names are chosen to map directly to the ones from llama.cpp.
 */
dictionary LlamaContextOptions {
 /**
  * Maximum combined length of input & generate tokens.
  */
 unsigned long nCtx = 2048;

 /**
  * Logical max batch size submitted to llama_decode.
  */
 unsigned long nBatch = 2048;

 /**
  * Physical max batch size for actual processing.
  */
 unsigned long nUbatch = 512;

 /**
  * Maximum number of concurrent sequences (distinct KV states).
  */
 unsigned long nSeqMax = 1;

 /**
  * Number of threads for generation.
  */
 required long nThreads;

 /**
  * Number of threads for batch/prompt processing.
  */
 required long nThreadsBatch;

 /**
  * Data type for the K (key) cache [EXPERIMENTAL].
  */
 LlamaKVCacheDtype kCacheDtype = "f16";

 /**
  * Data type for the V (value) cache [EXPERIMENTAL].
  */
 LlamaKVCacheDtype vCacheDtype = "f16";

 /**
  * If true, offload K, Q, V ops (including KV cache) to the GPU.
  * Only active if GPU is used with nGpuLayers != 0 in LlamaModelOptions.
  */
 boolean offloadKqv = true;

 /**
  * If true, use FlashAttention (experimental).
  */
 boolean flashAttn = false;

 /**
  * If true, disable performance measurement (no timing output).
  */
 boolean noPerf = false;

 /**
  * If true, offload host-side tensor operations to the device.
  * Only active if GPU is used with nGpuLayers != 0 in LlamaModelOptions.
  */
 boolean opOffload = true;

 /**
  * If true, use a full-size SWA (sliding window attention) cache.
  */
 boolean swaFull = true;
};
    132 
/**
 * Configuration options for loading a LLaMA model
 * (passed to LlamaRunner.initialize together with the model blob).
 * See comments here https://github.com/ggml-org/llama.cpp/blob/b5774/include/llama.h#L298
 * for more description on each field.
 */
dictionary LlamaModelOptions {
 /**
  * If true, use `mmap` for loading the model, if supported.
  */
 boolean useMmap = true;

 /**
  * If true, attempt to lock the model in RAM using `mlock`.
  */
 boolean useMlock = false;

 /**
  * If true, perform extra validation on model tensor data.
  */
 boolean checkTensors = false;

 /**
  * Number of model layers to offload to GPU.
  * A value of 0 disables GPU offloading.
  */
 long nGpuLayers = 0;

 /**
  * Context configuration (e.g. nCtx, threads).
  */
 LlamaContextOptions context = {};

};
    166 
/**
 * The sampler algorithms that can be placed in a sampler stack
 * (see LlamaSamplerConfig.type and LlamaChatOptions.samplers).
 * Values name the corresponding llama.cpp samplers.
 */
enum LlamaSamplerType {
 "logit-bias",
 "dry",
 "top-k",
 "top-p",
 "top-n-sigma",
 "min-p",
 "xtc",
 "typical",
 "temperature",
 "temperature-ext",
 "infill",
 "penalties",
 "mirostat",
 "dist",
};
    183 
/**
 * A bias applied to the logit of a specific token during sampling
 * (see LlamaSamplerConfig.logitBias).
 */
dictionary LlamaLogitBias {
 /**
  * The model token ID the bias applies to.
  */
 required long token;
 /**
  * The value added to that token's logit.
  */
 required float bias;
};
    188 
/**
 * Represents a configured sampler.
 *
 * NOTE(review): presumably only the fields relevant to the chosen `type` are
 * consulted by the implementation; the remaining fields keep their defaults —
 * confirm against the native sampler construction code.
 */
dictionary LlamaSamplerConfig {
 /**
  * The sampler algorithm to use.
  */
 required LlamaSamplerType type;

 /**
  * Minimum number of tokens to keep (0 = disabled).
  */
 long minKeep = 0;

 /**
  * Top-K cutoff. If <= 0, uses full vocabulary.
  */
 long topK = 40;

 /**
  * Top-P (nucleus) sampling threshold.
  */
 float topP = 0.95;

 /**
  * Minimum P cutoff.
  */
 float minP = 0.05;

 /**
  * XTC sampling probability (0.0 = disabled).
  */
 float xtcProbability = 0.0;

 /**
  * XTC threshold (values > 0.5 disable XTC).
  */
 float xtcThreshold = 0.10;

 /**
  * Typical sampling cutoff (1.0 = disabled).
  */
 float typP = 1.0;

 /**
  * Sampling temperature (0.0 or below = greedy decoding).
  */
 float temp = 0.80;

 /**
  * Dynamic temperature range (0.0 = disabled).
  */
 float dynatempRange = 0.0;

 /**
  * Dynamic temperature exponent (entropy-to-temp mapping).
  */
 float dynatempExponent = 1.0;

 /**
  * Repetition penalty: number of tokens to track (-1 = context size).
  */
 long penaltyLastN = 64;

 /**
  * Repetition penalty multiplier (1.0 = disabled).
  */
 float penaltyRepeat = 1.0;

 /**
  * Frequency penalty (0.0 = disabled).
  */
 float penaltyFreq = 0.0;

 /**
  * Presence penalty (0.0 = disabled).
  */
 float penaltyPresent = 0.0;

 /**
  * DRY multiplier (0.0 = disabled).
  */
 float dryMultiplier = 0.0;

 /**
  * DRY base exponent (0.0 = disabled).
  */
 float dryBase = 1.75;

 /**
  * DRY allowed repetition length before penalization starts.
  */
 long dryAllowedLength = 2;

 /**
  * DRY lookback window (0 = disable, -1 = context size).
  */
 long dryPenaltyLastN = -1;

 /**
  * Mirostat mode (0 = disabled, 1 = v1, 2 = v2).
  */
 long mirostat = 0;

 /**
  * Top-n sigma sampling cutoff (-1.0 = disabled).
  */
 float topNSigma = -1.0;

 /**
  * Mirostat target entropy (tau).
  */
 float mirostatTau = 5.0;

 /**
  * Mirostat learning rate (eta).
  */
 float mirostatEta = 0.1;

 /**
  * List of token-specific logit biases.
  */
 sequence<LlamaLogitBias> logitBias = [];

 /**
  * If true, disables performance metrics.
  */
 boolean noPerf = false;

 /**
  * Random number seed for sampling.
  * No default: when omitted, the member is absent and the implementation
  * chooses the seed.
  */
 unsigned long  seed;
};
    323 
/**
 * Options controlling how model tokens are converted back to text
 * (see LlamaChatOptions.deTokenizationOptions).
 */
dictionary LlamaDeTokenizationOptions {
 /**
  * Maximum number of UTF-8 characters that may be contained in a single model token.
  * This is used to reserve enough space during detokenization.
  */
 long maxCharsPerToken = 256;

 /**
  * Whether to render special tokens such as <BOS>, <EOS>, or <UNK> in the output.
  * If false, these tokens will be omitted from the detokenized result.
  */
 boolean renderSpecialTokens = true;
};
    337 
/**
 * Options controlling how prompt text is converted to model tokens
 * (see LlamaChatOptions.tokenizationOptions).
 */
dictionary LlamaTokenizationOptions {

 /**
  * Allow to add BOS and EOS tokens if model is configured to do so.
  */
 boolean addBosAndEos = true;


 /**
  * Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
  * as plaintext. Does not insert a leading space.
  *
  * NOTE(review): member name has a typo ("Specil" should be "Special"), but the
  * name is part of the generated JS binding — renaming requires a coordinated
  * update of all callers.
  */
 boolean parseSpecilControlTokens = true;
};
    352 
/**
 * Options for a single generation run, passed to
 * LlamaRunner.createGenerationStream.
 */
dictionary LlamaChatOptions {

 /**
  * Sampler stack to apply during decoding.
  */
 sequence<LlamaSamplerConfig> samplers = [];

 /**
  * Input prompt text to process.
  */
 required UTF8String prompt;

 /**
  * Optional output buffer size (0 = no preallocation).
  * NOTE(review): the comment above says 0 disables preallocation, yet the
  * default is 1 — confirm the intended default against the implementation.
  */
 long minOutputBufferSize = 1;

 /**
  * Maximum number of generation steps (tokens).
  */
 long maxGeneratedTokens = 512;

 /**
  * If true, stop when encountering known model end-of-generation tokens.
  */
 boolean stopOnEndOfGenerationTokens = true;

 /**
  * List of token IDs that should stop generation.
  */
 sequence<long> stopTokens = [];

 /**
  * Options applied when tokenizing `prompt`.
  */
 LlamaTokenizationOptions tokenizationOptions = {};

 /**
  * Options applied when converting generated tokens back to text.
  */
 LlamaDeTokenizationOptions deTokenizationOptions = {};
};
    389 
/**
 * Runs a llama.cpp-based language model.
 * Only exposed when LlamaRunner::InInferenceProcess returns true
 * (i.e. in the inference process), on Window and DedicatedWorker globals.
 */
[Func="LlamaRunner::InInferenceProcess", Exposed=(DedicatedWorker,Window)]
interface LlamaRunner {
 [Throws] constructor();

 /**
  * Loads the model contained in `modelBlob` with the given options.
  * The returned promise settles once initialization finishes or fails.
  */
 [Throws] Promise<undefined> initialize(LlamaModelOptions options, Blob modelBlob);

 /**
  * Formats a list of chat messages into a single prompt string using the
  * model's chat template.
  */
 [Throws] Promise<UTF8String> formatChat(LlamaFormatChatOptions options);

 /**
  * Starts a generation run and returns a stream of per-chunk results
  * (see LlamaChatResponse for the chunk payload).
  */
 [NewObject, Throws] ReadableStream createGenerationStream(LlamaChatOptions options);
};