static_dict_lut.c (7420B)
1 /* Copyright 2025 Google Inc. All Rights Reserved. 2 3 Distributed under MIT license. 4 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT 5 */ 6 7 /* Lookup table for static dictionary and transforms. */ 8 9 #include "static_dict_lut.h" 10 11 #include "../common/platform.h" /* IWYU pragma: keep */ 12 #include "../common/static_init.h" 13 14 #if (BROTLI_STATIC_INIT != BROTLI_STATIC_INIT_NONE) 15 #include "../common/dictionary.h" 16 #include "../common/transform.h" 17 #include "hash_base.h" 18 #endif 19 20 #if defined(__cplusplus) || defined(c_plusplus) 21 extern "C" { 22 #endif 23 24 #if (BROTLI_STATIC_INIT != BROTLI_STATIC_INIT_NONE) 25 26 /* TODO(eustas): deal with largest bucket(s). Not it contains 163 items. */ 27 static BROTLI_BOOL BROTLI_COLD DoBrotliEncoderInitStaticDictionaryLut( 28 const BrotliDictionary* dict, uint16_t* buckets, DictWord* words, 29 void* arena) { 30 DictWord* slots = (DictWord*)arena; 31 uint16_t* heads = (uint16_t*)(slots + BROTLI_ENC_STATIC_DICT_LUT_NUM_ITEMS); 32 uint16_t* counts = heads + BROTLI_ENC_STATIC_DICT_LUT_NUM_BUCKETS; 33 uint16_t* prev = counts + BROTLI_ENC_STATIC_DICT_LUT_NUM_BUCKETS; 34 size_t next_slot = 0; 35 uint8_t transformed_word[24]; 36 uint8_t transformed_other_word[24]; 37 size_t l; 38 size_t pos; 39 size_t i; 40 41 memset(counts, 0, BROTLI_ENC_STATIC_DICT_LUT_NUM_BUCKETS * sizeof(uint16_t)); 42 memset(heads, 0, BROTLI_ENC_STATIC_DICT_LUT_NUM_BUCKETS * sizeof(uint16_t)); 43 memset(prev, 0, BROTLI_ENC_STATIC_DICT_LUT_NUM_ITEMS * sizeof(uint16_t)); 44 45 for (l = 4; l <= 24; ++l) { 46 size_t n = 1u << dict->size_bits_by_length[l]; 47 const uint8_t* dict_words = dict->data + dict->offsets_by_length[l]; 48 for (i = 0; i < n; ++i) { 49 const uint8_t* dict_word = dict_words + l * i; 50 uint32_t key = Hash15(dict_word); 51 slots[next_slot].len = (uint8_t)l; 52 slots[next_slot].transform = BROTLI_TRANSFORM_IDENTITY; 53 slots[next_slot].idx = (uint16_t)i; 54 prev[next_slot] = heads[key]; 55 heads[key] = (uint16_t)next_slot; 56 counts[key]++; 57 ++next_slot; 58 } 59 for (i = 0; i < n; ++i) { 60 uint32_t key; 61 uint32_t prefix; 62 BROTLI_BOOL found; 63 size_t curr; 64 const uint8_t* dict_word = dict_words + l * i; 65 if (dict_word[0] < 'a' || dict_word[0] > 'z') continue; 66 memcpy(transformed_word, dict_word, l); 67 transformed_word[0] = transformed_word[0] - 32; 68 key = Hash15(transformed_word); 69 prefix = BROTLI_UNALIGNED_LOAD32LE(transformed_word) & ~0x20202020u; 70 found = BROTLI_FALSE; 71 curr = heads[key]; 72 while (curr != 0) { 73 const uint8_t* other_word; 74 uint32_t other_prefix; 75 if (slots[curr].len != l) break; 76 other_word = dict_words + l * slots[curr].idx; 77 other_prefix = BROTLI_UNALIGNED_LOAD32LE(other_word) & ~0x20202020u; 78 if (prefix == other_prefix) { 79 if (memcmp(transformed_word, other_word, l) == 0) { 80 found = BROTLI_TRUE; 81 break; 82 } 83 } 84 curr = prev[curr]; 85 } 86 if (found) continue; 87 slots[next_slot].len = (uint8_t)l; 88 slots[next_slot].transform = BROTLI_TRANSFORM_UPPERCASE_FIRST; 89 slots[next_slot].idx = (uint16_t)i; 90 prev[next_slot] = heads[key]; 91 heads[key] = (uint16_t)next_slot; 92 counts[key]++; 93 ++next_slot; 94 } 95 for (i = 0; i < n; ++i) { 96 const uint8_t* dict_word = dict_words + l * i; 97 BROTLI_BOOL is_ascii = BROTLI_TRUE; 98 BROTLI_BOOL has_lower = BROTLI_FALSE; 99 size_t k; 100 uint32_t prefix; 101 uint32_t key; 102 size_t curr; 103 BROTLI_BOOL found; 104 for (k = 0; k < l; ++k) { 105 if (dict_word[k] >= 128) is_ascii = BROTLI_FALSE; 106 if (k > 0 && dict_word[k] >= 'a' && dict_word[k] <= 'z') 107 has_lower = BROTLI_TRUE; 108 } 109 if (!is_ascii || !has_lower) continue; 110 memcpy(transformed_word, dict_word, l); 111 prefix = BROTLI_UNALIGNED_LOAD32LE(transformed_word) & ~0x20202020u; 112 for (k = 0; k < l; ++k) { 113 if (transformed_word[k] >= 'a' && transformed_word[k] <= 'z') { 114 transformed_word[k] = transformed_word[k] - 32; 115 } 116 } 117 key = Hash15(transformed_word); 118 found = BROTLI_FALSE; 119 curr = heads[key]; 120 while (curr != 0) { 121 const uint8_t* other_word; 122 uint32_t other_prefix; 123 if (slots[curr].len != l) break; 124 other_word = dict_words + l * slots[curr].idx; 125 other_prefix = BROTLI_UNALIGNED_LOAD32LE(other_word) & ~0x20202020u; 126 if (prefix == other_prefix) { 127 if (slots[curr].transform == BROTLI_TRANSFORM_IDENTITY) { 128 if (memcmp(transformed_word, other_word, l) == 0) { 129 found = BROTLI_TRUE; 130 break; 131 } 132 } else if (slots[curr].transform == 133 BROTLI_TRANSFORM_UPPERCASE_FIRST) { 134 if ((transformed_word[0] == (other_word[0] - 32)) && 135 memcmp(transformed_word + 1, other_word + 1, l - 1) == 0) { 136 found = BROTLI_TRUE; 137 break; 138 } 139 } else { 140 for (k = 0; k < l; ++k) { 141 if (other_word[k] >= 'a' && other_word[k] <= 'z') { 142 transformed_other_word[k] = other_word[k] - 32; 143 } else { 144 transformed_other_word[k] = other_word[k]; 145 } 146 } 147 if (memcmp(transformed_word, transformed_other_word, l) == 0) { 148 found = BROTLI_TRUE; 149 break; 150 } 151 } 152 } 153 curr = prev[curr]; 154 } 155 if (found) { 156 continue; 157 } 158 slots[next_slot].len = (uint8_t)l; 159 slots[next_slot].transform = BROTLI_TRANSFORM_UPPERCASE_ALL; 160 slots[next_slot].idx = (uint16_t)i; 161 prev[next_slot] = heads[key]; 162 heads[key] = (uint16_t)next_slot; 163 counts[key]++; 164 ++next_slot; 165 } 166 } 167 168 if (next_slot != 31704) return BROTLI_FALSE; 169 pos = 0; 170 /* Unused; makes offsets start from 1. */ 171 words[pos].len = 0; 172 words[pos].transform = 0; 173 words[pos].idx = 0; 174 pos++; 175 for (i = 0; i < BROTLI_ENC_STATIC_DICT_LUT_NUM_BUCKETS; ++i) { 176 size_t num_words = counts[i]; 177 size_t curr; 178 if (num_words == 0) { 179 buckets[i] = 0; 180 continue; 181 } 182 buckets[i] = (uint16_t)pos; 183 curr = heads[i]; 184 pos += num_words; 185 for (size_t k = 0; k < num_words; ++k) { 186 words[pos - 1 - k] = slots[curr]; 187 curr = prev[curr]; 188 } 189 words[pos - 1].len |= 0x80; 190 } 191 return BROTLI_TRUE; 192 } 193 194 BROTLI_BOOL BrotliEncoderInitStaticDictionaryLut( 195 const BrotliDictionary* dict, uint16_t* buckets, DictWord* words) { 196 size_t arena_size = 197 BROTLI_ENC_STATIC_DICT_LUT_NUM_ITEMS * 198 (sizeof(uint16_t) + sizeof(DictWord)) + 199 BROTLI_ENC_STATIC_DICT_LUT_NUM_BUCKETS * 2 * sizeof(uint16_t); 200 void* arena = malloc(arena_size); 201 BROTLI_BOOL ok; 202 if (arena == NULL) { 203 return BROTLI_FALSE; 204 } 205 ok = DoBrotliEncoderInitStaticDictionaryLut(dict, buckets, words, arena); 206 free(arena); 207 return ok; 208 } 209 210 BROTLI_MODEL("small") 211 uint16_t kStaticDictionaryBuckets[BROTLI_ENC_STATIC_DICT_LUT_NUM_BUCKETS]; 212 BROTLI_MODEL("small") 213 DictWord kStaticDictionaryWords[BROTLI_ENC_STATIC_DICT_LUT_NUM_ITEMS]; 214 215 #else /* BROTLI_STATIC_INIT */ 216 217 /* Embed kStaticDictionaryBuckets and kStaticDictionaryWords. */ 218 #include "static_dict_lut_inc.h" 219 220 #endif /* BROTLI_STATIC_INIT */ 221 222 #if defined(__cplusplus) || defined(c_plusplus) 223 } /* extern "C" */ 224 #endif