polyval.c (17236B)
1 /* Copyright (c) 2025, The Tor Project, Inc. */ 2 /* See LICENSE for licensing information */ 3 4 /** 5 * \file polyval.h 6 * \brief Implementation for polyval universal hash function. 7 * 8 * Polyval, which was first defined for AES-GCM-SIV, is a 9 * universal hash based on multiplication in GF(2^128). 10 * Unlike the more familiar GHASH, it is defined to work on 11 * little-endian inputs, and so is more straightforward and efficient 12 * on little-endian architectures. 13 * 14 * In Tor we use it as part of the Counter Galois Onion relay 15 * encryption format. 16 **/ 17 18 /* Implementation notes: 19 * 20 * Our implementation is based on the GHASH code from BearSSL 21 * by Thomas Pornin. There are three implementations: 22 * 23 * pclmul.c -- An x86-only version, based on the CLMUL instructions 24 * introduced for westlake processors in 2010. 25 * 26 * ctmul64.c -- A portable contant-time implementation for 64-bit 27 * processors. 28 * 29 * ctmul.c -- A portable constant-time implementation for 32-bit 30 * processors with an efficient 32x32->64 multiply instruction. 31 * 32 * Note that the "ctmul" implementations are only constant-time 33 * if the corresponding CPU multiply and rotate instructions are 34 * also constant-time. But that's an architectural assumption we 35 * make in Tor. 36 */ 37 38 #include "ext/polyval/polyval.h" 39 #include "lib/cc/compat_compiler.h" 40 41 #include <string.h> 42 43 #ifdef PV_USE_PCLMUL_DETECT 44 #include <cpuid.h> 45 #endif 46 47 typedef pv_u128_ u128; 48 49 /* ======== 50 * We declare these functions, to help implement polyval. 51 * 52 * They have different definitions depending on our representation 53 * of 128-bit integers. 54 */ 55 #if 0 56 /** 57 * Read a u128-bit little-endian integer from 'bytes', 58 * which may not be aligned. 59 */ 60 static inline u128 u128_from_bytes(const uint8_t *bytes); 61 /** 62 * Store a u128-bit little-endian integer to 'bytes_out', 63 * which may not be aligned. 64 */ 65 static inline void u128_to_bytes(u128, uint8_t *bytes_out); 66 /** 67 * XOR a u128 into the y field of a polyval_t struct. 68 * 69 * (Within the polyval struct, perform "y ^= v"). 70 */ 71 static inline void pv_xor_y(polyval_t *, u128 v); 72 73 /* ======== 74 * The function which we expect our backend to implement. 75 */ 76 /** 77 * Within the polyval struct, perform "y *= h". 78 * 79 * (This is a carryless multiply in the Polyval galois field) 80 */ 81 static void pv_mul_y_h(polyval_t *);h 82 #endif 83 84 /* ===== 85 * Endianness conversion for big-endian platforms 86 */ 87 #ifdef WORDS_BIG_ENDIAN 88 #ifdef __GNUC__ 89 #define bswap64(x) __builtin_bswap64(x) 90 #define bswap32(x) __builtin_bswap32(x) 91 #else 92 /* The (compiler should optimize these into a decent bswap instruction) */ 93 static inline uint64_t 94 bswap64(uint64_t v) 95 { 96 return 97 ((value & 0xFF00000000000000) >> 56) | 98 ((value & 0x00FF000000000000) >> 48) | 99 ((value & 0x0000FF0000000000) >> 40) | 100 ((value & 0x000000FF00000000) >> 32) | 101 ((value & 0x00000000FF000000) >> 24) | 102 ((value & 0x0000000000FF0000) >> 16) | 103 ((value & 0x000000000000FF00) >> 8) | 104 ((value & 0x00000000000000FF)); 105 } 106 static inline uint64_t 107 bswap32(uint64_t v) 108 { 109 return 110 ((value & 0xFF000000) >> 24) | 111 ((value & 0x00FF0000) >> 16) | 112 ((value & 0x0000FF00) >> 8) | 113 ((value & 0x000000FF)); 114 } 115 #endif 116 #endif 117 118 #ifdef WORDS_BIG_ENDIAN 119 #define convert_byte_order64(x) bswap64(x) 120 #define convert_byte_order32(x) bswap32(x) 121 #else 122 #define convert_byte_order64(x) (x) 123 #define convert_byte_order32(x) (x) 124 #endif 125 126 #if defined PV_USE_PCLMUL_UNCONDITIONAL 127 #define PCLMUL_MEMBER(v) (v) 128 #define PV_USE_PCLMUL 129 130 #elif defined PV_USE_PCLMUL_DETECT 131 #define PCLMUL_MEMBER(v) (v).u128x1 132 #define CTMUL64_MEMBER(v) (v).u64x2 133 #define PV_USE_PCLMUL 134 #define PV_USE_CTMUL64 135 136 #elif defined PV_USE_CTMUL64 137 #define CTMUL64_MEMBER(v) (v) 138 #endif 139 140 #ifdef PV_USE_PCLMUL 141 #include "ext/polyval/pclmul.c" 142 143 DISABLE_GCC_WARNING("-Waggregate-return") 144 static inline u128 145 u128_from_bytes_pclmul(const uint8_t *bytes) 146 { 147 u128 r; 148 PCLMUL_MEMBER(r) = _mm_loadu_si128((const __m128i*)bytes); 149 return r; 150 } 151 ENABLE_GCC_WARNING("-Waggregate-return") 152 153 static inline void 154 u128_to_bytes_pclmul(u128 val, uint8_t *bytes_out) 155 { 156 _mm_storeu_si128((__m128i*)bytes_out, PCLMUL_MEMBER(val)); 157 } 158 static inline void 159 pv_xor_y_pclmul(polyval_t *pv, u128 v) 160 { 161 PCLMUL_MEMBER(pv->y) = _mm_xor_si128(PCLMUL_MEMBER(pv->y), 162 PCLMUL_MEMBER(v)); 163 } 164 #endif 165 166 #if defined(PV_USE_CTMUL64) 167 #include "ext/polyval/ctmul64.c" 168 169 DISABLE_GCC_WARNING("-Waggregate-return") 170 static inline u128 171 u128_from_bytes_ctmul64(const uint8_t *bytes) 172 { 173 u128 r; 174 memcpy(&CTMUL64_MEMBER(r).lo, bytes, 8); 175 memcpy(&CTMUL64_MEMBER(r).hi, bytes + 8, 8); 176 CTMUL64_MEMBER(r).lo = convert_byte_order64(CTMUL64_MEMBER(r).lo); 177 CTMUL64_MEMBER(r).hi = convert_byte_order64(CTMUL64_MEMBER(r).hi); 178 return r; 179 } 180 ENABLE_GCC_WARNING("-Waggregate-return") 181 182 static inline void 183 u128_to_bytes_ctmul64(u128 val, uint8_t *bytes_out) 184 { 185 uint64_t lo = convert_byte_order64(CTMUL64_MEMBER(val).lo); 186 uint64_t hi = convert_byte_order64(CTMUL64_MEMBER(val).hi); 187 memcpy(bytes_out, &lo, 8); 188 memcpy(bytes_out + 8, &hi, 8); 189 } 190 static inline void 191 pv_xor_y_ctmul64(polyval_t *pv, u128 val) 192 { 193 CTMUL64_MEMBER(pv->y).lo ^= CTMUL64_MEMBER(val).lo; 194 CTMUL64_MEMBER(pv->y).hi ^= CTMUL64_MEMBER(val).hi; 195 } 196 #endif 197 198 #if defined(PV_USE_CTMUL) 199 #include "ext/polyval/ctmul.c" 200 201 DISABLE_GCC_WARNING("-Waggregate-return") 202 static inline u128 203 u128_from_bytes_ctmul(const uint8_t *bytes) 204 { 205 u128 r; 206 memcpy(&r.v, bytes, 16); 207 for (int i = 0; i < 4; ++i) { 208 r.v[i] = convert_byte_order32(r.v[i]); 209 } 210 return r; 211 } 212 ENABLE_GCC_WARNING("-Waggregate-return") 213 214 static inline void 215 u128_to_bytes_ctmul(u128 val, uint8_t *bytes_out) 216 { 217 uint32_t v[4]; 218 for (int i = 0; i < 4; ++i) { 219 v[i] = convert_byte_order32(val.v[i]); 220 } 221 memcpy(bytes_out, v, 16); 222 } 223 static inline void 224 pv_xor_y_ctmul(polyval_t *pv, u128 val) 225 { 226 for (int i = 0; i < 4; ++i) { 227 pv->y.v[i] ^= val.v[i]; 228 } 229 } 230 #endif 231 232 struct expanded_key_none {}; 233 static inline void add_multiple_none(polyval_t *pv, 234 const uint8_t *input, 235 const struct expanded_key_none *expanded) 236 { 237 (void) pv; 238 (void) input; 239 (void) expanded; 240 } 241 static inline void expand_key_none(const polyval_t *inp, 242 struct expanded_key_none *out) 243 { 244 (void) inp; 245 (void) out; 246 } 247 248 /* Kludge: a special value to use for block_stride when we don't support 249 * processing multiple blocks at once. Previously we used 0, but that 250 * caused warnings with some comparisons. */ 251 #define BLOCK_STRIDE_NONE 0xffff 252 253 DISABLE_GCC_WARNING("-Waggregate-return") 254 #define PV_DECLARE(prefix, \ 255 st, \ 256 u128_from_bytes, \ 257 u128_to_bytes, \ 258 pv_xor_y, \ 259 pv_mul_y_h, \ 260 block_stride, \ 261 expanded_key_tp, expand_fn, add_multiple_fn) \ 262 st void \ 263 prefix ## polyval_key_init(polyval_key_t *pvk, const uint8_t *key) \ 264 { \ 265 pvk->h = u128_from_bytes(key); \ 266 } \ 267 st void \ 268 prefix ## polyval_init(polyval_t *pv, const uint8_t *key) \ 269 { \ 270 polyval_key_init(&pv->key, key); \ 271 memset(&pv->y, 0, sizeof(u128)); \ 272 } \ 273 st void \ 274 prefix ## polyval_init_from_key(polyval_t *pv, const polyval_key_t *key) \ 275 { \ 276 memcpy(&pv->key, key, sizeof(polyval_key_t)); \ 277 memset(&pv->y, 0, sizeof(u128)); \ 278 } \ 279 st void \ 280 prefix ## polyval_add_block(polyval_t *pv, const uint8_t *block) \ 281 { \ 282 u128 b = u128_from_bytes(block); \ 283 pv_xor_y(pv, b); \ 284 pv_mul_y_h(pv); \ 285 } \ 286 st void \ 287 prefix ## polyval_add_zpad(polyval_t *pv, const uint8_t *data, size_t n) \ 288 { \ 289 /* since block_stride is a constant, this should get optimized */ \ 290 if ((block_stride != BLOCK_STRIDE_NONE) \ 291 && n >= (block_stride) * 16) { \ 292 expanded_key_tp expanded_key; \ 293 expand_fn(pv, &expanded_key); \ 294 while (n >= (block_stride) * 16) { \ 295 add_multiple_fn(pv, data, &expanded_key); \ 296 n -= block_stride*16; \ 297 data += block_stride * 16; \ 298 } \ 299 } \ 300 while (n > 16) { \ 301 polyval_add_block(pv, data); \ 302 data += 16; \ 303 n -= 16; \ 304 } \ 305 if (n) { \ 306 uint8_t block[16]; \ 307 memset(&block, 0, sizeof(block)); \ 308 memcpy(block, data, n); \ 309 polyval_add_block(pv, block); \ 310 } \ 311 } \ 312 st void \ 313 prefix ## polyval_get_tag(const polyval_t *pv, uint8_t *tag_out) \ 314 { \ 315 u128_to_bytes(pv->y, tag_out); \ 316 } \ 317 st void \ 318 prefix ## polyval_reset(polyval_t *pv) \ 319 { \ 320 memset(&pv->y, 0, sizeof(u128)); \ 321 } 322 ENABLE_GCC_WARNING("-Waggregate-return") 323 324 #ifdef PV_USE_PCLMUL_DETECT 325 /* We use a boolean to distinguish whether to use the PCLMUL instructions, 326 * but instead we could use function pointers. It's probably worth 327 * benchmarking, though it's unlikely to make a measurable difference. 328 */ 329 static bool use_pclmul = false; 330 331 /* Declare _both_ variations of our code, statically, 332 * with different prefixes. */ 333 DISABLE_GCC_WARNING("-Waggregate-return") 334 PV_DECLARE(pclmul_, static, 335 u128_from_bytes_pclmul, 336 u128_to_bytes_pclmul, 337 pv_xor_y_pclmul, 338 pv_mul_y_h_pclmul, 339 PV_BLOCK_STRIDE, 340 pv_expanded_key_t, 341 expand_key_pclmul, 342 pv_add_multiple_pclmul) 343 344 PV_DECLARE(ctmul64_, static, 345 u128_from_bytes_ctmul64, 346 u128_to_bytes_ctmul64, 347 pv_xor_y_ctmul64, 348 pv_mul_y_h_ctmul64, 349 BLOCK_STRIDE_NONE, 350 struct expanded_key_none, 351 expand_key_none, 352 add_multiple_none) 353 ENABLE_GCC_WARNING("-Waggregate-return") 354 355 void 356 polyval_key_init(polyval_key_t *pv, const uint8_t *key) 357 { 358 if (use_pclmul) 359 pclmul_polyval_key_init(pv, key); 360 else 361 ctmul64_polyval_key_init(pv, key); 362 } 363 void 364 polyval_init(polyval_t *pv, const uint8_t *key) 365 { 366 if (use_pclmul) 367 pclmul_polyval_init(pv, key); 368 else 369 ctmul64_polyval_init(pv, key); 370 } 371 void 372 polyval_init_from_key(polyval_t *pv, const polyval_key_t *key) 373 { 374 if (use_pclmul) 375 pclmul_polyval_init_from_key(pv, key); 376 else 377 ctmul64_polyval_init_from_key(pv, key); 378 } 379 void 380 polyval_add_block(polyval_t *pv, const uint8_t *block) 381 { 382 if (use_pclmul) 383 pclmul_polyval_add_block(pv, block); 384 else 385 ctmul64_polyval_add_block(pv, block); 386 } 387 void 388 polyval_add_zpad(polyval_t *pv, const uint8_t *data, size_t n) 389 { 390 if (use_pclmul) 391 pclmul_polyval_add_zpad(pv, data, n); 392 else 393 ctmul64_polyval_add_zpad(pv, data, n); 394 } 395 void 396 polyval_get_tag(const polyval_t *pv, uint8_t *tag_out) 397 { 398 if (use_pclmul) 399 pclmul_polyval_get_tag(pv, tag_out); 400 else 401 ctmul64_polyval_get_tag(pv, tag_out); 402 } 403 void 404 polyval_reset(polyval_t *pv) 405 { 406 if (use_pclmul) 407 pclmul_polyval_reset(pv); 408 else 409 ctmul64_polyval_reset(pv); 410 } 411 412 #elif defined(PV_USE_PCLMUL) 413 PV_DECLARE(, , 414 u128_from_bytes_pclmul, 415 u128_to_bytes_pclmul, 416 pv_xor_y_pclmul, 417 pv_mul_y_h_pclmul, 418 PV_BLOCK_STRIDE, 419 pv_expanded_key_t, 420 expand_key_pclmul, 421 pv_add_multiple_pclmul) 422 #elif defined(PV_USE_CTMUL64) 423 PV_DECLARE(, , 424 u128_from_bytes_ctmul64, 425 u128_to_bytes_ctmul64, 426 pv_xor_y_ctmul64, 427 pv_mul_y_h_ctmul64, 428 BLOCK_STRIDE_NONE, 429 struct expanded_key_none, 430 expand_key_none, 431 add_multiple_none) 432 433 #elif defined(PV_USE_CTMUL) 434 DISABLE_GCC_WARNING("-Waggregate-return") 435 PV_DECLARE(, , u128_from_bytes_ctmul, 436 u128_to_bytes_ctmul, 437 pv_xor_y_ctmul, 438 pv_mul_y_h_ctmul, 439 BLOCK_STRIDE_NONE, 440 struct expanded_key_none, 441 expand_key_none, 442 add_multiple_none) 443 ENABLE_GCC_WARNING("-Waggregate-return") 444 #endif 445 446 #ifdef PV_USE_PCLMUL_DETECT 447 void 448 polyval_detect_implementation(void) 449 { 450 unsigned int eax, ebx, ecx, edx; 451 use_pclmul = false; 452 if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) { 453 if (0 != (ecx & bit_PCLMUL)) { 454 use_pclmul = true; 455 } 456 } 457 } 458 #else 459 void 460 polyval_detect_implementation(void) 461 { 462 } 463 #endif 464 465 #ifdef POLYVAL_USE_EXPANDED_KEYS 466 467 #ifdef PV_USE_PCLMUL_DETECT 468 #define SHOULD_EXPAND() (use_pclmul) 469 #else 470 #define SHOULD_EXPAND() (1) 471 #endif 472 473 void 474 polyvalx_init(polyvalx_t *pvx, const uint8_t *key) 475 { 476 polyval_init(&pvx->pv, key); 477 if (SHOULD_EXPAND()) { 478 expand_key_pclmul(&pvx->pv, &pvx->expanded); 479 } 480 } 481 void 482 polyvalx_init_from_key(polyvalx_t *pvx, const polyval_key_t *key) 483 { 484 polyval_init_from_key(&pvx->pv, key); 485 if (SHOULD_EXPAND()) { 486 expand_key_pclmul(&pvx->pv, &pvx->expanded); 487 } 488 } 489 void 490 polyvalx_add_block(polyvalx_t *pvx, const uint8_t *block) 491 { 492 polyval_add_block(&pvx->pv, block); 493 } 494 void 495 polyvalx_add_zpad(polyvalx_t *pvx, const uint8_t *data, size_t n) 496 { 497 if (SHOULD_EXPAND() && n >= PV_BLOCK_STRIDE * 16) { 498 while (n > PV_BLOCK_STRIDE * 16) { 499 pv_add_multiple_pclmul(&pvx->pv, data, &pvx->expanded); 500 data += PV_BLOCK_STRIDE * 16; 501 n -= PV_BLOCK_STRIDE * 16; 502 } 503 } 504 while (n > 16) { 505 polyval_add_block(&pvx->pv, data); 506 data += 16; 507 n -= 16; 508 } 509 if (n) { 510 uint8_t block[16]; 511 memset(&block, 0, sizeof(block)); 512 memcpy(block, data, n); 513 polyval_add_block(&pvx->pv, block); 514 } 515 } 516 void 517 polyvalx_get_tag(const polyvalx_t *pvx, uint8_t *tag_out) 518 { 519 polyval_get_tag(&pvx->pv, tag_out); 520 } 521 void polyvalx_reset(polyvalx_t *pvx) 522 { 523 polyval_reset(&pvx->pv); 524 } 525 #endif 526 527 #if 0 528 #include <stdio.h> 529 int 530 main(int c, char **v) 531 { 532 // From RFC 8452 appendix A 533 uint8_t key[16] = 534 { 0x25, 0x62, 0x93, 0x47, 0x58, 0x92, 0x42, 0x76, 535 0x1d, 0x31, 0xf8, 0x26, 0xba, 0x4b, 0x75, 0x7b }; 536 uint8_t block1[16] = 537 { 0x4f, 0x4f, 0x95, 0x66, 0x8c, 0x83, 0xdf, 0xb6, 538 0x40, 0x17, 0x62, 0xbb, 0x2d, 0x01, 0xa2, 0x62 }; 539 uint8_t block2[16] = { 540 0xd1, 0xa2, 0x4d, 0xdd, 0x27, 0x21, 0xd0, 0x06, 541 0xbb, 0xe4, 0x5f, 0x20, 0xd3, 0xc9, 0xf3, 0x62 }; 542 uint8_t expect_result[16] = { 543 0xf7, 0xa3, 0xb4, 0x7b, 0x84, 0x61, 0x19, 0xfa, 544 0xe5, 0xb7, 0x86, 0x6c, 0xf5, 0xe5, 0xb7, 0x7e }; 545 546 polyval_t pv; 547 uint8_t tag[16]; 548 polyval_init(&pv, key); 549 polyval_add_block(&pv, block1); 550 polyval_add_block(&pv, block2); 551 polyval_get_tag(&pv, tag); 552 if (!memcmp(expect_result, tag, 16)) { 553 puts("OK"); 554 return 0; 555 } else { 556 puts("NOPE"); 557 return 1; 558 } 559 } 560 #endif