lossless_avx2.c (18843B)
// Copyright 2025 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// AVX2 variant of methods for lossless decoder
//
// Author: Vincent Rabaud (vrabaud@google.com)

#include "src/dsp/dsp.h"

#if defined(WEBP_USE_AVX2)

#include <stddef.h>
#include <immintrin.h>

#include "src/dsp/cpu.h"
#include "src/dsp/lossless.h"
#include "src/webp/format_constants.h"
#include "src/webp/types.h"

//------------------------------------------------------------------------------
// Predictor Transform

// Sets *avg to the per-byte unsigned average of *a0 and *a1, rounding DOWN.
// _mm256_avg_epu8 rounds up, so the rounding bit ((a ^ b) & 1) is subtracted
// back out.
static WEBP_INLINE void Average2_m256i(const __m256i* const a0,
                                       const __m256i* const a1,
                                       __m256i* const avg) {
  // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
  const __m256i ones = _mm256_set1_epi8(1);
  const __m256i avg1 = _mm256_avg_epu8(*a0, *a1);
  const __m256i one = _mm256_and_si256(_mm256_xor_si256(*a0, *a1), ones);
  *avg = _mm256_sub_epi8(avg1, one);
}

// Batch versions of those functions.

// Predictor0: ARGB_BLACK.
// out[i] = in[i] + ARGB_BLACK, with per-8-bit-channel wraparound, 8 pixels per
// iteration. The tail (num_pixels % 8) falls back to the SSE implementation.
// 'upper' is unused for this predictor.
static void PredictorAdd0_AVX2(const uint32_t* in, const uint32_t* upper,
                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  const __m256i black = _mm256_set1_epi32((int)ARGB_BLACK);
  for (i = 0; i + 8 <= num_pixels; i += 8) {
    const __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);
    const __m256i res = _mm256_add_epi8(src, black);
    _mm256_storeu_si256((__m256i*)&out[i], res);
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_SSE[0](in + i, NULL, num_pixels - i, out + i);
  }
  (void)upper;
}

// Predictor1: left.
// out[i] = in[i] + out[i - 1] (per-channel, mod 256). The serial dependency on
// the previous output is resolved with a log-step prefix sum within each
// 128-bit lane, after which the low lane's 4-pixel total is added into the
// high lane.
static void PredictorAdd1_AVX2(const uint32_t* in, const uint32_t* upper,
                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  // Last output pixel of the previous batch, broadcast to all 8 elements.
  __m256i prev = _mm256_set1_epi32((int)out[-1]);
  for (i = 0; i + 8 <= num_pixels; i += 8) {
    // h | g | f | e | d | c | b | a
    const __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);
    // g | f | e | 0 | c | b | a | 0
    const __m256i shift0 = _mm256_slli_si256(src, 4);
    // g + h | f + g | e + f | e | c + d | b + c | a + b | a
    const __m256i sum0 = _mm256_add_epi8(src, shift0);
    // e + f | e | 0 | 0 | a + b | a | 0 | 0
    const __m256i shift1 = _mm256_slli_si256(sum0, 8);
    // e + f + g + h | e + f + g | e + f | e | a + b + c + d | a + b + c | a + b
    // | a
    const __m256i sum1 = _mm256_add_epi8(sum0, shift1);
    // Add a + b + c + d to the upper lane.
    const int32_t sum_abcd = _mm256_extract_epi32(sum1, 3);
    const __m256i sum2 = _mm256_add_epi8(
        sum1,
        _mm256_set_epi32(sum_abcd, sum_abcd, sum_abcd, sum_abcd, 0, 0, 0, 0));

    const __m256i res = _mm256_add_epi8(sum2, prev);
    _mm256_storeu_si256((__m256i*)&out[i], res);
    // replicate last res output in prev.
    prev = _mm256_permutevar8x32_epi32(
        res, _mm256_set_epi32(7, 7, 7, 7, 7, 7, 7, 7));
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_SSE[1](in + i, upper + i, num_pixels - i, out + i);
  }
}

// Macro that adds 32-bit integers from IN using mod 256 arithmetic
// per 8 bit channel.
// Generates a predictor-add function where the prediction is a single
// neighboring pixel given by the expression IN (evaluated with loop index i in
// scope). out[i] = in[i] + IN, added per 8-bit channel mod 256, 8 pixels per
// iteration; the tail falls back to the SSE version.
#define GENERATE_PREDICTOR_1(X, IN)                                           \
  static void PredictorAdd##X##_AVX2(const uint32_t* in,                      \
                                     const uint32_t* upper, int num_pixels,   \
                                     uint32_t* WEBP_RESTRICT out) {           \
    int i;                                                                    \
    for (i = 0; i + 8 <= num_pixels; i += 8) {                                \
      const __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);         \
      const __m256i other = _mm256_loadu_si256((const __m256i*)&(IN));        \
      const __m256i res = _mm256_add_epi8(src, other);                        \
      _mm256_storeu_si256((__m256i*)&out[i], res);                            \
    }                                                                         \
    if (i != num_pixels) {                                                    \
      VP8LPredictorsAdd_SSE[(X)](in + i, upper + i, num_pixels - i, out + i); \
    }                                                                         \
  }

// Predictor2: Top.
GENERATE_PREDICTOR_1(2, upper[i])
// Predictor3: Top-right.
GENERATE_PREDICTOR_1(3, upper[i + 1])
// Predictor4: Top-left.
GENERATE_PREDICTOR_1(4, upper[i - 1])
#undef GENERATE_PREDICTOR_1

// Due to averages with integers, values cannot be accumulated in parallel for
// predictors 5 to 7.

// Generates a predictor-add function where the prediction is the rounded-down
// per-byte average of the top pixel T and the neighbor given by IN.
#define GENERATE_PREDICTOR_2(X, IN)                                           \
  static void PredictorAdd##X##_AVX2(const uint32_t* in,                      \
                                     const uint32_t* upper, int num_pixels,   \
                                     uint32_t* WEBP_RESTRICT out) {           \
    int i;                                                                    \
    for (i = 0; i + 8 <= num_pixels; i += 8) {                                \
      const __m256i Tother = _mm256_loadu_si256((const __m256i*)&(IN));       \
      const __m256i T = _mm256_loadu_si256((const __m256i*)&upper[i]);        \
      const __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);         \
      __m256i avg, res;                                                       \
      Average2_m256i(&T, &Tother, &avg);                                      \
      res = _mm256_add_epi8(avg, src);                                        \
      _mm256_storeu_si256((__m256i*)&out[i], res);                            \
    }                                                                         \
    if (i != num_pixels) {                                                    \
      VP8LPredictorsAdd_SSE[(X)](in + i, upper + i, num_pixels - i, out + i); \
    }                                                                         \
  }
// Predictor8: average TL T.
GENERATE_PREDICTOR_2(8, upper[i - 1])
// Predictor9: average T TR.
GENERATE_PREDICTOR_2(9, upper[i + 1])
#undef GENERATE_PREDICTOR_2

// Predictor10: average of (average of (L,TL), average of (T, TR)).
// Emits one pixel of predictor 10 from the lowest 32-bit element of each
// vector: out[i + OUT] = src + avg(avg(L, TL), avg(T, TR)). L is updated to
// the newly produced pixel, since it is the "left" input of the next pixel.
#define DO_PRED10(OUT)                                  \
  do {                                                  \
    __m256i avgLTL, avg;                                \
    Average2_m256i(&L, &TL, &avgLTL);                   \
    Average2_m256i(&avgTTR, &avgLTL, &avg);             \
    L = _mm256_add_epi8(avg, src);                      \
    out[i + (OUT)] = (uint32_t)_mm256_cvtsi256_si32(L); \
  } while (0)

#define DO_PRED10_SHIFT                                          \
  do {                                                           \
    /* Rotate the pre-computed values for the next iteration.*/  \
    avgTTR = _mm256_srli_si256(avgTTR, 4);                       \
    TL = _mm256_srli_si256(TL, 4);                               \
    src = _mm256_srli_si256(src, 4);                             \
  } while (0)

// Predictor10: average of (average of (L, TL), average of (T, TR)).
// Each output depends on the previous output (through L), so pixels are
// emitted one at a time; the data-independent avg(T, TR) is precomputed for
// all 8 pixels. After the 4 pixels of the low 128-bit lane, the lanes are
// swapped (permute2x128) so the high lane's values reach the low element that
// DO_PRED10 consumes.
static void PredictorAdd10_AVX2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i, j;
  // L starts as the last pixel of the previous batch, in element 0.
  __m256i L = _mm256_setr_epi32((int)out[-1], 0, 0, 0, 0, 0, 0, 0);
  for (i = 0; i + 8 <= num_pixels; i += 8) {
    __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);
    __m256i TL = _mm256_loadu_si256((const __m256i*)&upper[i - 1]);
    const __m256i T = _mm256_loadu_si256((const __m256i*)&upper[i]);
    const __m256i TR = _mm256_loadu_si256((const __m256i*)&upper[i + 1]);
    __m256i avgTTR;
    Average2_m256i(&T, &TR, &avgTTR);
    {
      // Keep unshifted copies so the high lane can be recovered after the
      // low-lane pixels have consumed (and shifted away) the originals.
      const __m256i avgTTR_bak = avgTTR;
      const __m256i TL_bak = TL;
      const __m256i src_bak = src;
      for (j = 0; j < 4; ++j) {
        DO_PRED10(j);
        DO_PRED10_SHIFT;
      }
      // Swap 128-bit lanes to process the upper 4 pixels.
      avgTTR = _mm256_permute2x128_si256(avgTTR_bak, avgTTR_bak, 1);
      TL = _mm256_permute2x128_si256(TL_bak, TL_bak, 1);
      src = _mm256_permute2x128_si256(src_bak, src_bak, 1);
      for (; j < 8; ++j) {
        DO_PRED10(j);
        DO_PRED10_SHIFT;
      }
    }
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_SSE[10](in + i, upper + i, num_pixels - i, out + i);
  }
}
#undef DO_PRED10
#undef DO_PRED10_SHIFT

// Predictor11: select.
// Emits one pixel of predictor 11 (select) from the lowest element of each
// vector: picks L if sum|L - TL| (pb) exceeds the precomputed sum|T - TL|
// (pa), else T, then adds src. L is updated to the new output pixel.
#define DO_PRED11(OUT)                                                         \
  do {                                                                         \
    const __m256i L_lo = _mm256_unpacklo_epi32(L, T);                          \
    const __m256i TL_lo = _mm256_unpacklo_epi32(TL, T);                        \
    const __m256i pb = _mm256_sad_epu8(L_lo, TL_lo); /* pb = sum |L-TL|*/      \
    const __m256i mask = _mm256_cmpgt_epi32(pb, pa);                           \
    const __m256i A = _mm256_and_si256(mask, L);                               \
    const __m256i B = _mm256_andnot_si256(mask, T);                            \
    const __m256i pred = _mm256_or_si256(A, B); /* pred = (pb > pa)? L : T*/   \
    L = _mm256_add_epi8(src, pred);                                            \
    out[i + (OUT)] = (uint32_t)_mm256_cvtsi256_si32(L);                        \
  } while (0)

#define DO_PRED11_SHIFT                                        \
  do {                                                         \
    /* Shift the pre-computed value for the next iteration.*/  \
    T = _mm256_srli_si256(T, 4);                               \
    TL = _mm256_srli_si256(TL, 4);                             \
    src = _mm256_srli_si256(src, 4);                           \
    pa = _mm256_srli_si256(pa, 4);                             \
  } while (0)

// Predictor11: select.
// pa = sum |T - TL| is independent of the outputs and is precomputed for all
// 8 pixels; pb = sum |L - TL| depends on the running L, so pixels are emitted
// one at a time, with a lane swap after the first 4 (as in predictor 10).
static void PredictorAdd11_AVX2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i, j;
  __m256i pa;
  // L starts as the last pixel of the previous batch, in element 0.
  __m256i L = _mm256_setr_epi32((int)out[-1], 0, 0, 0, 0, 0, 0, 0);
  for (i = 0; i + 8 <= num_pixels; i += 8) {
    __m256i T = _mm256_loadu_si256((const __m256i*)&upper[i]);
    __m256i TL = _mm256_loadu_si256((const __m256i*)&upper[i - 1]);
    __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);
    {
      // We can unpack with any value on the upper 32 bits, provided it's the
      // same on both operands (so that their sum of abs diff is zero). Here we
      // use T.
      const __m256i T_lo = _mm256_unpacklo_epi32(T, T);
      const __m256i TL_lo = _mm256_unpacklo_epi32(TL, T);
      const __m256i T_hi = _mm256_unpackhi_epi32(T, T);
      const __m256i TL_hi = _mm256_unpackhi_epi32(TL, T);
      const __m256i s_lo = _mm256_sad_epu8(T_lo, TL_lo);
      const __m256i s_hi = _mm256_sad_epu8(T_hi, TL_hi);
      pa = _mm256_packs_epi32(s_lo, s_hi);  // pa = sum |T-TL|
    }
    {
      // Keep unshifted copies so the high lane can be recovered after the
      // low-lane pixels have consumed (and shifted away) the originals.
      const __m256i T_bak = T;
      const __m256i TL_bak = TL;
      const __m256i src_bak = src;
      const __m256i pa_bak = pa;
      for (j = 0; j < 4; ++j) {
        DO_PRED11(j);
        DO_PRED11_SHIFT;
      }
      // Swap 128-bit lanes to process the upper 4 pixels.
      T = _mm256_permute2x128_si256(T_bak, T_bak, 1);
      TL = _mm256_permute2x128_si256(TL_bak, TL_bak, 1);
      src = _mm256_permute2x128_si256(src_bak, src_bak, 1);
      pa = _mm256_permute2x128_si256(pa_bak, pa_bak, 1);
      for (; j < 8; ++j) {
        DO_PRED11(j);
        DO_PRED11_SHIFT;
      }
    }
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_SSE[11](in + i, upper + i, num_pixels - i, out + i);
  }
}
#undef DO_PRED11
#undef DO_PRED11_SHIFT

// Predictor12: ClampedAddSubtractFull.
// Emits one pixel of predictor 12 from the lowest element of each vector.
// L holds the previous output pixel widened to 16 bits per channel; DIFF holds
// (T - TL) in signed 16-bit lanes. The per-channel sum L + (T - TL) is clamped
// to [0, 255] by _mm256_packus_epi16, then src is added mod 256. L is updated
// to the new output pixel (re-widened to 16 bits).
#define DO_PRED12(DIFF, OUT)                              \
  do {                                                    \
    const __m256i all = _mm256_add_epi16(L, (DIFF));      \
    const __m256i alls = _mm256_packus_epi16(all, all);   \
    const __m256i res = _mm256_add_epi8(src, alls);       \
    out[i + (OUT)] = (uint32_t)_mm256_cvtsi256_si32(res); \
    L = _mm256_unpacklo_epi8(res, zero);                  \
  } while (0)

// LANE == 1 skips shifting DIFF (used when DIFF is not needed afterwards).
#define DO_PRED12_SHIFT(DIFF, LANE)                           \
  do {                                                        \
    /* Shift the pre-computed value for the next iteration.*/ \
    if ((LANE) == 0) (DIFF) = _mm256_srli_si256(DIFF, 8);     \
    src = _mm256_srli_si256(src, 4);                          \
  } while (0)

// Predictor12: ClampedAddSubtractFull.
// (T - TL) is precomputed in 16-bit precision for all 8 pixels (diff_lo for
// the even pixel pairs, diff_hi for the odd ones); outputs are emitted one at
// a time because of the dependency on the previous output, with a lane swap
// after the low lane's 4 pixels (as in predictors 10 and 11).
static void PredictorAdd12_AVX2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  const __m256i zero = _mm256_setzero_si256();
  // Last pixel of the previous batch, widened to 16 bits per channel.
  const __m256i L8 = _mm256_setr_epi32((int)out[-1], 0, 0, 0, 0, 0, 0, 0);
  __m256i L = _mm256_unpacklo_epi8(L8, zero);
  for (i = 0; i + 8 <= num_pixels; i += 8) {
    // Load 8 pixels at a time.
    __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);
    const __m256i T = _mm256_loadu_si256((const __m256i*)&upper[i]);
    const __m256i T_lo = _mm256_unpacklo_epi8(T, zero);
    const __m256i T_hi = _mm256_unpackhi_epi8(T, zero);
    const __m256i TL = _mm256_loadu_si256((const __m256i*)&upper[i - 1]);
    const __m256i TL_lo = _mm256_unpacklo_epi8(TL, zero);
    const __m256i TL_hi = _mm256_unpackhi_epi8(TL, zero);
    __m256i diff_lo = _mm256_sub_epi16(T_lo, TL_lo);
    __m256i diff_hi = _mm256_sub_epi16(T_hi, TL_hi);
    // Unshifted copies to recover the high lane after the low-lane pixels.
    const __m256i diff_lo_bak = diff_lo;
    const __m256i diff_hi_bak = diff_hi;
    const __m256i src_bak = src;
    DO_PRED12(diff_lo, 0);
    DO_PRED12_SHIFT(diff_lo, 0);
    DO_PRED12(diff_lo, 1);
    DO_PRED12_SHIFT(diff_lo, 0);
    DO_PRED12(diff_hi, 2);
    DO_PRED12_SHIFT(diff_hi, 0);
    DO_PRED12(diff_hi, 3);
    DO_PRED12_SHIFT(diff_hi, 0);

    // Process the upper lane.
    diff_lo = _mm256_permute2x128_si256(diff_lo_bak, diff_lo_bak, 1);
    diff_hi = _mm256_permute2x128_si256(diff_hi_bak, diff_hi_bak, 1);
    src = _mm256_permute2x128_si256(src_bak, src_bak, 1);

    DO_PRED12(diff_lo, 4);
    DO_PRED12_SHIFT(diff_lo, 0);
    DO_PRED12(diff_lo, 5);
    DO_PRED12_SHIFT(diff_lo, 1);
    DO_PRED12(diff_hi, 6);
    DO_PRED12_SHIFT(diff_hi, 0);
    DO_PRED12(diff_hi, 7);
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_SSE[12](in + i, upper + i, num_pixels - i, out + i);
  }
}
#undef DO_PRED12
#undef DO_PRED12_SHIFT

// Due to averages with integers, values cannot be accumulated in parallel for
// predictors 13.

//------------------------------------------------------------------------------
// Subtract-Green Transform

// Adds the green channel to the blue and red channels of each ARGB pixel
// (mod 256), 8 pixels per iteration. kCstShuffle broadcasts each pixel's green
// byte into its red and blue byte positions (alpha/green lanes stay zero via
// the -1 shuffle indices).
static void AddGreenToBlueAndRed_AVX2(const uint32_t* const src, int num_pixels,
                                      uint32_t* dst) {
  int i;
  const __m256i kCstShuffle = _mm256_set_epi8(
      -1, 29, -1, 29, -1, 25, -1, 25, -1, 21, -1, 21, -1, 17, -1, 17, -1, 13,
      -1, 13, -1, 9, -1, 9, -1, 5, -1, 5, -1, 1, -1, 1);
  for (i = 0; i + 8 <= num_pixels; i += 8) {
    const __m256i in = _mm256_loadu_si256((const __m256i*)&src[i]);  // argb
    const __m256i in_0g0g = _mm256_shuffle_epi8(in, kCstShuffle);    // 0g0g
    const __m256i out = _mm256_add_epi8(in, in_0g0g);
    _mm256_storeu_si256((__m256i*)&dst[i], out);
  }
  // fallthrough and finish off with SSE.
  if (i != num_pixels) {
    VP8LAddGreenToBlueAndRed_SSE(src + i, num_pixels - i, dst + i);
  }
}

//------------------------------------------------------------------------------
// Color Transform

// Inverse color transform: red += f(green), blue += g(green) + h(red), using
// the multipliers in 'm' applied through 16-bit high multiplies. Alpha and
// green bytes of the input pass through unchanged (mask_ag blend).
static void TransformColorInverse_AVX2(const VP8LMultipliers* const m,
                                       const uint32_t* const src,
                                       int num_pixels, uint32_t* dst) {
// sign-extended multiplying constants, pre-shifted by 5.
#define CST(X) (((int16_t)(m->X << 8)) >> 5)  // sign-extend
  const __m256i mults_rb =
      _mm256_set1_epi32((int)((uint32_t)CST(green_to_red) << 16 |
                              (CST(green_to_blue) & 0xffff)));
  const __m256i mults_b2 = _mm256_set1_epi32(CST(red_to_blue));
#undef CST
  const __m256i mask_ag = _mm256_set1_epi32((int)0xff00ff00);
  // Broadcasts each pixel's green byte into both 16-bit halves (g0g0 layout).
  const __m256i perm1 = _mm256_setr_epi8(
      -1, 1, -1, 1, -1, 5, -1, 5, -1, 9, -1, 9, -1, 13, -1, 13, -1, 17, -1, 17,
      -1, 21, -1, 21, -1, 25, -1, 25, -1, 29, -1, 29);
  // Extracts each pixel's (already green-corrected) red byte for the
  // red_to_blue multiply.
  const __m256i perm2 = _mm256_setr_epi8(
      -1, 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1, 18, -1,
      -1, -1, 22, -1, -1, -1, 26, -1, -1, -1, 30, -1, -1);
  int i;
  for (i = 0; i + 8 <= num_pixels; i += 8) {
    const __m256i A = _mm256_loadu_si256((const __m256i*)(src + i));
    const __m256i B = _mm256_shuffle_epi8(A, perm1);  // argb -> g0g0
    const __m256i C = _mm256_mulhi_epi16(B, mults_rb);
    const __m256i D = _mm256_add_epi8(A, C);
    const __m256i E = _mm256_shuffle_epi8(D, perm2);
    const __m256i F = _mm256_mulhi_epi16(E, mults_b2);
    const __m256i G = _mm256_add_epi8(D, F);
    const __m256i out = _mm256_blendv_epi8(G, A, mask_ag);
    _mm256_storeu_si256((__m256i*)&dst[i], out);
  }
  // Fall-back to SSE-version for left-overs.
  if (i != num_pixels) {
    VP8LTransformColorInverse_SSE(m, src + i, num_pixels - i, dst + i);
  }
}

//------------------------------------------------------------------------------
// Color-space conversion functions

// Swaps the B and R bytes within each 4-byte pixel (BGRA <-> RGBA), 8 pixels
// per iteration; the tail falls back to the SSE version.
static void ConvertBGRAToRGBA_AVX2(const uint32_t* WEBP_RESTRICT src,
                                   int num_pixels, uint8_t* WEBP_RESTRICT dst) {
  const __m256i* in = (const __m256i*)src;
  __m256i* out = (__m256i*)dst;
  while (num_pixels >= 8) {
    const __m256i A = _mm256_loadu_si256(in++);
    const __m256i B = _mm256_shuffle_epi8(
        A,
        _mm256_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2,
                        15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2));
    _mm256_storeu_si256(out++, B);
    num_pixels -= 8;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGBA_SSE((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}

//------------------------------------------------------------------------------
// Entry point

extern void VP8LDspInitAVX2(void);

// Installs the AVX2 implementations into the global function-pointer tables.
// Predictors 5-7 and 13 keep their previous (non-AVX2) entries — see the
// comments above about integer averages.
WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitAVX2(void) {
  VP8LPredictorsAdd[0] = PredictorAdd0_AVX2;
  VP8LPredictorsAdd[1] = PredictorAdd1_AVX2;
  VP8LPredictorsAdd[2] = PredictorAdd2_AVX2;
  VP8LPredictorsAdd[3] = PredictorAdd3_AVX2;
  VP8LPredictorsAdd[4] = PredictorAdd4_AVX2;
  VP8LPredictorsAdd[8] = PredictorAdd8_AVX2;
  VP8LPredictorsAdd[9] = PredictorAdd9_AVX2;
  VP8LPredictorsAdd[10] = PredictorAdd10_AVX2;
  VP8LPredictorsAdd[11] = PredictorAdd11_AVX2;
  VP8LPredictorsAdd[12] = PredictorAdd12_AVX2;

  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_AVX2;
  VP8LTransformColorInverse = TransformColorInverse_AVX2;
  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_AVX2;
}

#else  // !WEBP_USE_AVX2

WEBP_DSP_INIT_STUB(VP8LDspInitAVX2)

#endif  // WEBP_USE_AVX2