cfl_avx2.c
/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#include <immintrin.h>

#include "config/av1_rtcd.h"

#include "av1/common/cfl.h"

#include "av1/common/x86/cfl_simd.h"

#define CFL_GET_SUBSAMPLE_FUNCTION_AVX2(sub, bd)                         \
  CFL_SUBSAMPLE(avx2, sub, bd, 32, 32)                                   \
  CFL_SUBSAMPLE(avx2, sub, bd, 32, 16)                                   \
  CFL_SUBSAMPLE(avx2, sub, bd, 32, 8)                                    \
  cfl_subsample_##bd##_fn cfl_get_luma_subsampling_##sub##_##bd##_avx2(  \
      TX_SIZE tx_size) {                                                 \
    static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = {   \
      cfl_subsample_##bd##_##sub##_4x4_ssse3,   /* 4x4 */                \
      cfl_subsample_##bd##_##sub##_8x8_ssse3,   /* 8x8 */                \
      cfl_subsample_##bd##_##sub##_16x16_ssse3, /* 16x16 */              \
      cfl_subsample_##bd##_##sub##_32x32_avx2,  /* 32x32 */              \
      NULL,                                     /* 64x64 (invalid CFL size) */ \
      cfl_subsample_##bd##_##sub##_4x8_ssse3,   /* 4x8 */                \
      cfl_subsample_##bd##_##sub##_8x4_ssse3,   /* 8x4 */                \
      cfl_subsample_##bd##_##sub##_8x16_ssse3,  /* 8x16 */               \
      cfl_subsample_##bd##_##sub##_16x8_ssse3,  /* 16x8 */               \
      cfl_subsample_##bd##_##sub##_16x32_ssse3, /* 16x32 */              \
      cfl_subsample_##bd##_##sub##_32x16_avx2,  /* 32x16 */              \
      NULL,                                     /* 32x64 (invalid CFL size) */ \
      NULL,                                     /* 64x32 (invalid CFL size) */ \
      cfl_subsample_##bd##_##sub##_4x16_ssse3,  /* 4x16 */               \
      cfl_subsample_##bd##_##sub##_16x4_ssse3,  /* 16x4 */               \
      cfl_subsample_##bd##_##sub##_8x32_ssse3,  /* 8x32 */               \
      cfl_subsample_##bd##_##sub##_32x8_avx2,   /* 32x8 */               \
      NULL,                                     /* 16x64 (invalid CFL size) */ \
      NULL,                                     /* 64x16 (invalid CFL size) */ \
    };                                                                   \
    return subfn_##sub[tx_size];                                         \
  }
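// Note on the dispatch strategy: the AVX2 subsampling kernels in this file
// only handle a luma width of 32; the prediction and subtract-average kernels
// handle widths 16 and 32 (low-bitdepth prediction only 32). Every other
// block size in the dispatch tables falls back to the SSSE3/SSE2 versions
// declared in "av1/common/x86/cfl_simd.h", and the 64-wide/64-tall transform
// sizes are not valid CfL sizes, hence the NULL entries. As an illustration
// (assuming the usual CFL_SUBSAMPLE token pasting from cfl.h),
// CFL_GET_SUBSAMPLE_FUNCTION_AVX2(420, lbd) is expected to emit
// cfl_subsample_lbd_420_32x32_avx2, cfl_subsample_lbd_420_32x16_avx2,
// cfl_subsample_lbd_420_32x8_avx2 and the
// cfl_get_luma_subsampling_420_lbd_avx2 dispatcher.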
/**
 * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2, resulting in a more
 * precise version of a box filter 4:2:0 pixel subsampling in Q3.
 *
 * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
 * active area is specified using width and height.
 *
 * Note: We don't need to worry about going over the active area, as long as we
 * stay inside the CfL prediction buffer.
 *
 * Note: For 4:2:0 luma subsampling, the width will never be greater than 16.
 */
static void cfl_luma_subsampling_420_lbd_avx2(const uint8_t *input,
                                              int input_stride,
                                              uint16_t *pred_buf_q3, int width,
                                              int height) {
  (void)width;                               // Forever 32
  const __m256i twos = _mm256_set1_epi8(2);  // Thirty two twos
  const int luma_stride = input_stride << 1;
  __m256i *row = (__m256i *)pred_buf_q3;
  const __m256i *row_end = row + (height >> 1) * CFL_BUF_LINE_I256;
  do {
    __m256i top = _mm256_loadu_si256((__m256i *)input);
    __m256i bot = _mm256_loadu_si256((__m256i *)(input + input_stride));

    __m256i top_16x16 = _mm256_maddubs_epi16(top, twos);
    __m256i bot_16x16 = _mm256_maddubs_epi16(bot, twos);
    __m256i sum_16x16 = _mm256_add_epi16(top_16x16, bot_16x16);

    _mm256_storeu_si256(row, sum_16x16);

    input += luma_stride;
  } while ((row += CFL_BUF_LINE_I256) < row_end);
}

CFL_GET_SUBSAMPLE_FUNCTION_AVX2(420, lbd)

/**
 * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4, resulting in a more
 * precise version of a box filter 4:2:2 pixel subsampling in Q3.
 *
 * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
 * active area is specified using width and height.
 *
 * Note: We don't need to worry about going over the active area, as long as we
 * stay inside the CfL prediction buffer.
 */
static void cfl_luma_subsampling_422_lbd_avx2(const uint8_t *input,
                                              int input_stride,
                                              uint16_t *pred_buf_q3, int width,
                                              int height) {
  (void)width;                                // Forever 32
  const __m256i fours = _mm256_set1_epi8(4);  // Thirty two fours
  __m256i *row = (__m256i *)pred_buf_q3;
  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
  do {
    __m256i top = _mm256_loadu_si256((__m256i *)input);
    __m256i top_16x16 = _mm256_maddubs_epi16(top, fours);
    _mm256_storeu_si256(row, top_16x16);
    input += input_stride;
  } while ((row += CFL_BUF_LINE_I256) < row_end);
}

CFL_GET_SUBSAMPLE_FUNCTION_AVX2(422, lbd)

/**
 * Multiplies the pixels by 8 (scaling in Q3). The AVX2 subsampling is only
 * performed on blocks of width 32.
 *
 * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
 * active area is specified using width and height.
 *
 * Note: We don't need to worry about going over the active area, as long as we
 * stay inside the CfL prediction buffer.
 */
static void cfl_luma_subsampling_444_lbd_avx2(const uint8_t *input,
                                              int input_stride,
                                              uint16_t *pred_buf_q3, int width,
                                              int height) {
  (void)width;  // Forever 32
  __m256i *row = (__m256i *)pred_buf_q3;
  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
  const __m256i zeros = _mm256_setzero_si256();
  do {
    __m256i top = _mm256_loadu_si256((__m256i *)input);
    top = _mm256_permute4x64_epi64(top, _MM_SHUFFLE(3, 1, 2, 0));

    __m256i row_lo = _mm256_unpacklo_epi8(top, zeros);
    row_lo = _mm256_slli_epi16(row_lo, 3);
    __m256i row_hi = _mm256_unpackhi_epi8(top, zeros);
    row_hi = _mm256_slli_epi16(row_hi, 3);

    _mm256_storeu_si256(row, row_lo);
    _mm256_storeu_si256(row + 1, row_hi);

    input += input_stride;
  } while ((row += CFL_BUF_LINE_I256) < row_end);
}

CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, lbd)
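// For reference, the three low-bitdepth kernels above all write luma values
// pre-scaled to Q3 (the subsampled average times 8). A rough scalar sketch of
// the per-output arithmetic they vectorize (an illustration only, not the
// project's reference implementation):
//
//   out[i] = (top[2 * i] + top[2 * i + 1] +
//             bot[2 * i] + bot[2 * i + 1]) << 1;  // 4:2:0: sum of 4, times 2
//   out[i] = (top[2 * i] + top[2 * i + 1]) << 2;  // 4:2:2: sum of 2, times 4
//   out[i] = top[i] << 3;                         // 4:4:4: pixel times 8
//
// _mm256_maddubs_epi16 with a vector of twos (or fours) performs the
// horizontal pairwise add and the Q3 scaling in a single instruction.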
#if CONFIG_AV1_HIGHBITDEPTH
/**
 * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2, resulting in a more
 * precise version of a box filter 4:2:0 pixel subsampling in Q3.
 *
 * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
 * active area is specified using width and height.
 *
 * Note: We don't need to worry about going over the active area, as long as we
 * stay inside the CfL prediction buffer.
 *
 * Note: For 4:2:0 luma subsampling, the width will never be greater than 16.
 */
static void cfl_luma_subsampling_420_hbd_avx2(const uint16_t *input,
                                              int input_stride,
                                              uint16_t *pred_buf_q3, int width,
                                              int height) {
  (void)width;  // Forever 32
  const int luma_stride = input_stride << 1;
  __m256i *row = (__m256i *)pred_buf_q3;
  const __m256i *row_end = row + (height >> 1) * CFL_BUF_LINE_I256;
  do {
    __m256i top = _mm256_loadu_si256((__m256i *)input);
    __m256i bot = _mm256_loadu_si256((__m256i *)(input + input_stride));
    __m256i sum = _mm256_add_epi16(top, bot);

    __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16));
    __m256i bot_1 = _mm256_loadu_si256((__m256i *)(input + 16 + input_stride));
    __m256i sum_1 = _mm256_add_epi16(top_1, bot_1);

    __m256i hsum = _mm256_hadd_epi16(sum, sum_1);
    hsum = _mm256_permute4x64_epi64(hsum, _MM_SHUFFLE(3, 1, 2, 0));
    hsum = _mm256_add_epi16(hsum, hsum);

    _mm256_storeu_si256(row, hsum);

    input += luma_stride;
  } while ((row += CFL_BUF_LINE_I256) < row_end);
}

CFL_GET_SUBSAMPLE_FUNCTION_AVX2(420, hbd)

/**
 * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4, resulting in a more
 * precise version of a box filter 4:2:2 pixel subsampling in Q3.
 *
 * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
 * active area is specified using width and height.
 *
 * Note: We don't need to worry about going over the active area, as long as we
 * stay inside the CfL prediction buffer.
 */
static void cfl_luma_subsampling_422_hbd_avx2(const uint16_t *input,
                                              int input_stride,
                                              uint16_t *pred_buf_q3, int width,
                                              int height) {
  (void)width;  // Forever 32
  __m256i *row = (__m256i *)pred_buf_q3;
  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
  do {
    __m256i top = _mm256_loadu_si256((__m256i *)input);
    __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16));
    __m256i hsum = _mm256_hadd_epi16(top, top_1);
    hsum = _mm256_permute4x64_epi64(hsum, _MM_SHUFFLE(3, 1, 2, 0));
    hsum = _mm256_slli_epi16(hsum, 2);

    _mm256_storeu_si256(row, hsum);

    input += input_stride;
  } while ((row += CFL_BUF_LINE_I256) < row_end);
}

CFL_GET_SUBSAMPLE_FUNCTION_AVX2(422, hbd)

static void cfl_luma_subsampling_444_hbd_avx2(const uint16_t *input,
                                              int input_stride,
                                              uint16_t *pred_buf_q3, int width,
                                              int height) {
  (void)width;  // Forever 32
  __m256i *row = (__m256i *)pred_buf_q3;
  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
  do {
    __m256i top = _mm256_loadu_si256((__m256i *)input);
    __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16));
    _mm256_storeu_si256(row, _mm256_slli_epi16(top, 3));
    _mm256_storeu_si256(row + 1, _mm256_slli_epi16(top_1, 3));
    input += input_stride;
  } while ((row += CFL_BUF_LINE_I256) < row_end);
}

CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, hbd)
#endif  // CONFIG_AV1_HIGHBITDEPTH
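// Note on the high-bitdepth kernels above: with 16-bit input pixels the
// maddubs trick is not available, so the 4:2:0 and 4:2:2 paths use
// _mm256_hadd_epi16 to add horizontally adjacent pairs. hadd operates within
// each 128-bit lane, so its output interleaves the two source registers
// lane by lane; the _mm256_permute4x64_epi64(..., _MM_SHUFFLE(3, 1, 2, 0))
// that follows restores the natural left-to-right pixel order. The final
// add-to-itself (4:2:0) or shift-left-by-2 (4:2:2) applies the same Q3
// scaling as the low-bitdepth versions.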
static inline __m256i predict_unclipped(const __m256i *input, __m256i alpha_q12,
                                        __m256i alpha_sign, __m256i dc_q0) {
  __m256i ac_q3 = _mm256_loadu_si256(input);
  __m256i ac_sign = _mm256_sign_epi16(alpha_sign, ac_q3);
  __m256i scaled_luma_q0 =
      _mm256_mulhrs_epi16(_mm256_abs_epi16(ac_q3), alpha_q12);
  scaled_luma_q0 = _mm256_sign_epi16(scaled_luma_q0, ac_sign);
  return _mm256_add_epi16(scaled_luma_q0, dc_q0);
}
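// Scalar sketch (hypothetical helper, not used by the code below) of the
// per-pixel arithmetic predict_unclipped() vectorizes, assuming ac_q3 is the
// zero-mean subsampled luma in Q3 and alpha_q3 the CfL scaling factor in Q3.
// Shifting |alpha_q3| into Q12 lets _mm256_mulhrs_epi16, which computes
// (a * b + (1 << 14)) >> 15, perform the Q3 * Q3 -> Q0 rescale with rounding
// on magnitudes; _mm256_sign_epi16 restores the sign of the product.
static inline int cfl_predict_pixel_sketch(int ac_q3, int alpha_q3, int dc_q0) {
  const int abs_ac_q3 = ac_q3 < 0 ? -ac_q3 : ac_q3;
  const int abs_alpha_q12 = (alpha_q3 < 0 ? -alpha_q3 : alpha_q3) << 9;
  // Same rounding as _mm256_mulhrs_epi16: (a * b + (1 << 14)) >> 15.
  const int mag_q0 = (abs_ac_q3 * abs_alpha_q12 + (1 << 14)) >> 15;
  const int scaled_luma_q0 = ((ac_q3 < 0) != (alpha_q3 < 0)) ? -mag_q0 : mag_q0;
  return scaled_luma_q0 + dc_q0;  // Callers clamp/pack to the pixel range.
}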
static inline void cfl_predict_lbd_avx2(const int16_t *pred_buf_q3,
                                        uint8_t *dst, int dst_stride,
                                        int alpha_q3, int width, int height) {
  (void)width;
  const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3);
  const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9);
  const __m256i dc_q0 = _mm256_set1_epi16(*dst);
  __m256i *row = (__m256i *)pred_buf_q3;
  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;

  do {
    __m256i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
    __m256i next = predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
    res = _mm256_packus_epi16(res, next);
    res = _mm256_permute4x64_epi64(res, _MM_SHUFFLE(3, 1, 2, 0));
    _mm256_storeu_si256((__m256i *)dst, res);
    dst += dst_stride;
  } while ((row += CFL_BUF_LINE_I256) < row_end);
}

CFL_PREDICT_X(avx2, 32, 8, lbd)
CFL_PREDICT_X(avx2, 32, 16, lbd)
CFL_PREDICT_X(avx2, 32, 32, lbd)

cfl_predict_lbd_fn cfl_get_predict_lbd_fn_avx2(TX_SIZE tx_size) {
  static const cfl_predict_lbd_fn pred[TX_SIZES_ALL] = {
    cfl_predict_lbd_4x4_ssse3,   /* 4x4 */
    cfl_predict_lbd_8x8_ssse3,   /* 8x8 */
    cfl_predict_lbd_16x16_ssse3, /* 16x16 */
    cfl_predict_lbd_32x32_avx2,  /* 32x32 */
    NULL,                        /* 64x64 (invalid CFL size) */
    cfl_predict_lbd_4x8_ssse3,   /* 4x8 */
    cfl_predict_lbd_8x4_ssse3,   /* 8x4 */
    cfl_predict_lbd_8x16_ssse3,  /* 8x16 */
    cfl_predict_lbd_16x8_ssse3,  /* 16x8 */
    cfl_predict_lbd_16x32_ssse3, /* 16x32 */
    cfl_predict_lbd_32x16_avx2,  /* 32x16 */
    NULL,                        /* 32x64 (invalid CFL size) */
    NULL,                        /* 64x32 (invalid CFL size) */
    cfl_predict_lbd_4x16_ssse3,  /* 4x16 */
    cfl_predict_lbd_16x4_ssse3,  /* 16x4 */
    cfl_predict_lbd_8x32_ssse3,  /* 8x32 */
    cfl_predict_lbd_32x8_avx2,   /* 32x8 */
    NULL,                        /* 16x64 (invalid CFL size) */
    NULL,                        /* 64x16 (invalid CFL size) */
  };
  // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the
  // function pointer array out of bounds.
  return pred[tx_size % TX_SIZES_ALL];
}

#if CONFIG_AV1_HIGHBITDEPTH
static __m256i highbd_max_epi16(int bd) {
  const __m256i neg_one = _mm256_set1_epi16(-1);
  // (1 << bd) - 1 => -(-1 << bd) - 1 => -1 - (-1 << bd) => -1 ^ (-1 << bd)
  return _mm256_xor_si256(_mm256_slli_epi16(neg_one, bd), neg_one);
}

static __m256i highbd_clamp_epi16(__m256i u, __m256i zero, __m256i max) {
  return _mm256_max_epi16(_mm256_min_epi16(u, max), zero);
}

static inline void cfl_predict_hbd_avx2(const int16_t *pred_buf_q3,
                                        uint16_t *dst, int dst_stride,
                                        int alpha_q3, int bd, int width,
                                        int height) {
  // Use SSSE3 version for smaller widths
  assert(width == 16 || width == 32);
  const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3);
  const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9);
  const __m256i dc_q0 = _mm256_loadu_si256((__m256i *)dst);
  const __m256i max = highbd_max_epi16(bd);

  __m256i *row = (__m256i *)pred_buf_q3;
  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
  do {
    const __m256i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
    _mm256_storeu_si256((__m256i *)dst,
                        highbd_clamp_epi16(res, _mm256_setzero_si256(), max));
    if (width == 32) {
      const __m256i res_1 =
          predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
      _mm256_storeu_si256(
          (__m256i *)(dst + 16),
          highbd_clamp_epi16(res_1, _mm256_setzero_si256(), max));
    }
    dst += dst_stride;
  } while ((row += CFL_BUF_LINE_I256) < row_end);
}

CFL_PREDICT_X(avx2, 16, 4, hbd)
CFL_PREDICT_X(avx2, 16, 8, hbd)
CFL_PREDICT_X(avx2, 16, 16, hbd)
CFL_PREDICT_X(avx2, 16, 32, hbd)
CFL_PREDICT_X(avx2, 32, 8, hbd)
CFL_PREDICT_X(avx2, 32, 16, hbd)
CFL_PREDICT_X(avx2, 32, 32, hbd)

cfl_predict_hbd_fn cfl_get_predict_hbd_fn_avx2(TX_SIZE tx_size) {
  static const cfl_predict_hbd_fn pred[TX_SIZES_ALL] = {
    cfl_predict_hbd_4x4_ssse3,  /* 4x4 */
    cfl_predict_hbd_8x8_ssse3,  /* 8x8 */
    cfl_predict_hbd_16x16_avx2, /* 16x16 */
    cfl_predict_hbd_32x32_avx2, /* 32x32 */
    NULL,                       /* 64x64 (invalid CFL size) */
    cfl_predict_hbd_4x8_ssse3,  /* 4x8 */
    cfl_predict_hbd_8x4_ssse3,  /* 8x4 */
    cfl_predict_hbd_8x16_ssse3, /* 8x16 */
    cfl_predict_hbd_16x8_avx2,  /* 16x8 */
    cfl_predict_hbd_16x32_avx2, /* 16x32 */
    cfl_predict_hbd_32x16_avx2, /* 32x16 */
    NULL,                       /* 32x64 (invalid CFL size) */
    NULL,                       /* 64x32 (invalid CFL size) */
    cfl_predict_hbd_4x16_ssse3, /* 4x16 */
    cfl_predict_hbd_16x4_avx2,  /* 16x4 */
    cfl_predict_hbd_8x32_ssse3, /* 8x32 */
    cfl_predict_hbd_32x8_avx2,  /* 32x8 */
    NULL,                       /* 16x64 (invalid CFL size) */
    NULL,                       /* 64x16 (invalid CFL size) */
  };
  // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the
  // function pointer array out of bounds.
  return pred[tx_size % TX_SIZES_ALL];
}
#endif  // CONFIG_AV1_HIGHBITDEPTH
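// Note: in the low-bitdepth path, _mm256_packus_epi16 saturates the unclipped
// predictions to [0, 255] as it packs them down to bytes, so no explicit
// clamp is needed. High-bitdepth output stays 16 bits wide, hence the
// explicit clamp to [0, (1 << bd) - 1]; for example, highbd_max_epi16(10)
// evaluates to 0xFFFF ^ (0xFFFF << 10) = 0x03FF = 1023 in every lane.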
// Returns a vector where all the (32-bit) elements are the sum of all the
// lanes in a.
static inline __m256i fill_sum_epi32(__m256i a) {
  // Given that a == [A, B, C, D, E, F, G, H]
  a = _mm256_hadd_epi32(a, a);
  // Given that A' == A + B, C' == C + D, E' == E + F, G' == G + H
  // a == [A', C', A', C', E', G', E', G']
  a = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0));
  // a == [A', C', E', G', A', C', E', G']
  a = _mm256_hadd_epi32(a, a);
  // Given that A'' == A' + C' and E'' == E' + G'
  // a == [A'', E'', A'', E'', A'', E'', A'', E'']
  return _mm256_hadd_epi32(a, a);
  // Given that A''' == A'' + E''
  // a == [A''', A''', A''', A''', A''', A''', A''', A''']
}

static inline __m256i _mm256_addl_epi16(__m256i a) {
  return _mm256_add_epi32(_mm256_unpacklo_epi16(a, _mm256_setzero_si256()),
                          _mm256_unpackhi_epi16(a, _mm256_setzero_si256()));
}

static inline void subtract_average_avx2(const uint16_t *src_ptr,
                                         int16_t *dst_ptr, int width,
                                         int height, int round_offset,
                                         int num_pel_log2) {
  // Use SSE2 version for smaller widths
  assert(width == 16 || width == 32);

  const __m256i *src = (__m256i *)src_ptr;
  const __m256i *const end = src + height * CFL_BUF_LINE_I256;
  // To maximize usage of the AVX2 registers, we sum two rows per loop
  // iteration
  const int step = 2 * CFL_BUF_LINE_I256;

  __m256i sum = _mm256_setzero_si256();
  // For width 32, we use a second sum accumulator to reduce accumulator
  // dependencies in the loop.
  __m256i sum2;
  if (width == 32) sum2 = _mm256_setzero_si256();

  do {
    // Add top row to the bottom row
    __m256i l0 = _mm256_add_epi16(_mm256_loadu_si256(src),
                                  _mm256_loadu_si256(src + CFL_BUF_LINE_I256));
    sum = _mm256_add_epi32(sum, _mm256_addl_epi16(l0));
    if (width == 32) { /* Don't worry, this `if` gets optimized out. */
      // Add the second part of the top row to the second part of the bottom
      // row.
      __m256i l1 =
          _mm256_add_epi16(_mm256_loadu_si256(src + 1),
                           _mm256_loadu_si256(src + 1 + CFL_BUF_LINE_I256));
      sum2 = _mm256_add_epi32(sum2, _mm256_addl_epi16(l1));
    }
    src += step;
  } while (src < end);
  // Combine both sum accumulators
  if (width == 32) sum = _mm256_add_epi32(sum, sum2);

  __m256i fill = fill_sum_epi32(sum);

  __m256i avg_epi16 = _mm256_srli_epi32(
      _mm256_add_epi32(fill, _mm256_set1_epi32(round_offset)), num_pel_log2);
  avg_epi16 = _mm256_packs_epi32(avg_epi16, avg_epi16);

  // Store and subtract loop
  src = (__m256i *)src_ptr;
  __m256i *dst = (__m256i *)dst_ptr;
  do {
    _mm256_storeu_si256(dst,
                        _mm256_sub_epi16(_mm256_loadu_si256(src), avg_epi16));
    if (width == 32) {
      _mm256_storeu_si256(
          dst + 1, _mm256_sub_epi16(_mm256_loadu_si256(src + 1), avg_epi16));
    }
    src += CFL_BUF_LINE_I256;
    dst += CFL_BUF_LINE_I256;
  } while (src < end);
}

// Declare wrappers for AVX2 sizes
CFL_SUB_AVG_X(avx2, 16, 4, 32, 6)
CFL_SUB_AVG_X(avx2, 16, 8, 64, 7)
CFL_SUB_AVG_X(avx2, 16, 16, 128, 8)
CFL_SUB_AVG_X(avx2, 16, 32, 256, 9)
CFL_SUB_AVG_X(avx2, 32, 8, 128, 8)
CFL_SUB_AVG_X(avx2, 32, 16, 256, 9)
CFL_SUB_AVG_X(avx2, 32, 32, 512, 10)
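// Note on the wrappers above: each CFL_SUB_AVG_X instantiation passes
// round_offset = num_pel / 2 and num_pel_log2 = log2(num_pel) for the block's
// pixel count, so subtract_average_avx2() computes the rounded mean
//   avg = (sum + num_pel / 2) >> num_pel_log2,
// e.g. avg = (sum + 512) >> 10 for a 32x32 block. Subtracting it from every
// Q3 value leaves the zero-mean "AC" luma that predict_unclipped() later
// scales by alpha. _mm256_packs_epi32 simply replicates the 32-bit average
// into all sixteen 16-bit lanes (with saturation) before the subtraction
// loop.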
// Based on the observation that for small blocks AVX2 does not outperform
// SSE2, we call the SSE2 code for block widths 4 and 8.
cfl_subtract_average_fn cfl_get_subtract_average_fn_avx2(TX_SIZE tx_size) {
  static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = {
    cfl_subtract_average_4x4_sse2,   /* 4x4 */
    cfl_subtract_average_8x8_sse2,   /* 8x8 */
    cfl_subtract_average_16x16_avx2, /* 16x16 */
    cfl_subtract_average_32x32_avx2, /* 32x32 */
    NULL,                            /* 64x64 (invalid CFL size) */
    cfl_subtract_average_4x8_sse2,   /* 4x8 */
    cfl_subtract_average_8x4_sse2,   /* 8x4 */
    cfl_subtract_average_8x16_sse2,  /* 8x16 */
    cfl_subtract_average_16x8_avx2,  /* 16x8 */
    cfl_subtract_average_16x32_avx2, /* 16x32 */
    cfl_subtract_average_32x16_avx2, /* 32x16 */
    NULL,                            /* 32x64 (invalid CFL size) */
    NULL,                            /* 64x32 (invalid CFL size) */
    cfl_subtract_average_4x16_sse2,  /* 4x16 */
    cfl_subtract_average_16x4_avx2,  /* 16x4 */
    cfl_subtract_average_8x32_sse2,  /* 8x32 */
    cfl_subtract_average_32x8_avx2,  /* 32x8 */
    NULL,                            /* 16x64 (invalid CFL size) */
    NULL,                            /* 64x16 (invalid CFL size) */
  };
  // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to
  // index the function pointer array out of bounds.
  return sub_avg[tx_size % TX_SIZES_ALL];
}