tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

cfl_avx2.c (21372B)


      1 /*
      2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 #include <immintrin.h>
     12 
     13 #include "config/av1_rtcd.h"
     14 
     15 #include "av1/common/cfl.h"
     16 
     17 #include "av1/common/x86/cfl_simd.h"
     18 
     19 #define CFL_GET_SUBSAMPLE_FUNCTION_AVX2(sub, bd)                               \
     20  CFL_SUBSAMPLE(avx2, sub, bd, 32, 32)                                         \
     21  CFL_SUBSAMPLE(avx2, sub, bd, 32, 16)                                         \
     22  CFL_SUBSAMPLE(avx2, sub, bd, 32, 8)                                          \
     23  cfl_subsample_##bd##_fn cfl_get_luma_subsampling_##sub##_##bd##_avx2(        \
     24      TX_SIZE tx_size) {                                                       \
     25    static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = {         \
     26      cfl_subsample_##bd##_##sub##_4x4_ssse3,   /* 4x4 */                      \
     27      cfl_subsample_##bd##_##sub##_8x8_ssse3,   /* 8x8 */                      \
     28      cfl_subsample_##bd##_##sub##_16x16_ssse3, /* 16x16 */                    \
     29      cfl_subsample_##bd##_##sub##_32x32_avx2,  /* 32x32 */                    \
     30      NULL,                                     /* 64x64 (invalid CFL size) */ \
     31      cfl_subsample_##bd##_##sub##_4x8_ssse3,   /* 4x8 */                      \
     32      cfl_subsample_##bd##_##sub##_8x4_ssse3,   /* 8x4 */                      \
     33      cfl_subsample_##bd##_##sub##_8x16_ssse3,  /* 8x16 */                     \
     34      cfl_subsample_##bd##_##sub##_16x8_ssse3,  /* 16x8 */                     \
     35      cfl_subsample_##bd##_##sub##_16x32_ssse3, /* 16x32 */                    \
     36      cfl_subsample_##bd##_##sub##_32x16_avx2,  /* 32x16 */                    \
     37      NULL,                                     /* 32x64 (invalid CFL size) */ \
     38      NULL,                                     /* 64x32 (invalid CFL size) */ \
     39      cfl_subsample_##bd##_##sub##_4x16_ssse3,  /* 4x16  */                    \
     40      cfl_subsample_##bd##_##sub##_16x4_ssse3,  /* 16x4  */                    \
     41      cfl_subsample_##bd##_##sub##_8x32_ssse3,  /* 8x32  */                    \
     42      cfl_subsample_##bd##_##sub##_32x8_avx2,   /* 32x8  */                    \
     43      NULL,                                     /* 16x64 (invalid CFL size) */ \
     44      NULL,                                     /* 64x16 (invalid CFL size) */ \
     45    };                                                                         \
     46    return subfn_##sub[tx_size];                                               \
     47  }
     48 
     49 /**
     50 * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more
     51 * precise version of a box filter 4:2:0 pixel subsampling in Q3.
     52 *
     53 * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
     54 * active area is specified using width and height.
     55 *
     56 * Note: We don't need to worry about going over the active area, as long as we
     57 * stay inside the CfL prediction buffer.
     58 *
     59 * Note: For 4:2:0 luma subsampling, the width will never be greater than 16.
     60 */
     61 static void cfl_luma_subsampling_420_lbd_avx2(const uint8_t *input,
     62                                              int input_stride,
     63                                              uint16_t *pred_buf_q3, int width,
     64                                              int height) {
     65  (void)width;                               // Forever 32
     66  const __m256i twos = _mm256_set1_epi8(2);  // Thirty two twos
     67  const int luma_stride = input_stride << 1;
     68  __m256i *row = (__m256i *)pred_buf_q3;
     69  const __m256i *row_end = row + (height >> 1) * CFL_BUF_LINE_I256;
     70  do {
     71    __m256i top = _mm256_loadu_si256((__m256i *)input);
     72    __m256i bot = _mm256_loadu_si256((__m256i *)(input + input_stride));
     73 
     74    __m256i top_16x16 = _mm256_maddubs_epi16(top, twos);
     75    __m256i bot_16x16 = _mm256_maddubs_epi16(bot, twos);
     76    __m256i sum_16x16 = _mm256_add_epi16(top_16x16, bot_16x16);
     77 
     78    _mm256_storeu_si256(row, sum_16x16);
     79 
     80    input += luma_stride;
     81  } while ((row += CFL_BUF_LINE_I256) < row_end);
     82 }
     83 
     84 CFL_GET_SUBSAMPLE_FUNCTION_AVX2(420, lbd)
     85 
     86 /**
     87 * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more
     88 * precise version of a box filter 4:2:2 pixel subsampling in Q3.
     89 *
     90 * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
     91 * active area is specified using width and height.
     92 *
     93 * Note: We don't need to worry about going over the active area, as long as we
     94 * stay inside the CfL prediction buffer.
     95 */
     96 static void cfl_luma_subsampling_422_lbd_avx2(const uint8_t *input,
     97                                              int input_stride,
     98                                              uint16_t *pred_buf_q3, int width,
     99                                              int height) {
    100  (void)width;                                // Forever 32
    101  const __m256i fours = _mm256_set1_epi8(4);  // Thirty two fours
    102  __m256i *row = (__m256i *)pred_buf_q3;
    103  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
    104  do {
    105    __m256i top = _mm256_loadu_si256((__m256i *)input);
    106    __m256i top_16x16 = _mm256_maddubs_epi16(top, fours);
    107    _mm256_storeu_si256(row, top_16x16);
    108    input += input_stride;
    109  } while ((row += CFL_BUF_LINE_I256) < row_end);
    110 }
    111 
    112 CFL_GET_SUBSAMPLE_FUNCTION_AVX2(422, lbd)
    113 
    114 /**
    115 * Multiplies the pixels by 8 (scaling in Q3). The AVX2 subsampling is only
    116 * performed on block of width 32.
    117 *
    118 * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
    119 * active area is specified using width and height.
    120 *
    121 * Note: We don't need to worry about going over the active area, as long as we
    122 * stay inside the CfL prediction buffer.
    123 */
    124 static void cfl_luma_subsampling_444_lbd_avx2(const uint8_t *input,
    125                                              int input_stride,
    126                                              uint16_t *pred_buf_q3, int width,
    127                                              int height) {
    128  (void)width;  // Forever 32
    129  __m256i *row = (__m256i *)pred_buf_q3;
    130  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
    131  const __m256i zeros = _mm256_setzero_si256();
    132  do {
    133    __m256i top = _mm256_loadu_si256((__m256i *)input);
    134    top = _mm256_permute4x64_epi64(top, _MM_SHUFFLE(3, 1, 2, 0));
    135 
    136    __m256i row_lo = _mm256_unpacklo_epi8(top, zeros);
    137    row_lo = _mm256_slli_epi16(row_lo, 3);
    138    __m256i row_hi = _mm256_unpackhi_epi8(top, zeros);
    139    row_hi = _mm256_slli_epi16(row_hi, 3);
    140 
    141    _mm256_storeu_si256(row, row_lo);
    142    _mm256_storeu_si256(row + 1, row_hi);
    143 
    144    input += input_stride;
    145  } while ((row += CFL_BUF_LINE_I256) < row_end);
    146 }
    147 
    148 CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, lbd)
    149 
    150 #if CONFIG_AV1_HIGHBITDEPTH
    151 /**
    152 * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more
    153 * precise version of a box filter 4:2:0 pixel subsampling in Q3.
    154 *
    155 * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
    156 * active area is specified using width and height.
    157 *
    158 * Note: We don't need to worry about going over the active area, as long as we
    159 * stay inside the CfL prediction buffer.
    160 *
    161 * Note: For 4:2:0 luma subsampling, the width will never be greater than 16.
    162 */
    163 static void cfl_luma_subsampling_420_hbd_avx2(const uint16_t *input,
    164                                              int input_stride,
    165                                              uint16_t *pred_buf_q3, int width,
    166                                              int height) {
    167  (void)width;  // Forever 32
    168  const int luma_stride = input_stride << 1;
    169  __m256i *row = (__m256i *)pred_buf_q3;
    170  const __m256i *row_end = row + (height >> 1) * CFL_BUF_LINE_I256;
    171  do {
    172    __m256i top = _mm256_loadu_si256((__m256i *)input);
    173    __m256i bot = _mm256_loadu_si256((__m256i *)(input + input_stride));
    174    __m256i sum = _mm256_add_epi16(top, bot);
    175 
    176    __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16));
    177    __m256i bot_1 = _mm256_loadu_si256((__m256i *)(input + 16 + input_stride));
    178    __m256i sum_1 = _mm256_add_epi16(top_1, bot_1);
    179 
    180    __m256i hsum = _mm256_hadd_epi16(sum, sum_1);
    181    hsum = _mm256_permute4x64_epi64(hsum, _MM_SHUFFLE(3, 1, 2, 0));
    182    hsum = _mm256_add_epi16(hsum, hsum);
    183 
    184    _mm256_storeu_si256(row, hsum);
    185 
    186    input += luma_stride;
    187  } while ((row += CFL_BUF_LINE_I256) < row_end);
    188 }
    189 
    190 CFL_GET_SUBSAMPLE_FUNCTION_AVX2(420, hbd)
    191 
    192 /**
    193 * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more
    194 * precise version of a box filter 4:2:2 pixel subsampling in Q3.
    195 *
    196 * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
    197 * active area is specified using width and height.
    198 *
    199 * Note: We don't need to worry about going over the active area, as long as we
    200 * stay inside the CfL prediction buffer.
    201 *
    202 */
    203 static void cfl_luma_subsampling_422_hbd_avx2(const uint16_t *input,
    204                                              int input_stride,
    205                                              uint16_t *pred_buf_q3, int width,
    206                                              int height) {
    207  (void)width;  // Forever 32
    208  __m256i *row = (__m256i *)pred_buf_q3;
    209  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
    210  do {
    211    __m256i top = _mm256_loadu_si256((__m256i *)input);
    212    __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16));
    213    __m256i hsum = _mm256_hadd_epi16(top, top_1);
    214    hsum = _mm256_permute4x64_epi64(hsum, _MM_SHUFFLE(3, 1, 2, 0));
    215    hsum = _mm256_slli_epi16(hsum, 2);
    216 
    217    _mm256_storeu_si256(row, hsum);
    218 
    219    input += input_stride;
    220  } while ((row += CFL_BUF_LINE_I256) < row_end);
    221 }
    222 
    223 CFL_GET_SUBSAMPLE_FUNCTION_AVX2(422, hbd)
    224 
    225 static void cfl_luma_subsampling_444_hbd_avx2(const uint16_t *input,
    226                                              int input_stride,
    227                                              uint16_t *pred_buf_q3, int width,
    228                                              int height) {
    229  (void)width;  // Forever 32
    230  __m256i *row = (__m256i *)pred_buf_q3;
    231  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
    232  do {
    233    __m256i top = _mm256_loadu_si256((__m256i *)input);
    234    __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16));
    235    _mm256_storeu_si256(row, _mm256_slli_epi16(top, 3));
    236    _mm256_storeu_si256(row + 1, _mm256_slli_epi16(top_1, 3));
    237    input += input_stride;
    238  } while ((row += CFL_BUF_LINE_I256) < row_end);
    239 }
    240 
    241 CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, hbd)
    242 #endif  // CONFIG_AV1_HIGHBITDEPTH
    243 
    244 static inline __m256i predict_unclipped(const __m256i *input, __m256i alpha_q12,
    245                                        __m256i alpha_sign, __m256i dc_q0) {
    246  __m256i ac_q3 = _mm256_loadu_si256(input);
    247  __m256i ac_sign = _mm256_sign_epi16(alpha_sign, ac_q3);
    248  __m256i scaled_luma_q0 =
    249      _mm256_mulhrs_epi16(_mm256_abs_epi16(ac_q3), alpha_q12);
    250  scaled_luma_q0 = _mm256_sign_epi16(scaled_luma_q0, ac_sign);
    251  return _mm256_add_epi16(scaled_luma_q0, dc_q0);
    252 }
    253 
    254 static inline void cfl_predict_lbd_avx2(const int16_t *pred_buf_q3,
    255                                        uint8_t *dst, int dst_stride,
    256                                        int alpha_q3, int width, int height) {
    257  (void)width;
    258  const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3);
    259  const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9);
    260  const __m256i dc_q0 = _mm256_set1_epi16(*dst);
    261  __m256i *row = (__m256i *)pred_buf_q3;
    262  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
    263 
    264  do {
    265    __m256i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
    266    __m256i next = predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
    267    res = _mm256_packus_epi16(res, next);
    268    res = _mm256_permute4x64_epi64(res, _MM_SHUFFLE(3, 1, 2, 0));
    269    _mm256_storeu_si256((__m256i *)dst, res);
    270    dst += dst_stride;
    271  } while ((row += CFL_BUF_LINE_I256) < row_end);
    272 }
    273 
    274 CFL_PREDICT_X(avx2, 32, 8, lbd)
    275 CFL_PREDICT_X(avx2, 32, 16, lbd)
    276 CFL_PREDICT_X(avx2, 32, 32, lbd)
    277 
    278 cfl_predict_lbd_fn cfl_get_predict_lbd_fn_avx2(TX_SIZE tx_size) {
    279  static const cfl_predict_lbd_fn pred[TX_SIZES_ALL] = {
    280    cfl_predict_lbd_4x4_ssse3,   /* 4x4 */
    281    cfl_predict_lbd_8x8_ssse3,   /* 8x8 */
    282    cfl_predict_lbd_16x16_ssse3, /* 16x16 */
    283    cfl_predict_lbd_32x32_avx2,  /* 32x32 */
    284    NULL,                        /* 64x64 (invalid CFL size) */
    285    cfl_predict_lbd_4x8_ssse3,   /* 4x8 */
    286    cfl_predict_lbd_8x4_ssse3,   /* 8x4 */
    287    cfl_predict_lbd_8x16_ssse3,  /* 8x16 */
    288    cfl_predict_lbd_16x8_ssse3,  /* 16x8 */
    289    cfl_predict_lbd_16x32_ssse3, /* 16x32 */
    290    cfl_predict_lbd_32x16_avx2,  /* 32x16 */
    291    NULL,                        /* 32x64 (invalid CFL size) */
    292    NULL,                        /* 64x32 (invalid CFL size) */
    293    cfl_predict_lbd_4x16_ssse3,  /* 4x16  */
    294    cfl_predict_lbd_16x4_ssse3,  /* 16x4  */
    295    cfl_predict_lbd_8x32_ssse3,  /* 8x32  */
    296    cfl_predict_lbd_32x8_avx2,   /* 32x8  */
    297    NULL,                        /* 16x64 (invalid CFL size) */
    298    NULL,                        /* 64x16 (invalid CFL size) */
    299  };
    300  // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the
    301  // function pointer array out of bounds.
    302  return pred[tx_size % TX_SIZES_ALL];
    303 }
    304 
    305 #if CONFIG_AV1_HIGHBITDEPTH
    306 static __m256i highbd_max_epi16(int bd) {
    307  const __m256i neg_one = _mm256_set1_epi16(-1);
    308  // (1 << bd) - 1 => -(-1 << bd) -1 => -1 - (-1 << bd) => -1 ^ (-1 << bd)
    309  return _mm256_xor_si256(_mm256_slli_epi16(neg_one, bd), neg_one);
    310 }
    311 
    312 static __m256i highbd_clamp_epi16(__m256i u, __m256i zero, __m256i max) {
    313  return _mm256_max_epi16(_mm256_min_epi16(u, max), zero);
    314 }
    315 
    316 static inline void cfl_predict_hbd_avx2(const int16_t *pred_buf_q3,
    317                                        uint16_t *dst, int dst_stride,
    318                                        int alpha_q3, int bd, int width,
    319                                        int height) {
    320  // Use SSSE3 version for smaller widths
    321  assert(width == 16 || width == 32);
    322  const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3);
    323  const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9);
    324  const __m256i dc_q0 = _mm256_loadu_si256((__m256i *)dst);
    325  const __m256i max = highbd_max_epi16(bd);
    326 
    327  __m256i *row = (__m256i *)pred_buf_q3;
    328  const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
    329  do {
    330    const __m256i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
    331    _mm256_storeu_si256((__m256i *)dst,
    332                        highbd_clamp_epi16(res, _mm256_setzero_si256(), max));
    333    if (width == 32) {
    334      const __m256i res_1 =
    335          predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
    336      _mm256_storeu_si256(
    337          (__m256i *)(dst + 16),
    338          highbd_clamp_epi16(res_1, _mm256_setzero_si256(), max));
    339    }
    340    dst += dst_stride;
    341  } while ((row += CFL_BUF_LINE_I256) < row_end);
    342 }
    343 
    344 CFL_PREDICT_X(avx2, 16, 4, hbd)
    345 CFL_PREDICT_X(avx2, 16, 8, hbd)
    346 CFL_PREDICT_X(avx2, 16, 16, hbd)
    347 CFL_PREDICT_X(avx2, 16, 32, hbd)
    348 CFL_PREDICT_X(avx2, 32, 8, hbd)
    349 CFL_PREDICT_X(avx2, 32, 16, hbd)
    350 CFL_PREDICT_X(avx2, 32, 32, hbd)
    351 
    352 cfl_predict_hbd_fn cfl_get_predict_hbd_fn_avx2(TX_SIZE tx_size) {
    353  static const cfl_predict_hbd_fn pred[TX_SIZES_ALL] = {
    354    cfl_predict_hbd_4x4_ssse3,  /* 4x4 */
    355    cfl_predict_hbd_8x8_ssse3,  /* 8x8 */
    356    cfl_predict_hbd_16x16_avx2, /* 16x16 */
    357    cfl_predict_hbd_32x32_avx2, /* 32x32 */
    358    NULL,                       /* 64x64 (invalid CFL size) */
    359    cfl_predict_hbd_4x8_ssse3,  /* 4x8 */
    360    cfl_predict_hbd_8x4_ssse3,  /* 8x4 */
    361    cfl_predict_hbd_8x16_ssse3, /* 8x16 */
    362    cfl_predict_hbd_16x8_avx2,  /* 16x8 */
    363    cfl_predict_hbd_16x32_avx2, /* 16x32 */
    364    cfl_predict_hbd_32x16_avx2, /* 32x16 */
    365    NULL,                       /* 32x64 (invalid CFL size) */
    366    NULL,                       /* 64x32 (invalid CFL size) */
    367    cfl_predict_hbd_4x16_ssse3, /* 4x16  */
    368    cfl_predict_hbd_16x4_avx2,  /* 16x4  */
    369    cfl_predict_hbd_8x32_ssse3, /* 8x32  */
    370    cfl_predict_hbd_32x8_avx2,  /* 32x8  */
    371    NULL,                       /* 16x64 (invalid CFL size) */
    372    NULL,                       /* 64x16 (invalid CFL size) */
    373  };
    374  // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the
    375  // function pointer array out of bounds.
    376  return pred[tx_size % TX_SIZES_ALL];
    377 }
    378 #endif  // CONFIG_AV1_HIGHBITDEPTH
    379 
    380 // Returns a vector where all the (32-bits) elements are the sum of all the
    381 // lanes in a.
    382 static inline __m256i fill_sum_epi32(__m256i a) {
    383  // Given that a == [A, B, C, D, E, F, G, H]
    384  a = _mm256_hadd_epi32(a, a);
    385  // Given that A' == A + B, C' == C + D, E' == E + F, G' == G + H
    386  // a == [A', C', A', C', E', G', E', G']
    387  a = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0));
    388  // a == [A', C', E', G', A', C', E', G']
    389  a = _mm256_hadd_epi32(a, a);
    390  // Given that A'' == A' + C' and E'' == E' + G'
    391  // a == [A'', E'', A'', E'', A'', E'', A'', E'']
    392  return _mm256_hadd_epi32(a, a);
    393  // Given that A''' == A'' + E''
    394  // a == [A''', A''', A''', A''', A''', A''', A''', A''']
    395 }
    396 
    397 static inline __m256i _mm256_addl_epi16(__m256i a) {
    398  return _mm256_add_epi32(_mm256_unpacklo_epi16(a, _mm256_setzero_si256()),
    399                          _mm256_unpackhi_epi16(a, _mm256_setzero_si256()));
    400 }
    401 
    402 static inline void subtract_average_avx2(const uint16_t *src_ptr,
    403                                         int16_t *dst_ptr, int width,
    404                                         int height, int round_offset,
    405                                         int num_pel_log2) {
    406  // Use SSE2 version for smaller widths
    407  assert(width == 16 || width == 32);
    408 
    409  const __m256i *src = (__m256i *)src_ptr;
    410  const __m256i *const end = src + height * CFL_BUF_LINE_I256;
    411  // To maximize usage of the AVX2 registers, we sum two rows per loop
    412  // iteration
    413  const int step = 2 * CFL_BUF_LINE_I256;
    414 
    415  __m256i sum = _mm256_setzero_si256();
    416  // For width 32, we use a second sum accumulator to reduce accumulator
    417  // dependencies in the loop.
    418  __m256i sum2;
    419  if (width == 32) sum2 = _mm256_setzero_si256();
    420 
    421  do {
    422    // Add top row to the bottom row
    423    __m256i l0 = _mm256_add_epi16(_mm256_loadu_si256(src),
    424                                  _mm256_loadu_si256(src + CFL_BUF_LINE_I256));
    425    sum = _mm256_add_epi32(sum, _mm256_addl_epi16(l0));
    426    if (width == 32) { /* Don't worry, this if it gets optimized out. */
    427      // Add the second part of the top row to the second part of the bottom row
    428      __m256i l1 =
    429          _mm256_add_epi16(_mm256_loadu_si256(src + 1),
    430                           _mm256_loadu_si256(src + 1 + CFL_BUF_LINE_I256));
    431      sum2 = _mm256_add_epi32(sum2, _mm256_addl_epi16(l1));
    432    }
    433    src += step;
    434  } while (src < end);
    435  // Combine both sum accumulators
    436  if (width == 32) sum = _mm256_add_epi32(sum, sum2);
    437 
    438  __m256i fill = fill_sum_epi32(sum);
    439 
    440  __m256i avg_epi16 = _mm256_srli_epi32(
    441      _mm256_add_epi32(fill, _mm256_set1_epi32(round_offset)), num_pel_log2);
    442  avg_epi16 = _mm256_packs_epi32(avg_epi16, avg_epi16);
    443 
    444  // Store and subtract loop
    445  src = (__m256i *)src_ptr;
    446  __m256i *dst = (__m256i *)dst_ptr;
    447  do {
    448    _mm256_storeu_si256(dst,
    449                        _mm256_sub_epi16(_mm256_loadu_si256(src), avg_epi16));
    450    if (width == 32) {
    451      _mm256_storeu_si256(
    452          dst + 1, _mm256_sub_epi16(_mm256_loadu_si256(src + 1), avg_epi16));
    453    }
    454    src += CFL_BUF_LINE_I256;
    455    dst += CFL_BUF_LINE_I256;
    456  } while (src < end);
    457 }
    458 
    459 // Declare wrappers for AVX2 sizes
    460 CFL_SUB_AVG_X(avx2, 16, 4, 32, 6)
    461 CFL_SUB_AVG_X(avx2, 16, 8, 64, 7)
    462 CFL_SUB_AVG_X(avx2, 16, 16, 128, 8)
    463 CFL_SUB_AVG_X(avx2, 16, 32, 256, 9)
    464 CFL_SUB_AVG_X(avx2, 32, 8, 128, 8)
    465 CFL_SUB_AVG_X(avx2, 32, 16, 256, 9)
    466 CFL_SUB_AVG_X(avx2, 32, 32, 512, 10)
    467 
    468 // Based on the observation that for small blocks AVX2 does not outperform
    469 // SSE2, we call the SSE2 code for block widths 4 and 8.
    470 cfl_subtract_average_fn cfl_get_subtract_average_fn_avx2(TX_SIZE tx_size) {
    471  static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = {
    472    cfl_subtract_average_4x4_sse2,   /* 4x4 */
    473    cfl_subtract_average_8x8_sse2,   /* 8x8 */
    474    cfl_subtract_average_16x16_avx2, /* 16x16 */
    475    cfl_subtract_average_32x32_avx2, /* 32x32 */
    476    NULL,                            /* 64x64 (invalid CFL size) */
    477    cfl_subtract_average_4x8_sse2,   /* 4x8 */
    478    cfl_subtract_average_8x4_sse2,   /* 8x4 */
    479    cfl_subtract_average_8x16_sse2,  /* 8x16 */
    480    cfl_subtract_average_16x8_avx2,  /* 16x8 */
    481    cfl_subtract_average_16x32_avx2, /* 16x32 */
    482    cfl_subtract_average_32x16_avx2, /* 32x16 */
    483    NULL,                            /* 32x64 (invalid CFL size) */
    484    NULL,                            /* 64x32 (invalid CFL size) */
    485    cfl_subtract_average_4x16_sse2,  /* 4x16 */
    486    cfl_subtract_average_16x4_avx2,  /* 16x4 */
    487    cfl_subtract_average_8x32_sse2,  /* 8x32 */
    488    cfl_subtract_average_32x8_avx2,  /* 32x8 */
    489    NULL,                            /* 16x64 (invalid CFL size) */
    490    NULL,                            /* 64x16 (invalid CFL size) */
    491  };
    492  // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to
    493  // index the function pointer array out of bounds.
    494  return sub_avg[tx_size % TX_SIZES_ALL];
    495 }