tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

variance_impl_avx2.c (67415B)


      1 /*
      2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <immintrin.h>  // AVX2
     13 
     14 #include "config/aom_dsp_rtcd.h"
     15 
     16 #include "aom_ports/mem.h"
     17 
/* clang-format off */
// Bilinear filter coefficient table for the AVX2 sub-pixel variance kernels.
// There are 8 sub-pel phases; phase k uses the tap pair (16 - 2k, 2k), which
// sums to 16 so filtered values are normalized by round-then-shift-by-4
// (see FILTER_SRC). Each phase occupies 32 bytes -- one full __m256i of the
// pair replicated as interleaved (even, odd) byte coefficients, ready for
// _mm256_maddubs_epi16 -- and is selected by indexing with `offset << 5`.
// Only the first 256 bytes are initialized; the remaining 256 bytes of the
// 512-byte declaration are zero padding.
DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
  16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0,
  16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0,
  14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2,
  14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2,
  12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4,
  12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4,
  10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6,
  10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6,
   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
   6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,
   6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,
   4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,
   4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,
   2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,
   2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,
};
/* clang-format on */
     38 
// Apply the bilinear `filter` to the interleaved byte pairs held in the
// enclosing scope's exp_src_lo/exp_src_hi (as produced by MERGE_WITH_SRC or
// MERGE_NEXT_SRC): multiply-add each (a, b) byte pair by the tap pair, then
// round (add 8) and normalize (shift right by 4, since the taps sum to 16).
// Results are 16-bit per lane. Requires `pw8` (set1_epi16(8)) in scope.
#define FILTER_SRC(filter)                               \
  /* filter the source */                                \
  exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
  exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
                                                         \
  /* add 8 to source */                                  \
  exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);        \
  exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);        \
                                                         \
  /* divide source by 16 */                              \
  exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);         \
  exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
     51 
// Interleave the bytes of src_reg with those of reg into the enclosing
// scope's exp_src_lo/exp_src_hi. With reg == zero_reg this zero-extends each
// byte to 16 bits; with a second source it forms (a, b) pairs for FILTER_SRC.
#define MERGE_WITH_SRC(src_reg, reg)                \
  exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
  exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);
     55 
// Load 32 bytes of source and 32 bytes of destination (unaligned) from the
// enclosing scope's `src`/`dst` pointers into src_reg/dst_reg.
#define LOAD_SRC_DST                                     \
  /* load source and destination */                      \
  src_reg = _mm256_loadu_si256((__m256i const *)(src));  \
  dst_reg = _mm256_loadu_si256((__m256i const *)(dst));
     60 
// Half-pel interpolation: load the source 32 bytes at `src + size_stride`
// (size_stride == 1 for horizontal half-pel, == src_stride for vertical) and
// replace src_reg with the rounded byte-wise average of the two loads.
#define AVG_NEXT_SRC(src_reg, size_stride)                                  \
  src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
  /* average between current and next stride source */                     \
  src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
     65 
// Load the neighbor source at `src + size_stride` and interleave it with
// src_reg into exp_src_lo/exp_src_hi, forming the (a, b) pairs FILTER_SRC
// consumes for general bilinear filtering.
#define MERGE_NEXT_SRC(src_reg, size_stride)                                \
  src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
  MERGE_WITH_SRC(src_reg, src_next_reg)
     69 
// Per-row accumulation: zero-extend dst_reg bytes to 16 bits, subtract them
// from the (already filtered/expanded) source in exp_src_lo/exp_src_hi, then
// fold the signed 16-bit differences into sum_reg and their squares (via
// madd, producing 32-bit pairs) into sse_reg. sum_reg stays a 16-bit
// accumulator here; it is widened to 32 bits later by CALC_SUM_AND_SSE.
#define CALC_SUM_SSE_INSIDE_LOOP                           \
  /* expand each byte to 2 bytes */                        \
  exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);    \
  exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);    \
  /* source - dest */                                      \
  exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);   \
  exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);   \
  /* calculate sum */                                      \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);         \
  exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);  \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);         \
  exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);  \
  /* calculate sse */                                      \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);         \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
     85 
// Final horizontal reduction of sum_reg (16-bit lanes) and sse_reg (32-bit
// lanes) into the scalars `sum` and `*sse`. The cmpgt-against-zero mask
// provides the high halves needed to sign-extend each 16-bit sum lane to
// 32 bits via unpack. Repeated shift-and-add folds each 128-bit lane down to
// its low element, and the final scalar is low lane + extracted high lane.
#define CALC_SUM_AND_SSE                                                    \
  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg);                         \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 8);                              \
  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp);                    \
  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp);                    \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);                         \
  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi);                      \
                                                                           \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 4);                              \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 8);                              \
                                                                           \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);                         \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);                         \
  *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) +     \
                  _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 4);                              \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);                         \
  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) +               \
        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
    106 
// Functions related to sub pixel variance width 16.
// Load two 16-byte rows of source and of destination, placing row 0 in the
// low 128-bit lane and row 1 (one stride down) in the high lane, so the
// width-16 kernels process two rows per __m256i iteration.
#define LOAD_SRC_DST_INSERT(src_stride, dst_stride)               \
  /* load source and destination of 2 rows and insert*/           \
  src_reg = _mm256_inserti128_si256(                              \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src))),  \
      _mm_loadu_si128((__m128i *)(src + src_stride)), 1);         \
  dst_reg = _mm256_inserti128_si256(                              \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dst))),  \
      _mm_loadu_si128((__m128i *)(dst + dst_stride)), 1);
    116 
// Two-row half-pel averaging for width 16: build a register holding the rows
// at offsets size_stride and 2*size_stride (low/high lanes) and average it
// byte-wise into src_reg (which holds rows 0 and size_stride).
#define AVG_NEXT_SRC_INSERT(src_reg, size_stride)                               \
  src_next_reg = _mm256_inserti128_si256(                                      \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + size_stride))), \
      _mm_loadu_si128((__m128i *)(src + (size_stride << 1))), 1);              \
  /* average between current and next stride source */                         \
  src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
    123 
// Two-row merge for width-16 bilinear filtering: load the neighbor rows at
// size_stride and src_stride + size_stride (low/high lanes) and interleave
// them with src_reg into exp_src_lo/exp_src_hi. NOTE: unlike the first load,
// the second load captures `src_stride` from the enclosing scope, not the
// macro parameter -- the high lane is always the next row's neighbor.
#define MERGE_NEXT_SRC_INSERT(src_reg, size_stride)                             \
  src_next_reg = _mm256_inserti128_si256(                                      \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + size_stride))), \
      _mm_loadu_si128((__m128i *)(src + (src_stride + size_stride))), 1);      \
  MERGE_WITH_SRC(src_reg, src_next_reg)
    129 
// For horizontal filtering on width 16: load rows 0 and src_stride into
// src_reg (low/high lanes) and the same two rows shifted right by one byte
// into src_next_reg. Captures `src` and `src_stride` from the enclosing
// scope.
#define LOAD_SRC_NEXT_BYTE_INSERT                                     \
  /* load source and another source from next row   */                \
  src_reg = _mm256_inserti128_si256(                                  \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src))),      \
      _mm_loadu_si128((__m128i *)(src + src_stride)), 1);             \
  /* load source and next row source from 1 byte onwards   */         \
  src_next_reg = _mm256_inserti128_si256(                             \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + 1))),  \
      _mm_loadu_si128((__m128i *)(src + src_stride + 1)), 1);
    139 
// Load two 16-byte destination rows (dst and dst + dst_stride) into the
// low/high lanes of dst_reg. Captures `dst` and `dst_stride` from the
// enclosing scope.
#define LOAD_DST_INSERT                                           \
  dst_reg = _mm256_inserti128_si256(                              \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dst))),  \
      _mm_loadu_si128((__m128i *)(dst + dst_stride)), 1);
    144 
// 128-bit tail helper: load one source row and its one-byte-right neighbor,
// interleave them into src_lo/src_hi, and narrow `filter`/`pw8` to their low
// 128-bit halves for FILTER_SRC_128BIT. Declares new locals, so it must be
// expanded where declarations are valid (e.g. at the top of a block).
#define LOAD_SRC_MERGE_128BIT(filter)                         \
  __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src));      \
  __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1));  \
  __m128i src_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1);   \
  __m128i src_hi = _mm_unpackhi_epi8(src_reg_0, src_reg_1);   \
  __m128i filter_128bit = _mm256_castsi256_si128(filter);     \
  __m128i pw8_128bit = _mm256_castsi256_si128(pw8);
    152 
// 128-bit counterpart of FILTER_SRC: multiply-add the byte pairs in
// src_lo/src_hi by `filter`, then round (add 8) and normalize (>> 4).
// Requires pw8_128bit in scope (see LOAD_SRC_MERGE_128BIT).
#define FILTER_SRC_128BIT(filter)              \
  /* filter the source */                      \
  src_lo = _mm_maddubs_epi16(src_lo, filter);  \
  src_hi = _mm_maddubs_epi16(src_hi, filter);  \
                                               \
  /* add 8 to source */                        \
  src_lo = _mm_add_epi16(src_lo, pw8_128bit);  \
  src_hi = _mm_add_epi16(src_hi, pw8_128bit);  \
                                               \
  /* divide source by 16 */                    \
  src_lo = _mm_srai_epi16(src_lo, 4);          \
  src_hi = _mm_srai_epi16(src_hi, 4);
    165 
// TODO(chiyotsai@google.com): These variance functions are macro-fied so we
// don't have to manually optimize the individual for-loops. We could save some
// binary size by optimizing the loops more carefully without duplicating the
// codes with a macro.
//
// Emits two functions for a 32-wide, `height`-tall block:
//  - aom_sub_pixel_variance32x<height>_imp_avx2(): returns the sum of
//    (filtered src - dst) differences and writes the sum of squared
//    differences to *sse. x_offset/y_offset select the sub-pel phase per
//    axis: 0 = no filtering, 4 = half-pel (byte average), anything else =
//    bilinear filter from bilinear_filters_avx2 (indexed by offset << 5;
//    assumes offsets are in [0, 7] -- TODO confirm caller contract).
//    For non-zero y phases, the previous row's result is carried across
//    iterations (src_avg / src_pack) so each row is loaded only once.
//  - aom_sub_pixel_variance32x<height>_avx2(): the public entry point;
//    converts (sse, sum) to variance as sse - sum^2 / (32 * height), where
//    5 + log2height == log2(32 * height).
#define MAKE_SUB_PIXEL_VAR_32XH(height, log2height)                           \
  static inline int aom_sub_pixel_variance32x##height##_imp_avx2(             \
      const uint8_t *src, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *dst, int dst_stride, unsigned int *sse) {                \
    __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; \
    __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;    \
    __m256i zero_reg;                                                         \
    int i, sum;                                                               \
    sum_reg = _mm256_setzero_si256();                                         \
    sse_reg = _mm256_setzero_si256();                                         \
    zero_reg = _mm256_setzero_si256();                                        \
                                                                              \
    /* x_offset = 0 and y_offset = 0 */                                       \
    if (x_offset == 0) {                                                      \
      if (y_offset == 0) {                                                    \
        for (i = 0; i < height; i++) {                                        \
          LOAD_SRC_DST                                                        \
          /* expand each byte to 2 bytes */                                   \
          MERGE_WITH_SRC(src_reg, zero_reg)                                   \
          CALC_SUM_SSE_INSIDE_LOOP                                            \
          src += src_stride;                                                  \
          dst += dst_stride;                                                  \
        }                                                                     \
        /* x_offset = 0 and y_offset = 4 */                                   \
      } else if (y_offset == 4) {                                             \
        __m256i src_next_reg;                                                 \
        for (i = 0; i < height; i++) {                                        \
          LOAD_SRC_DST                                                        \
          AVG_NEXT_SRC(src_reg, src_stride)                                   \
          /* expand each byte to 2 bytes */                                   \
          MERGE_WITH_SRC(src_reg, zero_reg)                                   \
          CALC_SUM_SSE_INSIDE_LOOP                                            \
          src += src_stride;                                                  \
          dst += dst_stride;                                                  \
        }                                                                     \
        /* x_offset = 0 and y_offset = bilin interpolation */                 \
      } else {                                                                \
        __m256i filter, pw8, src_next_reg;                                    \
                                                                              \
        y_offset <<= 5;                                                       \
        filter = _mm256_load_si256(                                           \
            (__m256i const *)(bilinear_filters_avx2 + y_offset));             \
        pw8 = _mm256_set1_epi16(8);                                           \
        for (i = 0; i < height; i++) {                                        \
          LOAD_SRC_DST                                                        \
          MERGE_NEXT_SRC(src_reg, src_stride)                                 \
          FILTER_SRC(filter)                                                  \
          CALC_SUM_SSE_INSIDE_LOOP                                            \
          src += src_stride;                                                  \
          dst += dst_stride;                                                  \
        }                                                                     \
      }                                                                       \
      /* x_offset = 4  and y_offset = 0 */                                    \
    } else if (x_offset == 4) {                                               \
      if (y_offset == 0) {                                                    \
        __m256i src_next_reg;                                                 \
        for (i = 0; i < height; i++) {                                        \
          LOAD_SRC_DST                                                        \
          AVG_NEXT_SRC(src_reg, 1)                                            \
          /* expand each byte to 2 bytes */                                   \
          MERGE_WITH_SRC(src_reg, zero_reg)                                   \
          CALC_SUM_SSE_INSIDE_LOOP                                            \
          src += src_stride;                                                  \
          dst += dst_stride;                                                  \
        }                                                                     \
        /* x_offset = 4  and y_offset = 4 */                                  \
      } else if (y_offset == 4) {                                             \
        __m256i src_next_reg, src_avg;                                        \
        /* load source and another source starting from the next */           \
        /* following byte */                                                  \
        src_reg = _mm256_loadu_si256((__m256i const *)(src));                 \
        AVG_NEXT_SRC(src_reg, 1)                                              \
        for (i = 0; i < height; i++) {                                        \
          src_avg = src_reg;                                                  \
          src += src_stride;                                                  \
          LOAD_SRC_DST                                                        \
          AVG_NEXT_SRC(src_reg, 1)                                            \
          /* average between previous average to current average */           \
          src_avg = _mm256_avg_epu8(src_avg, src_reg);                        \
          /* expand each byte to 2 bytes */                                   \
          MERGE_WITH_SRC(src_avg, zero_reg)                                   \
          /* save current source average */                                   \
          CALC_SUM_SSE_INSIDE_LOOP                                            \
          dst += dst_stride;                                                  \
        }                                                                     \
        /* x_offset = 4  and y_offset = bilin interpolation */                \
      } else {                                                                \
        __m256i filter, pw8, src_next_reg, src_avg;                           \
        y_offset <<= 5;                                                       \
        filter = _mm256_load_si256(                                           \
            (__m256i const *)(bilinear_filters_avx2 + y_offset));             \
        pw8 = _mm256_set1_epi16(8);                                           \
        /* load source and another source starting from the next */           \
        /* following byte */                                                  \
        src_reg = _mm256_loadu_si256((__m256i const *)(src));                 \
        AVG_NEXT_SRC(src_reg, 1)                                              \
        for (i = 0; i < height; i++) {                                        \
          /* save current source average */                                   \
          src_avg = src_reg;                                                  \
          src += src_stride;                                                  \
          LOAD_SRC_DST                                                        \
          AVG_NEXT_SRC(src_reg, 1)                                            \
          MERGE_WITH_SRC(src_avg, src_reg)                                    \
          FILTER_SRC(filter)                                                  \
          CALC_SUM_SSE_INSIDE_LOOP                                            \
          dst += dst_stride;                                                  \
        }                                                                     \
      }                                                                       \
      /* x_offset = bilin interpolation and y_offset = 0 */                   \
    } else {                                                                  \
      if (y_offset == 0) {                                                    \
        __m256i filter, pw8, src_next_reg;                                    \
        x_offset <<= 5;                                                       \
        filter = _mm256_load_si256(                                           \
            (__m256i const *)(bilinear_filters_avx2 + x_offset));             \
        pw8 = _mm256_set1_epi16(8);                                           \
        for (i = 0; i < height; i++) {                                        \
          LOAD_SRC_DST                                                        \
          MERGE_NEXT_SRC(src_reg, 1)                                          \
          FILTER_SRC(filter)                                                  \
          CALC_SUM_SSE_INSIDE_LOOP                                            \
          src += src_stride;                                                  \
          dst += dst_stride;                                                  \
        }                                                                     \
        /* x_offset = bilin interpolation and y_offset = 4 */                 \
      } else if (y_offset == 4) {                                             \
        __m256i filter, pw8, src_next_reg, src_pack;                          \
        x_offset <<= 5;                                                       \
        filter = _mm256_load_si256(                                           \
            (__m256i const *)(bilinear_filters_avx2 + x_offset));             \
        pw8 = _mm256_set1_epi16(8);                                           \
        src_reg = _mm256_loadu_si256((__m256i const *)(src));                 \
        MERGE_NEXT_SRC(src_reg, 1)                                            \
        FILTER_SRC(filter)                                                    \
        /* convert each 16 bit to 8 bit to each low and high lane source */   \
        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);               \
        for (i = 0; i < height; i++) {                                        \
          src += src_stride;                                                  \
          LOAD_SRC_DST                                                        \
          MERGE_NEXT_SRC(src_reg, 1)                                          \
          FILTER_SRC(filter)                                                  \
          src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);              \
          /* average between previous pack to the current */                  \
          src_pack = _mm256_avg_epu8(src_pack, src_reg);                      \
          MERGE_WITH_SRC(src_pack, zero_reg)                                  \
          CALC_SUM_SSE_INSIDE_LOOP                                            \
          src_pack = src_reg;                                                 \
          dst += dst_stride;                                                  \
        }                                                                     \
        /* x_offset = bilin interpolation and y_offset = bilin interpolation  \
         */                                                                   \
      } else {                                                                \
        __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;                \
        x_offset <<= 5;                                                       \
        xfilter = _mm256_load_si256(                                          \
            (__m256i const *)(bilinear_filters_avx2 + x_offset));             \
        y_offset <<= 5;                                                       \
        yfilter = _mm256_load_si256(                                          \
            (__m256i const *)(bilinear_filters_avx2 + y_offset));             \
        pw8 = _mm256_set1_epi16(8);                                           \
        /* load source and another source starting from the next */           \
        /* following byte */                                                  \
        src_reg = _mm256_loadu_si256((__m256i const *)(src));                 \
        MERGE_NEXT_SRC(src_reg, 1)                                            \
                                                                              \
        FILTER_SRC(xfilter)                                                   \
        /* convert each 16 bit to 8 bit to each low and high lane source */   \
        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);               \
        for (i = 0; i < height; i++) {                                        \
          src += src_stride;                                                  \
          LOAD_SRC_DST                                                        \
          MERGE_NEXT_SRC(src_reg, 1)                                          \
          FILTER_SRC(xfilter)                                                 \
          src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);              \
          /* merge previous pack to current pack source */                    \
          MERGE_WITH_SRC(src_pack, src_reg)                                   \
          /* filter the source */                                             \
          FILTER_SRC(yfilter)                                                 \
          src_pack = src_reg;                                                 \
          CALC_SUM_SSE_INSIDE_LOOP                                            \
          dst += dst_stride;                                                  \
        }                                                                     \
      }                                                                       \
    }                                                                         \
    CALC_SUM_AND_SSE                                                          \
    /* clear upper YMM state to avoid AVX-SSE transition penalties */         \
    _mm256_zeroupper();                                                       \
    return sum;                                                               \
  }                                                                           \
  unsigned int aom_sub_pixel_variance32x##height##_avx2(                      \
      const uint8_t *src, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *dst, int dst_stride, unsigned int *sse) {                \
    const int sum = aom_sub_pixel_variance32x##height##_imp_avx2(             \
        src, src_stride, x_offset, y_offset, dst, dst_stride, sse);           \
    return *sse - (unsigned int)(((int64_t)sum * sum) >> (5 + log2height));   \
  }
    365 
// Instantiate 32xH variance for H = 64, 32, 16; the second argument is
// log2(H), used for the sum^2 / (32 * H) normalization shift.
MAKE_SUB_PIXEL_VAR_32XH(64, 6)
MAKE_SUB_PIXEL_VAR_32XH(32, 5)
MAKE_SUB_PIXEL_VAR_32XH(16, 4)
    369 
// Build a w x h sub-pixel variance function by tiling the block into
// wf x hf pieces and delegating each tile to the 32xH `_imp` helper (wf must
// be 32 here). Per-tile sums and SSEs are accumulated, and the final
// variance is sse - se^2 / (w * h), with wlog2 + hlog2 == log2(w * h).
#define AOM_SUB_PIXEL_VAR_AVX2(w, h, wf, hf, wlog2, hlog2)                 \
  unsigned int aom_sub_pixel_variance##w##x##h##_avx2(                     \
      const uint8_t *src, int src_stride, int x_offset, int y_offset,      \
      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) {         \
    unsigned int sse = 0;                                                  \
    int se = 0;                                                            \
    /* iterate over wf-wide column strips, then hf-tall tiles within */    \
    for (int i = 0; i < (w / wf); ++i) {                                   \
      const uint8_t *src_ptr = src;                                        \
      const uint8_t *dst_ptr = dst;                                        \
      for (int j = 0; j < (h / hf); ++j) {                                 \
        unsigned int sse2;                                                 \
        const int se2 = aom_sub_pixel_variance##wf##x##hf##_imp_avx2(      \
            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride,  \
            &sse2);                                                        \
        dst_ptr += hf * dst_stride;                                        \
        src_ptr += hf * src_stride;                                        \
        se += se2;                                                         \
        sse += sse2;                                                       \
      }                                                                    \
      src += wf;                                                           \
      dst += wf;                                                           \
    }                                                                      \
    *sse_ptr = sse;                                                        \
    return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2));    \
  }

// Note: hf = AOMMIN(h, 64) to avoid overflow in helper by capping height.
AOM_SUB_PIXEL_VAR_AVX2(128, 128, 32, 64, 7, 7)
AOM_SUB_PIXEL_VAR_AVX2(128, 64, 32, 64, 7, 6)
AOM_SUB_PIXEL_VAR_AVX2(64, 128, 32, 64, 6, 7)
AOM_SUB_PIXEL_VAR_AVX2(64, 64, 32, 64, 6, 6)
AOM_SUB_PIXEL_VAR_AVX2(64, 32, 32, 32, 6, 5)
    402 
/*
 * MAKE_SUB_PIXEL_VAR_16XH(height, log2height)
 *
 * Expands to the definition of aom_sub_pixel_variance16x<height>_avx2():
 * the AVX2 sub-pixel variance kernel for 16-pixel-wide blocks.
 *
 * Generated-function parameters:
 *   src, src_stride  - source block pointer and row stride (bytes)
 *   x_offset         - horizontal sub-pel phase, 0..7; per the
 *                      bilinear_filters_avx2 table each phase o selects the
 *                      tap pair (16 - 2*o, 2*o), so 0 = no filtering and
 *                      4 = equal-weight half-pel (done with _mm256_avg_epu8)
 *   y_offset         - vertical sub-pel phase, same encoding
 *   dst, dst_stride  - reference block pointer and row stride (bytes)
 *   sse              - out: sum of squared differences
 * Returns the variance: *sse - sum*sum / (16 * height); the division is the
 * right shift by (4 + log2height), so log2height MUST equal log2(height).
 *
 * Structure: a 3x3 dispatch on (x_offset, y_offset) in {0, 4, other}.
 *   - offset 0: plain rows; offset 4: byte-average of adjacent samples;
 *   - otherwise: load the 32-byte filter at bilinear_filters_avx2 +
 *     (offset << 5) and apply it via FILTER_SRC with pw8 rounding.
 * Two 16-byte rows are processed per iteration, each placed in one 128-bit
 * lane of a 256-bit register by the *_INSERT helper macros. The vertically
 * filtered/averaged cases carry a running src_avg/src_pack register across
 * iterations and finish the last two rows outside the loop with 128-bit
 * loads, apparently to avoid reading a source row below the block --
 * NOTE(review): confirm against the helper-macro definitions.
 *
 * NOTE(review): LOAD_SRC_DST_INSERT, LOAD_SRC_NEXT_BYTE_INSERT,
 * LOAD_DST_INSERT, MERGE_WITH_SRC, MERGE_NEXT_SRC(_INSERT), FILTER_SRC,
 * LOAD_SRC_MERGE_128BIT, FILTER_SRC_128BIT, CALC_SUM_SSE_INSIDE_LOOP and
 * CALC_SUM_AND_SSE are defined earlier in this file; the summary above is
 * inferred from their use here. CALC_SUM_AND_SSE is assumed to store the
 * accumulated SSE into *sse and the signed pixel-difference sum into `sum`.
 * Several of these helpers implicitly read/write the locals declared below
 * (src_reg, dst_reg, exp_*_lo/hi, sse_reg, sum_reg, pw8, src_next_reg,
 * filter_128bit, src_lo/src_hi, src_reg_0), which is why those names must
 * not be changed.
 */
    403 #define MAKE_SUB_PIXEL_VAR_16XH(height, log2height)                           \
    404  unsigned int aom_sub_pixel_variance16x##height##_avx2(                      \
    405      const uint8_t *src, int src_stride, int x_offset, int y_offset,         \
    406      const uint8_t *dst, int dst_stride, unsigned int *sse) {                \
    407    __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; \
    408    __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;    \
    409    __m256i zero_reg;                                                         \
    410    int i, sum;                                                               \
    411    sum_reg = _mm256_setzero_si256();                                         \
    412    sse_reg = _mm256_setzero_si256();                                         \
    413    zero_reg = _mm256_setzero_si256();                                        \
    414                                                                              \
    415    /* x_offset = 0 and y_offset = 0 */                                       \
    416    if (x_offset == 0) {                                                      \
    417      if (y_offset == 0) {                                                    \
    418        for (i = 0; i < height; i += 2) {                                     \
    419          LOAD_SRC_DST_INSERT(src_stride, dst_stride)                         \
    420          /* expend each byte to 2 bytes */                                   \
    421          MERGE_WITH_SRC(src_reg, zero_reg)                                   \
    422          CALC_SUM_SSE_INSIDE_LOOP                                            \
    423          src += (src_stride << 1);                                           \
    424          dst += (dst_stride << 1);                                           \
    425        }                                                                     \
    426        /* x_offset = 0 and y_offset = 4 */                                   \
    427      } else if (y_offset == 4) {                                             \
    428        __m256i src_next_reg;                                                 \
    429        for (i = 0; i < height; i += 2) {                                     \
    430          LOAD_SRC_DST_INSERT(src_stride, dst_stride)                         \
    431          AVG_NEXT_SRC_INSERT(src_reg, src_stride)                            \
    432          /* expend each byte to 2 bytes */                                   \
    433          MERGE_WITH_SRC(src_reg, zero_reg)                                   \
    434          CALC_SUM_SSE_INSIDE_LOOP                                            \
    435          src += (src_stride << 1);                                           \
    436          dst += (dst_stride << 1);                                           \
    437        }                                                                     \
    438        /* x_offset = 0 and y_offset = bilin interpolation */                 \
    439      } else {                                                                \
    440        __m256i filter, pw8, src_next_reg;                                    \
    441        y_offset <<= 5;                                                       \
    442        filter = _mm256_load_si256(                                           \
    443            (__m256i const *)(bilinear_filters_avx2 + y_offset));             \
    444        pw8 = _mm256_set1_epi16(8);                                           \
    445        for (i = 0; i < height; i += 2) {                                     \
    446          LOAD_SRC_DST_INSERT(src_stride, dst_stride)                         \
    447          MERGE_NEXT_SRC_INSERT(src_reg, src_stride)                          \
    448          FILTER_SRC(filter)                                                  \
    449          CALC_SUM_SSE_INSIDE_LOOP                                            \
    450          src += (src_stride << 1);                                           \
    451          dst += (dst_stride << 1);                                           \
    452        }                                                                     \
    453      }                                                                       \
    454      /* x_offset = 4  and y_offset = 0 */                                    \
    455    } else if (x_offset == 4) {                                               \
    456      if (y_offset == 0) {                                                    \
    457        __m256i src_next_reg;                                                 \
    458        for (i = 0; i < height; i += 2) {                                     \
    459          LOAD_SRC_NEXT_BYTE_INSERT                                           \
    460          LOAD_DST_INSERT                                                     \
    461          /* average between current and next stride source */                \
    462          src_reg = _mm256_avg_epu8(src_reg, src_next_reg);                   \
    463          /* expand each byte to 2 bytes */                                   \
    464          MERGE_WITH_SRC(src_reg, zero_reg)                                   \
    465          CALC_SUM_SSE_INSIDE_LOOP                                            \
    466          src += (src_stride << 1);                                           \
    467          dst += (dst_stride << 1);                                           \
    468        }                                                                     \
    469        /* x_offset = 4  and y_offset = 4 */                                  \
    470      } else if (y_offset == 4) {                                             \
    471        __m256i src_next_reg, src_avg, src_temp;                              \
    472        /* load and insert source and next row source */                      \
    473        LOAD_SRC_NEXT_BYTE_INSERT                                             \
    474        src_avg = _mm256_avg_epu8(src_reg, src_next_reg);                     \
    475        src += src_stride << 1;                                               \
    476        for (i = 0; i < height - 2; i += 2) {                                 \
    477          LOAD_SRC_NEXT_BYTE_INSERT                                           \
    478          src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg);              \
    479          src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21);  \
    480          src_temp = _mm256_avg_epu8(src_avg, src_temp);                      \
    481          LOAD_DST_INSERT                                                     \
    482          /* expand each byte to 2 bytes */                                   \
    483          MERGE_WITH_SRC(src_temp, zero_reg)                                  \
    484          /* save current source average */                                   \
    485          src_avg = src_next_reg;                                             \
    486          CALC_SUM_SSE_INSIDE_LOOP                                            \
    487          dst += dst_stride << 1;                                             \
    488          src += src_stride << 1;                                             \
    489        }                                                                     \
    490        /* last 2 rows processing happens here */                             \
    491        __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src));                \
    492        __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1));            \
    493        src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1);                       \
    494        src_next_reg = _mm256_permute2x128_si256(                             \
    495            src_avg, _mm256_castsi128_si256(src_reg_0), 0x21);                \
    496        LOAD_DST_INSERT                                                       \
    497        src_avg = _mm256_avg_epu8(src_avg, src_next_reg);                     \
    498        MERGE_WITH_SRC(src_avg, zero_reg)                                     \
    499        CALC_SUM_SSE_INSIDE_LOOP                                              \
    500      } else {                                                                \
    501        /* x_offset = 4  and y_offset = bilin interpolation */                \
    502        __m256i filter, pw8, src_next_reg, src_avg, src_temp;                 \
    503        y_offset <<= 5;                                                       \
    504        filter = _mm256_load_si256(                                           \
    505            (__m256i const *)(bilinear_filters_avx2 + y_offset));             \
    506        pw8 = _mm256_set1_epi16(8);                                           \
    507        /* load and insert source and next row source */                      \
    508        LOAD_SRC_NEXT_BYTE_INSERT                                             \
    509        src_avg = _mm256_avg_epu8(src_reg, src_next_reg);                     \
    510        src += src_stride << 1;                                               \
    511        for (i = 0; i < height - 2; i += 2) {                                 \
    512          LOAD_SRC_NEXT_BYTE_INSERT                                           \
    513          src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg);              \
    514          src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21);  \
    515          LOAD_DST_INSERT                                                     \
    516          MERGE_WITH_SRC(src_avg, src_temp)                                   \
    517          /* save current source average */                                   \
    518          src_avg = src_next_reg;                                             \
    519          FILTER_SRC(filter)                                                  \
    520          CALC_SUM_SSE_INSIDE_LOOP                                            \
    521          dst += dst_stride << 1;                                             \
    522          src += src_stride << 1;                                             \
    523        }                                                                     \
    524        /* last 2 rows processing happens here */                             \
    525        __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src));                \
    526        __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1));            \
    527        src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1);                       \
    528        src_next_reg = _mm256_permute2x128_si256(                             \
    529            src_avg, _mm256_castsi128_si256(src_reg_0), 0x21);                \
    530        LOAD_DST_INSERT                                                       \
    531        MERGE_WITH_SRC(src_avg, src_next_reg)                                 \
    532        FILTER_SRC(filter)                                                    \
    533        CALC_SUM_SSE_INSIDE_LOOP                                              \
    534      }                                                                       \
    535      /* x_offset = bilin interpolation and y_offset = 0 */                   \
    536    } else {                                                                  \
    537      if (y_offset == 0) {                                                    \
    538        __m256i filter, pw8, src_next_reg;                                    \
    539        x_offset <<= 5;                                                       \
    540        filter = _mm256_load_si256(                                           \
    541            (__m256i const *)(bilinear_filters_avx2 + x_offset));             \
    542        pw8 = _mm256_set1_epi16(8);                                           \
    543        for (i = 0; i < height; i += 2) {                                     \
    544          LOAD_SRC_DST_INSERT(src_stride, dst_stride)                         \
    545          MERGE_NEXT_SRC_INSERT(src_reg, 1)                                   \
    546          FILTER_SRC(filter)                                                  \
    547          CALC_SUM_SSE_INSIDE_LOOP                                            \
    548          src += (src_stride << 1);                                           \
    549          dst += (dst_stride << 1);                                           \
    550        }                                                                     \
    551        /* x_offset = bilin interpolation and y_offset = 4 */                 \
    552      } else if (y_offset == 4) {                                             \
    553        __m256i filter, pw8, src_next_reg, src_pack;                          \
    554        x_offset <<= 5;                                                       \
    555        filter = _mm256_load_si256(                                           \
    556            (__m256i const *)(bilinear_filters_avx2 + x_offset));             \
    557        pw8 = _mm256_set1_epi16(8);                                           \
    558        /* load and insert source and next row source */                      \
    559        LOAD_SRC_NEXT_BYTE_INSERT                                             \
    560        MERGE_WITH_SRC(src_reg, src_next_reg)                                 \
    561        FILTER_SRC(filter)                                                    \
    562        /* convert each 16 bit to 8 bit to each low and high lane source */   \
    563        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);               \
    564        src += src_stride << 1;                                               \
    565        for (i = 0; i < height - 2; i += 2) {                                 \
    566          LOAD_SRC_NEXT_BYTE_INSERT                                           \
    567          LOAD_DST_INSERT                                                     \
    568          MERGE_WITH_SRC(src_reg, src_next_reg)                               \
    569          FILTER_SRC(filter)                                                  \
    570          src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);              \
    571          src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21);  \
    572          /* average between previous pack to the current */                  \
    573          src_pack = _mm256_avg_epu8(src_pack, src_next_reg);                 \
    574          MERGE_WITH_SRC(src_pack, zero_reg)                                  \
    575          CALC_SUM_SSE_INSIDE_LOOP                                            \
    576          src_pack = src_reg;                                                 \
    577          src += src_stride << 1;                                             \
    578          dst += dst_stride << 1;                                             \
    579        }                                                                     \
    580        /* last 2 rows processing happens here */                             \
    581        LOAD_SRC_MERGE_128BIT(filter)                                         \
    582        LOAD_DST_INSERT                                                       \
    583        FILTER_SRC_128BIT(filter_128bit)                                      \
    584        src_reg_0 = _mm_packus_epi16(src_lo, src_hi);                         \
    585        src_next_reg = _mm256_permute2x128_si256(                             \
    586            src_pack, _mm256_castsi128_si256(src_reg_0), 0x21);               \
    587        /* average between previous pack to the current */                    \
    588        src_pack = _mm256_avg_epu8(src_pack, src_next_reg);                   \
    589        MERGE_WITH_SRC(src_pack, zero_reg)                                    \
    590        CALC_SUM_SSE_INSIDE_LOOP                                              \
    591      } else {                                                                \
    592        /* x_offset = bilin interpolation and y_offset = bilin interpolation  \
    593         */                                                                   \
    594        __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;                \
    595        x_offset <<= 5;                                                       \
    596        xfilter = _mm256_load_si256(                                          \
    597            (__m256i const *)(bilinear_filters_avx2 + x_offset));             \
    598        y_offset <<= 5;                                                       \
    599        yfilter = _mm256_load_si256(                                          \
    600            (__m256i const *)(bilinear_filters_avx2 + y_offset));             \
    601        pw8 = _mm256_set1_epi16(8);                                           \
    602        /* load and insert source and next row source */                      \
    603        LOAD_SRC_NEXT_BYTE_INSERT                                             \
    604        MERGE_WITH_SRC(src_reg, src_next_reg)                                 \
    605        FILTER_SRC(xfilter)                                                   \
    606        /* convert each 16 bit to 8 bit to each low and high lane source */   \
    607        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);               \
    608        src += src_stride << 1;                                               \
    609        for (i = 0; i < height - 2; i += 2) {                                 \
    610          LOAD_SRC_NEXT_BYTE_INSERT                                           \
    611          LOAD_DST_INSERT                                                     \
    612          MERGE_WITH_SRC(src_reg, src_next_reg)                               \
    613          FILTER_SRC(xfilter)                                                 \
    614          src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);              \
    615          src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21);  \
    616          /* average between previous pack to the current */                  \
    617          MERGE_WITH_SRC(src_pack, src_next_reg)                              \
    618          /* filter the source */                                             \
    619          FILTER_SRC(yfilter)                                                 \
    620          src_pack = src_reg;                                                 \
    621          CALC_SUM_SSE_INSIDE_LOOP                                            \
    622          src += src_stride << 1;                                             \
    623          dst += dst_stride << 1;                                             \
    624        }                                                                     \
    625        /* last 2 rows processing happens here */                             \
    626        LOAD_SRC_MERGE_128BIT(xfilter)                                        \
    627        LOAD_DST_INSERT                                                       \
    628        FILTER_SRC_128BIT(filter_128bit)                                      \
    629        src_reg_0 = _mm_packus_epi16(src_lo, src_hi);                         \
    630        src_next_reg = _mm256_permute2x128_si256(                             \
    631            src_pack, _mm256_castsi128_si256(src_reg_0), 0x21);               \
    632        MERGE_WITH_SRC(src_pack, src_next_reg)                                \
    633        FILTER_SRC(yfilter)                                                   \
    634        CALC_SUM_SSE_INSIDE_LOOP                                              \
    635      }                                                                       \
    636    }                                                                         \
    637    CALC_SUM_AND_SSE                                                          \
    638    _mm256_zeroupper();                                                       \
    639    return *sse - (unsigned int)(((int64_t)sum * sum) >> (4 + log2height));   \
    640  }
    641 
/* Instantiate the 16-wide sub-pixel variance kernels. The second macro
 * argument must equal log2(height): the generated function divides
 * sum*sum by (16 * height) via a right shift of (4 + log2height). */
    642 MAKE_SUB_PIXEL_VAR_16XH(32, 5)
    643 MAKE_SUB_PIXEL_VAR_16XH(16, 4)
    644 MAKE_SUB_PIXEL_VAR_16XH(8, 3)
/* 16x64 and 16x4 variants are compiled only when realtime-only mode is
 * disabled -- presumably because only non-realtime encoding uses these
 * block sizes; confirm against the aom_dsp_rtcd configuration. */
    645 #if !CONFIG_REALTIME_ONLY
    646 MAKE_SUB_PIXEL_VAR_16XH(64, 6)
    647 MAKE_SUB_PIXEL_VAR_16XH(4, 2)
    648 #endif
    649 
    650 #define MAKE_SUB_PIXEL_AVG_VAR_32XH(height, log2height)                       \
    651  static int sub_pixel_avg_variance32x##height##_imp_avx2(                    \
    652      const uint8_t *src, int src_stride, int x_offset, int y_offset,         \
    653      const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, \
    654      unsigned int *sse) {                                                    \
    655    __m256i sec_reg;                                                          \
    656    __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; \
    657    __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;    \
    658    __m256i zero_reg;                                                         \
    659    int i, sum;                                                               \
    660    sum_reg = _mm256_setzero_si256();                                         \
    661    sse_reg = _mm256_setzero_si256();                                         \
    662    zero_reg = _mm256_setzero_si256();                                        \
    663                                                                              \
    664    /* x_offset = 0 and y_offset = 0 */                                       \
    665    if (x_offset == 0) {                                                      \
    666      if (y_offset == 0) {                                                    \
    667        for (i = 0; i < height; i++) {                                        \
    668          LOAD_SRC_DST                                                        \
    669          sec_reg = _mm256_loadu_si256((__m256i const *)(sec));               \
    670          src_reg = _mm256_avg_epu8(src_reg, sec_reg);                        \
    671          sec += sec_stride;                                                  \
    672          /* expend each byte to 2 bytes */                                   \
    673          MERGE_WITH_SRC(src_reg, zero_reg)                                   \
    674          CALC_SUM_SSE_INSIDE_LOOP                                            \
    675          src += src_stride;                                                  \
    676          dst += dst_stride;                                                  \
    677        }                                                                     \
    678      } else if (y_offset == 4) {                                             \
    679        __m256i src_next_reg;                                                 \
    680        for (i = 0; i < height; i++) {                                        \
    681          LOAD_SRC_DST                                                        \
    682          AVG_NEXT_SRC(src_reg, src_stride)                                   \
    683          sec_reg = _mm256_loadu_si256((__m256i const *)(sec));               \
    684          src_reg = _mm256_avg_epu8(src_reg, sec_reg);                        \
    685          sec += sec_stride;                                                  \
    686          /* expend each byte to 2 bytes */                                   \
    687          MERGE_WITH_SRC(src_reg, zero_reg)                                   \
    688          CALC_SUM_SSE_INSIDE_LOOP                                            \
    689          src += src_stride;                                                  \
    690          dst += dst_stride;                                                  \
    691        }                                                                     \
    692        /* x_offset = 0 and y_offset = bilin interpolation */                 \
    693      } else {                                                                \
    694        __m256i filter, pw8, src_next_reg;                                    \
    695                                                                              \
    696        y_offset <<= 5;                                                       \
    697        filter = _mm256_load_si256(                                           \
    698            (__m256i const *)(bilinear_filters_avx2 + y_offset));             \
    699        pw8 = _mm256_set1_epi16(8);                                           \
    700        for (i = 0; i < height; i++) {                                        \
    701          LOAD_SRC_DST                                                        \
    702          MERGE_NEXT_SRC(src_reg, src_stride)                                 \
    703          FILTER_SRC(filter)                                                  \
    704          src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);              \
    705          sec_reg = _mm256_loadu_si256((__m256i const *)(sec));               \
    706          src_reg = _mm256_avg_epu8(src_reg, sec_reg);                        \
    707          sec += sec_stride;                                                  \
    708          MERGE_WITH_SRC(src_reg, zero_reg)                                   \
    709          CALC_SUM_SSE_INSIDE_LOOP                                            \
    710          src += src_stride;                                                  \
    711          dst += dst_stride;                                                  \
    712        }                                                                     \
    713      }                                                                       \
    714      /* x_offset = 4  and y_offset = 0 */                                    \
    715    } else if (x_offset == 4) {                                               \
    716      if (y_offset == 0) {                                                    \
    717        __m256i src_next_reg;                                                 \
    718        for (i = 0; i < height; i++) {                                        \
    719          LOAD_SRC_DST                                                        \
    720          AVG_NEXT_SRC(src_reg, 1)                                            \
    721          sec_reg = _mm256_loadu_si256((__m256i const *)(sec));               \
    722          src_reg = _mm256_avg_epu8(src_reg, sec_reg);                        \
    723          sec += sec_stride;                                                  \
    724          /* expand each byte to 2 bytes */                                   \
    725          MERGE_WITH_SRC(src_reg, zero_reg)                                   \
    726          CALC_SUM_SSE_INSIDE_LOOP                                            \
    727          src += src_stride;                                                  \
    728          dst += dst_stride;                                                  \
    729        }                                                                     \
    730        /* x_offset = 4  and y_offset = 4 */                                  \
    731      } else if (y_offset == 4) {                                             \
    732        __m256i src_next_reg, src_avg;                                        \
    733        /* load source and another source starting from the next */           \
    734        /* following byte */                                                  \
    735        src_reg = _mm256_loadu_si256((__m256i const *)(src));                 \
    736        AVG_NEXT_SRC(src_reg, 1)                                              \
    737        for (i = 0; i < height; i++) {                                        \
    738          /* save current source average */                                   \
    739          src_avg = src_reg;                                                  \
    740          src += src_stride;                                                  \
    741          LOAD_SRC_DST                                                        \
    742          AVG_NEXT_SRC(src_reg, 1)                                            \
    743          /* average between previous average to current average */           \
    744          src_avg = _mm256_avg_epu8(src_avg, src_reg);                        \
    745          sec_reg = _mm256_loadu_si256((__m256i const *)(sec));               \
    746          src_avg = _mm256_avg_epu8(src_avg, sec_reg);                        \
    747          sec += sec_stride;                                                  \
    748          /* expand each byte to 2 bytes */                                   \
    749          MERGE_WITH_SRC(src_avg, zero_reg)                                   \
    750          CALC_SUM_SSE_INSIDE_LOOP                                            \
    751          dst += dst_stride;                                                  \
    752        }                                                                     \
    753        /* x_offset = 4  and y_offset = bilin interpolation */                \
    754      } else {                                                                \
    755        __m256i filter, pw8, src_next_reg, src_avg;                           \
    756        y_offset <<= 5;                                                       \
    757        filter = _mm256_load_si256(                                           \
    758            (__m256i const *)(bilinear_filters_avx2 + y_offset));             \
    759        pw8 = _mm256_set1_epi16(8);                                           \
    760        /* load source and another source starting from the next */           \
    761        /* following byte */                                                  \
    762        src_reg = _mm256_loadu_si256((__m256i const *)(src));                 \
    763        AVG_NEXT_SRC(src_reg, 1)                                              \
    764        for (i = 0; i < height; i++) {                                        \
    765          /* save current source average */                                   \
    766          src_avg = src_reg;                                                  \
    767          src += src_stride;                                                  \
    768          LOAD_SRC_DST                                                        \
    769          AVG_NEXT_SRC(src_reg, 1)                                            \
    770          MERGE_WITH_SRC(src_avg, src_reg)                                    \
    771          FILTER_SRC(filter)                                                  \
    772          src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);              \
    773          sec_reg = _mm256_loadu_si256((__m256i const *)(sec));               \
    774          src_avg = _mm256_avg_epu8(src_avg, sec_reg);                        \
    775          /* expand each byte to 2 bytes */                                   \
    776          MERGE_WITH_SRC(src_avg, zero_reg)                                   \
    777          sec += sec_stride;                                                  \
    778          CALC_SUM_SSE_INSIDE_LOOP                                            \
    779          dst += dst_stride;                                                  \
    780        }                                                                     \
    781      }                                                                       \
    782      /* x_offset = bilin interpolation and y_offset = 0 */                   \
    783    } else {                                                                  \
    784      if (y_offset == 0) {                                                    \
    785        __m256i filter, pw8, src_next_reg;                                    \
    786        x_offset <<= 5;                                                       \
    787        filter = _mm256_load_si256(                                           \
    788            (__m256i const *)(bilinear_filters_avx2 + x_offset));             \
    789        pw8 = _mm256_set1_epi16(8);                                           \
    790        for (i = 0; i < height; i++) {                                        \
    791          LOAD_SRC_DST                                                        \
    792          MERGE_NEXT_SRC(src_reg, 1)                                          \
    793          FILTER_SRC(filter)                                                  \
    794          src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);              \
    795          sec_reg = _mm256_loadu_si256((__m256i const *)(sec));               \
    796          src_reg = _mm256_avg_epu8(src_reg, sec_reg);                        \
    797          MERGE_WITH_SRC(src_reg, zero_reg)                                   \
    798          sec += sec_stride;                                                  \
    799          CALC_SUM_SSE_INSIDE_LOOP                                            \
    800          src += src_stride;                                                  \
    801          dst += dst_stride;                                                  \
    802        }                                                                     \
    803        /* x_offset = bilin interpolation and y_offset = 4 */                 \
    804      } else if (y_offset == 4) {                                             \
    805        __m256i filter, pw8, src_next_reg, src_pack;                          \
    806        x_offset <<= 5;                                                       \
    807        filter = _mm256_load_si256(                                           \
    808            (__m256i const *)(bilinear_filters_avx2 + x_offset));             \
    809        pw8 = _mm256_set1_epi16(8);                                           \
    810        src_reg = _mm256_loadu_si256((__m256i const *)(src));                 \
    811        MERGE_NEXT_SRC(src_reg, 1)                                            \
    812        FILTER_SRC(filter)                                                    \
    813        /* convert each 16 bit to 8 bit to each low and high lane source */   \
    814        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);               \
    815        for (i = 0; i < height; i++) {                                        \
    816          src += src_stride;                                                  \
    817          LOAD_SRC_DST                                                        \
    818          MERGE_NEXT_SRC(src_reg, 1)                                          \
    819          FILTER_SRC(filter)                                                  \
    820          src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);              \
    821          /* average between previous pack to the current */                  \
    822          src_pack = _mm256_avg_epu8(src_pack, src_reg);                      \
    823          sec_reg = _mm256_loadu_si256((__m256i const *)(sec));               \
    824          src_pack = _mm256_avg_epu8(src_pack, sec_reg);                      \
    825          sec += sec_stride;                                                  \
    826          MERGE_WITH_SRC(src_pack, zero_reg)                                  \
    827          src_pack = src_reg;                                                 \
    828          CALC_SUM_SSE_INSIDE_LOOP                                            \
    829          dst += dst_stride;                                                  \
    830        }                                                                     \
    831        /* x_offset = bilin interpolation and y_offset = bilin interpolation  \
    832         */                                                                   \
    833      } else {                                                                \
    834        __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;                \
    835        x_offset <<= 5;                                                       \
    836        xfilter = _mm256_load_si256(                                          \
    837            (__m256i const *)(bilinear_filters_avx2 + x_offset));             \
    838        y_offset <<= 5;                                                       \
    839        yfilter = _mm256_load_si256(                                          \
    840            (__m256i const *)(bilinear_filters_avx2 + y_offset));             \
    841        pw8 = _mm256_set1_epi16(8);                                           \
    842        /* load source and another source starting from the next */           \
    843        /* following byte */                                                  \
    844        src_reg = _mm256_loadu_si256((__m256i const *)(src));                 \
    845        MERGE_NEXT_SRC(src_reg, 1)                                            \
    846                                                                              \
    847        FILTER_SRC(xfilter)                                                   \
    848        /* convert each 16 bit to 8 bit to each low and high lane source */   \
    849        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);               \
    850        for (i = 0; i < height; i++) {                                        \
    851          src += src_stride;                                                  \
    852          LOAD_SRC_DST                                                        \
    853          MERGE_NEXT_SRC(src_reg, 1)                                          \
    854          FILTER_SRC(xfilter)                                                 \
    855          src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);              \
    856          /* merge previous pack to current pack source */                    \
    857          MERGE_WITH_SRC(src_pack, src_reg)                                   \
    858          /* filter the source */                                             \
    859          FILTER_SRC(yfilter)                                                 \
    860          src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);             \
    861          sec_reg = _mm256_loadu_si256((__m256i const *)(sec));               \
    862          src_pack = _mm256_avg_epu8(src_pack, sec_reg);                      \
    863          MERGE_WITH_SRC(src_pack, zero_reg)                                  \
    864          src_pack = src_reg;                                                 \
    865          sec += sec_stride;                                                  \
    866          CALC_SUM_SSE_INSIDE_LOOP                                            \
    867          dst += dst_stride;                                                  \
    868        }                                                                     \
    869      }                                                                       \
    870    }                                                                         \
    871    CALC_SUM_AND_SSE                                                          \
    872    _mm256_zeroupper();                                                       \
    873    return sum;                                                               \
    874  }                                                                           \
    875  unsigned int aom_sub_pixel_avg_variance32x##height##_avx2(                  \
    876      const uint8_t *src, int src_stride, int x_offset, int y_offset,         \
    877      const uint8_t *dst, int dst_stride, unsigned int *sse,                  \
    878      const uint8_t *sec_ptr) {                                               \
    879    const int sum = sub_pixel_avg_variance32x##height##_imp_avx2(             \
    880        src, src_stride, x_offset, y_offset, dst, dst_stride, sec_ptr, 32,    \
    881        sse);                                                                 \
    882    return *sse - (unsigned int)(((int64_t)sum * sum) >> (5 + log2height));   \
    883  }
    884 
// Instantiate aom_sub_pixel_avg_variance32x{64,32,16}_avx2 together with the
// static sub_pixel_avg_variance32xH_imp_avx2 helpers they wrap.  The second
// argument is log2(height): the wrapper shifts sum*sum right by
// (5 + log2height), i.e. divides by the pixel count 32 * height.
MAKE_SUB_PIXEL_AVG_VAR_32XH(64, 6)
MAKE_SUB_PIXEL_AVG_VAR_32XH(32, 5)
MAKE_SUB_PIXEL_AVG_VAR_32XH(16, 4)
    888 
    889 #define AOM_SUB_PIXEL_AVG_VAR_AVX2(w, h, wf, hf, wlog2, hlog2)            \
    890  unsigned int aom_sub_pixel_avg_variance##w##x##h##_avx2(                \
    891      const uint8_t *src, int src_stride, int x_offset, int y_offset,     \
    892      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr,          \
    893      const uint8_t *sec) {                                               \
    894    unsigned int sse = 0;                                                 \
    895    int se = 0;                                                           \
    896    for (int i = 0; i < (w / wf); ++i) {                                  \
    897      const uint8_t *src_ptr = src;                                       \
    898      const uint8_t *dst_ptr = dst;                                       \
    899      const uint8_t *sec_ptr = sec;                                       \
    900      for (int j = 0; j < (h / hf); ++j) {                                \
    901        unsigned int sse2;                                                \
    902        const int se2 = sub_pixel_avg_variance##wf##x##hf##_imp_avx2(     \
    903            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \
    904            sec_ptr, w, &sse2);                                           \
    905        dst_ptr += hf * dst_stride;                                       \
    906        src_ptr += hf * src_stride;                                       \
    907        sec_ptr += hf * w;                                                \
    908        se += se2;                                                        \
    909        sse += sse2;                                                      \
    910      }                                                                   \
    911      src += wf;                                                          \
    912      dst += wf;                                                          \
    913      sec += wf;                                                          \
    914    }                                                                     \
    915    *sse_ptr = sse;                                                       \
    916    return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2));   \
    917  }
    918 
// Instantiate the large-block sub-pixel avg-variance kernels.  Arguments are
// (w, h, wf, hf, log2(w), log2(h)); each block is tiled with wf x hf calls
// into the 32xH helper, and (log2 w + log2 h) is the shift that divides
// sum^2 by the pixel count.
// Note: hf = AOMMIN(h, 64) to avoid overflow in helper by capping height.
AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 128, 32, 64, 7, 7)
AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 64, 32, 64, 7, 6)
AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 128, 32, 64, 6, 7)
AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 64, 32, 64, 6, 6)
AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 32, 32, 32, 6, 5)