tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

highbd_variance_avx2.c (37550B)


/*
 * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <immintrin.h>  // AVX2

#include "config/aom_dsp_rtcd.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/synonyms.h"

typedef void (*high_variance_fn_t)(const uint16_t *src, int src_stride,
                                   const uint16_t *ref, int ref_stride,
                                   uint32_t *sse, int *sum);

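/* Sub-pixel variance for 10-bit blocks. The two bilinear taps for each
 * offset (bilinear_filters_2t, taps summing to 128) are packed into one
 * 32-bit lane so a single _mm256_madd_epi16 applies both taps at once;
 * filtered values are rounded with a bias of 64 and shifted right by 7. */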
static uint32_t aom_highbd_var_filter_block2d_bil_avx2(
    const uint8_t *src_ptr8, unsigned int src_pixels_per_line, int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const uint32_t xoffset, const uint32_t yoffset, const uint8_t *dst_ptr8,
    int dst_stride, uint32_t *sse) {
  const __m256i filter1 =
      _mm256_set1_epi32((int)(bilinear_filters_2t[xoffset][1] << 16) |
                        bilinear_filters_2t[xoffset][0]);
  const __m256i filter2 =
      _mm256_set1_epi32((int)(bilinear_filters_2t[yoffset][1] << 16) |
                        bilinear_filters_2t[yoffset][0]);
  const __m256i one = _mm256_set1_epi16(1);
  const int bitshift = 0x40;
  (void)pixel_step;
  unsigned int i, j, prev = 0, curr = 2;
  uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
  uint16_t *dst_ptr = CONVERT_TO_SHORTPTR(dst_ptr8);
  uint16_t *src_ptr_ref = src_ptr;
  uint16_t *dst_ptr_ref = dst_ptr;
  int64_t sum_long = 0;
  uint64_t sse_long = 0;
  unsigned int rshift = 0, inc = 1;
  __m256i rbias = _mm256_set1_epi32(bitshift);
  __m256i opointer[8];
  unsigned int range;
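  // Nine cases on (xoffset, yoffset): offset 0 needs no filtering, offset 4
  // is the midpoint and reduces to _mm256_avg_epu16, and the remaining
  // offsets take the full madd path. Each case walks the block in 16-lane
  // column strips; `inc` shortens the per-strip row loop (16 / inc rows) to
  // match 8- and 4-row blocks.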
  if (xoffset == 0) {
    if (yoffset == 0) {  // xoffset==0 && yoffset==0
      range = output_width / 16;
      if (output_height == 8) inc = 2;
      if (output_height == 4) inc = 4;
      for (j = 0; j < range * output_height * inc / 16; j++) {
        if (j % (output_height * inc / 16) == 0) {
          src_ptr = src_ptr_ref;
          src_ptr_ref += 16;
          dst_ptr = dst_ptr_ref;
          dst_ptr_ref += 16;
        }
        __m256i sum1 = _mm256_setzero_si256();
        __m256i sse1 = _mm256_setzero_si256();
        for (i = 0; i < 16 / inc; ++i) {
          __m256i V_S_SRC = _mm256_loadu_si256((const __m256i *)src_ptr);
          src_ptr += src_pixels_per_line;
          __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
          dst_ptr += dst_stride;

          __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
          __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);

          sum1 = _mm256_add_epi16(sum1, V_R_SUB);
          sse1 = _mm256_add_epi32(sse1, V_R_MAD);
        }

        __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
        __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
        __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
        __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
        const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
        const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
        __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
        v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
        sum_long += _mm_extract_epi32(v_d, 0);
        sse_long += _mm_extract_epi32(v_d, 1);
      }

      rshift = get_msb(output_height) + get_msb(output_width);

    } else if (yoffset == 4) {  // xoffset==0 && yoffset==4
      range = output_width / 16;
      if (output_height == 8) inc = 2;
      if (output_height == 4) inc = 4;
      for (j = 0; j < range * output_height * inc / 16; j++) {
        if (j % (output_height * inc / 16) == 0) {
          src_ptr = src_ptr_ref;
          src_ptr_ref += 16;
          dst_ptr = dst_ptr_ref;
          dst_ptr_ref += 16;

          opointer[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
          src_ptr += src_pixels_per_line;
          curr = 0;
        }

        __m256i sum1 = _mm256_setzero_si256();
        __m256i sse1 = _mm256_setzero_si256();

        for (i = 0; i < 16 / inc; ++i) {
          prev = curr;
          curr = (curr == 0) ? 1 : 0;
          opointer[curr] = _mm256_loadu_si256((const __m256i *)src_ptr);
          src_ptr += src_pixels_per_line;

          __m256i V_S_SRC = _mm256_avg_epu16(opointer[curr], opointer[prev]);

          __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
          dst_ptr += dst_stride;
          __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
          __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
          sum1 = _mm256_add_epi16(sum1, V_R_SUB);
          sse1 = _mm256_add_epi32(sse1, V_R_MAD);
        }

        __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
        __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
        __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
        __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
        const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
        const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
        __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
        v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
        sum_long += _mm_extract_epi32(v_d, 0);
        sse_long += _mm_extract_epi32(v_d, 1);
      }

      rshift = get_msb(output_height) + get_msb(output_width);

    } else {  // xoffset==0 && yoffset==1,2,3,5,6,7
      range = output_width / 16;
      if (output_height == 8) inc = 2;
      if (output_height == 4) inc = 4;
      for (j = 0; j < range * output_height * inc / 16; j++) {
        if (j % (output_height * inc / 16) == 0) {
          src_ptr = src_ptr_ref;
          src_ptr_ref += 16;
          dst_ptr = dst_ptr_ref;
          dst_ptr_ref += 16;

          opointer[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
          src_ptr += src_pixels_per_line;
          curr = 0;
        }

        __m256i sum1 = _mm256_setzero_si256();
        __m256i sse1 = _mm256_setzero_si256();

        for (i = 0; i < 16 / inc; ++i) {
          prev = curr;
          curr = (curr == 0) ? 1 : 0;
          opointer[curr] = _mm256_loadu_si256((const __m256i *)src_ptr);
          src_ptr += src_pixels_per_line;

          __m256i V_S_M1 =
              _mm256_unpacklo_epi16(opointer[prev], opointer[curr]);
          __m256i V_S_M2 =
              _mm256_unpackhi_epi16(opointer[prev], opointer[curr]);

          __m256i V_S_MAD1 = _mm256_madd_epi16(V_S_M1, filter2);
          __m256i V_S_MAD2 = _mm256_madd_epi16(V_S_M2, filter2);

          __m256i V_S_S1 =
              _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD1, rbias), 7);
          __m256i V_S_S2 =
              _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD2, rbias), 7);

          __m256i V_S_SRC = _mm256_packus_epi32(V_S_S1, V_S_S2);

          __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
          dst_ptr += dst_stride;

          __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
          __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);

          sum1 = _mm256_add_epi16(sum1, V_R_SUB);
          sse1 = _mm256_add_epi32(sse1, V_R_MAD);
        }

        __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
        __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
        __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
        __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
        const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
        const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
        __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
        v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
        sum_long += _mm_extract_epi32(v_d, 0);
        sse_long += _mm_extract_epi32(v_d, 1);
      }

      rshift = get_msb(output_height) + get_msb(output_width);
    }
  } else if (xoffset == 4) {
    if (yoffset == 0) {  // xoffset==4 && yoffset==0
      range = output_width / 16;
      if (output_height == 8) inc = 2;
      if (output_height == 4) inc = 4;
      for (j = 0; j < range * output_height * inc / 16; j++) {
        if (j % (output_height * inc / 16) == 0) {
          src_ptr = src_ptr_ref;
          src_ptr_ref += 16;
          dst_ptr = dst_ptr_ref;
          dst_ptr_ref += 16;
          __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
          __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
          src_ptr += src_pixels_per_line;

          opointer[0] = _mm256_avg_epu16(V_H_D1, V_H_D2);

          curr = 0;
        }

        __m256i sum1 = _mm256_setzero_si256();
        __m256i sse1 = _mm256_setzero_si256();

        for (i = 0; i < 16 / inc; ++i) {
          prev = curr;
          curr = (curr == 0) ? 1 : 0;
          __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
          __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
          src_ptr += src_pixels_per_line;

          opointer[curr] = _mm256_avg_epu16(V_V_D1, V_V_D2);

          __m256i V_S_M1 =
              _mm256_unpacklo_epi16(opointer[prev], opointer[curr]);
          __m256i V_S_M2 =
              _mm256_unpackhi_epi16(opointer[prev], opointer[curr]);

          __m256i V_S_MAD1 = _mm256_madd_epi16(V_S_M1, filter2);
          __m256i V_S_MAD2 = _mm256_madd_epi16(V_S_M2, filter2);

          __m256i V_S_S1 =
              _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD1, rbias), 7);
          __m256i V_S_S2 =
              _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD2, rbias), 7);

          __m256i V_S_SRC = _mm256_packus_epi32(V_S_S1, V_S_S2);

          __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
          dst_ptr += dst_stride;

          __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
          __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);

          sum1 = _mm256_add_epi16(sum1, V_R_SUB);
          sse1 = _mm256_add_epi32(sse1, V_R_MAD);
        }

        __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
        __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
        __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
        __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
        const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
        const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
        __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
        v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
        sum_long += _mm_extract_epi32(v_d, 0);
        sse_long += _mm_extract_epi32(v_d, 1);
      }

      rshift = get_msb(output_height) + get_msb(output_width);

    } else if (yoffset == 4) {  // xoffset==4 && yoffset==4
      range = output_width / 16;
      if (output_height == 8) inc = 2;
      if (output_height == 4) inc = 4;
      for (j = 0; j < range * output_height * inc / 16; j++) {
        if (j % (output_height * inc / 16) == 0) {
          src_ptr = src_ptr_ref;
          src_ptr_ref += 16;
          dst_ptr = dst_ptr_ref;
          dst_ptr_ref += 16;

          __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
          __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
          src_ptr += src_pixels_per_line;
          opointer[0] = _mm256_avg_epu16(V_H_D1, V_H_D2);
          curr = 0;
        }

        __m256i sum1 = _mm256_setzero_si256();
        __m256i sse1 = _mm256_setzero_si256();

        for (i = 0; i < 16 / inc; ++i) {
          prev = curr;
          curr = (curr == 0) ? 1 : 0;
          __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
          __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
          src_ptr += src_pixels_per_line;
          opointer[curr] = _mm256_avg_epu16(V_V_D1, V_V_D2);
          __m256i V_S_SRC = _mm256_avg_epu16(opointer[curr], opointer[prev]);

          __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
          dst_ptr += dst_stride;
          __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
          __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
          sum1 = _mm256_add_epi16(sum1, V_R_SUB);
          sse1 = _mm256_add_epi32(sse1, V_R_MAD);
        }

        __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
        __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
        __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
        __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
        const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
        const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
        __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
        v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
        sum_long += _mm_extract_epi32(v_d, 0);
        sse_long += _mm_extract_epi32(v_d, 1);
      }

      rshift = get_msb(output_height) + get_msb(output_width);

    } else {  // xoffset==4 && yoffset==1,2,3,5,6,7
      range = output_width / 16;
      if (output_height == 8) inc = 2;
      if (output_height == 4) inc = 4;
      for (j = 0; j < range * output_height * inc / 16; j++) {
        if (j % (output_height * inc / 16) == 0) {
          src_ptr = src_ptr_ref;
          src_ptr_ref += 16;
          dst_ptr = dst_ptr_ref;
          dst_ptr_ref += 16;

          __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
          __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
          src_ptr += src_pixels_per_line;
          opointer[0] = _mm256_avg_epu16(V_H_D1, V_H_D2);
          curr = 0;
        }

        __m256i sum1 = _mm256_setzero_si256();
        __m256i sse1 = _mm256_setzero_si256();

        for (i = 0; i < 16 / inc; ++i) {
          prev = curr;
          curr = (curr == 0) ? 1 : 0;
          __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
          __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
          src_ptr += src_pixels_per_line;
          opointer[curr] = _mm256_avg_epu16(V_V_D1, V_V_D2);

          __m256i V_S_M1 =
              _mm256_unpacklo_epi16(opointer[prev], opointer[curr]);
          __m256i V_S_M2 =
              _mm256_unpackhi_epi16(opointer[prev], opointer[curr]);

          __m256i V_S_MAD1 = _mm256_madd_epi16(V_S_M1, filter2);
          __m256i V_S_MAD2 = _mm256_madd_epi16(V_S_M2, filter2);

          __m256i V_S_S1 =
              _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD1, rbias), 7);
          __m256i V_S_S2 =
              _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD2, rbias), 7);

          __m256i V_S_SRC = _mm256_packus_epi32(V_S_S1, V_S_S2);

          __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
          dst_ptr += dst_stride;

          __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
          __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);

          sum1 = _mm256_add_epi16(sum1, V_R_SUB);
          sse1 = _mm256_add_epi32(sse1, V_R_MAD);
        }

        __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
        __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
        __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
        __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
        const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
        const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
        __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
        v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
        sum_long += _mm_extract_epi32(v_d, 0);
        sse_long += _mm_extract_epi32(v_d, 1);
      }

      rshift = get_msb(output_height) + get_msb(output_width);
    }
  } else if (yoffset == 0) {  // xoffset==1,2,3,5,6,7 && yoffset==0
    range = output_width / 16;
    if (output_height == 8) inc = 2;
    if (output_height == 4) inc = 4;
    for (j = 0; j < range * output_height * inc / 16; j++) {
      if (j % (output_height * inc / 16) == 0) {
        src_ptr = src_ptr_ref;
        src_ptr_ref += 16;
        dst_ptr = dst_ptr_ref;
        dst_ptr_ref += 16;

        curr = 0;
      }

      __m256i sum1 = _mm256_setzero_si256();
      __m256i sse1 = _mm256_setzero_si256();

      for (i = 0; i < 16 / inc; ++i) {
        __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
        __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
        src_ptr += src_pixels_per_line;
        __m256i V_V_M1 = _mm256_unpacklo_epi16(V_V_D1, V_V_D2);
        __m256i V_V_M2 = _mm256_unpackhi_epi16(V_V_D1, V_V_D2);
        __m256i V_V_MAD1 = _mm256_madd_epi16(V_V_M1, filter1);
        __m256i V_V_MAD2 = _mm256_madd_epi16(V_V_M2, filter1);
        __m256i V_V_S1 =
            _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD1, rbias), 7);
        __m256i V_V_S2 =
            _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD2, rbias), 7);
        opointer[curr] = _mm256_packus_epi32(V_V_S1, V_V_S2);

        __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
        dst_ptr += dst_stride;
        __m256i V_R_SUB = _mm256_sub_epi16(opointer[curr], V_D_DST);
        __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);

        sum1 = _mm256_add_epi16(sum1, V_R_SUB);
        sse1 = _mm256_add_epi32(sse1, V_R_MAD);
      }

      __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
      __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
      __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
      __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
      const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
      const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
      __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
      v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
      sum_long += _mm_extract_epi32(v_d, 0);
      sse_long += _mm_extract_epi32(v_d, 1);
    }

    rshift = get_msb(output_height) + get_msb(output_width);

  } else if (yoffset == 4) {  // xoffset==1,2,3,5,6,7 && yoffset==4

    range = output_width / 16;
    if (output_height == 8) inc = 2;
    if (output_height == 4) inc = 4;
    for (j = 0; j < range * output_height * inc / 16; j++) {
      if (j % (output_height * inc / 16) == 0) {
        src_ptr = src_ptr_ref;
        src_ptr_ref += 16;
        dst_ptr = dst_ptr_ref;
        dst_ptr_ref += 16;

        __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
        __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
        src_ptr += src_pixels_per_line;

        __m256i V_H_M1 = _mm256_unpacklo_epi16(V_H_D1, V_H_D2);
        __m256i V_H_M2 = _mm256_unpackhi_epi16(V_H_D1, V_H_D2);

        __m256i V_H_MAD1 = _mm256_madd_epi16(V_H_M1, filter1);
        __m256i V_H_MAD2 = _mm256_madd_epi16(V_H_M2, filter1);

        __m256i V_H_S1 =
            _mm256_srli_epi32(_mm256_add_epi32(V_H_MAD1, rbias), 7);
        __m256i V_H_S2 =
            _mm256_srli_epi32(_mm256_add_epi32(V_H_MAD2, rbias), 7);

        opointer[0] = _mm256_packus_epi32(V_H_S1, V_H_S2);

        curr = 0;
      }

      __m256i sum1 = _mm256_setzero_si256();
      __m256i sse1 = _mm256_setzero_si256();

      for (i = 0; i < 16 / inc; ++i) {
        prev = curr;
        curr = (curr == 0) ? 1 : 0;
        __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
        __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
        src_ptr += src_pixels_per_line;
        __m256i V_V_M1 = _mm256_unpacklo_epi16(V_V_D1, V_V_D2);
        __m256i V_V_M2 = _mm256_unpackhi_epi16(V_V_D1, V_V_D2);
        __m256i V_V_MAD1 = _mm256_madd_epi16(V_V_M1, filter1);
        __m256i V_V_MAD2 = _mm256_madd_epi16(V_V_M2, filter1);
        __m256i V_V_S1 =
            _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD1, rbias), 7);
        __m256i V_V_S2 =
            _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD2, rbias), 7);
        opointer[curr] = _mm256_packus_epi32(V_V_S1, V_V_S2);

        __m256i V_S_SRC = _mm256_avg_epu16(opointer[prev], opointer[curr]);

        __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
        dst_ptr += dst_stride;

        __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
        __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);

        sum1 = _mm256_add_epi16(sum1, V_R_SUB);
        sse1 = _mm256_add_epi32(sse1, V_R_MAD);
      }

      __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
      __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
      __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
      __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
      const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
      const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
      __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
      v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
      sum_long += _mm_extract_epi32(v_d, 0);
      sse_long += _mm_extract_epi32(v_d, 1);
    }

    rshift = get_msb(output_height) + get_msb(output_width);

  } else {  // xoffset==1,2,3,5,6,7 && yoffset==1,2,3,5,6,7
    range = output_width / 16;
    if (output_height == 8) inc = 2;
    if (output_height == 4) inc = 4;
    unsigned int nloop = 16 / inc;
    for (j = 0; j < range * output_height * inc / 16; j++) {
      if (j % (output_height * inc / 16) == 0) {
        src_ptr = src_ptr_ref;
        src_ptr_ref += 16;
        dst_ptr = dst_ptr_ref;
        dst_ptr_ref += 16;

        __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
        __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
        src_ptr += src_pixels_per_line;

        __m256i V_H_M1 = _mm256_unpacklo_epi16(V_H_D1, V_H_D2);
        __m256i V_H_M2 = _mm256_unpackhi_epi16(V_H_D1, V_H_D2);

        __m256i V_H_MAD1 = _mm256_madd_epi16(V_H_M1, filter1);
        __m256i V_H_MAD2 = _mm256_madd_epi16(V_H_M2, filter1);

        __m256i V_H_S1 =
            _mm256_srli_epi32(_mm256_add_epi32(V_H_MAD1, rbias), 7);
        __m256i V_H_S2 =
            _mm256_srli_epi32(_mm256_add_epi32(V_H_MAD2, rbias), 7);

        opointer[0] = _mm256_packus_epi32(V_H_S1, V_H_S2);

        curr = 0;
      }

      __m256i sum1 = _mm256_setzero_si256();
      __m256i sse1 = _mm256_setzero_si256();

      for (i = 0; i < nloop; ++i) {
        prev = curr;
        curr = !curr;
        __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
        __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
        src_ptr += src_pixels_per_line;
        __m256i V_V_M1 = _mm256_unpacklo_epi16(V_V_D1, V_V_D2);
        __m256i V_V_M2 = _mm256_unpackhi_epi16(V_V_D1, V_V_D2);
        __m256i V_V_MAD1 = _mm256_madd_epi16(V_V_M1, filter1);
        __m256i V_V_MAD2 = _mm256_madd_epi16(V_V_M2, filter1);
        __m256i V_V_S1 =
            _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD1, rbias), 7);
        __m256i V_V_S2 =
            _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD2, rbias), 7);
        opointer[curr] = _mm256_packus_epi32(V_V_S1, V_V_S2);

        __m256i V_S_M1 = _mm256_unpacklo_epi16(opointer[prev], opointer[curr]);
        __m256i V_S_M2 = _mm256_unpackhi_epi16(opointer[prev], opointer[curr]);

        __m256i V_S_MAD1 = _mm256_madd_epi16(V_S_M1, filter2);
        __m256i V_S_MAD2 = _mm256_madd_epi16(V_S_M2, filter2);

        __m256i V_S_S1 =
            _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD1, rbias), 7);
        __m256i V_S_S2 =
            _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD2, rbias), 7);

        __m256i V_S_SRC = _mm256_packus_epi32(V_S_S1, V_S_S2);

        __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
        dst_ptr += dst_stride;

        __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
        __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);

        sum1 = _mm256_add_epi16(sum1, V_R_SUB);
        sse1 = _mm256_add_epi32(sse1, V_R_MAD);
      }

      __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
      __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
      __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
      __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
      const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
      const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
      __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
      v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
      sum_long += _mm_extract_epi32(v_d, 0);
      sse_long += _mm_extract_epi32(v_d, 1);
    }

    rshift = get_msb(output_height) + get_msb(output_width);
  }

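  // 10-bit normalization: the SSE carries 2 * (10 - 8) = 4 extra bits and
  // the sum 10 - 8 = 2, so both are rounded down before variance is formed
  // as sse - sum * sum / (w * h), with rshift = log2(w) + log2(h) from above.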
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
  int sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);

  int32_t var = *sse - (uint32_t)(((int64_t)sum * sum) >> rshift);

  return (var > 0) ? var : 0;
}

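// Sum and SSE of an 8x8 block, packing two rows per 256-bit register. The
// raw 16-bit difference sums cannot overflow here: four accumulations of
// 10-bit diffs stay well inside int16 range before widening to 32 bits.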
static void highbd_calc8x8var_avx2(const uint16_t *src, int src_stride,
                                   const uint16_t *ref, int ref_stride,
                                   uint32_t *sse, int *sum) {
  __m256i v_sum_d = _mm256_setzero_si256();
  __m256i v_sse_d = _mm256_setzero_si256();
  for (int i = 0; i < 8; i += 2) {
    const __m128i v_p_a0 = _mm_loadu_si128((const __m128i *)src);
    const __m128i v_p_a1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
    const __m128i v_p_b0 = _mm_loadu_si128((const __m128i *)ref);
    const __m128i v_p_b1 = _mm_loadu_si128((const __m128i *)(ref + ref_stride));
    __m256i v_p_a = _mm256_castsi128_si256(v_p_a0);
    __m256i v_p_b = _mm256_castsi128_si256(v_p_b0);
    v_p_a = _mm256_inserti128_si256(v_p_a, v_p_a1, 1);
    v_p_b = _mm256_inserti128_si256(v_p_b, v_p_b1, 1);
    const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b);
    const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff);
    v_sum_d = _mm256_add_epi16(v_sum_d, v_diff);
    v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff);
    src += src_stride * 2;
    ref += ref_stride * 2;
  }
  __m256i v_sum00 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_sum_d));
  __m256i v_sum01 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(v_sum_d, 1));
  __m256i v_sum0 = _mm256_add_epi32(v_sum00, v_sum01);
  __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d);
  __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d);
  __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
  const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
  const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
  __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
  v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
  *sum = _mm_extract_epi32(v_d, 0);
  *sse = _mm_extract_epi32(v_d, 1);
}

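// Sum and SSE of a 16x16 block, one 16-lane row per iteration. Sixteen
// 10-bit diffs per lane peak at +/-16368, still within int16, and the madd
// against `one` widens the lane sums to 32 bits for the reduction.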
static void highbd_calc16x16var_avx2(const uint16_t *src, int src_stride,
                                     const uint16_t *ref, int ref_stride,
                                     uint32_t *sse, int *sum) {
  __m256i v_sum_d = _mm256_setzero_si256();
  __m256i v_sse_d = _mm256_setzero_si256();
  const __m256i one = _mm256_set1_epi16(1);
  for (int i = 0; i < 16; ++i) {
    const __m256i v_p_a = _mm256_loadu_si256((const __m256i *)src);
    const __m256i v_p_b = _mm256_loadu_si256((const __m256i *)ref);
    const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b);
    const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff);
    v_sum_d = _mm256_add_epi16(v_sum_d, v_diff);
    v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff);
    src += src_stride;
    ref += ref_stride;
  }
  __m256i v_sum0 = _mm256_madd_epi16(v_sum_d, one);
  __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d);
  __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d);
  __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
  const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
  const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
  __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
  v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
  *sum = _mm_extract_epi32(v_d, 0);
  *sse = _mm_extract_epi32(v_d, 1);
}

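// Tiles the w x h area with the given block kernel, accumulates wide
// totals, then applies the 10-bit rounding shifts (sse >> 4, sum >> 2).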
static void highbd_10_variance_avx2(const uint16_t *src, int src_stride,
                                    const uint16_t *ref, int ref_stride, int w,
                                    int h, uint32_t *sse, int *sum,
                                    high_variance_fn_t var_fn, int block_size) {
  int i, j;
  uint64_t sse_long = 0;
  int32_t sum_long = 0;

  for (i = 0; i < h; i += block_size) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
             ref_stride, &sse0, &sum0);
      sse_long += sse0;
      sum_long += sum0;
    }
  }
  *sum = ROUND_POWER_OF_TWO(sum_long, 2);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
}

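// Emits the exported 10-bit variance function for each block size. `shift`
// is log2(w * h), so (sum * sum) >> shift is the mean correction term; the
// result is clamped at zero against rounding-induced negatives.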
#define VAR_FN(w, h, block_size, shift)                                        \
  uint32_t aom_highbd_10_variance##w##x##h##_avx2(                             \
      const uint8_t *src8, int src_stride, const uint8_t *ref8,                \
      int ref_stride, uint32_t *sse) {                                         \
    int sum;                                                                   \
    int64_t var;                                                               \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                 \
    highbd_10_variance_avx2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
                            highbd_calc##block_size##x##block_size##var_avx2,  \
                            block_size);                                       \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift);                   \
    return (var >= 0) ? (uint32_t)var : 0;                                     \
  }

VAR_FN(128, 128, 16, 14)
VAR_FN(128, 64, 16, 13)
VAR_FN(64, 128, 16, 13)
VAR_FN(64, 64, 16, 12)
VAR_FN(64, 32, 16, 11)
VAR_FN(32, 64, 16, 11)
VAR_FN(32, 32, 16, 10)
VAR_FN(32, 16, 16, 9)
VAR_FN(16, 32, 16, 9)
VAR_FN(16, 16, 16, 8)
VAR_FN(16, 8, 8, 7)
VAR_FN(8, 16, 8, 7)
VAR_FN(8, 8, 8, 6)

#if !CONFIG_REALTIME_ONLY
VAR_FN(16, 64, 16, 10)
VAR_FN(32, 8, 8, 8)
VAR_FN(64, 16, 16, 10)
VAR_FN(8, 32, 8, 8)
#endif  // !CONFIG_REALTIME_ONLY

#undef VAR_FN

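// 10-bit 16x16 MSE: same reduction as the variance path, but the mean
// correction is dropped and only the (already scaled) SSE is returned.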
unsigned int aom_highbd_10_mse16x16_avx2(const uint8_t *src8, int src_stride,
                                         const uint8_t *ref8, int ref_stride,
                                         unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_10_variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
                          highbd_calc16x16var_avx2, 16);
  return *sse;
}

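// Forward declarations of the SSE2 kernels for 8-wide blocks, which the
// AVX2 dispatch below falls back to since the AVX2 filter above operates
// on 16-lane-wide strips.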
#define SSE2_HEIGHT(H)                                                 \
  uint32_t aom_highbd_10_sub_pixel_variance8x##H##_sse2(               \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr);

SSE2_HEIGHT(8)
SSE2_HEIGHT(16)

#undef SSE2_HEIGHT

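// W and H are compile-time constants in each expansion, so the dispatch
// below folds to a single direct call per generated function.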
#define HIGHBD_SUBPIX_VAR(W, H)                                              \
  uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_avx2(                 \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    if (W == 8 && H == 16)                                                   \
      return aom_highbd_10_sub_pixel_variance8x16_sse2(                      \
          src, src_stride, xoffset, yoffset, dst, dst_stride, sse);          \
    else if (W == 8 && H == 8)                                               \
      return aom_highbd_10_sub_pixel_variance8x8_sse2(                       \
          src, src_stride, xoffset, yoffset, dst, dst_stride, sse);          \
    else                                                                     \
      return aom_highbd_var_filter_block2d_bil_avx2(                         \
          src, src_stride, 1, H, W, xoffset, yoffset, dst, dst_stride, sse); \
  }

HIGHBD_SUBPIX_VAR(128, 128)
HIGHBD_SUBPIX_VAR(128, 64)
HIGHBD_SUBPIX_VAR(64, 128)
HIGHBD_SUBPIX_VAR(64, 64)
HIGHBD_SUBPIX_VAR(64, 32)
HIGHBD_SUBPIX_VAR(32, 64)
HIGHBD_SUBPIX_VAR(32, 32)
HIGHBD_SUBPIX_VAR(32, 16)
HIGHBD_SUBPIX_VAR(16, 32)
HIGHBD_SUBPIX_VAR(16, 16)
HIGHBD_SUBPIX_VAR(16, 8)
HIGHBD_SUBPIX_VAR(8, 16)
HIGHBD_SUBPIX_VAR(8, 8)

#undef HIGHBD_SUBPIX_VAR

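// MSE of a 4-wide, 16-bit block: gathers four 4-pixel rows into one 256-bit
// register, squares the absolute differences via madd, and widens to 64
// bits before accumulating.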
static uint64_t mse_4xh_16bit_highbd_avx2(uint16_t *dst, int dstride,
                                          uint16_t *src, int sstride, int h) {
  uint64_t sum = 0;
  __m128i reg0_4x16, reg1_4x16, reg2_4x16, reg3_4x16;
  __m256i src0_8x16, src1_8x16, src_16x16;
  __m256i dst0_8x16, dst1_8x16, dst_16x16;
  __m256i res0_4x64, res1_4x64, res2_4x64, res3_4x64;
  __m256i sub_result;
  const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128());
  __m256i square_result = _mm256_broadcastsi128_si256(_mm_setzero_si128());
  for (int i = 0; i < h; i += 4) {
    reg0_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 0) * dstride]));
    reg1_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 1) * dstride]));
    reg2_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 2) * dstride]));
    reg3_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 3) * dstride]));
    dst0_8x16 =
        _mm256_castsi128_si256(_mm_unpacklo_epi64(reg0_4x16, reg1_4x16));
    dst1_8x16 =
        _mm256_castsi128_si256(_mm_unpacklo_epi64(reg2_4x16, reg3_4x16));
    dst_16x16 = _mm256_permute2x128_si256(dst0_8x16, dst1_8x16, 0x20);

    reg0_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 0) * sstride]));
    reg1_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 1) * sstride]));
    reg2_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 2) * sstride]));
    reg3_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 3) * sstride]));
    src0_8x16 =
        _mm256_castsi128_si256(_mm_unpacklo_epi64(reg0_4x16, reg1_4x16));
    src1_8x16 =
        _mm256_castsi128_si256(_mm_unpacklo_epi64(reg2_4x16, reg3_4x16));
    src_16x16 = _mm256_permute2x128_si256(src0_8x16, src1_8x16, 0x20);

    sub_result = _mm256_abs_epi16(_mm256_sub_epi16(src_16x16, dst_16x16));

    src_16x16 = _mm256_unpacklo_epi16(sub_result, zeros);
    dst_16x16 = _mm256_unpackhi_epi16(sub_result, zeros);

    src_16x16 = _mm256_madd_epi16(src_16x16, src_16x16);
    dst_16x16 = _mm256_madd_epi16(dst_16x16, dst_16x16);

    res0_4x64 = _mm256_unpacklo_epi32(src_16x16, zeros);
    res1_4x64 = _mm256_unpackhi_epi32(src_16x16, zeros);
    res2_4x64 = _mm256_unpacklo_epi32(dst_16x16, zeros);
    res3_4x64 = _mm256_unpackhi_epi32(dst_16x16, zeros);

    square_result = _mm256_add_epi64(
        square_result,
        _mm256_add_epi64(
            _mm256_add_epi64(_mm256_add_epi64(res0_4x64, res1_4x64), res2_4x64),
            res3_4x64));
  }
  const __m128i sum_2x64 =
      _mm_add_epi64(_mm256_castsi256_si128(square_result),
                    _mm256_extracti128_si256(square_result, 1));
  const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
  xx_storel_64(&sum, sum_1x64);
  return sum;
}

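// 8-wide variant of the kernel above, packing two 8-pixel rows per 256-bit
// register.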
static uint64_t mse_8xh_16bit_highbd_avx2(uint16_t *dst, int dstride,
                                          uint16_t *src, int sstride, int h) {
  uint64_t sum = 0;
  __m256i src0_8x16, src1_8x16, src_16x16;
  __m256i dst0_8x16, dst1_8x16, dst_16x16;
  __m256i res0_4x64, res1_4x64, res2_4x64, res3_4x64;
  __m256i sub_result;
  const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128());
  __m256i square_result = _mm256_broadcastsi128_si256(_mm_setzero_si128());

  for (int i = 0; i < h; i += 2) {
    dst0_8x16 =
        _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&dst[i * dstride]));
    dst1_8x16 = _mm256_castsi128_si256(
        _mm_loadu_si128((__m128i *)&dst[(i + 1) * dstride]));
    dst_16x16 = _mm256_permute2x128_si256(dst0_8x16, dst1_8x16, 0x20);

    src0_8x16 =
        _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&src[i * sstride]));
    src1_8x16 = _mm256_castsi128_si256(
        _mm_loadu_si128((__m128i *)&src[(i + 1) * sstride]));
    src_16x16 = _mm256_permute2x128_si256(src0_8x16, src1_8x16, 0x20);

    sub_result = _mm256_abs_epi16(_mm256_sub_epi16(src_16x16, dst_16x16));

    src_16x16 = _mm256_unpacklo_epi16(sub_result, zeros);
    dst_16x16 = _mm256_unpackhi_epi16(sub_result, zeros);

    src_16x16 = _mm256_madd_epi16(src_16x16, src_16x16);
    dst_16x16 = _mm256_madd_epi16(dst_16x16, dst_16x16);

    res0_4x64 = _mm256_unpacklo_epi32(src_16x16, zeros);
    res1_4x64 = _mm256_unpackhi_epi32(src_16x16, zeros);
    res2_4x64 = _mm256_unpacklo_epi32(dst_16x16, zeros);
    res3_4x64 = _mm256_unpackhi_epi32(dst_16x16, zeros);

    square_result = _mm256_add_epi64(
        square_result,
        _mm256_add_epi64(
            _mm256_add_epi64(_mm256_add_epi64(res0_4x64, res1_4x64), res2_4x64),
            res3_4x64));
  }

  const __m128i sum_2x64 =
      _mm_add_epi64(_mm256_castsi256_si128(square_result),
                    _mm256_extracti128_si256(square_result, 1));
  const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
  xx_storel_64(&sum, sum_1x64);
  return sum;
}

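// Public entry point for 16-bit MSE; only the 4x4, 4x8, 8x4, and 8x8
// shapes are supported, as the assert enforces.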
uint64_t aom_mse_wxh_16bit_highbd_avx2(uint16_t *dst, int dstride,
                                       uint16_t *src, int sstride, int w,
                                       int h) {
  assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
         "w=8/4 and h=8/4 must be satisfied");
  switch (w) {
    case 4: return mse_4xh_16bit_highbd_avx2(dst, dstride, src, sstride, h);
    case 8: return mse_8xh_16bit_highbd_avx2(dst, dstride, src, sstride, h);
    default: assert(0 && "unsupported width"); return -1;
  }
}
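
/* A minimal usage sketch (illustrative only; the buffers, strides, and
 * values here are hypothetical, not part of this file):
 *
 *   uint16_t dst16[8 * 8], src16[8 * 8];
 *   // ... fill both 8x8 blocks with 10-bit samples ...
 *   uint64_t mse = aom_mse_wxh_16bit_highbd_avx2(dst16, 8, src16, 8, 8, 8);
 */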