tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

highbd_sad_avx2.c (27169B)


      1 /*
      2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
#include <assert.h>
#include <immintrin.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_dsp/x86/synonyms_avx2.h"
#include "aom_ports/mem.h"
     20 
     21 // SAD
// Horizontally reduce the eight 32-bit lanes of *v to a single scalar sum.
// Note: _mm256_srli_si256 shifts within each 128-bit lane, so the first add
// produces per-lane partial sums; the two 128-bit halves are then combined
// explicitly with extract/cast before the final in-lane reduction.
static inline unsigned int get_sad_from_mm256_epi32(const __m256i *v) {
  // input 8 32-bit summation
  __m128i lo128, hi128;
  __m256i u = _mm256_srli_si256(*v, 8);
  u = _mm256_add_epi32(u, *v);

  // 4 32-bit summation
  hi128 = _mm256_extracti128_si256(u, 1);
  lo128 = _mm256_castsi256_si128(u);
  lo128 = _mm_add_epi32(hi128, lo128);

  // 2 32-bit summation
  hi128 = _mm_srli_si128(lo128, 4);
  lo128 = _mm_add_epi32(lo128, hi128);

  return (unsigned int)_mm_cvtsi128_si32(lo128);
}
     39 
// Accumulate sum(|s[i] - r[i]|) over four vectors of sixteen 16-bit pixels
// into the 32-bit accumulator *sad_acc.  Both s[] and r[] are clobbered
// (used as scratch) by this routine.
static inline void highbd_sad16x4_core_avx2(__m256i *s, __m256i *r,
                                            __m256i *sad_acc) {
  const __m256i zero = _mm256_setzero_si256();
  int i;
  // Per-lane absolute difference, kept in 16 bits.
  for (i = 0; i < 4; i++) {
    s[i] = _mm256_sub_epi16(s[i], r[i]);
    s[i] = _mm256_abs_epi16(s[i]);
  }

  // Sum the four absolute-difference vectors while still 16-bit.
  s[0] = _mm256_add_epi16(s[0], s[1]);
  s[0] = _mm256_add_epi16(s[0], s[2]);
  s[0] = _mm256_add_epi16(s[0], s[3]);

  // Widen to 32 bits (interleave with zero) before folding into *sad_acc,
  // so further accumulation across many calls cannot overflow 16 bits.
  r[0] = _mm256_unpacklo_epi16(s[0], zero);
  r[1] = _mm256_unpackhi_epi16(s[0], zero);

  r[0] = _mm256_add_epi32(r[0], r[1]);
  *sad_acc = _mm256_add_epi32(*sad_acc, r[0]);
}
     59 
// If sec_ptr = 0, calculate regular SAD. Otherwise, calculate average SAD.
// Accumulates the SAD of a 16-wide, 4-row block of 16-bit pixels into
// *sad_acc.  When sec_ptr is non-NULL it points at 64 contiguous predictor
// samples (16 per row) that are rounded-averaged into the reference rows
// before the difference is taken.
static inline void sad16x4(const uint16_t *src_ptr, int src_stride,
                           const uint16_t *ref_ptr, int ref_stride,
                           const uint16_t *sec_ptr, __m256i *sad_acc) {
  __m256i s[4], r[4];
  s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
  s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
  s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride));
  s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride));

  r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
  r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
  r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride));
  r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride));

  if (sec_ptr) {
    // Second predictor rows are packed contiguously, 16 samples per row.
    r[0] = _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr));
    r[1] = _mm256_avg_epu16(
        r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
    r[2] = _mm256_avg_epu16(
        r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
    r[3] = _mm256_avg_epu16(
        r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
  }
  highbd_sad16x4_core_avx2(s, r, sad_acc);
}
     86 
     87 static AOM_FORCE_INLINE unsigned int aom_highbd_sad16xN_avx2(int N,
     88                                                             const uint8_t *src,
     89                                                             int src_stride,
     90                                                             const uint8_t *ref,
     91                                                             int ref_stride) {
     92  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);
     93  const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
     94  int i;
     95  __m256i sad = _mm256_setzero_si256();
     96  for (i = 0; i < N; i += 4) {
     97    sad16x4(src_ptr, src_stride, ref_ptr, ref_stride, NULL, &sad);
     98    src_ptr += src_stride << 2;
     99    ref_ptr += ref_stride << 2;
    100  }
    101  return (unsigned int)get_sad_from_mm256_epi32(&sad);
    102 }
    103 
    104 static void sad32x4(const uint16_t *src_ptr, int src_stride,
    105                    const uint16_t *ref_ptr, int ref_stride,
    106                    const uint16_t *sec_ptr, __m256i *sad_acc) {
    107  __m256i s[4], r[4];
    108  int row_sections = 0;
    109 
    110  while (row_sections < 2) {
    111    s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
    112    s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16));
    113    s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
    114    s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 16));
    115 
    116    r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
    117    r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16));
    118    r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
    119    r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 16));
    120 
    121    if (sec_ptr) {
    122      r[0] =
    123          _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr));
    124      r[1] = _mm256_avg_epu16(
    125          r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
    126      r[2] = _mm256_avg_epu16(
    127          r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
    128      r[3] = _mm256_avg_epu16(
    129          r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
    130      sec_ptr += 32 << 1;
    131    }
    132    highbd_sad16x4_core_avx2(s, r, sad_acc);
    133 
    134    row_sections += 1;
    135    src_ptr += src_stride << 1;
    136    ref_ptr += ref_stride << 1;
    137  }
    138 }
    139 
    140 static AOM_FORCE_INLINE unsigned int aom_highbd_sad32xN_avx2(int N,
    141                                                             const uint8_t *src,
    142                                                             int src_stride,
    143                                                             const uint8_t *ref,
    144                                                             int ref_stride) {
    145  __m256i sad = _mm256_setzero_si256();
    146  uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
    147  uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
    148  const int left_shift = 2;
    149  int i;
    150 
    151  for (i = 0; i < N; i += 4) {
    152    sad32x4(srcp, src_stride, refp, ref_stride, NULL, &sad);
    153    srcp += src_stride << left_shift;
    154    refp += ref_stride << left_shift;
    155  }
    156  return get_sad_from_mm256_epi32(&sad);
    157 }
    158 
// Accumulate SAD (average SAD when sec_ptr != NULL) of a 64x2 block of
// 16-bit pixels into *sad_acc.  One loop iteration per row; each 64-pixel
// row is four ymm loads, and the second predictor is packed 64 per row.
static void sad64x2(const uint16_t *src_ptr, int src_stride,
                    const uint16_t *ref_ptr, int ref_stride,
                    const uint16_t *sec_ptr, __m256i *sad_acc) {
  __m256i s[4], r[4];
  int i;
  for (i = 0; i < 2; i++) {
    s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
    s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16));
    s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32));
    s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48));

    r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
    r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16));
    r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32));
    r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48));
    if (sec_ptr) {
      r[0] =
          _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr));
      r[1] = _mm256_avg_epu16(
          r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
      r[2] = _mm256_avg_epu16(
          r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
      r[3] = _mm256_avg_epu16(
          r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
      sec_ptr += 64;  // one 64-sample predictor row consumed
    }
    highbd_sad16x4_core_avx2(s, r, sad_acc);
    src_ptr += src_stride;
    ref_ptr += ref_stride;
  }
}
    190 
    191 static AOM_FORCE_INLINE unsigned int aom_highbd_sad64xN_avx2(int N,
    192                                                             const uint8_t *src,
    193                                                             int src_stride,
    194                                                             const uint8_t *ref,
    195                                                             int ref_stride) {
    196  __m256i sad = _mm256_setzero_si256();
    197  uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
    198  uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
    199  const int left_shift = 1;
    200  int i;
    201  for (i = 0; i < N; i += 2) {
    202    sad64x2(srcp, src_stride, refp, ref_stride, NULL, &sad);
    203    srcp += src_stride << left_shift;
    204    refp += ref_stride << left_shift;
    205  }
    206  return get_sad_from_mm256_epi32(&sad);
    207 }
    208 
// Accumulate SAD (average SAD when sec_ptr != NULL) of a single 128-pixel
// row of 16-bit pixels into *sad_acc.  The row is processed as two
// 64-pixel halves; no stride is needed since only one row is read.
static void sad128x1(const uint16_t *src_ptr, const uint16_t *ref_ptr,
                     const uint16_t *sec_ptr, __m256i *sad_acc) {
  __m256i s[4], r[4];
  int i;
  for (i = 0; i < 2; i++) {
    s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
    s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16));
    s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32));
    s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48));
    r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
    r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16));
    r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32));
    r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48));
    if (sec_ptr) {
      r[0] =
          _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr));
      r[1] = _mm256_avg_epu16(
          r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
      r[2] = _mm256_avg_epu16(
          r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
      r[3] = _mm256_avg_epu16(
          r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
      sec_ptr += 64;  // move to the second half of the predictor row
    }
    highbd_sad16x4_core_avx2(s, r, sad_acc);
    src_ptr += 64;
    ref_ptr += 64;
  }
}
    238 
    239 static AOM_FORCE_INLINE unsigned int aom_highbd_sad128xN_avx2(
    240    int N, const uint8_t *src, int src_stride, const uint8_t *ref,
    241    int ref_stride) {
    242  __m256i sad = _mm256_setzero_si256();
    243  uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
    244  uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
    245  int row = 0;
    246  while (row < N) {
    247    sad128x1(srcp, refp, NULL, &sad);
    248    srcp += src_stride;
    249    refp += ref_stride;
    250    row++;
    251  }
    252  return get_sad_from_mm256_epi32(&sad);
    253 }
    254 
// Defines the exported aom_highbd_sad<m>x<n>_avx2() entry point by
// forwarding to the width-specific aom_highbd_sad<m>xN_avx2() helper.
#define HIGHBD_SADMXN_AVX2(m, n)                                            \
  unsigned int aom_highbd_sad##m##x##n##_avx2(                              \
      const uint8_t *src, int src_stride, const uint8_t *ref,               \
      int ref_stride) {                                                     \
    return aom_highbd_sad##m##xN_avx2(n, src, src_stride, ref, ref_stride); \
  }
    261 
// Defines aom_highbd_sad_skip_<m>x<n>_avx2(): SAD over every other row
// (doubled strides, half the rows), scaled by 2 to approximate the full SAD.
#define HIGHBD_SAD_SKIP_MXN_AVX2(m, n)                                       \
  unsigned int aom_highbd_sad_skip_##m##x##n##_avx2(                         \
      const uint8_t *src, int src_stride, const uint8_t *ref,                \
      int ref_stride) {                                                      \
    return 2 * aom_highbd_sad##m##xN_avx2((n / 2), src, 2 * src_stride, ref, \
                                          2 * ref_stride);                   \
  }
    269 
// Instantiate the exported full-SAD entry points, grouped by block width.
HIGHBD_SADMXN_AVX2(16, 8)
HIGHBD_SADMXN_AVX2(16, 16)
HIGHBD_SADMXN_AVX2(16, 32)

HIGHBD_SADMXN_AVX2(32, 16)
HIGHBD_SADMXN_AVX2(32, 32)
HIGHBD_SADMXN_AVX2(32, 64)

HIGHBD_SADMXN_AVX2(64, 32)
HIGHBD_SADMXN_AVX2(64, 64)
HIGHBD_SADMXN_AVX2(64, 128)

HIGHBD_SADMXN_AVX2(128, 64)
HIGHBD_SADMXN_AVX2(128, 128)

// Rectangular sizes only needed by the non-realtime encoder paths.
#if !CONFIG_REALTIME_ONLY
HIGHBD_SADMXN_AVX2(16, 4)
HIGHBD_SADMXN_AVX2(16, 64)
HIGHBD_SADMXN_AVX2(32, 8)
HIGHBD_SADMXN_AVX2(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

// Instantiate the row-subsampled ("skip") SAD entry points.
HIGHBD_SAD_SKIP_MXN_AVX2(16, 16)
HIGHBD_SAD_SKIP_MXN_AVX2(16, 32)

HIGHBD_SAD_SKIP_MXN_AVX2(32, 16)
HIGHBD_SAD_SKIP_MXN_AVX2(32, 32)
HIGHBD_SAD_SKIP_MXN_AVX2(32, 64)

HIGHBD_SAD_SKIP_MXN_AVX2(64, 32)
HIGHBD_SAD_SKIP_MXN_AVX2(64, 64)
HIGHBD_SAD_SKIP_MXN_AVX2(64, 128)

HIGHBD_SAD_SKIP_MXN_AVX2(128, 64)
HIGHBD_SAD_SKIP_MXN_AVX2(128, 128)

#if !CONFIG_REALTIME_ONLY
HIGHBD_SAD_SKIP_MXN_AVX2(16, 64)
HIGHBD_SAD_SKIP_MXN_AVX2(64, 16)
#endif  // !CONFIG_REALTIME_ONLY
    310 
    311 unsigned int aom_highbd_sad16x8_avg_avx2(const uint8_t *src, int src_stride,
    312                                         const uint8_t *ref, int ref_stride,
    313                                         const uint8_t *second_pred) {
    314  __m256i sad = _mm256_setzero_si256();
    315  uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
    316  uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
    317  uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
    318 
    319  sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad);
    320 
    321  // Next 4 rows
    322  srcp += src_stride << 2;
    323  refp += ref_stride << 2;
    324  secp += 64;
    325  sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad);
    326  return get_sad_from_mm256_epi32(&sad);
    327 }
    328 
    329 unsigned int aom_highbd_sad16x16_avg_avx2(const uint8_t *src, int src_stride,
    330                                          const uint8_t *ref, int ref_stride,
    331                                          const uint8_t *second_pred) {
    332  const int left_shift = 3;
    333  uint32_t sum = aom_highbd_sad16x8_avg_avx2(src, src_stride, ref, ref_stride,
    334                                             second_pred);
    335  src += src_stride << left_shift;
    336  ref += ref_stride << left_shift;
    337  second_pred += 16 << left_shift;
    338  sum += aom_highbd_sad16x8_avg_avx2(src, src_stride, ref, ref_stride,
    339                                     second_pred);
    340  return sum;
    341 }
    342 
    343 unsigned int aom_highbd_sad16x32_avg_avx2(const uint8_t *src, int src_stride,
    344                                          const uint8_t *ref, int ref_stride,
    345                                          const uint8_t *second_pred) {
    346  const int left_shift = 4;
    347  uint32_t sum = aom_highbd_sad16x16_avg_avx2(src, src_stride, ref, ref_stride,
    348                                              second_pred);
    349  src += src_stride << left_shift;
    350  ref += ref_stride << left_shift;
    351  second_pred += 16 << left_shift;
    352  sum += aom_highbd_sad16x16_avg_avx2(src, src_stride, ref, ref_stride,
    353                                      second_pred);
    354  return sum;
    355 }
    356 
    357 #if !CONFIG_REALTIME_ONLY
    358 unsigned int aom_highbd_sad16x64_avg_avx2(const uint8_t *src, int src_stride,
    359                                          const uint8_t *ref, int ref_stride,
    360                                          const uint8_t *second_pred) {
    361  const int left_shift = 5;
    362  uint32_t sum = aom_highbd_sad16x32_avg_avx2(src, src_stride, ref, ref_stride,
    363                                              second_pred);
    364  src += src_stride << left_shift;
    365  ref += ref_stride << left_shift;
    366  second_pred += 16 << left_shift;
    367  sum += aom_highbd_sad16x32_avg_avx2(src, src_stride, ref, ref_stride,
    368                                      second_pred);
    369  return sum;
    370 }
    371 
    372 unsigned int aom_highbd_sad32x8_avg_avx2(const uint8_t *src, int src_stride,
    373                                         const uint8_t *ref, int ref_stride,
    374                                         const uint8_t *second_pred) {
    375  __m256i sad = _mm256_setzero_si256();
    376  uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
    377  uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
    378  uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
    379  const int left_shift = 2;
    380  int row_section = 0;
    381 
    382  while (row_section < 2) {
    383    sad32x4(srcp, src_stride, refp, ref_stride, secp, &sad);
    384    srcp += src_stride << left_shift;
    385    refp += ref_stride << left_shift;
    386    secp += 32 << left_shift;
    387    row_section += 1;
    388  }
    389  return get_sad_from_mm256_epi32(&sad);
    390 }
    391 #endif  // !CONFIG_REALTIME_ONLY
    392 
    393 unsigned int aom_highbd_sad32x16_avg_avx2(const uint8_t *src, int src_stride,
    394                                          const uint8_t *ref, int ref_stride,
    395                                          const uint8_t *second_pred) {
    396  __m256i sad = _mm256_setzero_si256();
    397  uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
    398  uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
    399  uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
    400  const int left_shift = 2;
    401  int row_section = 0;
    402 
    403  while (row_section < 4) {
    404    sad32x4(srcp, src_stride, refp, ref_stride, secp, &sad);
    405    srcp += src_stride << left_shift;
    406    refp += ref_stride << left_shift;
    407    secp += 32 << left_shift;
    408    row_section += 1;
    409  }
    410  return get_sad_from_mm256_epi32(&sad);
    411 }
    412 
    413 unsigned int aom_highbd_sad32x32_avg_avx2(const uint8_t *src, int src_stride,
    414                                          const uint8_t *ref, int ref_stride,
    415                                          const uint8_t *second_pred) {
    416  const int left_shift = 4;
    417  uint32_t sum = aom_highbd_sad32x16_avg_avx2(src, src_stride, ref, ref_stride,
    418                                              second_pred);
    419  src += src_stride << left_shift;
    420  ref += ref_stride << left_shift;
    421  second_pred += 32 << left_shift;
    422  sum += aom_highbd_sad32x16_avg_avx2(src, src_stride, ref, ref_stride,
    423                                      second_pred);
    424  return sum;
    425 }
    426 
    427 unsigned int aom_highbd_sad32x64_avg_avx2(const uint8_t *src, int src_stride,
    428                                          const uint8_t *ref, int ref_stride,
    429                                          const uint8_t *second_pred) {
    430  const int left_shift = 5;
    431  uint32_t sum = aom_highbd_sad32x32_avg_avx2(src, src_stride, ref, ref_stride,
    432                                              second_pred);
    433  src += src_stride << left_shift;
    434  ref += ref_stride << left_shift;
    435  second_pred += 32 << left_shift;
    436  sum += aom_highbd_sad32x32_avg_avx2(src, src_stride, ref, ref_stride,
    437                                      second_pred);
    438  return sum;
    439 }
    440 
    441 #if !CONFIG_REALTIME_ONLY
    442 unsigned int aom_highbd_sad64x16_avg_avx2(const uint8_t *src, int src_stride,
    443                                          const uint8_t *ref, int ref_stride,
    444                                          const uint8_t *second_pred) {
    445  __m256i sad = _mm256_setzero_si256();
    446  uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
    447  uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
    448  uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
    449  const int left_shift = 1;
    450  int row_section = 0;
    451 
    452  while (row_section < 8) {
    453    sad64x2(srcp, src_stride, refp, ref_stride, secp, &sad);
    454    srcp += src_stride << left_shift;
    455    refp += ref_stride << left_shift;
    456    secp += 64 << left_shift;
    457    row_section += 1;
    458  }
    459  return get_sad_from_mm256_epi32(&sad);
    460 }
    461 #endif  // !CONFIG_REALTIME_ONLY
    462 
    463 unsigned int aom_highbd_sad64x32_avg_avx2(const uint8_t *src, int src_stride,
    464                                          const uint8_t *ref, int ref_stride,
    465                                          const uint8_t *second_pred) {
    466  __m256i sad = _mm256_setzero_si256();
    467  uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
    468  uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
    469  uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
    470  const int left_shift = 1;
    471  int row_section = 0;
    472 
    473  while (row_section < 16) {
    474    sad64x2(srcp, src_stride, refp, ref_stride, secp, &sad);
    475    srcp += src_stride << left_shift;
    476    refp += ref_stride << left_shift;
    477    secp += 64 << left_shift;
    478    row_section += 1;
    479  }
    480  return get_sad_from_mm256_epi32(&sad);
    481 }
    482 
    483 unsigned int aom_highbd_sad64x64_avg_avx2(const uint8_t *src, int src_stride,
    484                                          const uint8_t *ref, int ref_stride,
    485                                          const uint8_t *second_pred) {
    486  const int left_shift = 5;
    487  uint32_t sum = aom_highbd_sad64x32_avg_avx2(src, src_stride, ref, ref_stride,
    488                                              second_pred);
    489  src += src_stride << left_shift;
    490  ref += ref_stride << left_shift;
    491  second_pred += 64 << left_shift;
    492  sum += aom_highbd_sad64x32_avg_avx2(src, src_stride, ref, ref_stride,
    493                                      second_pred);
    494  return sum;
    495 }
    496 
    497 unsigned int aom_highbd_sad64x128_avg_avx2(const uint8_t *src, int src_stride,
    498                                           const uint8_t *ref, int ref_stride,
    499                                           const uint8_t *second_pred) {
    500  const int left_shift = 6;
    501  uint32_t sum = aom_highbd_sad64x64_avg_avx2(src, src_stride, ref, ref_stride,
    502                                              second_pred);
    503  src += src_stride << left_shift;
    504  ref += ref_stride << left_shift;
    505  second_pred += 64 << left_shift;
    506  sum += aom_highbd_sad64x64_avg_avx2(src, src_stride, ref, ref_stride,
    507                                      second_pred);
    508  return sum;
    509 }
    510 
    511 unsigned int aom_highbd_sad128x64_avg_avx2(const uint8_t *src, int src_stride,
    512                                           const uint8_t *ref, int ref_stride,
    513                                           const uint8_t *second_pred) {
    514  __m256i sad = _mm256_setzero_si256();
    515  uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
    516  uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
    517  uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
    518  int row = 0;
    519  while (row < 64) {
    520    sad128x1(srcp, refp, secp, &sad);
    521    srcp += src_stride;
    522    refp += ref_stride;
    523    secp += 16 << 3;
    524    row += 1;
    525  }
    526  return get_sad_from_mm256_epi32(&sad);
    527 }
    528 
    529 unsigned int aom_highbd_sad128x128_avg_avx2(const uint8_t *src, int src_stride,
    530                                            const uint8_t *ref, int ref_stride,
    531                                            const uint8_t *second_pred) {
    532  unsigned int sum;
    533  const int left_shift = 6;
    534 
    535  sum = aom_highbd_sad128x64_avg_avx2(src, src_stride, ref, ref_stride,
    536                                      second_pred);
    537  src += src_stride << left_shift;
    538  ref += ref_stride << left_shift;
    539  second_pred += 128 << left_shift;
    540  sum += aom_highbd_sad128x64_avg_avx2(src, src_stride, ref, ref_stride,
    541                                       second_pred);
    542  return sum;
    543 }
    544 
// SAD 4D
// Combine 4 __m256i input vectors  v to uint32_t result[4]
// Each v[k] holds eight 32-bit partial sums for reference k; the routine
// reduces each vector to a single total and stores the four totals to
// res[0..3].  The mask keeps only the low 32 bits of each 64-bit group so
// adjacent pair-sums can be re-packed without carrying garbage upward.
static inline void get_4d_sad_from_mm256_epi32(const __m256i *v,
                                               uint32_t *res) {
  __m256i u0, u1, u2, u3;
  const __m256i mask = _mm256_set1_epi64x(~0u);  // 0x00000000FFFFFFFF per qword
  __m128i sad;

  // 8 32-bit summation
  u0 = _mm256_srli_si256(v[0], 4);
  u1 = _mm256_srli_si256(v[1], 4);
  u2 = _mm256_srli_si256(v[2], 4);
  u3 = _mm256_srli_si256(v[3], 4);

  u0 = _mm256_add_epi32(u0, v[0]);
  u1 = _mm256_add_epi32(u1, v[1]);
  u2 = _mm256_add_epi32(u2, v[2]);
  u3 = _mm256_add_epi32(u3, v[3]);

  u0 = _mm256_and_si256(u0, mask);
  u1 = _mm256_and_si256(u1, mask);
  u2 = _mm256_and_si256(u2, mask);
  u3 = _mm256_and_si256(u3, mask);
  // 4 32-bit summation, evenly positioned

  u1 = _mm256_slli_si256(u1, 4);
  u3 = _mm256_slli_si256(u3, 4);

  u0 = _mm256_or_si256(u0, u1);
  u2 = _mm256_or_si256(u2, u3);
  // 8 32-bit summation, interleaved

  u1 = _mm256_unpacklo_epi64(u0, u2);
  u3 = _mm256_unpackhi_epi64(u0, u2);

  u0 = _mm256_add_epi32(u1, u3);
  // Fold the two 128-bit halves; the low xmm then holds res[0..3] in order.
  sad = _mm_add_epi32(_mm256_extractf128_si256(u0, 1),
                      _mm256_castsi256_si128(u0));
  _mm_storeu_si128((__m128i *)res, sad);
}
    585 
    586 static void convert_pointers(const uint8_t *const ref8[],
    587                             const uint16_t *ref[]) {
    588  ref[0] = CONVERT_TO_SHORTPTR(ref8[0]);
    589  ref[1] = CONVERT_TO_SHORTPTR(ref8[1]);
    590  ref[2] = CONVERT_TO_SHORTPTR(ref8[2]);
    591  ref[3] = CONVERT_TO_SHORTPTR(ref8[3]);
    592 }
    593 
    594 static void init_sad(__m256i *s) {
    595  s[0] = _mm256_setzero_si256();
    596  s[1] = _mm256_setzero_si256();
    597  s[2] = _mm256_setzero_si256();
    598  s[3] = _mm256_setzero_si256();
    599 }
    600 
    601 static AOM_FORCE_INLINE void aom_highbd_sadMxNxD_avx2(
    602    int M, int N, int D, const uint8_t *src, int src_stride,
    603    const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]) {
    604  __m256i sad_vec[4];
    605  const uint16_t *refp[4];
    606  const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
    607  const uint16_t *srcp;
    608  const int shift_for_rows = (M < 128) + (M < 64);
    609  const int row_units = 1 << shift_for_rows;
    610  int i, r;
    611 
    612  init_sad(sad_vec);
    613  convert_pointers(ref_array, refp);
    614 
    615  for (i = 0; i < D; ++i) {
    616    srcp = keep;
    617    for (r = 0; r < N; r += row_units) {
    618      if (M == 128) {
    619        sad128x1(srcp, refp[i], NULL, &sad_vec[i]);
    620      } else if (M == 64) {
    621        sad64x2(srcp, src_stride, refp[i], ref_stride, NULL, &sad_vec[i]);
    622      } else if (M == 32) {
    623        sad32x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
    624      } else if (M == 16) {
    625        sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
    626      } else {
    627        assert(0);
    628      }
    629      srcp += src_stride << shift_for_rows;
    630      refp[i] += ref_stride << shift_for_rows;
    631    }
    632  }
    633  get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
    634 }
    635 
// Defines aom_highbd_sad<m>x<n>x4d_avx2(): four SADs of one source block
// against four references in a single call.
#define HIGHBD_SAD_MXNX4D_AVX2(m, n)                                          \
  void aom_highbd_sad##m##x##n##x4d_avx2(                                     \
      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4],  \
      int ref_stride, uint32_t sad_array[4]) {                                \
    aom_highbd_sadMxNxD_avx2(m, n, 4, src, src_stride, ref_array, ref_stride, \
                             sad_array);                                      \
  }
// Defines the row-subsampled 4D variant: every other row at doubled
// strides, with each of the four results scaled by 2 afterwards.
#define HIGHBD_SAD_SKIP_MXNX4D_AVX2(m, n)                                    \
  void aom_highbd_sad_skip_##m##x##n##x4d_avx2(                              \
      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
      int ref_stride, uint32_t sad_array[4]) {                               \
    aom_highbd_sadMxNxD_avx2(m, (n / 2), 4, src, 2 * src_stride, ref_array,  \
                             2 * ref_stride, sad_array);                     \
    sad_array[0] <<= 1;                                                      \
    sad_array[1] <<= 1;                                                      \
    sad_array[2] <<= 1;                                                      \
    sad_array[3] <<= 1;                                                      \
  }
// Defines the 3-reference variant; sad_array[3] is left untouched.
#define HIGHBD_SAD_MXNX3D_AVX2(m, n)                                          \
  void aom_highbd_sad##m##x##n##x3d_avx2(                                     \
      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4],  \
      int ref_stride, uint32_t sad_array[4]) {                                \
    aom_highbd_sadMxNxD_avx2(m, n, 3, src, src_stride, ref_array, ref_stride, \
                             sad_array);                                      \
  }
    661 
// Instantiate the exported 4D, skip-4D, and 3D SAD entry points,
// grouped by block width.
HIGHBD_SAD_MXNX4D_AVX2(16, 8)
HIGHBD_SAD_MXNX4D_AVX2(16, 16)
HIGHBD_SAD_MXNX4D_AVX2(16, 32)

HIGHBD_SAD_MXNX4D_AVX2(32, 16)
HIGHBD_SAD_MXNX4D_AVX2(32, 32)
HIGHBD_SAD_MXNX4D_AVX2(32, 64)

HIGHBD_SAD_MXNX4D_AVX2(64, 32)
HIGHBD_SAD_MXNX4D_AVX2(64, 64)
HIGHBD_SAD_MXNX4D_AVX2(64, 128)

HIGHBD_SAD_MXNX4D_AVX2(128, 64)
HIGHBD_SAD_MXNX4D_AVX2(128, 128)

#if !CONFIG_REALTIME_ONLY
HIGHBD_SAD_MXNX4D_AVX2(16, 4)
HIGHBD_SAD_MXNX4D_AVX2(16, 64)
HIGHBD_SAD_MXNX4D_AVX2(32, 8)
HIGHBD_SAD_MXNX4D_AVX2(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

HIGHBD_SAD_SKIP_MXNX4D_AVX2(16, 16)
HIGHBD_SAD_SKIP_MXNX4D_AVX2(16, 32)

HIGHBD_SAD_SKIP_MXNX4D_AVX2(32, 16)
HIGHBD_SAD_SKIP_MXNX4D_AVX2(32, 32)
HIGHBD_SAD_SKIP_MXNX4D_AVX2(32, 64)

HIGHBD_SAD_SKIP_MXNX4D_AVX2(64, 32)
HIGHBD_SAD_SKIP_MXNX4D_AVX2(64, 64)
HIGHBD_SAD_SKIP_MXNX4D_AVX2(64, 128)

HIGHBD_SAD_SKIP_MXNX4D_AVX2(128, 64)
HIGHBD_SAD_SKIP_MXNX4D_AVX2(128, 128)

#if !CONFIG_REALTIME_ONLY
HIGHBD_SAD_SKIP_MXNX4D_AVX2(16, 64)
HIGHBD_SAD_SKIP_MXNX4D_AVX2(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

HIGHBD_SAD_MXNX3D_AVX2(16, 8)
HIGHBD_SAD_MXNX3D_AVX2(16, 16)
HIGHBD_SAD_MXNX3D_AVX2(16, 32)

HIGHBD_SAD_MXNX3D_AVX2(32, 16)
HIGHBD_SAD_MXNX3D_AVX2(32, 32)
HIGHBD_SAD_MXNX3D_AVX2(32, 64)

HIGHBD_SAD_MXNX3D_AVX2(64, 32)
HIGHBD_SAD_MXNX3D_AVX2(64, 64)
HIGHBD_SAD_MXNX3D_AVX2(64, 128)

HIGHBD_SAD_MXNX3D_AVX2(128, 64)
HIGHBD_SAD_MXNX3D_AVX2(128, 128)

#if !CONFIG_REALTIME_ONLY
HIGHBD_SAD_MXNX3D_AVX2(16, 4)
HIGHBD_SAD_MXNX3D_AVX2(16, 64)
HIGHBD_SAD_MXNX3D_AVX2(32, 8)
HIGHBD_SAD_MXNX3D_AVX2(64, 16)
#endif  // !CONFIG_REALTIME_ONLY