tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

blend_a64_mask_avx2.c (58126B)


      1 /*
      2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <smmintrin.h>  // SSE4.1
     13 #include <immintrin.h>  // AVX2
     14 
     15 #include <assert.h>
     16 
     17 #include "aom/aom_integer.h"
     18 #include "aom_ports/mem.h"
     19 #include "aom_dsp/aom_dsp_common.h"
     20 
     21 #include "aom_dsp/x86/synonyms.h"
     22 #include "aom_dsp/x86/synonyms_avx2.h"
     23 #include "aom_dsp/x86/blend_sse4.h"
     24 #include "aom_dsp/x86/blend_mask_sse4.h"
     25 
     26 #include "config/aom_dsp_rtcd.h"
     27 
     28 static inline void blend_a64_d16_mask_w16_avx2(
     29    uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
     30    const __m256i *m0, const __m256i *v_round_offset, const __m256i *v_maxval,
     31    int shift) {
     32  const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0);
     33  const __m256i s0_0 = yy_loadu_256(src0);
     34  const __m256i s1_0 = yy_loadu_256(src1);
     35  __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0),
     36                                      _mm256_unpacklo_epi16(*m0, max_minus_m0));
     37  __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0),
     38                                      _mm256_unpackhi_epi16(*m0, max_minus_m0));
     39  res0_lo =
     40      _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift);
     41  res0_hi =
     42      _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift);
     43  const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi);
     44  __m256i res = _mm256_packus_epi16(res0, res0);
     45  res = _mm256_permute4x64_epi64(res, 0xd8);
     46  _mm_storeu_si128((__m128i *)(dst), _mm256_castsi256_si128(res));
     47 }
     48 
     49 static inline void blend_a64_d16_mask_w32_avx2(
     50    uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
     51    const __m256i *m0, const __m256i *m1, const __m256i *v_round_offset,
     52    const __m256i *v_maxval, int shift) {
     53  const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0);
     54  const __m256i max_minus_m1 = _mm256_sub_epi16(*v_maxval, *m1);
     55  const __m256i s0_0 = yy_loadu_256(src0);
     56  const __m256i s0_1 = yy_loadu_256(src0 + 16);
     57  const __m256i s1_0 = yy_loadu_256(src1);
     58  const __m256i s1_1 = yy_loadu_256(src1 + 16);
     59  __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0),
     60                                      _mm256_unpacklo_epi16(*m0, max_minus_m0));
     61  __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0),
     62                                      _mm256_unpackhi_epi16(*m0, max_minus_m0));
     63  __m256i res1_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_1, s1_1),
     64                                      _mm256_unpacklo_epi16(*m1, max_minus_m1));
     65  __m256i res1_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_1, s1_1),
     66                                      _mm256_unpackhi_epi16(*m1, max_minus_m1));
     67  res0_lo =
     68      _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift);
     69  res0_hi =
     70      _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift);
     71  res1_lo =
     72      _mm256_srai_epi32(_mm256_sub_epi32(res1_lo, *v_round_offset), shift);
     73  res1_hi =
     74      _mm256_srai_epi32(_mm256_sub_epi32(res1_hi, *v_round_offset), shift);
     75  const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi);
     76  const __m256i res1 = _mm256_packs_epi32(res1_lo, res1_hi);
     77  __m256i res = _mm256_packus_epi16(res0, res1);
     78  res = _mm256_permute4x64_epi64(res, 0xd8);
     79  _mm256_storeu_si256((__m256i *)(dst), res);
     80 }
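/*
 * Editor's note, not part of upstream libaom: a scalar sketch of the
 * arithmetic that blend_a64_d16_mask_w16_avx2 and blend_a64_d16_mask_w32_avx2
 * vectorize. The madd of the interleaved sources against the interleaved
 * (mask, 64 - mask) values produces m*s0 + (64 - m)*s1 per pixel; the compound
 * rounding offset is then subtracted, the result is shifted down
 * (arithmetically, matching _mm256_srai_epi32) and saturated to 8 bits by the
 * packs/packus pair. The helper name below is hypothetical.
 */
static inline uint8_t blend_a64_d16_scalar(CONV_BUF_TYPE s0, CONV_BUF_TYPE s1,
                                           int m, int32_t round_offset,
                                           int shift) {
  const int32_t v =
      ((m * (int32_t)s0 + (AOM_BLEND_A64_MAX_ALPHA - m) * (int32_t)s1) -
       round_offset) >>
      shift;
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}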
     81 
     82 static inline void lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
     83    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
     84    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
     85    const uint8_t *mask, uint32_t mask_stride, int h,
     86    const __m256i *round_offset, int shift) {
     87  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
     88  for (int i = 0; i < h; ++i) {
     89    const __m128i m = xx_loadu_128(mask);
     90    const __m256i m0 = _mm256_cvtepu8_epi16(m);
     91 
     92    blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval,
     93                                shift);
     94    mask += mask_stride;
     95    dst += dst_stride;
     96    src0 += src0_stride;
     97    src1 += src1_stride;
     98  }
     99 }
    100 
    101 static inline void lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2(
    102    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    103    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    104    const uint8_t *mask, uint32_t mask_stride, int h, int w,
    105    const __m256i *round_offset, int shift) {
    106  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
    107  for (int i = 0; i < h; ++i) {
    108    for (int j = 0; j < w; j += 32) {
    109      const __m256i m = yy_loadu_256(mask + j);
    110      const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m));
    111      const __m256i m1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m, 1));
    112 
    113      blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
    114                                  round_offset, &v_maxval, shift);
    115    }
    116    mask += mask_stride;
    117    dst += dst_stride;
    118    src0 += src0_stride;
    119    src1 += src1_stride;
    120  }
    121 }
    122 
    123 static inline void lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
    124    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    125    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    126    const uint8_t *mask, uint32_t mask_stride, int h,
    127    const __m256i *round_offset, int shift) {
    128  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
    129  const __m256i one_b = _mm256_set1_epi8(1);
    130  const __m256i two_w = _mm256_set1_epi16(2);
    131  for (int i = 0; i < h; ++i) {
    132    const __m256i m_i00 = yy_loadu_256(mask);
    133    const __m256i m_i10 = yy_loadu_256(mask + mask_stride);
    134 
    135    const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10);
    136    const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b);
    137    const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2);
    138 
    139    blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval,
    140                                shift);
    141    mask += mask_stride << 1;
    142    dst += dst_stride;
    143    src0 += src0_stride;
    144    src1 += src1_stride;
    145  }
    146 }
    147 
    148 static inline void lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2(
    149    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    150    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    151    const uint8_t *mask, uint32_t mask_stride, int h, int w,
    152    const __m256i *round_offset, int shift) {
    153  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
    154  const __m256i one_b = _mm256_set1_epi8(1);
    155  const __m256i two_w = _mm256_set1_epi16(2);
    156  for (int i = 0; i < h; ++i) {
    157    for (int j = 0; j < w; j += 32) {
    158      const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
    159      const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32);
    160      const __m256i m_i10 = yy_loadu_256(mask + mask_stride + 2 * j);
    161      const __m256i m_i11 = yy_loadu_256(mask + mask_stride + 2 * j + 32);
    162 
    163      const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10);
    164      const __m256i m1_ac = _mm256_adds_epu8(m_i01, m_i11);
    165      const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b);
    166      const __m256i m1_acbd = _mm256_maddubs_epi16(m1_ac, one_b);
    167      const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2);
    168      const __m256i m1 = _mm256_srli_epi16(_mm256_add_epi16(m1_acbd, two_w), 2);
    169 
    170      blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
    171                                  round_offset, &v_maxval, shift);
    172    }
    173    mask += mask_stride << 1;
    174    dst += dst_stride;
    175    src0 += src0_stride;
    176    src1 += src1_stride;
    177  }
    178 }
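/*
 * Editor's note, not part of upstream libaom: scalar equivalent of the mask
 * downsample used by the subw1/subh1 kernels above. Each 16-bit mask value fed
 * to the blend is the rounded average of a 2x2 block of 8-bit mask samples;
 * the vector code forms it with _mm256_adds_epu8 (add the two rows),
 * _mm256_maddubs_epi16 against a vector of ones (add adjacent columns), then
 * (x + 2) >> 2. The helper name below is hypothetical.
 */
static inline int downsample_mask_2x2_scalar(const uint8_t *mask,
                                             uint32_t mask_stride, int j) {
  return (mask[2 * j] + mask[2 * j + 1] + mask[mask_stride + 2 * j] +
          mask[mask_stride + 2 * j + 1] + 2) >>
         2;
}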
    179 
    180 static inline void lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2(
    181    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    182    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    183    const uint8_t *mask, uint32_t mask_stride, int h, int w,
    184    const __m256i *round_offset, int shift) {
    185  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
    186  const __m256i one_b = _mm256_set1_epi8(1);
    187  const __m256i zeros = _mm256_setzero_si256();
    188  for (int i = 0; i < h; ++i) {
    189    for (int j = 0; j < w; j += 16) {
    190      const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
    191      const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b);
    192      const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros);
    193 
    194      blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0,
    195                                  round_offset, &v_maxval, shift);
    196    }
    197    mask += mask_stride;
    198    dst += dst_stride;
    199    src0 += src0_stride;
    200    src1 += src1_stride;
    201  }
    202 }
    203 
    204 static inline void lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2(
    205    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    206    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    207    const uint8_t *mask, uint32_t mask_stride, int h, int w,
    208    const __m256i *round_offset, int shift) {
    209  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
    210  const __m256i one_b = _mm256_set1_epi8(1);
    211  const __m256i zeros = _mm256_setzero_si256();
    212  for (int i = 0; i < h; ++i) {
    213    for (int j = 0; j < w; j += 32) {
    214      const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
    215      const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32);
    216      const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b);
    217      const __m256i m1_ac = _mm256_maddubs_epi16(m_i01, one_b);
    218      const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros);
    219      const __m256i m1 = _mm256_avg_epu16(m1_ac, zeros);
    220 
    221      blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
    222                                  round_offset, &v_maxval, shift);
    223    }
    224    mask += mask_stride;
    225    dst += dst_stride;
    226    src0 += src0_stride;
    227    src1 += src1_stride;
    228  }
    229 }
    230 
    231 static inline void lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2(
    232    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    233    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    234    const uint8_t *mask, uint32_t mask_stride, int h, int w,
    235    const __m256i *round_offset, int shift) {
    236  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
    237  const __m128i zeros = _mm_setzero_si128();
    238  for (int i = 0; i < h; ++i) {
    239    for (int j = 0; j < w; j += 16) {
    240      const __m128i m_i00 = xx_loadu_128(mask + j);
    241      const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j);
    242 
    243      const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros);
    244      const __m256i m0 = _mm256_cvtepu8_epi16(m_ac);
    245 
    246      blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0,
    247                                  round_offset, &v_maxval, shift);
    248    }
    249    mask += mask_stride << 1;
    250    dst += dst_stride;
    251    src0 += src0_stride;
    252    src1 += src1_stride;
    253  }
    254 }
    255 
    256 static inline void lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2(
    257    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    258    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    259    const uint8_t *mask, uint32_t mask_stride, int h, int w,
    260    const __m256i *round_offset, int shift) {
    261  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
    262  const __m256i zeros = _mm256_setzero_si256();
    263  for (int i = 0; i < h; ++i) {
    264    for (int j = 0; j < w; j += 32) {
    265      const __m256i m_i00 = yy_loadu_256(mask + j);
    266      const __m256i m_i10 = yy_loadu_256(mask + mask_stride + j);
    267 
    268      const __m256i m_ac =
    269          _mm256_avg_epu8(_mm256_adds_epu8(m_i00, m_i10), zeros);
    270      const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m_ac));
    271      const __m256i m1 =
    272          _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m_ac, 1));
    273 
    274      blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
    275                                  round_offset, &v_maxval, shift);
    276    }
    277    mask += mask_stride << 1;
    278    dst += dst_stride;
    279    src0 += src0_stride;
    280    src1 += src1_stride;
    281  }
    282 }
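/*
 * Editor's note, not part of upstream libaom: the subw1/subh0 and subw0/subh1
 * kernels above downsample the mask along one axis only, so each mask value is
 * the rounded average of two neighbouring 8-bit samples, (a + b + 1) >> 1. The
 * horizontal case forms it with maddubs against ones followed by an unsigned
 * average with zero; the vertical case adds the two rows (saturating) and
 * averages with zero. A scalar sketch (helper name hypothetical):
 */
static inline int downsample_mask_1d_scalar(uint8_t a, uint8_t b) {
  return (a + b + 1) >> 1;
}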
    283 
    284 void aom_lowbd_blend_a64_d16_mask_avx2(
    285    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    286    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    287    const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
    288    ConvolveParams *conv_params) {
    289  const int bd = 8;
    290  const int round_bits =
    291      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
    292 
    293  const int round_offset =
    294      ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
    295       (1 << (round_bits - 1)))
    296      << AOM_BLEND_A64_ROUND_BITS;
    297 
    298  const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
    299  assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
    300  assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
    301 
    302  assert(h >= 4);
    303  assert(w >= 4);
    304  assert(IS_POWER_OF_TWO(h));
    305  assert(IS_POWER_OF_TWO(w));
    306  const __m128i v_round_offset = _mm_set1_epi32(round_offset);
    307  const __m256i y_round_offset = _mm256_set1_epi32(round_offset);
    308 
    309  if (subw == 0 && subh == 0) {
    310    switch (w) {
    311      case 4:
    312        aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
    313            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
    314            mask_stride, h, &v_round_offset, shift);
    315        break;
    316      case 8:
    317        aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
    318            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
    319            mask_stride, h, &v_round_offset, shift);
    320        break;
    321      case 16:
    322        lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
    323            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
    324            mask_stride, h, &y_round_offset, shift);
    325        break;
    326      default:
    327        lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2(
    328            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
    329            mask_stride, h, w, &y_round_offset, shift);
    330        break;
    331    }
    332  } else if (subw == 1 && subh == 1) {
    333    switch (w) {
    334      case 4:
    335        aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
    336            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
    337            mask_stride, h, &v_round_offset, shift);
    338        break;
    339      case 8:
    340        aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
    341            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
    342            mask_stride, h, &v_round_offset, shift);
    343        break;
    344      case 16:
    345        lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
    346            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
    347            mask_stride, h, &y_round_offset, shift);
    348        break;
    349      default:
    350        lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2(
    351            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
    352            mask_stride, h, w, &y_round_offset, shift);
    353        break;
    354    }
    355  } else if (subw == 1 && subh == 0) {
    356    switch (w) {
    357      case 4:
    358        aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
    359            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
    360            mask_stride, h, &v_round_offset, shift);
    361        break;
    362      case 8:
    363        aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
    364            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
    365            mask_stride, h, &v_round_offset, shift);
    366        break;
    367      case 16:
    368        lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2(
    369            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
    370            mask_stride, h, w, &y_round_offset, shift);
    371        break;
    372      default:
    373        lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2(
    374            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
    375            mask_stride, h, w, &y_round_offset, shift);
    376        break;
    377    }
    378  } else {
    379    switch (w) {
    380      case 4:
    381        aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
    382            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
    383            mask_stride, h, &v_round_offset, shift);
    384        break;
    385      case 8:
    386        aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
    387            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
    388            mask_stride, h, &v_round_offset, shift);
    389        break;
    390      case 16:
    391        lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2(
    392            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
    393            mask_stride, h, w, &y_round_offset, shift);
    394        break;
    395      default:
    396        lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2(
    397            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
    398            mask_stride, h, w, &y_round_offset, shift);
    399        break;
    400    }
    401  }
    402 }
    403 
    404 static inline __m256i blend_16_u8_avx2(const uint8_t *src0, const uint8_t *src1,
    405                                       const __m256i *v_m0_b,
    406                                       const __m256i *v_m1_b,
    407                                       const int32_t bits) {
    408  const __m256i v_s0_b = _mm256_castsi128_si256(xx_loadu_128(src0));
    409  const __m256i v_s1_b = _mm256_castsi128_si256(xx_loadu_128(src1));
    410  const __m256i v_s0_s_b = _mm256_permute4x64_epi64(v_s0_b, 0xd8);
    411  const __m256i v_s1_s_b = _mm256_permute4x64_epi64(v_s1_b, 0xd8);
    412 
    413  const __m256i v_p0_w =
    414      _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_s_b, v_s1_s_b),
    415                           _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b));
    416 
    417  const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits);
    418  const __m256i v_res_b = _mm256_packus_epi16(v_res0_w, v_res0_w);
    419  const __m256i v_res = _mm256_permute4x64_epi64(v_res_b, 0xd8);
    420  return v_res;
    421 }
    422 
    423 static inline __m256i blend_32_u8_avx2(const uint8_t *src0, const uint8_t *src1,
    424                                       const __m256i *v_m0_b,
    425                                       const __m256i *v_m1_b,
    426                                       const int32_t bits) {
    427  const __m256i v_s0_b = yy_loadu_256(src0);
    428  const __m256i v_s1_b = yy_loadu_256(src1);
    429 
    430  const __m256i v_p0_w =
    431      _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_b, v_s1_b),
    432                           _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b));
    433  const __m256i v_p1_w =
    434      _mm256_maddubs_epi16(_mm256_unpackhi_epi8(v_s0_b, v_s1_b),
    435                           _mm256_unpackhi_epi8(*v_m0_b, *v_m1_b));
    436 
    437  const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits);
    438  const __m256i v_res1_w = yy_roundn_epu16(v_p1_w, bits);
    439  const __m256i v_res = _mm256_packus_epi16(v_res0_w, v_res1_w);
    440  return v_res;
    441 }
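/*
 * Editor's note, not part of upstream libaom: per-pixel arithmetic performed by
 * blend_16_u8_avx2 and blend_32_u8_avx2 above. _mm256_maddubs_epi16 over the
 * byte-interleaved sources and masks computes m*s0 + (64 - m)*s1, and
 * yy_roundn_epu16 applies the rounding right-shift by AOM_BLEND_A64_ROUND_BITS.
 * Scalar sketch (helper name hypothetical):
 */
static inline uint8_t blend_a64_u8_scalar(uint8_t s0, uint8_t s1, int m) {
  return (uint8_t)((m * s0 + (AOM_BLEND_A64_MAX_ALPHA - m) * s1 +
                    (1 << (AOM_BLEND_A64_ROUND_BITS - 1))) >>
                   AOM_BLEND_A64_ROUND_BITS);
}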
    442 
    443 static inline void blend_a64_mask_sx_sy_w16_avx2(
    444    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    445    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    446    const uint8_t *mask, uint32_t mask_stride, int h) {
    447  const __m256i v_zmask_b = _mm256_set1_epi16(0xFF);
    448  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
    449  do {
    450    const __m256i v_ral_b = yy_loadu_256(mask);
    451    const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride);
    452    const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b);
    453    const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b);
    454    const __m256i v_rvsbl_w =
    455        _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b);
    456    const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w);
    457 
    458    const __m256i v_m0_w = yy_roundn_epu16(v_rsl_w, 2);
    459    const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, v_m0_w);
    460    const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
    461 
    462    const __m256i y_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b,
    463                                             AOM_BLEND_A64_ROUND_BITS);
    464 
    465    xx_storeu_128(dst, _mm256_castsi256_si128(y_res_b));
    466    dst += dst_stride;
    467    src0 += src0_stride;
    468    src1 += src1_stride;
    469    mask += 2 * mask_stride;
    470  } while (--h);
    471 }
    472 
    473 static inline void blend_a64_mask_sx_sy_w32n_avx2(
    474    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    475    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    476    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
    477  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
    478  const __m256i v_zmask_b = _mm256_set1_epi16(0xFF);
    479  do {
    480    int c;
    481    for (c = 0; c < w; c += 32) {
    482      const __m256i v_ral_b = yy_loadu_256(mask + 2 * c);
    483      const __m256i v_rah_b = yy_loadu_256(mask + 2 * c + 32);
    484      const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride + 2 * c);
    485      const __m256i v_rbh_b = yy_loadu_256(mask + mask_stride + 2 * c + 32);
    486      const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b);
    487      const __m256i v_rvsh_b = _mm256_add_epi8(v_rah_b, v_rbh_b);
    488      const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b);
    489      const __m256i v_rvsah_w = _mm256_and_si256(v_rvsh_b, v_zmask_b);
    490      const __m256i v_rvsbl_w =
    491          _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b);
    492      const __m256i v_rvsbh_w =
    493          _mm256_and_si256(_mm256_srli_si256(v_rvsh_b, 1), v_zmask_b);
    494      const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w);
    495      const __m256i v_rsh_w = _mm256_add_epi16(v_rvsah_w, v_rvsbh_w);
    496 
    497      const __m256i v_m0l_w = yy_roundn_epu16(v_rsl_w, 2);
    498      const __m256i v_m0h_w = yy_roundn_epu16(v_rsh_w, 2);
    499      const __m256i v_m0_b =
    500          _mm256_permute4x64_epi64(_mm256_packus_epi16(v_m0l_w, v_m0h_w), 0xd8);
    501      const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
    502 
    503      const __m256i v_res_b = blend_32_u8_avx2(
    504          src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
    505 
    506      yy_storeu_256(dst + c, v_res_b);
    507    }
    508    dst += dst_stride;
    509    src0 += src0_stride;
    510    src1 += src1_stride;
    511    mask += 2 * mask_stride;
    512  } while (--h);
    513 }
    514 
    515 static inline void blend_a64_mask_sx_sy_avx2(
    516    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    517    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    518    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
    519  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
    520  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
    521  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
    522  switch (w) {
    523    case 4:
    524      do {
    525        const __m128i v_ra_b = xx_loadl_64(mask);
    526        const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
    527        const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
    528        const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
    529        const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
    530        const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
    531        const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
    532        const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
    533        const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
    534        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
    535 
    536        const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
    537 
    538        xx_storel_32(dst, v_res_b);
    539 
    540        dst += dst_stride;
    541        src0 += src0_stride;
    542        src1 += src1_stride;
    543        mask += 2 * mask_stride;
    544      } while (--h);
    545      break;
    546    case 8:
    547      do {
    548        const __m128i v_ra_b = xx_loadu_128(mask);
    549        const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
    550        const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
    551        const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
    552        const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
    553        const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
    554        const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
    555        const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
    556        const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
    557        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
    558 
    559        const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
    560 
    561        xx_storel_64(dst, v_res_b);
    562 
    563        dst += dst_stride;
    564        src0 += src0_stride;
    565        src1 += src1_stride;
    566        mask += 2 * mask_stride;
    567      } while (--h);
    568      break;
    569    case 16:
    570      blend_a64_mask_sx_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
    571                                    src1_stride, mask, mask_stride, h);
    572      break;
    573    default:
    574      blend_a64_mask_sx_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
    575                                     src1_stride, mask, mask_stride, w, h);
    576      break;
    577  }
    578 }
    579 
    580 static inline void blend_a64_mask_sx_w16_avx2(
    581    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    582    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    583    const uint8_t *mask, uint32_t mask_stride, int h) {
    584  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
    585  const __m256i v_zmask_b = _mm256_set1_epi16(0xff);
    586  do {
    587    const __m256i v_rl_b = yy_loadu_256(mask);
    588    const __m256i v_al_b =
    589        _mm256_avg_epu8(v_rl_b, _mm256_srli_si256(v_rl_b, 1));
    590 
    591    const __m256i v_m0_w = _mm256_and_si256(v_al_b, v_zmask_b);
    592    const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, _mm256_setzero_si256());
    593    const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
    594 
    595    const __m256i v_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b,
    596                                             AOM_BLEND_A64_ROUND_BITS);
    597 
    598    xx_storeu_128(dst, _mm256_castsi256_si128(v_res_b));
    599    dst += dst_stride;
    600    src0 += src0_stride;
    601    src1 += src1_stride;
    602    mask += mask_stride;
    603  } while (--h);
    604 }
    605 
    606 static inline void blend_a64_mask_sx_w32n_avx2(
    607    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    608    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    609    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
    610  const __m256i v_shuffle_b = yy_loadu_256(g_blend_a64_mask_shuffle);
    611  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
    612  do {
    613    int c;
    614    for (c = 0; c < w; c += 32) {
    615      const __m256i v_r0_b = yy_loadu_256(mask + 2 * c);
    616      const __m256i v_r1_b = yy_loadu_256(mask + 2 * c + 32);
    617      const __m256i v_r0_s_b = _mm256_shuffle_epi8(v_r0_b, v_shuffle_b);
    618      const __m256i v_r1_s_b = _mm256_shuffle_epi8(v_r1_b, v_shuffle_b);
    619      const __m256i v_al_b =
    620          _mm256_avg_epu8(v_r0_s_b, _mm256_srli_si256(v_r0_s_b, 8));
    621      const __m256i v_ah_b =
    622          _mm256_avg_epu8(v_r1_s_b, _mm256_srli_si256(v_r1_s_b, 8));
    623 
    624      const __m256i v_m0_b =
    625          _mm256_permute4x64_epi64(_mm256_unpacklo_epi64(v_al_b, v_ah_b), 0xd8);
    626      const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
    627 
    628      const __m256i v_res_b = blend_32_u8_avx2(
    629          src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
    630 
    631      yy_storeu_256(dst + c, v_res_b);
    632    }
    633    dst += dst_stride;
    634    src0 += src0_stride;
    635    src1 += src1_stride;
    636    mask += mask_stride;
    637  } while (--h);
    638 }
    639 
    640 static inline void blend_a64_mask_sx_avx2(
    641    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    642    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    643    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
    644  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
    645  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
    646  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
    647  switch (w) {
    648    case 4:
    649      do {
    650        const __m128i v_r_b = xx_loadl_64(mask);
    651        const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
    652        const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
    653        const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
    654        const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
    655        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
    656 
    657        const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
    658 
    659        xx_storel_32(dst, v_res_b);
    660 
    661        dst += dst_stride;
    662        src0 += src0_stride;
    663        src1 += src1_stride;
    664        mask += mask_stride;
    665      } while (--h);
    666      break;
    667    case 8:
    668      do {
    669        const __m128i v_r_b = xx_loadu_128(mask);
    670        const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
    671        const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
    672        const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
    673        const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
    674        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
    675 
    676        const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
    677 
    678        xx_storel_64(dst, v_res_b);
    679 
    680        dst += dst_stride;
    681        src0 += src0_stride;
    682        src1 += src1_stride;
    683        mask += mask_stride;
    684      } while (--h);
    685      break;
    686    case 16:
    687      blend_a64_mask_sx_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
    688                                 src1_stride, mask, mask_stride, h);
    689      break;
    690    default:
    691      blend_a64_mask_sx_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
    692                                  src1_stride, mask, mask_stride, w, h);
    693      break;
    694  }
    695 }
    696 
    697 static inline void blend_a64_mask_sy_w16_avx2(
    698    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    699    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    700    const uint8_t *mask, uint32_t mask_stride, int h) {
    701  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
    702  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
    703  do {
    704    const __m128i v_ra_b = xx_loadu_128(mask);
    705    const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
    706    const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
    707 
    708    const __m128i v_m1_b = _mm_sub_epi16(v_maxval_b, v_m0_b);
    709    const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
    710 
    711    xx_storeu_128(dst, v_res_b);
    712    dst += dst_stride;
    713    src0 += src0_stride;
    714    src1 += src1_stride;
    715    mask += 2 * mask_stride;
    716  } while (--h);
    717 }
    718 
    719 static inline void blend_a64_mask_sy_w32n_avx2(
    720    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    721    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    722    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
    723  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
    724  do {
    725    int c;
    726    for (c = 0; c < w; c += 32) {
    727      const __m256i v_ra_b = yy_loadu_256(mask + c);
    728      const __m256i v_rb_b = yy_loadu_256(mask + c + mask_stride);
    729      const __m256i v_m0_b = _mm256_avg_epu8(v_ra_b, v_rb_b);
    730      const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
    731      const __m256i v_res_b = blend_32_u8_avx2(
    732          src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
    733 
    734      yy_storeu_256(dst + c, v_res_b);
    735    }
    736    dst += dst_stride;
    737    src0 += src0_stride;
    738    src1 += src1_stride;
    739    mask += 2 * mask_stride;
    740  } while (--h);
    741 }
    742 
    743 static inline void blend_a64_mask_sy_avx2(
    744    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    745    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    746    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
    747  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
    748  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
    749  switch (w) {
    750    case 4:
    751      do {
    752        const __m128i v_ra_b = xx_loadl_32(mask);
    753        const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
    754        const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
    755        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
    756        const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
    757 
    758        xx_storel_32(dst, v_res_b);
    759 
    760        dst += dst_stride;
    761        src0 += src0_stride;
    762        src1 += src1_stride;
    763        mask += 2 * mask_stride;
    764      } while (--h);
    765      break;
    766    case 8:
    767      do {
    768        const __m128i v_ra_b = xx_loadl_64(mask);
    769        const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
    770        const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
    771        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
    772        const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
    773 
    774        xx_storel_64(dst, v_res_b);
    775 
    776        dst += dst_stride;
    777        src0 += src0_stride;
    778        src1 += src1_stride;
    779        mask += 2 * mask_stride;
    780      } while (--h);
    781      break;
    782    case 16:
    783      blend_a64_mask_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
    784                                 src1_stride, mask, mask_stride, h);
    785      break;
    786    default:
    787      blend_a64_mask_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
    788                                  src1_stride, mask, mask_stride, w, h);
    789  }
    790 }
    791 
    792 static inline void blend_a64_mask_w32n_avx2(
    793    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    794    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    795    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
    796  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
    797  do {
    798    int c;
    799    for (c = 0; c < w; c += 32) {
    800      const __m256i v_m0_b = yy_loadu_256(mask + c);
    801      const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
    802 
    803      const __m256i v_res_b = blend_32_u8_avx2(
    804          src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
    805 
    806      yy_storeu_256(dst + c, v_res_b);
    807    }
    808    dst += dst_stride;
    809    src0 += src0_stride;
    810    src1 += src1_stride;
    811    mask += mask_stride;
    812  } while (--h);
    813 }
    814 
    815 static inline void blend_a64_mask_avx2(
    816    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    817    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    818    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
    819  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
    820  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
    821  switch (w) {
    822    case 4:
    823      do {
    824        const __m128i v_m0_b = xx_loadl_32(mask);
    825        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
    826        const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
    827 
    828        xx_storel_32(dst, v_res_b);
    829 
    830        dst += dst_stride;
    831        src0 += src0_stride;
    832        src1 += src1_stride;
    833        mask += mask_stride;
    834      } while (--h);
    835      break;
    836    case 8:
    837      do {
    838        const __m128i v_m0_b = xx_loadl_64(mask);
    839        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
    840        const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
    841 
    842        xx_storel_64(dst, v_res_b);
    843 
    844        dst += dst_stride;
    845        src0 += src0_stride;
    846        src1 += src1_stride;
    847        mask += mask_stride;
    848      } while (--h);
    849      break;
    850    case 16:
    851      do {
    852        const __m128i v_m0_b = xx_loadu_128(mask);
    853        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
    854        const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
    855 
    856        xx_storeu_128(dst, v_res_b);
    857        dst += dst_stride;
    858        src0 += src0_stride;
    859        src1 += src1_stride;
    860        mask += mask_stride;
    861      } while (--h);
    862      break;
    863    default:
    864      blend_a64_mask_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
    865                               src1_stride, mask, mask_stride, w, h);
    866  }
    867 }
    868 
    869 void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride,
    870                             const uint8_t *src0, uint32_t src0_stride,
    871                             const uint8_t *src1, uint32_t src1_stride,
    872                             const uint8_t *mask, uint32_t mask_stride, int w,
    873                             int h, int subw, int subh) {
    874  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
    875  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
    876 
    877  assert(h >= 1);
    878  assert(w >= 1);
    879  assert(IS_POWER_OF_TWO(h));
    880  assert(IS_POWER_OF_TWO(w));
    881 
    882  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
    883    aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
    884                         mask, mask_stride, w, h, subw, subh);
    885  } else {
    886    if (subw & subh) {
    887      blend_a64_mask_sx_sy_avx2(dst, dst_stride, src0, src0_stride, src1,
    888                                src1_stride, mask, mask_stride, w, h);
    889    } else if (subw) {
    890      blend_a64_mask_sx_avx2(dst, dst_stride, src0, src0_stride, src1,
    891                             src1_stride, mask, mask_stride, w, h);
    892    } else if (subh) {
    893      blend_a64_mask_sy_avx2(dst, dst_stride, src0, src0_stride, src1,
    894                             src1_stride, mask, mask_stride, w, h);
    895    } else {
    896      blend_a64_mask_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride,
    897                          mask, mask_stride, w, h);
    898    }
    899  }
    900 }
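/*
 * Editor's note, not part of upstream libaom: since w and h are asserted to be
 * powers of two, the ((h | w) & 3) test in aom_blend_a64_mask_avx2 above is
 * nonzero exactly when either dimension is 1 or 2, matching the
 * "if (w <= 2 || h <= 2)" comment; those narrow blocks take the C fallback.
 * Equivalent scalar predicate (name hypothetical):
 */
static inline int blend_block_needs_c_fallback(int w, int h) {
  return (w <= 2) || (h <= 2);  // same truth value as ((h | w) & 3) != 0 here
}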
    901 
    902 #if CONFIG_AV1_HIGHBITDEPTH
    903 //////////////////////////////////////////////////////////////////////////////
    904 // aom_highbd_blend_a64_d16_mask_avx2()
    905 //////////////////////////////////////////////////////////////////////////////
    906 
    907 static inline void highbd_blend_a64_d16_mask_w4_avx2(
    908    uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
    909    const CONV_BUF_TYPE *src1, int src1_stride, const __m256i *mask0,
    910    const __m256i *round_offset, int shift, const __m256i *clip_low,
    911    const __m256i *clip_high, const __m256i *mask_max) {
    912  // Load 4x u16 pixels from each of 4 rows from each source
    913  const __m256i s0 =
    914      yy_loadu_4x64(src0 + 3 * src0_stride, src0 + 2 * src0_stride,
    915                    src0 + 1 * src0_stride, src0 + 0 * src0_stride);
    916  const __m256i s1 =
    917      yy_loadu_4x64(src1 + 3 * src1_stride, src1 + 2 * src1_stride,
    918                    src1 + 1 * src1_stride, src1 + 0 * src1_stride);
    919  // Generate the inverse mask
    920  const __m256i mask1 = _mm256_sub_epi16(*mask_max, *mask0);
    921 
    922  // Multiply each mask by the respective source
    923  const __m256i mul0_highs = _mm256_mulhi_epu16(*mask0, s0);
    924  const __m256i mul0_lows = _mm256_mullo_epi16(*mask0, s0);
    925  const __m256i mul0h = _mm256_unpackhi_epi16(mul0_lows, mul0_highs);
    926  const __m256i mul0l = _mm256_unpacklo_epi16(mul0_lows, mul0_highs);
     927  // Note that AVX2 unpack orders 64-bit words as [3 1] [2 0] to keep within
     928  // lanes. Later, packs does the same again, which cancels this out with no
     929  // need for a permute; the intermediate values being reordered makes no difference.
    930 
    931  const __m256i mul1_highs = _mm256_mulhi_epu16(mask1, s1);
    932  const __m256i mul1_lows = _mm256_mullo_epi16(mask1, s1);
    933  const __m256i mul1h = _mm256_unpackhi_epi16(mul1_lows, mul1_highs);
    934  const __m256i mul1l = _mm256_unpacklo_epi16(mul1_lows, mul1_highs);
    935 
    936  const __m256i sumh = _mm256_add_epi32(mul0h, mul1h);
    937  const __m256i suml = _mm256_add_epi32(mul0l, mul1l);
    938 
    939  const __m256i roundh =
    940      _mm256_srai_epi32(_mm256_sub_epi32(sumh, *round_offset), shift);
    941  const __m256i roundl =
    942      _mm256_srai_epi32(_mm256_sub_epi32(suml, *round_offset), shift);
    943 
    944  const __m256i pack = _mm256_packs_epi32(roundl, roundh);
    945  const __m256i clip =
    946      _mm256_min_epi16(_mm256_max_epi16(pack, *clip_low), *clip_high);
    947 
     948  // _mm256_extract_epi64 is not available on 32-bit x86, so do it the old-fashioned way:
    949  const __m128i cliph = _mm256_extracti128_si256(clip, 1);
    950  xx_storel_64(dst + 3 * dst_stride, _mm_srli_si128(cliph, 8));
    951  xx_storel_64(dst + 2 * dst_stride, cliph);
    952  const __m128i clipl = _mm256_castsi256_si128(clip);
    953  xx_storel_64(dst + 1 * dst_stride, _mm_srli_si128(clipl, 8));
    954  xx_storel_64(dst + 0 * dst_stride, clipl);
    955 }
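/*
 * Editor's note, not part of upstream libaom: the highbd kernels above
 * assemble a full 16x16 -> 32-bit unsigned multiply from _mm256_mulhi_epu16,
 * _mm256_mullo_epi16 and an unpack of the low/high halves. Per element that is
 * simply the following (helper name hypothetical):
 */
static inline uint32_t mul_u16_u16_scalar(uint16_t a, uint16_t b) {
  const uint32_t prod = (uint32_t)a * b;
  const uint32_t lo = prod & 0xFFFF;  // _mm256_mullo_epi16
  const uint32_t hi = prod >> 16;     // _mm256_mulhi_epu16
  return (hi << 16) | lo;             // unpacklo/unpackhi interleave
}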
    956 
    957 static inline void highbd_blend_a64_d16_mask_subw0_subh0_w4_avx2(
    958    uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    959    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    960    const uint8_t *mask, uint32_t mask_stride, int h,
    961    const __m256i *round_offset, int shift, const __m256i *clip_low,
    962    const __m256i *clip_high, const __m256i *mask_max) {
    963  do {
     964    // Load 4x u8 pixels from each of 4 rows of the mask, pad each to u16
    965    const __m128i mask08 = _mm_set_epi32(*(int32_t *)(mask + 3 * mask_stride),
    966                                         *(int32_t *)(mask + 2 * mask_stride),
    967                                         *(int32_t *)(mask + 1 * mask_stride),
    968                                         *(int32_t *)(mask + 0 * mask_stride));
    969    const __m256i mask0 = _mm256_cvtepu8_epi16(mask08);
    970 
    971    highbd_blend_a64_d16_mask_w4_avx2(dst, dst_stride, src0, src0_stride, src1,
    972                                      src1_stride, &mask0, round_offset, shift,
    973                                      clip_low, clip_high, mask_max);
    974 
    975    dst += dst_stride * 4;
    976    src0 += src0_stride * 4;
    977    src1 += src1_stride * 4;
    978    mask += mask_stride * 4;
    979  } while (h -= 4);
    980 }
    981 
    982 static inline void highbd_blend_a64_d16_mask_subw1_subh1_w4_avx2(
    983    uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    984    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    985    const uint8_t *mask, uint32_t mask_stride, int h,
    986    const __m256i *round_offset, int shift, const __m256i *clip_low,
    987    const __m256i *clip_high, const __m256i *mask_max) {
    988  const __m256i one_b = _mm256_set1_epi8(1);
    989  const __m256i two_w = _mm256_set1_epi16(2);
    990  do {
     991    // Load 8 pixels from each of 8 rows of the mask,
     992    // (saturating) add the rows together, then use madd to add adjacent pixels.
     993    // Finally, divide each value by 4 (with rounding).
    994    const __m256i m0246 =
    995        _mm256_set_epi64x(*(int64_t *)(mask + 6 * mask_stride),
    996                          *(int64_t *)(mask + 4 * mask_stride),
    997                          *(int64_t *)(mask + 2 * mask_stride),
    998                          *(int64_t *)(mask + 0 * mask_stride));
    999    const __m256i m1357 =
   1000        _mm256_set_epi64x(*(int64_t *)(mask + 7 * mask_stride),
   1001                          *(int64_t *)(mask + 5 * mask_stride),
   1002                          *(int64_t *)(mask + 3 * mask_stride),
   1003                          *(int64_t *)(mask + 1 * mask_stride));
   1004    const __m256i addrows = _mm256_adds_epu8(m0246, m1357);
   1005    const __m256i adjacent = _mm256_maddubs_epi16(addrows, one_b);
   1006    const __m256i mask0 =
   1007        _mm256_srli_epi16(_mm256_add_epi16(adjacent, two_w), 2);
   1008 
   1009    highbd_blend_a64_d16_mask_w4_avx2(dst, dst_stride, src0, src0_stride, src1,
   1010                                      src1_stride, &mask0, round_offset, shift,
   1011                                      clip_low, clip_high, mask_max);
   1012 
   1013    dst += dst_stride * 4;
   1014    src0 += src0_stride * 4;
   1015    src1 += src1_stride * 4;
   1016    mask += mask_stride * 8;
   1017  } while (h -= 4);
   1018 }
   1019 
   1020 static inline void highbd_blend_a64_d16_mask_w8_avx2(
   1021    uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
   1022    const CONV_BUF_TYPE *src1, int src1_stride, const __m256i *mask0a,
   1023    const __m256i *mask0b, const __m256i *round_offset, int shift,
   1024    const __m256i *clip_low, const __m256i *clip_high,
   1025    const __m256i *mask_max) {
   1026  // Load 8x u16 pixels from each of 4 rows from each source
   1027  const __m256i s0a =
   1028      yy_loadu2_128(src0 + 0 * src0_stride, src0 + 1 * src0_stride);
   1029  const __m256i s0b =
   1030      yy_loadu2_128(src0 + 2 * src0_stride, src0 + 3 * src0_stride);
   1031  const __m256i s1a =
   1032      yy_loadu2_128(src1 + 0 * src1_stride, src1 + 1 * src1_stride);
   1033  const __m256i s1b =
   1034      yy_loadu2_128(src1 + 2 * src1_stride, src1 + 3 * src1_stride);
   1035 
   1036  // Generate inverse masks
   1037  const __m256i mask1a = _mm256_sub_epi16(*mask_max, *mask0a);
   1038  const __m256i mask1b = _mm256_sub_epi16(*mask_max, *mask0b);
   1039 
   1040  // Multiply sources by respective masks
   1041  const __m256i mul0a_highs = _mm256_mulhi_epu16(*mask0a, s0a);
   1042  const __m256i mul0a_lows = _mm256_mullo_epi16(*mask0a, s0a);
   1043  const __m256i mul0ah = _mm256_unpackhi_epi16(mul0a_lows, mul0a_highs);
   1044  const __m256i mul0al = _mm256_unpacklo_epi16(mul0a_lows, mul0a_highs);
    1045  // Note that AVX2 unpack orders 64-bit words as [3 1] [2 0] to keep within
    1046  // lanes. Later, packs does the same again, which cancels this out with no
    1047  // need for a permute; the intermediate values being reordered makes no difference.
   1048 
   1049  const __m256i mul1a_highs = _mm256_mulhi_epu16(mask1a, s1a);
   1050  const __m256i mul1a_lows = _mm256_mullo_epi16(mask1a, s1a);
   1051  const __m256i mul1ah = _mm256_unpackhi_epi16(mul1a_lows, mul1a_highs);
   1052  const __m256i mul1al = _mm256_unpacklo_epi16(mul1a_lows, mul1a_highs);
   1053 
   1054  const __m256i sumah = _mm256_add_epi32(mul0ah, mul1ah);
   1055  const __m256i sumal = _mm256_add_epi32(mul0al, mul1al);
   1056 
   1057  const __m256i mul0b_highs = _mm256_mulhi_epu16(*mask0b, s0b);
   1058  const __m256i mul0b_lows = _mm256_mullo_epi16(*mask0b, s0b);
   1059  const __m256i mul0bh = _mm256_unpackhi_epi16(mul0b_lows, mul0b_highs);
   1060  const __m256i mul0bl = _mm256_unpacklo_epi16(mul0b_lows, mul0b_highs);
   1061 
   1062  const __m256i mul1b_highs = _mm256_mulhi_epu16(mask1b, s1b);
   1063  const __m256i mul1b_lows = _mm256_mullo_epi16(mask1b, s1b);
   1064  const __m256i mul1bh = _mm256_unpackhi_epi16(mul1b_lows, mul1b_highs);
   1065  const __m256i mul1bl = _mm256_unpacklo_epi16(mul1b_lows, mul1b_highs);
   1066 
   1067  const __m256i sumbh = _mm256_add_epi32(mul0bh, mul1bh);
   1068  const __m256i sumbl = _mm256_add_epi32(mul0bl, mul1bl);
   1069 
   1070  // Divide down each result, with rounding
   1071  const __m256i roundah =
   1072      _mm256_srai_epi32(_mm256_sub_epi32(sumah, *round_offset), shift);
   1073  const __m256i roundal =
   1074      _mm256_srai_epi32(_mm256_sub_epi32(sumal, *round_offset), shift);
   1075  const __m256i roundbh =
   1076      _mm256_srai_epi32(_mm256_sub_epi32(sumbh, *round_offset), shift);
   1077  const __m256i roundbl =
   1078      _mm256_srai_epi32(_mm256_sub_epi32(sumbl, *round_offset), shift);
   1079 
   1080  // Pack each i32 down to an i16 with saturation, then clip to valid range
   1081  const __m256i packa = _mm256_packs_epi32(roundal, roundah);
   1082  const __m256i clipa =
   1083      _mm256_min_epi16(_mm256_max_epi16(packa, *clip_low), *clip_high);
   1084  const __m256i packb = _mm256_packs_epi32(roundbl, roundbh);
   1085  const __m256i clipb =
   1086      _mm256_min_epi16(_mm256_max_epi16(packb, *clip_low), *clip_high);
   1087 
   1088  // Store 8x u16 pixels to each of 4 rows in the destination
   1089  yy_storeu2_128(dst + 0 * dst_stride, dst + 1 * dst_stride, clipa);
   1090  yy_storeu2_128(dst + 2 * dst_stride, dst + 3 * dst_stride, clipb);
   1091 }
   1092 
   1093 static inline void highbd_blend_a64_d16_mask_subw0_subh0_w8_avx2(
   1094    uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
   1095    const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask,
   1096    int mask_stride, int h, const __m256i *round_offset, int shift,
   1097    const __m256i *clip_low, const __m256i *clip_high,
   1098    const __m256i *mask_max) {
   1099  do {
   1100    // Load 8x u8 pixels from each of 4 rows in the mask
   1101    const __m128i mask0a8 =
   1102        _mm_set_epi64x(*(int64_t *)mask, *(uint64_t *)(mask + mask_stride));
   1103    const __m128i mask0b8 =
   1104        _mm_set_epi64x(*(int64_t *)(mask + 2 * mask_stride),
   1105                       *(int64_t *)(mask + 3 * mask_stride));
   1106    const __m256i mask0a = _mm256_cvtepu8_epi16(mask0a8);
   1107    const __m256i mask0b = _mm256_cvtepu8_epi16(mask0b8);
   1108 
   1109    highbd_blend_a64_d16_mask_w8_avx2(
   1110        dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0a, &mask0b,
   1111        round_offset, shift, clip_low, clip_high, mask_max);
   1112 
   1113    dst += dst_stride * 4;
   1114    src0 += src0_stride * 4;
   1115    src1 += src1_stride * 4;
   1116    mask += mask_stride * 4;
   1117  } while (h -= 4);
   1118 }
   1119 
   1120 static inline void highbd_blend_a64_d16_mask_subw1_subh1_w8_avx2(
   1121    uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
   1122    const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask,
   1123    int mask_stride, int h, const __m256i *round_offset, int shift,
   1124    const __m256i *clip_low, const __m256i *clip_high,
   1125    const __m256i *mask_max) {
   1126  const __m256i one_b = _mm256_set1_epi8(1);
   1127  const __m256i two_w = _mm256_set1_epi16(2);
   1128  do {
    1129    // Load 16x u8 pixels from each of 8 rows in the mask,
    1130    // (saturating) add the rows together, then use madd to add adjacent pixels.
    1131    // Finally, divide each value by 4 (with rounding).
   1132    const __m256i m02 =
   1133        yy_loadu2_128(mask + 0 * mask_stride, mask + 2 * mask_stride);
   1134    const __m256i m13 =
   1135        yy_loadu2_128(mask + 1 * mask_stride, mask + 3 * mask_stride);
   1136    const __m256i m0123 =
   1137        _mm256_maddubs_epi16(_mm256_adds_epu8(m02, m13), one_b);
   1138    const __m256i mask_0a =
   1139        _mm256_srli_epi16(_mm256_add_epi16(m0123, two_w), 2);
   1140    const __m256i m46 =
   1141        yy_loadu2_128(mask + 4 * mask_stride, mask + 6 * mask_stride);
   1142    const __m256i m57 =
   1143        yy_loadu2_128(mask + 5 * mask_stride, mask + 7 * mask_stride);
   1144    const __m256i m4567 =
   1145        _mm256_maddubs_epi16(_mm256_adds_epu8(m46, m57), one_b);
   1146    const __m256i mask_0b =
   1147        _mm256_srli_epi16(_mm256_add_epi16(m4567, two_w), 2);
   1148 
   1149    highbd_blend_a64_d16_mask_w8_avx2(
   1150        dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask_0a,
   1151        &mask_0b, round_offset, shift, clip_low, clip_high, mask_max);
   1152 
   1153    dst += dst_stride * 4;
   1154    src0 += src0_stride * 4;
   1155    src1 += src1_stride * 4;
   1156    mask += mask_stride * 8;
   1157  } while (h -= 4);
   1158 }
   1159 
   1160 static inline void highbd_blend_a64_d16_mask_w16_avx2(
   1161    uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
   1162    const CONV_BUF_TYPE *src1, int src1_stride, const __m256i *mask0a,
   1163    const __m256i *mask0b, const __m256i *round_offset, int shift,
   1164    const __m256i *clip_low, const __m256i *clip_high,
   1165    const __m256i *mask_max) {
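         // Blend two rows of 16 d16 pixels. Per pixel this computes
         //   ((mask * src0 + (mask_max - mask) * src1) - round_offset) >> shift
         // and clips the result to [clip_low, clip_high].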
   1166  // Load 16x pixels from each of 2 rows from each source
   1167  const __m256i s0a = yy_loadu_256(src0);
   1168  const __m256i s0b = yy_loadu_256(src0 + src0_stride);
   1169  const __m256i s1a = yy_loadu_256(src1);
   1170  const __m256i s1b = yy_loadu_256(src1 + src1_stride);
   1171 
   1172  // Calculate inverse masks
   1173  const __m256i mask1a = _mm256_sub_epi16(*mask_max, *mask0a);
   1174  const __m256i mask1b = _mm256_sub_epi16(*mask_max, *mask0b);
   1175 
   1176  // Multiply each source by appropriate mask
   1177  const __m256i mul0a_highs = _mm256_mulhi_epu16(*mask0a, s0a);
   1178  const __m256i mul0a_lows = _mm256_mullo_epi16(*mask0a, s0a);
   1179  const __m256i mul0ah = _mm256_unpackhi_epi16(mul0a_lows, mul0a_highs);
   1180  const __m256i mul0al = _mm256_unpacklo_epi16(mul0a_lows, mul0a_highs);
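         // Interleaving the low and high halves of mullo/mulhi reconstructs the
         // full 32-bit mask * pixel products.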
   1181  // Note that AVX2 unpack orders 64-bit words as [3 1] [2 0] to keep within
   1182  // lanes. Later, packs does the same again, which cancels this out with no
   1183  // need for a permute. The intermediate values being reordered makes no
   1184  // difference.
   1184 
   1185  const __m256i mul1a_highs = _mm256_mulhi_epu16(mask1a, s1a);
   1186  const __m256i mul1a_lows = _mm256_mullo_epi16(mask1a, s1a);
   1187  const __m256i mul1ah = _mm256_unpackhi_epi16(mul1a_lows, mul1a_highs);
   1188  const __m256i mul1al = _mm256_unpacklo_epi16(mul1a_lows, mul1a_highs);
   1189 
   1190  const __m256i mulah = _mm256_add_epi32(mul0ah, mul1ah);
   1191  const __m256i mulal = _mm256_add_epi32(mul0al, mul1al);
   1192 
   1193  const __m256i mul0b_highs = _mm256_mulhi_epu16(*mask0b, s0b);
   1194  const __m256i mul0b_lows = _mm256_mullo_epi16(*mask0b, s0b);
   1195  const __m256i mul0bh = _mm256_unpackhi_epi16(mul0b_lows, mul0b_highs);
   1196  const __m256i mul0bl = _mm256_unpacklo_epi16(mul0b_lows, mul0b_highs);
   1197 
   1198  const __m256i mul1b_highs = _mm256_mulhi_epu16(mask1b, s1b);
   1199  const __m256i mul1b_lows = _mm256_mullo_epi16(mask1b, s1b);
   1200  const __m256i mul1bh = _mm256_unpackhi_epi16(mul1b_lows, mul1b_highs);
   1201  const __m256i mul1bl = _mm256_unpacklo_epi16(mul1b_lows, mul1b_highs);
   1202 
   1203  const __m256i mulbh = _mm256_add_epi32(mul0bh, mul1bh);
   1204  const __m256i mulbl = _mm256_add_epi32(mul0bl, mul1bl);
   1205 
   1206  const __m256i resah =
   1207      _mm256_srai_epi32(_mm256_sub_epi32(mulah, *round_offset), shift);
   1208  const __m256i resal =
   1209      _mm256_srai_epi32(_mm256_sub_epi32(mulal, *round_offset), shift);
   1210  const __m256i resbh =
   1211      _mm256_srai_epi32(_mm256_sub_epi32(mulbh, *round_offset), shift);
   1212  const __m256i resbl =
   1213      _mm256_srai_epi32(_mm256_sub_epi32(mulbl, *round_offset), shift);
   1214 
   1215  // Signed saturating pack from i32 to i16:
   1216  const __m256i packa = _mm256_packs_epi32(resal, resah);
   1217  const __m256i packb = _mm256_packs_epi32(resbl, resbh);
   1218 
   1219  // Clip the values to the valid range
   1220  const __m256i clipa =
   1221      _mm256_min_epi16(_mm256_max_epi16(packa, *clip_low), *clip_high);
   1222  const __m256i clipb =
   1223      _mm256_min_epi16(_mm256_max_epi16(packb, *clip_low), *clip_high);
   1224 
   1225  // Store 16 pixels
   1226  yy_storeu_256(dst, clipa);
   1227  yy_storeu_256(dst + dst_stride, clipb);
   1228 }
   1229 
   1230 static inline void highbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
   1231    uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
   1232    const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask,
   1233    int mask_stride, int h, int w, const __m256i *round_offset, int shift,
   1234    const __m256i *clip_low, const __m256i *clip_high,
   1235    const __m256i *mask_max) {
   1236  for (int i = 0; i < h; i += 2) {
   1237    for (int j = 0; j < w; j += 16) {
   1238      // Load 16x u8 alpha-mask values from each of two rows and pad to u16
   1239      const __m128i masks_a8 = xx_loadu_128(mask + j);
   1240      const __m128i masks_b8 = xx_loadu_128(mask + mask_stride + j);
   1241      const __m256i mask0a = _mm256_cvtepu8_epi16(masks_a8);
   1242      const __m256i mask0b = _mm256_cvtepu8_epi16(masks_b8);
   1243 
   1244      highbd_blend_a64_d16_mask_w16_avx2(
   1245          dst + j, dst_stride, src0 + j, src0_stride, src1 + j, src1_stride,
   1246          &mask0a, &mask0b, round_offset, shift, clip_low, clip_high, mask_max);
   1247    }
   1248    dst += dst_stride * 2;
   1249    src0 += src0_stride * 2;
   1250    src1 += src1_stride * 2;
   1251    mask += mask_stride * 2;
   1252  }
   1253 }
   1254 
   1255 static inline void highbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
   1256    uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
   1257    const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask,
   1258    int mask_stride, int h, int w, const __m256i *round_offset, int shift,
   1259    const __m256i *clip_low, const __m256i *clip_high,
   1260    const __m256i *mask_max) {
   1261  const __m256i one_b = _mm256_set1_epi8(1);
   1262  const __m256i two_w = _mm256_set1_epi16(2);
   1263  for (int i = 0; i < h; i += 2) {
   1264    for (int j = 0; j < w; j += 16) {
   1265      // Load 32x u8 alpha-mask values from each of four rows
   1266      // (saturating) add pairs of rows, then use madd to add adjacent values
   1267      // Finally, divide each result by 4 (with rounding)
   1268      const __m256i m0 = yy_loadu_256(mask + 0 * mask_stride + 2 * j);
   1269      const __m256i m1 = yy_loadu_256(mask + 1 * mask_stride + 2 * j);
   1270      const __m256i m2 = yy_loadu_256(mask + 2 * mask_stride + 2 * j);
   1271      const __m256i m3 = yy_loadu_256(mask + 3 * mask_stride + 2 * j);
   1272 
   1273      const __m256i m01_8 = _mm256_adds_epu8(m0, m1);
   1274      const __m256i m23_8 = _mm256_adds_epu8(m2, m3);
   1275 
   1276      const __m256i m01 = _mm256_maddubs_epi16(m01_8, one_b);
   1277      const __m256i m23 = _mm256_maddubs_epi16(m23_8, one_b);
   1278 
   1279      const __m256i mask0a = _mm256_srli_epi16(_mm256_add_epi16(m01, two_w), 2);
   1280      const __m256i mask0b = _mm256_srli_epi16(_mm256_add_epi16(m23, two_w), 2);
   1281 
   1282      highbd_blend_a64_d16_mask_w16_avx2(
   1283          dst + j, dst_stride, src0 + j, src0_stride, src1 + j, src1_stride,
   1284          &mask0a, &mask0b, round_offset, shift, clip_low, clip_high, mask_max);
   1285    }
   1286    dst += dst_stride * 2;
   1287    src0 += src0_stride * 2;
   1288    src1 += src1_stride * 2;
   1289    mask += mask_stride * 4;
   1290  }
   1291 }
   1292 
   1293 void aom_highbd_blend_a64_d16_mask_avx2(
   1294    uint8_t *dst8, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
   1295    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
   1296    const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
   1297    ConvolveParams *conv_params, const int bd) {
   1298  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
   1299  const int round_bits =
   1300      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
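         // Combined offset: removes the bias carried by the d16 (compound
         // convolution) intermediate values and folds in the rounding term for
         // the final shift, pre-scaled by AOM_BLEND_A64_ROUND_BITS so a single
         // subtract-and-shift completes the blend.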
   1301  const int32_t round_offset =
   1302      ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
   1303       (1 << (round_bits - 1)))
   1304      << AOM_BLEND_A64_ROUND_BITS;
   1305  const __m256i v_round_offset = _mm256_set1_epi32(round_offset);
   1306  const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
   1307 
   1308  const __m256i clip_low = _mm256_setzero_si256();
   1309  const __m256i clip_high = _mm256_set1_epi16((1 << bd) - 1);
   1310  const __m256i mask_max = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
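         // AOM_BLEND_A64_MAX_ALPHA (64) is the full-weight mask value; blended
         // pixels are clipped to the valid range [0, (1 << bd) - 1].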
   1311 
   1312  assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
   1313  assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
   1314 
   1315  assert(h >= 4);
   1316  assert(w >= 4);
   1317  assert(IS_POWER_OF_TWO(h));
   1318  assert(IS_POWER_OF_TWO(w));
   1319 
   1320  if (subw == 0 && subh == 0) {
   1321    switch (w) {
   1322      case 4:
   1323        highbd_blend_a64_d16_mask_subw0_subh0_w4_avx2(
   1324            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
   1325            mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
   1326            &mask_max);
   1327        break;
   1328      case 8:
   1329        highbd_blend_a64_d16_mask_subw0_subh0_w8_avx2(
   1330            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
   1331            mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
   1332            &mask_max);
   1333        break;
   1334      default:  // >= 16
   1335        highbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
   1336            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
   1337            mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high,
   1338            &mask_max);
   1339        break;
   1340    }
   1341 
   1342  } else if (subw == 1 && subh == 1) {
   1343    switch (w) {
   1344      case 4:
   1345        highbd_blend_a64_d16_mask_subw1_subh1_w4_avx2(
   1346            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
   1347            mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
   1348            &mask_max);
   1349        break;
   1350      case 8:
   1351        highbd_blend_a64_d16_mask_subw1_subh1_w8_avx2(
   1352            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
   1353            mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
   1354            &mask_max);
   1355        break;
   1356      default:  // >= 16
   1357        highbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
   1358            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
   1359            mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high,
   1360            &mask_max);
   1361        break;
   1362    }
   1363  } else {
   1364    // Sub-sampling in only one axis doesn't seem to happen very much, so fall
   1365    // back to the vanilla C implementation instead of having all the optimised
   1366    // code for these.
   1367    aom_highbd_blend_a64_d16_mask_c(dst8, dst_stride, src0, src0_stride, src1,
   1368                                    src1_stride, mask, mask_stride, w, h, subw,
   1369                                    subh, conv_params, bd);
   1370  }
   1371 }
   1372 #endif  // CONFIG_AV1_HIGHBITDEPTH