tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

blend_a64_mask_sse4.c (64681B)


/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <smmintrin.h>  // SSE4.1

#include <assert.h>

#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/blend.h"

#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/x86/blend_sse4.h"
#include "aom_dsp/x86/blend_mask_sse4.h"

#include "config/aom_dsp_rtcd.h"

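// All kernels in this file compute the A64 blend defined in aom_dsp/blend.h:
//
//   dst[i] = ROUND_POWER_OF_TWO(m[i] * src0[i] +
//                               (AOM_BLEND_A64_MAX_ALPHA - m[i]) * src1[i],
//                               AOM_BLEND_A64_ROUND_BITS)
//
// i.e. dst = round((m * src0 + (64 - m) * src1) / 64), with mask values in
// [0, AOM_BLEND_A64_MAX_ALPHA] (== 64). The variants below differ only in
// block width and in how a sub-sampled mask is up-converted before blending.
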
//////////////////////////////////////////////////////////////////////////////
// No sub-sampling
//////////////////////////////////////////////////////////////////////////////

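// The byte-wise kernels hand masks m and (64 - m) to the blend_{4,8,16}_u8
// helpers from blend_sse4.h. The constant _r = 1 << (15 - 6) is chosen so
// that _mm_mulhrs_epi16(x, _r) == (x + (1 << 5)) >> 6, i.e. it turns the
// helpers' mulhrs step into ROUND_POWER_OF_TWO(x, AOM_BLEND_A64_ROUND_BITS)
// (an inference from _r being passed as the helpers' rounding argument).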
static void blend_a64_mask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
                                     const uint8_t *src0, uint32_t src0_stride,
                                     const uint8_t *src1, uint32_t src1_stride,
                                     const uint8_t *mask, uint32_t mask_stride,
                                     int w, int h) {
  (void)w;
  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  do {
    const __m128i v_m0_b = xx_loadl_32(mask);
    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
    const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
    xx_storel_32(dst, v_res_b);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}

static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
                                     const uint8_t *src0, uint32_t src0_stride,
                                     const uint8_t *src1, uint32_t src1_stride,
                                     const uint8_t *mask, uint32_t mask_stride,
                                     int w, int h) {
  (void)w;
  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  do {
    const __m128i v_m0_b = xx_loadl_64(mask);
    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
    const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
    xx_storel_64(dst, v_res_b);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}

static void blend_a64_mask_w16n_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));

  do {
    int c;
    for (c = 0; c < w; c += 16) {
      const __m128i v_m0_b = xx_loadu_128(mask + c);
      const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);

      const __m128i v_res_b =
          blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);

      xx_storeu_128(dst + c, v_res_b);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}

//////////////////////////////////////////////////////////////////////////////
// Horizontal sub-sampling
//////////////////////////////////////////////////////////////////////////////

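// With 2x horizontal sub-sampling the mask row is 2*w bytes wide. The
// g_blend_a64_mask_shuffle table (declared via blend_mask_sse4.h) evidently
// separates even- and odd-indexed mask bytes into the two 64-bit halves of
// the register; unpacking the halves and taking _mm_avg_epu8 then yields
// round((m[2i] + m[2i+1]) / 2) per output pixel.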
static void blend_a64_mask_sx_w4_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  (void)w;

  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  do {
    const __m128i v_r_b = xx_loadl_64(mask);
    const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
    const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
    const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
    const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);

    const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
    xx_storel_32(dst, v_res_b);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}

static void blend_a64_mask_sx_w8_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  (void)w;

  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  do {
    const __m128i v_r_b = xx_loadu_128(mask);
    const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
    const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
    const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
    const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);

    const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);

    xx_storel_64(dst, v_res_b);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}

static void blend_a64_mask_sx_w16n_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));

  do {
    int c;
    for (c = 0; c < w; c += 16) {
      const __m128i v_r0_b = xx_loadu_128(mask + 2 * c);
      const __m128i v_r1_b = xx_loadu_128(mask + 2 * c + 16);
      const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r0_b, v_shuffle_b);
      const __m128i v_r1_s_b = _mm_shuffle_epi8(v_r1_b, v_shuffle_b);
      const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r1_s_b);
      const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r1_s_b);
      const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
      const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);

      const __m128i v_res_b =
          blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);

      xx_storeu_128(dst + c, v_res_b);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}

//////////////////////////////////////////////////////////////////////////////
// Vertical sub-sampling
//////////////////////////////////////////////////////////////////////////////

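// With 2x vertical sub-sampling, two consecutive mask rows are averaged with
// _mm_avg_epu8 (which rounds up), and the mask pointer advances by two
// strides per output row.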
static void blend_a64_mask_sy_w4_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  (void)w;

  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));

  do {
    const __m128i v_ra_b = xx_loadl_32(mask);
    const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
    const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);

    const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);

    xx_storel_32(dst, v_res_b);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;
  } while (--h);
}

static void blend_a64_mask_sy_w8_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  (void)w;

  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  do {
    const __m128i v_ra_b = xx_loadl_64(mask);
    const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
    const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
    const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);

    xx_storel_64(dst, v_res_b);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;
  } while (--h);
}

static void blend_a64_mask_sy_w16n_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  do {
    int c;
    for (c = 0; c < w; c += 16) {
      const __m128i v_ra_b = xx_loadu_128(mask + c);
      const __m128i v_rb_b = xx_loadu_128(mask + c + mask_stride);
      const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
      const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);

      const __m128i v_res_b =
          blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);

      xx_storeu_128(dst + c, v_res_b);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;
  } while (--h);
}

//////////////////////////////////////////////////////////////////////////////
// Horizontal and Vertical sub-sampling
//////////////////////////////////////////////////////////////////////////////

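// With 2x sub-sampling in both directions, four mask values contribute to
// each output pixel: two rows are summed byte-wise (safe because each value
// is at most 64, so the sum fits in an unsigned byte), adjacent columns are
// summed as 16-bit words, and xx_roundn_epu16(sum, 2) divides by 4 with
// rounding.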
static void blend_a64_mask_sx_sy_w4_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  (void)w;

  do {
    const __m128i v_ra_b = xx_loadl_64(mask);
    const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
    const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
    const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
    const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
    const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
    const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
    const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
    const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);

    const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);

    xx_storel_32(dst, v_res_b);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;
  } while (--h);
}

static void blend_a64_mask_sx_sy_w8_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  (void)w;

  do {
    const __m128i v_ra_b = xx_loadu_128(mask);
    const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);

    const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
    const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
    const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
    const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
    const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
    const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
    const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);

    const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);

    xx_storel_64(dst, v_res_b);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;
  } while (--h);
}

static void blend_a64_mask_sx_sy_w16n_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  const __m128i v_zmask_b =
      _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  do {
    int c;
    for (c = 0; c < w; c += 16) {
      const __m128i v_ral_b = xx_loadu_128(mask + 2 * c);
      const __m128i v_rah_b = xx_loadu_128(mask + 2 * c + 16);
      const __m128i v_rbl_b = xx_loadu_128(mask + mask_stride + 2 * c);
      const __m128i v_rbh_b = xx_loadu_128(mask + mask_stride + 2 * c + 16);
      const __m128i v_rvsl_b = _mm_add_epi8(v_ral_b, v_rbl_b);
      const __m128i v_rvsh_b = _mm_add_epi8(v_rah_b, v_rbh_b);
      const __m128i v_rvsal_w = _mm_and_si128(v_rvsl_b, v_zmask_b);
      const __m128i v_rvsah_w = _mm_and_si128(v_rvsh_b, v_zmask_b);
      const __m128i v_rvsbl_w =
          _mm_and_si128(_mm_srli_si128(v_rvsl_b, 1), v_zmask_b);
      const __m128i v_rvsbh_w =
          _mm_and_si128(_mm_srli_si128(v_rvsh_b, 1), v_zmask_b);
      const __m128i v_rsl_w = _mm_add_epi16(v_rvsal_w, v_rvsbl_w);
      const __m128i v_rsh_w = _mm_add_epi16(v_rvsah_w, v_rvsbh_w);

      const __m128i v_m0l_w = xx_roundn_epu16(v_rsl_w, 2);
      const __m128i v_m0h_w = xx_roundn_epu16(v_rsh_w, 2);
      const __m128i v_m0_b = _mm_packus_epi16(v_m0l_w, v_m0h_w);
      const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);

      const __m128i v_res_b =
          blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);

      xx_storeu_128(dst + c, v_res_b);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;
  } while (--h);
}

//////////////////////////////////////////////////////////////////////////////
// Dispatch
//////////////////////////////////////////////////////////////////////////////

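// Dispatcher: widths 4 and 8 use the fixed-width kernels; any other legal
// width is a power of two >= 16 and uses the w16n kernels. Since w is a power
// of two >= 4, (w >> 2) & 3 maps w == 4 -> 1, w == 8 -> 2, and any multiple
// of 16 -> 0, matching the table below. Blocks with w <= 2 or h <= 2 fall
// back to the C implementation.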
void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride,
                               const uint8_t *src0, uint32_t src0_stride,
                               const uint8_t *src1, uint32_t src1_stride,
                               const uint8_t *mask, uint32_t mask_stride, int w,
                               int h, int subw, int subh) {
  typedef void (*blend_fn)(
      uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
      uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
      const uint8_t *mask, uint32_t mask_stride, int w, int h);

  // Dimensions are: width_index X subx X suby
  static const blend_fn blend[3][2][2] = {
    { // w % 16 == 0
      { blend_a64_mask_w16n_sse4_1, blend_a64_mask_sy_w16n_sse4_1 },
      { blend_a64_mask_sx_w16n_sse4_1, blend_a64_mask_sx_sy_w16n_sse4_1 } },
    { // w == 4
      { blend_a64_mask_w4_sse4_1, blend_a64_mask_sy_w4_sse4_1 },
      { blend_a64_mask_sx_w4_sse4_1, blend_a64_mask_sx_sy_w4_sse4_1 } },
    { // w == 8
      { blend_a64_mask_w8_sse4_1, blend_a64_mask_sy_w8_sse4_1 },
      { blend_a64_mask_sx_w8_sse4_1, blend_a64_mask_sx_sy_w8_sse4_1 } }
  };

  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));

  assert(h >= 1);
  assert(w >= 1);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
    aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
                         mask, mask_stride, w, h, subw, subh);
  } else {
    blend[(w >> 2) & 3][subw != 0][subh != 0](dst, dst_stride, src0,
                                              src0_stride, src1, src1_stride,
                                              mask, mask_stride, w, h);
  }
}

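// Usage sketch (illustrative only; buffers and values are made up):
//
//   uint8_t dst[16 * 16], src0[16 * 16], src1[16 * 16], mask[16 * 16];
//   ... fill src0, src1 and a mask with values in [0, 64] ...
//   aom_blend_a64_mask_sse4_1(dst, 16, src0, 16, src1, 16, mask, 16,
//                             16, 16, 0, 0);  // w, h, subw, subh
//
// Note the asserts above: a source may alias dst only if their strides match.
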
#if CONFIG_AV1_HIGHBITDEPTH
//////////////////////////////////////////////////////////////////////////////
// No sub-sampling
//////////////////////////////////////////////////////////////////////////////

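// The high-bitdepth variants below blend uint16_t pixels. Each width kernel
// is shared between the 10- and 12-bit paths via a blend_unit_fn pointer
// (blend_4_b10 / blend_4_b12 etc., from blend_sse4.h); the mask handling
// matches the 8-bit kernels above, but masks are widened to 16 bits before
// blending.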
static inline void blend_a64_mask_bn_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    const __m128i v_m0_b = xx_loadl_32(mask);
    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);

    xx_storel_64(dst, v_res_w);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}

static void blend_a64_mask_b10_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  (void)w;
  blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                              src1_stride, mask, mask_stride, h, blend_4_b10);
}

static void blend_a64_mask_b12_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  (void)w;
  blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                              src1_stride, mask, mask_stride, h, blend_4_b12);
}

static inline void blend_a64_mask_bn_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h,
    blend_unit_fn blend) {
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    int c;
    for (c = 0; c < w; c += 8) {
      const __m128i v_m0_b = xx_loadl_64(mask + c);
      const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);

      xx_storeu_128(dst + c, v_res_w);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}

static void blend_a64_mask_b10_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                               src1_stride, mask, mask_stride, w, h,
                               blend_8_b10);
}

static void blend_a64_mask_b12_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                               src1_stride, mask, mask_stride, w, h,
                               blend_8_b12);
}

//////////////////////////////////////////////////////////////////////////////
// Horizontal sub-sampling
//////////////////////////////////////////////////////////////////////////////

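// Horizontal 2x sub-sampling, 16-bit pixels: each mask byte is averaged with
// its right neighbour (_mm_avg_epu8 of the row against itself shifted right
// by one byte), then v_zmask_b keeps only the even byte lanes, leaving one
// 16-bit mask word per output pixel.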
static inline void blend_a64_mask_bn_sx_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
  const __m128i v_zmask_b =
      _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    const __m128i v_r_b = xx_loadl_64(mask);
    const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));

    const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);

    xx_storel_64(dst, v_res_w);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}

static void blend_a64_mask_b10_sx_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  (void)w;
  blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                 src1_stride, mask, mask_stride, h,
                                 blend_4_b10);
}

static void blend_a64_mask_b12_sx_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  (void)w;
  blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                 src1_stride, mask, mask_stride, h,
                                 blend_4_b12);
}

static inline void blend_a64_mask_bn_sx_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h,
    blend_unit_fn blend) {
  const __m128i v_zmask_b =
      _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    int c;
    for (c = 0; c < w; c += 8) {
      const __m128i v_r_b = xx_loadu_128(mask + 2 * c);
      const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));

      const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);

      xx_storeu_128(dst + c, v_res_w);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}

static void blend_a64_mask_b10_sx_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                  src1_stride, mask, mask_stride, w, h,
                                  blend_8_b10);
}

static void blend_a64_mask_b12_sx_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                  src1_stride, mask, mask_stride, w, h,
                                  blend_8_b12);
}

//////////////////////////////////////////////////////////////////////////////
// Vertical sub-sampling
//////////////////////////////////////////////////////////////////////////////

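// Vertical 2x sub-sampling, 16-bit pixels: as in the 8-bit path, two mask
// rows are averaged with _mm_avg_epu8 and the result is zero-extended to
// 16 bits.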
static inline void blend_a64_mask_bn_sy_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    const __m128i v_ra_b = xx_loadl_32(mask);
    const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
    const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);

    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);

    xx_storel_64(dst, v_res_w);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;
  } while (--h);
}

static void blend_a64_mask_b10_sy_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  (void)w;
  blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                 src1_stride, mask, mask_stride, h,
                                 blend_4_b10);
}

static void blend_a64_mask_b12_sy_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  (void)w;
  blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                 src1_stride, mask, mask_stride, h,
                                 blend_4_b12);
}

static inline void blend_a64_mask_bn_sy_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h,
    blend_unit_fn blend) {
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    int c;
    for (c = 0; c < w; c += 8) {
      const __m128i v_ra_b = xx_loadl_64(mask + c);
      const __m128i v_rb_b = xx_loadl_64(mask + c + mask_stride);
      const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);

      const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);

      xx_storeu_128(dst + c, v_res_w);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;
  } while (--h);
}

static void blend_a64_mask_b10_sy_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                  src1_stride, mask, mask_stride, w, h,
                                  blend_8_b10);
}

static void blend_a64_mask_b12_sy_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                  src1_stride, mask, mask_stride, w, h,
                                  blend_8_b12);
}

//////////////////////////////////////////////////////////////////////////////
// Horizontal and Vertical sub-sampling
//////////////////////////////////////////////////////////////////////////////

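// 2x2 sub-sampling, 16-bit pixels: two mask rows are summed byte-wise (the
// sums fit, since each value is at most 64), the even and odd byte lanes are
// separated with v_zmask_b rather than a shuffle, summed as 16-bit words,
// and rounded by 2 bits to divide the four samples by 4.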
static inline void blend_a64_mask_bn_sx_sy_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
  const __m128i v_zmask_b =
      _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    const __m128i v_ra_b = xx_loadl_64(mask);
    const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
    const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
    const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
    const __m128i v_rvsb_w =
        _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
    const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);

    const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);

    xx_storel_64(dst, v_res_w);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;
  } while (--h);
}

static void blend_a64_mask_b10_sx_sy_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  (void)w;
  blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                    src1_stride, mask, mask_stride, h,
                                    blend_4_b10);
}

static void blend_a64_mask_b12_sx_sy_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  (void)w;
  blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                    src1_stride, mask, mask_stride, h,
                                    blend_4_b12);
}

static inline void blend_a64_mask_bn_sx_sy_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h,
    blend_unit_fn blend) {
  const __m128i v_zmask_b =
      _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    int c;
    for (c = 0; c < w; c += 8) {
      const __m128i v_ra_b = xx_loadu_128(mask + 2 * c);
      const __m128i v_rb_b = xx_loadu_128(mask + 2 * c + mask_stride);
      const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
      const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
      const __m128i v_rvsb_w =
          _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
      const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);

      const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);

      xx_storeu_128(dst + c, v_res_w);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;
  } while (--h);
}

static void blend_a64_mask_b10_sx_sy_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                     src1_stride, mask, mask_stride, w, h,
                                     blend_8_b10);
}

static void blend_a64_mask_b12_sx_sy_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                     src1_stride, mask, mask_stride, w, h,
                                     blend_8_b12);
}

//////////////////////////////////////////////////////////////////////////////
// Dispatch
//////////////////////////////////////////////////////////////////////////////
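// Width index: w == 4 selects the w4 kernels ((w >> 2) & 1 == 1); every other
// legal width is a multiple of 8 and selects the w8n kernels. bd == 8 and
// bd == 10 share the b10 kernels, presumably because mask * pixel products
// still fit in 16 bits at 10 bits of depth; bd == 12 needs the b12 variants.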
void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride,
                                      const uint8_t *src0_8,
                                      uint32_t src0_stride,
                                      const uint8_t *src1_8,
                                      uint32_t src1_stride, const uint8_t *mask,
                                      uint32_t mask_stride, int w, int h,
                                      int subw, int subh, int bd) {
  typedef void (*blend_fn)(
      uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
      uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
      const uint8_t *mask, uint32_t mask_stride, int w, int h);

  // Dimensions are: bd_index X width_index X subw X subh
  static const blend_fn blend[2][2][2][2] = {
    {   // bd == 8 or 10
      { // w % 8 == 0
        { blend_a64_mask_b10_w8n_sse4_1, blend_a64_mask_b10_sy_w8n_sse4_1 },
        { blend_a64_mask_b10_sx_w8n_sse4_1,
          blend_a64_mask_b10_sx_sy_w8n_sse4_1 } },
      { // w == 4
        { blend_a64_mask_b10_w4_sse4_1, blend_a64_mask_b10_sy_w4_sse4_1 },
        { blend_a64_mask_b10_sx_w4_sse4_1,
          blend_a64_mask_b10_sx_sy_w4_sse4_1 } } },
    {   // bd == 12
      { // w % 8 == 0
        { blend_a64_mask_b12_w8n_sse4_1, blend_a64_mask_b12_sy_w8n_sse4_1 },
        { blend_a64_mask_b12_sx_w8n_sse4_1,
          blend_a64_mask_b12_sx_sy_w8n_sse4_1 } },
      { // w == 4
        { blend_a64_mask_b12_w4_sse4_1, blend_a64_mask_b12_sy_w4_sse4_1 },
        { blend_a64_mask_b12_sx_w4_sse4_1,
          blend_a64_mask_b12_sx_sy_w4_sse4_1 } } }
  };

  assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
  assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));

  assert(h >= 1);
  assert(w >= 1);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  assert(bd == 8 || bd == 10 || bd == 12);
  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
    aom_highbd_blend_a64_mask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
                                src1_stride, mask, mask_stride, w, h, subw,
                                subh, bd);
  } else {
    uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
    const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
    const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);

    blend[bd == 12][(w >> 2) & 1][subw != 0][subh != 0](
        dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
        mask_stride, w, h);
  }
}
#endif  // CONFIG_AV1_HIGHBITDEPTH

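// d16 blending: src0/src1 hold CONV_BUF_TYPE intermediates from compound
// convolution, which carry an offset and extra precision. The kernel forms
// m * s0 + (64 - m) * s1 with _mm_madd_epi16 on interleaved (s0, s1) and
// (m, 64 - m) pairs, then subtracts round_offset and shifts back down to the
// 8-bit output range.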
static inline void blend_a64_d16_mask_w16_sse41(
    uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
    const __m128i *m0, const __m128i *m1, const __m128i *v_round_offset,
    const __m128i *v_maxval, int shift) {
  const __m128i max_minus_m0 = _mm_sub_epi16(*v_maxval, *m0);
  const __m128i max_minus_m1 = _mm_sub_epi16(*v_maxval, *m1);
  const __m128i s0_0 = xx_loadu_128(src0);
  const __m128i s0_1 = xx_loadu_128(src0 + 8);
  const __m128i s1_0 = xx_loadu_128(src1);
  const __m128i s1_1 = xx_loadu_128(src1 + 8);
  __m128i res0_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_0, s1_0),
                                   _mm_unpacklo_epi16(*m0, max_minus_m0));
  __m128i res0_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_0, s1_0),
                                   _mm_unpackhi_epi16(*m0, max_minus_m0));
  __m128i res1_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_1, s1_1),
                                   _mm_unpacklo_epi16(*m1, max_minus_m1));
  __m128i res1_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_1, s1_1),
                                   _mm_unpackhi_epi16(*m1, max_minus_m1));
  res0_lo = _mm_srai_epi32(_mm_sub_epi32(res0_lo, *v_round_offset), shift);
  res0_hi = _mm_srai_epi32(_mm_sub_epi32(res0_hi, *v_round_offset), shift);
  res1_lo = _mm_srai_epi32(_mm_sub_epi32(res1_lo, *v_round_offset), shift);
  res1_hi = _mm_srai_epi32(_mm_sub_epi32(res1_hi, *v_round_offset), shift);
  const __m128i res0 = _mm_packs_epi32(res0_lo, res0_hi);
  const __m128i res1 = _mm_packs_epi32(res1_lo, res1_hi);
  const __m128i res = _mm_packus_epi16(res0, res1);

  _mm_storeu_si128((__m128i *)(dst), res);
}

static inline void lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w,
    const __m128i *round_offset, int shift) {
  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; j += 16) {
      const __m128i m = xx_loadu_128(mask + j);
      const __m128i m0 = _mm_cvtepu8_epi16(m);
      const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m, 8));

      blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
                                   round_offset, &v_maxval, shift);
    }
    mask += mask_stride;
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
  }
}

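// The sub-sampled d16 variants reduce the mask before blending:
// _mm_maddubs_epi16(x, 1) sums adjacent byte pairs into 16-bit words,
// adding 2 and shifting right by 2 rounds a 2x2 sum, and
// _mm_avg_epu16(x, 0) rounds a horizontal pair sum by one bit.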
static inline void lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w,
    const __m128i *round_offset, int shift) {
  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i one_b = _mm_set1_epi8(1);
  const __m128i two_w = _mm_set1_epi16(2);
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; j += 16) {
      const __m128i m_i00 = xx_loadu_128(mask + 2 * j);
      const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16);
      const __m128i m_i10 = xx_loadu_128(mask + mask_stride + 2 * j);
      const __m128i m_i11 = xx_loadu_128(mask + mask_stride + 2 * j + 16);

      const __m128i m0_ac = _mm_adds_epu8(m_i00, m_i10);
      const __m128i m1_ac = _mm_adds_epu8(m_i01, m_i11);
      const __m128i m0_acbd = _mm_maddubs_epi16(m0_ac, one_b);
      const __m128i m1_acbd = _mm_maddubs_epi16(m1_ac, one_b);
      const __m128i m0 = _mm_srli_epi16(_mm_add_epi16(m0_acbd, two_w), 2);
      const __m128i m1 = _mm_srli_epi16(_mm_add_epi16(m1_acbd, two_w), 2);

      blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
                                   round_offset, &v_maxval, shift);
    }
    mask += mask_stride << 1;
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
  }
}

static inline void lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w,
    const __m128i *round_offset, int shift) {
  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i one_b = _mm_set1_epi8(1);
  const __m128i zeros = _mm_setzero_si128();
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; j += 16) {
      const __m128i m_i00 = xx_loadu_128(mask + 2 * j);
      const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16);
      const __m128i m0_ac = _mm_maddubs_epi16(m_i00, one_b);
      const __m128i m1_ac = _mm_maddubs_epi16(m_i01, one_b);
      const __m128i m0 = _mm_avg_epu16(m0_ac, zeros);
      const __m128i m1 = _mm_avg_epu16(m1_ac, zeros);

      blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
                                   round_offset, &v_maxval, shift);
    }
    mask += mask_stride;
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
  }
}

static inline void lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w,
    const __m128i *round_offset, int shift) {
  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i zeros = _mm_setzero_si128();
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; j += 16) {
      const __m128i m_i00 = xx_loadu_128(mask + j);
      const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j);

      const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros);
      const __m128i m0 = _mm_cvtepu8_epi16(m_ac);
      const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m_ac, 8));

      blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
                                   round_offset, &v_maxval, shift);
    }
    mask += mask_stride << 1;
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
  }
}

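// The d16 dispatcher below: CONV_BUF_TYPE intermediates carry the compound
// prediction offset (1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)),
// so round_offset folds that offset (scaled by the blend's 6 fractional
// bits) together with the 1 << (round_bits - 1) rounding term into a single
// value subtracted before the final shift. The w == 4 and w == 8 cases use
// the aom_lowbd_* helpers shared with the AVX2 version (presumably via
// blend_mask_sse4.h, included above); only the 16-wide loops live here.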
void aom_lowbd_blend_a64_d16_mask_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
    ConvolveParams *conv_params) {
  const int bd = 8;
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;

  const int round_offset =
      ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
       (1 << (round_bits - 1)))
      << AOM_BLEND_A64_ROUND_BITS;

  const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
  assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
  assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));

  assert(h >= 4);
  assert(w >= 4);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  const __m128i v_round_offset = _mm_set1_epi32(round_offset);

  if (subw == 0 && subh == 0) {
    switch (w) {
      case 4:
        aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, &v_round_offset, shift);
        break;
      case 8:
        aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, &v_round_offset, shift);
        break;
      default:
        lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, w, &v_round_offset, shift);
        break;
    }

  } else if (subw == 1 && subh == 1) {
    switch (w) {
      case 4:
        aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, &v_round_offset, shift);
        break;
      case 8:
        aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, &v_round_offset, shift);
        break;
      default:
        lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, w, &v_round_offset, shift);
        break;
    }
  } else if (subw == 1 && subh == 0) {
    switch (w) {
      case 4:
        aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, &v_round_offset, shift);
        break;
      case 8:
        aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, &v_round_offset, shift);
        break;
      default:
        lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, w, &v_round_offset, shift);
        break;
    }
  } else {
    switch (w) {
      case 4:
        aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, &v_round_offset, shift);
        break;
      case 8:
        aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, &v_round_offset, shift);
        break;
      default:
        lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1(
            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
            mask_stride, h, w, &v_round_offset, shift);
        break;
    }
  }
}

//////////////////////////////////////////////////////////////////////////////
// aom_highbd_blend_a64_d16_mask_sse4_1()
//////////////////////////////////////////////////////////////////////////////
#if CONFIG_AV1_HIGHBITDEPTH
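// At high bit depth the product of a mask (up to 64) and a 16-bit
// convolution intermediate can span well over 16 bits, so the w4 kernel
// below builds full 32-bit products from _mm_mulhi_epu16 / _mm_mullo_epi16
// pairs before summing, rounding, packing and clamping to
// [clip_low, clip_high].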
   1116 static inline void highbd_blend_a64_d16_mask_w4_sse4_1(
   1117    uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
   1118    const CONV_BUF_TYPE *src1, int src1_stride, const __m128i *mask0a,
   1119    const __m128i *mask0b, const __m128i *round_offset, int shift,
   1120    const __m128i *clip_low, const __m128i *clip_high,
   1121    const __m128i *mask_max) {
   1122  // Load 4 pixels from each of 4 rows from each source
   1123  const __m128i s0a = xx_loadu_2x64(src0, src0 + src0_stride);
   1124  const __m128i s0b =
   1125      xx_loadu_2x64(src0 + 2 * src0_stride, src0 + 3 * src0_stride);
   1126  const __m128i s1a = xx_loadu_2x64(src1, src1 + src1_stride);
   1127  const __m128i s1b =
   1128      xx_loadu_2x64(src1 + 2 * src1_stride, src1 + 3 * src1_stride);
   1129 
   1130  // Generate the inverse masks
   1131  const __m128i mask1a = _mm_sub_epi16(*mask_max, *mask0a);
   1132  const __m128i mask1b = _mm_sub_epi16(*mask_max, *mask0b);
   1133 
   1134  // Multiply each mask by the respective source
   1135  const __m128i mul0a_highs = _mm_mulhi_epu16(*mask0a, s0a);
   1136  const __m128i mul0a_lows = _mm_mullo_epi16(*mask0a, s0a);
   1137  const __m128i mul0ah = _mm_unpackhi_epi16(mul0a_lows, mul0a_highs);
   1138  const __m128i mul0al = _mm_unpacklo_epi16(mul0a_lows, mul0a_highs);
   1139  const __m128i mul1a_highs = _mm_mulhi_epu16(mask1a, s1a);
   1140  const __m128i mul1a_lows = _mm_mullo_epi16(mask1a, s1a);
   1141  const __m128i mul1ah = _mm_unpackhi_epi16(mul1a_lows, mul1a_highs);
   1142  const __m128i mul1al = _mm_unpacklo_epi16(mul1a_lows, mul1a_highs);
   1143 
   1144  const __m128i mul0b_highs = _mm_mulhi_epu16(*mask0b, s0b);
   1145  const __m128i mul0b_lows = _mm_mullo_epi16(*mask0b, s0b);
   1146  const __m128i mul0bh = _mm_unpackhi_epi16(mul0b_lows, mul0b_highs);
   1147  const __m128i mul0bl = _mm_unpacklo_epi16(mul0b_lows, mul0b_highs);
   1148  const __m128i mul1b_highs = _mm_mulhi_epu16(mask1b, s1b);
   1149  const __m128i mul1b_lows = _mm_mullo_epi16(mask1b, s1b);
   1150  const __m128i mul1bh = _mm_unpackhi_epi16(mul1b_lows, mul1b_highs);
   1151  const __m128i mul1bl = _mm_unpacklo_epi16(mul1b_lows, mul1b_highs);
   1152 
   1153  const __m128i sumah = _mm_add_epi32(mul0ah, mul1ah);
   1154  const __m128i sumal = _mm_add_epi32(mul0al, mul1al);
   1155  const __m128i sumbh = _mm_add_epi32(mul0bh, mul1bh);
   1156  const __m128i sumbl = _mm_add_epi32(mul0bl, mul1bl);
   1157 
   1158  const __m128i roundah =
   1159      _mm_srai_epi32(_mm_sub_epi32(sumah, *round_offset), shift);
   1160  const __m128i roundbh =
   1161      _mm_srai_epi32(_mm_sub_epi32(sumbh, *round_offset), shift);
   1162  const __m128i roundal =
   1163      _mm_srai_epi32(_mm_sub_epi32(sumal, *round_offset), shift);
   1164  const __m128i roundbl =
   1165      _mm_srai_epi32(_mm_sub_epi32(sumbl, *round_offset), shift);
   1166 
   1167  const __m128i packa = _mm_packs_epi32(roundal, roundah);
   1168  const __m128i packb = _mm_packs_epi32(roundbl, roundbh);
   1169 
   1170  const __m128i clipa =
   1171      _mm_min_epi16(_mm_max_epi16(packa, *clip_low), *clip_high);
   1172  const __m128i clipb =
   1173      _mm_min_epi16(_mm_max_epi16(packb, *clip_low), *clip_high);
   1174 
   1175  xx_storel_64(dst, _mm_srli_si128(clipa, 8));
   1176  xx_storel_64(dst + dst_stride, clipa);
   1177  xx_storel_64(dst + 2 * dst_stride, _mm_srli_si128(clipb, 8));
   1178  xx_storel_64(dst + 3 * dst_stride, clipb);
   1179 }
   1180 
   1181 static inline void highbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
   1182    uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
   1183    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
   1184    const uint8_t *mask, uint32_t mask_stride, int h,
   1185    const __m128i *round_offset, int shift, const __m128i *clip_low,
   1186    const __m128i *clip_high, const __m128i *mask_max) {
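         // The caller asserts that h is a power of two and at least 4, so
         // stepping four rows per iteration terminates exactly.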
   1187  do {
   1188    const __m128i mask0a8 =
   1189        _mm_set_epi32(0, 0, *(int32_t *)mask, *(int32_t *)(mask + mask_stride));
   1190    const __m128i mask0b8 =
   1191        _mm_set_epi32(0, 0, *(int32_t *)(mask + 2 * mask_stride),
   1192                      *(int32_t *)(mask + 3 * mask_stride));
   1193    const __m128i mask0a = _mm_cvtepu8_epi16(mask0a8);
   1194    const __m128i mask0b = _mm_cvtepu8_epi16(mask0b8);
   1195 
   1196    highbd_blend_a64_d16_mask_w4_sse4_1(
   1197        dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0a, &mask0b,
   1198        round_offset, shift, clip_low, clip_high, mask_max);
   1199 
   1200    dst += dst_stride * 4;
   1201    src0 += src0_stride * 4;
   1202    src1 += src1_stride * 4;
   1203    mask += mask_stride * 4;
   1204  } while (h -= 4);
   1205 }
   1206 
   1207 static inline void highbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
   1208    uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
   1209    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
   1210    const uint8_t *mask, uint32_t mask_stride, int h,
   1211    const __m128i *round_offset, int shift, const __m128i *clip_low,
   1212    const __m128i *clip_high, const __m128i *mask_max) {
   1213  const __m128i one_b = _mm_set1_epi8(1);
   1214  const __m128i two_w = _mm_set1_epi16(2);
   1215  do {
   1216    // Load 8 mask bytes from each of 8 rows (the mask is at double
   1217    // resolution), saturating-add vertically adjacent rows, use madd to
   1218    // sum horizontally adjacent bytes, then divide by 4 with rounding.
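           // For each 2x2 block of mask values a, b, c, d this yields
           // (a + b + c + d + 2) >> 2; the saturating add cannot overflow
           // because mask values are at most AOM_BLEND_A64_MAX_ALPHA (64).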
   1219    const __m128i m02 = _mm_set_epi64x(*(int64_t *)(mask),
   1220                                       *(int64_t *)(mask + 2 * mask_stride));
   1221    const __m128i m13 = _mm_set_epi64x(*(int64_t *)(mask + mask_stride),
   1222                                       *(int64_t *)(mask + 3 * mask_stride));
   1223    const __m128i m0123 = _mm_maddubs_epi16(_mm_adds_epu8(m02, m13), one_b);
   1224    const __m128i mask_0a = _mm_srli_epi16(_mm_add_epi16(m0123, two_w), 2);
   1225    const __m128i m46 = _mm_set_epi64x(*(int64_t *)(mask + 4 * mask_stride),
   1226                                       *(int64_t *)(mask + 6 * mask_stride));
   1227    const __m128i m57 = _mm_set_epi64x(*(int64_t *)(mask + 5 * mask_stride),
   1228                                       *(int64_t *)(mask + 7 * mask_stride));
   1229    const __m128i m4567 = _mm_maddubs_epi16(_mm_adds_epu8(m46, m57), one_b);
   1230    const __m128i mask_0b = _mm_srli_epi16(_mm_add_epi16(m4567, two_w), 2);
   1231 
   1232    highbd_blend_a64_d16_mask_w4_sse4_1(
   1233        dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask_0a,
   1234        &mask_0b, round_offset, shift, clip_low, clip_high, mask_max);
   1235 
   1236    dst += dst_stride * 4;
   1237    src0 += src0_stride * 4;
   1238    src1 += src1_stride * 4;
   1239    mask += mask_stride * 8;
   1240  } while (h -= 4);
   1241 }
   1242 
   1243 static inline void highbd_blend_a64_d16_mask_w8_sse4_1(
   1244    uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
   1245    const CONV_BUF_TYPE *src1, int src1_stride, const __m128i *mask0a,
   1246    const __m128i *mask0b, const __m128i *round_offset, int shift,
   1247    const __m128i *clip_low, const __m128i *clip_high,
   1248    const __m128i *mask_max) {
   1249  // Load 8 u16 pixels from each of 2 rows of each source
   1250  const __m128i s0a = xx_loadu_128(src0);
   1251  const __m128i s0b = xx_loadu_128(src0 + src0_stride);
   1252  const __m128i s1a = xx_loadu_128(src1);
   1253  const __m128i s1b = xx_loadu_128(src1 + src1_stride);
   1254 
   1255  // Generate inverse masks
   1256  const __m128i mask1a = _mm_sub_epi16(*mask_max, *mask0a);
   1257  const __m128i mask1b = _mm_sub_epi16(*mask_max, *mask0b);
   1258 
   1259  // Multiply sources by respective masks
   1260  const __m128i mul0a_highs = _mm_mulhi_epu16(*mask0a, s0a);
   1261  const __m128i mul0a_lows = _mm_mullo_epi16(*mask0a, s0a);
   1262  const __m128i mul0ah = _mm_unpackhi_epi16(mul0a_lows, mul0a_highs);
   1263  const __m128i mul0al = _mm_unpacklo_epi16(mul0a_lows, mul0a_highs);
   1264 
   1265  const __m128i mul1a_highs = _mm_mulhi_epu16(mask1a, s1a);
   1266  const __m128i mul1a_lows = _mm_mullo_epi16(mask1a, s1a);
   1267  const __m128i mul1ah = _mm_unpackhi_epi16(mul1a_lows, mul1a_highs);
   1268  const __m128i mul1al = _mm_unpacklo_epi16(mul1a_lows, mul1a_highs);
   1269 
   1270  const __m128i sumah = _mm_add_epi32(mul0ah, mul1ah);
   1271  const __m128i sumal = _mm_add_epi32(mul0al, mul1al);
   1272 
   1273  const __m128i mul0b_highs = _mm_mulhi_epu16(*mask0b, s0b);
   1274  const __m128i mul0b_lows = _mm_mullo_epi16(*mask0b, s0b);
   1275  const __m128i mul0bh = _mm_unpackhi_epi16(mul0b_lows, mul0b_highs);
   1276  const __m128i mul0bl = _mm_unpacklo_epi16(mul0b_lows, mul0b_highs);
   1277 
   1278  const __m128i mul1b_highs = _mm_mulhi_epu16(mask1b, s1b);
   1279  const __m128i mul1b_lows = _mm_mullo_epi16(mask1b, s1b);
   1280  const __m128i mul1bh = _mm_unpackhi_epi16(mul1b_lows, mul1b_highs);
   1281  const __m128i mul1bl = _mm_unpacklo_epi16(mul1b_lows, mul1b_highs);
   1282 
   1283  const __m128i sumbh = _mm_add_epi32(mul0bh, mul1bh);
   1284  const __m128i sumbl = _mm_add_epi32(mul0bl, mul1bl);
   1285 
   1286  const __m128i roundah =
   1287      _mm_srai_epi32(_mm_sub_epi32(sumah, *round_offset), shift);
   1288  const __m128i roundal =
   1289      _mm_srai_epi32(_mm_sub_epi32(sumal, *round_offset), shift);
   1290  const __m128i roundbh =
   1291      _mm_srai_epi32(_mm_sub_epi32(sumbh, *round_offset), shift);
   1292  const __m128i roundbl =
   1293      _mm_srai_epi32(_mm_sub_epi32(sumbl, *round_offset), shift);
   1294 
   1295  const __m128i packa = _mm_packs_epi32(roundal, roundah);
   1296  const __m128i clipa =
   1297      _mm_min_epi16(_mm_max_epi16(packa, *clip_low), *clip_high);
   1298  const __m128i packb = _mm_packs_epi32(roundbl, roundbh);
   1299  const __m128i clipb =
   1300      _mm_min_epi16(_mm_max_epi16(packb, *clip_low), *clip_high);
   1301 
   1302  xx_storeu_128(dst, clipa);
   1303  xx_storeu_128(dst + dst_stride, clipb);
   1304 }
   1305 
   1306 static inline void highbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
   1307    uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
   1308    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
   1309    const uint8_t *mask, uint32_t mask_stride, int h,
   1310    const __m128i *round_offset, int shift, const __m128i *clip_low,
   1311    const __m128i *clip_high, const __m128i *mask_max) {
   1312  do {
   1313    const __m128i mask0a = _mm_cvtepu8_epi16(xx_loadl_64(mask));
   1314    const __m128i mask0b = _mm_cvtepu8_epi16(xx_loadl_64(mask + mask_stride));
   1315    highbd_blend_a64_d16_mask_w8_sse4_1(
   1316        dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0a, &mask0b,
   1317        round_offset, shift, clip_low, clip_high, mask_max);
   1318 
   1319    dst += dst_stride * 2;
   1320    src0 += src0_stride * 2;
   1321    src1 += src1_stride * 2;
   1322    mask += mask_stride * 2;
   1323  } while (h -= 2);
   1324 }
   1325 
   1326 static inline void highbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
   1327    uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
   1328    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
   1329    const uint8_t *mask, uint32_t mask_stride, int h,
   1330    const __m128i *round_offset, int shift, const __m128i *clip_low,
   1331    const __m128i *clip_high, const __m128i *mask_max) {
   1332  const __m128i one_b = _mm_set1_epi8(1);
   1333  const __m128i two_w = _mm_set1_epi16(2);
   1334  do {
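           // Average each 2x2 block of mask values down to a single value,
           // exactly as in the w4 case above.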
   1335    const __m128i mask_thisrowa = xx_loadu_128(mask);
   1336    const __m128i mask_nextrowa = xx_loadu_128(mask + mask_stride);
   1337    const __m128i mask_thisrowb = xx_loadu_128(mask + 2 * mask_stride);
   1338    const __m128i mask_nextrowb = xx_loadu_128(mask + 3 * mask_stride);
   1339    const __m128i mask_bothrowsa = _mm_adds_epu8(mask_thisrowa, mask_nextrowa);
   1340    const __m128i mask_bothrowsb = _mm_adds_epu8(mask_thisrowb, mask_nextrowb);
   1341    const __m128i mask_16a = _mm_maddubs_epi16(mask_bothrowsa, one_b);
   1342    const __m128i mask_16b = _mm_maddubs_epi16(mask_bothrowsb, one_b);
   1343    const __m128i mask_sa = _mm_srli_epi16(_mm_add_epi16(mask_16a, two_w), 2);
   1344    const __m128i mask_sb = _mm_srli_epi16(_mm_add_epi16(mask_16b, two_w), 2);
   1345 
   1346    highbd_blend_a64_d16_mask_w8_sse4_1(
   1347        dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask_sa,
   1348        &mask_sb, round_offset, shift, clip_low, clip_high, mask_max);
   1349 
   1350    dst += dst_stride * 2;
   1351    src0 += src0_stride * 2;
   1352    src1 += src1_stride * 2;
   1353    mask += mask_stride * 4;
   1354  } while (h -= 2);
   1355 }
   1356 
   1357 static inline void highbd_blend_a64_d16_mask_w16_sse4_1(
   1358    uint16_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
   1359    const __m128i *round_offset, int shift, const __m128i *mask0l,
   1360    const __m128i *mask0h, const __m128i *clip_low, const __m128i *clip_high,
   1361    const __m128i *mask_max) {
   1362  // Load 16 u16 pixels for this row from each source
   1363  const __m128i s0l = xx_loadu_128(src0);
   1364  const __m128i s0h = xx_loadu_128(src0 + 8);
   1365  const __m128i s1l = xx_loadu_128(src1);
   1366  const __m128i s1h = xx_loadu_128(src1 + 8);
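         // Naming of the sums below: in mulhh/mulhl/mullh/mulll the first l/h
         // selects the lower/upper 8 pixels of the row, the second the 32-bit
         // half produced by the unpack step.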
   1367 
   1368  // Calculate inverse masks
   1369  const __m128i mask1h = _mm_sub_epi16(*mask_max, *mask0h);
   1370  const __m128i mask1l = _mm_sub_epi16(*mask_max, *mask0l);
   1371 
   1372  const __m128i mul0_highs = _mm_mulhi_epu16(*mask0h, s0h);
   1373  const __m128i mul0_lows = _mm_mullo_epi16(*mask0h, s0h);
   1374  const __m128i mul0h = _mm_unpackhi_epi16(mul0_lows, mul0_highs);
   1375  const __m128i mul0l = _mm_unpacklo_epi16(mul0_lows, mul0_highs);
   1376 
   1377  const __m128i mul1_highs = _mm_mulhi_epu16(mask1h, s1h);
   1378  const __m128i mul1_lows = _mm_mullo_epi16(mask1h, s1h);
   1379  const __m128i mul1h = _mm_unpackhi_epi16(mul1_lows, mul1_highs);
   1380  const __m128i mul1l = _mm_unpacklo_epi16(mul1_lows, mul1_highs);
   1381 
   1382  const __m128i mulhh = _mm_add_epi32(mul0h, mul1h);
   1383  const __m128i mulhl = _mm_add_epi32(mul0l, mul1l);
   1384 
   1385  const __m128i mul2_highs = _mm_mulhi_epu16(*mask0l, s0l);
   1386  const __m128i mul2_lows = _mm_mullo_epi16(*mask0l, s0l);
   1387  const __m128i mul2h = _mm_unpackhi_epi16(mul2_lows, mul2_highs);
   1388  const __m128i mul2l = _mm_unpacklo_epi16(mul2_lows, mul2_highs);
   1389 
   1390  const __m128i mul3_highs = _mm_mulhi_epu16(mask1l, s1l);
   1391  const __m128i mul3_lows = _mm_mullo_epi16(mask1l, s1l);
   1392  const __m128i mul3h = _mm_unpackhi_epi16(mul3_lows, mul3_highs);
   1393  const __m128i mul3l = _mm_unpacklo_epi16(mul3_lows, mul3_highs);
   1394 
   1395  const __m128i mullh = _mm_add_epi32(mul2h, mul3h);
   1396  const __m128i mulll = _mm_add_epi32(mul2l, mul3l);
   1397 
   1398  const __m128i reshh =
   1399      _mm_srai_epi32(_mm_sub_epi32(mulhh, *round_offset), shift);
   1400  const __m128i reshl =
   1401      _mm_srai_epi32(_mm_sub_epi32(mulhl, *round_offset), shift);
   1402  const __m128i reslh =
   1403      _mm_srai_epi32(_mm_sub_epi32(mullh, *round_offset), shift);
   1404  const __m128i resll =
   1405      _mm_srai_epi32(_mm_sub_epi32(mulll, *round_offset), shift);
   1406 
   1407  // Signed saturating pack from i32 to i16:
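         // Saturation here is harmless: the clamp below enforces
         // [0, (1 << bd) - 1], which lies well inside the i16 range.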
   1408  const __m128i packh = _mm_packs_epi32(reshl, reshh);
   1409  const __m128i packl = _mm_packs_epi32(resll, reslh);
   1410 
   1411  // Clip the values to the valid range
   1412  const __m128i cliph =
   1413      _mm_min_epi16(_mm_max_epi16(packh, *clip_low), *clip_high);
   1414  const __m128i clipl =
   1415      _mm_min_epi16(_mm_max_epi16(packl, *clip_low), *clip_high);
   1416 
   1417  // Store 16 pixels
   1418  xx_storeu_128(dst, clipl);
   1419  xx_storeu_128(dst + 8, cliph);
   1420 }
   1421 
   1422 static inline void highbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1(
   1423    uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
   1424    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
   1425    const uint8_t *mask, uint32_t mask_stride, int h, int w,
   1426    const __m128i *round_offset, int shift, const __m128i *clip_low,
   1427    const __m128i *clip_high, const __m128i *mask_max) {
   1428  for (int i = 0; i < h; i++) {
   1429    for (int j = 0; j < w; j += 16) {
   1430      // Load 16 u8 alpha-mask values and zero-extend to u16
   1431      const __m128i masks_u8 = xx_loadu_128(mask + j);
   1432      const __m128i mask0l = _mm_cvtepu8_epi16(masks_u8);
   1433      const __m128i mask0h = _mm_cvtepu8_epi16(_mm_srli_si128(masks_u8, 8));
   1434 
   1435      highbd_blend_a64_d16_mask_w16_sse4_1(
   1436          dst + j, src0 + j, src1 + j, round_offset, shift, &mask0l, &mask0h,
   1437          clip_low, clip_high, mask_max);
   1438    }
   1439    dst += dst_stride;
   1440    src0 += src0_stride;
   1441    src1 += src1_stride;
   1442    mask += mask_stride;
   1443  }
   1444 }
   1445 
   1446 static inline void highbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1(
   1447    uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
   1448    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
   1449    const uint8_t *mask, uint32_t mask_stride, int h, int w,
   1450    const __m128i *round_offset, int shift, const __m128i *clip_low,
   1451    const __m128i *clip_high, const __m128i *mask_max) {
   1452  const __m128i one_b = _mm_set1_epi8(1);
   1453  const __m128i two_w = _mm_set1_epi16(2);
   1454  for (int i = 0; i < h; i++) {
   1455    for (int j = 0; j < w; j += 16) {
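             // The mask is at double resolution in both axes, so each group of
             // 16 output pixels consumes 32 mask bytes from each of two mask
             // rows (hence the 2 * j offsets).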
   1456      const __m128i m_i00 = xx_loadu_128(mask + 2 * j);
   1457      const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16);
   1458      const __m128i m_i10 = xx_loadu_128(mask + mask_stride + 2 * j);
   1459      const __m128i m_i11 = xx_loadu_128(mask + mask_stride + 2 * j + 16);
   1460 
   1461      const __m128i m0_ac = _mm_adds_epu8(m_i00, m_i10);
   1462      const __m128i m1_ac = _mm_adds_epu8(m_i01, m_i11);
   1463      const __m128i m0_acbd = _mm_maddubs_epi16(m0_ac, one_b);
   1464      const __m128i m1_acbd = _mm_maddubs_epi16(m1_ac, one_b);
   1465      const __m128i mask_l = _mm_srli_epi16(_mm_add_epi16(m0_acbd, two_w), 2);
   1466      const __m128i mask_h = _mm_srli_epi16(_mm_add_epi16(m1_acbd, two_w), 2);
   1467 
   1468      highbd_blend_a64_d16_mask_w16_sse4_1(
   1469          dst + j, src0 + j, src1 + j, round_offset, shift, &mask_l, &mask_h,
   1470          clip_low, clip_high, mask_max);
   1471    }
   1472    dst += dst_stride;
   1473    src0 += src0_stride;
   1474    src1 += src1_stride;
   1475    mask += mask_stride * 2;
   1476  }
   1477 }
   1478 
   1479 void aom_highbd_blend_a64_d16_mask_sse4_1(
   1480    uint8_t *dst8, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
   1481    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
   1482    const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
   1483    ConvolveParams *conv_params, const int bd) {
   1484  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
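         // The d16 intermediates carry the compound-convolve offset
         // (1 << (bd + round_bits)) + (1 << (bd + round_bits - 1)); round_offset
         // folds that offset together with the final rounding term and is
         // pre-scaled by AOM_BLEND_A64_ROUND_BITS so each kernel needs only a
         // single arithmetic shift.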
   1485  const int round_bits =
   1486      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
   1487  const int32_t round_offset =
   1488      ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
   1489       (1 << (round_bits - 1)))
   1490      << AOM_BLEND_A64_ROUND_BITS;
   1491  const __m128i v_round_offset = _mm_set1_epi32(round_offset);
   1492  const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
   1493 
   1494  const __m128i clip_low = _mm_setzero_si128();
   1495  const __m128i clip_high = _mm_set1_epi16((1 << bd) - 1);
   1496  const __m128i mask_max = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
   1497 
   1498  assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
   1499  assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
   1500 
   1501  assert(h >= 4);
   1502  assert(w >= 4);
   1503  assert(IS_POWER_OF_TWO(h));
   1504  assert(IS_POWER_OF_TWO(w));
   1505 
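         // subw/subh == 1 means the mask is supplied at twice the block
         // resolution in that axis, as happens for subsampled chroma planes.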
   1506  if (subw == 0 && subh == 0) {
   1507    switch (w) {
   1508      case 4:
   1509        highbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
   1510            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
   1511            mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
   1512            &mask_max);
   1513        break;
   1514      case 8:
   1515        highbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
   1516            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
   1517            mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
   1518            &mask_max);
   1519        break;
   1520      default:  // >=16
   1521        highbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1(
   1522            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
   1523            mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high,
   1524            &mask_max);
   1525        break;
   1526    }
   1527 
   1528  } else if (subw == 1 && subh == 1) {
   1529    switch (w) {
   1530      case 4:
   1531        highbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
   1532            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
   1533            mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
   1534            &mask_max);
   1535        break;
   1536      case 8:
   1537        highbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
   1538            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
   1539            mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
   1540            &mask_max);
   1541        break;
   1542      default:  // >=16
   1543        highbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1(
   1544            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
   1545            mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high,
   1546            &mask_max);
   1547        break;
   1548    }
   1549  } else {
   1550    // Sub-sampling in only one axis is rare, so fall back to the plain C
   1551    // implementation for those cases rather than carrying separate
   1552    // optimised code for them.
   1553    aom_highbd_blend_a64_d16_mask_c(dst8, dst_stride, src0, src0_stride, src1,
   1554                                    src1_stride, mask, mask_stride, w, h, subw,
   1555                                    subh, conv_params, bd);
   1556  }
   1557 }
   1558 #endif  // CONFIG_AV1_HIGHBITDEPTH