tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

intrapred_ssse3.c (142986B)


      1 /*
      2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
#include <string.h>  // memcpy: aliasing-safe unaligned 4-byte loads/stores
#include <tmmintrin.h>

#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/intrapred_common.h"
     17 
     18 // -----------------------------------------------------------------------------
     19 // PAETH_PRED
     20 
// Return 8 16-bit pixels in one row.
// Branchless SIMD Paeth predictor: for each 16-bit lane, predict the pixel
// as whichever of *left, *top, *topleft is closest to
// base = top + left - topleft, breaking ties in the order left, top,
// topleft (the scalar Paeth tie-break order).
static inline __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top,
                                    const __m128i *topleft) {
 // base = top + left - topleft per lane; inputs are zero-extended 8-bit
 // pixels, so the arithmetic cannot overflow 16 bits.
 const __m128i base = _mm_sub_epi16(_mm_add_epi16(*top, *left), *topleft);

 // Absolute distance from base to each of the three candidates.
 __m128i pl = _mm_abs_epi16(_mm_sub_epi16(base, *left));
 __m128i pt = _mm_abs_epi16(_mm_sub_epi16(base, *top));
 __m128i ptl = _mm_abs_epi16(_mm_sub_epi16(base, *topleft));

 // mask1: lanes where `left` is NOT the tie-broken minimum
 // (pl > pt or pl > ptl).
 __m128i mask1 = _mm_cmpgt_epi16(pl, pt);
 mask1 = _mm_or_si128(mask1, _mm_cmpgt_epi16(pl, ptl));
 // mask2: lanes where `topleft` beats `top` (pt > ptl).
 __m128i mask2 = _mm_cmpgt_epi16(pt, ptl);

 // Keep `left` only where mask1 is clear.
 pl = _mm_andnot_si128(mask1, *left);

 // Elsewhere pick `topleft` (mask2 set) or `top` (mask2 clear), gated by
 // mask1 so these lanes are zero where `left` was already chosen.
 ptl = _mm_and_si128(mask2, *topleft);
 pt = _mm_andnot_si128(mask2, *top);
 pt = _mm_or_si128(pt, ptl);
 pt = _mm_and_si128(mask1, pt);

 // The two selections are disjoint per lane, so OR merges them.
 return _mm_or_si128(pl, pt);
}
     43 
     44 void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
     45                                   const uint8_t *above, const uint8_t *left) {
     46  __m128i l = _mm_loadl_epi64((const __m128i *)left);
     47  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
     48  const __m128i zero = _mm_setzero_si128();
     49  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
     50  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
     51  __m128i rep = _mm_set1_epi16((short)0x8000);
     52  const __m128i one = _mm_set1_epi16(1);
     53 
     54  int i;
     55  for (i = 0; i < 4; ++i) {
     56    const __m128i l16 = _mm_shuffle_epi8(l, rep);
     57    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
     58 
     59    *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
     60    dst += stride;
     61    rep = _mm_add_epi16(rep, one);
     62  }
     63 }
     64 
     65 void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
     66                                   const uint8_t *above, const uint8_t *left) {
     67  __m128i l = _mm_loadl_epi64((const __m128i *)left);
     68  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
     69  const __m128i zero = _mm_setzero_si128();
     70  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
     71  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
     72  __m128i rep = _mm_set1_epi16((short)0x8000);
     73  const __m128i one = _mm_set1_epi16(1);
     74 
     75  int i;
     76  for (i = 0; i < 8; ++i) {
     77    const __m128i l16 = _mm_shuffle_epi8(l, rep);
     78    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
     79 
     80    *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
     81    dst += stride;
     82    rep = _mm_add_epi16(rep, one);
     83  }
     84 }
     85 
     86 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
     87 void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
     88                                    const uint8_t *above, const uint8_t *left) {
     89  __m128i l = _mm_load_si128((const __m128i *)left);
     90  const __m128i t = _mm_cvtsi32_si128(((const int *)above)[0]);
     91  const __m128i zero = _mm_setzero_si128();
     92  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
     93  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
     94  __m128i rep = _mm_set1_epi16((short)0x8000);
     95  const __m128i one = _mm_set1_epi16(1);
     96 
     97  for (int i = 0; i < 16; ++i) {
     98    const __m128i l16 = _mm_shuffle_epi8(l, rep);
     99    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
    100 
    101    *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
    102    dst += stride;
    103    rep = _mm_add_epi16(rep, one);
    104  }
    105 }
    106 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    107 
    108 void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
    109                                   const uint8_t *above, const uint8_t *left) {
    110  __m128i l = _mm_loadl_epi64((const __m128i *)left);
    111  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
    112  const __m128i zero = _mm_setzero_si128();
    113  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
    114  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
    115  __m128i rep = _mm_set1_epi16((short)0x8000);
    116  const __m128i one = _mm_set1_epi16(1);
    117 
    118  int i;
    119  for (i = 0; i < 4; ++i) {
    120    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    121    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
    122 
    123    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
    124    dst += stride;
    125    rep = _mm_add_epi16(rep, one);
    126  }
    127 }
    128 
    129 void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
    130                                   const uint8_t *above, const uint8_t *left) {
    131  __m128i l = _mm_loadl_epi64((const __m128i *)left);
    132  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
    133  const __m128i zero = _mm_setzero_si128();
    134  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
    135  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
    136  __m128i rep = _mm_set1_epi16((short)0x8000);
    137  const __m128i one = _mm_set1_epi16(1);
    138 
    139  int i;
    140  for (i = 0; i < 8; ++i) {
    141    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    142    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
    143 
    144    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
    145    dst += stride;
    146    rep = _mm_add_epi16(rep, one);
    147  }
    148 }
    149 
    150 void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
    151                                    const uint8_t *above, const uint8_t *left) {
    152  __m128i l = _mm_load_si128((const __m128i *)left);
    153  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
    154  const __m128i zero = _mm_setzero_si128();
    155  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
    156  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
    157  __m128i rep = _mm_set1_epi16((short)0x8000);
    158  const __m128i one = _mm_set1_epi16(1);
    159 
    160  int i;
    161  for (i = 0; i < 16; ++i) {
    162    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    163    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
    164 
    165    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
    166    dst += stride;
    167    rep = _mm_add_epi16(rep, one);
    168  }
    169 }
    170 
    171 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    172 void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
    173                                    const uint8_t *above, const uint8_t *left) {
    174  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
    175  const __m128i zero = _mm_setzero_si128();
    176  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
    177  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
    178  const __m128i one = _mm_set1_epi16(1);
    179 
    180  for (int j = 0; j < 2; ++j) {
    181    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    182    __m128i rep = _mm_set1_epi16((short)0x8000);
    183    for (int i = 0; i < 16; ++i) {
    184      const __m128i l16 = _mm_shuffle_epi8(l, rep);
    185      const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
    186 
    187      _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
    188      dst += stride;
    189      rep = _mm_add_epi16(rep, one);
    190    }
    191  }
    192 }
    193 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    194 
// Return 16 8-bit pixels in one row.
// Runs the 8-lane Paeth predictor on the low (top0) and high (top1) 16-bit
// halves of a 16-pixel top row and packs both results back to unsigned
// bytes with saturation.
static inline __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0,
                                     const __m128i *top1,
                                     const __m128i *topleft) {
 const __m128i p0 = paeth_8x1_pred(left, top0, topleft);
 const __m128i p1 = paeth_8x1_pred(left, top1, topleft);
 return _mm_packus_epi16(p0, p1);
}
    203 
    204 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    205 void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
    206                                    const uint8_t *above, const uint8_t *left) {
    207  __m128i l = _mm_cvtsi32_si128(((const int *)left)[0]);
    208  const __m128i t = _mm_load_si128((const __m128i *)above);
    209  const __m128i zero = _mm_setzero_si128();
    210  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
    211  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
    212  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
    213  __m128i rep = _mm_set1_epi16((short)0x8000);
    214  const __m128i one = _mm_set1_epi16(1);
    215 
    216  for (int i = 0; i < 4; ++i) {
    217    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    218    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
    219 
    220    _mm_store_si128((__m128i *)dst, row);
    221    dst += stride;
    222    rep = _mm_add_epi16(rep, one);
    223  }
    224 }
    225 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    226 
    227 void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
    228                                    const uint8_t *above, const uint8_t *left) {
    229  __m128i l = _mm_loadl_epi64((const __m128i *)left);
    230  const __m128i t = _mm_load_si128((const __m128i *)above);
    231  const __m128i zero = _mm_setzero_si128();
    232  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
    233  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
    234  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
    235  __m128i rep = _mm_set1_epi16((short)0x8000);
    236  const __m128i one = _mm_set1_epi16(1);
    237 
    238  int i;
    239  for (i = 0; i < 8; ++i) {
    240    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    241    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
    242 
    243    _mm_store_si128((__m128i *)dst, row);
    244    dst += stride;
    245    rep = _mm_add_epi16(rep, one);
    246  }
    247 }
    248 
    249 void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
    250                                     const uint8_t *above,
    251                                     const uint8_t *left) {
    252  __m128i l = _mm_load_si128((const __m128i *)left);
    253  const __m128i t = _mm_load_si128((const __m128i *)above);
    254  const __m128i zero = _mm_setzero_si128();
    255  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
    256  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
    257  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
    258  __m128i rep = _mm_set1_epi16((short)0x8000);
    259  const __m128i one = _mm_set1_epi16(1);
    260 
    261  int i;
    262  for (i = 0; i < 16; ++i) {
    263    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    264    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
    265 
    266    _mm_store_si128((__m128i *)dst, row);
    267    dst += stride;
    268    rep = _mm_add_epi16(rep, one);
    269  }
    270 }
    271 
    272 void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
    273                                     const uint8_t *above,
    274                                     const uint8_t *left) {
    275  __m128i l = _mm_load_si128((const __m128i *)left);
    276  const __m128i t = _mm_load_si128((const __m128i *)above);
    277  const __m128i zero = _mm_setzero_si128();
    278  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
    279  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
    280  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
    281  __m128i rep = _mm_set1_epi16((short)0x8000);
    282  const __m128i one = _mm_set1_epi16(1);
    283  __m128i l16;
    284 
    285  int i;
    286  for (i = 0; i < 16; ++i) {
    287    l16 = _mm_shuffle_epi8(l, rep);
    288    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
    289 
    290    _mm_store_si128((__m128i *)dst, row);
    291    dst += stride;
    292    rep = _mm_add_epi16(rep, one);
    293  }
    294 
    295  l = _mm_load_si128((const __m128i *)(left + 16));
    296  rep = _mm_set1_epi16((short)0x8000);
    297  for (i = 0; i < 16; ++i) {
    298    l16 = _mm_shuffle_epi8(l, rep);
    299    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
    300 
    301    _mm_store_si128((__m128i *)dst, row);
    302    dst += stride;
    303    rep = _mm_add_epi16(rep, one);
    304  }
    305 }
    306 
    307 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    308 void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
    309                                     const uint8_t *above,
    310                                     const uint8_t *left) {
    311  const __m128i t = _mm_load_si128((const __m128i *)above);
    312  const __m128i zero = _mm_setzero_si128();
    313  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
    314  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
    315  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
    316  const __m128i one = _mm_set1_epi16(1);
    317 
    318  for (int j = 0; j < 4; ++j) {
    319    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    320    __m128i rep = _mm_set1_epi16((short)0x8000);
    321    for (int i = 0; i < 16; ++i) {
    322      const __m128i l16 = _mm_shuffle_epi8(l, rep);
    323      const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
    324      _mm_store_si128((__m128i *)dst, row);
    325      dst += stride;
    326      rep = _mm_add_epi16(rep, one);
    327    }
    328  }
    329 }
    330 
    331 void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
    332                                    const uint8_t *above, const uint8_t *left) {
    333  const __m128i a = _mm_load_si128((const __m128i *)above);
    334  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
    335  const __m128i zero = _mm_setzero_si128();
    336  const __m128i al = _mm_unpacklo_epi8(a, zero);
    337  const __m128i ah = _mm_unpackhi_epi8(a, zero);
    338  const __m128i bl = _mm_unpacklo_epi8(b, zero);
    339  const __m128i bh = _mm_unpackhi_epi8(b, zero);
    340 
    341  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
    342  __m128i rep = _mm_set1_epi16((short)0x8000);
    343  const __m128i one = _mm_set1_epi16(1);
    344  const __m128i l = _mm_loadl_epi64((const __m128i *)left);
    345  __m128i l16;
    346 
    347  for (int i = 0; i < 8; ++i) {
    348    l16 = _mm_shuffle_epi8(l, rep);
    349    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    350    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
    351 
    352    _mm_store_si128((__m128i *)dst, r32l);
    353    _mm_store_si128((__m128i *)(dst + 16), r32h);
    354    dst += stride;
    355    rep = _mm_add_epi16(rep, one);
    356  }
    357 }
    358 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    359 
    360 void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
    361                                     const uint8_t *above,
    362                                     const uint8_t *left) {
    363  const __m128i a = _mm_load_si128((const __m128i *)above);
    364  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
    365  const __m128i zero = _mm_setzero_si128();
    366  const __m128i al = _mm_unpacklo_epi8(a, zero);
    367  const __m128i ah = _mm_unpackhi_epi8(a, zero);
    368  const __m128i bl = _mm_unpacklo_epi8(b, zero);
    369  const __m128i bh = _mm_unpackhi_epi8(b, zero);
    370 
    371  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
    372  __m128i rep = _mm_set1_epi16((short)0x8000);
    373  const __m128i one = _mm_set1_epi16(1);
    374  __m128i l = _mm_load_si128((const __m128i *)left);
    375  __m128i l16;
    376 
    377  int i;
    378  for (i = 0; i < 16; ++i) {
    379    l16 = _mm_shuffle_epi8(l, rep);
    380    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    381    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
    382 
    383    _mm_store_si128((__m128i *)dst, r32l);
    384    _mm_store_si128((__m128i *)(dst + 16), r32h);
    385    dst += stride;
    386    rep = _mm_add_epi16(rep, one);
    387  }
    388 }
    389 
    390 void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
    391                                     const uint8_t *above,
    392                                     const uint8_t *left) {
    393  const __m128i a = _mm_load_si128((const __m128i *)above);
    394  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
    395  const __m128i zero = _mm_setzero_si128();
    396  const __m128i al = _mm_unpacklo_epi8(a, zero);
    397  const __m128i ah = _mm_unpackhi_epi8(a, zero);
    398  const __m128i bl = _mm_unpacklo_epi8(b, zero);
    399  const __m128i bh = _mm_unpackhi_epi8(b, zero);
    400 
    401  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
    402  __m128i rep = _mm_set1_epi16((short)0x8000);
    403  const __m128i one = _mm_set1_epi16(1);
    404  __m128i l = _mm_load_si128((const __m128i *)left);
    405  __m128i l16;
    406 
    407  int i;
    408  for (i = 0; i < 16; ++i) {
    409    l16 = _mm_shuffle_epi8(l, rep);
    410    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    411    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
    412 
    413    _mm_store_si128((__m128i *)dst, r32l);
    414    _mm_store_si128((__m128i *)(dst + 16), r32h);
    415    dst += stride;
    416    rep = _mm_add_epi16(rep, one);
    417  }
    418 
    419  rep = _mm_set1_epi16((short)0x8000);
    420  l = _mm_load_si128((const __m128i *)(left + 16));
    421  for (i = 0; i < 16; ++i) {
    422    l16 = _mm_shuffle_epi8(l, rep);
    423    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    424    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
    425 
    426    _mm_store_si128((__m128i *)dst, r32l);
    427    _mm_store_si128((__m128i *)(dst + 16), r32h);
    428    dst += stride;
    429    rep = _mm_add_epi16(rep, one);
    430  }
    431 }
    432 
    433 void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
    434                                     const uint8_t *above,
    435                                     const uint8_t *left) {
    436  const __m128i a = _mm_load_si128((const __m128i *)above);
    437  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
    438  const __m128i zero = _mm_setzero_si128();
    439  const __m128i al = _mm_unpacklo_epi8(a, zero);
    440  const __m128i ah = _mm_unpackhi_epi8(a, zero);
    441  const __m128i bl = _mm_unpacklo_epi8(b, zero);
    442  const __m128i bh = _mm_unpackhi_epi8(b, zero);
    443 
    444  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
    445  const __m128i one = _mm_set1_epi16(1);
    446  __m128i l16;
    447 
    448  int i, j;
    449  for (j = 0; j < 4; ++j) {
    450    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    451    __m128i rep = _mm_set1_epi16((short)0x8000);
    452    for (i = 0; i < 16; ++i) {
    453      l16 = _mm_shuffle_epi8(l, rep);
    454      const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    455      const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
    456 
    457      _mm_store_si128((__m128i *)dst, r32l);
    458      _mm_store_si128((__m128i *)(dst + 16), r32h);
    459      dst += stride;
    460      rep = _mm_add_epi16(rep, one);
    461    }
    462  }
    463 }
    464 
    465 void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
    466                                     const uint8_t *above,
    467                                     const uint8_t *left) {
    468  const __m128i a = _mm_load_si128((const __m128i *)above);
    469  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
    470  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
    471  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
    472  const __m128i zero = _mm_setzero_si128();
    473  const __m128i al = _mm_unpacklo_epi8(a, zero);
    474  const __m128i ah = _mm_unpackhi_epi8(a, zero);
    475  const __m128i bl = _mm_unpacklo_epi8(b, zero);
    476  const __m128i bh = _mm_unpackhi_epi8(b, zero);
    477  const __m128i cl = _mm_unpacklo_epi8(c, zero);
    478  const __m128i ch = _mm_unpackhi_epi8(c, zero);
    479  const __m128i dl = _mm_unpacklo_epi8(d, zero);
    480  const __m128i dh = _mm_unpackhi_epi8(d, zero);
    481 
    482  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
    483  const __m128i one = _mm_set1_epi16(1);
    484  __m128i l16;
    485 
    486  int i, j;
    487  for (j = 0; j < 2; ++j) {
    488    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    489    __m128i rep = _mm_set1_epi16((short)0x8000);
    490    for (i = 0; i < 16; ++i) {
    491      l16 = _mm_shuffle_epi8(l, rep);
    492      const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    493      const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
    494      const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
    495      const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
    496 
    497      _mm_store_si128((__m128i *)dst, r0);
    498      _mm_store_si128((__m128i *)(dst + 16), r1);
    499      _mm_store_si128((__m128i *)(dst + 32), r2);
    500      _mm_store_si128((__m128i *)(dst + 48), r3);
    501      dst += stride;
    502      rep = _mm_add_epi16(rep, one);
    503    }
    504  }
    505 }
    506 
    507 void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
    508                                     const uint8_t *above,
    509                                     const uint8_t *left) {
    510  const __m128i a = _mm_load_si128((const __m128i *)above);
    511  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
    512  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
    513  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
    514  const __m128i zero = _mm_setzero_si128();
    515  const __m128i al = _mm_unpacklo_epi8(a, zero);
    516  const __m128i ah = _mm_unpackhi_epi8(a, zero);
    517  const __m128i bl = _mm_unpacklo_epi8(b, zero);
    518  const __m128i bh = _mm_unpackhi_epi8(b, zero);
    519  const __m128i cl = _mm_unpacklo_epi8(c, zero);
    520  const __m128i ch = _mm_unpackhi_epi8(c, zero);
    521  const __m128i dl = _mm_unpacklo_epi8(d, zero);
    522  const __m128i dh = _mm_unpackhi_epi8(d, zero);
    523 
    524  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
    525  const __m128i one = _mm_set1_epi16(1);
    526  __m128i l16;
    527 
    528  int i, j;
    529  for (j = 0; j < 4; ++j) {
    530    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    531    __m128i rep = _mm_set1_epi16((short)0x8000);
    532    for (i = 0; i < 16; ++i) {
    533      l16 = _mm_shuffle_epi8(l, rep);
    534      const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    535      const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
    536      const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
    537      const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
    538 
    539      _mm_store_si128((__m128i *)dst, r0);
    540      _mm_store_si128((__m128i *)(dst + 16), r1);
    541      _mm_store_si128((__m128i *)(dst + 32), r2);
    542      _mm_store_si128((__m128i *)(dst + 48), r3);
    543      dst += stride;
    544      rep = _mm_add_epi16(rep, one);
    545    }
    546  }
    547 }
    548 
    549 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    550 void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
    551                                     const uint8_t *above,
    552                                     const uint8_t *left) {
    553  const __m128i a = _mm_load_si128((const __m128i *)above);
    554  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
    555  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
    556  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
    557  const __m128i zero = _mm_setzero_si128();
    558  const __m128i al = _mm_unpacklo_epi8(a, zero);
    559  const __m128i ah = _mm_unpackhi_epi8(a, zero);
    560  const __m128i bl = _mm_unpacklo_epi8(b, zero);
    561  const __m128i bh = _mm_unpackhi_epi8(b, zero);
    562  const __m128i cl = _mm_unpacklo_epi8(c, zero);
    563  const __m128i ch = _mm_unpackhi_epi8(c, zero);
    564  const __m128i dl = _mm_unpacklo_epi8(d, zero);
    565  const __m128i dh = _mm_unpackhi_epi8(d, zero);
    566 
    567  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
    568  const __m128i one = _mm_set1_epi16(1);
    569  __m128i l16;
    570 
    571  int i;
    572  const __m128i l = _mm_load_si128((const __m128i *)left);
    573  __m128i rep = _mm_set1_epi16((short)0x8000);
    574  for (i = 0; i < 16; ++i) {
    575    l16 = _mm_shuffle_epi8(l, rep);
    576    const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    577    const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
    578    const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
    579    const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
    580 
    581    _mm_store_si128((__m128i *)dst, r0);
    582    _mm_store_si128((__m128i *)(dst + 16), r1);
    583    _mm_store_si128((__m128i *)(dst + 32), r2);
    584    _mm_store_si128((__m128i *)(dst + 48), r3);
    585    dst += stride;
    586    rep = _mm_add_epi16(rep, one);
    587  }
    588 }
    589 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    590 
    591 // -----------------------------------------------------------------------------
    592 // SMOOTH_PRED
    593 
    594 // pixels[0]: above and below_pred interleave vector
    595 // pixels[1]: left vector
    596 // pixels[2]: right_pred vector
    597 static inline void load_pixel_w4(const uint8_t *above, const uint8_t *left,
    598                                 int height, __m128i *pixels) {
    599  __m128i d = _mm_cvtsi32_si128(((const int *)above)[0]);
    600  if (height == 4)
    601    pixels[1] = _mm_cvtsi32_si128(((const int *)left)[0]);
    602  else if (height == 8)
    603    pixels[1] = _mm_loadl_epi64(((const __m128i *)left));
    604  else
    605    pixels[1] = _mm_loadu_si128(((const __m128i *)left));
    606 
    607  pixels[2] = _mm_set1_epi16((int16_t)above[3]);
    608 
    609  const __m128i bp = _mm_set1_epi16((int16_t)left[height - 1]);
    610  const __m128i zero = _mm_setzero_si128();
    611  d = _mm_unpacklo_epi8(d, zero);
    612  pixels[0] = _mm_unpacklo_epi16(d, bp);
    613 }
    614 
    615 // weight_h[0]: weight_h vector
    616 // weight_h[1]: scale - weight_h vector
    617 // weight_h[2]: same as [0], second half for height = 16 only
    618 // weight_h[3]: same as [1], second half for height = 16 only
    619 // weight_w[0]: weights_w and scale - weights_w interleave vector
    620 static inline void load_weight_w4(int height, __m128i *weight_h,
    621                                  __m128i *weight_w) {
    622  const __m128i zero = _mm_setzero_si128();
    623  const __m128i d = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
    624  const __m128i t = _mm_cvtsi32_si128(((const int *)smooth_weights)[0]);
    625  weight_h[0] = _mm_unpacklo_epi8(t, zero);
    626  weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    627  weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
    628 
    629  if (height == 8) {
    630    const __m128i weight = _mm_loadl_epi64((const __m128i *)&smooth_weights[4]);
    631    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
    632    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    633  } else if (height == 16) {
    634    const __m128i weight =
    635        _mm_loadu_si128((const __m128i *)&smooth_weights[12]);
    636    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
    637    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    638    weight_h[2] = _mm_unpackhi_epi8(weight, zero);
    639    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
    640  }
    641 }
    642 
    643 static inline void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh,
    644                                   const __m128i *ww, int h, uint8_t *dst,
    645                                   ptrdiff_t stride, int second_half) {
    646  const __m128i round = _mm_set1_epi32((1 << SMOOTH_WEIGHT_LOG2_SCALE));
    647  const __m128i one = _mm_set1_epi16(1);
    648  const __m128i inc = _mm_set1_epi16(0x202);
    649  const __m128i gat = _mm_set1_epi32(0xc080400);
    650  __m128i rep = second_half ? _mm_set1_epi16((short)0x8008)
    651                            : _mm_set1_epi16((short)0x8000);
    652  __m128i d = _mm_set1_epi16(0x100);
    653 
    654  for (int i = 0; i < h; ++i) {
    655    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
    656    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
    657    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
    658    __m128i s = _mm_madd_epi16(pixel[0], wh_sc);
    659 
    660    __m128i b = _mm_shuffle_epi8(pixel[1], rep);
    661    b = _mm_unpacklo_epi16(b, pixel[2]);
    662    __m128i sum = _mm_madd_epi16(b, ww[0]);
    663 
    664    sum = _mm_add_epi32(s, sum);
    665    sum = _mm_add_epi32(sum, round);
    666    sum = _mm_srai_epi32(sum, 1 + SMOOTH_WEIGHT_LOG2_SCALE);
    667 
    668    sum = _mm_shuffle_epi8(sum, gat);
    669    *(int *)dst = _mm_cvtsi128_si32(sum);
    670    dst += stride;
    671 
    672    rep = _mm_add_epi16(rep, one);
    673    d = _mm_add_epi16(d, inc);
    674  }
    675 }
    676 
    677 void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
    678                                    const uint8_t *above, const uint8_t *left) {
    679  __m128i pixels[3];
    680  load_pixel_w4(above, left, 4, pixels);
    681 
    682  __m128i wh[4], ww[2];
    683  load_weight_w4(4, wh, ww);
    684 
    685  smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0);
    686 }
    687 
    688 void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
    689                                    const uint8_t *above, const uint8_t *left) {
    690  __m128i pixels[3];
    691  load_pixel_w4(above, left, 8, pixels);
    692 
    693  __m128i wh[4], ww[2];
    694  load_weight_w4(8, wh, ww);
    695 
    696  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
    697 }
    698 
    699 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    700 void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
    701                                     const uint8_t *above,
    702                                     const uint8_t *left) {
    703  __m128i pixels[3];
    704  load_pixel_w4(above, left, 16, pixels);
    705 
    706  __m128i wh[4], ww[2];
    707  load_weight_w4(16, wh, ww);
    708 
    709  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
    710  dst += stride << 3;
    711  smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1);
    712 }
    713 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    714 
// Loads the boundary pixels consumed by the width-8 smooth kernel.
// pixels[0]: above and below_pred interleave vector, first half
// pixels[1]: above and below_pred interleave vector, second half
// pixels[2]: left vector (raw bytes; the row kernel zero-extends per row)
// pixels[3]: right_pred vector
// pixels[4]: above and below_pred interleave vector, first half
// pixels[5]: above and below_pred interleave vector, second half
// pixels[6]: left vector + 16
// pixels[7]: right_pred vector
// Entries [4]..[7] are written only when height == 32.
static inline void load_pixel_w8(const uint8_t *above, const uint8_t *left,
                                 int height, __m128i *pixels) {
  const __m128i zero = _mm_setzero_si128();
  // below_pred: the bottom-left pixel broadcast across all 16-bit lanes.
  const __m128i bp = _mm_set1_epi16((int16_t)left[height - 1]);
  __m128i d = _mm_loadl_epi64((const __m128i *)above);
  d = _mm_unpacklo_epi8(d, zero);  // zero-extend the 8 top pixels to 16 bits
  // Interleave above[x] with below_pred: the (pixel, pixel) pairing that
  // _mm_madd_epi16 multiplies against (weight, scale - weight) pairs.
  pixels[0] = _mm_unpacklo_epi16(d, bp);
  pixels[1] = _mm_unpackhi_epi16(d, bp);

  // right_pred: the top-right pixel broadcast across all lanes.
  pixels[3] = _mm_set1_epi16((int16_t)above[7]);

  if (height == 4) {
    pixels[2] = _mm_cvtsi32_si128(((const int *)left)[0]);
  } else if (height == 8) {
    pixels[2] = _mm_loadl_epi64((const __m128i *)left);
  } else if (height == 16) {
    pixels[2] = _mm_load_si128((const __m128i *)left);
  } else {
    // height == 32: duplicate the shared registers for the second 16-row
    // half and load left pixels 16..31.
    pixels[2] = _mm_load_si128((const __m128i *)left);
    pixels[4] = pixels[0];
    pixels[5] = pixels[1];
    pixels[6] = _mm_load_si128((const __m128i *)(left + 16));
    pixels[7] = pixels[3];
  }
}
    748 
// Loads the smooth weights used by the width-8 kernel. The table for block
// size N starts at smooth_weights + N - 4, so the 8-entry weights live at
// offset 4, the 16-entry at 12, and the 32-entry at 28.
// weight_h[0]: weight_h vector
// weight_h[1]: scale - weight_h vector
// weight_h[2]: same as [0], offset 8
// weight_h[3]: same as [1], offset 8
// weight_h[4]: same as [0], offset 16
// weight_h[5]: same as [1], offset 16
// weight_h[6]: same as [0], offset 24
// weight_h[7]: same as [1], offset 24
// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
static inline void load_weight_w8(int height, __m128i *weight_h,
                                  __m128i *weight_w) {
  const __m128i zero = _mm_setzero_si128();
  // height == 4 loads from offset 0 so one 16-byte read covers both the 4
  // height weights (bytes 0..3) and the 8 width weights (bytes 4..11).
  const int we_offset = height < 8 ? 0 : 4;
  __m128i we = _mm_loadu_si128((const __m128i *)&smooth_weights[we_offset]);
  weight_h[0] = _mm_unpacklo_epi8(we, zero);
  const __m128i d = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
  weight_h[1] = _mm_sub_epi16(d, weight_h[0]);

  if (height == 4) {
    // Width weights start 4 bytes in; shift them down before widening.
    we = _mm_srli_si128(we, 4);
    __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
    __m128i tmp2 = _mm_sub_epi16(d, tmp1);
    weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
    weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
  } else {
    // For height >= 8 the same 8 weights serve as the width weights.
    weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
    weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
  }

  if (height == 16) {
    // Overwrite the height weights with the 16-entry table (offset 12).
    we = _mm_loadu_si128((const __m128i *)&smooth_weights[12]);
    weight_h[0] = _mm_unpacklo_epi8(we, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(we, zero);
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
  } else if (height == 32) {
    // 32-entry table (offset 28), spread across four weight/inverse pairs.
    const __m128i weight_lo =
        _mm_loadu_si128((const __m128i *)&smooth_weights[28]);
    weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
    const __m128i weight_hi =
        _mm_loadu_si128((const __m128i *)&smooth_weights[28 + 16]);
    weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
    weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
    weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
    weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
  }
}
    800 
// Emits h rows of 8 smooth-predicted pixels. Per row y:
//   pred[x] = (wh[y]*above[x] + (S-wh[y])*below + ww[x]*left[y]
//              + (S-ww[x])*right + S) >> (log2(S) + 1)
// with S = 1 << SMOOTH_WEIGHT_LOG2_SCALE. |second_half| selects left rows
// 8..15 out of pixels[2].
static inline void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh,
                                   const __m128i *ww, int h, uint8_t *dst,
                                   ptrdiff_t stride, int second_half) {
  const __m128i round = _mm_set1_epi32((1 << SMOOTH_WEIGHT_LOG2_SCALE));
  const __m128i one = _mm_set1_epi16(1);
  // Adding 0x0202 to |d| advances the byte-shuffle selection one 16-bit lane.
  const __m128i inc = _mm_set1_epi16(0x202);
  // Packs the low byte of each of the eight 32-bit sums into 8 bytes.
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);

  // |rep| shuffles byte y (or y + 8 for the second half) of the raw left
  // vector into the low byte of every 16-bit lane; the 0x80 bytes zero the
  // high bytes, zero-extending left[y].
  __m128i rep = second_half ? _mm_set1_epi16((short)0x8008)
                            : _mm_set1_epi16((short)0x8000);
  // |d| broadcasts 16-bit lane y of the height-weight registers.
  __m128i d = _mm_set1_epi16(0x100);

  int i;
  for (i = 0; i < h; ++i) {
    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);  // wh[y] in all lanes
    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);  // S - wh[y]
    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
    // madd pairs (above[x], below) with (wh[y], S - wh[y]).
    __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
    __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);

    // madd pairs (left[y], right) with (ww[x], S - ww[x]).
    __m128i b = _mm_shuffle_epi8(pixels[2], rep);
    b = _mm_unpacklo_epi16(b, pixels[3]);
    __m128i sum0 = _mm_madd_epi16(b, ww[0]);
    __m128i sum1 = _mm_madd_epi16(b, ww[1]);

    s0 = _mm_add_epi32(s0, sum0);
    s0 = _mm_add_epi32(s0, round);
    s0 = _mm_srai_epi32(s0, 1 + SMOOTH_WEIGHT_LOG2_SCALE);

    s1 = _mm_add_epi32(s1, sum1);
    s1 = _mm_add_epi32(s1, round);
    s1 = _mm_srai_epi32(s1, 1 + SMOOTH_WEIGHT_LOG2_SCALE);

    sum0 = _mm_packus_epi16(s0, s1);
    sum0 = _mm_shuffle_epi8(sum0, gat);
    _mm_storel_epi64((__m128i *)dst, sum0);
    dst += stride;

    rep = _mm_add_epi16(rep, one);  // next left pixel
    d = _mm_add_epi16(d, inc);      // next height weight
  }
}
    843 
    844 void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
    845                                    const uint8_t *above, const uint8_t *left) {
    846  __m128i pixels[4];
    847  load_pixel_w8(above, left, 4, pixels);
    848 
    849  __m128i wh[4], ww[2];
    850  load_weight_w8(4, wh, ww);
    851 
    852  smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0);
    853 }
    854 
    855 void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
    856                                    const uint8_t *above, const uint8_t *left) {
    857  __m128i pixels[4];
    858  load_pixel_w8(above, left, 8, pixels);
    859 
    860  __m128i wh[4], ww[2];
    861  load_weight_w8(8, wh, ww);
    862 
    863  smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
    864 }
    865 
    866 void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
    867                                     const uint8_t *above,
    868                                     const uint8_t *left) {
    869  __m128i pixels[4];
    870  load_pixel_w8(above, left, 16, pixels);
    871 
    872  __m128i wh[4], ww[2];
    873  load_weight_w8(16, wh, ww);
    874 
    875  smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
    876  dst += stride << 3;
    877  smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
    878 }
    879 
    880 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    881 void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
    882                                     const uint8_t *above,
    883                                     const uint8_t *left) {
    884  __m128i pixels[8];
    885  load_pixel_w8(above, left, 32, pixels);
    886 
    887  __m128i wh[8], ww[2];
    888  load_weight_w8(32, wh, ww);
    889 
    890  smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0);
    891  dst += stride << 3;
    892  smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1);
    893  dst += stride << 3;
    894  smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0);
    895  dst += stride << 3;
    896  smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1);
    897 }
    898 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    899 
    900 // TODO(slavarnway): Visual Studio only supports restrict when /std:c11
    901 // (available in 2019+) or greater is specified; __restrict can be used in that
    902 // case. This should be moved to rtcd and used consistently between the
    903 // function declarations and definitions to avoid warnings in Visual Studio
    904 // when defining LIBAOM_RESTRICT to restrict or __restrict.
#if defined(_MSC_VER)
// MSVC only accepts 'restrict' under /std:c11 or later (see TODO above), so
// expand to nothing there.
#define LIBAOM_RESTRICT
#else
#define LIBAOM_RESTRICT restrict
#endif
    910 
    911 static AOM_FORCE_INLINE __m128i Load4(const void *src) {
    912  // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
    913  // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
    914  // movss instruction.
    915  //
    916  // Until compiler support of _mm_loadu_si32 is widespread, use of
    917  // _mm_loadu_si32 is banned.
    918  int val;
    919  memcpy(&val, src, sizeof(val));
    920  return _mm_cvtsi32_si128(val);
    921 }
    922 
    923 static AOM_FORCE_INLINE __m128i LoadLo8(const void *a) {
    924  return _mm_loadl_epi64((const __m128i *)(a));
    925 }
    926 
    927 static AOM_FORCE_INLINE __m128i LoadUnaligned16(const void *a) {
    928  return _mm_loadu_si128((const __m128i *)(a));
    929 }
    930 
    931 static AOM_FORCE_INLINE void Store4(void *dst, const __m128i x) {
    932  const int val = _mm_cvtsi128_si32(x);
    933  memcpy(dst, &val, sizeof(val));
    934 }
    935 
    936 static AOM_FORCE_INLINE void StoreLo8(void *a, const __m128i v) {
    937  _mm_storel_epi64((__m128i *)(a), v);
    938 }
    939 
    940 static AOM_FORCE_INLINE void StoreUnaligned16(void *a, const __m128i v) {
    941  _mm_storeu_si128((__m128i *)(a), v);
    942 }
    943 
    944 static AOM_FORCE_INLINE __m128i cvtepu8_epi16(__m128i x) {
    945  return _mm_unpacklo_epi8((x), _mm_setzero_si128());
    946 }
    947 
    948 static AOM_FORCE_INLINE __m128i cvtepu8_epi32(__m128i x) {
    949  const __m128i tmp = _mm_unpacklo_epi8((x), _mm_setzero_si128());
    950  return _mm_unpacklo_epi16(tmp, _mm_setzero_si128());
    951 }
    952 
    953 static AOM_FORCE_INLINE __m128i cvtepu16_epi32(__m128i x) {
    954  return _mm_unpacklo_epi16((x), _mm_setzero_si128());
    955 }
    956 
// Generic SMOOTH predictor used by the width >= 16 entry points:
//   pred[x][y] = RightShiftWithRounding(
//       w_h[y]*top[x] + (S - w_h[y])*bottom_left +
//       w_w[x]*left[y] + (S - w_w[x])*top_right, log2(S) + 1)
// where S = 1 << SMOOTH_WEIGHT_LOG2_SCALE. The weight table for size N
// starts at smooth_weights + N - 4. Processes 8 pixels per inner iteration.
static void smooth_predictor_wxh(uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
                                 const uint8_t *LIBAOM_RESTRICT top_row,
                                 const uint8_t *LIBAOM_RESTRICT left_column,
                                 int width, int height) {
  const uint8_t *const sm_weights_h = smooth_weights + height - 4;
  const uint8_t *const sm_weights_w = smooth_weights + width - 4;
  const __m128i zero = _mm_setzero_si128();
  const __m128i scale_value = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i bottom_left = _mm_cvtsi32_si128(left_column[height - 1]);
  const __m128i top_right = _mm_set1_epi16(top_row[width - 1]);
  const __m128i round = _mm_set1_epi32(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  for (int y = 0; y < height; ++y) {
    const __m128i weights_y = _mm_cvtsi32_si128(sm_weights_h[y]);
    const __m128i left_y = _mm_cvtsi32_si128(left_column[y]);
    const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
    // Only lane 0 of |bottom_left| and |scale_m_weights_y| is populated, so
    // lane 0 of the 16-bit product is the full (S - w_h[y]) * bottom_left.
    __m128i scaled_bottom_left =
        _mm_mullo_epi16(scale_m_weights_y, bottom_left);
    // (w_h[y], left[y]) pairs, broadcast to all four 32-bit lanes for madd.
    const __m128i weight_left_y =
        _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
    // Fold the rounding constant in here so the inner loop can skip it.
    scaled_bottom_left = _mm_add_epi32(scaled_bottom_left, round);
    scaled_bottom_left = _mm_shuffle_epi32(scaled_bottom_left, 0);
    for (int x = 0; x < width; x += 8) {
      const __m128i top_x = LoadLo8(top_row + x);
      const __m128i weights_x = LoadLo8(sm_weights_w + x);
      // Interleave (top[x], w_w[x]) so one madd against (w_h[y], left[y])
      // yields w_h[y]*top[x] + w_w[x]*left[y].
      const __m128i top_weights_x = _mm_unpacklo_epi8(top_x, weights_x);
      const __m128i top_weights_x_lo = cvtepu8_epi16(top_weights_x);
      const __m128i top_weights_x_hi = _mm_unpackhi_epi8(top_weights_x, zero);

      // Here opposite weights and pixels are multiplied, where the order of
      // interleaving is indicated in the names.
      __m128i pred_lo = _mm_madd_epi16(top_weights_x_lo, weight_left_y);
      __m128i pred_hi = _mm_madd_epi16(top_weights_x_hi, weight_left_y);

      // |scaled_bottom_left| is always scaled by the same weight each row, so
      // we only derive |scaled_top_right| values here.
      const __m128i inverted_weights_x =
          _mm_sub_epi16(scale_value, cvtepu8_epi16(weights_x));
      const __m128i scaled_top_right =
          _mm_mullo_epi16(inverted_weights_x, top_right);
      const __m128i scaled_top_right_lo = cvtepu16_epi32(scaled_top_right);
      const __m128i scaled_top_right_hi =
          _mm_unpackhi_epi16(scaled_top_right, zero);
      pred_lo = _mm_add_epi32(pred_lo, scaled_bottom_left);
      pred_hi = _mm_add_epi32(pred_hi, scaled_bottom_left);
      pred_lo = _mm_add_epi32(pred_lo, scaled_top_right_lo);
      pred_hi = _mm_add_epi32(pred_hi, scaled_top_right_hi);

      // The round value for RightShiftWithRounding was added with
      // |scaled_bottom_left|.
      pred_lo = _mm_srli_epi32(pred_lo, (1 + SMOOTH_WEIGHT_LOG2_SCALE));
      pred_hi = _mm_srli_epi32(pred_hi, (1 + SMOOTH_WEIGHT_LOG2_SCALE));
      // Two packs narrow the 32-bit sums to 8-bit output pixels.
      const __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
      StoreLo8(dst + x, _mm_packus_epi16(pred, pred));
    }
    dst += stride;
  }
}
   1014 
   1015 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  // 16x4 SMOOTH prediction via the generic width >= 16 implementation.
  smooth_predictor_wxh(dst, stride, above, left, 16, 4);
}
   1021 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   1022 
void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  // 16x8 SMOOTH prediction via the generic width >= 16 implementation.
  smooth_predictor_wxh(dst, stride, above, left, 16, 8);
}
   1028 
void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  // 16x16 SMOOTH prediction via the generic width >= 16 implementation.
  smooth_predictor_wxh(dst, stride, above, left, 16, 16);
}
   1034 
void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  // 16x32 SMOOTH prediction via the generic width >= 16 implementation.
  smooth_predictor_wxh(dst, stride, above, left, 16, 32);
}
   1040 
   1041 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  // 16x64 SMOOTH prediction via the generic width >= 16 implementation.
  smooth_predictor_wxh(dst, stride, above, left, 16, 64);
}
   1047 
void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  // 32x8 SMOOTH prediction via the generic width >= 16 implementation.
  smooth_predictor_wxh(dst, stride, above, left, 32, 8);
}
   1053 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   1054 
void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  // 32x16 SMOOTH prediction via the generic width >= 16 implementation.
  smooth_predictor_wxh(dst, stride, above, left, 32, 16);
}
   1060 
void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  // 32x32 SMOOTH prediction via the generic width >= 16 implementation.
  smooth_predictor_wxh(dst, stride, above, left, 32, 32);
}
   1066 
void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  // 32x64 SMOOTH prediction via the generic width >= 16 implementation.
  smooth_predictor_wxh(dst, stride, above, left, 32, 64);
}
   1072 
   1073 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  // 64x16 SMOOTH prediction via the generic width >= 16 implementation.
  smooth_predictor_wxh(dst, stride, above, left, 64, 16);
}
   1079 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   1080 
void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  // 64x32 SMOOTH prediction via the generic width >= 16 implementation.
  smooth_predictor_wxh(dst, stride, above, left, 64, 32);
}
   1086 
void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  // 64x64 SMOOTH prediction via the generic width >= 16 implementation.
  smooth_predictor_wxh(dst, stride, above, left, 64, 64);
}
   1092 
   1093 // -----------------------------------------------------------------------------
   1094 // Smooth horizontal/vertical helper functions.
   1095 
   1096 // For Horizontal, pixels1 and pixels2 are the same repeated value. For
   1097 // Vertical, weights1 and weights2 are the same, and scaled_corner1 and
   1098 // scaled_corner2 are the same.
   1099 static AOM_FORCE_INLINE void write_smooth_directional_sum16(
   1100    uint8_t *LIBAOM_RESTRICT dst, const __m128i pixels1, const __m128i pixels2,
   1101    const __m128i weights1, const __m128i weights2,
   1102    const __m128i scaled_corner1, const __m128i scaled_corner2,
   1103    const __m128i round) {
   1104  const __m128i weighted_px1 = _mm_mullo_epi16(pixels1, weights1);
   1105  const __m128i weighted_px2 = _mm_mullo_epi16(pixels2, weights2);
   1106  const __m128i pred_sum1 = _mm_add_epi16(scaled_corner1, weighted_px1);
   1107  const __m128i pred_sum2 = _mm_add_epi16(scaled_corner2, weighted_px2);
   1108  // Equivalent to RightShiftWithRounding(pred[x][y], 8).
   1109  const __m128i pred1 = _mm_srli_epi16(_mm_add_epi16(pred_sum1, round), 8);
   1110  const __m128i pred2 = _mm_srli_epi16(_mm_add_epi16(pred_sum2, round), 8);
   1111  StoreUnaligned16(dst, _mm_packus_epi16(pred1, pred2));
   1112 }
   1113 
   1114 static AOM_FORCE_INLINE __m128i smooth_directional_sum8(
   1115    const __m128i pixels, const __m128i weights, const __m128i scaled_corner) {
   1116  const __m128i weighted_px = _mm_mullo_epi16(pixels, weights);
   1117  return _mm_add_epi16(scaled_corner, weighted_px);
   1118 }
   1119 
   1120 static AOM_FORCE_INLINE void write_smooth_directional_sum8(
   1121    uint8_t *LIBAOM_RESTRICT dst, const __m128i *pixels, const __m128i *weights,
   1122    const __m128i *scaled_corner, const __m128i *round) {
   1123  const __m128i pred_sum =
   1124      smooth_directional_sum8(*pixels, *weights, *scaled_corner);
   1125  // Equivalent to RightShiftWithRounding(pred[x][y], 8).
   1126  const __m128i pred = _mm_srli_epi16(_mm_add_epi16(pred_sum, *round), 8);
   1127  StoreLo8(dst, _mm_packus_epi16(pred, pred));
   1128 }
   1129 
   1130 // -----------------------------------------------------------------------------
   1131 // SMOOTH_V_PRED
   1132 
   1133 static AOM_FORCE_INLINE void load_smooth_vertical_pixels4(
   1134    const uint8_t *LIBAOM_RESTRICT above, const uint8_t *LIBAOM_RESTRICT left,
   1135    const int height, __m128i *pixels) {
   1136  __m128i top = Load4(above);
   1137  const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
   1138  top = cvtepu8_epi16(top);
   1139  pixels[0] = _mm_unpacklo_epi16(top, bottom_left);
   1140 }
   1141 
   1142 // |weight_array| alternates weight vectors from the table with their inverted
   1143 // (256-w) counterparts. This is precomputed by the compiler when the weights
   1144 // table is visible to this module. Removing this visibility can cut speed by up
   1145 // to half in both 4xH and 8xH transforms.
static AOM_FORCE_INLINE void load_smooth_vertical_weights4(
    const uint8_t *LIBAOM_RESTRICT weight_array, const int height,
    __m128i *weights) {
  // Scale constant: inverted weight is 256 - weight.
  const __m128i inverter = _mm_set1_epi16(256);

  if (height == 4) {
    // 4-entry weight table at offset 0.
    const __m128i weight = Load4(weight_array);
    weights[0] = cvtepu8_epi16(weight);
    weights[1] = _mm_sub_epi16(inverter, weights[0]);
  } else if (height == 8) {
    // 8-entry weight table at offset 4.
    const __m128i weight = LoadLo8(weight_array + 4);
    weights[0] = cvtepu8_epi16(weight);
    weights[1] = _mm_sub_epi16(inverter, weights[0]);
  } else {
    // height == 16: 16-entry table at offset 12; [2]/[3] cover rows 8..15.
    const __m128i weight = LoadUnaligned16(weight_array + 12);
    const __m128i zero = _mm_setzero_si128();
    weights[0] = cvtepu8_epi16(weight);
    weights[1] = _mm_sub_epi16(inverter, weights[0]);
    weights[2] = _mm_unpackhi_epi8(weight, zero);
    weights[3] = _mm_sub_epi16(inverter, weights[2]);
  }
}
   1168 
// Emits |height| rows of 4 SMOOTH_V pixels:
//   pred[x][y] = (top[x] * w[y] + corner * (256 - w[y]) + 128) >> 8.
static AOM_FORCE_INLINE void write_smooth_vertical4xh(
    const __m128i *pixel, const __m128i *weight, const int height,
    uint8_t *LIBAOM_RESTRICT dst, const ptrdiff_t stride) {
  const __m128i pred_round = _mm_set1_epi32(128);  // rounding for >> 8
  // Adding 0x0202 advances the byte-shuffle selection one 16-bit lane/row.
  const __m128i mask_increment = _mm_set1_epi16(0x0202);
  // Packs the low byte of each 32-bit lane into the low 4 output bytes.
  const __m128i cvtepu8_epi32 = _mm_set1_epi32(0xC080400);
  __m128i y_select = _mm_set1_epi16(0x0100);  // selects 16-bit lane 0 (row 0)

  for (int y = 0; y < height; ++y) {
    // Broadcast w[y] and 256 - w[y], then interleave into (w, 256-w) pairs.
    const __m128i weight_y = _mm_shuffle_epi8(weight[0], y_select);
    const __m128i inverted_weight_y = _mm_shuffle_epi8(weight[1], y_select);
    const __m128i alternate_weights =
        _mm_unpacklo_epi16(weight_y, inverted_weight_y);
    // Here the pixel vector is top_row[0], corner, top_row[1], corner, ...
    // The madd instruction yields four results of the form:
    // (top_row[x] * weight[y] + corner * inverted_weight[y])
    __m128i sum = _mm_madd_epi16(pixel[0], alternate_weights);
    sum = _mm_add_epi32(sum, pred_round);
    sum = _mm_srai_epi32(sum, 8);
    sum = _mm_shuffle_epi8(sum, cvtepu8_epi32);
    Store4(dst, sum);
    dst += stride;
    y_select = _mm_add_epi16(y_select, mask_increment);
  }
}
   1194 
   1195 void aom_smooth_v_predictor_4x4_ssse3(
   1196    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
   1197    const uint8_t *LIBAOM_RESTRICT top_row,
   1198    const uint8_t *LIBAOM_RESTRICT left_column) {
   1199  __m128i pixels;
   1200  load_smooth_vertical_pixels4(top_row, left_column, 4, &pixels);
   1201 
   1202  __m128i weights[2];
   1203  load_smooth_vertical_weights4(smooth_weights, 4, weights);
   1204 
   1205  write_smooth_vertical4xh(&pixels, weights, 4, dst, stride);
   1206 }
   1207 
   1208 void aom_smooth_v_predictor_4x8_ssse3(
   1209    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
   1210    const uint8_t *LIBAOM_RESTRICT top_row,
   1211    const uint8_t *LIBAOM_RESTRICT left_column) {
   1212  __m128i pixels;
   1213  load_smooth_vertical_pixels4(top_row, left_column, 8, &pixels);
   1214 
   1215  __m128i weights[2];
   1216  load_smooth_vertical_weights4(smooth_weights, 8, weights);
   1217 
   1218  write_smooth_vertical4xh(&pixels, weights, 8, dst, stride);
   1219 }
   1220 
   1221 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   1222 void aom_smooth_v_predictor_4x16_ssse3(
   1223    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
   1224    const uint8_t *LIBAOM_RESTRICT top_row,
   1225    const uint8_t *LIBAOM_RESTRICT left_column) {
   1226  __m128i pixels;
   1227  load_smooth_vertical_pixels4(top_row, left_column, 16, &pixels);
   1228 
   1229  __m128i weights[4];
   1230  load_smooth_vertical_weights4(smooth_weights, 16, weights);
   1231 
   1232  write_smooth_vertical4xh(&pixels, weights, 8, dst, stride);
   1233  dst += stride << 3;
   1234  write_smooth_vertical4xh(&pixels, &weights[2], 8, dst, stride);
   1235 }
   1236 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   1237 
   1238 void aom_smooth_v_predictor_8x4_ssse3(
   1239    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
   1240    const uint8_t *LIBAOM_RESTRICT top_row,
   1241    const uint8_t *LIBAOM_RESTRICT left_column) {
   1242  const __m128i bottom_left = _mm_set1_epi16(left_column[3]);
   1243  const __m128i weights = cvtepu8_epi16(Load4(smooth_weights));
   1244  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
   1245  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
   1246  const __m128i scaled_bottom_left =
   1247      _mm_mullo_epi16(inverted_weights, bottom_left);
   1248  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
   1249  __m128i y_select = _mm_set1_epi32(0x01000100);
   1250  const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
   1251  __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
   1252  __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
   1253  write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
   1254                                &round);
   1255  dst += stride;
   1256  y_select = _mm_set1_epi32(0x03020302);
   1257  weights_y = _mm_shuffle_epi8(weights, y_select);
   1258  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
   1259  write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
   1260                                &round);
   1261  dst += stride;
   1262  y_select = _mm_set1_epi32(0x05040504);
   1263  weights_y = _mm_shuffle_epi8(weights, y_select);
   1264  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
   1265  write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
   1266                                &round);
   1267  dst += stride;
   1268  y_select = _mm_set1_epi32(0x07060706);
   1269  weights_y = _mm_shuffle_epi8(weights, y_select);
   1270  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
   1271  write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
   1272                                &round);
   1273 }
   1274 
   1275 void aom_smooth_v_predictor_8x8_ssse3(
   1276    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
   1277    const uint8_t *LIBAOM_RESTRICT top_row,
   1278    const uint8_t *LIBAOM_RESTRICT left_column) {
   1279  const __m128i bottom_left = _mm_set1_epi16(left_column[7]);
   1280  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
   1281  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
   1282  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
   1283  const __m128i scaled_bottom_left =
   1284      _mm_mullo_epi16(inverted_weights, bottom_left);
   1285  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
   1286  const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
   1287  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
   1288    const __m128i y_select = _mm_set1_epi32(y_mask);
   1289    const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
   1290    const __m128i scaled_bottom_left_y =
   1291        _mm_shuffle_epi8(scaled_bottom_left, y_select);
   1292    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
   1293                                  &round);
   1294    dst += stride;
   1295  }
   1296 }
   1297 
// SMOOTH_V 8x16: rows 0..7 use the low half of the 16-entry weight table
// (weights1), rows 8..15 use the high half (weights2). Each row blends the
// 8 top pixels with the bottom-left pixel.
void aom_smooth_v_predictor_8x16_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
  // 16-entry weight table starts at offset 12.
  const __m128i weights = LoadUnaligned16(smooth_weights + 12);

  const __m128i weights1 = cvtepu8_epi16(weights);
  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  // Corner term (scale - w[y]) * bottom_left, precomputed per weight half.
  const __m128i scaled_bottom_left1 =
      _mm_mullo_epi16(inverted_weights1, bottom_left);
  const __m128i scaled_bottom_left2 =
      _mm_mullo_epi16(inverted_weights2, bottom_left);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
  // Each loop runs 8 rows; y_mask broadcasts 16-bit lane y of the weight
  // registers (lane 0 through lane 7).
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
                                  &round);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
                                  &round);
    dst += stride;
  }
}
   1335 
   1336 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   1337 void aom_smooth_v_predictor_8x32_ssse3(
   1338    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
   1339    const uint8_t *LIBAOM_RESTRICT top_row,
   1340    const uint8_t *LIBAOM_RESTRICT left_column) {
   1341  const __m128i zero = _mm_setzero_si128();
   1342  const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
   1343  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
   1344  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
   1345  const __m128i weights1 = cvtepu8_epi16(weights_lo);
   1346  const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
   1347  const __m128i weights3 = cvtepu8_epi16(weights_hi);
   1348  const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
   1349  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
   1350  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
   1351  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
   1352  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
   1353  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
   1354  const __m128i scaled_bottom_left1 =
   1355      _mm_mullo_epi16(inverted_weights1, bottom_left);
   1356  const __m128i scaled_bottom_left2 =
   1357      _mm_mullo_epi16(inverted_weights2, bottom_left);
   1358  const __m128i scaled_bottom_left3 =
   1359      _mm_mullo_epi16(inverted_weights3, bottom_left);
   1360  const __m128i scaled_bottom_left4 =
   1361      _mm_mullo_epi16(inverted_weights4, bottom_left);
   1362  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
   1363  const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
   1364  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
   1365    const __m128i y_select = _mm_set1_epi32(y_mask);
   1366    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
   1367    const __m128i scaled_bottom_left_y =
   1368        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
   1369    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
   1370                                  &round);
   1371    dst += stride;
   1372  }
   1373  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
   1374    const __m128i y_select = _mm_set1_epi32(y_mask);
   1375    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
   1376    const __m128i scaled_bottom_left_y =
   1377        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
   1378    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
   1379                                  &round);
   1380    dst += stride;
   1381  }
   1382  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
   1383    const __m128i y_select = _mm_set1_epi32(y_mask);
   1384    const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
   1385    const __m128i scaled_bottom_left_y =
   1386        _mm_shuffle_epi8(scaled_bottom_left3, y_select);
   1387    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
   1388                                  &round);
   1389    dst += stride;
   1390  }
   1391  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
   1392    const __m128i y_select = _mm_set1_epi32(y_mask);
   1393    const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
   1394    const __m128i scaled_bottom_left_y =
   1395        _mm_shuffle_epi8(scaled_bottom_left4, y_select);
   1396    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
   1397                                  &round);
   1398    dst += stride;
   1399  }
   1400 }
   1401 
   1402 void aom_smooth_v_predictor_16x4_ssse3(
   1403    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
   1404    const uint8_t *LIBAOM_RESTRICT top_row,
   1405    const uint8_t *LIBAOM_RESTRICT left_column) {
   1406  const __m128i bottom_left = _mm_set1_epi16(left_column[3]);
   1407  const __m128i weights = cvtepu8_epi16(Load4(smooth_weights));
   1408  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
   1409  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
   1410  const __m128i scaled_bottom_left =
   1411      _mm_mullo_epi16(inverted_weights, bottom_left);
   1412  const __m128i round = _mm_set1_epi16(128);
   1413  const __m128i top = LoadUnaligned16(top_row);
   1414  const __m128i top_lo = cvtepu8_epi16(top);
   1415  const __m128i top_hi = cvtepu8_epi16(_mm_srli_si128(top, 8));
   1416 
   1417  __m128i y_select = _mm_set1_epi32(0x01000100);
   1418  __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
   1419  __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
   1420  write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
   1421                                 scaled_bottom_left_y, scaled_bottom_left_y,
   1422                                 round);
   1423  dst += stride;
   1424  y_select = _mm_set1_epi32(0x03020302);
   1425  weights_y = _mm_shuffle_epi8(weights, y_select);
   1426  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
   1427  write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
   1428                                 scaled_bottom_left_y, scaled_bottom_left_y,
   1429                                 round);
   1430  dst += stride;
   1431  y_select = _mm_set1_epi32(0x05040504);
   1432  weights_y = _mm_shuffle_epi8(weights, y_select);
   1433  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
   1434  write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
   1435                                 scaled_bottom_left_y, scaled_bottom_left_y,
   1436                                 round);
   1437  dst += stride;
   1438  y_select = _mm_set1_epi32(0x07060706);
   1439  weights_y = _mm_shuffle_epi8(weights, y_select);
   1440  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
   1441  write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
   1442                                 scaled_bottom_left_y, scaled_bottom_left_y,
   1443                                 round);
   1444 }
   1445 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   1446 
   1447 void aom_smooth_v_predictor_16x8_ssse3(
   1448    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
   1449    const uint8_t *LIBAOM_RESTRICT top_row,
   1450    const uint8_t *LIBAOM_RESTRICT left_column) {
   1451  const __m128i bottom_left = _mm_set1_epi16(left_column[7]);
   1452  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
   1453  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
   1454  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
   1455  const __m128i scaled_bottom_left =
   1456      _mm_mullo_epi16(inverted_weights, bottom_left);
   1457  const __m128i round = _mm_set1_epi16(128);
   1458  const __m128i top = LoadUnaligned16(top_row);
   1459  const __m128i top_lo = cvtepu8_epi16(top);
   1460  const __m128i top_hi = cvtepu8_epi16(_mm_srli_si128(top, 8));
   1461  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
   1462    const __m128i y_select = _mm_set1_epi32(y_mask);
   1463    const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
   1464    const __m128i scaled_bottom_left_y =
   1465        _mm_shuffle_epi8(scaled_bottom_left, y_select);
   1466    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
   1467                                   scaled_bottom_left_y, scaled_bottom_left_y,
   1468                                   round);
   1469    dst += stride;
   1470  }
   1471 }
   1472 
// Smooth vertical (SMOOTH_V) prediction for a 16x16 block: every row is a
// weighted combination of the 16 top-row samples and the bottom-left
// sample, with per-row weights taken from smooth_weights.
void aom_smooth_v_predictor_16x16_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  // The final left-column sample anchors the bottom of the blend.
  const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
  const __m128i zero = _mm_setzero_si128();
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  // The 16 byte weights for this block height are read at offset 12, then
  // widened into low/high vectors of eight 16-bit weights each.
  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
  const __m128i weights_lo = cvtepu8_epi16(weights);
  const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
  const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
  const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
  // Pre-multiply the bottom-left sample by the complementary weights so the
  // row loops only need a lane broadcast.
  const __m128i scaled_bottom_left_lo =
      _mm_mullo_epi16(inverted_weights_lo, bottom_left);
  const __m128i scaled_bottom_left_hi =
      _mm_mullo_epi16(inverted_weights_hi, bottom_left);
  // 128 == 1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1), the rounding bias.
  const __m128i round = _mm_set1_epi16(128);

  const __m128i top = LoadUnaligned16(top_row);
  const __m128i top_lo = cvtepu8_epi16(top);
  const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
  // Rows 0-7: pshufb broadcasts one 16-bit lane of weights_lo per row.
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
  // Rows 8-15: same pattern with the high half of the weights.
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
}
   1515 
// Smooth vertical (SMOOTH_V) prediction for a 16x32 block: every row blends
// the 16 top-row samples toward the bottom-left sample; the 32 per-row
// weights are read from smooth_weights.
void aom_smooth_v_predictor_16x32_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  // The final left-column sample anchors the bottom of the blend.
  const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
  // 32 byte weights at offset 28, widened to four vectors of eight 16-bit
  // weights (one per group of eight rows).
  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i zero = _mm_setzero_si128();
  const __m128i weights1 = cvtepu8_epi16(weights_lo);
  const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
  const __m128i weights3 = cvtepu8_epi16(weights_hi);
  const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
  // Pre-multiply the bottom-left sample by the complementary weights so the
  // row loops only need a lane broadcast.
  const __m128i scaled_bottom_left1 =
      _mm_mullo_epi16(inverted_weights1, bottom_left);
  const __m128i scaled_bottom_left2 =
      _mm_mullo_epi16(inverted_weights2, bottom_left);
  const __m128i scaled_bottom_left3 =
      _mm_mullo_epi16(inverted_weights3, bottom_left);
  const __m128i scaled_bottom_left4 =
      _mm_mullo_epi16(inverted_weights4, bottom_left);
  // 128 == 1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1), the rounding bias.
  const __m128i round = _mm_set1_epi16(128);

  const __m128i top = LoadUnaligned16(top_row);
  const __m128i top_lo = cvtepu8_epi16(top);
  const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
  // Rows 0-7: pshufb broadcasts one 16-bit weight lane per row.
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
  // Rows 8-15.
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
  // Rows 16-23.
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left3, y_select);
    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
  // Rows 24-31.
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left4, y_select);
    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
}
   1587 
   1588 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// Smooth vertical (SMOOTH_V) prediction for a 16x64 block: every row blends
// the 16 top-row samples toward the bottom-left sample. The 64 per-row
// weights are consumed 16 at a time by the outer loop.
void aom_smooth_v_predictor_16x64_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  // The final left-column sample anchors the bottom of the blend.
  const __m128i bottom_left = _mm_set1_epi16(left_column[63]);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  // 128 == 1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1), the rounding bias.
  const __m128i round = _mm_set1_epi16(128);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top = LoadUnaligned16(top_row);
  const __m128i top_lo = cvtepu8_epi16(top);
  const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
  // The 64 weights for this block height start at offset 60.
  const uint8_t *weights_base_ptr = smooth_weights + 60;
  // Each outer iteration loads 16 byte weights and emits 16 rows.
  for (int left_offset = 0; left_offset < 64; left_offset += 16) {
    const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
    const __m128i weights_lo = cvtepu8_epi16(weights);
    const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
    const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
    const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
    // Pre-multiply the bottom-left sample by the complementary weights so
    // the row loops only need a lane broadcast.
    const __m128i scaled_bottom_left_lo =
        _mm_mullo_epi16(inverted_weights_lo, bottom_left);
    const __m128i scaled_bottom_left_hi =
        _mm_mullo_epi16(inverted_weights_hi, bottom_left);

    // First 8 rows of this group: pshufb broadcasts one weight lane per row.
    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
      const __m128i y_select = _mm_set1_epi32(y_mask);
      const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
      const __m128i scaled_bottom_left_y =
          _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
      write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
                                     scaled_bottom_left_y, scaled_bottom_left_y,
                                     round);
      dst += stride;
    }
    // Second 8 rows of this group, using the high half of the weights.
    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
      const __m128i y_select = _mm_set1_epi32(y_mask);
      const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
      const __m128i scaled_bottom_left_y =
          _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
      write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
                                     scaled_bottom_left_y, scaled_bottom_left_y,
                                     round);
      dst += stride;
    }
  }
}
   1634 
   1635 void aom_smooth_v_predictor_32x8_ssse3(
   1636    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
   1637    const uint8_t *LIBAOM_RESTRICT top_row,
   1638    const uint8_t *LIBAOM_RESTRICT left_column) {
   1639  const __m128i zero = _mm_setzero_si128();
   1640  const __m128i bottom_left = _mm_set1_epi16(left_column[7]);
   1641  const __m128i top_lo = LoadUnaligned16(top_row);
   1642  const __m128i top_hi = LoadUnaligned16(top_row + 16);
   1643  const __m128i top1 = cvtepu8_epi16(top_lo);
   1644  const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
   1645  const __m128i top3 = cvtepu8_epi16(top_hi);
   1646  const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
   1647  __m128i scale = _mm_set1_epi16(256);
   1648  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
   1649  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
   1650  const __m128i scaled_bottom_left =
   1651      _mm_mullo_epi16(inverted_weights, bottom_left);
   1652  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
   1653  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
   1654    __m128i y_select = _mm_set1_epi32(y_mask);
   1655    const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
   1656    const __m128i scaled_bottom_left_y =
   1657        _mm_shuffle_epi8(scaled_bottom_left, y_select);
   1658    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
   1659                                   scaled_bottom_left_y, scaled_bottom_left_y,
   1660                                   round);
   1661    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
   1662                                   scaled_bottom_left_y, scaled_bottom_left_y,
   1663                                   round);
   1664    dst += stride;
   1665  }
   1666 }
   1667 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   1668 
// Smooth vertical (SMOOTH_V) prediction for a 32x16 block: every row blends
// the 32 top-row samples toward the bottom-left sample using per-row
// weights from smooth_weights.
void aom_smooth_v_predictor_32x16_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i zero = _mm_setzero_si128();
  // The final left-column sample anchors the bottom of the blend.
  const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
  const __m128i top_lo = LoadUnaligned16(top_row);
  const __m128i top_hi = LoadUnaligned16(top_row + 16);
  // Widen the 32 top-row bytes to four vectors of eight 16-bit samples.
  const __m128i top1 = cvtepu8_epi16(top_lo);
  const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
  const __m128i top3 = cvtepu8_epi16(top_hi);
  const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
  // The 16 row weights for this block height are read at offset 12.
  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
  const __m128i weights1 = cvtepu8_epi16(weights);
  const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  // Pre-multiply the bottom-left sample by the complementary weights so the
  // row loops only need a lane broadcast.
  const __m128i scaled_bottom_left1 =
      _mm_mullo_epi16(inverted_weights1, bottom_left);
  const __m128i scaled_bottom_left2 =
      _mm_mullo_epi16(inverted_weights2, bottom_left);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  // Rows 0-7: pshufb broadcasts one 16-bit weight lane per row; each row
  // writes two 16-pixel halves.
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
  // Rows 8-15: same pattern with the high half of the weights.
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
}
   1719 
// Smooth vertical (SMOOTH_V) prediction for a 32x32 block: every row blends
// the 32 top-row samples toward the bottom-left sample; the 32 per-row
// weights come from smooth_weights.
void aom_smooth_v_predictor_32x32_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  // The final left-column sample anchors the bottom of the blend.
  const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
  // 32 byte weights at offset 28, widened to four vectors of eight 16-bit
  // weights (one per group of eight rows).
  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
  const __m128i zero = _mm_setzero_si128();
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i top_lo = LoadUnaligned16(top_row);
  const __m128i top_hi = LoadUnaligned16(top_row + 16);
  // Widen the 32 top-row bytes to four vectors of eight 16-bit samples.
  const __m128i top1 = cvtepu8_epi16(top_lo);
  const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
  const __m128i top3 = cvtepu8_epi16(top_hi);
  const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
  const __m128i weights1 = cvtepu8_epi16(weights_lo);
  const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
  const __m128i weights3 = cvtepu8_epi16(weights_hi);
  const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
  // Pre-multiply the bottom-left sample by the complementary weights so the
  // row loops only need a lane broadcast.
  const __m128i scaled_bottom_left1 =
      _mm_mullo_epi16(inverted_weights1, bottom_left);
  const __m128i scaled_bottom_left2 =
      _mm_mullo_epi16(inverted_weights2, bottom_left);
  const __m128i scaled_bottom_left3 =
      _mm_mullo_epi16(inverted_weights3, bottom_left);
  const __m128i scaled_bottom_left4 =
      _mm_mullo_epi16(inverted_weights4, bottom_left);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  // Rows 0-7: pshufb broadcasts one 16-bit weight lane per row; each row
  // writes two 16-pixel halves.
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
  // Rows 8-15.
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
  // Rows 16-23.
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left3, y_select);
    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
  // Rows 24-31.
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left4, y_select);
    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
}
   1805 
// Smooth vertical (SMOOTH_V) prediction for a 32x64 block: every row blends
// the 32 top-row samples toward the bottom-left sample. The 64 per-row
// weights are consumed 16 at a time by the outer loop.
void aom_smooth_v_predictor_32x64_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i zero = _mm_setzero_si128();
  // The final left-column sample anchors the bottom of the blend.
  const __m128i bottom_left = _mm_set1_epi16(left_column[63]);
  const __m128i top_lo = LoadUnaligned16(top_row);
  const __m128i top_hi = LoadUnaligned16(top_row + 16);
  // Widen the 32 top-row bytes to four vectors of eight 16-bit samples.
  const __m128i top1 = cvtepu8_epi16(top_lo);
  const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
  const __m128i top3 = cvtepu8_epi16(top_hi);
  const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
  // The 64 weights for this block height start at offset 60.
  const uint8_t *weights_base_ptr = smooth_weights + 60;
  // Each outer iteration loads 16 byte weights and emits 16 rows.
  for (int left_offset = 0; left_offset < 64; left_offset += 16) {
    const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
    const __m128i weights_lo = cvtepu8_epi16(weights);
    const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
    const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
    const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
    // Pre-multiply the bottom-left sample by the complementary weights so
    // the row loops only need a lane broadcast.
    const __m128i scaled_bottom_left_lo =
        _mm_mullo_epi16(inverted_weights_lo, bottom_left);
    const __m128i scaled_bottom_left_hi =
        _mm_mullo_epi16(inverted_weights_hi, bottom_left);

    // First 8 rows of this group: pshufb broadcasts one weight lane per
    // row; each row writes two 16-pixel halves.
    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
      const __m128i y_select = _mm_set1_epi32(y_mask);
      const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
      const __m128i scaled_bottom_left_y =
          _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
      write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
                                     scaled_bottom_left_y, scaled_bottom_left_y,
                                     round);
      write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
                                     scaled_bottom_left_y, scaled_bottom_left_y,
                                     round);
      dst += stride;
    }
    // Second 8 rows of this group, using the high half of the weights.
    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
      const __m128i y_select = _mm_set1_epi32(y_mask);
      const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
      const __m128i scaled_bottom_left_y =
          _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
      write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
                                     scaled_bottom_left_y, scaled_bottom_left_y,
                                     round);
      write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
                                     scaled_bottom_left_y, scaled_bottom_left_y,
                                     round);
      dst += stride;
    }
  }
}
   1860 
   1861 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   1862 void aom_smooth_v_predictor_64x16_ssse3(
   1863    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
   1864    const uint8_t *LIBAOM_RESTRICT top_row,
   1865    const uint8_t *LIBAOM_RESTRICT left_column) {
   1866  const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
   1867  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
   1868  const __m128i zero = _mm_setzero_si128();
   1869  const __m128i top_lolo = LoadUnaligned16(top_row);
   1870  const __m128i top_lohi = LoadUnaligned16(top_row + 16);
   1871  const __m128i top1 = cvtepu8_epi16(top_lolo);
   1872  const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
   1873  const __m128i top3 = cvtepu8_epi16(top_lohi);
   1874  const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
   1875 
   1876  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
   1877  const __m128i weights1 = cvtepu8_epi16(weights);
   1878  const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
   1879  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
   1880  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
   1881  const __m128i top_hilo = LoadUnaligned16(top_row + 32);
   1882  const __m128i top_hihi = LoadUnaligned16(top_row + 48);
   1883  const __m128i top5 = cvtepu8_epi16(top_hilo);
   1884  const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
   1885  const __m128i top7 = cvtepu8_epi16(top_hihi);
   1886  const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
   1887  const __m128i scaled_bottom_left1 =
   1888      _mm_mullo_epi16(inverted_weights1, bottom_left);
   1889  const __m128i scaled_bottom_left2 =
   1890      _mm_mullo_epi16(inverted_weights2, bottom_left);
   1891  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
   1892  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
   1893    const __m128i y_select = _mm_set1_epi32(y_mask);
   1894    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
   1895    const __m128i scaled_bottom_left_y =
   1896        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
   1897    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
   1898                                   scaled_bottom_left_y, scaled_bottom_left_y,
   1899                                   round);
   1900    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
   1901                                   scaled_bottom_left_y, scaled_bottom_left_y,
   1902                                   round);
   1903    write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
   1904                                   scaled_bottom_left_y, scaled_bottom_left_y,
   1905                                   round);
   1906    write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
   1907                                   scaled_bottom_left_y, scaled_bottom_left_y,
   1908                                   round);
   1909    dst += stride;
   1910  }
   1911  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
   1912    const __m128i y_select = _mm_set1_epi32(y_mask);
   1913    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
   1914    const __m128i scaled_bottom_left_y =
   1915        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
   1916    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
   1917                                   scaled_bottom_left_y, scaled_bottom_left_y,
   1918                                   round);
   1919    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
   1920                                   scaled_bottom_left_y, scaled_bottom_left_y,
   1921                                   round);
   1922    write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
   1923                                   scaled_bottom_left_y, scaled_bottom_left_y,
   1924                                   round);
   1925    write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
   1926                                   scaled_bottom_left_y, scaled_bottom_left_y,
   1927                                   round);
   1928    dst += stride;
   1929  }
   1930 }
   1931 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   1932 
// SMOOTH_V 64x32: every output pixel is a weighted average of the pixel
// directly above it and the bottom-left reference pixel; the weight depends
// only on the row (y). The 32 row weights are kept in four 8-lane registers
// and the rows are emitted in four unrolled 8-row passes.
void aom_smooth_v_predictor_64x32_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
  // Widen the 64 top-row pixels to 16-bit lanes across eight registers.
  const __m128i top_lolo = LoadUnaligned16(top_row);
  const __m128i top_lohi = LoadUnaligned16(top_row + 16);
  const __m128i top1 = cvtepu8_epi16(top_lolo);
  const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
  const __m128i top3 = cvtepu8_epi16(top_lohi);
  const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
  const __m128i top_hilo = LoadUnaligned16(top_row + 32);
  const __m128i top_hihi = LoadUnaligned16(top_row + 48);
  const __m128i top5 = cvtepu8_epi16(top_hilo);
  const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
  const __m128i top7 = cvtepu8_epi16(top_hihi);
  const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
  // 32 row weights for this block height, widened to 16 bits.
  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
  const __m128i weights1 = cvtepu8_epi16(weights_lo);
  const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
  const __m128i weights3 = cvtepu8_epi16(weights_hi);
  const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  // Precompute the complementary (scale - weight) * bottom_left terms so the
  // per-row work is a single shuffle per operand.
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
  const __m128i scaled_bottom_left1 =
      _mm_mullo_epi16(inverted_weights1, bottom_left);
  const __m128i scaled_bottom_left2 =
      _mm_mullo_epi16(inverted_weights2, bottom_left);
  const __m128i scaled_bottom_left3 =
      _mm_mullo_epi16(inverted_weights3, bottom_left);
  const __m128i scaled_bottom_left4 =
      _mm_mullo_epi16(inverted_weights4, bottom_left);
  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));

  // Rows 0-7: the y_mask shuffle broadcasts the 16-bit weight of row y to
  // every lane (masks 0x01000100, 0x03020302, ... 0x0F0E0F0E).
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
  // Rows 8-15.
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
  // Rows 16-23.
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left3, y_select);
    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
  // Rows 24-31.
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left4, y_select);
    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
                                   scaled_bottom_left_y, scaled_bottom_left_y,
                                   round);
    dst += stride;
  }
}
   2049 
// SMOOTH_V 64x64. Unlike the 64x32 version, the 64 row weights are streamed
// 16 at a time from |smooth_weights|, so only one weight/scaled-bottom-left
// register pair is live per 16-row chunk.
void aom_smooth_v_predictor_64x64_ssse3(
    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
    const uint8_t *LIBAOM_RESTRICT top_row,
    const uint8_t *LIBAOM_RESTRICT left_column) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i bottom_left = _mm_set1_epi16(left_column[63]);
  // Widen the 64 top-row pixels to 16-bit lanes across eight registers.
  const __m128i top_lolo = LoadUnaligned16(top_row);
  const __m128i top_lohi = LoadUnaligned16(top_row + 16);
  const __m128i top1 = cvtepu8_epi16(top_lolo);
  const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
  const __m128i top3 = cvtepu8_epi16(top_lohi);
  const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
  const __m128i top_hilo = LoadUnaligned16(top_row + 32);
  const __m128i top_hihi = LoadUnaligned16(top_row + 48);
  const __m128i top5 = cvtepu8_epi16(top_hilo);
  const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
  const __m128i top7 = cvtepu8_epi16(top_hihi);
  const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  // Rounding bias; same value as 1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1) used by
  // the other smooth predictors in this file.
  const __m128i round = _mm_set1_epi16(128);
  const uint8_t *weights_base_ptr = smooth_weights + 60;
  // Each outer iteration covers 16 rows: load 16 weights, widen to two 8-lane
  // halves, precompute (scale - weight) * bottom_left, then emit 8 rows from
  // the low half and 8 rows from the high half.
  for (int left_offset = 0; left_offset < 64; left_offset += 16) {
    const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
    const __m128i weights_lo = cvtepu8_epi16(weights);
    const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
    const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
    const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
    const __m128i scaled_bottom_left_lo =
        _mm_mullo_epi16(inverted_weights_lo, bottom_left);
    const __m128i scaled_bottom_left_hi =
        _mm_mullo_epi16(inverted_weights_hi, bottom_left);
    // The y_mask shuffle broadcasts the 16-bit weight for the current row to
    // every lane (masks 0x01000100 ... 0x0F0E0F0E, eight rows per loop).
    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
      const __m128i y_select = _mm_set1_epi32(y_mask);
      const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
      const __m128i scaled_bottom_left_y =
          _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
      write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
                                     scaled_bottom_left_y, scaled_bottom_left_y,
                                     round);
      write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
                                     scaled_bottom_left_y, scaled_bottom_left_y,
                                     round);
      write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
                                     scaled_bottom_left_y, scaled_bottom_left_y,
                                     round);
      write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
                                     scaled_bottom_left_y, scaled_bottom_left_y,
                                     round);
      dst += stride;
    }
    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
      const __m128i y_select = _mm_set1_epi32(y_mask);
      const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
      const __m128i scaled_bottom_left_y =
          _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
      write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
                                     scaled_bottom_left_y, scaled_bottom_left_y,
                                     round);
      write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
                                     scaled_bottom_left_y, scaled_bottom_left_y,
                                     round);
      write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
                                     scaled_bottom_left_y, scaled_bottom_left_y,
                                     round);
      write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
                                     scaled_bottom_left_y, scaled_bottom_left_y,
                                     round);
      dst += stride;
    }
  }
}
   2121 
   2122 // -----------------------------------------------------------------------------
   2123 // SMOOTH_H_PRED
// Writes one 4-pixel row of a SMOOTH_H prediction.
// |left_y| holds the row's left pixel replicated across the four 32-bit
// lanes, |weights| the four horizontal smooth weights, and |scaled_top_right|
// the precomputed (scale - weight) * top_right terms. Only the low 16 bits of
// each 32-bit lane carry meaningful data; the final byte shuffle keeps byte 0
// of each lane, so anything left in the upper bytes is discarded.
static AOM_FORCE_INLINE void write_smooth_horizontal_sum4(
    uint8_t *LIBAOM_RESTRICT dst, const __m128i *left_y, const __m128i *weights,
    const __m128i *scaled_top_right, const __m128i *round) {
  const __m128i weighted_left_y = _mm_mullo_epi16(*left_y, *weights);
  const __m128i pred_sum = _mm_add_epi32(*scaled_top_right, weighted_left_y);
  // Equivalent to RightShiftWithRounding(pred[x][y], 8).
  const __m128i pred = _mm_srli_epi32(_mm_add_epi32(pred_sum, *round), 8);
  // Gather the low byte of each 32-bit lane into the low 4 bytes and store.
  const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
  Store4(dst, _mm_shuffle_epi8(pred, cvtepi32_epi8));
}
   2134 
   2135 void aom_smooth_h_predictor_4x4_ssse3(
   2136    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
   2137    const uint8_t *LIBAOM_RESTRICT top_row,
   2138    const uint8_t *LIBAOM_RESTRICT left_column) {
   2139  const __m128i top_right = _mm_set1_epi32(top_row[3]);
   2140  const __m128i left = cvtepu8_epi32(Load4(left_column));
   2141  const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
   2142  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
   2143  const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
   2144  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
   2145  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
   2146  __m128i left_y = _mm_shuffle_epi32(left, 0);
   2147  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
   2148                               &round);
   2149  dst += stride;
   2150  left_y = _mm_shuffle_epi32(left, 0x55);
   2151  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
   2152                               &round);
   2153  dst += stride;
   2154  left_y = _mm_shuffle_epi32(left, 0xaa);
   2155  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
   2156                               &round);
   2157  dst += stride;
   2158  left_y = _mm_shuffle_epi32(left, 0xff);
   2159  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
   2160                               &round);
   2161 }
   2162 
   2163 void aom_smooth_h_predictor_4x8_ssse3(
   2164    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
   2165    const uint8_t *LIBAOM_RESTRICT top_row,
   2166    const uint8_t *LIBAOM_RESTRICT left_column) {
   2167  const __m128i top_right = _mm_set1_epi32(top_row[3]);
   2168  const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
   2169  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
   2170  const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
   2171  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
   2172  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
   2173  __m128i left = cvtepu8_epi32(Load4(left_column));
   2174  __m128i left_y = _mm_shuffle_epi32(left, 0);
   2175  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
   2176                               &round);
   2177  dst += stride;
   2178  left_y = _mm_shuffle_epi32(left, 0x55);
   2179  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
   2180                               &round);
   2181  dst += stride;
   2182  left_y = _mm_shuffle_epi32(left, 0xaa);
   2183  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
   2184                               &round);
   2185  dst += stride;
   2186  left_y = _mm_shuffle_epi32(left, 0xff);
   2187  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
   2188                               &round);
   2189  dst += stride;
   2190 
   2191  left = cvtepu8_epi32(Load4(left_column + 4));
   2192  left_y = _mm_shuffle_epi32(left, 0);
   2193  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
   2194                               &round);
   2195  dst += stride;
   2196  left_y = _mm_shuffle_epi32(left, 0x55);
   2197  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
   2198                               &round);
   2199  dst += stride;
   2200  left_y = _mm_shuffle_epi32(left, 0xaa);
   2201  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
   2202                               &round);
   2203  dst += stride;
   2204  left_y = _mm_shuffle_epi32(left, 0xff);
   2205  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
   2206                               &round);
   2207 }
   2208 
   2209 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   2210 void aom_smooth_h_predictor_4x16_ssse3(
   2211    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
   2212    const uint8_t *LIBAOM_RESTRICT top_row,
   2213    const uint8_t *LIBAOM_RESTRICT left_column) {
   2214  const __m128i top_right = _mm_set1_epi32(top_row[3]);
   2215  const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
   2216  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
   2217  const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
   2218  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
   2219  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
   2220  __m128i left = cvtepu8_epi32(Load4(left_column));
   2221  __m128i left_y = _mm_shuffle_epi32(left, 0);
   2222  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
   2223                               &round);
   2224  dst += stride;
   2225  left_y = _mm_shuffle_epi32(left, 0x55);
   2226  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
   2227                               &round);
   2228  dst += stride;
   2229  left_y = _mm_shuffle_epi32(left, 0xaa);
   2230  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
   2231                               &round);
   2232  dst += stride;
   2233  left_y = _mm_shuffle_epi32(left, 0xff);
   2234  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
   2235                               &round);
   2236  dst += stride;
   2237 
   2238  left = cvtepu8_epi32(Load4(left_column + 4));
   2239  left_y = _mm_shuffle_epi32(left, 0);
   2240  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
   2241                               &round);
   2242  dst += stride;
   2243  left_y = _mm_shuffle_epi32(left, 0x55);
   2244  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
   2245                               &round);
   2246  dst += stride;
   2247  left_y = _mm_shuffle_epi32(left, 0xaa);
   2248  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
   2249                               &round);
   2250  dst += stride;
   2251  left_y = _mm_shuffle_epi32(left, 0xff);
   2252  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
   2253                               &round);
   2254  dst += stride;
   2255 
   2256  left = cvtepu8_epi32(Load4(left_column + 8));
   2257  left_y = _mm_shuffle_epi32(left, 0);
   2258  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
   2259                               &round);
   2260  dst += stride;
   2261  left_y = _mm_shuffle_epi32(left, 0x55);
   2262  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
   2263                               &round);
   2264  dst += stride;
   2265  left_y = _mm_shuffle_epi32(left, 0xaa);
   2266  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
   2267                               &round);
   2268  dst += stride;
   2269  left_y = _mm_shuffle_epi32(left, 0xff);
   2270  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
   2271                               &round);
   2272  dst += stride;
   2273 
   2274  left = cvtepu8_epi32(Load4(left_column + 12));
   2275  left_y = _mm_shuffle_epi32(left, 0);
   2276  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
   2277                               &round);
   2278  dst += stride;
   2279  left_y = _mm_shuffle_epi32(left, 0x55);
   2280  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
   2281                               &round);
   2282  dst += stride;
   2283  left_y = _mm_shuffle_epi32(left, 0xaa);
   2284  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
   2285                               &round);
   2286  dst += stride;
   2287  left_y = _mm_shuffle_epi32(left, 0xff);
   2288  write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
   2289                               &round);
   2290 }
   2291 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   2292 
// Note on the write_smooth_directional_sum* helpers used below: for
// SMOOTH_H, the pixel vector is the row's left value repeated across every
// lane; for SMOOTH_V, the pixel vector is a segment of the top row (or the
// whole top row) and it is the weight vector that is repeated per row.
   2296 void aom_smooth_h_predictor_8x4_ssse3(
   2297    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
   2298    const uint8_t *LIBAOM_RESTRICT top_row,
   2299    const uint8_t *LIBAOM_RESTRICT left_column) {
   2300  const __m128i top_right = _mm_set1_epi16(top_row[7]);
   2301  const __m128i left = cvtepu8_epi16(Load4(left_column));
   2302  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
   2303  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
   2304  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
   2305  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
   2306  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
   2307  __m128i y_select = _mm_set1_epi32(0x01000100);
   2308  __m128i left_y = _mm_shuffle_epi8(left, y_select);
   2309  write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
   2310                                &round);
   2311  dst += stride;
   2312  y_select = _mm_set1_epi32(0x03020302);
   2313  left_y = _mm_shuffle_epi8(left, y_select);
   2314  write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
   2315                                &round);
   2316  dst += stride;
   2317  y_select = _mm_set1_epi32(0x05040504);
   2318  left_y = _mm_shuffle_epi8(left, y_select);
   2319  write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
   2320                                &round);
   2321  dst += stride;
   2322  y_select = _mm_set1_epi32(0x07060706);
   2323  left_y = _mm_shuffle_epi8(left, y_select);
   2324  write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
   2325                                &round);
   2326 }
   2327 
   2328 void aom_smooth_h_predictor_8x8_ssse3(
   2329    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
   2330    const uint8_t *LIBAOM_RESTRICT top_row,
   2331    const uint8_t *LIBAOM_RESTRICT left_column) {
   2332  const __m128i top_right = _mm_set1_epi16(top_row[7]);
   2333  const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
   2334  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
   2335  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
   2336  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
   2337  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
   2338  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
   2339  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
   2340    const __m128i y_select = _mm_set1_epi32(y_mask);
   2341    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
   2342    write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
   2343                                  &round);
   2344    dst += stride;
   2345  }
   2346 }
   2347 
   2348 void aom_smooth_h_predictor_8x16_ssse3(
   2349    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
   2350    const uint8_t *LIBAOM_RESTRICT top_row,
   2351    const uint8_t *LIBAOM_RESTRICT left_column) {
   2352  const __m128i top_right = _mm_set1_epi16(top_row[7]);
   2353  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
   2354  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
   2355  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
   2356  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
   2357  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
   2358  __m128i left = cvtepu8_epi16(LoadLo8(left_column));
   2359  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
   2360    const __m128i y_select = _mm_set1_epi32(y_mask);
   2361    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
   2362    write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
   2363                                  &round);
   2364    dst += stride;
   2365  }
   2366  left = cvtepu8_epi16(LoadLo8(left_column + 8));
   2367  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
   2368    const __m128i y_select = _mm_set1_epi32(y_mask);
   2369    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
   2370    write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
   2371                                  &round);
   2372    dst += stride;
   2373  }
   2374 }
   2375 
   2376 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   2377 void aom_smooth_h_predictor_8x32_ssse3(
   2378    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
   2379    const uint8_t *LIBAOM_RESTRICT top_row,
   2380    const uint8_t *LIBAOM_RESTRICT left_column) {
   2381  const __m128i top_right = _mm_set1_epi16(top_row[7]);
   2382  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
   2383  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
   2384  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
   2385  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
   2386  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
   2387  __m128i left = cvtepu8_epi16(LoadLo8(left_column));
   2388  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
   2389    const __m128i y_select = _mm_set1_epi32(y_mask);
   2390    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
   2391    write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
   2392                                  &round);
   2393    dst += stride;
   2394  }
   2395  left = cvtepu8_epi16(LoadLo8(left_column + 8));
   2396  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
   2397    const __m128i y_select = _mm_set1_epi32(y_mask);
   2398    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
   2399    write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
   2400                                  &round);
   2401    dst += stride;
   2402  }
   2403  left = cvtepu8_epi16(LoadLo8(left_column + 16));
   2404  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
   2405    const __m128i y_select = _mm_set1_epi32(y_mask);
   2406    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
   2407    write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
   2408                                  &round);
   2409    dst += stride;
   2410  }
   2411  left = cvtepu8_epi16(LoadLo8(left_column + 24));
   2412  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
   2413    const __m128i y_select = _mm_set1_epi32(y_mask);
   2414    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
   2415    write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
   2416                                  &round);
   2417    dst += stride;
   2418  }
   2419 }
   2420 
   2421 void aom_smooth_h_predictor_16x4_ssse3(
   2422    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
   2423    const uint8_t *LIBAOM_RESTRICT top_row,
   2424    const uint8_t *LIBAOM_RESTRICT left_column) {
   2425  const __m128i top_right = _mm_set1_epi16(top_row[15]);
   2426  const __m128i left = cvtepu8_epi16(Load4(left_column));
   2427  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
   2428  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
   2429  const __m128i weights1 = cvtepu8_epi16(weights);
   2430  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
   2431  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
   2432  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
   2433  const __m128i scaled_top_right1 =
   2434      _mm_mullo_epi16(inverted_weights1, top_right);
   2435  const __m128i scaled_top_right2 =
   2436      _mm_mullo_epi16(inverted_weights2, top_right);
   2437  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
   2438  __m128i y_mask = _mm_set1_epi32(0x01000100);
   2439  __m128i left_y = _mm_shuffle_epi8(left, y_mask);
   2440  write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
   2441                                 scaled_top_right1, scaled_top_right2, round);
   2442  dst += stride;
   2443  y_mask = _mm_set1_epi32(0x03020302);
   2444  left_y = _mm_shuffle_epi8(left, y_mask);
   2445  write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
   2446                                 scaled_top_right1, scaled_top_right2, round);
   2447  dst += stride;
   2448  y_mask = _mm_set1_epi32(0x05040504);
   2449  left_y = _mm_shuffle_epi8(left, y_mask);
   2450  write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
   2451                                 scaled_top_right1, scaled_top_right2, round);
   2452  dst += stride;
   2453  y_mask = _mm_set1_epi32(0x07060706);
   2454  left_y = _mm_shuffle_epi8(left, y_mask);
   2455  write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
   2456                                 scaled_top_right1, scaled_top_right2, round);
   2457 }
   2458 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   2459 
   2460 void aom_smooth_h_predictor_16x8_ssse3(
   2461    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
   2462    const uint8_t *LIBAOM_RESTRICT top_row,
   2463    const uint8_t *LIBAOM_RESTRICT left_column) {
   2464  const __m128i top_right = _mm_set1_epi16(top_row[15]);
   2465  const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
   2466  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
   2467  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
   2468  const __m128i weights1 = cvtepu8_epi16(weights);
   2469  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
   2470  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
   2471  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
   2472  const __m128i scaled_top_right1 =
   2473      _mm_mullo_epi16(inverted_weights1, top_right);
   2474  const __m128i scaled_top_right2 =
   2475      _mm_mullo_epi16(inverted_weights2, top_right);
   2476  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
   2477  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
   2478    const __m128i y_select = _mm_set1_epi32(y_mask);
   2479    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
   2480    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
   2481                                   scaled_top_right1, scaled_top_right2, round);
   2482    dst += stride;
   2483  }
   2484 }
   2485 
   2486 void aom_smooth_h_predictor_16x16_ssse3(
   2487    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
   2488    const uint8_t *LIBAOM_RESTRICT top_row,
   2489    const uint8_t *LIBAOM_RESTRICT left_column) {
   2490  const __m128i top_right = _mm_set1_epi16(top_row[15]);
   2491  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
   2492  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
   2493  const __m128i weights1 = cvtepu8_epi16(weights);
   2494  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
   2495  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
   2496  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
   2497  const __m128i scaled_top_right1 =
   2498      _mm_mullo_epi16(inverted_weights1, top_right);
   2499  const __m128i scaled_top_right2 =
   2500      _mm_mullo_epi16(inverted_weights2, top_right);
   2501  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
   2502  __m128i left = cvtepu8_epi16(LoadLo8(left_column));
   2503  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
   2504    const __m128i y_select = _mm_set1_epi32(y_mask);
   2505    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
   2506    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
   2507                                   scaled_top_right1, scaled_top_right2, round);
   2508    dst += stride;
   2509  }
   2510  left = cvtepu8_epi16(LoadLo8(left_column + 8));
   2511  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
   2512    const __m128i y_select = _mm_set1_epi32(y_mask);
   2513    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
   2514    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
   2515                                   scaled_top_right1, scaled_top_right2, round);
   2516    dst += stride;
   2517  }
   2518 }
   2519 
   2520 void aom_smooth_h_predictor_16x32_ssse3(
   2521    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
   2522    const uint8_t *LIBAOM_RESTRICT top_row,
   2523    const uint8_t *LIBAOM_RESTRICT left_column) {
   2524  const __m128i top_right = _mm_set1_epi16(top_row[15]);
   2525  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
   2526  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
   2527  const __m128i weights1 = cvtepu8_epi16(weights);
   2528  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
   2529  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
   2530  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
   2531  const __m128i scaled_top_right1 =
   2532      _mm_mullo_epi16(inverted_weights1, top_right);
   2533  const __m128i scaled_top_right2 =
   2534      _mm_mullo_epi16(inverted_weights2, top_right);
   2535  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
   2536  __m128i left = cvtepu8_epi16(LoadLo8(left_column));
   2537  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
   2538    const __m128i y_select = _mm_set1_epi32(y_mask);
   2539    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
   2540    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
   2541                                   scaled_top_right1, scaled_top_right2, round);
   2542    dst += stride;
   2543  }
   2544  left = cvtepu8_epi16(LoadLo8(left_column + 8));
   2545  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
   2546    const __m128i y_select = _mm_set1_epi32(y_mask);
   2547    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
   2548    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
   2549                                   scaled_top_right1, scaled_top_right2, round);
   2550    dst += stride;
   2551  }
   2552  left = cvtepu8_epi16(LoadLo8(left_column + 16));
   2553  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
   2554    const __m128i y_select = _mm_set1_epi32(y_mask);
   2555    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
   2556    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
   2557                                   scaled_top_right1, scaled_top_right2, round);
   2558    dst += stride;
   2559  }
   2560  left = cvtepu8_epi16(LoadLo8(left_column + 24));
   2561  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
   2562    const __m128i y_select = _mm_set1_epi32(y_mask);
   2563    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
   2564    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
   2565                                   scaled_top_right1, scaled_top_right2, round);
   2566    dst += stride;
   2567  }
   2568 }
   2569 
   2570 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   2571 void aom_smooth_h_predictor_16x64_ssse3(
   2572    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
   2573    const uint8_t *LIBAOM_RESTRICT top_row,
   2574    const uint8_t *LIBAOM_RESTRICT left_column) {
   2575  const __m128i top_right = _mm_set1_epi16(top_row[15]);
   2576  const __m128i weights = LoadUnaligned16(smooth_weights + 12);
   2577  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
   2578  const __m128i weights1 = cvtepu8_epi16(weights);
   2579  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
   2580  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
   2581  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
   2582  const __m128i scaled_top_right1 =
   2583      _mm_mullo_epi16(inverted_weights1, top_right);
   2584  const __m128i scaled_top_right2 =
   2585      _mm_mullo_epi16(inverted_weights2, top_right);
   2586  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
   2587  for (int left_offset = 0; left_offset < 64; left_offset += 8) {
   2588    const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
   2589    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
   2590      const __m128i y_select = _mm_set1_epi32(y_mask);
   2591      const __m128i left_y = _mm_shuffle_epi8(left, y_select);
   2592      write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
   2593                                     scaled_top_right1, scaled_top_right2,
   2594                                     round);
   2595      dst += stride;
   2596    }
   2597  }
   2598 }
   2599 
   2600 void aom_smooth_h_predictor_32x8_ssse3(
   2601    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
   2602    const uint8_t *LIBAOM_RESTRICT top_row,
   2603    const uint8_t *LIBAOM_RESTRICT left_column) {
   2604  const __m128i top_right = _mm_set1_epi16(top_row[31]);
   2605  const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
   2606  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
   2607  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
   2608  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
   2609  const __m128i weights1 = cvtepu8_epi16(weights_lo);
   2610  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
   2611  const __m128i weights3 = cvtepu8_epi16(weights_hi);
   2612  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
   2613  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
   2614  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
   2615  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
   2616  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
   2617  const __m128i scaled_top_right1 =
   2618      _mm_mullo_epi16(inverted_weights1, top_right);
   2619  const __m128i scaled_top_right2 =
   2620      _mm_mullo_epi16(inverted_weights2, top_right);
   2621  const __m128i scaled_top_right3 =
   2622      _mm_mullo_epi16(inverted_weights3, top_right);
   2623  const __m128i scaled_top_right4 =
   2624      _mm_mullo_epi16(inverted_weights4, top_right);
   2625  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
   2626  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
   2627    __m128i y_select = _mm_set1_epi32(y_mask);
   2628    __m128i left_y = _mm_shuffle_epi8(left, y_select);
   2629    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
   2630                                   scaled_top_right1, scaled_top_right2, round);
   2631    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
   2632                                   scaled_top_right3, scaled_top_right4, round);
   2633    dst += stride;
   2634  }
   2635 }
   2636 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   2637 
   2638 void aom_smooth_h_predictor_32x16_ssse3(
   2639    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
   2640    const uint8_t *LIBAOM_RESTRICT top_row,
   2641    const uint8_t *LIBAOM_RESTRICT left_column) {
   2642  const __m128i top_right = _mm_set1_epi16(top_row[31]);
   2643  const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
   2644  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
   2645  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
   2646  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
   2647  const __m128i weights1 = cvtepu8_epi16(weights_lo);
   2648  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
   2649  const __m128i weights3 = cvtepu8_epi16(weights_hi);
   2650  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
   2651  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
   2652  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
   2653  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
   2654  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
   2655  const __m128i scaled_top_right1 =
   2656      _mm_mullo_epi16(inverted_weights1, top_right);
   2657  const __m128i scaled_top_right2 =
   2658      _mm_mullo_epi16(inverted_weights2, top_right);
   2659  const __m128i scaled_top_right3 =
   2660      _mm_mullo_epi16(inverted_weights3, top_right);
   2661  const __m128i scaled_top_right4 =
   2662      _mm_mullo_epi16(inverted_weights4, top_right);
   2663  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
   2664  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
   2665    __m128i y_select = _mm_set1_epi32(y_mask);
   2666    __m128i left_y = _mm_shuffle_epi8(left1, y_select);
   2667    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
   2668                                   scaled_top_right1, scaled_top_right2, round);
   2669    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
   2670                                   scaled_top_right3, scaled_top_right4, round);
   2671    dst += stride;
   2672  }
   2673  const __m128i left2 =
   2674      cvtepu8_epi16(LoadLo8((const uint8_t *)left_column + 8));
   2675  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
   2676    __m128i y_select = _mm_set1_epi32(y_mask);
   2677    __m128i left_y = _mm_shuffle_epi8(left2, y_select);
   2678    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
   2679                                   scaled_top_right1, scaled_top_right2, round);
   2680    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
   2681                                   scaled_top_right3, scaled_top_right4, round);
   2682    dst += stride;
   2683  }
   2684 }
   2685 
   2686 void aom_smooth_h_predictor_32x32_ssse3(
   2687    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
   2688    const uint8_t *LIBAOM_RESTRICT top_row,
   2689    const uint8_t *LIBAOM_RESTRICT left_column) {
   2690  const __m128i top_right = _mm_set1_epi16(top_row[31]);
   2691  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
   2692  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
   2693  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
   2694  const __m128i weights1 = cvtepu8_epi16(weights_lo);
   2695  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
   2696  const __m128i weights3 = cvtepu8_epi16(weights_hi);
   2697  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
   2698  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
   2699  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
   2700  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
   2701  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
   2702  const __m128i scaled_top_right1 =
   2703      _mm_mullo_epi16(inverted_weights1, top_right);
   2704  const __m128i scaled_top_right2 =
   2705      _mm_mullo_epi16(inverted_weights2, top_right);
   2706  const __m128i scaled_top_right3 =
   2707      _mm_mullo_epi16(inverted_weights3, top_right);
   2708  const __m128i scaled_top_right4 =
   2709      _mm_mullo_epi16(inverted_weights4, top_right);
   2710  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
   2711  __m128i left = cvtepu8_epi16(LoadLo8(left_column));
   2712  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
   2713    __m128i y_select = _mm_set1_epi32(y_mask);
   2714    __m128i left_y = _mm_shuffle_epi8(left, y_select);
   2715    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
   2716                                   scaled_top_right1, scaled_top_right2, round);
   2717    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
   2718                                   scaled_top_right3, scaled_top_right4, round);
   2719    dst += stride;
   2720  }
   2721  left = cvtepu8_epi16(LoadLo8(left_column + 8));
   2722  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
   2723    __m128i y_select = _mm_set1_epi32(y_mask);
   2724    __m128i left_y = _mm_shuffle_epi8(left, y_select);
   2725    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
   2726                                   scaled_top_right1, scaled_top_right2, round);
   2727    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
   2728                                   scaled_top_right3, scaled_top_right4, round);
   2729    dst += stride;
   2730  }
   2731  left = cvtepu8_epi16(LoadLo8(left_column + 16));
   2732  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
   2733    __m128i y_select = _mm_set1_epi32(y_mask);
   2734    __m128i left_y = _mm_shuffle_epi8(left, y_select);
   2735    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
   2736                                   scaled_top_right1, scaled_top_right2, round);
   2737    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
   2738                                   scaled_top_right3, scaled_top_right4, round);
   2739    dst += stride;
   2740  }
   2741  left = cvtepu8_epi16(LoadLo8(left_column + 24));
   2742  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
   2743    __m128i y_select = _mm_set1_epi32(y_mask);
   2744    __m128i left_y = _mm_shuffle_epi8(left, y_select);
   2745    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
   2746                                   scaled_top_right1, scaled_top_right2, round);
   2747    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
   2748                                   scaled_top_right3, scaled_top_right4, round);
   2749    dst += stride;
   2750  }
   2751 }
   2752 
   2753 void aom_smooth_h_predictor_32x64_ssse3(
   2754    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
   2755    const uint8_t *LIBAOM_RESTRICT top_row,
   2756    const uint8_t *LIBAOM_RESTRICT left_column) {
   2757  const __m128i top_right = _mm_set1_epi16(top_row[31]);
   2758  const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
   2759  const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
   2760  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
   2761  const __m128i weights1 = cvtepu8_epi16(weights_lo);
   2762  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
   2763  const __m128i weights3 = cvtepu8_epi16(weights_hi);
   2764  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
   2765  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
   2766  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
   2767  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
   2768  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
   2769  const __m128i scaled_top_right1 =
   2770      _mm_mullo_epi16(inverted_weights1, top_right);
   2771  const __m128i scaled_top_right2 =
   2772      _mm_mullo_epi16(inverted_weights2, top_right);
   2773  const __m128i scaled_top_right3 =
   2774      _mm_mullo_epi16(inverted_weights3, top_right);
   2775  const __m128i scaled_top_right4 =
   2776      _mm_mullo_epi16(inverted_weights4, top_right);
   2777  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
   2778  for (int left_offset = 0; left_offset < 64; left_offset += 8) {
   2779    const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
   2780    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
   2781      const __m128i y_select = _mm_set1_epi32(y_mask);
   2782      const __m128i left_y = _mm_shuffle_epi8(left, y_select);
   2783      write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
   2784                                     scaled_top_right1, scaled_top_right2,
   2785                                     round);
   2786      write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3,
   2787                                     weights4, scaled_top_right3,
   2788                                     scaled_top_right4, round);
   2789      dst += stride;
   2790    }
   2791  }
   2792 }
   2793 
   2794 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   2795 void aom_smooth_h_predictor_64x16_ssse3(
   2796    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
   2797    const uint8_t *LIBAOM_RESTRICT top_row,
   2798    const uint8_t *LIBAOM_RESTRICT left_column) {
   2799  const __m128i top_right = _mm_set1_epi16(top_row[63]);
   2800  const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
   2801  const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
   2802  const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
   2803  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
   2804  const __m128i weights1 = cvtepu8_epi16(weights_lolo);
   2805  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
   2806  const __m128i weights3 = cvtepu8_epi16(weights_lohi);
   2807  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
   2808  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
   2809  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
   2810  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
   2811  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
   2812  const __m128i scaled_top_right1 =
   2813      _mm_mullo_epi16(inverted_weights1, top_right);
   2814  const __m128i scaled_top_right2 =
   2815      _mm_mullo_epi16(inverted_weights2, top_right);
   2816  const __m128i scaled_top_right3 =
   2817      _mm_mullo_epi16(inverted_weights3, top_right);
   2818  const __m128i scaled_top_right4 =
   2819      _mm_mullo_epi16(inverted_weights4, top_right);
   2820  const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92);
   2821  const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108);
   2822  const __m128i weights5 = cvtepu8_epi16(weights_hilo);
   2823  const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
   2824  const __m128i weights7 = cvtepu8_epi16(weights_hihi);
   2825  const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
   2826  const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
   2827  const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
   2828  const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
   2829  const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
   2830  const __m128i scaled_top_right5 =
   2831      _mm_mullo_epi16(inverted_weights5, top_right);
   2832  const __m128i scaled_top_right6 =
   2833      _mm_mullo_epi16(inverted_weights6, top_right);
   2834  const __m128i scaled_top_right7 =
   2835      _mm_mullo_epi16(inverted_weights7, top_right);
   2836  const __m128i scaled_top_right8 =
   2837      _mm_mullo_epi16(inverted_weights8, top_right);
   2838  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
   2839  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
   2840    __m128i y_select = _mm_set1_epi32(y_mask);
   2841    __m128i left_y = _mm_shuffle_epi8(left1, y_select);
   2842    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
   2843                                   scaled_top_right1, scaled_top_right2, round);
   2844    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
   2845                                   scaled_top_right3, scaled_top_right4, round);
   2846    write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
   2847                                   scaled_top_right5, scaled_top_right6, round);
   2848    write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
   2849                                   scaled_top_right7, scaled_top_right8, round);
   2850    dst += stride;
   2851  }
   2852  const __m128i left2 = cvtepu8_epi16(LoadLo8(left_column + 8));
   2853  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
   2854    __m128i y_select = _mm_set1_epi32(y_mask);
   2855    __m128i left_y = _mm_shuffle_epi8(left2, y_select);
   2856    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
   2857                                   scaled_top_right1, scaled_top_right2, round);
   2858    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
   2859                                   scaled_top_right3, scaled_top_right4, round);
   2860    write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
   2861                                   scaled_top_right5, scaled_top_right6, round);
   2862    write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
   2863                                   scaled_top_right7, scaled_top_right8, round);
   2864    dst += stride;
   2865  }
   2866 }
   2867 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   2868 
   2869 void aom_smooth_h_predictor_64x32_ssse3(
   2870    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
   2871    const uint8_t *LIBAOM_RESTRICT top_row,
   2872    const uint8_t *LIBAOM_RESTRICT left_column) {
   2873  const __m128i top_right = _mm_set1_epi16(top_row[63]);
   2874  const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
   2875  const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
   2876  const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
   2877  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
   2878  const __m128i weights1 = cvtepu8_epi16(weights_lolo);
   2879  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
   2880  const __m128i weights3 = cvtepu8_epi16(weights_lohi);
   2881  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
   2882  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
   2883  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
   2884  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
   2885  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
   2886  const __m128i scaled_top_right1 =
   2887      _mm_mullo_epi16(inverted_weights1, top_right);
   2888  const __m128i scaled_top_right2 =
   2889      _mm_mullo_epi16(inverted_weights2, top_right);
   2890  const __m128i scaled_top_right3 =
   2891      _mm_mullo_epi16(inverted_weights3, top_right);
   2892  const __m128i scaled_top_right4 =
   2893      _mm_mullo_epi16(inverted_weights4, top_right);
   2894  const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92);
   2895  const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108);
   2896  const __m128i weights5 = cvtepu8_epi16(weights_hilo);
   2897  const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
   2898  const __m128i weights7 = cvtepu8_epi16(weights_hihi);
   2899  const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
   2900  const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
   2901  const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
   2902  const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
   2903  const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
   2904  const __m128i scaled_top_right5 =
   2905      _mm_mullo_epi16(inverted_weights5, top_right);
   2906  const __m128i scaled_top_right6 =
   2907      _mm_mullo_epi16(inverted_weights6, top_right);
   2908  const __m128i scaled_top_right7 =
   2909      _mm_mullo_epi16(inverted_weights7, top_right);
   2910  const __m128i scaled_top_right8 =
   2911      _mm_mullo_epi16(inverted_weights8, top_right);
   2912  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
   2913  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
   2914    const __m128i y_select = _mm_set1_epi32(y_mask);
   2915    const __m128i left_y = _mm_shuffle_epi8(left1, y_select);
   2916    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
   2917                                   scaled_top_right1, scaled_top_right2, round);
   2918    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
   2919                                   scaled_top_right3, scaled_top_right4, round);
   2920    write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
   2921                                   scaled_top_right5, scaled_top_right6, round);
   2922    write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
   2923                                   scaled_top_right7, scaled_top_right8, round);
   2924    dst += stride;
   2925  }
   2926  const __m128i left2 = cvtepu8_epi16(LoadLo8(left_column + 8));
   2927  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
   2928    const __m128i y_select = _mm_set1_epi32(y_mask);
   2929    const __m128i left_y = _mm_shuffle_epi8(left2, y_select);
   2930    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
   2931                                   scaled_top_right1, scaled_top_right2, round);
   2932    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
   2933                                   scaled_top_right3, scaled_top_right4, round);
   2934    write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
   2935                                   scaled_top_right5, scaled_top_right6, round);
   2936    write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
   2937                                   scaled_top_right7, scaled_top_right8, round);
   2938    dst += stride;
   2939  }
   2940  const __m128i left3 = cvtepu8_epi16(LoadLo8(left_column + 16));
   2941  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
   2942    const __m128i y_select = _mm_set1_epi32(y_mask);
   2943    const __m128i left_y = _mm_shuffle_epi8(left3, y_select);
   2944    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
   2945                                   scaled_top_right1, scaled_top_right2, round);
   2946    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
   2947                                   scaled_top_right3, scaled_top_right4, round);
   2948    write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
   2949                                   scaled_top_right5, scaled_top_right6, round);
   2950    write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
   2951                                   scaled_top_right7, scaled_top_right8, round);
   2952    dst += stride;
   2953  }
   2954  const __m128i left4 = cvtepu8_epi16(LoadLo8(left_column + 24));
   2955  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
   2956    const __m128i y_select = _mm_set1_epi32(y_mask);
   2957    const __m128i left_y = _mm_shuffle_epi8(left4, y_select);
   2958    write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
   2959                                   scaled_top_right1, scaled_top_right2, round);
   2960    write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
   2961                                   scaled_top_right3, scaled_top_right4, round);
   2962    write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
   2963                                   scaled_top_right5, scaled_top_right6, round);
   2964    write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
   2965                                   scaled_top_right7, scaled_top_right8, round);
   2966    dst += stride;
   2967  }
   2968 }
   2969 
   2970 void aom_smooth_h_predictor_64x64_ssse3(
   2971    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
   2972    const uint8_t *LIBAOM_RESTRICT top_row,
   2973    const uint8_t *LIBAOM_RESTRICT left_column) {
   2974  const __m128i top_right = _mm_set1_epi16(top_row[63]);
   2975  const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
   2976  const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
   2977  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
   2978  const __m128i weights1 = cvtepu8_epi16(weights_lolo);
   2979  const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
   2980  const __m128i weights3 = cvtepu8_epi16(weights_lohi);
   2981  const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
   2982  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
   2983  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
   2984  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
   2985  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
   2986  const __m128i scaled_top_right1 =
   2987      _mm_mullo_epi16(inverted_weights1, top_right);
   2988  const __m128i scaled_top_right2 =
   2989      _mm_mullo_epi16(inverted_weights2, top_right);
   2990  const __m128i scaled_top_right3 =
   2991      _mm_mullo_epi16(inverted_weights3, top_right);
   2992  const __m128i scaled_top_right4 =
   2993      _mm_mullo_epi16(inverted_weights4, top_right);
   2994  const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92);
   2995  const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108);
   2996  const __m128i weights5 = cvtepu8_epi16(weights_hilo);
   2997  const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
   2998  const __m128i weights7 = cvtepu8_epi16(weights_hihi);
   2999  const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
   3000  const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
   3001  const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
   3002  const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
   3003  const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
   3004  const __m128i scaled_top_right5 =
   3005      _mm_mullo_epi16(inverted_weights5, top_right);
   3006  const __m128i scaled_top_right6 =
   3007      _mm_mullo_epi16(inverted_weights6, top_right);
   3008  const __m128i scaled_top_right7 =
   3009      _mm_mullo_epi16(inverted_weights7, top_right);
   3010  const __m128i scaled_top_right8 =
   3011      _mm_mullo_epi16(inverted_weights8, top_right);
   3012  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
   3013  for (int left_offset = 0; left_offset < 64; left_offset += 8) {
   3014    const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
   3015    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
   3016      const __m128i y_select = _mm_set1_epi32(y_mask);
   3017      const __m128i left_y = _mm_shuffle_epi8(left, y_select);
   3018      write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
   3019                                     scaled_top_right1, scaled_top_right2,
   3020                                     round);
   3021      write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3,
   3022                                     weights4, scaled_top_right3,
   3023                                     scaled_top_right4, round);
   3024      write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5,
   3025                                     weights6, scaled_top_right5,
   3026                                     scaled_top_right6, round);
   3027      write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7,
   3028                                     weights8, scaled_top_right7,
   3029                                     scaled_top_right8, round);
   3030      dst += stride;
   3031    }
   3032  }
   3033 }