tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

intrapred_sse2.c (56011B)


      1 /*
      2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
#include <emmintrin.h>
#include <string.h>
#include "aom_dsp/x86/intrapred_x86.h"
#include "config/aom_dsp_rtcd.h"
     15 
     16 static inline void dc_store_4xh(uint32_t dc, int height, uint8_t *dst,
     17                                ptrdiff_t stride) {
     18  for (int i = 0; i < height; i += 2) {
     19    *(uint32_t *)dst = dc;
     20    dst += stride;
     21    *(uint32_t *)dst = dc;
     22    dst += stride;
     23  }
     24 }
     25 
     26 static inline void dc_store_8xh(const __m128i *row, int height, uint8_t *dst,
     27                                ptrdiff_t stride) {
     28  int i;
     29  for (i = 0; i < height; ++i) {
     30    _mm_storel_epi64((__m128i *)dst, *row);
     31    dst += stride;
     32  }
     33 }
     34 
     35 static inline void dc_store_16xh(const __m128i *row, int height, uint8_t *dst,
     36                                 ptrdiff_t stride) {
     37  int i;
     38  for (i = 0; i < height; ++i) {
     39    _mm_store_si128((__m128i *)dst, *row);
     40    dst += stride;
     41  }
     42 }
     43 
     44 static inline void dc_store_32xh(const __m128i *row, int height, uint8_t *dst,
     45                                 ptrdiff_t stride) {
     46  int i;
     47  for (i = 0; i < height; ++i) {
     48    _mm_store_si128((__m128i *)dst, *row);
     49    _mm_store_si128((__m128i *)(dst + 16), *row);
     50    dst += stride;
     51  }
     52 }
     53 
     54 static inline void dc_store_64xh(const __m128i *row, int height, uint8_t *dst,
     55                                 ptrdiff_t stride) {
     56  for (int i = 0; i < height; ++i) {
     57    _mm_store_si128((__m128i *)dst, *row);
     58    _mm_store_si128((__m128i *)(dst + 16), *row);
     59    _mm_store_si128((__m128i *)(dst + 32), *row);
     60    _mm_store_si128((__m128i *)(dst + 48), *row);
     61    dst += stride;
     62  }
     63 }
     64 
     65 static inline __m128i dc_sum_4(const uint8_t *ref) {
     66  __m128i x = _mm_loadl_epi64((__m128i const *)ref);
     67  const __m128i zero = _mm_setzero_si128();
     68  x = _mm_unpacklo_epi8(x, zero);
     69  return _mm_sad_epu8(x, zero);
     70 }
     71 
     72 static inline __m128i dc_sum_8(const uint8_t *ref) {
     73  __m128i x = _mm_loadl_epi64((__m128i const *)ref);
     74  const __m128i zero = _mm_setzero_si128();
     75  return _mm_sad_epu8(x, zero);
     76 }
     77 
     78 static inline __m128i dc_sum_64(const uint8_t *ref) {
     79  __m128i x0 = _mm_load_si128((__m128i const *)ref);
     80  __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
     81  __m128i x2 = _mm_load_si128((__m128i const *)(ref + 32));
     82  __m128i x3 = _mm_load_si128((__m128i const *)(ref + 48));
     83  const __m128i zero = _mm_setzero_si128();
     84  x0 = _mm_sad_epu8(x0, zero);
     85  x1 = _mm_sad_epu8(x1, zero);
     86  x2 = _mm_sad_epu8(x2, zero);
     87  x3 = _mm_sad_epu8(x3, zero);
     88  x0 = _mm_add_epi16(x0, x1);
     89  x2 = _mm_add_epi16(x2, x3);
     90  x0 = _mm_add_epi16(x0, x2);
     91  const __m128i high = _mm_unpackhi_epi64(x0, x0);
     92  return _mm_add_epi16(x0, high);
     93 }
     94 
     95 #define DC_MULTIPLIER_1X2 0x5556
     96 #define DC_MULTIPLIER_1X4 0x3334
     97 
     98 #define DC_SHIFT2 16
     99 
    100 static inline int divide_using_multiply_shift(int num, int shift1,
    101                                              int multiplier) {
    102  const int interm = num >> shift1;
    103  return interm * multiplier >> DC_SHIFT2;
    104 }
    105 
    106 // -----------------------------------------------------------------------------
    107 // DC_PRED
    108 
    109 void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
    110                               const uint8_t *above, const uint8_t *left) {
    111  const __m128i sum_left = dc_sum_8(left);
    112  __m128i sum_above = dc_sum_4(above);
    113  sum_above = _mm_add_epi16(sum_left, sum_above);
    114 
    115  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
    116  sum += 6;
    117  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
    118 
    119  const __m128i row = _mm_set1_epi8((int8_t)sum);
    120  const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(row);
    121  dc_store_4xh(pred, 8, dst, stride);
    122 }
    123 
    124 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    125 void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
    126                                const uint8_t *above, const uint8_t *left) {
    127  const __m128i sum_left = dc_sum_16_sse2(left);
    128  __m128i sum_above = dc_sum_4(above);
    129  sum_above = _mm_add_epi16(sum_left, sum_above);
    130 
    131  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
    132  sum += 10;
    133  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
    134 
    135  const __m128i row = _mm_set1_epi8((int8_t)sum);
    136  const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(row);
    137  dc_store_4xh(pred, 16, dst, stride);
    138 }
    139 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    140 
    141 void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
    142                               const uint8_t *above, const uint8_t *left) {
    143  const __m128i sum_left = dc_sum_4(left);
    144  __m128i sum_above = dc_sum_8(above);
    145  sum_above = _mm_add_epi16(sum_above, sum_left);
    146 
    147  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
    148  sum += 6;
    149  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
    150 
    151  const __m128i row = _mm_set1_epi8((int8_t)sum);
    152  dc_store_8xh(&row, 4, dst, stride);
    153 }
    154 
    155 void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
    156                                const uint8_t *above, const uint8_t *left) {
    157  const __m128i sum_left = dc_sum_16_sse2(left);
    158  __m128i sum_above = dc_sum_8(above);
    159  sum_above = _mm_add_epi16(sum_above, sum_left);
    160 
    161  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
    162  sum += 12;
    163  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
    164  const __m128i row = _mm_set1_epi8((int8_t)sum);
    165  dc_store_8xh(&row, 16, dst, stride);
    166 }
    167 
    168 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    169 void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
    170                                const uint8_t *above, const uint8_t *left) {
    171  const __m128i sum_left = dc_sum_32_sse2(left);
    172  __m128i sum_above = dc_sum_8(above);
    173  sum_above = _mm_add_epi16(sum_above, sum_left);
    174 
    175  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
    176  sum += 20;
    177  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
    178  const __m128i row = _mm_set1_epi8((int8_t)sum);
    179  dc_store_8xh(&row, 32, dst, stride);
    180 }
    181 
    182 void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
    183                                const uint8_t *above, const uint8_t *left) {
    184  const __m128i sum_left = dc_sum_4(left);
    185  __m128i sum_above = dc_sum_16_sse2(above);
    186  sum_above = _mm_add_epi16(sum_above, sum_left);
    187 
    188  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
    189  sum += 10;
    190  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
    191  const __m128i row = _mm_set1_epi8((int8_t)sum);
    192  dc_store_16xh(&row, 4, dst, stride);
    193 }
    194 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    195 
    196 void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
    197                                const uint8_t *above, const uint8_t *left) {
    198  const __m128i sum_left = dc_sum_8(left);
    199  __m128i sum_above = dc_sum_16_sse2(above);
    200  sum_above = _mm_add_epi16(sum_above, sum_left);
    201 
    202  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
    203  sum += 12;
    204  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
    205  const __m128i row = _mm_set1_epi8((int8_t)sum);
    206  dc_store_16xh(&row, 8, dst, stride);
    207 }
    208 
    209 void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
    210                                 const uint8_t *above, const uint8_t *left) {
    211  const __m128i sum_left = dc_sum_32_sse2(left);
    212  __m128i sum_above = dc_sum_16_sse2(above);
    213  sum_above = _mm_add_epi16(sum_left, sum_above);
    214 
    215  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
    216  sum += 24;
    217  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
    218  const __m128i row = _mm_set1_epi8((int8_t)sum);
    219  dc_store_16xh(&row, 32, dst, stride);
    220 }
    221 
    222 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    223 void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
    224                                 const uint8_t *above, const uint8_t *left) {
    225  const __m128i sum_left = dc_sum_64(left);
    226  __m128i sum_above = dc_sum_16_sse2(above);
    227  sum_above = _mm_add_epi16(sum_left, sum_above);
    228 
    229  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
    230  sum += 40;
    231  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);
    232  const __m128i row = _mm_set1_epi8((int8_t)sum);
    233  dc_store_16xh(&row, 64, dst, stride);
    234 }
    235 
    236 void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
    237                                const uint8_t *above, const uint8_t *left) {
    238  __m128i sum_above = dc_sum_32_sse2(above);
    239  const __m128i sum_left = dc_sum_8(left);
    240  sum_above = _mm_add_epi16(sum_above, sum_left);
    241 
    242  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
    243  sum += 20;
    244  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
    245  const __m128i row = _mm_set1_epi8((int8_t)sum);
    246  dc_store_32xh(&row, 8, dst, stride);
    247 }
    248 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    249 
    250 void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
    251                                 const uint8_t *above, const uint8_t *left) {
    252  __m128i sum_above = dc_sum_32_sse2(above);
    253  const __m128i sum_left = dc_sum_16_sse2(left);
    254  sum_above = _mm_add_epi16(sum_above, sum_left);
    255 
    256  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
    257  sum += 24;
    258  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
    259  const __m128i row = _mm_set1_epi8((int8_t)sum);
    260  dc_store_32xh(&row, 16, dst, stride);
    261 }
    262 
    263 void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
    264                                 const uint8_t *above, const uint8_t *left) {
    265  __m128i sum_above = dc_sum_32_sse2(above);
    266  const __m128i sum_left = dc_sum_64(left);
    267  sum_above = _mm_add_epi16(sum_above, sum_left);
    268 
    269  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
    270  sum += 48;
    271  sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2);
    272  const __m128i row = _mm_set1_epi8((int8_t)sum);
    273  dc_store_32xh(&row, 64, dst, stride);
    274 }
    275 
    276 void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
    277                                 const uint8_t *above, const uint8_t *left) {
    278  __m128i sum_above = dc_sum_64(above);
    279  const __m128i sum_left = dc_sum_64(left);
    280  sum_above = _mm_add_epi16(sum_above, sum_left);
    281 
    282  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
    283  sum += 64;
    284  sum /= 128;
    285  const __m128i row = _mm_set1_epi8((int8_t)sum);
    286  dc_store_64xh(&row, 64, dst, stride);
    287 }
    288 
    289 void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
    290                                 const uint8_t *above, const uint8_t *left) {
    291  __m128i sum_above = dc_sum_64(above);
    292  const __m128i sum_left = dc_sum_32_sse2(left);
    293  sum_above = _mm_add_epi16(sum_above, sum_left);
    294 
    295  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
    296  sum += 48;
    297  sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2);
    298  const __m128i row = _mm_set1_epi8((int8_t)sum);
    299  dc_store_64xh(&row, 32, dst, stride);
    300 }
    301 
    302 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    303 void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
    304                                 const uint8_t *above, const uint8_t *left) {
    305  __m128i sum_above = dc_sum_64(above);
    306  const __m128i sum_left = dc_sum_16_sse2(left);
    307  sum_above = _mm_add_epi16(sum_above, sum_left);
    308 
    309  uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
    310  sum += 40;
    311  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);
    312  const __m128i row = _mm_set1_epi8((int8_t)sum);
    313  dc_store_64xh(&row, 16, dst, stride);
    314 }
    315 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    316 
    317 // -----------------------------------------------------------------------------
    318 // DC_TOP
    319 
    320 void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
    321                                   const uint8_t *above, const uint8_t *left) {
    322  (void)left;
    323  __m128i sum_above = dc_sum_4(above);
    324  const __m128i two = _mm_set1_epi16(2);
    325  sum_above = _mm_add_epi16(sum_above, two);
    326  sum_above = _mm_srai_epi16(sum_above, 2);
    327  sum_above = _mm_shufflelo_epi16(sum_above, 0);
    328  sum_above = _mm_packus_epi16(sum_above, sum_above);
    329 
    330  const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_above);
    331  dc_store_4xh(pred, 8, dst, stride);
    332 }
    333 
    334 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    335 void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
    336                                    const uint8_t *above, const uint8_t *left) {
    337  (void)left;
    338  __m128i sum_above = dc_sum_4(above);
    339  const __m128i two = _mm_set1_epi16(2);
    340  sum_above = _mm_add_epi16(sum_above, two);
    341  sum_above = _mm_srai_epi16(sum_above, 2);
    342  sum_above = _mm_shufflelo_epi16(sum_above, 0);
    343  sum_above = _mm_packus_epi16(sum_above, sum_above);
    344 
    345  const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_above);
    346  dc_store_4xh(pred, 16, dst, stride);
    347 }
    348 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    349 
    350 void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
    351                                   const uint8_t *above, const uint8_t *left) {
    352  (void)left;
    353  __m128i sum_above = dc_sum_8(above);
    354  const __m128i four = _mm_set1_epi16(4);
    355  sum_above = _mm_add_epi16(sum_above, four);
    356  sum_above = _mm_srai_epi16(sum_above, 3);
    357  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
    358  const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
    359  dc_store_8xh(&row, 4, dst, stride);
    360 }
    361 
    362 void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
    363                                    const uint8_t *above, const uint8_t *left) {
    364  (void)left;
    365  __m128i sum_above = dc_sum_8(above);
    366  const __m128i four = _mm_set1_epi16(4);
    367  sum_above = _mm_add_epi16(sum_above, four);
    368  sum_above = _mm_srai_epi16(sum_above, 3);
    369  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
    370  const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
    371  dc_store_8xh(&row, 16, dst, stride);
    372 }
    373 
    374 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    375 void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
    376                                    const uint8_t *above, const uint8_t *left) {
    377  (void)left;
    378  __m128i sum_above = dc_sum_8(above);
    379  const __m128i four = _mm_set1_epi16(4);
    380  sum_above = _mm_add_epi16(sum_above, four);
    381  sum_above = _mm_srai_epi16(sum_above, 3);
    382  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
    383  const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
    384  dc_store_8xh(&row, 32, dst, stride);
    385 }
    386 
    387 void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
    388                                    const uint8_t *above, const uint8_t *left) {
    389  (void)left;
    390  __m128i sum_above = dc_sum_16_sse2(above);
    391  const __m128i eight = _mm_set1_epi16(8);
    392  sum_above = _mm_add_epi16(sum_above, eight);
    393  sum_above = _mm_srai_epi16(sum_above, 4);
    394  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
    395  sum_above = _mm_shufflelo_epi16(sum_above, 0);
    396  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
    397  dc_store_16xh(&row, 4, dst, stride);
    398 }
    399 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    400 
    401 void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
    402                                    const uint8_t *above, const uint8_t *left) {
    403  (void)left;
    404  __m128i sum_above = dc_sum_16_sse2(above);
    405  const __m128i eight = _mm_set1_epi16(8);
    406  sum_above = _mm_add_epi16(sum_above, eight);
    407  sum_above = _mm_srai_epi16(sum_above, 4);
    408  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
    409  sum_above = _mm_shufflelo_epi16(sum_above, 0);
    410  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
    411  dc_store_16xh(&row, 8, dst, stride);
    412 }
    413 
    414 void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
    415                                     const uint8_t *above,
    416                                     const uint8_t *left) {
    417  (void)left;
    418  __m128i sum_above = dc_sum_16_sse2(above);
    419  const __m128i eight = _mm_set1_epi16(8);
    420  sum_above = _mm_add_epi16(sum_above, eight);
    421  sum_above = _mm_srai_epi16(sum_above, 4);
    422  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
    423  sum_above = _mm_shufflelo_epi16(sum_above, 0);
    424  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
    425  dc_store_16xh(&row, 32, dst, stride);
    426 }
    427 
    428 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    429 void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
    430                                     const uint8_t *above,
    431                                     const uint8_t *left) {
    432  (void)left;
    433  __m128i sum_above = dc_sum_16_sse2(above);
    434  const __m128i eight = _mm_set1_epi16(8);
    435  sum_above = _mm_add_epi16(sum_above, eight);
    436  sum_above = _mm_srai_epi16(sum_above, 4);
    437  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
    438  sum_above = _mm_shufflelo_epi16(sum_above, 0);
    439  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
    440  dc_store_16xh(&row, 64, dst, stride);
    441 }
    442 
    443 void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
    444                                    const uint8_t *above, const uint8_t *left) {
    445  (void)left;
    446  __m128i sum_above = dc_sum_32_sse2(above);
    447  const __m128i sixteen = _mm_set1_epi16(16);
    448  sum_above = _mm_add_epi16(sum_above, sixteen);
    449  sum_above = _mm_srai_epi16(sum_above, 5);
    450  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
    451  sum_above = _mm_shufflelo_epi16(sum_above, 0);
    452  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
    453  dc_store_32xh(&row, 8, dst, stride);
    454 }
    455 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    456 
    457 void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
    458                                     const uint8_t *above,
    459                                     const uint8_t *left) {
    460  (void)left;
    461  __m128i sum_above = dc_sum_32_sse2(above);
    462  const __m128i sixteen = _mm_set1_epi16(16);
    463  sum_above = _mm_add_epi16(sum_above, sixteen);
    464  sum_above = _mm_srai_epi16(sum_above, 5);
    465  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
    466  sum_above = _mm_shufflelo_epi16(sum_above, 0);
    467  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
    468  dc_store_32xh(&row, 16, dst, stride);
    469 }
    470 
    471 void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
    472                                     const uint8_t *above,
    473                                     const uint8_t *left) {
    474  (void)left;
    475  __m128i sum_above = dc_sum_32_sse2(above);
    476  const __m128i sixteen = _mm_set1_epi16(16);
    477  sum_above = _mm_add_epi16(sum_above, sixteen);
    478  sum_above = _mm_srai_epi16(sum_above, 5);
    479  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
    480  sum_above = _mm_shufflelo_epi16(sum_above, 0);
    481  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
    482  dc_store_32xh(&row, 64, dst, stride);
    483 }
    484 
    485 void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
    486                                     const uint8_t *above,
    487                                     const uint8_t *left) {
    488  (void)left;
    489  __m128i sum_above = dc_sum_64(above);
    490  const __m128i thirtytwo = _mm_set1_epi16(32);
    491  sum_above = _mm_add_epi16(sum_above, thirtytwo);
    492  sum_above = _mm_srai_epi16(sum_above, 6);
    493  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
    494  sum_above = _mm_shufflelo_epi16(sum_above, 0);
    495  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
    496  dc_store_64xh(&row, 64, dst, stride);
    497 }
    498 
    499 void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
    500                                     const uint8_t *above,
    501                                     const uint8_t *left) {
    502  (void)left;
    503  __m128i sum_above = dc_sum_64(above);
    504  const __m128i thirtytwo = _mm_set1_epi16(32);
    505  sum_above = _mm_add_epi16(sum_above, thirtytwo);
    506  sum_above = _mm_srai_epi16(sum_above, 6);
    507  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
    508  sum_above = _mm_shufflelo_epi16(sum_above, 0);
    509  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
    510  dc_store_64xh(&row, 32, dst, stride);
    511 }
    512 
    513 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    514 void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
    515                                     const uint8_t *above,
    516                                     const uint8_t *left) {
    517  (void)left;
    518  __m128i sum_above = dc_sum_64(above);
    519  const __m128i thirtytwo = _mm_set1_epi16(32);
    520  sum_above = _mm_add_epi16(sum_above, thirtytwo);
    521  sum_above = _mm_srai_epi16(sum_above, 6);
    522  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
    523  sum_above = _mm_shufflelo_epi16(sum_above, 0);
    524  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
    525  dc_store_64xh(&row, 16, dst, stride);
    526 }
    527 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    528 
    529 // -----------------------------------------------------------------------------
    530 // DC_LEFT
    531 
    532 void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
    533                                    const uint8_t *above, const uint8_t *left) {
    534  (void)above;
    535  __m128i sum_left = dc_sum_8(left);
    536  const __m128i four = _mm_set1_epi16(4);
    537  sum_left = _mm_add_epi16(sum_left, four);
    538  sum_left = _mm_srai_epi16(sum_left, 3);
    539  sum_left = _mm_shufflelo_epi16(sum_left, 0);
    540  sum_left = _mm_packus_epi16(sum_left, sum_left);
    541 
    542  const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_left);
    543  dc_store_4xh(pred, 8, dst, stride);
    544 }
    545 
    546 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    547 void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
    548                                     const uint8_t *above,
    549                                     const uint8_t *left) {
    550  (void)above;
    551  __m128i sum_left = dc_sum_16_sse2(left);
    552  const __m128i eight = _mm_set1_epi16(8);
    553  sum_left = _mm_add_epi16(sum_left, eight);
    554  sum_left = _mm_srai_epi16(sum_left, 4);
    555  sum_left = _mm_shufflelo_epi16(sum_left, 0);
    556  sum_left = _mm_packus_epi16(sum_left, sum_left);
    557 
    558  const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_left);
    559  dc_store_4xh(pred, 16, dst, stride);
    560 }
    561 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    562 
    563 void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
    564                                    const uint8_t *above, const uint8_t *left) {
    565  (void)above;
    566  __m128i sum_left = dc_sum_4(left);
    567  const __m128i two = _mm_set1_epi16(2);
    568  sum_left = _mm_add_epi16(sum_left, two);
    569  sum_left = _mm_srai_epi16(sum_left, 2);
    570  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
    571  const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
    572  dc_store_8xh(&row, 4, dst, stride);
    573 }
    574 
    575 void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
    576                                     const uint8_t *above,
    577                                     const uint8_t *left) {
    578  (void)above;
    579  __m128i sum_left = dc_sum_16_sse2(left);
    580  const __m128i eight = _mm_set1_epi16(8);
    581  sum_left = _mm_add_epi16(sum_left, eight);
    582  sum_left = _mm_srai_epi16(sum_left, 4);
    583  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
    584  const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
    585  dc_store_8xh(&row, 16, dst, stride);
    586 }
    587 
    588 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    589 void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
    590                                     const uint8_t *above,
    591                                     const uint8_t *left) {
    592  (void)above;
    593  __m128i sum_left = dc_sum_32_sse2(left);
    594  const __m128i sixteen = _mm_set1_epi16(16);
    595  sum_left = _mm_add_epi16(sum_left, sixteen);
    596  sum_left = _mm_srai_epi16(sum_left, 5);
    597  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
    598  const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
    599  dc_store_8xh(&row, 32, dst, stride);
    600 }
    601 
    602 void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
    603                                     const uint8_t *above,
    604                                     const uint8_t *left) {
    605  (void)above;
    606  __m128i sum_left = dc_sum_4(left);
    607  const __m128i two = _mm_set1_epi16(2);
    608  sum_left = _mm_add_epi16(sum_left, two);
    609  sum_left = _mm_srai_epi16(sum_left, 2);
    610  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
    611  sum_left = _mm_shufflelo_epi16(sum_left, 0);
    612  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
    613  dc_store_16xh(&row, 4, dst, stride);
    614 }
    615 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    616 
    617 void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
    618                                     const uint8_t *above,
    619                                     const uint8_t *left) {
    620  (void)above;
    621  __m128i sum_left = dc_sum_8(left);
    622  const __m128i four = _mm_set1_epi16(4);
    623  sum_left = _mm_add_epi16(sum_left, four);
    624  sum_left = _mm_srai_epi16(sum_left, 3);
    625  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
    626  sum_left = _mm_shufflelo_epi16(sum_left, 0);
    627  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
    628  dc_store_16xh(&row, 8, dst, stride);
    629 }
    630 
    631 void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
    632                                      const uint8_t *above,
    633                                      const uint8_t *left) {
    634  (void)above;
    635  __m128i sum_left = dc_sum_32_sse2(left);
    636  const __m128i sixteen = _mm_set1_epi16(16);
    637  sum_left = _mm_add_epi16(sum_left, sixteen);
    638  sum_left = _mm_srai_epi16(sum_left, 5);
    639  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
    640  sum_left = _mm_shufflelo_epi16(sum_left, 0);
    641  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
    642  dc_store_16xh(&row, 32, dst, stride);
    643 }
    644 
    645 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    646 void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
    647                                      const uint8_t *above,
    648                                      const uint8_t *left) {
    649  (void)above;
    650  __m128i sum_left = dc_sum_64(left);
    651  const __m128i thirtytwo = _mm_set1_epi16(32);
    652  sum_left = _mm_add_epi16(sum_left, thirtytwo);
    653  sum_left = _mm_srai_epi16(sum_left, 6);
    654  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
    655  sum_left = _mm_shufflelo_epi16(sum_left, 0);
    656  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
    657  dc_store_16xh(&row, 64, dst, stride);
    658 }
    659 
    660 void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
    661                                     const uint8_t *above,
    662                                     const uint8_t *left) {
    663  (void)above;
    664  __m128i sum_left = dc_sum_8(left);
    665  const __m128i four = _mm_set1_epi16(4);
    666  sum_left = _mm_add_epi16(sum_left, four);
    667  sum_left = _mm_srai_epi16(sum_left, 3);
    668  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
    669  sum_left = _mm_shufflelo_epi16(sum_left, 0);
    670  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
    671  dc_store_32xh(&row, 8, dst, stride);
    672 }
    673 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    674 
    675 void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
    676                                      const uint8_t *above,
    677                                      const uint8_t *left) {
    678  (void)above;
    679  __m128i sum_left = dc_sum_16_sse2(left);
    680  const __m128i eight = _mm_set1_epi16(8);
    681  sum_left = _mm_add_epi16(sum_left, eight);
    682  sum_left = _mm_srai_epi16(sum_left, 4);
    683  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
    684  sum_left = _mm_shufflelo_epi16(sum_left, 0);
    685  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
    686  dc_store_32xh(&row, 16, dst, stride);
    687 }
    688 
    689 void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
    690                                      const uint8_t *above,
    691                                      const uint8_t *left) {
    692  (void)above;
    693  __m128i sum_left = dc_sum_64(left);
    694  const __m128i thirtytwo = _mm_set1_epi16(32);
    695  sum_left = _mm_add_epi16(sum_left, thirtytwo);
    696  sum_left = _mm_srai_epi16(sum_left, 6);
    697  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
    698  sum_left = _mm_shufflelo_epi16(sum_left, 0);
    699  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
    700  dc_store_32xh(&row, 64, dst, stride);
    701 }
    702 
    703 void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
    704                                      const uint8_t *above,
    705                                      const uint8_t *left) {
    706  (void)above;
    707  __m128i sum_left = dc_sum_64(left);
    708  const __m128i thirtytwo = _mm_set1_epi16(32);
    709  sum_left = _mm_add_epi16(sum_left, thirtytwo);
    710  sum_left = _mm_srai_epi16(sum_left, 6);
    711  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
    712  sum_left = _mm_shufflelo_epi16(sum_left, 0);
    713  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
    714  dc_store_64xh(&row, 64, dst, stride);
    715 }
    716 
    717 void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
    718                                      const uint8_t *above,
    719                                      const uint8_t *left) {
    720  (void)above;
    721  __m128i sum_left = dc_sum_32_sse2(left);
    722  const __m128i sixteen = _mm_set1_epi16(16);
    723  sum_left = _mm_add_epi16(sum_left, sixteen);
    724  sum_left = _mm_srai_epi16(sum_left, 5);
    725  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
    726  sum_left = _mm_shufflelo_epi16(sum_left, 0);
    727  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
    728  dc_store_64xh(&row, 32, dst, stride);
    729 }
    730 
    731 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    732 void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
    733                                      const uint8_t *above,
    734                                      const uint8_t *left) {
    735  (void)above;
    736  __m128i sum_left = dc_sum_16_sse2(left);
    737  const __m128i eight = _mm_set1_epi16(8);
    738  sum_left = _mm_add_epi16(sum_left, eight);
    739  sum_left = _mm_srai_epi16(sum_left, 4);
    740  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
    741  sum_left = _mm_shufflelo_epi16(sum_left, 0);
    742  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
    743  dc_store_64xh(&row, 16, dst, stride);
    744 }
    745 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    746 
    747 // -----------------------------------------------------------------------------
    748 // DC_128
    749 
    750 void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
    751                                   const uint8_t *above, const uint8_t *left) {
    752  (void)above;
    753  (void)left;
    754  const uint32_t pred = 0x80808080;
    755  dc_store_4xh(pred, 8, dst, stride);
    756 }
    757 
    758 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    759 void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
    760                                    const uint8_t *above, const uint8_t *left) {
    761  (void)above;
    762  (void)left;
    763  const uint32_t pred = 0x80808080;
    764  dc_store_4xh(pred, 16, dst, stride);
    765 }
    766 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    767 
    768 void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
    769                                   const uint8_t *above, const uint8_t *left) {
    770  (void)above;
    771  (void)left;
    772  const __m128i row = _mm_set1_epi8((int8_t)128);
    773  dc_store_8xh(&row, 4, dst, stride);
    774 }
    775 
    776 void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
    777                                    const uint8_t *above, const uint8_t *left) {
    778  (void)above;
    779  (void)left;
    780  const __m128i row = _mm_set1_epi8((int8_t)128);
    781  dc_store_8xh(&row, 16, dst, stride);
    782 }
    783 
    784 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    785 void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
    786                                    const uint8_t *above, const uint8_t *left) {
    787  (void)above;
    788  (void)left;
    789  const __m128i row = _mm_set1_epi8((int8_t)128);
    790  dc_store_8xh(&row, 32, dst, stride);
    791 }
    792 
    793 void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
    794                                    const uint8_t *above, const uint8_t *left) {
    795  (void)above;
    796  (void)left;
    797  const __m128i row = _mm_set1_epi8((int8_t)128);
    798  dc_store_16xh(&row, 4, dst, stride);
    799 }
    800 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    801 
    802 void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
    803                                    const uint8_t *above, const uint8_t *left) {
    804  (void)above;
    805  (void)left;
    806  const __m128i row = _mm_set1_epi8((int8_t)128);
    807  dc_store_16xh(&row, 8, dst, stride);
    808 }
    809 
    810 void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
    811                                     const uint8_t *above,
    812                                     const uint8_t *left) {
    813  (void)above;
    814  (void)left;
    815  const __m128i row = _mm_set1_epi8((int8_t)128);
    816  dc_store_16xh(&row, 32, dst, stride);
    817 }
    818 
    819 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    820 void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
    821                                     const uint8_t *above,
    822                                     const uint8_t *left) {
    823  (void)above;
    824  (void)left;
    825  const __m128i row = _mm_set1_epi8((int8_t)128);
    826  dc_store_16xh(&row, 64, dst, stride);
    827 }
    828 
    829 void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
    830                                    const uint8_t *above, const uint8_t *left) {
    831  (void)above;
    832  (void)left;
    833  const __m128i row = _mm_set1_epi8((int8_t)128);
    834  dc_store_32xh(&row, 8, dst, stride);
    835 }
    836 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    837 
    838 void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
    839                                     const uint8_t *above,
    840                                     const uint8_t *left) {
    841  (void)above;
    842  (void)left;
    843  const __m128i row = _mm_set1_epi8((int8_t)128);
    844  dc_store_32xh(&row, 16, dst, stride);
    845 }
    846 
    847 void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
    848                                     const uint8_t *above,
    849                                     const uint8_t *left) {
    850  (void)above;
    851  (void)left;
    852  const __m128i row = _mm_set1_epi8((int8_t)128);
    853  dc_store_32xh(&row, 64, dst, stride);
    854 }
    855 
    856 void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
    857                                     const uint8_t *above,
    858                                     const uint8_t *left) {
    859  (void)above;
    860  (void)left;
    861  const __m128i row = _mm_set1_epi8((int8_t)128);
    862  dc_store_64xh(&row, 64, dst, stride);
    863 }
    864 
    865 void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
    866                                     const uint8_t *above,
    867                                     const uint8_t *left) {
    868  (void)above;
    869  (void)left;
    870  const __m128i row = _mm_set1_epi8((int8_t)128);
    871  dc_store_64xh(&row, 32, dst, stride);
    872 }
    873 
    874 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    875 void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
    876                                     const uint8_t *above,
    877                                     const uint8_t *left) {
    878  (void)above;
    879  (void)left;
    880  const __m128i row = _mm_set1_epi8((int8_t)128);
    881  dc_store_64xh(&row, 16, dst, stride);
    882 }
    883 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    884 
    885 // -----------------------------------------------------------------------------
    886 // V_PRED
    887 
    888 void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
    889                              const uint8_t *above, const uint8_t *left) {
    890  const uint32_t pred = *(uint32_t *)above;
    891  (void)left;
    892  dc_store_4xh(pred, 8, dst, stride);
    893 }
    894 
    895 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    896 void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
    897                               const uint8_t *above, const uint8_t *left) {
    898  const uint32_t pred = *(uint32_t *)above;
    899  (void)left;
    900  dc_store_4xh(pred, 16, dst, stride);
    901 }
    902 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    903 
    904 void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
    905                              const uint8_t *above, const uint8_t *left) {
    906  const __m128i row = _mm_loadl_epi64((__m128i const *)above);
    907  (void)left;
    908  dc_store_8xh(&row, 4, dst, stride);
    909 }
    910 
    911 void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
    912                               const uint8_t *above, const uint8_t *left) {
    913  const __m128i row = _mm_loadl_epi64((__m128i const *)above);
    914  (void)left;
    915  dc_store_8xh(&row, 16, dst, stride);
    916 }
    917 
    918 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    919 void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
    920                               const uint8_t *above, const uint8_t *left) {
    921  const __m128i row = _mm_loadl_epi64((__m128i const *)above);
    922  (void)left;
    923  dc_store_8xh(&row, 32, dst, stride);
    924 }
    925 
    926 void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
    927                               const uint8_t *above, const uint8_t *left) {
    928  const __m128i row = _mm_load_si128((__m128i const *)above);
    929  (void)left;
    930  dc_store_16xh(&row, 4, dst, stride);
    931 }
    932 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    933 
    934 void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
    935                               const uint8_t *above, const uint8_t *left) {
    936  const __m128i row = _mm_load_si128((__m128i const *)above);
    937  (void)left;
    938  dc_store_16xh(&row, 8, dst, stride);
    939 }
    940 
    941 void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
    942                                const uint8_t *above, const uint8_t *left) {
    943  const __m128i row = _mm_load_si128((__m128i const *)above);
    944  (void)left;
    945  dc_store_16xh(&row, 32, dst, stride);
    946 }
    947 
    948 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    949 void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
    950                                const uint8_t *above, const uint8_t *left) {
    951  const __m128i row = _mm_load_si128((__m128i const *)above);
    952  (void)left;
    953  dc_store_16xh(&row, 64, dst, stride);
    954 }
    955 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    956 
    957 static inline void v_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
    958                                    const uint8_t *above, int height) {
    959  const __m128i row0 = _mm_load_si128((__m128i const *)above);
    960  const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
    961  for (int i = 0; i < height; ++i) {
    962    _mm_store_si128((__m128i *)dst, row0);
    963    _mm_store_si128((__m128i *)(dst + 16), row1);
    964    dst += stride;
    965  }
    966 }
    967 
    968 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    969 void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
    970                               const uint8_t *above, const uint8_t *left) {
    971  (void)left;
    972  v_predictor_32xh(dst, stride, above, 8);
    973 }
    974 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    975 
    976 void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
    977                                const uint8_t *above, const uint8_t *left) {
    978  (void)left;
    979  v_predictor_32xh(dst, stride, above, 16);
    980 }
    981 
    982 void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
    983                                const uint8_t *above, const uint8_t *left) {
    984  (void)left;
    985  v_predictor_32xh(dst, stride, above, 64);
    986 }
    987 
    988 static inline void v_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
    989                                    const uint8_t *above, int height) {
    990  const __m128i row0 = _mm_load_si128((__m128i const *)above);
    991  const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
    992  const __m128i row2 = _mm_load_si128((__m128i const *)(above + 32));
    993  const __m128i row3 = _mm_load_si128((__m128i const *)(above + 48));
    994  for (int i = 0; i < height; ++i) {
    995    _mm_store_si128((__m128i *)dst, row0);
    996    _mm_store_si128((__m128i *)(dst + 16), row1);
    997    _mm_store_si128((__m128i *)(dst + 32), row2);
    998    _mm_store_si128((__m128i *)(dst + 48), row3);
    999    dst += stride;
   1000  }
   1001 }
   1002 
   1003 void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
   1004                                const uint8_t *above, const uint8_t *left) {
   1005  (void)left;
   1006  v_predictor_64xh(dst, stride, above, 64);
   1007 }
   1008 
   1009 void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
   1010                                const uint8_t *above, const uint8_t *left) {
   1011  (void)left;
   1012  v_predictor_64xh(dst, stride, above, 32);
   1013 }
   1014 
   1015 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   1016 void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
   1017                                const uint8_t *above, const uint8_t *left) {
   1018  (void)left;
   1019  v_predictor_64xh(dst, stride, above, 16);
   1020 }
   1021 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   1022 
   1023 // -----------------------------------------------------------------------------
   1024 // H_PRED
   1025 
   1026 void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
   1027                              const uint8_t *above, const uint8_t *left) {
   1028  (void)above;
   1029  __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
   1030  left_col = _mm_unpacklo_epi8(left_col, left_col);
   1031  __m128i row0 = _mm_shufflelo_epi16(left_col, 0);
   1032  __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55);
   1033  __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa);
   1034  __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff);
   1035  *(int *)dst = _mm_cvtsi128_si32(row0);
   1036  dst += stride;
   1037  *(int *)dst = _mm_cvtsi128_si32(row1);
   1038  dst += stride;
   1039  *(int *)dst = _mm_cvtsi128_si32(row2);
   1040  dst += stride;
   1041  *(int *)dst = _mm_cvtsi128_si32(row3);
   1042  dst += stride;
   1043  left_col = _mm_unpackhi_epi64(left_col, left_col);
   1044  row0 = _mm_shufflelo_epi16(left_col, 0);
   1045  row1 = _mm_shufflelo_epi16(left_col, 0x55);
   1046  row2 = _mm_shufflelo_epi16(left_col, 0xaa);
   1047  row3 = _mm_shufflelo_epi16(left_col, 0xff);
   1048  *(int *)dst = _mm_cvtsi128_si32(row0);
   1049  dst += stride;
   1050  *(int *)dst = _mm_cvtsi128_si32(row1);
   1051  dst += stride;
   1052  *(int *)dst = _mm_cvtsi128_si32(row2);
   1053  dst += stride;
   1054  *(int *)dst = _mm_cvtsi128_si32(row3);
   1055 }
   1056 
   1057 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   1058 void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
   1059                               const uint8_t *above, const uint8_t *left) {
   1060  (void)above;
   1061  const __m128i left_col = _mm_load_si128((__m128i const *)left);
   1062  __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col);
   1063  __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col);
   1064 
   1065  __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0);
   1066  __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
   1067  __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
   1068  __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
   1069  *(int *)dst = _mm_cvtsi128_si32(row0);
   1070  dst += stride;
   1071  *(int *)dst = _mm_cvtsi128_si32(row1);
   1072  dst += stride;
   1073  *(int *)dst = _mm_cvtsi128_si32(row2);
   1074  dst += stride;
   1075  *(int *)dst = _mm_cvtsi128_si32(row3);
   1076  dst += stride;
   1077 
   1078  left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
   1079  row0 = _mm_shufflelo_epi16(left_col_low, 0);
   1080  row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
   1081  row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
   1082  row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
   1083  *(int *)dst = _mm_cvtsi128_si32(row0);
   1084  dst += stride;
   1085  *(int *)dst = _mm_cvtsi128_si32(row1);
   1086  dst += stride;
   1087  *(int *)dst = _mm_cvtsi128_si32(row2);
   1088  dst += stride;
   1089  *(int *)dst = _mm_cvtsi128_si32(row3);
   1090  dst += stride;
   1091 
   1092  row0 = _mm_shufflelo_epi16(left_col_high, 0);
   1093  row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
   1094  row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
   1095  row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
   1096  *(int *)dst = _mm_cvtsi128_si32(row0);
   1097  dst += stride;
   1098  *(int *)dst = _mm_cvtsi128_si32(row1);
   1099  dst += stride;
   1100  *(int *)dst = _mm_cvtsi128_si32(row2);
   1101  dst += stride;
   1102  *(int *)dst = _mm_cvtsi128_si32(row3);
   1103  dst += stride;
   1104 
   1105  left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
   1106  row0 = _mm_shufflelo_epi16(left_col_high, 0);
   1107  row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
   1108  row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
   1109  row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
   1110  *(int *)dst = _mm_cvtsi128_si32(row0);
   1111  dst += stride;
   1112  *(int *)dst = _mm_cvtsi128_si32(row1);
   1113  dst += stride;
   1114  *(int *)dst = _mm_cvtsi128_si32(row2);
   1115  dst += stride;
   1116  *(int *)dst = _mm_cvtsi128_si32(row3);
   1117 }
   1118 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   1119 
   1120 void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
   1121                              const uint8_t *above, const uint8_t *left) {
   1122  (void)above;
   1123  __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
   1124  left_col = _mm_unpacklo_epi8(left_col, left_col);
   1125  __m128i row0 = _mm_shufflelo_epi16(left_col, 0);
   1126  __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55);
   1127  __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa);
   1128  __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff);
   1129  _mm_storel_epi64((__m128i *)dst, row0);
   1130  dst += stride;
   1131  _mm_storel_epi64((__m128i *)dst, row1);
   1132  dst += stride;
   1133  _mm_storel_epi64((__m128i *)dst, row2);
   1134  dst += stride;
   1135  _mm_storel_epi64((__m128i *)dst, row3);
   1136 }
   1137 
   1138 static inline void h_predictor_8x16xc(uint8_t *dst, ptrdiff_t stride,
   1139                                      const uint8_t *above, const uint8_t *left,
   1140                                      int count) {
   1141  (void)above;
   1142  for (int i = 0; i < count; ++i) {
   1143    const __m128i left_col = _mm_load_si128((__m128i const *)left);
   1144    __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col);
   1145    __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col);
   1146 
   1147    __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0);
   1148    __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
   1149    __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
   1150    __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
   1151    _mm_storel_epi64((__m128i *)dst, row0);
   1152    dst += stride;
   1153    _mm_storel_epi64((__m128i *)dst, row1);
   1154    dst += stride;
   1155    _mm_storel_epi64((__m128i *)dst, row2);
   1156    dst += stride;
   1157    _mm_storel_epi64((__m128i *)dst, row3);
   1158    dst += stride;
   1159 
   1160    left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
   1161    row0 = _mm_shufflelo_epi16(left_col_low, 0);
   1162    row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
   1163    row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
   1164    row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
   1165    _mm_storel_epi64((__m128i *)dst, row0);
   1166    dst += stride;
   1167    _mm_storel_epi64((__m128i *)dst, row1);
   1168    dst += stride;
   1169    _mm_storel_epi64((__m128i *)dst, row2);
   1170    dst += stride;
   1171    _mm_storel_epi64((__m128i *)dst, row3);
   1172    dst += stride;
   1173 
   1174    row0 = _mm_shufflelo_epi16(left_col_high, 0);
   1175    row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
   1176    row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
   1177    row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
   1178    _mm_storel_epi64((__m128i *)dst, row0);
   1179    dst += stride;
   1180    _mm_storel_epi64((__m128i *)dst, row1);
   1181    dst += stride;
   1182    _mm_storel_epi64((__m128i *)dst, row2);
   1183    dst += stride;
   1184    _mm_storel_epi64((__m128i *)dst, row3);
   1185    dst += stride;
   1186 
   1187    left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
   1188    row0 = _mm_shufflelo_epi16(left_col_high, 0);
   1189    row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
   1190    row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
   1191    row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
   1192    _mm_storel_epi64((__m128i *)dst, row0);
   1193    dst += stride;
   1194    _mm_storel_epi64((__m128i *)dst, row1);
   1195    dst += stride;
   1196    _mm_storel_epi64((__m128i *)dst, row2);
   1197    dst += stride;
   1198    _mm_storel_epi64((__m128i *)dst, row3);
   1199    dst += stride;
   1200    left += 16;
   1201  }
   1202 }
   1203 
   1204 void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
   1205                               const uint8_t *above, const uint8_t *left) {
   1206  h_predictor_8x16xc(dst, stride, above, left, 1);
   1207 }
   1208 
   1209 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   1210 void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
   1211                               const uint8_t *above, const uint8_t *left) {
   1212  h_predictor_8x16xc(dst, stride, above, left, 2);
   1213 }
   1214 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   1215 
   1216 static inline void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst,
   1217                                     ptrdiff_t stride) {
   1218  int i;
   1219  for (i = 0; i < h; ++i) {
   1220    _mm_store_si128((__m128i *)dst, row[i]);
   1221    dst += stride;
   1222  }
   1223 }
   1224 
   1225 static inline void repeat_low_4pixels(const __m128i *x, __m128i *row) {
   1226  const __m128i u0 = _mm_shufflelo_epi16(*x, 0);
   1227  const __m128i u1 = _mm_shufflelo_epi16(*x, 0x55);
   1228  const __m128i u2 = _mm_shufflelo_epi16(*x, 0xaa);
   1229  const __m128i u3 = _mm_shufflelo_epi16(*x, 0xff);
   1230 
   1231  row[0] = _mm_unpacklo_epi64(u0, u0);
   1232  row[1] = _mm_unpacklo_epi64(u1, u1);
   1233  row[2] = _mm_unpacklo_epi64(u2, u2);
   1234  row[3] = _mm_unpacklo_epi64(u3, u3);
   1235 }
   1236 
   1237 static inline void repeat_high_4pixels(const __m128i *x, __m128i *row) {
   1238  const __m128i u0 = _mm_shufflehi_epi16(*x, 0);
   1239  const __m128i u1 = _mm_shufflehi_epi16(*x, 0x55);
   1240  const __m128i u2 = _mm_shufflehi_epi16(*x, 0xaa);
   1241  const __m128i u3 = _mm_shufflehi_epi16(*x, 0xff);
   1242 
   1243  row[0] = _mm_unpackhi_epi64(u0, u0);
   1244  row[1] = _mm_unpackhi_epi64(u1, u1);
   1245  row[2] = _mm_unpackhi_epi64(u2, u2);
   1246  row[3] = _mm_unpackhi_epi64(u3, u3);
   1247 }
   1248 
   1249 // Process 16x8, first 4 rows
   1250 // Use first 8 bytes of left register: xxxxxxxx33221100
   1251 static inline void h_prediction_16x8_1(const __m128i *left, uint8_t *dst,
   1252                                       ptrdiff_t stride) {
   1253  __m128i row[4];
   1254  repeat_low_4pixels(left, row);
   1255  h_pred_store_16xh(row, 4, dst, stride);
   1256 }
   1257 
   1258 // Process 16x8, second 4 rows
   1259 // Use second 8 bytes of left register: 77665544xxxxxxxx
   1260 static inline void h_prediction_16x8_2(const __m128i *left, uint8_t *dst,
   1261                                       ptrdiff_t stride) {
   1262  __m128i row[4];
   1263  repeat_high_4pixels(left, row);
   1264  h_pred_store_16xh(row, 4, dst, stride);
   1265 }
   1266 
   1267 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   1268 void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
   1269                               const uint8_t *above, const uint8_t *left) {
   1270  (void)above;
   1271  const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);
   1272  const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
   1273  h_prediction_16x8_1(&left_col_8p, dst, stride);
   1274 }
   1275 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   1276 
   1277 void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
   1278                               const uint8_t *above, const uint8_t *left) {
   1279  (void)above;
   1280  const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);
   1281  const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
   1282  h_prediction_16x8_1(&left_col_8p, dst, stride);
   1283  dst += stride << 2;
   1284  h_prediction_16x8_2(&left_col_8p, dst, stride);
   1285 }
   1286 
   1287 static inline void h_predictor_16xh(uint8_t *dst, ptrdiff_t stride,
   1288                                    const uint8_t *left, int count) {
   1289  int i = 0;
   1290  do {
   1291    const __m128i left_col = _mm_load_si128((const __m128i *)left);
   1292    const __m128i left_col_8p_lo = _mm_unpacklo_epi8(left_col, left_col);
   1293    h_prediction_16x8_1(&left_col_8p_lo, dst, stride);
   1294    dst += stride << 2;
   1295    h_prediction_16x8_2(&left_col_8p_lo, dst, stride);
   1296    dst += stride << 2;
   1297 
   1298    const __m128i left_col_8p_hi = _mm_unpackhi_epi8(left_col, left_col);
   1299    h_prediction_16x8_1(&left_col_8p_hi, dst, stride);
   1300    dst += stride << 2;
   1301    h_prediction_16x8_2(&left_col_8p_hi, dst, stride);
   1302    dst += stride << 2;
   1303 
   1304    left += 16;
   1305    i++;
   1306  } while (i < count);
   1307 }
   1308 
   1309 void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
   1310                                const uint8_t *above, const uint8_t *left) {
   1311  (void)above;
   1312  h_predictor_16xh(dst, stride, left, 2);
   1313 }
   1314 
   1315 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   1316 void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
   1317                                const uint8_t *above, const uint8_t *left) {
   1318  (void)above;
   1319  h_predictor_16xh(dst, stride, left, 4);
   1320 }
   1321 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   1322 
   1323 static inline void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst,
   1324                                     ptrdiff_t stride) {
   1325  int i;
   1326  for (i = 0; i < h; ++i) {
   1327    _mm_store_si128((__m128i *)dst, row[i]);
   1328    _mm_store_si128((__m128i *)(dst + 16), row[i]);
   1329    dst += stride;
   1330  }
   1331 }
   1332 
   1333 // Process 32x8, first 4 rows
   1334 // Use first 8 bytes of left register: xxxxxxxx33221100
   1335 static inline void h_prediction_32x8_1(const __m128i *left, uint8_t *dst,
   1336                                       ptrdiff_t stride) {
   1337  __m128i row[4];
   1338  repeat_low_4pixels(left, row);
   1339  h_pred_store_32xh(row, 4, dst, stride);
   1340 }
   1341 
   1342 // Process 32x8, second 4 rows
   1343 // Use second 8 bytes of left register: 77665544xxxxxxxx
   1344 static inline void h_prediction_32x8_2(const __m128i *left, uint8_t *dst,
   1345                                       ptrdiff_t stride) {
   1346  __m128i row[4];
   1347  repeat_high_4pixels(left, row);
   1348  h_pred_store_32xh(row, 4, dst, stride);
   1349 }
   1350 
   1351 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   1352 void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
   1353                               const uint8_t *above, const uint8_t *left) {
   1354  __m128i left_col, left_col_8p;
   1355  (void)above;
   1356 
   1357  left_col = _mm_load_si128((const __m128i *)left);
   1358 
   1359  left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
   1360  h_prediction_32x8_1(&left_col_8p, dst, stride);
   1361  dst += stride << 2;
   1362  h_prediction_32x8_2(&left_col_8p, dst, stride);
   1363 }
   1364 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   1365 
   1366 void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
   1367                                const uint8_t *above, const uint8_t *left) {
   1368  __m128i left_col, left_col_8p;
   1369  (void)above;
   1370 
   1371  left_col = _mm_load_si128((const __m128i *)left);
   1372 
   1373  left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
   1374  h_prediction_32x8_1(&left_col_8p, dst, stride);
   1375  dst += stride << 2;
   1376  h_prediction_32x8_2(&left_col_8p, dst, stride);
   1377  dst += stride << 2;
   1378 
   1379  left_col_8p = _mm_unpackhi_epi8(left_col, left_col);
   1380  h_prediction_32x8_1(&left_col_8p, dst, stride);
   1381  dst += stride << 2;
   1382  h_prediction_32x8_2(&left_col_8p, dst, stride);
   1383 }
   1384 
   1385 static inline void h_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
   1386                                    const uint8_t *left, int height) {
   1387  int i = height >> 2;
   1388  do {
   1389    __m128i left4 = _mm_cvtsi32_si128(((int *)left)[0]);
   1390    left4 = _mm_unpacklo_epi8(left4, left4);
   1391    left4 = _mm_unpacklo_epi8(left4, left4);
   1392    const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
   1393    const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
   1394    _mm_store_si128((__m128i *)dst, r0);
   1395    _mm_store_si128((__m128i *)(dst + 16), r0);
   1396    _mm_store_si128((__m128i *)(dst + stride), r1);
   1397    _mm_store_si128((__m128i *)(dst + stride + 16), r1);
   1398    const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
   1399    const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
   1400    _mm_store_si128((__m128i *)(dst + stride * 2), r2);
   1401    _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
   1402    _mm_store_si128((__m128i *)(dst + stride * 3), r3);
   1403    _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
   1404    left += 4;
   1405    dst += stride * 4;
   1406  } while (--i);
   1407 }
   1408 
   1409 void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
   1410                                const uint8_t *above, const uint8_t *left) {
   1411  (void)above;
   1412  h_predictor_32xh(dst, stride, left, 64);
   1413 }
   1414 
   1415 static inline void h_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
   1416                                    const uint8_t *left, int height) {
   1417  int i = height >> 2;
   1418  do {
   1419    __m128i left4 = _mm_cvtsi32_si128(((int *)left)[0]);
   1420    left4 = _mm_unpacklo_epi8(left4, left4);
   1421    left4 = _mm_unpacklo_epi8(left4, left4);
   1422    const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
   1423    const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
   1424    _mm_store_si128((__m128i *)dst, r0);
   1425    _mm_store_si128((__m128i *)(dst + 16), r0);
   1426    _mm_store_si128((__m128i *)(dst + 32), r0);
   1427    _mm_store_si128((__m128i *)(dst + 48), r0);
   1428    _mm_store_si128((__m128i *)(dst + stride), r1);
   1429    _mm_store_si128((__m128i *)(dst + stride + 16), r1);
   1430    _mm_store_si128((__m128i *)(dst + stride + 32), r1);
   1431    _mm_store_si128((__m128i *)(dst + stride + 48), r1);
   1432    const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
   1433    const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
   1434    _mm_store_si128((__m128i *)(dst + stride * 2), r2);
   1435    _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
   1436    _mm_store_si128((__m128i *)(dst + stride * 2 + 32), r2);
   1437    _mm_store_si128((__m128i *)(dst + stride * 2 + 48), r2);
   1438    _mm_store_si128((__m128i *)(dst + stride * 3), r3);
   1439    _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
   1440    _mm_store_si128((__m128i *)(dst + stride * 3 + 32), r3);
   1441    _mm_store_si128((__m128i *)(dst + stride * 3 + 48), r3);
   1442    left += 4;
   1443    dst += stride * 4;
   1444  } while (--i);
   1445 }
   1446 
   1447 void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
   1448                                const uint8_t *above, const uint8_t *left) {
   1449  (void)above;
   1450  h_predictor_64xh(dst, stride, left, 64);
   1451 }
   1452 
   1453 void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
   1454                                const uint8_t *above, const uint8_t *left) {
   1455  (void)above;
   1456  h_predictor_64xh(dst, stride, left, 32);
   1457 }
   1458 
   1459 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   1460 void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
   1461                                const uint8_t *above, const uint8_t *left) {
   1462  (void)above;
   1463  h_predictor_64xh(dst, stride, left, 16);
   1464 }
   1465 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER