tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

highbd_intrapred_sse2.c (37593B)


      1 /*
      2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <emmintrin.h>
     13 
     14 #include "config/aom_dsp_rtcd.h"
     15 
     16 // -----------------------------------------------------------------------------
     17 // H_PRED
     18 
     19 void aom_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
     20                                     const uint16_t *above,
     21                                     const uint16_t *left, int bd) {
     22  const __m128i left_u16 = _mm_loadl_epi64((const __m128i *)left);
     23  const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
     24  const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
     25  const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
     26  const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
     27  (void)above;
     28  (void)bd;
     29  _mm_storel_epi64((__m128i *)dst, row0);
     30  dst += stride;
     31  _mm_storel_epi64((__m128i *)dst, row1);
     32  dst += stride;
     33  _mm_storel_epi64((__m128i *)dst, row2);
     34  dst += stride;
     35  _mm_storel_epi64((__m128i *)dst, row3);
     36 }
     37 
     38 void aom_highbd_h_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
     39                                     const uint16_t *above,
     40                                     const uint16_t *left, int bd) {
     41  aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd);
     42  dst += stride << 2;
     43  left += 4;
     44  aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd);
     45 }
     46 
     47 void aom_highbd_h_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
     48                                     const uint16_t *above,
     49                                     const uint16_t *left, int bd) {
     50  const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
     51  const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
     52  const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
     53  const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
     54  const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
     55  (void)above;
     56  (void)bd;
     57  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0));
     58  dst += stride;
     59  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1));
     60  dst += stride;
     61  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2));
     62  dst += stride;
     63  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3));
     64 }
     65 
     66 void aom_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
     67                                     const uint16_t *above,
     68                                     const uint16_t *left, int bd) {
     69  const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
     70  const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
     71  const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
     72  const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
     73  const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
     74  const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
     75  const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
     76  const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
     77  const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
     78  (void)above;
     79  (void)bd;
     80  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0));
     81  dst += stride;
     82  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1));
     83  dst += stride;
     84  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2));
     85  dst += stride;
     86  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3));
     87  dst += stride;
     88  _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row4, row4));
     89  dst += stride;
     90  _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row5, row5));
     91  dst += stride;
     92  _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row6, row6));
     93  dst += stride;
     94  _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row7, row7));
     95 }
     96 
     97 void aom_highbd_h_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
     98                                      const uint16_t *above,
     99                                      const uint16_t *left, int bd) {
    100  aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd);
    101  dst += stride << 3;
    102  left += 8;
    103  aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd);
    104 }
    105 
    106 static inline void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride,
    107                                       const __m128i *row) {
    108  const __m128i val = _mm_unpacklo_epi64(*row, *row);
    109  _mm_store_si128((__m128i *)*dst, val);
    110  _mm_store_si128((__m128i *)(*dst + 8), val);
    111  *dst += stride;
    112 }
    113 
    114 static inline void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride,
    115                                       const __m128i *row) {
    116  const __m128i val = _mm_unpackhi_epi64(*row, *row);
    117  _mm_store_si128((__m128i *)(*dst), val);
    118  _mm_store_si128((__m128i *)(*dst + 8), val);
    119  *dst += stride;
    120 }
    121 
    122 static inline void h_predictor_16x8(uint16_t *dst, ptrdiff_t stride,
    123                                    const uint16_t *left) {
    124  const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
    125  const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
    126  const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
    127  const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
    128  const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
    129  const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
    130  const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
    131  const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
    132  const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
    133  h_store_16_unpacklo(&dst, stride, &row0);
    134  h_store_16_unpacklo(&dst, stride, &row1);
    135  h_store_16_unpacklo(&dst, stride, &row2);
    136  h_store_16_unpacklo(&dst, stride, &row3);
    137  h_store_16_unpackhi(&dst, stride, &row4);
    138  h_store_16_unpackhi(&dst, stride, &row5);
    139  h_store_16_unpackhi(&dst, stride, &row6);
    140  h_store_16_unpackhi(&dst, stride, &row7);
    141 }
    142 
    143 void aom_highbd_h_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
    144                                      const uint16_t *above,
    145                                      const uint16_t *left, int bd) {
    146  (void)above;
    147  (void)bd;
    148  h_predictor_16x8(dst, stride, left);
    149 }
    150 
    151 void aom_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
    152                                       const uint16_t *above,
    153                                       const uint16_t *left, int bd) {
    154  int i;
    155  (void)above;
    156  (void)bd;
    157 
    158  for (i = 0; i < 2; i++, left += 8) {
    159    h_predictor_16x8(dst, stride, left);
    160    dst += stride << 3;
    161  }
    162 }
    163 
    164 void aom_highbd_h_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
    165                                       const uint16_t *above,
    166                                       const uint16_t *left, int bd) {
    167  int i;
    168  (void)above;
    169  (void)bd;
    170 
    171  for (i = 0; i < 4; i++, left += 8) {
    172    h_predictor_16x8(dst, stride, left);
    173    dst += stride << 3;
    174  }
    175 }
    176 
    177 static inline void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride,
    178                                       const __m128i *row) {
    179  const __m128i val = _mm_unpacklo_epi64(*row, *row);
    180  _mm_store_si128((__m128i *)(*dst), val);
    181  _mm_store_si128((__m128i *)(*dst + 8), val);
    182  _mm_store_si128((__m128i *)(*dst + 16), val);
    183  _mm_store_si128((__m128i *)(*dst + 24), val);
    184  *dst += stride;
    185 }
    186 
    187 static inline void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride,
    188                                       const __m128i *row) {
    189  const __m128i val = _mm_unpackhi_epi64(*row, *row);
    190  _mm_store_si128((__m128i *)(*dst), val);
    191  _mm_store_si128((__m128i *)(*dst + 8), val);
    192  _mm_store_si128((__m128i *)(*dst + 16), val);
    193  _mm_store_si128((__m128i *)(*dst + 24), val);
    194  *dst += stride;
    195 }
    196 
    197 static inline void h_predictor_32x8(uint16_t *dst, ptrdiff_t stride,
    198                                    const uint16_t *left) {
    199  const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
    200  const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
    201  const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
    202  const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
    203  const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
    204  const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
    205  const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
    206  const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
    207  const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
    208  h_store_32_unpacklo(&dst, stride, &row0);
    209  h_store_32_unpacklo(&dst, stride, &row1);
    210  h_store_32_unpacklo(&dst, stride, &row2);
    211  h_store_32_unpacklo(&dst, stride, &row3);
    212  h_store_32_unpackhi(&dst, stride, &row4);
    213  h_store_32_unpackhi(&dst, stride, &row5);
    214  h_store_32_unpackhi(&dst, stride, &row6);
    215  h_store_32_unpackhi(&dst, stride, &row7);
    216 }
    217 
    218 void aom_highbd_h_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
    219                                       const uint16_t *above,
    220                                       const uint16_t *left, int bd) {
    221  int i;
    222  (void)above;
    223  (void)bd;
    224 
    225  for (i = 0; i < 2; i++, left += 8) {
    226    h_predictor_32x8(dst, stride, left);
    227    dst += stride << 3;
    228  }
    229 }
    230 
    231 void aom_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
    232                                       const uint16_t *above,
    233                                       const uint16_t *left, int bd) {
    234  int i;
    235  (void)above;
    236  (void)bd;
    237 
    238  for (i = 0; i < 4; i++, left += 8) {
    239    h_predictor_32x8(dst, stride, left);
    240    dst += stride << 3;
    241  }
    242 }
    243 
    244 // -----------------------------------------------------------------------------
    245 // DC_TOP, DC_LEFT, DC_128
    246 
    247 // 4x4
    248 
    249 static inline __m128i dc_sum_4(const uint16_t *ref) {
    250  const __m128i _dcba = _mm_loadl_epi64((const __m128i *)ref);
    251  const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe);
    252  const __m128i a = _mm_add_epi16(_dcba, _xxdc);
    253  return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1));
    254 }
    255 
    256 static inline void dc_store_4x4(uint16_t *dst, ptrdiff_t stride,
    257                                const __m128i *dc) {
    258  const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0);
    259  int i;
    260  for (i = 0; i < 4; ++i, dst += stride) {
    261    _mm_storel_epi64((__m128i *)dst, dc_dup);
    262  }
    263 }
    264 
    265 void aom_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
    266                                           const uint16_t *above,
    267                                           const uint16_t *left, int bd) {
    268  const __m128i two = _mm_cvtsi32_si128(2);
    269  const __m128i sum = dc_sum_4(left);
    270  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
    271  (void)above;
    272  (void)bd;
    273  dc_store_4x4(dst, stride, &dc);
    274 }
    275 
    276 void aom_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
    277                                          const uint16_t *above,
    278                                          const uint16_t *left, int bd) {
    279  const __m128i two = _mm_cvtsi32_si128(2);
    280  const __m128i sum = dc_sum_4(above);
    281  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
    282  (void)left;
    283  (void)bd;
    284  dc_store_4x4(dst, stride, &dc);
    285 }
    286 
    287 void aom_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
    288                                          const uint16_t *above,
    289                                          const uint16_t *left, int bd) {
    290  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
    291  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
    292  (void)above;
    293  (void)left;
    294  dc_store_4x4(dst, stride, &dc_dup);
    295 }
    296 
    297 // -----------------------------------------------------------------------------
    298 // 4x8
    299 
    300 static inline void dc_store_4x8(uint16_t *dst, ptrdiff_t stride,
    301                                const __m128i *dc) {
    302  const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0);
    303  int i;
    304  for (i = 0; i < 8; ++i, dst += stride) {
    305    _mm_storel_epi64((__m128i *)dst, dc_dup);
    306  }
    307 }
    308 
    309 // Shared with DC 8xh
    310 static inline __m128i dc_sum_8(const uint16_t *ref) {
    311  const __m128i ref_u16 = _mm_load_si128((const __m128i *)ref);
    312  const __m128i _dcba = _mm_add_epi16(ref_u16, _mm_srli_si128(ref_u16, 8));
    313  const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe);
    314  const __m128i a = _mm_add_epi16(_dcba, _xxdc);
    315 
    316  return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1));
    317 }
    318 
    319 void aom_highbd_dc_left_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
    320                                           const uint16_t *above,
    321                                           const uint16_t *left, int bd) {
    322  const __m128i sum = dc_sum_8(left);
    323  const __m128i four = _mm_cvtsi32_si128(4);
    324  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
    325  (void)above;
    326  (void)bd;
    327  dc_store_4x8(dst, stride, &dc);
    328 }
    329 
    330 void aom_highbd_dc_top_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
    331                                          const uint16_t *above,
    332                                          const uint16_t *left, int bd) {
    333  const __m128i two = _mm_cvtsi32_si128(2);
    334  const __m128i sum = dc_sum_4(above);
    335  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
    336  (void)left;
    337  (void)bd;
    338  dc_store_4x8(dst, stride, &dc);
    339 }
    340 
    341 void aom_highbd_dc_128_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
    342                                          const uint16_t *above,
    343                                          const uint16_t *left, int bd) {
    344  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
    345  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
    346  (void)above;
    347  (void)left;
    348  dc_store_4x8(dst, stride, &dc_dup);
    349 }
    350 
    351 // -----------------------------------------------------------------------------
    352 // 8xh
    353 
    354 static inline void dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int height,
    355                                const __m128i *dc) {
    356  const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
    357  const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
    358  int i;
    359  for (i = 0; i < height; ++i, dst += stride) {
    360    _mm_store_si128((__m128i *)dst, dc_dup);
    361  }
    362 }
    363 
    364 // -----------------------------------------------------------------------------
    365 // DC_TOP
    366 
    367 static inline void dc_top_predictor_8xh(uint16_t *dst, ptrdiff_t stride,
    368                                        int height, const uint16_t *above) {
    369  const __m128i four = _mm_cvtsi32_si128(4);
    370  const __m128i sum = dc_sum_8(above);
    371  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
    372  dc_store_8xh(dst, stride, height, &dc);
    373 }
    374 
    375 void aom_highbd_dc_top_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
    376                                          const uint16_t *above,
    377                                          const uint16_t *left, int bd) {
    378  (void)left;
    379  (void)bd;
    380  dc_top_predictor_8xh(dst, stride, 4, above);
    381 }
    382 
    383 void aom_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
    384                                          const uint16_t *above,
    385                                          const uint16_t *left, int bd) {
    386  (void)left;
    387  (void)bd;
    388  dc_top_predictor_8xh(dst, stride, 8, above);
    389 }
    390 
    391 void aom_highbd_dc_top_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
    392                                           const uint16_t *above,
    393                                           const uint16_t *left, int bd) {
    394  (void)left;
    395  (void)bd;
    396  dc_top_predictor_8xh(dst, stride, 16, above);
    397 }
    398 
    399 // -----------------------------------------------------------------------------
    400 // DC_LEFT
    401 
    402 void aom_highbd_dc_left_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
    403                                           const uint16_t *above,
    404                                           const uint16_t *left, int bd) {
    405  const __m128i two = _mm_cvtsi32_si128(2);
    406  const __m128i sum = dc_sum_4(left);
    407  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
    408  (void)above;
    409  (void)bd;
    410  dc_store_8xh(dst, stride, 4, &dc);
    411 }
    412 
    413 void aom_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
    414                                           const uint16_t *above,
    415                                           const uint16_t *left, int bd) {
    416  const __m128i four = _mm_cvtsi32_si128(4);
    417  const __m128i sum = dc_sum_8(left);
    418  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
    419  (void)above;
    420  (void)bd;
    421  dc_store_8xh(dst, stride, 8, &dc);
    422 }
    423 
    424 // Shared with DC 16xh
    425 static inline __m128i dc_sum_16(const uint16_t *ref) {
    426  const __m128i sum_lo = dc_sum_8(ref);
    427  const __m128i sum_hi = dc_sum_8(ref + 8);
    428  return _mm_add_epi16(sum_lo, sum_hi);
    429 }
    430 
    431 void aom_highbd_dc_left_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
    432                                            const uint16_t *above,
    433                                            const uint16_t *left, int bd) {
    434  const __m128i eight = _mm_cvtsi32_si128(8);
    435  const __m128i sum = dc_sum_16(left);
    436  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
    437  (void)above;
    438  (void)bd;
    439  dc_store_8xh(dst, stride, 16, &dc);
    440 }
    441 
    442 // -----------------------------------------------------------------------------
    443 // DC_128
    444 
    445 static inline void dc_128_predictor_8xh(uint16_t *dst, ptrdiff_t stride,
    446                                        int height, int bd) {
    447  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
    448  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
    449  dc_store_8xh(dst, stride, height, &dc_dup);
    450 }
    451 
    452 void aom_highbd_dc_128_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
    453                                          const uint16_t *above,
    454                                          const uint16_t *left, int bd) {
    455  (void)above;
    456  (void)left;
    457  dc_128_predictor_8xh(dst, stride, 4, bd);
    458 }
    459 
    460 void aom_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
    461                                          const uint16_t *above,
    462                                          const uint16_t *left, int bd) {
    463  (void)above;
    464  (void)left;
    465  dc_128_predictor_8xh(dst, stride, 8, bd);
    466 }
    467 
    468 void aom_highbd_dc_128_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
    469                                           const uint16_t *above,
    470                                           const uint16_t *left, int bd) {
    471  (void)above;
    472  (void)left;
    473  dc_128_predictor_8xh(dst, stride, 16, bd);
    474 }
    475 
    476 // -----------------------------------------------------------------------------
    477 // 16xh
    478 
    479 static inline void dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int height,
    480                                 const __m128i *dc) {
    481  const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
    482  const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
    483  int i;
    484  for (i = 0; i < height; ++i, dst += stride) {
    485    _mm_store_si128((__m128i *)dst, dc_dup);
    486    _mm_store_si128((__m128i *)(dst + 8), dc_dup);
    487  }
    488 }
    489 
    490 // -----------------------------------------------------------------------------
    491 // DC_LEFT
    492 
    493 void aom_highbd_dc_left_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
    494                                            const uint16_t *above,
    495                                            const uint16_t *left, int bd) {
    496  const __m128i four = _mm_cvtsi32_si128(4);
    497  const __m128i sum = dc_sum_8(left);
    498  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
    499  (void)above;
    500  (void)bd;
    501  dc_store_16xh(dst, stride, 8, &dc);
    502 }
    503 
    504 void aom_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
    505                                             const uint16_t *above,
    506                                             const uint16_t *left, int bd) {
    507  const __m128i eight = _mm_cvtsi32_si128(8);
    508  const __m128i sum = dc_sum_16(left);
    509  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
    510  (void)above;
    511  (void)bd;
    512  dc_store_16xh(dst, stride, 16, &dc);
    513 }
    514 
    515 // Shared with 32xh
    516 static inline __m128i dc_sum_32(const uint16_t *ref) {
    517  const __m128i zero = _mm_setzero_si128();
    518  const __m128i sum_a = dc_sum_16(ref);
    519  const __m128i sum_b = dc_sum_16(ref + 16);
    520  // 12 bit bd will outrange, so expand to 32 bit before adding final total
    521  return _mm_add_epi32(_mm_unpacklo_epi16(sum_a, zero),
    522                       _mm_unpacklo_epi16(sum_b, zero));
    523 }
    524 
    525 void aom_highbd_dc_left_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
    526                                             const uint16_t *above,
    527                                             const uint16_t *left, int bd) {
    528  const __m128i sixteen = _mm_cvtsi32_si128(16);
    529  const __m128i sum = dc_sum_32(left);
    530  const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
    531  (void)above;
    532  (void)bd;
    533  dc_store_16xh(dst, stride, 32, &dc);
    534 }
    535 
    536 // -----------------------------------------------------------------------------
    537 // DC_TOP
    538 
    539 void aom_highbd_dc_top_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
    540                                           const uint16_t *above,
    541                                           const uint16_t *left, int bd) {
    542  const __m128i eight = _mm_cvtsi32_si128(8);
    543  const __m128i sum = dc_sum_16(above);
    544  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
    545  (void)left;
    546  (void)bd;
    547  dc_store_16xh(dst, stride, 8, &dc);
    548 }
    549 
    550 void aom_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
    551                                            const uint16_t *above,
    552                                            const uint16_t *left, int bd) {
    553  const __m128i eight = _mm_cvtsi32_si128(8);
    554  const __m128i sum = dc_sum_16(above);
    555  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
    556  (void)left;
    557  (void)bd;
    558  dc_store_16xh(dst, stride, 16, &dc);
    559 }
    560 
    561 void aom_highbd_dc_top_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
    562                                            const uint16_t *above,
    563                                            const uint16_t *left, int bd) {
    564  const __m128i eight = _mm_cvtsi32_si128(8);
    565  const __m128i sum = dc_sum_16(above);
    566  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
    567  (void)left;
    568  (void)bd;
    569  dc_store_16xh(dst, stride, 32, &dc);
    570 }
    571 
    572 // -----------------------------------------------------------------------------
    573 // DC_128
    574 
    575 void aom_highbd_dc_128_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
    576                                           const uint16_t *above,
    577                                           const uint16_t *left, int bd) {
    578  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
    579  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
    580  (void)above;
    581  (void)left;
    582  dc_store_16xh(dst, stride, 8, &dc_dup);
    583 }
    584 
    585 void aom_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
    586                                            const uint16_t *above,
    587                                            const uint16_t *left, int bd) {
    588  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
    589  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
    590  (void)above;
    591  (void)left;
    592  dc_store_16xh(dst, stride, 16, &dc_dup);
    593 }
    594 
    595 void aom_highbd_dc_128_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
    596                                            const uint16_t *above,
    597                                            const uint16_t *left, int bd) {
    598  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
    599  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
    600  (void)above;
    601  (void)left;
    602  dc_store_16xh(dst, stride, 32, &dc_dup);
    603 }
    604 
    605 // -----------------------------------------------------------------------------
    606 // 32xh
    607 
    608 static inline void dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int height,
    609                                 const __m128i *dc) {
    610  const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
    611  const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
    612  int i;
    613  for (i = 0; i < height; ++i, dst += stride) {
    614    _mm_store_si128((__m128i *)dst, dc_dup);
    615    _mm_store_si128((__m128i *)(dst + 8), dc_dup);
    616    _mm_store_si128((__m128i *)(dst + 16), dc_dup);
    617    _mm_store_si128((__m128i *)(dst + 24), dc_dup);
    618  }
    619 }
    620 
    621 void aom_highbd_dc_left_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
    622                                             const uint16_t *above,
    623                                             const uint16_t *left, int bd) {
    624  const __m128i eight = _mm_cvtsi32_si128(8);
    625  const __m128i sum = dc_sum_16(left);
    626  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
    627  (void)above;
    628  (void)bd;
    629  dc_store_32xh(dst, stride, 16, &dc);
    630 }
    631 
    632 void aom_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
    633                                             const uint16_t *above,
    634                                             const uint16_t *left, int bd) {
    635  const __m128i sixteen = _mm_cvtsi32_si128(16);
    636  const __m128i sum = dc_sum_32(left);
    637  const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
    638  (void)above;
    639  (void)bd;
    640  dc_store_32xh(dst, stride, 32, &dc);
    641 }
    642 
    643 void aom_highbd_dc_top_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
    644                                            const uint16_t *above,
    645                                            const uint16_t *left, int bd) {
    646  const __m128i sixteen = _mm_cvtsi32_si128(16);
    647  const __m128i sum = dc_sum_32(above);
    648  const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
    649  (void)left;
    650  (void)bd;
    651  dc_store_32xh(dst, stride, 16, &dc);
    652 }
    653 
    654 void aom_highbd_dc_128_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
    655                                            const uint16_t *above,
    656                                            const uint16_t *left, int bd) {
    657  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
    658  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
    659  (void)above;
    660  (void)left;
    661  dc_store_32xh(dst, stride, 16, &dc_dup);
    662 }
    663 
    664 void aom_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
    665                                            const uint16_t *above,
    666                                            const uint16_t *left, int bd) {
    667  const __m128i sixteen = _mm_cvtsi32_si128(16);
    668  const __m128i sum = dc_sum_32(above);
    669  const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
    670  (void)left;
    671  (void)bd;
    672  dc_store_32xh(dst, stride, 32, &dc);
    673 }
    674 
    675 void aom_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
    676                                            const uint16_t *above,
    677                                            const uint16_t *left, int bd) {
    678  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
    679  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
    680  (void)above;
    681  (void)left;
    682  dc_store_32xh(dst, stride, 32, &dc_dup);
    683 }
    684 
    685 // -----------------------------------------------------------------------------
    686 // V_PRED
    687 
    688 void aom_highbd_v_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
    689                                     const uint16_t *above,
    690                                     const uint16_t *left, int bd) {
    691  (void)left;
    692  (void)bd;
    693  const __m128i above_u16 = _mm_loadl_epi64((const __m128i *)above);
    694  int i;
    695  for (i = 0; i < 2; ++i) {
    696    _mm_storel_epi64((__m128i *)dst, above_u16);
    697    _mm_storel_epi64((__m128i *)(dst + stride), above_u16);
    698    _mm_storel_epi64((__m128i *)(dst + 2 * stride), above_u16);
    699    _mm_storel_epi64((__m128i *)(dst + 3 * stride), above_u16);
    700    dst += stride << 2;
    701  }
    702 }
    703 
    704 void aom_highbd_v_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
    705                                     const uint16_t *above,
    706                                     const uint16_t *left, int bd) {
    707  (void)left;
    708  (void)bd;
    709  const __m128i above_u16 = _mm_load_si128((const __m128i *)above);
    710  _mm_store_si128((__m128i *)dst, above_u16);
    711  _mm_store_si128((__m128i *)(dst + stride), above_u16);
    712  _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16);
    713  _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16);
    714 }
    715 
    716 void aom_highbd_v_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
    717                                      const uint16_t *above,
    718                                      const uint16_t *left, int bd) {
    719  (void)left;
    720  (void)bd;
    721  const __m128i above_u16 = _mm_load_si128((const __m128i *)above);
    722  int i;
    723  for (i = 0; i < 4; ++i) {
    724    _mm_store_si128((__m128i *)dst, above_u16);
    725    _mm_store_si128((__m128i *)(dst + stride), above_u16);
    726    _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16);
    727    _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16);
    728    dst += stride << 2;
    729  }
    730 }
    731 
    732 void aom_highbd_v_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
    733                                      const uint16_t *above,
    734                                      const uint16_t *left, int bd) {
    735  (void)left;
    736  (void)bd;
    737  const __m128i above0_u16 = _mm_load_si128((const __m128i *)above);
    738  const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8));
    739  int i;
    740  for (i = 0; i < 2; ++i) {
    741    _mm_store_si128((__m128i *)dst, above0_u16);
    742    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
    743    dst += stride;
    744    _mm_store_si128((__m128i *)dst, above0_u16);
    745    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
    746    dst += stride;
    747    _mm_store_si128((__m128i *)dst, above0_u16);
    748    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
    749    dst += stride;
    750    _mm_store_si128((__m128i *)dst, above0_u16);
    751    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
    752    dst += stride;
    753  }
    754 }
    755 
    756 void aom_highbd_v_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
    757                                       const uint16_t *above,
    758                                       const uint16_t *left, int bd) {
    759  (void)left;
    760  (void)bd;
    761  const __m128i above0_u16 = _mm_load_si128((const __m128i *)above);
    762  const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8));
    763  int i;
    764  for (i = 0; i < 8; ++i) {
    765    _mm_store_si128((__m128i *)dst, above0_u16);
    766    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
    767    dst += stride;
    768    _mm_store_si128((__m128i *)dst, above0_u16);
    769    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
    770    dst += stride;
    771    _mm_store_si128((__m128i *)dst, above0_u16);
    772    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
    773    dst += stride;
    774    _mm_store_si128((__m128i *)dst, above0_u16);
    775    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
    776    dst += stride;
    777  }
    778 }
    779 
    780 void aom_highbd_v_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
    781                                       const uint16_t *above,
    782                                       const uint16_t *left, int bd) {
    783  (void)left;
    784  (void)bd;
    785  const __m128i above0_u16 = _mm_load_si128((const __m128i *)above);
    786  const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8));
    787  const __m128i above2_u16 = _mm_load_si128((const __m128i *)(above + 16));
    788  const __m128i above3_u16 = _mm_load_si128((const __m128i *)(above + 24));
    789  int i;
    790  for (i = 0; i < 4; ++i) {
    791    _mm_store_si128((__m128i *)dst, above0_u16);
    792    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
    793    _mm_store_si128((__m128i *)(dst + 16), above2_u16);
    794    _mm_store_si128((__m128i *)(dst + 24), above3_u16);
    795    dst += stride;
    796    _mm_store_si128((__m128i *)dst, above0_u16);
    797    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
    798    _mm_store_si128((__m128i *)(dst + 16), above2_u16);
    799    _mm_store_si128((__m128i *)(dst + 24), above3_u16);
    800    dst += stride;
    801    _mm_store_si128((__m128i *)dst, above0_u16);
    802    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
    803    _mm_store_si128((__m128i *)(dst + 16), above2_u16);
    804    _mm_store_si128((__m128i *)(dst + 24), above3_u16);
    805    dst += stride;
    806    _mm_store_si128((__m128i *)dst, above0_u16);
    807    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
    808    _mm_store_si128((__m128i *)(dst + 16), above2_u16);
    809    _mm_store_si128((__m128i *)(dst + 24), above3_u16);
    810    dst += stride;
    811  }
    812 }
    813 
    814 // -----------------------------------------------------------------------------
    815 // DC_PRED
    816 
    817 void aom_highbd_dc_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
    818                                      const uint16_t *above,
    819                                      const uint16_t *left, int bd) {
    820  (void)bd;
    821  const __m128i sum_above = dc_sum_4(above);
    822  const __m128i sum_left = dc_sum_8(left);
    823  const __m128i sum = _mm_add_epi16(sum_above, sum_left);
    824  uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum);
    825  sum32 >>= 16;
    826  sum32 += 6;
    827  sum32 /= 12;
    828  const __m128i row = _mm_set1_epi16((int16_t)sum32);
    829  int i;
    830  for (i = 0; i < 4; ++i) {
    831    _mm_storel_epi64((__m128i *)dst, row);
    832    dst += stride;
    833    _mm_storel_epi64((__m128i *)dst, row);
    834    dst += stride;
    835  }
    836 }
    837 
    838 void aom_highbd_dc_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
    839                                      const uint16_t *above,
    840                                      const uint16_t *left, int bd) {
    841  (void)bd;
    842  const __m128i sum_left = dc_sum_4(left);
    843  const __m128i sum_above = dc_sum_8(above);
    844  const __m128i sum = _mm_add_epi16(sum_above, sum_left);
    845  uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum);
    846  sum32 >>= 16;
    847  sum32 += 6;
    848  sum32 /= 12;
    849  const __m128i row = _mm_set1_epi16((int16_t)sum32);
    850 
    851  _mm_store_si128((__m128i *)dst, row);
    852  dst += stride;
    853  _mm_store_si128((__m128i *)dst, row);
    854  dst += stride;
    855  _mm_store_si128((__m128i *)dst, row);
    856  dst += stride;
    857  _mm_store_si128((__m128i *)dst, row);
    858 }
    859 
    860 void aom_highbd_dc_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
    861                                       const uint16_t *above,
    862                                       const uint16_t *left, int bd) {
    863  (void)bd;
    864  __m128i sum_left = dc_sum_16(left);
    865  __m128i sum_above = dc_sum_8(above);
    866  const __m128i zero = _mm_setzero_si128();
    867  sum_left = _mm_unpacklo_epi16(sum_left, zero);
    868  sum_above = _mm_unpacklo_epi16(sum_above, zero);
    869  const __m128i sum = _mm_add_epi32(sum_left, sum_above);
    870  uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum);
    871  sum32 += 12;
    872  sum32 /= 24;
    873  const __m128i row = _mm_set1_epi16((int16_t)sum32);
    874  int i;
    875  for (i = 0; i < 4; ++i) {
    876    _mm_store_si128((__m128i *)dst, row);
    877    dst += stride;
    878    _mm_store_si128((__m128i *)dst, row);
    879    dst += stride;
    880    _mm_store_si128((__m128i *)dst, row);
    881    dst += stride;
    882    _mm_store_si128((__m128i *)dst, row);
    883    dst += stride;
    884  }
    885 }
    886 
    887 void aom_highbd_dc_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
    888                                       const uint16_t *above,
    889                                       const uint16_t *left, int bd) {
    890  (void)bd;
    891  __m128i sum_left = dc_sum_8(left);
    892  __m128i sum_above = dc_sum_16(above);
    893  const __m128i zero = _mm_setzero_si128();
    894  sum_left = _mm_unpacklo_epi16(sum_left, zero);
    895  sum_above = _mm_unpacklo_epi16(sum_above, zero);
    896  const __m128i sum = _mm_add_epi32(sum_left, sum_above);
    897  uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum);
    898  sum32 += 12;
    899  sum32 /= 24;
    900  const __m128i row = _mm_set1_epi16((int16_t)sum32);
    901  int i;
    902  for (i = 0; i < 2; ++i) {
    903    _mm_store_si128((__m128i *)dst, row);
    904    _mm_store_si128((__m128i *)(dst + 8), row);
    905    dst += stride;
    906    _mm_store_si128((__m128i *)dst, row);
    907    _mm_store_si128((__m128i *)(dst + 8), row);
    908    dst += stride;
    909    _mm_store_si128((__m128i *)dst, row);
    910    _mm_store_si128((__m128i *)(dst + 8), row);
    911    dst += stride;
    912    _mm_store_si128((__m128i *)dst, row);
    913    _mm_store_si128((__m128i *)(dst + 8), row);
    914    dst += stride;
    915  }
    916 }
    917 
    918 void aom_highbd_dc_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
    919                                        const uint16_t *above,
    920                                        const uint16_t *left, int bd) {
    921  (void)bd;
    922  __m128i sum_left = dc_sum_32(left);
    923  __m128i sum_above = dc_sum_16(above);
    924  const __m128i zero = _mm_setzero_si128();
    925  sum_above = _mm_unpacklo_epi16(sum_above, zero);
    926  const __m128i sum = _mm_add_epi32(sum_left, sum_above);
    927  uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum);
    928  sum32 += 24;
    929  sum32 /= 48;
    930  const __m128i row = _mm_set1_epi16((int16_t)sum32);
    931  int i;
    932  for (i = 0; i < 8; ++i) {
    933    _mm_store_si128((__m128i *)dst, row);
    934    _mm_store_si128((__m128i *)(dst + 8), row);
    935    dst += stride;
    936    _mm_store_si128((__m128i *)dst, row);
    937    _mm_store_si128((__m128i *)(dst + 8), row);
    938    dst += stride;
    939    _mm_store_si128((__m128i *)dst, row);
    940    _mm_store_si128((__m128i *)(dst + 8), row);
    941    dst += stride;
    942    _mm_store_si128((__m128i *)dst, row);
    943    _mm_store_si128((__m128i *)(dst + 8), row);
    944    dst += stride;
    945  }
    946 }
    947 
    948 void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
    949                                        const uint16_t *above,
    950                                        const uint16_t *left, int bd) {
    951  (void)bd;
    952  __m128i sum_left = dc_sum_16(left);
    953  __m128i sum_above = dc_sum_32(above);
    954  const __m128i zero = _mm_setzero_si128();
    955  sum_left = _mm_unpacklo_epi16(sum_left, zero);
    956  const __m128i sum = _mm_add_epi32(sum_left, sum_above);
    957  uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum);
    958  sum32 += 24;
    959  sum32 /= 48;
    960  const __m128i row = _mm_set1_epi16((int16_t)sum32);
    961  int i;
    962  for (i = 0; i < 4; ++i) {
    963    _mm_store_si128((__m128i *)dst, row);
    964    _mm_store_si128((__m128i *)(dst + 8), row);
    965    _mm_store_si128((__m128i *)(dst + 16), row);
    966    _mm_store_si128((__m128i *)(dst + 24), row);
    967    dst += stride;
    968    _mm_store_si128((__m128i *)dst, row);
    969    _mm_store_si128((__m128i *)(dst + 8), row);
    970    _mm_store_si128((__m128i *)(dst + 16), row);
    971    _mm_store_si128((__m128i *)(dst + 24), row);
    972    dst += stride;
    973    _mm_store_si128((__m128i *)dst, row);
    974    _mm_store_si128((__m128i *)(dst + 8), row);
    975    _mm_store_si128((__m128i *)(dst + 16), row);
    976    _mm_store_si128((__m128i *)(dst + 24), row);
    977    dst += stride;
    978    _mm_store_si128((__m128i *)dst, row);
    979    _mm_store_si128((__m128i *)(dst + 8), row);
    980    _mm_store_si128((__m128i *)(dst + 16), row);
    981    _mm_store_si128((__m128i *)(dst + 24), row);
    982    dst += stride;
    983  }
    984 }