tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

av1_txfm_sse2.h (13143B)


      1 /*
      2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 #ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_
     12 #define AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_
     13 
     14 #include <emmintrin.h>  // SSE2
     15 
     16 #include "config/aom_config.h"
     17 #include "config/av1_rtcd.h"
     18 
     19 #include "aom/aom_integer.h"
     20 #include "aom_dsp/x86/transpose_sse2.h"
     21 #include "aom_dsp/x86/txfm_common_sse2.h"
     22 #include "av1/common/av1_txfm.h"
     23 
     24 #ifdef __cplusplus
     25 extern "C" {
     26 #endif
     27 
     // Butterfly over the low four 16-bit lanes of *in0 and *in1.
     //
     // The low halves of *in0 and *in1 are interleaved into (in0[i], in1[i])
     // pairs. Each pair is multiplied and summed (_mm_madd_epi16) against the
     // 16-bit pairs held in *w0 to produce *out0, and against *w1 to produce
     // *out1. Every 32-bit sum then has __rounding added, is shifted right
     // arithmetically by cos_bit and is packed back to 16 bits with signed
     // saturation.
     //
     // Each output holds its own four results in the low 64 bits and a copy of
     // them in the high 64 bits, the same layout btf_16_4p_sse2 produces.
     static inline void btf_16_w4_sse2(
         const __m128i *const w0, const __m128i *const w1, const __m128i __rounding,
         const int8_t cos_bit, const __m128i *const in0, const __m128i *const in1,
         __m128i *const out0, __m128i *const out1) {
       const __m128i t0 = _mm_unpacklo_epi16(*in0, *in1);
       const __m128i u0 = _mm_madd_epi16(t0, *w0);
       const __m128i v0 = _mm_madd_epi16(t0, *w1);
       const __m128i a0 = _mm_add_epi32(u0, __rounding);
       const __m128i b0 = _mm_add_epi32(v0, __rounding);
       const __m128i c0 = _mm_srai_epi32(a0, cos_bit);
       const __m128i d0 = _mm_srai_epi32(b0, cos_bit);

       *out0 = _mm_packs_epi32(c0, c0);
       // Pack d0 with itself so the high half of *out1 mirrors its low half
       // instead of leaking *out0's values into it.
       *out1 = _mm_packs_epi32(d0, d0);
     }
     43 
     // Butterfly over the low four 16-bit lanes of in0 and in1 (macro form).
     //
     // Interleaves the low halves of in0/in1 into 16-bit pairs, multiply-adds the
     // pairs against w0 (for out0) and against w1 (for out1), adds __rounding,
     // shifts right arithmetically by cos_bit and packs to 16 bits with signed
     // saturation. Each output carries its four results in both its low and its
     // high half.
     //
     // `__rounding` and `cos_bit` are not macro parameters: they must be visible
     // in the scope where the macro is expanded. Both results are fully computed
     // before either output is assigned, so an output may name the same object
     // as an input.
     #define btf_16_4p_sse2(w0, w1, in0, in1, out0, out1) \
       do { \
         const __m128i btf4p_pairs = _mm_unpacklo_epi16(in0, in1); \
         const __m128i btf4p_acc0 = \
             _mm_add_epi32(_mm_madd_epi16(btf4p_pairs, w0), __rounding); \
         const __m128i btf4p_acc1 = \
             _mm_add_epi32(_mm_madd_epi16(btf4p_pairs, w1), __rounding); \
         const __m128i btf4p_res0 = _mm_srai_epi32(btf4p_acc0, cos_bit); \
         const __m128i btf4p_res1 = _mm_srai_epi32(btf4p_acc1, cos_bit); \
         out0 = _mm_packs_epi32(btf4p_res0, btf4p_res0); \
         out1 = _mm_packs_epi32(btf4p_res1, btf4p_res1); \
       } while (0)
     59 
     // Butterfly over all eight 16-bit lanes of in0 and in1 (macro form).
     //
     // Interleaves in0/in1 into 16-bit pairs (low and high halves separately),
     // multiply-adds every pair against w0 (for out0) and against w1 (for out1),
     // adds __rounding, shifts right arithmetically by cos_bit and packs the
     // eight 32-bit results of each output back to 16 bits with signed
     // saturation.
     //
     // `__rounding` and `cos_bit` are not macro parameters: they must be visible
     // in the scope where the macro is expanded. Both results are fully computed
     // before either output is assigned, so an output may name the same object
     // as an input.
     #define btf_16_sse2(w0, w1, in0, in1, out0, out1) \
       do { \
         const __m128i btf16_pairs_lo = _mm_unpacklo_epi16(in0, in1); \
         const __m128i btf16_pairs_hi = _mm_unpackhi_epi16(in0, in1); \
         const __m128i btf16_acc0_lo = \
             _mm_add_epi32(_mm_madd_epi16(btf16_pairs_lo, w0), __rounding); \
         const __m128i btf16_acc0_hi = \
             _mm_add_epi32(_mm_madd_epi16(btf16_pairs_hi, w0), __rounding); \
         const __m128i btf16_acc1_lo = \
             _mm_add_epi32(_mm_madd_epi16(btf16_pairs_lo, w1), __rounding); \
         const __m128i btf16_acc1_hi = \
             _mm_add_epi32(_mm_madd_epi16(btf16_pairs_hi, w1), __rounding); \
         const __m128i btf16_res0_lo = _mm_srai_epi32(btf16_acc0_lo, cos_bit); \
         const __m128i btf16_res0_hi = _mm_srai_epi32(btf16_acc0_hi, cos_bit); \
         const __m128i btf16_res1_lo = _mm_srai_epi32(btf16_acc1_lo, cos_bit); \
         const __m128i btf16_res1_hi = _mm_srai_epi32(btf16_acc1_hi, cos_bit); \
         out0 = _mm_packs_epi32(btf16_res0_lo, btf16_res0_hi); \
         out1 = _mm_packs_epi32(btf16_res1_lo, btf16_res1_hi); \
       } while (0)
     82 
     // Loads 128 bits (eight 16-bit values) starting at `a`.
     // Uses the aligned load _mm_load_si128, so `a` must be 16-byte aligned.
     static inline __m128i load_16bit_to_16bit(const int16_t *a) {
       const __m128i *const src = (const __m128i *)a;
       return _mm_load_si128(src);
     }
     86 
     // Loads eight 32-bit values starting at `a` and narrows them to eight 16-bit
     // lanes with signed saturation (_mm_packs_epi32). Both 128-bit halves are
     // read with aligned accesses, so `a` must be 16-byte aligned.
     static inline __m128i load_32bit_to_16bit(const int32_t *a) {
       const __m128i *const src = (const __m128i *)a;
       const __m128i first_four = _mm_load_si128(src);
       const __m128i last_four = _mm_load_si128(src + 1);
       return _mm_packs_epi32(first_four, last_four);
     }
     91 
     // Loads four 32-bit values starting at `a` (aligned load, so `a` must be
     // 16-byte aligned) and narrows them to 16 bits with signed saturation. The
     // four results appear in the low half of the return value and are repeated
     // in the high half.
     static inline __m128i load_32bit_to_16bit_w4(const int32_t *a) {
       const __m128i four = _mm_load_si128((const __m128i *)a);
       const __m128i narrowed = _mm_packs_epi32(four, four);
       return narrowed;
     }
     96 
     // Writes the low four 16-bit lanes of `a` to b[0..3] as sign-extended 32-bit
     // values. The store is aligned, so `b` must be 16-byte aligned.
     static inline void store_16bit_to_32bit_w4(const __m128i a, int32_t *const b) {
       // Placing each lane in both halves of a 32-bit slot and then shifting
       // right arithmetically by 16 leaves the lane sign-extended.
       const __m128i doubled = _mm_unpacklo_epi16(a, a);
       __m128i *const dst = (__m128i *)b;
       _mm_store_si128(dst, _mm_srai_epi32(doubled, 16));
     }
    103 
    // Writes all eight 16-bit lanes of `a` to b[0..7] as sign-extended 32-bit
    // values. Both stores are aligned, so `b` must be 16-byte aligned.
    static inline void store_16bit_to_32bit(__m128i a, int32_t *b) {
      __m128i *const dst = (__m128i *)b;
      // Placing each lane in both halves of a 32-bit slot and then shifting
      // right arithmetically by 16 leaves the lane sign-extended.
      const __m128i low_four = _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16);
      const __m128i high_four = _mm_srai_epi32(_mm_unpackhi_epi16(a, a), 16);
      _mm_store_si128(dst, low_four);
      _mm_store_si128(dst + 1, high_four);
    }
    113 
    114 static inline __m128i scale_round_sse2(const __m128i a, const int scale) {
    115  const __m128i scale_rounding = pair_set_epi16(scale, 1 << (NewSqrt2Bits - 1));
    116  const __m128i b = _mm_madd_epi16(a, scale_rounding);
    117  return _mm_srai_epi32(b, NewSqrt2Bits);
    118 }
    119 
    120 static inline void store_rect_16bit_to_32bit_w4(const __m128i a,
    121                                                int32_t *const b) {
    122  const __m128i one = _mm_set1_epi16(1);
    123  const __m128i a_lo = _mm_unpacklo_epi16(a, one);
    124  const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2);
    125  _mm_store_si128((__m128i *)b, b_lo);
    126 }
    127 
    128 static inline void store_rect_16bit_to_32bit(const __m128i a,
    129                                             int32_t *const b) {
    130  const __m128i one = _mm_set1_epi16(1);
    131  const __m128i a_lo = _mm_unpacklo_epi16(a, one);
    132  const __m128i a_hi = _mm_unpackhi_epi16(a, one);
    133  const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2);
    134  const __m128i b_hi = scale_round_sse2(a_hi, NewSqrt2);
    135  _mm_store_si128((__m128i *)b, b_lo);
    136  _mm_store_si128((__m128i *)(b + 4), b_hi);
    137 }
    138 
    // Fills out[0..out_size-1] from rows of `in` that are `stride` elements
    // apart. Each row contributes four 16-bit values (64 bits) to the low half
    // of its vector; _mm_loadl_epi64 zeroes the high half.
    static inline void load_buffer_16bit_to_16bit_w4(const int16_t *const in,
                                                     const int stride,
                                                     __m128i *const out,
                                                     const int out_size) {
      int row = 0;
      while (row < out_size) {
        const int16_t *const src = in + row * stride;
        out[row] = _mm_loadl_epi64((const __m128i *)src);
        ++row;
      }
    }
    147 
    // Same as a row-by-row 64-bit load of `in` (rows `stride` elements apart),
    // but the rows land in `out` in reverse order: input row i is written to
    // out[out_size - 1 - i]. The high half of every vector is zero.
    static inline void load_buffer_16bit_to_16bit_w4_flip(const int16_t *const in,
                                                          const int stride,
                                                          __m128i *const out,
                                                          const int out_size) {
      for (int src_row = 0, dst_row = out_size - 1; src_row < out_size;
           ++src_row, --dst_row) {
        const int16_t *const src = in + src_row * stride;
        out[dst_row] = _mm_loadl_epi64((const __m128i *)src);
      }
    }
    156 
    157 static inline void load_buffer_16bit_to_16bit(const int16_t *in, int stride,
    158                                              __m128i *out, int out_size) {
    159  for (int i = 0; i < out_size; ++i) {
    160    out[i] = load_16bit_to_16bit(in + i * stride);
    161  }
    162 }
    163 
    164 static inline void load_buffer_16bit_to_16bit_flip(const int16_t *in,
    165                                                   int stride, __m128i *out,
    166                                                   int out_size) {
    167  for (int i = 0; i < out_size; ++i) {
    168    out[out_size - i - 1] = load_16bit_to_16bit(in + i * stride);
    169  }
    170 }
    171 
    172 static inline void load_buffer_32bit_to_16bit(const int32_t *in, int stride,
    173                                              __m128i *out, int out_size) {
    174  for (int i = 0; i < out_size; ++i) {
    175    out[i] = load_32bit_to_16bit(in + i * stride);
    176  }
    177 }
    178 
    179 static inline void load_buffer_32bit_to_16bit_w4(const int32_t *in, int stride,
    180                                                 __m128i *out, int out_size) {
    181  for (int i = 0; i < out_size; ++i) {
    182    out[i] = load_32bit_to_16bit_w4(in + i * stride);
    183  }
    184 }
    185 
    186 static inline void load_buffer_32bit_to_16bit_flip(const int32_t *in,
    187                                                   int stride, __m128i *out,
    188                                                   int out_size) {
    189  for (int i = 0; i < out_size; ++i) {
    190    out[out_size - i - 1] = load_32bit_to_16bit(in + i * stride);
    191  }
    192 }
    193 
    // Writes in[0..out_size-1] to `out`, one store_16bit_to_32bit_w4() per
    // vector; consecutive output rows start `stride` elements apart.
    static inline void store_buffer_16bit_to_32bit_w4(const __m128i *const in,
                                                      int32_t *const out,
                                                      const int stride,
                                                      const int out_size) {
      int row = 0;
      while (row < out_size) {
        int32_t *const dst = out + row * stride;
        store_16bit_to_32bit_w4(in[row], dst);
        ++row;
      }
    }
    202 
    // Writes in[0..out_size-1] to `out`, one store_16bit_to_32bit() per vector;
    // consecutive output rows start `stride` elements apart.
    static inline void store_buffer_16bit_to_32bit_w8(const __m128i *const in,
                                                      int32_t *const out,
                                                      const int stride,
                                                      const int out_size) {
      int row = 0;
      while (row < out_size) {
        int32_t *const dst = out + row * stride;
        store_16bit_to_32bit(in[row], dst);
        ++row;
      }
    }
    211 
    // Writes in[0..out_size-1] to `out`, one store_rect_16bit_to_32bit_w4() per
    // vector; consecutive output rows start `stride` elements apart.
    static inline void store_rect_buffer_16bit_to_32bit_w4(const __m128i *const in,
                                                           int32_t *const out,
                                                           const int stride,
                                                           const int out_size) {
      int row = 0;
      while (row < out_size) {
        int32_t *const dst = out + row * stride;
        store_rect_16bit_to_32bit_w4(in[row], dst);
        ++row;
      }
    }
    220 
    // Writes in[0..out_size-1] to `out`, one store_rect_16bit_to_32bit() per
    // vector; consecutive output rows start `stride` elements apart.
    static inline void store_rect_buffer_16bit_to_32bit_w8(const __m128i *const in,
                                                           int32_t *const out,
                                                           const int stride,
                                                           const int out_size) {
      int row = 0;
      while (row < out_size) {
        int32_t *const dst = out + row * stride;
        store_rect_16bit_to_32bit(in[row], dst);
        ++row;
      }
    }
    229 
    // Stores the eight vectors in[0..7] as eight rows of `out`, rows `stride`
    // elements apart. Every row is written with an aligned 128-bit store, so
    // each row address (out + i * stride) must be 16-byte aligned.
    static inline void store_buffer_16bit_to_16bit_8x8(const __m128i *in,
                                                       uint16_t *out,
                                                       const int stride) {
      int row = 0;
      while (row < 8) {
        uint16_t *const dst = out + row * stride;
        _mm_store_si128((__m128i *)dst, in[row]);
        ++row;
      }
    }
    237 
    // Shifts every 16-bit lane of in[0..size-1] in place.
    //   bit > 0: plain left shift by `bit`.
    //   bit < 0: right shift by -bit with rounding; 1 << (-bit - 1) is added with
    //            signed saturation before the arithmetic shift.
    //   bit == 0: the buffer is left untouched.
    static inline void round_shift_16bit(__m128i *in, int size, int bit) {
      if (bit == 0) return;

      if (bit > 0) {
        for (int k = 0; k < size; ++k) {
          in[k] = _mm_slli_epi16(in[k], bit);
        }
        return;
      }

      const int shift = -bit;
      const __m128i half = _mm_set1_epi16(1 << (shift - 1));
      for (int k = 0; k < size; ++k) {
        in[k] = _mm_srai_epi16(_mm_adds_epi16(in[k], half), shift);
      }
    }
    252 
    // Copies in[0..size-1] to `out` in reverse order: in[i] is written to
    // out[size - 1 - i]. Elements are copied from in[0] upwards.
    static inline void flip_buf_sse2(__m128i *in, __m128i *out, int size) {
      for (int src = 0, dst = size - 1; src < size; ++src, --dst) {
        out[dst] = in[src];
      }
    }
    258 
    259 void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output,
    260                                   int stride, TX_TYPE tx_type, int bd);
    261 
    262 void av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output,
    263                                   int stride, TX_TYPE tx_type, int bd);
    264 
    265 void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output,
    266                                    int stride, TX_TYPE tx_type, int bd);
    267 
    268 void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output,
    269                                   int stride, TX_TYPE tx_type, int bd);
    270 
    271 void av1_lowbd_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output,
    272                                   int stride, TX_TYPE tx_type, int bd);
    273 
    274 void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output,
    275                                    int stride, TX_TYPE tx_type, int bd);
    276 
    277 void av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output,
    278                                    int stride, TX_TYPE tx_type, int bd);
    279 
    280 void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output,
    281                                    int stride, TX_TYPE tx_type, int bd);
    282 
    283 void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output,
    284                                    int stride, TX_TYPE tx_type, int bd);
    285 
    286 void av1_lowbd_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output,
    287                                     int stride, TX_TYPE tx_type, int bd);
    288 
    289 void av1_lowbd_fwd_txfm2d_16x32_sse2(const int16_t *input, int32_t *output,
    290                                     int stride, TX_TYPE tx_type, int bd);
    291 
    292 void av1_lowbd_fwd_txfm2d_32x8_sse2(const int16_t *input, int32_t *output,
    293                                    int stride, TX_TYPE tx_type, int bd);
    294 
    295 void av1_lowbd_fwd_txfm2d_32x16_sse2(const int16_t *input, int32_t *output,
    296                                     int stride, TX_TYPE tx_type, int bd);
    297 
    298 void av1_lowbd_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output,
    299                                     int stride, TX_TYPE tx_type, int bd);
    300 
    301 void av1_lowbd_fwd_txfm2d_16x64_sse2(const int16_t *input, int32_t *output,
    302                                     int stride, TX_TYPE tx_type, int bd);
    303 
    304 void av1_lowbd_fwd_txfm2d_64x16_sse2(const int16_t *input, int32_t *output,
    305                                     int stride, TX_TYPE tx_type, int bd);
    306 
    // Pointer to a 1-D transform kernel: takes a read-only vector array `input`,
    // a writable vector array `output` and a cos_bit value.
    typedef void (*transform_1d_sse2)(const __m128i *input, __m128i *output,
                                      int8_t cos_bit);

    // 8-point kernels declared here. They take no cos_bit argument, so their
    // signature differs from transform_1d_sse2.
    void av1_iadst8_sse2(const __m128i *input, __m128i *output);
    void av1_idct8_sse2(const __m128i *input, __m128i *output);
    313 
    314 typedef struct {
    315  transform_1d_sse2 col, row;  // vertical and horizontal
    316 } transform_2d_sse2;
    317 
    318 #ifdef __cplusplus
    319 }
    320 #endif  // __cplusplus
    321 #endif  // AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_