tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

av1_inv_txfm_ssse3.h (8965B)


      1 /*
      2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 #ifndef AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
     12 #define AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
     13 
     14 #include <emmintrin.h>  // SSE2
     15 #include <tmmintrin.h>  // SSSE3
     16 
     17 #include "config/aom_config.h"
     18 #include "config/av1_rtcd.h"
     19 
     20 #include "aom/aom_integer.h"
     21 #include "aom_dsp/x86/transpose_sse2.h"
     22 
     23 #ifdef __cplusplus
     24 extern "C" {
     25 #endif
     26 
     27 #define btf_16_ssse3(w0, w1, in, out0, out1)    \
     28  do {                                          \
     29    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
     30    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
     31    const __m128i _in = in;                     \
     32    out0 = _mm_mulhrs_epi16(_in, _w0);          \
     33    out1 = _mm_mulhrs_epi16(_in, _w1);          \
     34  } while (0)
     35 
     36 #define btf_16_adds_subs_sse2(in0, in1) \
     37  do {                                  \
     38    const __m128i _in0 = in0;           \
     39    const __m128i _in1 = in1;           \
     40    in0 = _mm_adds_epi16(_in0, _in1);   \
     41    in1 = _mm_subs_epi16(_in0, _in1);   \
     42  } while (0)
     43 
     44 #define btf_16_subs_adds_sse2(in0, in1) \
     45  do {                                  \
     46    const __m128i _in0 = in0;           \
     47    const __m128i _in1 = in1;           \
     48    in1 = _mm_subs_epi16(_in0, _in1);   \
     49    in0 = _mm_adds_epi16(_in0, _in1);   \
     50  } while (0)
     51 
     52 #define btf_16_adds_subs_out_sse2(out0, out1, in0, in1) \
     53  do {                                                  \
     54    const __m128i _in0 = in0;                           \
     55    const __m128i _in1 = in1;                           \
     56    out0 = _mm_adds_epi16(_in0, _in1);                  \
     57    out1 = _mm_subs_epi16(_in0, _in1);                  \
     58  } while (0)
     59 
     60 static inline void round_shift_16bit_ssse3(__m128i *in, int size, int bit) {
     61  if (bit < 0) {
     62    const __m128i scale = _mm_set1_epi16(1 << (15 + bit));
     63    for (int i = 0; i < size; ++i) {
     64      in[i] = _mm_mulhrs_epi16(in[i], scale);
     65    }
     66  } else if (bit > 0) {
     67    for (int i = 0; i < size; ++i) {
     68      in[i] = _mm_slli_epi16(in[i], bit);
     69    }
     70  }
     71 }
     72 
     73 // 1D itx types
     74 enum {
     75  IDCT_1D,
     76  IADST_1D,
     77  IFLIPADST_1D = IADST_1D,
     78  IIDENTITY_1D,
     79  ITX_TYPES_1D,
     80 } UENUM1BYTE(ITX_TYPE_1D);
     81 
     82 static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {
     83  IDCT_1D,      IADST_1D,     IDCT_1D,      IADST_1D,
     84  IFLIPADST_1D, IDCT_1D,      IFLIPADST_1D, IADST_1D,
     85  IFLIPADST_1D, IIDENTITY_1D, IDCT_1D,      IIDENTITY_1D,
     86  IADST_1D,     IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D,
     87 };
     88 
     89 static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
     90  IDCT_1D,      IDCT_1D,      IADST_1D,     IADST_1D,
     91  IDCT_1D,      IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D,
     92  IADST_1D,     IIDENTITY_1D, IIDENTITY_1D, IDCT_1D,
     93  IIDENTITY_1D, IADST_1D,     IIDENTITY_1D, IFLIPADST_1D,
     94 };
     95 
     96 DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = {
     97  0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707,
     98 };
     99 
    100 DECLARE_ALIGNED(16, static const int16_t,
    101                av1_eob_to_eobxy_16x16_default[16]) = {
    102  0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
    103  0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
    104 };
    105 
    106 DECLARE_ALIGNED(16, static const int16_t,
    107                av1_eob_to_eobxy_32x32_default[32]) = {
    108  0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
    109  0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
    110  0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
    111  0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
    112 };
    113 
    114 DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = {
    115  0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
    116  0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07,
    117 };
    118 
    119 DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = {
    120  0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f,
    121 };
    122 
    123 DECLARE_ALIGNED(16, static const int16_t,
    124                av1_eob_to_eobxy_16x32_default[32]) = {
    125  0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
    126  0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
    127  0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
    128  0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
    129 };
    130 
    131 DECLARE_ALIGNED(16, static const int16_t,
    132                av1_eob_to_eobxy_32x16_default[16]) = {
    133  0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
    134  0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
    135 };
    136 
    137 DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = {
    138  0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
    139  0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07,
    140  0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
    141  0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
    142 };
    143 
    144 DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = {
    145  0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f,
    146 };
    147 
    148 DECLARE_ALIGNED(16, static const int16_t *,
    149                av1_eob_to_eobxy_default[TX_SIZES_ALL]) = {
    150  NULL,
    151  av1_eob_to_eobxy_8x8_default,
    152  av1_eob_to_eobxy_16x16_default,
    153  av1_eob_to_eobxy_32x32_default,
    154  av1_eob_to_eobxy_32x32_default,
    155  NULL,
    156  NULL,
    157  av1_eob_to_eobxy_8x16_default,
    158  av1_eob_to_eobxy_16x8_default,
    159  av1_eob_to_eobxy_16x32_default,
    160  av1_eob_to_eobxy_32x16_default,
    161  av1_eob_to_eobxy_32x32_default,
    162  av1_eob_to_eobxy_32x32_default,
    163  NULL,
    164  NULL,
    165  av1_eob_to_eobxy_8x32_default,
    166  av1_eob_to_eobxy_32x8_default,
    167  av1_eob_to_eobxy_16x32_default,
    168  av1_eob_to_eobxy_32x16_default,
    169 };
    170 
    171 static const int lowbd_txfm_all_1d_zeros_idx[32] = {
    172  0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
    173  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    174 };
    175 
    176 // Transform block width in log2 for eob (size of 64 map to 32)
    177 static const int tx_size_wide_log2_eob[TX_SIZES_ALL] = {
    178  2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5,
    179 };
    180 
    181 static inline void get_eobx_eoby_scan_default(int *eobx, int *eoby,
    182                                              TX_SIZE tx_size, int eob) {
    183  if (eob == 1) {
    184    *eobx = 0;
    185    *eoby = 0;
    186    return;
    187  }
    188 
    189  const int tx_w_log2 = tx_size_wide_log2_eob[tx_size];
    190  const int eob_row = (eob - 1) >> tx_w_log2;
    191  const int eobxy = av1_eob_to_eobxy_default[tx_size][eob_row];
    192  *eobx = eobxy & 0xFF;
    193  *eoby = eobxy >> 8;
    194 }
    195 
    196 static const int eob_fill[32] = {
    197  0,  7,  7,  7,  7,  7,  7,  7,  15, 15, 15, 15, 15, 15, 15, 15,
    198  31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
    199 };
    200 
    201 static inline void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby,
    202                                                 TX_SIZE tx_size, int eob) {
    203  eob -= 1;
    204  const int txfm_size_col = tx_size_wide[tx_size];
    205  const int eobx_max = AOMMIN(32, txfm_size_col) - 1;
    206  *eobx = (eob >= eobx_max) ? eobx_max : eob_fill[eob];
    207  const int temp_eoby = eob / (eobx_max + 1);
    208  assert(temp_eoby < 32);
    209  *eoby = eob_fill[temp_eoby];
    210 }
    211 
    212 static inline void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby,
    213                                                 TX_SIZE tx_size, int eob) {
    214  eob -= 1;
    215  const int txfm_size_row = tx_size_high[tx_size];
    216  const int eoby_max = AOMMIN(32, txfm_size_row) - 1;
    217  *eobx = eob_fill[eob / (eoby_max + 1)];
    218  *eoby = (eob >= eoby_max) ? eoby_max : eob_fill[eob];
    219 }
    220 
    221 typedef void (*transform_1d_ssse3)(const __m128i *input, __m128i *output);
    222 
    223 void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output,
    224                                    int stride, TX_TYPE tx_type,
    225                                    TX_SIZE tx_size, int eob);
    226 
    227 void av1_lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t *input, uint8_t *output,
    228                                         int stride, TX_SIZE tx_size);
    229 
    230 void av1_lowbd_inv_txfm2d_add_h_identity_ssse3(const int32_t *input,
    231                                               uint8_t *output, int stride,
    232                                               TX_TYPE tx_type, TX_SIZE tx_size,
    233                                               int eob);
    234 void av1_lowbd_inv_txfm2d_add_v_identity_ssse3(const int32_t *input,
    235                                               uint8_t *output, int stride,
    236                                               TX_TYPE tx_type, TX_SIZE tx_size,
    237                                               int eob);
    238 
    239 void av1_iadst8_low1_ssse3(const __m128i *input, __m128i *output);
    240 
    241 void av1_idct8_low1_ssse3(const __m128i *input, __m128i *output);
    242 
    243 #ifdef __cplusplus
    244 }  // extern "C"
    245 #endif
    246 
    247 #endif  // AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_