/* av1_txfm_sse2.h (13143B) */
1 /* 2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 #ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_ 12 #define AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_ 13 14 #include <emmintrin.h> // SSE2 15 16 #include "config/aom_config.h" 17 #include "config/av1_rtcd.h" 18 19 #include "aom/aom_integer.h" 20 #include "aom_dsp/x86/transpose_sse2.h" 21 #include "aom_dsp/x86/txfm_common_sse2.h" 22 #include "av1/common/av1_txfm.h" 23 24 #ifdef __cplusplus 25 extern "C" { 26 #endif 27 28 static inline void btf_16_w4_sse2( 29 const __m128i *const w0, const __m128i *const w1, const __m128i __rounding, 30 const int8_t cos_bit, const __m128i *const in0, const __m128i *const in1, 31 __m128i *const out0, __m128i *const out1) { 32 const __m128i t0 = _mm_unpacklo_epi16(*in0, *in1); 33 const __m128i u0 = _mm_madd_epi16(t0, *w0); 34 const __m128i v0 = _mm_madd_epi16(t0, *w1); 35 const __m128i a0 = _mm_add_epi32(u0, __rounding); 36 const __m128i b0 = _mm_add_epi32(v0, __rounding); 37 const __m128i c0 = _mm_srai_epi32(a0, cos_bit); 38 const __m128i d0 = _mm_srai_epi32(b0, cos_bit); 39 40 *out0 = _mm_packs_epi32(c0, c0); 41 *out1 = _mm_packs_epi32(d0, c0); 42 } 43 44 #define btf_16_4p_sse2(w0, w1, in0, in1, out0, out1) \ 45 do { \ 46 __m128i t0 = _mm_unpacklo_epi16(in0, in1); \ 47 __m128i u0 = _mm_madd_epi16(t0, w0); \ 48 __m128i v0 = _mm_madd_epi16(t0, w1); \ 49 \ 50 __m128i a0 = _mm_add_epi32(u0, __rounding); \ 51 __m128i b0 = _mm_add_epi32(v0, __rounding); \ 52 \ 53 __m128i c0 = _mm_srai_epi32(a0, cos_bit); \ 54 
__m128i d0 = _mm_srai_epi32(b0, cos_bit); \ 55 \ 56 out0 = _mm_packs_epi32(c0, c0); \ 57 out1 = _mm_packs_epi32(d0, d0); \ 58 } while (0) 59 60 #define btf_16_sse2(w0, w1, in0, in1, out0, out1) \ 61 do { \ 62 __m128i t0 = _mm_unpacklo_epi16(in0, in1); \ 63 __m128i t1 = _mm_unpackhi_epi16(in0, in1); \ 64 __m128i u0 = _mm_madd_epi16(t0, w0); \ 65 __m128i u1 = _mm_madd_epi16(t1, w0); \ 66 __m128i v0 = _mm_madd_epi16(t0, w1); \ 67 __m128i v1 = _mm_madd_epi16(t1, w1); \ 68 \ 69 __m128i a0 = _mm_add_epi32(u0, __rounding); \ 70 __m128i a1 = _mm_add_epi32(u1, __rounding); \ 71 __m128i b0 = _mm_add_epi32(v0, __rounding); \ 72 __m128i b1 = _mm_add_epi32(v1, __rounding); \ 73 \ 74 __m128i c0 = _mm_srai_epi32(a0, cos_bit); \ 75 __m128i c1 = _mm_srai_epi32(a1, cos_bit); \ 76 __m128i d0 = _mm_srai_epi32(b0, cos_bit); \ 77 __m128i d1 = _mm_srai_epi32(b1, cos_bit); \ 78 \ 79 out0 = _mm_packs_epi32(c0, c1); \ 80 out1 = _mm_packs_epi32(d0, d1); \ 81 } while (0) 82 83 static inline __m128i load_16bit_to_16bit(const int16_t *a) { 84 return _mm_load_si128((const __m128i *)a); 85 } 86 87 static inline __m128i load_32bit_to_16bit(const int32_t *a) { 88 const __m128i a_low = _mm_load_si128((const __m128i *)a); 89 return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4)); 90 } 91 92 static inline __m128i load_32bit_to_16bit_w4(const int32_t *a) { 93 const __m128i a_low = _mm_load_si128((const __m128i *)a); 94 return _mm_packs_epi32(a_low, a_low); 95 } 96 97 // Store 4 16 bit values. Sign extend the values. 98 static inline void store_16bit_to_32bit_w4(const __m128i a, int32_t *const b) { 99 const __m128i a_lo = _mm_unpacklo_epi16(a, a); 100 const __m128i a_1 = _mm_srai_epi32(a_lo, 16); 101 _mm_store_si128((__m128i *)b, a_1); 102 } 103 104 // Store 8 16 bit values. Sign extend the values. 
static inline void store_16bit_to_32bit(__m128i a, int32_t *b) {
  // Sign-extend each 16-bit lane to 32 bits: duplicate the lane via
  // unpack, then arithmetic-shift the duplicated copy down by 16.
  const __m128i a_lo = _mm_unpacklo_epi16(a, a);
  const __m128i a_hi = _mm_unpackhi_epi16(a, a);
  const __m128i a_1 = _mm_srai_epi32(a_lo, 16);
  const __m128i a_2 = _mm_srai_epi32(a_hi, 16);
  _mm_store_si128((__m128i *)b, a_1);
  _mm_store_si128((__m128i *)(b + 4), a_2);
}

// Rounded fixed-point multiply by `scale`: expects `a` to hold 16-bit
// values interleaved with the constant 1 (see the store_rect_* callers
// below), so _mm_madd_epi16 yields value*scale + (1 << (NewSqrt2Bits - 1)),
// then shifts right by NewSqrt2Bits.
static inline __m128i scale_round_sse2(const __m128i a, const int scale) {
  const __m128i scale_rounding = pair_set_epi16(scale, 1 << (NewSqrt2Bits - 1));
  const __m128i b = _mm_madd_epi16(a, scale_rounding);
  return _mm_srai_epi32(b, NewSqrt2Bits);
}

// Scale 4 16-bit values by NewSqrt2 (with rounding) and store as 32-bit.
static inline void store_rect_16bit_to_32bit_w4(const __m128i a,
                                                int32_t *const b) {
  const __m128i one = _mm_set1_epi16(1);
  // Interleave with 1 so scale_round_sse2's madd adds the rounding term.
  const __m128i a_lo = _mm_unpacklo_epi16(a, one);
  const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2);
  _mm_store_si128((__m128i *)b, b_lo);
}

// Scale 8 16-bit values by NewSqrt2 (with rounding) and store as 32-bit.
static inline void store_rect_16bit_to_32bit(const __m128i a,
                                             int32_t *const b) {
  const __m128i one = _mm_set1_epi16(1);
  const __m128i a_lo = _mm_unpacklo_epi16(a, one);
  const __m128i a_hi = _mm_unpackhi_epi16(a, one);
  const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2);
  const __m128i b_hi = scale_round_sse2(a_hi, NewSqrt2);
  _mm_store_si128((__m128i *)b, b_lo);
  _mm_store_si128((__m128i *)(b + 4), b_hi);
}

// Load `out_size` rows of 4 16-bit values each (64-bit loads).
static inline void load_buffer_16bit_to_16bit_w4(const int16_t *const in,
                                                 const int stride,
                                                 __m128i *const out,
                                                 const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    out[i] = _mm_loadl_epi64((const __m128i *)(in + i * stride));
  }
}

// Same as above, but writes the rows into `out` in reverse order.
static inline void load_buffer_16bit_to_16bit_w4_flip(const int16_t *const in,
                                                      const int stride,
                                                      __m128i *const out,
                                                      const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    out[out_size - i - 1] = _mm_loadl_epi64((const __m128i *)(in + i * stride));
  }
}

// Load `out_size` rows of 8 16-bit values each.
static inline void load_buffer_16bit_to_16bit(const int16_t *in, int stride,
                                              __m128i *out, int out_size) {
  for (int i = 0; i < out_size; ++i) {
    out[i] = load_16bit_to_16bit(in + i * stride);
  }
}

// Same as above, but writes the rows into `out` in reverse order.
static inline void load_buffer_16bit_to_16bit_flip(const int16_t *in,
                                                   int stride, __m128i *out,
                                                   int out_size) {
  for (int i = 0; i < out_size; ++i) {
    out[out_size - i - 1] = load_16bit_to_16bit(in + i * stride);
  }
}

// Load `out_size` rows of 8 32-bit values, packed down to 16 bits.
static inline void load_buffer_32bit_to_16bit(const int32_t *in, int stride,
                                              __m128i *out, int out_size) {
  for (int i = 0; i < out_size; ++i) {
    out[i] = load_32bit_to_16bit(in + i * stride);
  }
}

// Load `out_size` rows of 4 32-bit values, packed down to 16 bits.
static inline void load_buffer_32bit_to_16bit_w4(const int32_t *in, int stride,
                                                 __m128i *out, int out_size) {
  for (int i = 0; i < out_size; ++i) {
    out[i] = load_32bit_to_16bit_w4(in + i * stride);
  }
}

// Same as load_buffer_32bit_to_16bit, but writes rows in reverse order.
static inline void load_buffer_32bit_to_16bit_flip(const int32_t *in,
                                                   int stride, __m128i *out,
                                                   int out_size) {
  for (int i = 0; i < out_size; ++i) {
    out[out_size - i - 1] = load_32bit_to_16bit(in + i * stride);
  }
}

// Store `out_size` rows of 4 values, sign-extended to 32 bits.
static inline void store_buffer_16bit_to_32bit_w4(const __m128i *const in,
                                                  int32_t *const out,
                                                  const int stride,
                                                  const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    store_16bit_to_32bit_w4(in[i], out + i * stride);
  }
}

// Store `out_size` rows of 8 values, sign-extended to 32 bits.
static inline void store_buffer_16bit_to_32bit_w8(const __m128i *const in,
                                                  int32_t *const out,
                                                  const int stride,
                                                  const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    store_16bit_to_32bit(in[i], out + i * stride);
  }
}

// Store `out_size` rows of 4 values, scaled by NewSqrt2 (rect variant).
static inline void store_rect_buffer_16bit_to_32bit_w4(const __m128i *const in,
                                                       int32_t *const out,
                                                       const int stride,
                                                       const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    store_rect_16bit_to_32bit_w4(in[i], out + i * stride);
  }
}

// Store `out_size` rows of 8 values, scaled by NewSqrt2 (rect variant).
static inline void store_rect_buffer_16bit_to_32bit_w8(const __m128i *const in,
                                                       int32_t *const out,
                                                       const int stride,
                                                       const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    store_rect_16bit_to_32bit(in[i], out + i * stride);
  }
}

// Store an 8x8 block of 16-bit values (aligned stores, no conversion).
static inline void store_buffer_16bit_to_16bit_8x8(const __m128i *in,
                                                   uint16_t *out,
                                                   const int stride) {
  for (int i = 0; i < 8; ++i) {
    _mm_store_si128((__m128i *)(out + i * stride), in[i]);
  }
}

// Round-shift `size` registers of 16-bit values by |bit|:
//   bit < 0: rounded right shift (saturating add of the rounding constant),
//   bit > 0: left shift,
//   bit == 0: no-op.
static inline void round_shift_16bit(__m128i *in, int size, int bit) {
  if (bit < 0) {
    bit = -bit;
    __m128i rounding = _mm_set1_epi16(1 << (bit - 1));
    for (int i = 0; i < size; ++i) {
      in[i] = _mm_adds_epi16(in[i], rounding);
      in[i] = _mm_srai_epi16(in[i], bit);
    }
  } else if (bit > 0) {
    for (int i = 0; i < size; ++i) {
      in[i] = _mm_slli_epi16(in[i], bit);
    }
  }
}

// Copy `size` registers from `in` to `out` in reverse order.
static inline void flip_buf_sse2(__m128i *in, __m128i *out, int size) {
  for (int i = 0; i < size; ++i) {
    out[size - i - 1] = in[i];
  }
}

// Low-bitdepth forward 2D transform entry points, one per transform size
// (width x height). Implementations live in the corresponding .c files.
void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output,
                                   int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output,
                                   int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output,
                                   int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output,
                                   int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_16x32_sse2(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_32x8_sse2(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_32x16_sse2(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_16x64_sse2(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_64x16_sse2(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd);

// Signature of a 1D transform stage operating on __m128i rows.
typedef void (*transform_1d_sse2)(const __m128i *input, __m128i *output,
                                  int8_t cos_bit);

void av1_iadst8_sse2(const __m128i *input, __m128i *output);

void av1_idct8_sse2(const __m128i *input, __m128i *output);

typedef struct {
  transform_1d_sse2 col, row;  // vertical and horizontal
} transform_2d_sse2;

#ifdef __cplusplus
}
#endif  // __cplusplus
#endif  // AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_