av1_txfm1d_sse4.h (4913B)
1 /* 2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #ifndef AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_ 13 #define AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_ 14 15 #include <smmintrin.h> 16 #include "av1/common/av1_txfm.h" 17 #include "av1/common/x86/av1_txfm_sse4.h" 18 19 #ifdef __cplusplus 20 extern "C" { 21 #endif 22 23 void av1_fdct32_sse4_1(__m128i *input, __m128i *output, int cos_bit, 24 const int stride); 25 void av1_fdct64_sse4_1(__m128i *input, __m128i *output, int8_t cos_bit, 26 const int instride, const int outstride); 27 28 void av1_idtx32_sse4_1(__m128i *input, __m128i *output, int cos_bit, 29 const int col_num); 30 31 static inline void transpose_32_4x4(int stride, const __m128i *input, 32 __m128i *output) { 33 __m128i temp0 = _mm_unpacklo_epi32(input[0 * stride], input[2 * stride]); 34 __m128i temp1 = _mm_unpackhi_epi32(input[0 * stride], input[2 * stride]); 35 __m128i temp2 = _mm_unpacklo_epi32(input[1 * stride], input[3 * stride]); 36 __m128i temp3 = _mm_unpackhi_epi32(input[1 * stride], input[3 * stride]); 37 38 output[0 * stride] = _mm_unpacklo_epi32(temp0, temp2); 39 output[1 * stride] = _mm_unpackhi_epi32(temp0, temp2); 40 output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3); 41 output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3); 42 } 43 44 // the entire input block can be represent by a grid of 4x4 blocks 45 // each 4x4 blocks can be represent by 4 vertical __m128i 46 // we first transpose each 4x4 block internally 47 // then transpose the grid 48 static inline void transpose_32(int txfm_size, const __m128i *input, 49 __m128i *output) { 50 const int num_per_128 = 4; 51 const int row_size = txfm_size; 52 const int col_size = txfm_size / num_per_128; 53 int r, c; 54 55 // transpose each 4x4 block internally 56 for (r = 0; r < row_size; r += 4) { 57 for (c = 0; c < col_size; c++) { 58 transpose_32_4x4(col_size, &input[r * col_size + c], 59 &output[c * 4 * col_size + r / 4]); 60 } 61 } 62 } 63 64 // out0 = in0*w0 + in1*w1 65 // out1 = -in1*w0 + in0*w1 66 #define btf_32_sse4_1_type0(w0, w1, in0, in1, out0, out1, bit) \ 67 do { \ 68 const __m128i ww0 = _mm_set1_epi32(w0); \ 69 const __m128i ww1 = _mm_set1_epi32(w1); \ 70 const __m128i in0_w0 = _mm_mullo_epi32(in0, ww0); \ 71 const __m128i in1_w1 = _mm_mullo_epi32(in1, ww1); \ 72 out0 = _mm_add_epi32(in0_w0, in1_w1); \ 73 out0 = av1_round_shift_32_sse4_1(out0, bit); \ 74 const __m128i in0_w1 = _mm_mullo_epi32(in0, ww1); \ 75 const __m128i in1_w0 = _mm_mullo_epi32(in1, ww0); \ 76 out1 = _mm_sub_epi32(in0_w1, in1_w0); \ 77 out1 = av1_round_shift_32_sse4_1(out1, bit); \ 78 } while (0) 79 80 // out0 = in0*w0 + in1*w1 81 // out1 = in1*w0 - in0*w1 82 #define btf_32_sse4_1_type1(w0, w1, in0, in1, out0, out1, bit) \ 83 do { \ 84 btf_32_sse4_1_type0(w1, w0, in1, in0, out0, out1, bit); \ 85 } while (0) 86 87 // out0 = in0*w0 + in1*w1 88 // out1 = -in1*w0 + in0*w1 89 #define btf_32_type0_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) \ 90 do { \ 91 const __m128i in0_w0 = _mm_mullo_epi32(in0, ww0); \ 92 const __m128i in1_w1 = _mm_mullo_epi32(in1, ww1); \ 93 out0 = _mm_add_epi32(in0_w0, in1_w1); \ 94 out0 = _mm_add_epi32(out0, r); \ 95 out0 = _mm_srai_epi32(out0, bit); \ 96 const __m128i in0_w1 = _mm_mullo_epi32(in0, ww1); \ 97 const __m128i in1_w0 = _mm_mullo_epi32(in1, ww0); \ 98 out1 = _mm_sub_epi32(in0_w1, in1_w0); \ 99 out1 = _mm_add_epi32(out1, r); \ 100 out1 = _mm_srai_epi32(out1, bit); \ 101 } while (0) 102 103 // out0 = in0*w0 + in1*w1 104 // out1 = in1*w0 - in0*w1 105 #define btf_32_type1_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) \ 106 do { \ 107 btf_32_type0_sse4_1_new(ww1, ww0, in1, in0, out0, out1, r, bit); \ 108 } while (0) 109 110 #ifdef __cplusplus 111 } 112 #endif 113 114 #endif // AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_