av1_fwd_txfm_avx2.h (4197B)
/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_
#define AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_

#include <immintrin.h>
#include <stdint.h>  // int32_t; keeps this header self-contained.

// Butterfly helpers operating on eight packed 32-bit lanes per __m256i.
//
// All four helpers update *in0 and *in1 in place. Products keep only the low
// 32 bits of each lane (_mm256_mullo_epi32), _r is added to each combination
// before it is arithmetically shifted right by cos_bit.
// NOTE(review): _r is presumably the rounding offset 1 << (cos_bit - 1);
// confirm against the callers.

// Weights already broadcast across the vector.
// out0 = (in0*ww0 + in1*ww1 + _r) >> cos_bit
// out1 = (-in1*ww0 + in0*ww1 + _r) >> cos_bit
static inline void btf_32_avx2_type0_new(const __m256i ww0, const __m256i ww1,
                                         __m256i *in0, __m256i *in1,
                                         const __m256i _r,
                                         const int32_t cos_bit) {
  // Read both inputs up front: each output depends on both of them, so
  // neither may be overwritten before the second combination is formed.
  const __m256i a = *in0;
  const __m256i b = *in1;

  const __m256i sum = _mm256_add_epi32(_mm256_mullo_epi32(a, ww0),
                                       _mm256_mullo_epi32(b, ww1));
  const __m256i diff = _mm256_sub_epi32(_mm256_mullo_epi32(a, ww1),
                                        _mm256_mullo_epi32(b, ww0));

  *in0 = _mm256_srai_epi32(_mm256_add_epi32(sum, _r), cos_bit);
  *in1 = _mm256_srai_epi32(_mm256_add_epi32(diff, _r), cos_bit);
}

// Weights already broadcast across the vector.
// out0 = (in0*ww0 + in1*ww1 + _r) >> cos_bit
// out1 = (in1*ww0 - in0*ww1 + _r) >> cos_bit
static inline void btf_32_avx2_type1_new(const __m256i ww0, const __m256i ww1,
                                         __m256i *in0, __m256i *in1,
                                         const __m256i _r,
                                         const int32_t cos_bit) {
  // Read both inputs up front: each output depends on both of them, so
  // neither may be overwritten before the second combination is formed.
  const __m256i a = *in0;
  const __m256i b = *in1;

  const __m256i sum = _mm256_add_epi32(_mm256_mullo_epi32(a, ww0),
                                       _mm256_mullo_epi32(b, ww1));
  // Same terms as type0, subtracted in the opposite order.
  const __m256i diff = _mm256_sub_epi32(_mm256_mullo_epi32(b, ww0),
                                        _mm256_mullo_epi32(a, ww1));

  *in0 = _mm256_srai_epi32(_mm256_add_epi32(sum, _r), cos_bit);
  *in1 = _mm256_srai_epi32(_mm256_add_epi32(diff, _r), cos_bit);
}

// Scalar-weight form: w0 and w1 are broadcast to every lane, then the
// vector-weight helper does the arithmetic.
// out0 = (in0*w0 + in1*w1 + _r) >> cos_bit
// out1 = (-in1*w0 + in0*w1 + _r) >> cos_bit
static inline void btf_32_avx2_type0(const int32_t w0, const int32_t w1,
                                     __m256i *in0, __m256i *in1,
                                     const __m256i _r, const int32_t cos_bit) {
  btf_32_avx2_type0_new(_mm256_set1_epi32(w0), _mm256_set1_epi32(w1), in0, in1,
                        _r, cos_bit);
}

// Scalar-weight form: w0 and w1 are broadcast to every lane, then the
// vector-weight helper does the arithmetic.
// out0 = (in0*w0 + in1*w1 + _r) >> cos_bit
// out1 = (in1*w0 - in0*w1 + _r) >> cos_bit
static inline void btf_32_avx2_type1(const int32_t w0, const int32_t w1,
                                     __m256i *in0, __m256i *in1,
                                     const __m256i _r, const int32_t cos_bit) {
  btf_32_avx2_type1_new(_mm256_set1_epi32(w0), _mm256_set1_epi32(w1), in0, in1,
                        _r, cos_bit);
}

#endif  // AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_