fft_sse2.c (7300B)
1 /* 2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 s * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <xmmintrin.h> 13 14 #include "config/aom_dsp_rtcd.h" 15 #include "aom_dsp/aom_dsp_common.h" 16 #include "aom_dsp/fft_common.h" 17 18 static inline void transpose4x4(const float *A, float *B, const int lda, 19 const int ldb) { 20 __m128 row1 = _mm_load_ps(&A[0 * lda]); 21 __m128 row2 = _mm_load_ps(&A[1 * lda]); 22 __m128 row3 = _mm_load_ps(&A[2 * lda]); 23 __m128 row4 = _mm_load_ps(&A[3 * lda]); 24 _MM_TRANSPOSE4_PS(row1, row2, row3, row4); 25 _mm_store_ps(&B[0 * ldb], row1); 26 _mm_store_ps(&B[1 * ldb], row2); 27 _mm_store_ps(&B[2 * ldb], row3); 28 _mm_store_ps(&B[3 * ldb], row4); 29 } 30 31 // Referenced by fft_avx2.c. 32 void aom_transpose_float_sse2(const float *A, float *B, int n); 33 34 void aom_transpose_float_sse2(const float *A, float *B, int n) { 35 for (int y = 0; y < n; y += 4) { 36 for (int x = 0; x < n; x += 4) { 37 transpose4x4(A + y * n + x, B + x * n + y, n, n); 38 } 39 } 40 } 41 42 // Referenced by fft_avx2.c. 43 void aom_fft_unpack_2d_output_sse2(const float *packed, float *output, int n); 44 45 void aom_fft_unpack_2d_output_sse2(const float *packed, float *output, int n) { 46 const int n2 = n / 2; 47 output[0] = packed[0]; 48 output[1] = 0; 49 output[2 * (n2 * n)] = packed[n2 * n]; 50 output[2 * (n2 * n) + 1] = 0; 51 52 output[2 * n2] = packed[n2]; 53 output[2 * n2 + 1] = 0; 54 output[2 * (n2 * n + n2)] = packed[n2 * n + n2]; 55 output[2 * (n2 * n + n2) + 1] = 0; 56 57 for (int c = 1; c < n2; ++c) { 58 output[2 * (0 * n + c)] = packed[c]; 59 output[2 * (0 * n + c) + 1] = packed[c + n2]; 60 output[2 * (n2 * n + c) + 0] = packed[n2 * n + c]; 61 output[2 * (n2 * n + c) + 1] = packed[n2 * n + c + n2]; 62 } 63 for (int r = 1; r < n2; ++r) { 64 output[2 * (r * n + 0)] = packed[r * n]; 65 output[2 * (r * n + 0) + 1] = packed[(r + n2) * n]; 66 output[2 * (r * n + n2) + 0] = packed[r * n + n2]; 67 output[2 * (r * n + n2) + 1] = packed[(r + n2) * n + n2]; 68 69 for (int c = 1; c < AOMMIN(n2, 4); ++c) { 70 output[2 * (r * n + c)] = 71 packed[r * n + c] - packed[(r + n2) * n + c + n2]; 72 output[2 * (r * n + c) + 1] = 73 packed[(r + n2) * n + c] + packed[r * n + c + n2]; 74 } 75 76 for (int c = 4; c < n2; c += 4) { 77 __m128 real1 = _mm_load_ps(packed + r * n + c); 78 __m128 real2 = _mm_load_ps(packed + (r + n2) * n + c + n2); 79 __m128 imag1 = _mm_load_ps(packed + (r + n2) * n + c); 80 __m128 imag2 = _mm_load_ps(packed + r * n + c + n2); 81 real1 = _mm_sub_ps(real1, real2); 82 imag1 = _mm_add_ps(imag1, imag2); 83 _mm_store_ps(output + 2 * (r * n + c), _mm_unpacklo_ps(real1, imag1)); 84 _mm_store_ps(output + 2 * (r * n + c + 2), _mm_unpackhi_ps(real1, imag1)); 85 } 86 87 int r2 = r + n2; 88 int r3 = n - r2; 89 output[2 * (r2 * n + 0)] = packed[r3 * n]; 90 output[2 * (r2 * n + 0) + 1] = -packed[(r3 + n2) * n]; 91 output[2 * (r2 * n + n2)] = packed[r3 * n + n2]; 92 output[2 * (r2 * n + n2) + 1] = -packed[(r3 + n2) * n + n2]; 93 for (int c = 1; c < AOMMIN(4, n2); ++c) { 94 output[2 * (r2 * n + c)] = 95 packed[r3 * n + c] + packed[(r3 + n2) * n + c + n2]; 96 output[2 * (r2 * n + c) + 1] = 97 -packed[(r3 + n2) * n + c] + packed[r3 * n + c + n2]; 98 } 99 for (int c = 4; c < n2; c += 4) { 100 __m128 real1 = _mm_load_ps(packed + r3 * n + c); 101 __m128 real2 = _mm_load_ps(packed + (r3 + n2) * n + c + n2); 102 __m128 imag1 = _mm_load_ps(packed + (r3 + n2) * n + c); 103 __m128 imag2 = _mm_load_ps(packed + r3 * n + c + n2); 104 real1 = _mm_add_ps(real1, real2); 105 imag1 = _mm_sub_ps(imag2, imag1); 106 _mm_store_ps(output + 2 * (r2 * n + c), _mm_unpacklo_ps(real1, imag1)); 107 _mm_store_ps(output + 2 * (r2 * n + c + 2), 108 _mm_unpackhi_ps(real1, imag1)); 109 } 110 } 111 } 112 113 // Generate definitions for 1d transforms using float and __mm128 114 GEN_FFT_4(static inline void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, 115 _mm_set1_ps, _mm_add_ps, _mm_sub_ps) 116 GEN_FFT_8(static inline void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, 117 _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps) 118 GEN_FFT_16(static inline void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, 119 _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps) 120 GEN_FFT_32(static inline void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, 121 _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps) 122 123 void aom_fft4x4_float_sse2(const float *input, float *temp, float *output) { 124 aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_sse2, 125 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4); 126 } 127 128 void aom_fft8x8_float_sse2(const float *input, float *temp, float *output) { 129 aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_sse2, 130 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4); 131 } 132 133 void aom_fft16x16_float_sse2(const float *input, float *temp, float *output) { 134 aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_sse2, 135 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4); 136 } 137 138 void aom_fft32x32_float_sse2(const float *input, float *temp, float *output) { 139 aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_sse2, 140 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4); 141 } 142 143 // Generate definitions for 1d inverse transforms using float and mm128 144 GEN_IFFT_4(static inline void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, 145 _mm_set1_ps, _mm_add_ps, _mm_sub_ps) 146 GEN_IFFT_8(static inline void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, 147 _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps) 148 GEN_IFFT_16(static inline void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, 149 _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps) 150 GEN_IFFT_32(static inline void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, 151 _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps) 152 153 void aom_ifft4x4_float_sse2(const float *input, float *temp, float *output) { 154 aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_sse2, 155 aom_ifft1d_4_sse2, aom_transpose_float_sse2, 4); 156 } 157 158 void aom_ifft8x8_float_sse2(const float *input, float *temp, float *output) { 159 aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_sse2, 160 aom_ifft1d_8_sse2, aom_transpose_float_sse2, 4); 161 } 162 163 void aom_ifft16x16_float_sse2(const float *input, float *temp, float *output) { 164 aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float, 165 aom_fft1d_16_sse2, aom_ifft1d_16_sse2, 166 aom_transpose_float_sse2, 4); 167 } 168 169 void aom_ifft32x32_float_sse2(const float *input, float *temp, float *output) { 170 aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, 171 aom_fft1d_32_sse2, aom_ifft1d_32_sse2, 172 aom_transpose_float_sse2, 4); 173 }