sinc_resampler_avx2.cc (2561B)
1 /* 2 * Copyright (c) 2020 The WebRTC project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include <immintrin.h> 12 #include <xmmintrin.h> 13 14 #include <cstddef> 15 #include <cstdint> 16 17 #include "common_audio/resampler/sinc_resampler.h" 18 19 namespace webrtc { 20 21 float SincResampler::Convolve_AVX2(const float* input_ptr, 22 const float* k1, 23 const float* k2, 24 double kernel_interpolation_factor) { 25 __m256 m_input; 26 __m256 m_sums1 = _mm256_setzero_ps(); 27 __m256 m_sums2 = _mm256_setzero_ps(); 28 29 // Based on `input_ptr` alignment, we need to use loadu or load. Unrolling 30 // these loops has not been tested or benchmarked. 31 bool aligned_input = (reinterpret_cast<uintptr_t>(input_ptr) & 0x1F) == 0; 32 if (!aligned_input) { 33 for (size_t i = 0; i < kKernelSize; i += 8) { 34 m_input = _mm256_loadu_ps(input_ptr + i); 35 m_sums1 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k1 + i), m_sums1); 36 m_sums2 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k2 + i), m_sums2); 37 } 38 } else { 39 for (size_t i = 0; i < kKernelSize; i += 8) { 40 m_input = _mm256_load_ps(input_ptr + i); 41 m_sums1 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k1 + i), m_sums1); 42 m_sums2 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k2 + i), m_sums2); 43 } 44 } 45 46 // Linearly interpolate the two "convolutions". 47 __m128 m128_sums1 = _mm_add_ps(_mm256_extractf128_ps(m_sums1, 0), 48 _mm256_extractf128_ps(m_sums1, 1)); 49 __m128 m128_sums2 = _mm_add_ps(_mm256_extractf128_ps(m_sums2, 0), 50 _mm256_extractf128_ps(m_sums2, 1)); 51 m128_sums1 = _mm_mul_ps( 52 m128_sums1, 53 _mm_set_ps1(static_cast<float>(1.0 - kernel_interpolation_factor))); 54 m128_sums2 = _mm_mul_ps( 55 m128_sums2, _mm_set_ps1(static_cast<float>(kernel_interpolation_factor))); 56 m128_sums1 = _mm_add_ps(m128_sums1, m128_sums2); 57 58 // Sum components together. 59 float result; 60 m128_sums2 = _mm_add_ps(_mm_movehl_ps(m128_sums1, m128_sums1), m128_sums1); 61 _mm_store_ss(&result, _mm_add_ss(m128_sums2, 62 _mm_shuffle_ps(m128_sums2, m128_sums2, 1))); 63 64 return result; 65 } 66 67 } // namespace webrtc