tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

sinc_resampler_avx2.cc (2561B)


      1 /*
      2 *  Copyright (c) 2020 The WebRTC project authors. All Rights Reserved.
      3 *
      4 *  Use of this source code is governed by a BSD-style license
      5 *  that can be found in the LICENSE file in the root of the source
      6 *  tree. An additional intellectual property rights grant can be found
      7 *  in the file PATENTS.  All contributing project authors may
      8 *  be found in the AUTHORS file in the root of the source tree.
      9 */
     10 
     11 #include <immintrin.h>
     12 #include <xmmintrin.h>
     13 
     14 #include <cstddef>
     15 #include <cstdint>
     16 
     17 #include "common_audio/resampler/sinc_resampler.h"
     18 
     19 namespace webrtc {
     20 
     21 float SincResampler::Convolve_AVX2(const float* input_ptr,
     22                                   const float* k1,
     23                                   const float* k2,
     24                                   double kernel_interpolation_factor) {
     25  __m256 m_input;
     26  __m256 m_sums1 = _mm256_setzero_ps();
     27  __m256 m_sums2 = _mm256_setzero_ps();
     28 
     29  // Based on `input_ptr` alignment, we need to use loadu or load.  Unrolling
     30  // these loops has not been tested or benchmarked.
     31  bool aligned_input = (reinterpret_cast<uintptr_t>(input_ptr) & 0x1F) == 0;
     32  if (!aligned_input) {
     33    for (size_t i = 0; i < kKernelSize; i += 8) {
     34      m_input = _mm256_loadu_ps(input_ptr + i);
     35      m_sums1 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k1 + i), m_sums1);
     36      m_sums2 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k2 + i), m_sums2);
     37    }
     38  } else {
     39    for (size_t i = 0; i < kKernelSize; i += 8) {
     40      m_input = _mm256_load_ps(input_ptr + i);
     41      m_sums1 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k1 + i), m_sums1);
     42      m_sums2 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k2 + i), m_sums2);
     43    }
     44  }
     45 
     46  // Linearly interpolate the two "convolutions".
     47  __m128 m128_sums1 = _mm_add_ps(_mm256_extractf128_ps(m_sums1, 0),
     48                                 _mm256_extractf128_ps(m_sums1, 1));
     49  __m128 m128_sums2 = _mm_add_ps(_mm256_extractf128_ps(m_sums2, 0),
     50                                 _mm256_extractf128_ps(m_sums2, 1));
     51  m128_sums1 = _mm_mul_ps(
     52      m128_sums1,
     53      _mm_set_ps1(static_cast<float>(1.0 - kernel_interpolation_factor)));
     54  m128_sums2 = _mm_mul_ps(
     55      m128_sums2, _mm_set_ps1(static_cast<float>(kernel_interpolation_factor)));
     56  m128_sums1 = _mm_add_ps(m128_sums1, m128_sums2);
     57 
     58  // Sum components together.
     59  float result;
     60  m128_sums2 = _mm_add_ps(_mm_movehl_ps(m128_sums1, m128_sums1), m128_sums1);
     61  _mm_store_ss(&result, _mm_add_ss(m128_sums2,
     62                                   _mm_shuffle_ps(m128_sums2, m128_sums2, 1)));
     63 
     64  return result;
     65 }
     66 
     67 }  // namespace webrtc