tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

convolve_ssse3.h (2115B)


      1 /*
      2 * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #ifndef AOM_AOM_DSP_X86_CONVOLVE_SSSE3_H_
     13 #define AOM_AOM_DSP_X86_CONVOLVE_SSSE3_H_
     14 
     15 #include <tmmintrin.h>  // SSSE3
     16 
     // Expands the eight 16-bit filter taps at `filter` into four shuffled vectors.
     //
     // f[i] receives, in every 16-bit lane, source byte 4*i in its low byte and
     // source byte 4*i + 2 in its high byte, i.e. the low bytes of taps 2*i and
     // 2*i + 1 repeated across the register. The high byte of each tap is dropped.
     //
     // `filter` is read with an aligned 128-bit load, so it must point to at least
     // 16 readable bytes on a 16-byte boundary. `f` must have room for four
     // __m128i values.
     static inline void shuffle_filter_ssse3(const int16_t *const filter,
                                             __m128i *const f) {
       const __m128i taps = _mm_load_si128((const __m128i *)filter);

       // Byte-shuffle controls, one per tap pair. Within each 16-bit lane the low
       // control byte picks the even tap and the high control byte the odd tap.
       const __m128i pick_pair0 = _mm_set1_epi16(0x0200u);
       const __m128i pick_pair1 = _mm_set1_epi16(0x0604u);
       const __m128i pick_pair2 = _mm_set1_epi16(0x0a08u);
       const __m128i pick_pair3 = _mm_set1_epi16(0x0e0cu);

       f[0] = _mm_shuffle_epi8(taps, pick_pair0);
       f[1] = _mm_shuffle_epi8(taps, pick_pair1);
       f[2] = _mm_shuffle_epi8(taps, pick_pair2);
       f[3] = _mm_shuffle_epi8(taps, pick_pair3);
     }
     26 
     // Combines four source vectors with four filter vectors into eight rounded
     // 16-bit results.
     //
     // Each s[i] / f[i] pair goes through _mm_maddubs_epi16, which multiplies the
     // unsigned bytes of s[i] by the signed bytes of f[i] and adds every adjacent
     // pair of products into one 16-bit lane. The four partial results are then
     // summed together with a rounding offset of 64 and shifted right
     // arithmetically by 7 bits.
     //
     // NOTE(review): f is presumably the output of shuffle_filter_ssse3 -- confirm
     // against callers.
     static inline __m128i convolve8_8_ssse3(const __m128i *const s,
                                             const __m128i *const f) {
       const __m128i pair0 = _mm_maddubs_epi16(s[0], f[0]);
       const __m128i pair1 = _mm_maddubs_epi16(s[1], f[1]);
       const __m128i pair2 = _mm_maddubs_epi16(s[2], f[2]);
       const __m128i pair3 = _mm_maddubs_epi16(s[3], f[3]);
       const __m128i round_offset = _mm_set1_epi16(1 << 6);

       // Non-saturating adds. Pairing 0 with 2 and 1 with 3 is the only grouping
       // that keeps these intermediate sums in range for all filters.
       const __m128i sum_02 = _mm_add_epi16(pair0, pair2);
       const __m128i sum_13 = _mm_add_epi16(pair1, pair3);

       // Fold the rounding offset in before the last add so that only one
       // saturating addition is required.
       const __m128i sum_02_rounded = _mm_add_epi16(sum_02, round_offset);

       // Single saturating add, then divide each 16-bit lane by 128.
       return _mm_srai_epi16(_mm_adds_epi16(sum_02_rounded, sum_13), 7);
     }
     49 
     50 #endif  // AOM_AOM_DSP_X86_CONVOLVE_SSSE3_H_