tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

avg_intrin_sse4.c (2183B)


      1 /*
      2 * Copyright (c) 2022, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <smmintrin.h>
     13 
     14 #include "config/aom_dsp_rtcd.h"
     15 
     16 // ref: [0 - 510]
     17 // src: [0 - 510]
     18 // bwl: {2, 3, 4, 5}
     19 int aom_vector_var_sse4_1(const int16_t *ref, const int16_t *src, int bwl) {
     20  const int width = 4 << bwl;
     21  assert(width % 16 == 0);
     22 
     23  const __m128i k_one_epi16 = _mm_set1_epi16((int16_t)1);
     24  __m128i mean = _mm_setzero_si128();
     25  __m128i sse = _mm_setzero_si128();
     26 
     27  for (int i = 0; i < width; i += 16) {
     28    const __m128i src_line = _mm_loadu_si128((const __m128i *)src);
     29    const __m128i ref_line = _mm_loadu_si128((const __m128i *)ref);
     30    const __m128i src_line2 = _mm_loadu_si128((const __m128i *)(src + 8));
     31    const __m128i ref_line2 = _mm_loadu_si128((const __m128i *)(ref + 8));
     32    __m128i diff = _mm_sub_epi16(ref_line, src_line);
     33    const __m128i diff2 = _mm_sub_epi16(ref_line2, src_line2);
     34    __m128i diff_sqr = _mm_madd_epi16(diff, diff);
     35    const __m128i diff_sqr2 = _mm_madd_epi16(diff2, diff2);
     36 
     37    diff = _mm_add_epi16(diff, diff2);
     38    diff_sqr = _mm_add_epi32(diff_sqr, diff_sqr2);
     39    sse = _mm_add_epi32(sse, diff_sqr);
     40    mean = _mm_add_epi16(mean, diff);
     41 
     42    src += 16;
     43    ref += 16;
     44  }
     45 
     46  // m0 m1 m2 m3
     47  mean = _mm_madd_epi16(mean, k_one_epi16);
     48  // m0+m1 m2+m3 s0+s1 s2+s3
     49  __m128i result = _mm_hadd_epi32(mean, sse);
     50  // m0+m1+m2+m3 s0+s1+s2+s3 x x
     51  result = _mm_add_epi32(result, _mm_bsrli_si128(result, 4));
     52 
     53  // (mean * mean): dynamic range 31 bits.
     54  const int mean_int = _mm_extract_epi32(result, 0);
     55  const int sse_int = _mm_extract_epi32(result, 2);
     56  const unsigned int mean_abs = abs(mean_int);
     57  const int var = sse_int - ((mean_abs * mean_abs) >> (bwl + 2));
     58  return var;
     59 }