tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

wedge_utils_sse2.c (9630B)


      1 /*
      2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <assert.h>
     13 #include <immintrin.h>
     14 
     15 #include "aom_dsp/x86/synonyms.h"
     16 
     17 #include "aom/aom_integer.h"
     18 
     19 #include "av1/common/reconinter.h"
     20 
     21 #define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
     22 
     23 /**
     24 * See av1_wedge_sse_from_residuals_c
     25 */
     26 uint64_t av1_wedge_sse_from_residuals_sse2(const int16_t *r1, const int16_t *d,
     27                                           const uint8_t *m, int N) {
     28  int n = -N;
     29  int n8 = n + 8;
     30 
     31  uint64_t csse;
     32 
     33  const __m128i v_mask_max_w = _mm_set1_epi16(MAX_MASK_VALUE);
     34  const __m128i v_zext_q = _mm_set1_epi64x(~0u);
     35 
     36  __m128i v_acc0_q = _mm_setzero_si128();
     37 
     38  assert(N % 64 == 0);
     39 
     40  r1 += N;
     41  d += N;
     42  m += N;
     43 
     44  do {
     45    const __m128i v_r0_w = xx_load_128(r1 + n);
     46    const __m128i v_r1_w = xx_load_128(r1 + n8);
     47    const __m128i v_d0_w = xx_load_128(d + n);
     48    const __m128i v_d1_w = xx_load_128(d + n8);
     49    const __m128i v_m01_b = xx_load_128(m + n);
     50 
     51    const __m128i v_rd0l_w = _mm_unpacklo_epi16(v_d0_w, v_r0_w);
     52    const __m128i v_rd0h_w = _mm_unpackhi_epi16(v_d0_w, v_r0_w);
     53    const __m128i v_rd1l_w = _mm_unpacklo_epi16(v_d1_w, v_r1_w);
     54    const __m128i v_rd1h_w = _mm_unpackhi_epi16(v_d1_w, v_r1_w);
     55    const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());
     56    const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());
     57 
     58    const __m128i v_m0l_w = _mm_unpacklo_epi16(v_m0_w, v_mask_max_w);
     59    const __m128i v_m0h_w = _mm_unpackhi_epi16(v_m0_w, v_mask_max_w);
     60    const __m128i v_m1l_w = _mm_unpacklo_epi16(v_m1_w, v_mask_max_w);
     61    const __m128i v_m1h_w = _mm_unpackhi_epi16(v_m1_w, v_mask_max_w);
     62 
     63    const __m128i v_t0l_d = _mm_madd_epi16(v_rd0l_w, v_m0l_w);
     64    const __m128i v_t0h_d = _mm_madd_epi16(v_rd0h_w, v_m0h_w);
     65    const __m128i v_t1l_d = _mm_madd_epi16(v_rd1l_w, v_m1l_w);
     66    const __m128i v_t1h_d = _mm_madd_epi16(v_rd1h_w, v_m1h_w);
     67 
     68    const __m128i v_t0_w = _mm_packs_epi32(v_t0l_d, v_t0h_d);
     69    const __m128i v_t1_w = _mm_packs_epi32(v_t1l_d, v_t1h_d);
     70 
     71    const __m128i v_sq0_d = _mm_madd_epi16(v_t0_w, v_t0_w);
     72    const __m128i v_sq1_d = _mm_madd_epi16(v_t1_w, v_t1_w);
     73 
     74    const __m128i v_sum0_q = _mm_add_epi64(_mm_and_si128(v_sq0_d, v_zext_q),
     75                                           _mm_srli_epi64(v_sq0_d, 32));
     76    const __m128i v_sum1_q = _mm_add_epi64(_mm_and_si128(v_sq1_d, v_zext_q),
     77                                           _mm_srli_epi64(v_sq1_d, 32));
     78 
     79    v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum0_q);
     80    v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum1_q);
     81 
     82    n8 += 16;
     83    n += 16;
     84  } while (n);
     85 
     86  v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8));
     87 
     88 #if AOM_ARCH_X86_64
     89  csse = (uint64_t)_mm_cvtsi128_si64(v_acc0_q);
     90 #else
     91  xx_storel_64(&csse, v_acc0_q);
     92 #endif
     93 
     94  return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
     95 }
     96 
     97 /**
     98 * See av1_wedge_sign_from_residuals_c
     99 */
    100 int8_t av1_wedge_sign_from_residuals_sse2(const int16_t *ds, const uint8_t *m,
    101                                          int N, int64_t limit) {
    102  int64_t acc;
    103 
    104  __m128i v_sign_d;
    105  __m128i v_acc0_d = _mm_setzero_si128();
    106  __m128i v_acc1_d = _mm_setzero_si128();
    107  __m128i v_acc_q;
    108 
    109  // Input size limited to 8192 by the use of 32 bit accumulators and m
    110  // being between [0, 64]. Overflow might happen at larger sizes,
    111  // though it is practically impossible on real video input.
    112  assert(N < 8192);
    113  assert(N % 64 == 0);
    114 
    115  do {
    116    const __m128i v_m01_b = xx_load_128(m);
    117    const __m128i v_m23_b = xx_load_128(m + 16);
    118    const __m128i v_m45_b = xx_load_128(m + 32);
    119    const __m128i v_m67_b = xx_load_128(m + 48);
    120 
    121    const __m128i v_d0_w = xx_load_128(ds);
    122    const __m128i v_d1_w = xx_load_128(ds + 8);
    123    const __m128i v_d2_w = xx_load_128(ds + 16);
    124    const __m128i v_d3_w = xx_load_128(ds + 24);
    125    const __m128i v_d4_w = xx_load_128(ds + 32);
    126    const __m128i v_d5_w = xx_load_128(ds + 40);
    127    const __m128i v_d6_w = xx_load_128(ds + 48);
    128    const __m128i v_d7_w = xx_load_128(ds + 56);
    129 
    130    const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());
    131    const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());
    132    const __m128i v_m2_w = _mm_unpacklo_epi8(v_m23_b, _mm_setzero_si128());
    133    const __m128i v_m3_w = _mm_unpackhi_epi8(v_m23_b, _mm_setzero_si128());
    134    const __m128i v_m4_w = _mm_unpacklo_epi8(v_m45_b, _mm_setzero_si128());
    135    const __m128i v_m5_w = _mm_unpackhi_epi8(v_m45_b, _mm_setzero_si128());
    136    const __m128i v_m6_w = _mm_unpacklo_epi8(v_m67_b, _mm_setzero_si128());
    137    const __m128i v_m7_w = _mm_unpackhi_epi8(v_m67_b, _mm_setzero_si128());
    138 
    139    const __m128i v_p0_d = _mm_madd_epi16(v_d0_w, v_m0_w);
    140    const __m128i v_p1_d = _mm_madd_epi16(v_d1_w, v_m1_w);
    141    const __m128i v_p2_d = _mm_madd_epi16(v_d2_w, v_m2_w);
    142    const __m128i v_p3_d = _mm_madd_epi16(v_d3_w, v_m3_w);
    143    const __m128i v_p4_d = _mm_madd_epi16(v_d4_w, v_m4_w);
    144    const __m128i v_p5_d = _mm_madd_epi16(v_d5_w, v_m5_w);
    145    const __m128i v_p6_d = _mm_madd_epi16(v_d6_w, v_m6_w);
    146    const __m128i v_p7_d = _mm_madd_epi16(v_d7_w, v_m7_w);
    147 
    148    const __m128i v_p01_d = _mm_add_epi32(v_p0_d, v_p1_d);
    149    const __m128i v_p23_d = _mm_add_epi32(v_p2_d, v_p3_d);
    150    const __m128i v_p45_d = _mm_add_epi32(v_p4_d, v_p5_d);
    151    const __m128i v_p67_d = _mm_add_epi32(v_p6_d, v_p7_d);
    152 
    153    const __m128i v_p0123_d = _mm_add_epi32(v_p01_d, v_p23_d);
    154    const __m128i v_p4567_d = _mm_add_epi32(v_p45_d, v_p67_d);
    155 
    156    v_acc0_d = _mm_add_epi32(v_acc0_d, v_p0123_d);
    157    v_acc1_d = _mm_add_epi32(v_acc1_d, v_p4567_d);
    158 
    159    ds += 64;
    160    m += 64;
    161 
    162    N -= 64;
    163  } while (N);
    164 
    165  v_sign_d = _mm_cmplt_epi32(v_acc0_d, _mm_setzero_si128());
    166  v_acc0_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc0_d, v_sign_d),
    167                           _mm_unpackhi_epi32(v_acc0_d, v_sign_d));
    168 
    169  v_sign_d = _mm_cmplt_epi32(v_acc1_d, _mm_setzero_si128());
    170  v_acc1_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc1_d, v_sign_d),
    171                           _mm_unpackhi_epi32(v_acc1_d, v_sign_d));
    172 
    173  v_acc_q = _mm_add_epi64(v_acc0_d, v_acc1_d);
    174 
    175  v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));
    176 
    177 #if AOM_ARCH_X86_64
    178  acc = _mm_cvtsi128_si64(v_acc_q);
    179 #else
    180  xx_storel_64(&acc, v_acc_q);
    181 #endif
    182 
    183  return acc > limit;
    184 }
    185 
    186 // Negate under mask
    187 static inline __m128i negm_epi16(__m128i v_v_w, __m128i v_mask_w) {
    188  return _mm_sub_epi16(_mm_xor_si128(v_v_w, v_mask_w), v_mask_w);
    189 }
    190 
    191 /**
    192 * av1_wedge_compute_delta_squares_c
    193 */
    194 void av1_wedge_compute_delta_squares_sse2(int16_t *d, const int16_t *a,
    195                                          const int16_t *b, int N) {
    196  const __m128i v_neg_w = _mm_set_epi16((short)0xffff, 0, (short)0xffff, 0,
    197                                        (short)0xffff, 0, (short)0xffff, 0);
    198 
    199  assert(N % 64 == 0);
    200 
    201  do {
    202    const __m128i v_a0_w = xx_load_128(a);
    203    const __m128i v_b0_w = xx_load_128(b);
    204    const __m128i v_a1_w = xx_load_128(a + 8);
    205    const __m128i v_b1_w = xx_load_128(b + 8);
    206    const __m128i v_a2_w = xx_load_128(a + 16);
    207    const __m128i v_b2_w = xx_load_128(b + 16);
    208    const __m128i v_a3_w = xx_load_128(a + 24);
    209    const __m128i v_b3_w = xx_load_128(b + 24);
    210 
    211    const __m128i v_ab0l_w = _mm_unpacklo_epi16(v_a0_w, v_b0_w);
    212    const __m128i v_ab0h_w = _mm_unpackhi_epi16(v_a0_w, v_b0_w);
    213    const __m128i v_ab1l_w = _mm_unpacklo_epi16(v_a1_w, v_b1_w);
    214    const __m128i v_ab1h_w = _mm_unpackhi_epi16(v_a1_w, v_b1_w);
    215    const __m128i v_ab2l_w = _mm_unpacklo_epi16(v_a2_w, v_b2_w);
    216    const __m128i v_ab2h_w = _mm_unpackhi_epi16(v_a2_w, v_b2_w);
    217    const __m128i v_ab3l_w = _mm_unpacklo_epi16(v_a3_w, v_b3_w);
    218    const __m128i v_ab3h_w = _mm_unpackhi_epi16(v_a3_w, v_b3_w);
    219 
    220    // Negate top word of pairs
    221    const __m128i v_abl0n_w = negm_epi16(v_ab0l_w, v_neg_w);
    222    const __m128i v_abh0n_w = negm_epi16(v_ab0h_w, v_neg_w);
    223    const __m128i v_abl1n_w = negm_epi16(v_ab1l_w, v_neg_w);
    224    const __m128i v_abh1n_w = negm_epi16(v_ab1h_w, v_neg_w);
    225    const __m128i v_abl2n_w = negm_epi16(v_ab2l_w, v_neg_w);
    226    const __m128i v_abh2n_w = negm_epi16(v_ab2h_w, v_neg_w);
    227    const __m128i v_abl3n_w = negm_epi16(v_ab3l_w, v_neg_w);
    228    const __m128i v_abh3n_w = negm_epi16(v_ab3h_w, v_neg_w);
    229 
    230    const __m128i v_r0l_w = _mm_madd_epi16(v_ab0l_w, v_abl0n_w);
    231    const __m128i v_r0h_w = _mm_madd_epi16(v_ab0h_w, v_abh0n_w);
    232    const __m128i v_r1l_w = _mm_madd_epi16(v_ab1l_w, v_abl1n_w);
    233    const __m128i v_r1h_w = _mm_madd_epi16(v_ab1h_w, v_abh1n_w);
    234    const __m128i v_r2l_w = _mm_madd_epi16(v_ab2l_w, v_abl2n_w);
    235    const __m128i v_r2h_w = _mm_madd_epi16(v_ab2h_w, v_abh2n_w);
    236    const __m128i v_r3l_w = _mm_madd_epi16(v_ab3l_w, v_abl3n_w);
    237    const __m128i v_r3h_w = _mm_madd_epi16(v_ab3h_w, v_abh3n_w);
    238 
    239    const __m128i v_r0_w = _mm_packs_epi32(v_r0l_w, v_r0h_w);
    240    const __m128i v_r1_w = _mm_packs_epi32(v_r1l_w, v_r1h_w);
    241    const __m128i v_r2_w = _mm_packs_epi32(v_r2l_w, v_r2h_w);
    242    const __m128i v_r3_w = _mm_packs_epi32(v_r3l_w, v_r3h_w);
    243 
    244    xx_store_128(d, v_r0_w);
    245    xx_store_128(d + 8, v_r1_w);
    246    xx_store_128(d + 16, v_r2_w);
    247    xx_store_128(d + 24, v_r3_w);
    248 
    249    a += 32;
    250    b += 32;
    251    d += 32;
    252    N -= 32;
    253  } while (N);
    254 }