tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

reconinter_sse4.c (6233B)


      1 /*
      2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
#include <emmintrin.h>  // SSE2
#include <smmintrin.h>  /* SSE4.1 */
#include <string.h>

#include "aom/aom_integer.h"
#include "aom_dsp/blend.h"
#include "av1/common/blockd.h"
#include "config/av1_rtcd.h"
     19 
     20 static inline __m128i calc_mask(const __m128i mask_base, const __m128i s0,
     21                                const __m128i s1) {
     22  const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(s0, s1));
     23  return _mm_abs_epi16(_mm_add_epi16(mask_base, _mm_srli_epi16(diff, 4)));
     24  // clamp(diff, 0, 64) can be skiped for diff is always in the range ( 38, 54)
     25 }
     26 
     27 void av1_build_compound_diffwtd_mask_sse4_1(uint8_t *mask,
     28                                            DIFFWTD_MASK_TYPE mask_type,
     29                                            const uint8_t *src0, int stride0,
     30                                            const uint8_t *src1, int stride1,
     31                                            int h, int w) {
     32  const int mb = (mask_type == DIFFWTD_38_INV) ? AOM_BLEND_A64_MAX_ALPHA : 0;
     33  const __m128i mask_base = _mm_set1_epi16(38 - mb);
     34  int i = 0;
     35  if (4 == w) {
     36    do {
     37      const __m128i s0A = _mm_cvtsi32_si128(*(int *)src0);
     38      const __m128i s0B = _mm_cvtsi32_si128(*(int *)(src0 + stride0));
     39      const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B);
     40      const __m128i s0 = _mm_cvtepu8_epi16(s0AB);
     41 
     42      const __m128i s1A = _mm_cvtsi32_si128(*(int *)src1);
     43      const __m128i s1B = _mm_cvtsi32_si128(*(int *)(src1 + stride1));
     44      const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B);
     45      const __m128i s1 = _mm_cvtepu8_epi16(s1AB);
     46 
     47      const __m128i m16 = calc_mask(mask_base, s0, s1);
     48      const __m128i m8 = _mm_packus_epi16(m16, m16);
     49 
     50      *(int *)mask = _mm_cvtsi128_si32(m8);
     51      *(int *)(mask + w) = _mm_extract_epi32(m8, 1);
     52      src0 += (stride0 << 1);
     53      src1 += (stride1 << 1);
     54      mask += 8;
     55      i += 2;
     56    } while (i < h);
     57  } else if (8 == w) {
     58    do {
     59      __m128i s0 = _mm_loadl_epi64((__m128i const *)src0);
     60      __m128i s1 = _mm_loadl_epi64((__m128i const *)src1);
     61      s0 = _mm_cvtepu8_epi16(s0);
     62      s1 = _mm_cvtepu8_epi16(s1);
     63      const __m128i m16 = calc_mask(mask_base, s0, s1);
     64      const __m128i m8 = _mm_packus_epi16(m16, m16);
     65      _mm_storel_epi64((__m128i *)mask, m8);
     66      src0 += stride0;
     67      src1 += stride1;
     68      mask += 8;
     69      i += 1;
     70    } while (i < h);
     71  } else {
     72    const __m128i zero = _mm_setzero_si128();
     73    do {
     74      int j = 0;
     75      do {
     76        const __m128i s0 = _mm_load_si128((__m128i const *)(src0 + j));
     77        const __m128i s1 = _mm_load_si128((__m128i const *)(src1 + j));
     78        const __m128i s0L = _mm_cvtepu8_epi16(s0);
     79        const __m128i s1L = _mm_cvtepu8_epi16(s1);
     80        const __m128i s0H = _mm_unpackhi_epi8(s0, zero);
     81        const __m128i s1H = _mm_unpackhi_epi8(s1, zero);
     82 
     83        const __m128i m16L = calc_mask(mask_base, s0L, s1L);
     84        const __m128i m16H = calc_mask(mask_base, s0H, s1H);
     85 
     86        const __m128i m8 = _mm_packus_epi16(m16L, m16H);
     87        _mm_store_si128((__m128i *)(mask + j), m8);
     88        j += 16;
     89      } while (j < w);
     90      src0 += stride0;
     91      src1 += stride1;
     92      mask += w;
     93      i += 1;
     94    } while (i < h);
     95  }
     96 }
     97 
     98 void av1_build_compound_diffwtd_mask_d16_sse4_1(
     99    uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
    100    int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
    101    ConvolveParams *conv_params, int bd) {
    102  const int which_inverse = (mask_type == DIFFWTD_38) ? 0 : 1;
    103  const int mask_base = 38;
    104  int round =
    105      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
    106  const __m128i round_const = _mm_set1_epi16((1 << round) >> 1);
    107  const __m128i mask_base_16 = _mm_set1_epi16(mask_base);
    108  const __m128i clip_diff = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
    109  const __m128i add_const =
    110      _mm_set1_epi16((which_inverse ? AOM_BLEND_A64_MAX_ALPHA : 0));
    111  const __m128i add_sign = _mm_set1_epi16((which_inverse ? -1 : 1));
    112 
    113  int i, j;
    114  // When rounding constant is added, there is a possibility of overflow.
    115  // However that much precision is not required. Code should very well work for
    116  // other values of DIFF_FACTOR_LOG2 and AOM_BLEND_A64_MAX_ALPHA as well. But
    117  // there is a possibility of corner case bugs.
    118  assert(DIFF_FACTOR_LOG2 == 4);
    119  assert(AOM_BLEND_A64_MAX_ALPHA == 64);
    120  for (i = 0; i < h; ++i) {
    121    for (j = 0; j < w; j += 8) {
    122      const __m128i data_src0 =
    123          _mm_loadu_si128((__m128i *)&src0[(i * src0_stride) + j]);
    124      const __m128i data_src1 =
    125          _mm_loadu_si128((__m128i *)&src1[(i * src1_stride) + j]);
    126 
    127      const __m128i diffa = _mm_subs_epu16(data_src0, data_src1);
    128      const __m128i diffb = _mm_subs_epu16(data_src1, data_src0);
    129      const __m128i diff = _mm_max_epu16(diffa, diffb);
    130      const __m128i diff_round =
    131          _mm_srli_epi16(_mm_adds_epu16(diff, round_const), round);
    132      const __m128i diff_factor = _mm_srli_epi16(diff_round, DIFF_FACTOR_LOG2);
    133      const __m128i diff_mask = _mm_adds_epi16(diff_factor, mask_base_16);
    134      __m128i diff_clamp = _mm_min_epi16(diff_mask, clip_diff);
    135      // clamp to 0 can be skipped since we are using add and saturate
    136      // instruction
    137 
    138      const __m128i diff_sign = _mm_sign_epi16(diff_clamp, add_sign);
    139      const __m128i diff_const_16 = _mm_add_epi16(diff_sign, add_const);
    140 
    141      // 8 bit conversion and saturation to uint8
    142      const __m128i res_8 = _mm_packus_epi16(diff_const_16, diff_const_16);
    143 
    144      // Store values into the destination buffer
    145      __m128i *const dst = (__m128i *)&mask[i * w + j];
    146 
    147      if ((w - j) > 4) {
    148        _mm_storel_epi64(dst, res_8);
    149      } else {  // w==4
    150        *(int *)dst = _mm_cvtsi128_si32(res_8);
    151      }
    152    }
    153  }
    154 }