tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jnt_convolve_ssse3.c (9983B)


      1 /*
      2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
#include <string.h>
#include <tmmintrin.h>

#include "config/av1_rtcd.h"

#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/convolve_sse2.h"
     18 
     19 void av1_dist_wtd_convolve_2d_ssse3(
     20    const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w,
     21    int h, const InterpFilterParams *filter_params_x,
     22    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
     23    const int subpel_y_qn, ConvolveParams *conv_params) {
     24  CONV_BUF_TYPE *dst = conv_params->dst;
     25  int dst_stride = conv_params->dst_stride;
     26  const int bd = 8;
     27 
     28  DECLARE_ALIGNED(16, int16_t,
     29                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
     30  int im_h = h + filter_params_y->taps - 1;
     31  int im_stride = MAX_SB_SIZE;
     32  int i, j;
     33  const int fo_vert = filter_params_y->taps / 2 - 1;
     34  const int fo_horiz = filter_params_x->taps / 2 - 1;
     35  const int do_average = conv_params->do_average;
     36  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
     37  const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
     38 
     39  const __m128i zero = _mm_setzero_si128();
     40 
     41  const int w0 = conv_params->fwd_offset;
     42  const int w1 = conv_params->bck_offset;
     43  const __m128i wt0 = _mm_set1_epi16(w0);
     44  const __m128i wt1 = _mm_set1_epi16(w1);
     45  const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
     46 
     47  const int offset_0 =
     48      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
     49  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
     50  const __m128i offset_const = _mm_set1_epi16(offset);
     51  const int rounding_shift =
     52      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
     53  const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
     54 
     55  /* Horizontal filter */
     56  {
     57    const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
     58        filter_params_x, subpel_x_qn & SUBPEL_MASK);
     59    const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
     60 
     61    // coeffs 0 1 0 1 2 3 2 3
     62    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
     63    // coeffs 4 5 4 5 6 7 6 7
     64    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
     65 
     66    // coeffs 0 1 0 1 0 1 0 1
     67    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
     68    // coeffs 2 3 2 3 2 3 2 3
     69    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
     70    // coeffs 4 5 4 5 4 5 4 5
     71    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
     72    // coeffs 6 7 6 7 6 7 6 7
     73    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
     74 
     75    const __m128i round_const = _mm_set1_epi32(
     76        ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
     77    const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
     78 
     79    for (i = 0; i < im_h; ++i) {
     80      for (j = 0; j < w; j += 8) {
     81        const __m128i data =
     82            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
     83 
     84        const __m128i src_lo = _mm_unpacklo_epi8(data, zero);
     85        const __m128i src_hi = _mm_unpackhi_epi8(data, zero);
     86 
     87        // Filter even-index pixels
     88        const __m128i res_0 = _mm_madd_epi16(src_lo, coeff_01);
     89        const __m128i src_2 = _mm_alignr_epi8(src_hi, src_lo, 4);
     90        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
     91        const __m128i src_4 = _mm_alignr_epi8(src_hi, src_lo, 8);
     92        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
     93        const __m128i src_6 = _mm_alignr_epi8(src_hi, src_lo, 12);
     94        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
     95 
     96        __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
     97                                         _mm_add_epi32(res_2, res_6));
     98        res_even =
     99            _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
    100 
    101        // Filter odd-index pixels
    102        const __m128i src_1 = _mm_alignr_epi8(src_hi, src_lo, 2);
    103        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
    104        const __m128i src_3 = _mm_alignr_epi8(src_hi, src_lo, 6);
    105        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
    106        const __m128i src_5 = _mm_alignr_epi8(src_hi, src_lo, 10);
    107        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
    108        const __m128i src_7 = _mm_alignr_epi8(src_hi, src_lo, 14);
    109        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
    110 
    111        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
    112                                        _mm_add_epi32(res_3, res_7));
    113        res_odd =
    114            _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
    115 
    116        // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
    117        __m128i res = _mm_packs_epi32(res_even, res_odd);
    118        _mm_store_si128((__m128i *)&im_block[i * im_stride + j], res);
    119      }
    120    }
    121  }
    122 
    123  /* Vertical filter */
    124  {
    125    const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
    126        filter_params_y, subpel_y_qn & SUBPEL_MASK);
    127    const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
    128 
    129    // coeffs 0 1 0 1 2 3 2 3
    130    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
    131    // coeffs 4 5 4 5 6 7 6 7
    132    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
    133 
    134    // coeffs 0 1 0 1 0 1 0 1
    135    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
    136    // coeffs 2 3 2 3 2 3 2 3
    137    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
    138    // coeffs 4 5 4 5 4 5 4 5
    139    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
    140    // coeffs 6 7 6 7 6 7 6 7
    141    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
    142 
    143    const __m128i round_const = _mm_set1_epi32(
    144        ((1 << conv_params->round_1) >> 1) -
    145        (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
    146    const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
    147 
    148    for (i = 0; i < h; ++i) {
    149      for (j = 0; j < w; j += 8) {
    150        // Filter even-index pixels
    151        const int16_t *data = &im_block[i * im_stride + j];
    152        const __m128i src_0 =
    153            _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
    154                               *(__m128i *)(data + 1 * im_stride));
    155        const __m128i src_2 =
    156            _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
    157                               *(__m128i *)(data + 3 * im_stride));
    158        const __m128i src_4 =
    159            _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
    160                               *(__m128i *)(data + 5 * im_stride));
    161        const __m128i src_6 =
    162            _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
    163                               *(__m128i *)(data + 7 * im_stride));
    164 
    165        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
    166        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
    167        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
    168        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
    169 
    170        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
    171                                               _mm_add_epi32(res_4, res_6));
    172 
    173        // Filter odd-index pixels
    174        const __m128i src_1 =
    175            _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
    176                               *(__m128i *)(data + 1 * im_stride));
    177        const __m128i src_3 =
    178            _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
    179                               *(__m128i *)(data + 3 * im_stride));
    180        const __m128i src_5 =
    181            _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
    182                               *(__m128i *)(data + 5 * im_stride));
    183        const __m128i src_7 =
    184            _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
    185                               *(__m128i *)(data + 7 * im_stride));
    186 
    187        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
    188        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
    189        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
    190        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
    191 
    192        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
    193                                              _mm_add_epi32(res_5, res_7));
    194 
    195        // Rearrange pixels back into the order 0 ... 7
    196        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
    197        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
    198 
    199        const __m128i res_lo_round =
    200            _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
    201        const __m128i res_hi_round =
    202            _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
    203 
    204        const __m128i res_16b = _mm_packs_epi32(res_lo_round, res_hi_round);
    205        const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);
    206 
    207        // Accumulate values into the destination buffer
    208        if (do_average) {
    209          const __m128i data_ref_0 =
    210              _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
    211 
    212          const __m128i comp_avg_res =
    213              comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
    214 
    215          const __m128i round_result = convolve_rounding(
    216              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
    217 
    218          const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
    219 
    220          if (w > 4)
    221            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8);
    222          else
    223            *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_8);
    224        } else {
    225          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
    226        }
    227      }
    228    }
    229  }
    230 }