tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

highbd_convolve_2d_ssse3.c (17061B)


      1 /*
      2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <tmmintrin.h>
     13 #include <assert.h>
     14 
     15 #include "config/av1_rtcd.h"
     16 
     17 #include "aom_dsp/aom_dsp_common.h"
     18 #include "aom_dsp/aom_filter.h"
     19 #include "aom_dsp/x86/convolve_sse2.h"
     20 #include "av1/common/convolve.h"
     21 #include "aom_dsp/x86/convolve_common_intrin.h"
     22 
     23 void av1_highbd_convolve_2d_sr_ssse3(
     24    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
     25    int h, const InterpFilterParams *filter_params_x,
     26    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
     27    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
     28  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
     29  int im_h = h + filter_params_y->taps - 1;
     30  int im_stride = 8;
     31  int i, j;
     32  const int fo_vert = filter_params_y->taps / 2 - 1;
     33  const int fo_horiz = filter_params_x->taps / 2 - 1;
     34  const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
     35 
     36  // Check that, even with 12-bit input, the intermediate values will fit
     37  // into an unsigned 16-bit intermediate array.
     38  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
     39 
     40  const __m128i round_const_x = _mm_set1_epi32(
     41      ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
     42  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
     43 
     44  const __m128i round_const_y =
     45      _mm_set1_epi32(((1 << conv_params->round_1) >> 1) -
     46                     (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
     47  const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
     48 
     49  const int bits =
     50      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
     51  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
     52  const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
     53  const __m128i clip_pixel =
     54      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
     55  const __m128i zero = _mm_setzero_si128();
     56 
     57  if (filter_params_x->taps == 12) {
     58    __m128i coeffs_x[6], coeffs_y[6], s[24];
     59    prepare_coeffs_12tap(filter_params_x, subpel_x_qn, coeffs_x);
     60    prepare_coeffs_12tap(filter_params_y, subpel_y_qn, coeffs_y);
     61 
     62    for (j = 0; j < w; j += 8) {
     63      /* Horizontal filter */
     64      {
     65        for (i = 0; i < im_h; i += 1) {
     66          const __m128i row00 =
     67              _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
     68          const __m128i row01 =
     69              _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);
     70          const __m128i row02 =
     71              _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 16)]);
     72 
     73          // even pixels
     74          s[0] = _mm_alignr_epi8(row01, row00, 0);
     75          s[1] = _mm_alignr_epi8(row01, row00, 4);
     76          s[2] = _mm_alignr_epi8(row01, row00, 8);
     77          s[3] = _mm_alignr_epi8(row01, row00, 12);
     78          s[4] = _mm_alignr_epi8(row02, row01, 0);
     79          s[5] = _mm_alignr_epi8(row02, row01, 4);
     80 
     81          __m128i res_even = convolve_12tap(s, coeffs_x);
     82          res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x),
     83                                   round_shift_x);
     84 
     85          // odd pixels
     86          s[0] = _mm_alignr_epi8(row01, row00, 2);
     87          s[1] = _mm_alignr_epi8(row01, row00, 6);
     88          s[2] = _mm_alignr_epi8(row01, row00, 10);
     89          s[3] = _mm_alignr_epi8(row01, row00, 14);
     90          s[4] = _mm_alignr_epi8(row02, row01, 2);
     91          s[5] = _mm_alignr_epi8(row02, row01, 6);
     92 
     93          __m128i res_odd = convolve_12tap(s, coeffs_x);
     94          res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x),
     95                                  round_shift_x);
     96 
     97          __m128i res_even1 = _mm_packs_epi32(res_even, res_even);
     98          __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd);
     99          __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1);
    100 
    101          _mm_store_si128((__m128i *)&im_block[i * im_stride], res);
    102        }
    103      }
    104 
    105      /* Vertical filter */
    106      {
    107        __m128i s0 = _mm_loadu_si128((__m128i *)(im_block + 0 * im_stride));
    108        __m128i s1 = _mm_loadu_si128((__m128i *)(im_block + 1 * im_stride));
    109        __m128i s2 = _mm_loadu_si128((__m128i *)(im_block + 2 * im_stride));
    110        __m128i s3 = _mm_loadu_si128((__m128i *)(im_block + 3 * im_stride));
    111        __m128i s4 = _mm_loadu_si128((__m128i *)(im_block + 4 * im_stride));
    112        __m128i s5 = _mm_loadu_si128((__m128i *)(im_block + 5 * im_stride));
    113        __m128i s6 = _mm_loadu_si128((__m128i *)(im_block + 6 * im_stride));
    114        __m128i s7 = _mm_loadu_si128((__m128i *)(im_block + 7 * im_stride));
    115        __m128i s8 = _mm_loadu_si128((__m128i *)(im_block + 8 * im_stride));
    116        __m128i s9 = _mm_loadu_si128((__m128i *)(im_block + 9 * im_stride));
    117        __m128i s10 = _mm_loadu_si128((__m128i *)(im_block + 10 * im_stride));
    118 
    119        s[0] = _mm_unpacklo_epi16(s0, s1);
    120        s[1] = _mm_unpacklo_epi16(s2, s3);
    121        s[2] = _mm_unpacklo_epi16(s4, s5);
    122        s[3] = _mm_unpacklo_epi16(s6, s7);
    123        s[4] = _mm_unpacklo_epi16(s8, s9);
    124 
    125        s[6] = _mm_unpackhi_epi16(s0, s1);
    126        s[7] = _mm_unpackhi_epi16(s2, s3);
    127        s[8] = _mm_unpackhi_epi16(s4, s5);
    128        s[9] = _mm_unpackhi_epi16(s6, s7);
    129        s[10] = _mm_unpackhi_epi16(s8, s9);
    130 
    131        s[12] = _mm_unpacklo_epi16(s1, s2);
    132        s[13] = _mm_unpacklo_epi16(s3, s4);
    133        s[14] = _mm_unpacklo_epi16(s5, s6);
    134        s[15] = _mm_unpacklo_epi16(s7, s8);
    135        s[16] = _mm_unpacklo_epi16(s9, s10);
    136 
    137        s[18] = _mm_unpackhi_epi16(s1, s2);
    138        s[19] = _mm_unpackhi_epi16(s3, s4);
    139        s[20] = _mm_unpackhi_epi16(s5, s6);
    140        s[21] = _mm_unpackhi_epi16(s7, s8);
    141        s[22] = _mm_unpackhi_epi16(s9, s10);
    142 
    143        for (i = 0; i < h; i += 2) {
    144          const int16_t *data = &im_block[i * im_stride];
    145 
    146          __m128i s11 = _mm_loadu_si128((__m128i *)(data + 11 * im_stride));
    147          __m128i s12 = _mm_loadu_si128((__m128i *)(data + 12 * im_stride));
    148 
    149          s[5] = _mm_unpacklo_epi16(s10, s11);
    150          s[11] = _mm_unpackhi_epi16(s10, s11);
    151 
    152          s[17] = _mm_unpacklo_epi16(s11, s12);
    153          s[23] = _mm_unpackhi_epi16(s11, s12);
    154 
    155          const __m128i res_a0 = convolve_12tap(s, coeffs_y);
    156          __m128i res_a_round0 = _mm_sra_epi32(
    157              _mm_add_epi32(res_a0, round_const_y), round_shift_y);
    158          res_a_round0 = _mm_sra_epi32(
    159              _mm_add_epi32(res_a_round0, round_const_bits), round_shift_bits);
    160 
    161          const __m128i res_a1 = convolve_12tap(s + 12, coeffs_y);
    162          __m128i res_a_round1 = _mm_sra_epi32(
    163              _mm_add_epi32(res_a1, round_const_y), round_shift_y);
    164          res_a_round1 = _mm_sra_epi32(
    165              _mm_add_epi32(res_a_round1, round_const_bits), round_shift_bits);
    166 
    167          if (w - j > 4) {
    168            const __m128i res_b0 = convolve_12tap(s + 6, coeffs_y);
    169            __m128i res_b_round0 = _mm_sra_epi32(
    170                _mm_add_epi32(res_b0, round_const_y), round_shift_y);
    171            res_b_round0 =
    172                _mm_sra_epi32(_mm_add_epi32(res_b_round0, round_const_bits),
    173                              round_shift_bits);
    174 
    175            const __m128i res_b1 = convolve_12tap(s + 18, coeffs_y);
    176            __m128i res_b_round1 = _mm_sra_epi32(
    177                _mm_add_epi32(res_b1, round_const_y), round_shift_y);
    178            res_b_round1 =
    179                _mm_sra_epi32(_mm_add_epi32(res_b_round1, round_const_bits),
    180                              round_shift_bits);
    181 
    182            __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0);
    183            res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel);
    184            res_16bit0 = _mm_max_epi16(res_16bit0, zero);
    185 
    186            __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1);
    187            res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel);
    188            res_16bit1 = _mm_max_epi16(res_16bit1, zero);
    189 
    190            _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0);
    191            _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
    192                             res_16bit1);
    193          } else if (w == 4) {
    194            res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
    195            res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
    196            res_a_round0 = _mm_max_epi16(res_a_round0, zero);
    197 
    198            res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
    199            res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
    200            res_a_round1 = _mm_max_epi16(res_a_round1, zero);
    201 
    202            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0);
    203            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
    204                             res_a_round1);
    205          } else {
    206            res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
    207            res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
    208            res_a_round0 = _mm_max_epi16(res_a_round0, zero);
    209 
    210            res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
    211            res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
    212            res_a_round1 = _mm_max_epi16(res_a_round1, zero);
    213 
    214            *((int *)(&dst[i * dst_stride + j])) =
    215                _mm_cvtsi128_si32(res_a_round0);
    216 
    217            *((int *)(&dst[i * dst_stride + j + dst_stride])) =
    218                _mm_cvtsi128_si32(res_a_round1);
    219          }
    220          s[0] = s[1];
    221          s[1] = s[2];
    222          s[2] = s[3];
    223          s[3] = s[4];
    224          s[4] = s[5];
    225 
    226          s[6] = s[7];
    227          s[7] = s[8];
    228          s[8] = s[9];
    229          s[9] = s[10];
    230          s[10] = s[11];
    231 
    232          s[12] = s[13];
    233          s[13] = s[14];
    234          s[14] = s[15];
    235          s[15] = s[16];
    236          s[16] = s[17];
    237 
    238          s[18] = s[19];
    239          s[19] = s[20];
    240          s[20] = s[21];
    241          s[21] = s[22];
    242          s[22] = s[23];
    243 
    244          s10 = s12;
    245        }
    246      }
    247    }
    248  } else {
    249    __m128i coeffs_x[4], coeffs_y[4], s[16];
    250    prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
    251    prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
    252 
    253    for (j = 0; j < w; j += 8) {
    254      /* Horizontal filter */
    255      {
    256        for (i = 0; i < im_h; i += 1) {
    257          const __m128i row00 =
    258              _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
    259          const __m128i row01 =
    260              _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);
    261 
    262          // even pixels
    263          s[0] = _mm_alignr_epi8(row01, row00, 0);
    264          s[1] = _mm_alignr_epi8(row01, row00, 4);
    265          s[2] = _mm_alignr_epi8(row01, row00, 8);
    266          s[3] = _mm_alignr_epi8(row01, row00, 12);
    267 
    268          __m128i res_even = convolve(s, coeffs_x);
    269          res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x),
    270                                   round_shift_x);
    271 
    272          // odd pixels
    273          s[0] = _mm_alignr_epi8(row01, row00, 2);
    274          s[1] = _mm_alignr_epi8(row01, row00, 6);
    275          s[2] = _mm_alignr_epi8(row01, row00, 10);
    276          s[3] = _mm_alignr_epi8(row01, row00, 14);
    277 
    278          __m128i res_odd = convolve(s, coeffs_x);
    279          res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x),
    280                                  round_shift_x);
    281 
    282          __m128i res_even1 = _mm_packs_epi32(res_even, res_even);
    283          __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd);
    284          __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1);
    285 
    286          _mm_store_si128((__m128i *)&im_block[i * im_stride], res);
    287        }
    288      }
    289 
    290      /* Vertical filter */
    291      {
    292        __m128i s0 = _mm_loadu_si128((__m128i *)(im_block + 0 * im_stride));
    293        __m128i s1 = _mm_loadu_si128((__m128i *)(im_block + 1 * im_stride));
    294        __m128i s2 = _mm_loadu_si128((__m128i *)(im_block + 2 * im_stride));
    295        __m128i s3 = _mm_loadu_si128((__m128i *)(im_block + 3 * im_stride));
    296        __m128i s4 = _mm_loadu_si128((__m128i *)(im_block + 4 * im_stride));
    297        __m128i s5 = _mm_loadu_si128((__m128i *)(im_block + 5 * im_stride));
    298        __m128i s6 = _mm_loadu_si128((__m128i *)(im_block + 6 * im_stride));
    299 
    300        s[0] = _mm_unpacklo_epi16(s0, s1);
    301        s[1] = _mm_unpacklo_epi16(s2, s3);
    302        s[2] = _mm_unpacklo_epi16(s4, s5);
    303 
    304        s[4] = _mm_unpackhi_epi16(s0, s1);
    305        s[5] = _mm_unpackhi_epi16(s2, s3);
    306        s[6] = _mm_unpackhi_epi16(s4, s5);
    307 
    308        s[0 + 8] = _mm_unpacklo_epi16(s1, s2);
    309        s[1 + 8] = _mm_unpacklo_epi16(s3, s4);
    310        s[2 + 8] = _mm_unpacklo_epi16(s5, s6);
    311 
    312        s[4 + 8] = _mm_unpackhi_epi16(s1, s2);
    313        s[5 + 8] = _mm_unpackhi_epi16(s3, s4);
    314        s[6 + 8] = _mm_unpackhi_epi16(s5, s6);
    315 
    316        for (i = 0; i < h; i += 2) {
    317          const int16_t *data = &im_block[i * im_stride];
    318 
    319          __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * im_stride));
    320          __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * im_stride));
    321 
    322          s[3] = _mm_unpacklo_epi16(s6, s7);
    323          s[7] = _mm_unpackhi_epi16(s6, s7);
    324 
    325          s[3 + 8] = _mm_unpacklo_epi16(s7, s8);
    326          s[7 + 8] = _mm_unpackhi_epi16(s7, s8);
    327 
    328          const __m128i res_a0 = convolve(s, coeffs_y);
    329          __m128i res_a_round0 = _mm_sra_epi32(
    330              _mm_add_epi32(res_a0, round_const_y), round_shift_y);
    331          res_a_round0 = _mm_sra_epi32(
    332              _mm_add_epi32(res_a_round0, round_const_bits), round_shift_bits);
    333 
    334          const __m128i res_a1 = convolve(s + 8, coeffs_y);
    335          __m128i res_a_round1 = _mm_sra_epi32(
    336              _mm_add_epi32(res_a1, round_const_y), round_shift_y);
    337          res_a_round1 = _mm_sra_epi32(
    338              _mm_add_epi32(res_a_round1, round_const_bits), round_shift_bits);
    339 
    340          if (w - j > 4) {
    341            const __m128i res_b0 = convolve(s + 4, coeffs_y);
    342            __m128i res_b_round0 = _mm_sra_epi32(
    343                _mm_add_epi32(res_b0, round_const_y), round_shift_y);
    344            res_b_round0 =
    345                _mm_sra_epi32(_mm_add_epi32(res_b_round0, round_const_bits),
    346                              round_shift_bits);
    347 
    348            const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y);
    349            __m128i res_b_round1 = _mm_sra_epi32(
    350                _mm_add_epi32(res_b1, round_const_y), round_shift_y);
    351            res_b_round1 =
    352                _mm_sra_epi32(_mm_add_epi32(res_b_round1, round_const_bits),
    353                              round_shift_bits);
    354 
    355            __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0);
    356            res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel);
    357            res_16bit0 = _mm_max_epi16(res_16bit0, zero);
    358 
    359            __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1);
    360            res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel);
    361            res_16bit1 = _mm_max_epi16(res_16bit1, zero);
    362 
    363            _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0);
    364            _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
    365                             res_16bit1);
    366          } else if (w == 4) {
    367            res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
    368            res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
    369            res_a_round0 = _mm_max_epi16(res_a_round0, zero);
    370 
    371            res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
    372            res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
    373            res_a_round1 = _mm_max_epi16(res_a_round1, zero);
    374 
    375            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0);
    376            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
    377                             res_a_round1);
    378          } else {
    379            res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
    380            res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
    381            res_a_round0 = _mm_max_epi16(res_a_round0, zero);
    382 
    383            res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
    384            res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
    385            res_a_round1 = _mm_max_epi16(res_a_round1, zero);
    386 
    387            *((int *)(&dst[i * dst_stride + j])) =
    388                _mm_cvtsi128_si32(res_a_round0);
    389 
    390            *((int *)(&dst[i * dst_stride + j + dst_stride])) =
    391                _mm_cvtsi128_si32(res_a_round1);
    392          }
    393          s[0] = s[1];
    394          s[1] = s[2];
    395          s[2] = s[3];
    396 
    397          s[4] = s[5];
    398          s[5] = s[6];
    399          s[6] = s[7];
    400 
    401          s[0 + 8] = s[1 + 8];
    402          s[1 + 8] = s[2 + 8];
    403          s[2 + 8] = s[3 + 8];
    404 
    405          s[4 + 8] = s[5 + 8];
    406          s[5 + 8] = s[6 + 8];
    407          s[6 + 8] = s[7 + 8];
    408 
    409          s6 = s8;
    410        }
    411      }
    412    }
    413  }
    414 }