tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

convolve_2d_sse2.c (23807B)


      1 /*
      2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
#include <emmintrin.h>
#include <string.h>

#include "config/av1_rtcd.h"

#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/convolve_sse2.h"
#include "aom_dsp/x86/convolve_common_intrin.h"
#include "av1/common/convolve.h"
     21 
     22 static void convolve_2d_sr_12tap_sse2(
     23    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
     24    int h, const InterpFilterParams *filter_params_x,
     25    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
     26    const int subpel_y_qn, ConvolveParams *conv_params) {
     27  const int bd = 8;
     28 
     29  DECLARE_ALIGNED(16, int16_t,
     30                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
     31  int im_h = h + filter_params_y->taps - 1;
     32  int im_stride = w;
     33  int i, j;
     34  const int fo_vert = filter_params_y->taps / 2 - 1;
     35  const int fo_horiz = filter_params_x->taps / 2 - 1;
     36  const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
     37 
     38  const __m128i zero = _mm_setzero_si128();
     39  const int bits =
     40      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
     41  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
     42 
     43  assert(conv_params->round_0 > 0);
     44  __m128i coeffs[6];
     45 
     46  /* Horizontal filter */
     47  {
     48    prepare_coeffs_12tap(filter_params_x, subpel_x_qn, coeffs);
     49 
     50    const __m128i round_const = _mm_set1_epi32(
     51        (1 << (bd + FILTER_BITS - 1)) + ((1 << conv_params->round_0) >> 1));
     52    const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
     53 
     54    for (i = 0; i < im_h; ++i) {
     55      for (j = 0; j < w; j += 8) {
     56        const __m128i data =
     57            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
     58        const __m128i data_2 =
     59            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 4)]);
     60 
     61        // Filter even-index pixels
     62        const __m128i src_0 = _mm_unpacklo_epi8(data, zero);
     63        const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]);
     64        const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero);
     65        const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]);
     66        const __m128i src_4 = _mm_unpacklo_epi8(data_2, zero);
     67        const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]);
     68        const __m128i src_6 =
     69            _mm_unpacklo_epi8(_mm_srli_si128(data_2, 2), zero);
     70        const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]);
     71        const __m128i src_8 =
     72            _mm_unpacklo_epi8(_mm_srli_si128(data_2, 4), zero);
     73        const __m128i res_8 = _mm_madd_epi16(src_8, coeffs[4]);
     74        const __m128i src_10 =
     75            _mm_unpacklo_epi8(_mm_srli_si128(data_2, 6), zero);
     76        const __m128i res_10 = _mm_madd_epi16(src_10, coeffs[5]);
     77 
     78        const __m128i res_0246 = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
     79                                               _mm_add_epi32(res_2, res_6));
     80        __m128i res_even =
     81            _mm_add_epi32(_mm_add_epi32(res_8, res_10), res_0246);
     82        res_even =
     83            _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
     84 
     85        // Filter odd-index pixels
     86        const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
     87        const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[0]);
     88        const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero);
     89        const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[1]);
     90        const __m128i src_5 =
     91            _mm_unpacklo_epi8(_mm_srli_si128(data_2, 1), zero);
     92        const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[2]);
     93        const __m128i src_7 =
     94            _mm_unpacklo_epi8(_mm_srli_si128(data_2, 3), zero);
     95        const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[3]);
     96        const __m128i src_9 =
     97            _mm_unpacklo_epi8(_mm_srli_si128(data_2, 5), zero);
     98        const __m128i res_9 = _mm_madd_epi16(src_9, coeffs[4]);
     99        const __m128i src_11 =
    100            _mm_unpacklo_epi8(_mm_srli_si128(data_2, 7), zero);
    101        const __m128i res_11 = _mm_madd_epi16(src_11, coeffs[5]);
    102 
    103        const __m128i res_1357 = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
    104                                               _mm_add_epi32(res_3, res_7));
    105        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_9, res_11), res_1357);
    106        res_odd =
    107            _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
    108 
    109        // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
    110        __m128i res = _mm_packs_epi32(res_even, res_odd);
    111        _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res);
    112      }
    113    }
    114  }
    115 
    116  /* Vertical filter */
    117  {
    118    prepare_coeffs_12tap(filter_params_y, subpel_y_qn, coeffs);
    119 
    120    const __m128i sum_round =
    121        _mm_set1_epi32((1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
    122    const __m128i sum_shift = _mm_cvtsi32_si128(conv_params->round_1);
    123 
    124    const __m128i round_const = _mm_set1_epi32(
    125        ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
    126        ((1 << (offset_bits - conv_params->round_1)) >> 1));
    127    const __m128i round_shift = _mm_cvtsi32_si128(bits);
    128 
    129    for (i = 0; i < h; ++i) {
    130      for (j = 0; j < w; j += 8) {
    131        // Filter even-index pixels
    132        const int16_t *data = &im_block[i * im_stride + j];
    133        const __m128i src_0 =
    134            _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
    135                               *(__m128i *)(data + 1 * im_stride));
    136        const __m128i src_2 =
    137            _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
    138                               *(__m128i *)(data + 3 * im_stride));
    139        const __m128i src_4 =
    140            _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
    141                               *(__m128i *)(data + 5 * im_stride));
    142        const __m128i src_6 =
    143            _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
    144                               *(__m128i *)(data + 7 * im_stride));
    145        const __m128i src_8 =
    146            _mm_unpacklo_epi16(*(__m128i *)(data + 8 * im_stride),
    147                               *(__m128i *)(data + 9 * im_stride));
    148        const __m128i src_10 =
    149            _mm_unpacklo_epi16(*(__m128i *)(data + 10 * im_stride),
    150                               *(__m128i *)(data + 11 * im_stride));
    151 
    152        const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]);
    153        const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]);
    154        const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]);
    155        const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]);
    156        const __m128i res_8 = _mm_madd_epi16(src_8, coeffs[4]);
    157        const __m128i res_10 = _mm_madd_epi16(src_10, coeffs[5]);
    158 
    159        const __m128i res_0246 = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
    160                                               _mm_add_epi32(res_4, res_6));
    161        __m128i res_even =
    162            _mm_add_epi32(_mm_add_epi32(res_8, res_10), res_0246);
    163 
    164        // Filter odd-index pixels
    165        const __m128i src_1 =
    166            _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
    167                               *(__m128i *)(data + 1 * im_stride));
    168        const __m128i src_3 =
    169            _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
    170                               *(__m128i *)(data + 3 * im_stride));
    171        const __m128i src_5 =
    172            _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
    173                               *(__m128i *)(data + 5 * im_stride));
    174        const __m128i src_7 =
    175            _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
    176                               *(__m128i *)(data + 7 * im_stride));
    177        const __m128i src_9 =
    178            _mm_unpackhi_epi16(*(__m128i *)(data + 8 * im_stride),
    179                               *(__m128i *)(data + 9 * im_stride));
    180        const __m128i src_11 =
    181            _mm_unpackhi_epi16(*(__m128i *)(data + 10 * im_stride),
    182                               *(__m128i *)(data + 11 * im_stride));
    183 
    184        const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[0]);
    185        const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[1]);
    186        const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[2]);
    187        const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[3]);
    188        const __m128i res_9 = _mm_madd_epi16(src_9, coeffs[4]);
    189        const __m128i res_11 = _mm_madd_epi16(src_11, coeffs[5]);
    190 
    191        const __m128i res_1357 = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
    192                                               _mm_add_epi32(res_3, res_7));
    193        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_9, res_11), res_1357);
    194 
    195        // Rearrange pixels back into the order 0 ... 7
    196        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
    197        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
    198 
    199        __m128i res_lo_round =
    200            _mm_sra_epi32(_mm_add_epi32(res_lo, sum_round), sum_shift);
    201        __m128i res_hi_round =
    202            _mm_sra_epi32(_mm_add_epi32(res_hi, sum_round), sum_shift);
    203 
    204        res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const),
    205                                     round_shift);
    206        res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const),
    207                                     round_shift);
    208 
    209        const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
    210        const __m128i res = _mm_packus_epi16(res16, res16);
    211 
    212        // Accumulate values into the destination buffer
    213        __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
    214 
    215        _mm_storel_epi64(p, res);
    216      }
    217    }
    218  }
    219 }
    220 
    221 void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
    222                             int dst_stride, int w, int h,
    223                             const InterpFilterParams *filter_params_x,
    224                             const InterpFilterParams *filter_params_y,
    225                             const int subpel_x_qn, const int subpel_y_qn,
    226                             ConvolveParams *conv_params) {
    227  if (filter_params_x->taps > 8) {
    228    if (w < 8) {
    229      av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
    230                           filter_params_x, filter_params_y, subpel_x_qn,
    231                           subpel_y_qn, conv_params);
    232    } else {
    233      convolve_2d_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h,
    234                                filter_params_x, filter_params_y, subpel_x_qn,
    235                                subpel_y_qn, conv_params);
    236    }
    237  } else {
    238    const int bd = 8;
    239 
    240    DECLARE_ALIGNED(16, int16_t,
    241                    im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
    242    int im_h = h + filter_params_y->taps - 1;
    243    int im_stride = MAX_SB_SIZE;
    244    int i, j;
    245    const int fo_vert = filter_params_y->taps / 2 - 1;
    246    const int fo_horiz = filter_params_x->taps / 2 - 1;
    247    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
    248 
    249    const __m128i zero = _mm_setzero_si128();
    250    const int bits =
    251        FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
    252    const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
    253 
    254    assert(conv_params->round_0 > 0);
    255 
    256    /* Horizontal filter */
    257    {
    258      const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
    259          filter_params_x, subpel_x_qn & SUBPEL_MASK);
    260      const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
    261 
    262      // coeffs 0 1 0 1 2 3 2 3
    263      const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
    264      // coeffs 4 5 4 5 6 7 6 7
    265      const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
    266 
    267      // coeffs 0 1 0 1 0 1 0 1
    268      const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
    269      // coeffs 2 3 2 3 2 3 2 3
    270      const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
    271      // coeffs 4 5 4 5 4 5 4 5
    272      const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
    273      // coeffs 6 7 6 7 6 7 6 7
    274      const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
    275 
    276      const __m128i round_const = _mm_set1_epi32(
    277          (1 << (bd + FILTER_BITS - 1)) + ((1 << conv_params->round_0) >> 1));
    278      const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
    279 
    280      for (i = 0; i < im_h; ++i) {
    281        for (j = 0; j < w; j += 8) {
    282          const __m128i data =
    283              _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
    284 
    285          // Filter even-index pixels
    286          const __m128i src_0 = _mm_unpacklo_epi8(data, zero);
    287          const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
    288          const __m128i src_2 =
    289              _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero);
    290          const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
    291          const __m128i src_4 =
    292              _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero);
    293          const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
    294          const __m128i src_6 =
    295              _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero);
    296          const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
    297 
    298          __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
    299                                           _mm_add_epi32(res_2, res_6));
    300          res_even =
    301              _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
    302 
    303          // Filter odd-index pixels
    304          const __m128i src_1 =
    305              _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
    306          const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
    307          const __m128i src_3 =
    308              _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero);
    309          const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
    310          const __m128i src_5 =
    311              _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero);
    312          const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
    313          const __m128i src_7 =
    314              _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero);
    315          const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
    316 
    317          __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
    318                                          _mm_add_epi32(res_3, res_7));
    319          res_odd =
    320              _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
    321 
    322          // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
    323          __m128i res = _mm_packs_epi32(res_even, res_odd);
    324          _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res);
    325        }
    326      }
    327    }
    328 
    329    /* Vertical filter */
    330    {
    331      const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
    332          filter_params_y, subpel_y_qn & SUBPEL_MASK);
    333      const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
    334 
    335      // coeffs 0 1 0 1 2 3 2 3
    336      const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
    337      // coeffs 4 5 4 5 6 7 6 7
    338      const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
    339 
    340      // coeffs 0 1 0 1 0 1 0 1
    341      const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
    342      // coeffs 2 3 2 3 2 3 2 3
    343      const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
    344      // coeffs 4 5 4 5 4 5 4 5
    345      const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
    346      // coeffs 6 7 6 7 6 7 6 7
    347      const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
    348 
    349      const __m128i sum_round = _mm_set1_epi32(
    350          (1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
    351      const __m128i sum_shift = _mm_cvtsi32_si128(conv_params->round_1);
    352 
    353      const __m128i round_const = _mm_set1_epi32(
    354          ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
    355          ((1 << (offset_bits - conv_params->round_1)) >> 1));
    356      const __m128i round_shift = _mm_cvtsi32_si128(bits);
    357 
    358      for (i = 0; i < h; ++i) {
    359        for (j = 0; j < w; j += 8) {
    360          // Filter even-index pixels
    361          const int16_t *data = &im_block[i * im_stride + j];
    362          const __m128i src_0 =
    363              _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
    364                                 *(__m128i *)(data + 1 * im_stride));
    365          const __m128i src_2 =
    366              _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
    367                                 *(__m128i *)(data + 3 * im_stride));
    368          const __m128i src_4 =
    369              _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
    370                                 *(__m128i *)(data + 5 * im_stride));
    371          const __m128i src_6 =
    372              _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
    373                                 *(__m128i *)(data + 7 * im_stride));
    374 
    375          const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
    376          const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
    377          const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
    378          const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
    379 
    380          const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
    381                                                 _mm_add_epi32(res_4, res_6));
    382 
    383          // Filter odd-index pixels
    384          const __m128i src_1 =
    385              _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
    386                                 *(__m128i *)(data + 1 * im_stride));
    387          const __m128i src_3 =
    388              _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
    389                                 *(__m128i *)(data + 3 * im_stride));
    390          const __m128i src_5 =
    391              _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
    392                                 *(__m128i *)(data + 5 * im_stride));
    393          const __m128i src_7 =
    394              _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
    395                                 *(__m128i *)(data + 7 * im_stride));
    396 
    397          const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
    398          const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
    399          const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
    400          const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
    401 
    402          const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
    403                                                _mm_add_epi32(res_5, res_7));
    404 
    405          // Rearrange pixels back into the order 0 ... 7
    406          const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
    407          const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
    408 
    409          __m128i res_lo_round =
    410              _mm_sra_epi32(_mm_add_epi32(res_lo, sum_round), sum_shift);
    411          __m128i res_hi_round =
    412              _mm_sra_epi32(_mm_add_epi32(res_hi, sum_round), sum_shift);
    413 
    414          res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const),
    415                                       round_shift);
    416          res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const),
    417                                       round_shift);
    418 
    419          const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
    420          const __m128i res = _mm_packus_epi16(res16, res16);
    421 
    422          // Accumulate values into the destination buffer
    423          __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
    424 
    425          if (w == 2) {
    426            *(uint16_t *)p = (uint16_t)_mm_cvtsi128_si32(res);
    427          } else if (w == 4) {
    428            *(int *)p = _mm_cvtsi128_si32(res);
    429          } else {
    430            _mm_storel_epi64(p, res);
    431          }
    432        }
    433      }
    434    }
    435  }
    436 }
    437 
    438 void av1_dist_wtd_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
    439                                        uint8_t *dst0, int dst_stride0, int w,
    440                                        int h, ConvolveParams *conv_params) {
    441  const int bd = 8;
    442  CONV_BUF_TYPE *dst = conv_params->dst;
    443  int dst_stride = conv_params->dst_stride;
    444 
    445  const int bits =
    446      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
    447  const int do_average = conv_params->do_average;
    448  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
    449  const __m128i zero = _mm_setzero_si128();
    450  const __m128i left_shift = _mm_cvtsi32_si128(bits);
    451  int i, j;
    452 
    453  const int w0 = conv_params->fwd_offset;
    454  const int w1 = conv_params->bck_offset;
    455  const __m128i wt0 = _mm_set1_epi16(w0);
    456  const __m128i wt1 = _mm_set1_epi16(w1);
    457  const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
    458 
    459  const int offset_0 =
    460      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
    461  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
    462  const __m128i offset_const = _mm_set1_epi16(offset);
    463  const int rounding_shift =
    464      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
    465  const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
    466 
    467  assert((w % 4) == 0);
    468 
    469  if (!(w % 16)) {
    470    for (i = 0; i < h; ++i) {
    471      for (j = 0; j < w; j += 16) {
    472        const __m128i d8 = _mm_loadu_si128((__m128i *)&src[j]);
    473 
    474        const __m128i d16_lo = _mm_unpacklo_epi8(d8, zero);
    475        const __m128i d16_hi = _mm_unpackhi_epi8(d8, zero);
    476 
    477        const __m128i res_lo = _mm_sll_epi16(d16_lo, left_shift);
    478        const __m128i res_unsigned_lo = _mm_add_epi16(res_lo, offset_const);
    479 
    480        const __m128i res_hi = _mm_sll_epi16(d16_hi, left_shift);
    481        const __m128i res_unsigned_hi = _mm_add_epi16(res_hi, offset_const);
    482 
    483        if (do_average) {
    484          const __m128i data_ref_0_lo = _mm_loadu_si128((__m128i *)(&dst[j]));
    485          const __m128i data_ref_0_hi =
    486              _mm_loadu_si128((__m128i *)(&dst[j + 8]));
    487 
    488          const __m128i comp_avg_res_lo = comp_avg(
    489              &data_ref_0_lo, &res_unsigned_lo, &wt, use_dist_wtd_comp_avg);
    490 
    491          const __m128i round_result_lo = convolve_rounding(
    492              &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
    493 
    494          const __m128i comp_avg_res_hi = comp_avg(
    495              &data_ref_0_hi, &res_unsigned_hi, &wt, use_dist_wtd_comp_avg);
    496 
    497          const __m128i round_result_hi = convolve_rounding(
    498              &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
    499 
    500          const __m128i res_8 =
    501              _mm_packus_epi16(round_result_lo, round_result_hi);
    502 
    503          _mm_store_si128((__m128i *)(&dst0[j]), res_8);
    504        } else {
    505          _mm_store_si128((__m128i *)(&dst[j]), res_unsigned_lo);
    506          _mm_store_si128((__m128i *)(&dst[j + 8]), res_unsigned_hi);
    507        }
    508      }
    509      src += src_stride;
    510      dst += dst_stride;
    511      dst0 += dst_stride0;
    512    }
    513  } else {
    514    for (i = 0; i < h; ++i) {
    515      for (j = 0; j < w; j += 8) {
    516        const __m128i d8 = _mm_loadl_epi64((__m128i *)&src[j]);
    517        const __m128i d16_0 = _mm_unpacklo_epi8(d8, zero);
    518 
    519        const __m128i res = _mm_sll_epi16(d16_0, left_shift);
    520        const __m128i res_unsigned = _mm_add_epi16(res, offset_const);
    521 
    522        if (do_average) {
    523          const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)(&dst[j]));
    524 
    525          const __m128i comp_avg_res =
    526              comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
    527 
    528          const __m128i round_result = convolve_rounding(
    529              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
    530 
    531          const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
    532 
    533          if (w > 4)
    534            _mm_storel_epi64((__m128i *)(&dst0[j]), res_8);
    535          else
    536            *(int *)(&dst0[j]) = _mm_cvtsi128_si32(res_8);
    537        } else {
    538          _mm_store_si128((__m128i *)(&dst[j]), res_unsigned);
    539        }
    540      }
    541      src += src_stride;
    542      dst += dst_stride;
    543      dst0 += dst_stride0;
    544    }
    545  }
    546 }