tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

highbd_convolve_sse2.c (16169B)


      1 /*
      2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 #include <emmintrin.h>
     12 
     13 #include "config/aom_dsp_rtcd.h"
     14 #include "aom_dsp/x86/convolve.h"
     15 
     16 // -----------------------------------------------------------------------------
     17 
     // Vertical 4-tap filter over a 4-sample-wide column of 16-bit pixels.
     //
     // Only filter[2..5] are used. For every output row r and column x in [0, 4):
     //   dst[r * dst_pitch + x] =
     //       clamp((sum_{k=2..5} filter[k] * src[(r + k) * src_pitch + x] + 64) >> 7,
     //             0, (1 << bd) - 1)
     // so the first source row that is read is src_ptr + 2 * src_pitch.
     //
     // Rows are produced in pairs; when height is odd the final row is not written.
     static void aom_highbd_filter_block1d4_v4_sse2(
         const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
         ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
       const __m128i zero = _mm_setzero_si128();
       const __m128i pixel_max = _mm_set1_epi16((1 << bd) - 1);
       const __m128i rounding = _mm_set1_epi32(64);

       // Broadcast the (2,3) and (4,5) coefficient pairs to every 32-bit lane.
       const __m128i taps = _mm_loadu_si128((const __m128i *)filter);
       const __m128i taps_23 = _mm_shuffle_epi32(taps, 0x55);
       const __m128i taps_45 = _mm_shuffle_epi32(taps, 0xAA);

       // Prime the sliding window with rows 2, 3 and 4.
       const __m128i row2 =
           _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch));
       const __m128i row3 =
           _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch));
       __m128i row4 = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch));

       // Interleaving two rows puts vertically adjacent samples side by side, so a
       // single madd applies two taps at once.
       __m128i rows_23 = _mm_unpacklo_epi16(row2, row3);
       __m128i rows_34 = _mm_unpacklo_epi16(row3, row4);

       for (uint32_t rows_left = height; rows_left > 1; rows_left -= 2) {
         const __m128i row5 =
             _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch));
         const __m128i row6 =
             _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch));
         const __m128i rows_45 = _mm_unpacklo_epi16(row4, row5);
         const __m128i rows_56 = _mm_unpacklo_epi16(row5, row6);

         // 32-bit sums of the four taps for two consecutive output rows.
         __m128i out0 = _mm_add_epi32(_mm_madd_epi16(rows_23, taps_23),
                                      _mm_madd_epi16(rows_45, taps_45));
         __m128i out1 = _mm_add_epi32(_mm_madd_epi16(rows_34, taps_23),
                                      _mm_madd_epi16(rows_56, taps_45));

         // Round and drop the 7 fractional bits.
         out0 = _mm_srai_epi32(_mm_add_epi32(out0, rounding), 7);
         out1 = _mm_srai_epi32(_mm_add_epi32(out1, rounding), 7);

         // Narrow to 16 bits (results land in the low 64 bits) and clamp to the
         // valid pixel range.
         out0 = _mm_packs_epi32(out0, zero);
         out1 = _mm_packs_epi32(out1, zero);
         out0 = _mm_min_epi16(_mm_max_epi16(out0, zero), pixel_max);
         out1 = _mm_min_epi16(_mm_max_epi16(out1, zero), pixel_max);

         _mm_storel_epi64((__m128i *)dst_ptr, out0);
         _mm_storel_epi64((__m128i *)(dst_ptr + dst_pitch), out1);

         src_ptr += 2 * src_pitch;
         dst_ptr += 2 * dst_pitch;

         // Slide the window down by two rows.
         rows_23 = rows_45;
         rows_34 = rows_56;
         row4 = row6;
       }
     }
    102 
    // Horizontal 4-tap filter producing 4 output samples per row of 16-bit pixels.
    //
    // Only filter[2..5] are used. For every row y and column x in [0, 4):
    //   dst[y * dst_pitch + x] =
    //       clamp((sum_{k=2..5} filter[k] * src[y * src_pitch + x - 3 + k] + 64) >> 7,
    //             0, (1 << bd) - 1)
    // Each row loads 8 samples starting at src - 1.
    static void aom_highbd_filter_block1d4_h4_sse2(
        const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
        ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
      const __m128i zero = _mm_setzero_si128();
      const __m128i rounding = _mm_set1_epi32(64);
      const __m128i pixel_max = _mm_set1_epi16((1 << bd) - 1);

      // Broadcast the (2,3) and (4,5) coefficient pairs to every 32-bit lane.
      const __m128i taps = _mm_loadu_si128((const __m128i *)filter);
      const __m128i taps_23 = _mm_shuffle_epi32(taps, 0x55);
      const __m128i taps_45 = _mm_shuffle_epi32(taps, 0xAA);

      // Tap 2 sits one sample to the left of the output position.
      const uint16_t *row = src_ptr - 1;
      uint16_t *out = dst_ptr;

      for (uint32_t y = 0; y < height; ++y) {
        const __m128i px = _mm_loadu_si128((const __m128i *)row);

        // Lane x of pairs_23 holds samples (x - 1, x); lane x of pairs_45 holds
        // samples (x + 1, x + 2), both relative to the output column.
        const __m128i pairs_23 = _mm_unpacklo_epi32(px, _mm_srli_si128(px, 2));
        const __m128i pairs_45 =
            _mm_unpacklo_epi32(_mm_srli_si128(px, 4), _mm_srli_si128(px, 6));

        __m128i sum = _mm_add_epi32(_mm_madd_epi16(pairs_23, taps_23),
                                    _mm_madd_epi16(pairs_45, taps_45));

        // Round and drop the 7 fractional bits.
        sum = _mm_srai_epi32(_mm_add_epi32(sum, rounding), 7);

        // Narrow to 16 bits and clamp to the valid pixel range.
        sum = _mm_packs_epi32(sum, zero);
        sum = _mm_min_epi16(_mm_max_epi16(sum, zero), pixel_max);

        _mm_storel_epi64((__m128i *)out, sum);

        row += src_pitch;
        out += dst_pitch;
      }
    }
    153 
    // Vertical 4-tap filter over an 8-sample-wide column of 16-bit pixels.
    //
    // Only filter[2..5] are used. For every output row r and column x in [0, 8):
    //   dst[r * dst_pitch + x] =
    //       clamp((sum_{k=2..5} filter[k] * src[(r + k) * src_pitch + x] + 64) >> 7,
    //             0, (1 << bd) - 1)
    // so the first source row that is read is src_ptr + 2 * src_pitch.
    //
    // Rows are produced in pairs; when height is odd the final row is not written.
    // Neither src_ptr nor dst_ptr needs to be 16-byte aligned.
    static void aom_highbd_filter_block1d8_v4_sse2(
        const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
        ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
      __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
      __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi;
      __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi;
      __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
      __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi;
      __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi;
      __m128i resReg23_45, resReg34_56;
      unsigned int i;

      const __m128i zero = _mm_setzero_si128();
      const __m128i max = _mm_set1_epi16((1 << bd) - 1);
      const __m128i addFilterReg64 = _mm_set1_epi32(64);
      const __m128i filtersReg = _mm_loadu_si128((const __m128i *)filter);

      // coeffs 0 1 0 1 2 3 2 3
      const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
      // coeffs 4 5 4 5 6 7 6 7
      const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);

      // coeffs 2 3 2 3 2 3 2 3
      const __m128i secondFilters = _mm_unpackhi_epi64(tmp0, tmp0);
      // coeffs 4 5 4 5 4 5 4 5
      const __m128i thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1);

      // Two rows are handled per iteration, so advance by twice the pitch.
      // Multiplication (rather than a left shift) stays well defined for a
      // negative pitch.
      const ptrdiff_t src_stride = src_pitch * 2;
      const ptrdiff_t dst_stride = dst_pitch * 2;

      // Prime the sliding window with rows 2, 3 and 4. Interleaving two rows puts
      // vertically adjacent samples side by side so one madd applies two taps.
      srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
      srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
      srcReg23_lo = _mm_unpacklo_epi16(srcReg2, srcReg3);
      srcReg23_hi = _mm_unpackhi_epi16(srcReg2, srcReg3);

      srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
      srcReg34_lo = _mm_unpacklo_epi16(srcReg3, srcReg4);
      srcReg34_hi = _mm_unpackhi_epi16(srcReg3, srcReg4);

      for (i = height; i > 1; i -= 2) {
        srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));

        srcReg45_lo = _mm_unpacklo_epi16(srcReg4, srcReg5);
        srcReg45_hi = _mm_unpackhi_epi16(srcReg4, srcReg5);

        srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));

        srcReg56_lo = _mm_unpacklo_epi16(srcReg5, srcReg6);
        srcReg56_hi = _mm_unpackhi_epi16(srcReg5, srcReg6);

        // Columns 0..3: multiply adjacent samples by the tap pairs and add.
        resReg23_lo = _mm_madd_epi16(srcReg23_lo, secondFilters);
        resReg34_lo = _mm_madd_epi16(srcReg34_lo, secondFilters);
        resReg45_lo = _mm_madd_epi16(srcReg45_lo, thirdFilters);
        resReg56_lo = _mm_madd_epi16(srcReg56_lo, thirdFilters);

        resReg23_45_lo = _mm_add_epi32(resReg23_lo, resReg45_lo);
        resReg34_56_lo = _mm_add_epi32(resReg34_lo, resReg56_lo);

        // Columns 4..7: same computation on the high halves.
        resReg23_hi = _mm_madd_epi16(srcReg23_hi, secondFilters);
        resReg34_hi = _mm_madd_epi16(srcReg34_hi, secondFilters);
        resReg45_hi = _mm_madd_epi16(srcReg45_hi, thirdFilters);
        resReg56_hi = _mm_madd_epi16(srcReg56_hi, thirdFilters);

        resReg23_45_hi = _mm_add_epi32(resReg23_hi, resReg45_hi);
        resReg34_56_hi = _mm_add_epi32(resReg34_hi, resReg56_hi);

        // Round, then shift each 32-bit sum right by 7 bits.
        resReg23_45_lo = _mm_add_epi32(resReg23_45_lo, addFilterReg64);
        resReg34_56_lo = _mm_add_epi32(resReg34_56_lo, addFilterReg64);
        resReg23_45_hi = _mm_add_epi32(resReg23_45_hi, addFilterReg64);
        resReg34_56_hi = _mm_add_epi32(resReg34_56_hi, addFilterReg64);
        resReg23_45_lo = _mm_srai_epi32(resReg23_45_lo, 7);
        resReg34_56_lo = _mm_srai_epi32(resReg34_56_lo, 7);
        resReg23_45_hi = _mm_srai_epi32(resReg23_45_hi, 7);
        resReg34_56_hi = _mm_srai_epi32(resReg34_56_hi, 7);

        // Narrow each 32-bit result to 16 bits: resReg23_45 is the first output
        // row of the pair and resReg34_56 the second.
        resReg23_45 = _mm_packs_epi32(resReg23_45_lo, resReg23_45_hi);
        resReg34_56 = _mm_packs_epi32(resReg34_56_lo, resReg34_56_hi);

        // Clamp to the valid pixel range for this bit depth.
        resReg23_45 = _mm_max_epi16(resReg23_45, zero);
        resReg23_45 = _mm_min_epi16(resReg23_45, max);
        resReg34_56 = _mm_max_epi16(resReg34_56, zero);
        resReg34_56 = _mm_min_epi16(resReg34_56, max);

        src_ptr += src_stride;

        // Unaligned stores: nothing here guarantees that dst_ptr or dst_pitch
        // keep the rows 16-byte aligned, and an aligned store would fault.
        _mm_storeu_si128((__m128i *)dst_ptr, resReg23_45);
        _mm_storeu_si128((__m128i *)(dst_ptr + dst_pitch), resReg34_56);

        dst_ptr += dst_stride;

        // Slide the window down by two rows.
        srcReg23_lo = srcReg45_lo;
        srcReg23_hi = srcReg45_hi;
        srcReg34_lo = srcReg56_lo;
        srcReg34_hi = srcReg56_hi;
        srcReg4 = srcReg6;
      }
    }
    261 
    // Horizontal 4-tap filter producing 8 output samples per row of 16-bit pixels.
    //
    // Only filter[2..5] are used. For every row y and column x in [0, 8):
    //   dst[y * dst_pitch + x] =
    //       clamp((sum_{k=2..5} filter[k] * src[y * src_pitch + x - 3 + k] + 64) >> 7,
    //             0, (1 << bd) - 1)
    // Each row loads samples src[-1 .. 10]. Neither src_ptr nor dst_ptr needs to
    // be 16-byte aligned.
    static void aom_highbd_filter_block1d8_h4_sse2(
        const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
        ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
      __m128i srcRegFilt32b1_1, srcRegFilt32b1_2;
      __m128i srcReg32b1, srcReg32b2;
      unsigned int i;

      // Position src_ptr at the start of the full 8-tap window; the 4 taps used
      // here begin at offset 2 within it.
      src_ptr -= 3;

      const __m128i zero = _mm_setzero_si128();
      const __m128i addFilterReg64 = _mm_set1_epi32(64);
      const __m128i filtersReg = _mm_loadu_si128((const __m128i *)filter);
      const __m128i max = _mm_set1_epi16((1 << bd) - 1);

      // coeffs 0 1 0 1 2 3 2 3
      const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
      // coeffs 4 5 4 5 6 7 6 7
      const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);

      // coeffs 2 3 2 3 2 3 2 3
      const __m128i secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0);
      // coeffs 4 5 4 5 4 5 4 5
      const __m128i thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1);

      for (i = height; i > 0; i -= 1) {
        // srcReg32b1 = samples -1..6, srcReg32b2 = samples 3..10 (relative to the
        // first output column).
        srcReg32b1 = _mm_loadu_si128((const __m128i *)(src_ptr + 2));
        srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 6));

        // Even output columns (0, 2, 4, 6).
        __m128i ss_4_1 = _mm_srli_si128(srcReg32b1, 4);
        __m128i ss_4_2 = _mm_srli_si128(srcReg32b2, 4);
        __m128i ss_4 = _mm_unpacklo_epi64(ss_4_1, ss_4_2);

        __m128i d1 = _mm_madd_epi16(srcReg32b1, secondFilters);
        __m128i d2 = _mm_madd_epi16(ss_4, thirdFilters);
        srcRegFilt32b1_1 = _mm_add_epi32(d1, d2);

        // Odd output columns (1, 3, 5, 7).
        __m128i ss_3_1 = _mm_srli_si128(srcReg32b1, 2);
        __m128i ss_5_1 = _mm_srli_si128(srcReg32b1, 6);
        __m128i ss_3_2 = _mm_srli_si128(srcReg32b2, 2);
        __m128i ss_5_2 = _mm_srli_si128(srcReg32b2, 6);
        __m128i ss_3 = _mm_unpacklo_epi64(ss_3_1, ss_3_2);
        __m128i ss_5 = _mm_unpacklo_epi64(ss_5_1, ss_5_2);

        d1 = _mm_madd_epi16(ss_3, secondFilters);
        d2 = _mm_madd_epi16(ss_5, thirdFilters);
        srcRegFilt32b1_2 = _mm_add_epi32(d1, d2);

        // Interleave even and odd sums back into column order 0..3 and 4..7.
        __m128i res_lo_1 = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
        __m128i res_hi_1 = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);

        // Round, then shift each 32-bit sum right by 7 bits.
        res_lo_1 = _mm_add_epi32(res_lo_1, addFilterReg64);
        res_hi_1 = _mm_add_epi32(res_hi_1, addFilterReg64);
        res_lo_1 = _mm_srai_epi32(res_lo_1, 7);
        res_hi_1 = _mm_srai_epi32(res_hi_1, 7);

        // Narrow to 16 bits and clamp to the valid pixel range.
        srcRegFilt32b1_1 = _mm_packs_epi32(res_lo_1, res_hi_1);

        srcRegFilt32b1_1 = _mm_max_epi16(srcRegFilt32b1_1, zero);
        srcRegFilt32b1_1 = _mm_min_epi16(srcRegFilt32b1_1, max);

        src_ptr += src_pitch;

        // Unaligned store: nothing here guarantees that dst_ptr or dst_pitch keep
        // the rows 16-byte aligned, and an aligned store would fault.
        _mm_storeu_si128((__m128i *)dst_ptr, srcRegFilt32b1_1);

        dst_ptr += dst_pitch;
      }
    }
    328 
    // Vertical 4-tap filter for a 16-sample-wide column, processed as two
    // independent 8-wide halves (left half first, then right half).
    static void aom_highbd_filter_block1d16_v4_sse2(
        const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
        ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
      for (int col = 0; col < 16; col += 8) {
        aom_highbd_filter_block1d8_v4_sse2(src_ptr + col, src_pitch, dst_ptr + col,
                                           dst_pitch, height, filter, bd);
      }
    }
    337 
    // Horizontal 4-tap filter for 16 output samples per row, processed as two
    // independent 8-wide halves (left half first, then right half).
    static void aom_highbd_filter_block1d16_h4_sse2(
        const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
        ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
      for (int col = 0; col < 16; col += 8) {
        aom_highbd_filter_block1d8_h4_sse2(src_ptr + col, src_pitch, dst_ptr + col,
                                           dst_pitch, height, filter, bd);
      }
    }
    346 
    347 // From aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
        // Declarations only; no bodies for these symbols are given here. The
        // _v8/_h8 suffixes suggest 8-tap vertical/horizontal kernels for block
        // widths 16, 8 and 4 -- confirm against the .asm source.
    348 highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2;
    349 highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2;
    350 highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2;
    351 highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2;
    352 highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2;
    353 highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2;
    354 
    355 // From aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
        // Declarations only; no bodies for these symbols are given here. The
        // _v2/_h2 suffixes suggest 2-tap (bilinear) vertical/horizontal kernels
        // -- confirm against the .asm source.
    356 highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2;
    357 highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2;
    358 highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2;
    359 highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_sse2;
    360 highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_sse2;
    361 highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2;
    362 
    363 // void aom_highbd_convolve8_horiz_sse2(const uint8_t *src,
    364 //                                      ptrdiff_t src_stride,
    365 //                                      uint8_t *dst,
    366 //                                      ptrdiff_t dst_stride,
    367 //                                      const int16_t *filter_x,
    368 //                                      int x_step_q4,
    369 //                                      const int16_t *filter_y,
    370 //                                      int y_step_q4,
    371 //                                      int w, int h, int bd);
    372 // void aom_highbd_convolve8_vert_sse2(const uint8_t *src,
    373 //                                     ptrdiff_t src_stride,
    374 //                                     uint8_t *dst,
    375 //                                     ptrdiff_t dst_stride,
    376 //                                     const int16_t *filter_x,
    377 //                                     int x_step_q4,
    378 //                                     const int16_t *filter_y,
    379 //                                     int y_step_q4,
    380 //                                     int w, int h, int bd);
        //
        // The two HIGH_FUN_CONV_1D invocations below presumably expand to the
        // functions prototyped above; the macro body is not visible in this file
        // (likely aom_dsp/x86/convolve.h -- confirm).
        //
        // The vert instance passes src - src_stride * 3 because the vertical
        // 4-tap kernels in this file read rows 2..6 relative to the pointer they
        // receive, whereas the horizontal kernels subtract 3 from src_ptr
        // themselves and are therefore handed plain src.
    381 HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2)
    382 HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2)