tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

intra_edge_sse4.c (11606B)


      1 /*
      2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <assert.h>
     13 #include <smmintrin.h>
     14 
     15 #include "config/aom_config.h"
     16 #include "config/av1_rtcd.h"
     17 
     18 void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength) {
     19  if (!strength) return;
     20 
     21  DECLARE_ALIGNED(16, static const int8_t, kern[3][16]) = {
     22    { 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0 },  // strength 1: 4,8,4
     23    { 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0 },  // strength 2: 5,6,5
     24    { 2, 4, 4, 4, 2, 0, 0, 0, 2, 4, 4, 4, 2, 0, 0, 0 }  // strength 3: 2,4,4,4,2
     25  };
     26 
     27  DECLARE_ALIGNED(16, static const int8_t, v_const[5][16]) = {
     28    { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
     29    { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 },
     30    { 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8 },
     31    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
     32  };
     33 
     34  // Extend the first and last samples to simplify the loop for the 5-tap case
     35  p[-1] = p[0];
     36  __m128i last = _mm_set1_epi8((char)p[sz - 1]);
     37  _mm_storeu_si128((__m128i *)&p[sz], last);
     38 
     39  // Adjust input pointer for filter support area
     40  uint8_t *in = (strength == 3) ? p - 1 : p;
     41 
     42  // Avoid modifying first sample
     43  uint8_t *out = p + 1;
     44  int len = sz - 1;
     45 
     46  const int use_3tap_filter = (strength < 3);
     47 
     48  if (use_3tap_filter) {
     49    __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
     50    __m128i shuf0 = _mm_lddqu_si128((__m128i const *)v_const[0]);
     51    __m128i shuf1 = _mm_lddqu_si128((__m128i const *)v_const[1]);
     52    __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]);
     53    __m128i in0 = _mm_lddqu_si128((__m128i *)in);
     54    while (len > 0) {
     55      int n_out = (len < 8) ? len : 8;
     56      __m128i d0 = _mm_shuffle_epi8(in0, shuf0);
     57      __m128i d1 = _mm_shuffle_epi8(in0, shuf1);
     58      d0 = _mm_maddubs_epi16(d0, coef0);
     59      d1 = _mm_maddubs_epi16(d1, coef0);
     60      d0 = _mm_hadd_epi16(d0, d1);
     61      __m128i eight = _mm_set1_epi16(8);
     62      d0 = _mm_add_epi16(d0, eight);
     63      d0 = _mm_srai_epi16(d0, 4);
     64      d0 = _mm_packus_epi16(d0, d0);
     65      __m128i out0 = _mm_lddqu_si128((__m128i *)out);
     66      __m128i n0 = _mm_set1_epi8(n_out);
     67      __m128i mask = _mm_cmpgt_epi8(n0, iden);
     68      out0 = _mm_blendv_epi8(out0, d0, mask);
     69      _mm_storel_epi64((__m128i *)out, out0);
     70      __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16));
     71      in0 = _mm_alignr_epi8(in1, in0, 8);
     72      in += 8;
     73      out += 8;
     74      len -= n_out;
     75    }
     76  } else {  // 5-tap filter
     77    __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
     78    __m128i two = _mm_set1_epi8(2);
     79    __m128i shuf_a = _mm_lddqu_si128((__m128i const *)v_const[2]);
     80    __m128i shuf_b = _mm_add_epi8(shuf_a, two);
     81    __m128i shuf_c = _mm_add_epi8(shuf_b, two);
     82    __m128i shuf_d = _mm_add_epi8(shuf_c, two);
     83    __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]);
     84    __m128i in0 = _mm_lddqu_si128((__m128i *)in);
     85    while (len > 0) {
     86      int n_out = (len < 8) ? len : 8;
     87      __m128i d0 = _mm_shuffle_epi8(in0, shuf_a);
     88      __m128i d1 = _mm_shuffle_epi8(in0, shuf_b);
     89      __m128i d2 = _mm_shuffle_epi8(in0, shuf_c);
     90      __m128i d3 = _mm_shuffle_epi8(in0, shuf_d);
     91      d0 = _mm_maddubs_epi16(d0, coef0);
     92      d1 = _mm_maddubs_epi16(d1, coef0);
     93      d2 = _mm_maddubs_epi16(d2, coef0);
     94      d3 = _mm_maddubs_epi16(d3, coef0);
     95      d0 = _mm_hadd_epi16(d0, d1);
     96      d2 = _mm_hadd_epi16(d2, d3);
     97      d0 = _mm_hadd_epi16(d0, d2);
     98      __m128i eight = _mm_set1_epi16(8);
     99      d0 = _mm_add_epi16(d0, eight);
    100      d0 = _mm_srai_epi16(d0, 4);
    101      d0 = _mm_packus_epi16(d0, d0);
    102      __m128i out0 = _mm_lddqu_si128((__m128i *)out);
    103      __m128i n0 = _mm_set1_epi8(n_out);
    104      __m128i mask = _mm_cmpgt_epi8(n0, iden);
    105      out0 = _mm_blendv_epi8(out0, d0, mask);
    106      _mm_storel_epi64((__m128i *)out, out0);
    107      __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16));
    108      in0 = _mm_alignr_epi8(in1, in0, 8);
    109      in += 8;
    110      out += 8;
    111      len -= n_out;
    112    }
    113  }
    114 }
    115 
    116 void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz) {
    117  // interpolate half-sample positions
    118  assert(sz <= 24);
    119 
    120  DECLARE_ALIGNED(16, static const int8_t, kernel[1][16]) = {
    121    { -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1 }
    122  };
    123 
    124  DECLARE_ALIGNED(
    125      16, static const int8_t,
    126      v_const[2][16]) = { { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
    127                          { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } };
    128 
    129  // Extend first/last samples (upper-left p[-1], last p[sz-1])
    130  // to support 4-tap filter
    131  p[-2] = p[-1];
    132  p[sz] = p[sz - 1];
    133 
    134  uint8_t *in = &p[-2];
    135  uint8_t *out = &p[-2];
    136 
    137  int n = sz + 1;  // Input length including upper-left sample
    138 
    139  __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
    140  __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]);
    141 
    142  __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]);
    143  __m128i shuf0 = _mm_lddqu_si128((__m128i *)v_const[0]);
    144  __m128i shuf1 = _mm_lddqu_si128((__m128i *)v_const[1]);
    145 
    146  while (n > 0) {
    147    __m128i in8 = _mm_alignr_epi8(in16, in0, 8);
    148    __m128i d0 = _mm_shuffle_epi8(in0, shuf0);
    149    __m128i d1 = _mm_shuffle_epi8(in0, shuf1);
    150    __m128i d2 = _mm_shuffle_epi8(in8, shuf0);
    151    __m128i d3 = _mm_shuffle_epi8(in8, shuf1);
    152    d0 = _mm_maddubs_epi16(d0, coef0);
    153    d1 = _mm_maddubs_epi16(d1, coef0);
    154    d2 = _mm_maddubs_epi16(d2, coef0);
    155    d3 = _mm_maddubs_epi16(d3, coef0);
    156    d0 = _mm_hadd_epi16(d0, d1);
    157    d2 = _mm_hadd_epi16(d2, d3);
    158    __m128i eight = _mm_set1_epi16(8);
    159    d0 = _mm_add_epi16(d0, eight);
    160    d2 = _mm_add_epi16(d2, eight);
    161    d0 = _mm_srai_epi16(d0, 4);
    162    d2 = _mm_srai_epi16(d2, 4);
    163    d0 = _mm_packus_epi16(d0, d2);
    164    __m128i in1 = _mm_alignr_epi8(in16, in0, 1);
    165    __m128i out0 = _mm_unpacklo_epi8(in1, d0);
    166    __m128i out1 = _mm_unpackhi_epi8(in1, d0);
    167    _mm_storeu_si128((__m128i *)&out[0], out0);
    168    _mm_storeu_si128((__m128i *)&out[16], out1);
    169    in0 = in16;
    170    in16 = _mm_setzero_si128();
    171    out += 32;
    172    n -= 16;
    173  }
    174 }
    175 
    176 #if CONFIG_AV1_HIGHBITDEPTH
    177 
    178 void av1_highbd_filter_intra_edge_sse4_1(uint16_t *p, int sz, int strength) {
    179  if (!strength) return;
    180 
    181  DECLARE_ALIGNED(16, static const int16_t, kern[3][8]) = {
    182    { 4, 8, 4, 8, 4, 8, 4, 8 },  // strength 1: 4,8,4
    183    { 5, 6, 5, 6, 5, 6, 5, 6 },  // strength 2: 5,6,5
    184    { 2, 4, 2, 4, 2, 4, 2, 4 }   // strength 3: 2,4,4,4,2
    185  };
    186 
    187  DECLARE_ALIGNED(16, static const int16_t,
    188                  v_const[1][8]) = { { 0, 1, 2, 3, 4, 5, 6, 7 } };
    189 
    190  // Extend the first and last samples to simplify the loop for the 5-tap case
    191  p[-1] = p[0];
    192  __m128i last = _mm_set1_epi16(p[sz - 1]);
    193  _mm_storeu_si128((__m128i *)&p[sz], last);
    194 
    195  // Adjust input pointer for filter support area
    196  uint16_t *in = (strength == 3) ? p - 1 : p;
    197 
    198  // Avoid modifying first sample
    199  uint16_t *out = p + 1;
    200  int len = sz - 1;
    201 
    202  const int use_3tap_filter = (strength < 3);
    203 
    204  if (use_3tap_filter) {
    205    __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
    206    __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]);
    207    __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
    208    __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
    209    while (len > 0) {
    210      int n_out = (len < 8) ? len : 8;
    211      __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
    212      __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
    213      __m128i in02 = _mm_add_epi16(in0, in2);
    214      __m128i d0 = _mm_unpacklo_epi16(in02, in1);
    215      __m128i d1 = _mm_unpackhi_epi16(in02, in1);
    216      d0 = _mm_mullo_epi16(d0, coef0);
    217      d1 = _mm_mullo_epi16(d1, coef0);
    218      d0 = _mm_hadd_epi16(d0, d1);
    219      __m128i eight = _mm_set1_epi16(8);
    220      d0 = _mm_add_epi16(d0, eight);
    221      d0 = _mm_srli_epi16(d0, 4);
    222      __m128i out0 = _mm_lddqu_si128((__m128i *)out);
    223      __m128i n0 = _mm_set1_epi16(n_out);
    224      __m128i mask = _mm_cmpgt_epi16(n0, iden);
    225      out0 = _mm_blendv_epi8(out0, d0, mask);
    226      _mm_storeu_si128((__m128i *)out, out0);
    227      in += 8;
    228      in0 = in8;
    229      in8 = _mm_lddqu_si128((__m128i *)&in[8]);
    230      out += 8;
    231      len -= n_out;
    232    }
    233  } else {  // 5-tap filter
    234    __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
    235    __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]);
    236    __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
    237    __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
    238    while (len > 0) {
    239      int n_out = (len < 8) ? len : 8;
    240      __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
    241      __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
    242      __m128i in3 = _mm_alignr_epi8(in8, in0, 6);
    243      __m128i in4 = _mm_alignr_epi8(in8, in0, 8);
    244      __m128i in04 = _mm_add_epi16(in0, in4);
    245      __m128i in123 = _mm_add_epi16(in1, in2);
    246      in123 = _mm_add_epi16(in123, in3);
    247      __m128i d0 = _mm_unpacklo_epi16(in04, in123);
    248      __m128i d1 = _mm_unpackhi_epi16(in04, in123);
    249      d0 = _mm_mullo_epi16(d0, coef0);
    250      d1 = _mm_mullo_epi16(d1, coef0);
    251      d0 = _mm_hadd_epi16(d0, d1);
    252      __m128i eight = _mm_set1_epi16(8);
    253      d0 = _mm_add_epi16(d0, eight);
    254      d0 = _mm_srli_epi16(d0, 4);
    255      __m128i out0 = _mm_lddqu_si128((__m128i *)out);
    256      __m128i n0 = _mm_set1_epi16(n_out);
    257      __m128i mask = _mm_cmpgt_epi16(n0, iden);
    258      out0 = _mm_blendv_epi8(out0, d0, mask);
    259      _mm_storeu_si128((__m128i *)out, out0);
    260      in += 8;
    261      in0 = in8;
    262      in8 = _mm_lddqu_si128((__m128i *)&in[8]);
    263      out += 8;
    264      len -= n_out;
    265    }
    266  }
    267 }
    268 
    269 void av1_highbd_upsample_intra_edge_sse4_1(uint16_t *p, int sz, int bd) {
    270  // interpolate half-sample positions
    271  assert(sz <= 24);
    272 
    273  DECLARE_ALIGNED(16, static const int16_t,
    274                  kernel[1][8]) = { { -1, 9, -1, 9, -1, 9, -1, 9 } };
    275 
    276  // Extend first/last samples (upper-left p[-1], last p[sz-1])
    277  // to support 4-tap filter
    278  p[-2] = p[-1];
    279  p[sz] = p[sz - 1];
    280 
    281  uint16_t *in = &p[-2];
    282  uint16_t *out = in;
    283  int n = sz + 1;
    284 
    285  __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
    286  __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
    287  __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]);
    288  __m128i in24 = _mm_lddqu_si128((__m128i *)&in[24]);
    289 
    290  while (n > 0) {
    291    __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
    292    __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
    293    __m128i in3 = _mm_alignr_epi8(in8, in0, 6);
    294    __m128i sum0 = _mm_add_epi16(in0, in3);
    295    __m128i sum1 = _mm_add_epi16(in1, in2);
    296    __m128i d0 = _mm_unpacklo_epi16(sum0, sum1);
    297    __m128i d1 = _mm_unpackhi_epi16(sum0, sum1);
    298    __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]);
    299    d0 = _mm_madd_epi16(d0, coef0);
    300    d1 = _mm_madd_epi16(d1, coef0);
    301    __m128i eight = _mm_set1_epi32(8);
    302    d0 = _mm_add_epi32(d0, eight);
    303    d1 = _mm_add_epi32(d1, eight);
    304    d0 = _mm_srai_epi32(d0, 4);
    305    d1 = _mm_srai_epi32(d1, 4);
    306    d0 = _mm_packus_epi32(d0, d1);
    307    __m128i max0 = _mm_set1_epi16((1 << bd) - 1);
    308    d0 = _mm_min_epi16(d0, max0);
    309    __m128i out0 = _mm_unpacklo_epi16(in1, d0);
    310    __m128i out1 = _mm_unpackhi_epi16(in1, d0);
    311    _mm_storeu_si128((__m128i *)&out[0], out0);
    312    _mm_storeu_si128((__m128i *)&out[8], out1);
    313    in0 = in8;
    314    in8 = in16;
    315    in16 = in24;
    316    in24 = _mm_setzero_si128();
    317    out += 16;
    318    n -= 8;
    319  }
    320 }
    321 
    322 #endif  // CONFIG_AV1_HIGHBITDEPTH