[ tor-browser ].git.dasho

resize_sse2.c (15713B)
      1 /*
      2 * Copyright (c) 2024, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 #include <immintrin.h>
     12 
     13 #include "config/av1_rtcd.h"
     14 
     15 #include "av1/common/resize.h"
     16 
     17 #include "aom_dsp/x86/synonyms.h"
     18 
     19 #define ROW_OFFSET 5
     20 
     21 #define PROCESS_RESIZE_Y_WD8                                           \
     22  /* ah0 ah1 ... ah7 */                                                \
     23  const __m128i AH = _mm_add_epi16(l0, l7);                            \
     24  /* bg0 bg1 ... bh7 */                                                \
     25  const __m128i BG = _mm_add_epi16(l1, l6);                            \
     26  /* cf0 cf1 ... cf7 */                                                \
     27  const __m128i CF = _mm_add_epi16(l2, l5);                            \
     28  /* de0 de1 ... de7 */                                                \
     29  const __m128i DE = _mm_add_epi16(l3, l4);                            \
     30                                                                       \
     31  /* ah0 bg0 ... ah3 bg3 */                                            \
     32  const __m128i AHBG_low = _mm_unpacklo_epi16(AH, BG);                 \
     33  /*cf0 de0 ... cf2 de2 */                                             \
     34  const __m128i CFDE_low = _mm_unpacklo_epi16(CF, DE);                 \
     35                                                                       \
     36  /* ah4 bg4... ah7 bg7 */                                             \
     37  const __m128i AHBG_hi = _mm_unpackhi_epi16(AH, BG);                  \
     38  /* cf4 de4... cf7 de7 */                                             \
     39  const __m128i CFDE_hi = _mm_unpackhi_epi16(CF, DE);                  \
     40                                                                       \
     41  /* r00 r01 r02 r03 */                                                \
     42  const __m128i r00 = _mm_madd_epi16(AHBG_low, coeffs_y[0]);           \
     43  const __m128i r01 = _mm_madd_epi16(CFDE_low, coeffs_y[1]);           \
     44  __m128i r0 = _mm_add_epi32(r00, r01);                                \
     45  /* r04 r05 r06 r07 */                                                \
     46  const __m128i r10 = _mm_madd_epi16(AHBG_hi, coeffs_y[0]);            \
     47  const __m128i r11 = _mm_madd_epi16(CFDE_hi, coeffs_y[1]);            \
     48  __m128i r1 = _mm_add_epi32(r10, r11);                                \
     49                                                                       \
     50  r0 = _mm_add_epi32(r0, round_const_bits);                            \
     51  r1 = _mm_add_epi32(r1, round_const_bits);                            \
     52  r0 = _mm_sra_epi32(r0, round_shift_bits);                            \
     53  r1 = _mm_sra_epi32(r1, round_shift_bits);                            \
     54                                                                       \
     55  /* r00 ... r07 (8 values of each 16bit) */                           \
     56  const __m128i res_16b = _mm_packs_epi32(r0, r1);                     \
     57  /* r00 ... r07 | r00 ... r07 (16 values of each 8bit) */             \
     58  const __m128i res_8b0 = _mm_packus_epi16(res_16b, res_16b);          \
     59                                                                       \
     60  __m128i res = _mm_min_epu8(res_8b0, clip_pixel);                     \
     61  res = _mm_max_epu8(res, zero);                                       \
     62  _mm_storel_epi64((__m128i *)&output[(i / 2) * out_stride + j], res); \
     63                                                                       \
     64  l0 = l2;                                                             \
     65  l1 = l3;                                                             \
     66  l2 = l4;                                                             \
     67  l3 = l5;                                                             \
     68  l4 = l6;                                                             \
     69  l5 = l7;                                                             \
     70  data += 2 * stride;
     71 
     72 static inline void prepare_filter_coeffs(const int16_t *filter,
     73                                         __m128i *const coeffs /* [2] */) {
     74  // f0 f1 f2 f3 x x x x
     75  const __m128i sym_even_filter = _mm_loadl_epi64((__m128i *)filter);
     76 
     77  // f1 f0 f3 f2 x x x x
     78  const __m128i tmp1 = _mm_shufflelo_epi16(sym_even_filter, 0xb1);
     79 
     80  // f3 f2 f3 f2 ...
     81  coeffs[0] = _mm_shuffle_epi32(tmp1, 0x55);
     82  // f1 f0 f1 f0 ...
     83  coeffs[1] = _mm_shuffle_epi32(tmp1, 0x00);
     84 }
     85 
     86 bool av1_resize_vert_dir_sse2(uint8_t *intbuf, uint8_t *output, int out_stride,
     87                              int height, int height2, int stride,
     88                              int start_col) {
     89  // For the GM tool, the input layer height or width is assured to be an even
     90  // number. Hence the function 'down2_symodd()' is not invoked and SIMD
     91  // optimization of the same is not implemented.
     92  // When the input height is less than 8 and even, the potential input
     93  // heights are limited to 2, 4, or 6. These scenarios require seperate
     94  // handling due to padding requirements. Invoking the C function here will
     95  // eliminate the need for conditional statements within the subsequent SIMD
     96  // code to manage these cases.
     97  if (height & 1 || height < 8) {
     98    return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2,
     99                                 stride, start_col);
    100  }
    101 
    102  __m128i coeffs_y[2];
    103  const int bits = FILTER_BITS;
    104  const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
    105  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
    106  const uint8_t max_pixel = 255;
    107  const __m128i clip_pixel = _mm_set1_epi8((char)max_pixel);
    108  const __m128i zero = _mm_setzero_si128();
    109  prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_y);
    110 
    111  const int remain_col = stride % 8;
    112 
    113  for (int j = start_col; j < stride - remain_col; j += 8) {
    114    uint8_t *data = &intbuf[j];
    115    // d0 ... d7
    116    const __m128i l8_3 = _mm_loadl_epi64((__m128i *)(data + 0 * stride));
    117    // Padding top 3 rows with the last available row at the top.
    118    // a0 ... a7
    119    const __m128i l8_0 = l8_3;
    120    // b0 ... b7
    121    const __m128i l8_1 = l8_3;
    122    // c0 ... c7
    123    const __m128i l8_2 = l8_3;
    124    // e0 ... e7
    125    const __m128i l8_4 = _mm_loadl_epi64((__m128i *)(data + 1 * stride));
    126    // f0 ... f7
    127    const __m128i l8_5 = _mm_loadl_epi64((__m128i *)(data + 2 * stride));
    128 
    129    // Convert to 16bit as addition of 2 source pixel crosses 8 bit.
    130    __m128i l0 = _mm_unpacklo_epi8(l8_0, zero);  // A(128bit) = a0 - a7(16 bit)
    131    __m128i l1 = _mm_unpacklo_epi8(l8_1, zero);  // B(128bit) = b0 - b7(16 bit)
    132    __m128i l2 = _mm_unpacklo_epi8(l8_2, zero);  // C(128bit) = c0 - c7(16 bit)
    133    __m128i l3 = _mm_unpacklo_epi8(l8_3, zero);  // D(128bit) = d0 - d7(16 bit)
    134    __m128i l4 = _mm_unpacklo_epi8(l8_4, zero);  // E(128bit) = e0 - e7(16 bit)
    135    __m128i l5 = _mm_unpacklo_epi8(l8_5, zero);  // F(128bit) = f0 - f7(16 bit)
    136 
    137    // Increment the pointer such that the loading starts from row G.
    138    data = data + 3 * stride;
    139    // The core vertical SIMD processes 2 input rows simultaneously to generate
    140    // output corresponding to 1 row. To streamline the core loop and eliminate
    141    // the need for conditional checks, the remaining rows 4 are processed
    142    // separately.
    143    for (int i = 0; i < height - 4; i += 2) {
    144      // g0 ... g7
    145      __m128i l8_6 = _mm_loadl_epi64((__m128i *)(data));
    146      // h0 ... h7
    147      __m128i l8_7 = _mm_loadl_epi64((__m128i *)(data + stride));
    148      __m128i l6 = _mm_unpacklo_epi8(l8_6, zero);  // G(128bit):g0-g7(16b)
    149      __m128i l7 = _mm_unpacklo_epi8(l8_7, zero);  // H(128bit):h0-h7(16b)
    150 
    151      PROCESS_RESIZE_Y_WD8
    152    }
    153 
    154    __m128i l8_6 = _mm_loadl_epi64((__m128i *)(data));
    155    __m128i l6 = _mm_unpacklo_epi8(l8_6, zero);
    156    // Process the last 4 input rows here.
    157    for (int i = height - 4; i < height; i += 2) {
    158      __m128i l7 = l6;
    159      PROCESS_RESIZE_Y_WD8
    160    }
    161  }
    162 
    163  if (remain_col)
    164    return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2,
    165                                 stride, stride - remain_col);
    166 
    167  return true;
    168 }
    169 
    170 // Blends a and b using mask and returns the result.
    171 static inline __m128i blend(__m128i a, __m128i b, __m128i mask) {
    172  const __m128i masked_b = _mm_and_si128(mask, b);
    173  const __m128i masked_a = _mm_andnot_si128(mask, a);
    174  return (_mm_or_si128(masked_a, masked_b));
    175 }
    176 
    177 // Masks used for width 16 pixels, with left and right padding
    178 // requirements.
    179 static const uint8_t left_padding_mask[16] = {
    180  255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    181 };
    182 
    183 static const uint8_t right_padding_mask[16] = { 0,   0,   0,   0,  0,   0,
    184                                                0,   0,   0,   0,  255, 255,
    185                                                255, 255, 255, 255 };
    186 
    187 static const uint8_t mask_16[16] = {
    188  255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0,
    189 };
    190 
    191 void av1_resize_horz_dir_sse2(const uint8_t *const input, int in_stride,
    192                              uint8_t *intbuf, int height, int filtered_length,
    193                              int width2) {
    194  assert(height % 2 == 0);
    195  // Invoke C for width less than 16.
    196  if (filtered_length < 16) {
    197    av1_resize_horz_dir_c(input, in_stride, intbuf, height, filtered_length,
    198                          width2);
    199    return;
    200  }
    201 
    202  __m128i coeffs_x[2];
    203  const int bits = FILTER_BITS;
    204  const int dst_stride = width2;
    205  const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
    206  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
    207 
    208  const uint8_t max_pixel = 255;
    209  const __m128i clip_pixel = _mm_set1_epi8((char)max_pixel);
    210  const __m128i zero = _mm_setzero_si128();
    211 
    212  const __m128i start_pad_mask = _mm_loadu_si128((__m128i *)left_padding_mask);
    213  const __m128i end_pad_mask = _mm_loadu_si128((__m128i *)right_padding_mask);
    214  const __m128i mask_even = _mm_loadu_si128((__m128i *)mask_16);
    215  prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_x);
    216 
    217  for (int i = 0; i < height; ++i) {
    218    int filter_offset = 0;
    219    int row01_offset = ROW_OFFSET;
    220    int remain_col = filtered_length;
    221    // To avoid pixel over-read at frame boundary, processing of 16 pixels
    222    // is done using the core loop only if sufficient number of pixels required
    223    // for the load are present.The remaining pixels are processed separately.
    224    for (int j = 0; j <= filtered_length - 16; j += 16) {
    225      if (remain_col == 18 || remain_col == 20) {
    226        break;
    227      }
    228      const int is_last_cols16 = (j == filtered_length - 16);
    229      // While processing the last 16 pixels of the row, ensure that only valid
    230      // pixels are loaded.
    231      if (is_last_cols16) row01_offset = 0;
    232      const int in_idx = i * in_stride + j - filter_offset;
    233      const int out_idx = i * dst_stride + j / 2;
    234      remain_col -= 16;
    235      // a0 a1 a2 a3 .... a15
    236      __m128i row00 = _mm_loadu_si128((__m128i *)&input[in_idx]);
    237      // a8 a9 a10 a11 .... a23
    238      __m128i row01 = _mm_loadu_si128(
    239          (__m128i *)&input[in_idx + row01_offset + filter_offset]);
    240      filter_offset = 3;
    241 
    242      // Pad start pixels to the left, while processing the first pixels in the
    243      // row.
    244      if (j == 0) {
    245        const __m128i start_pixel_row0 =
    246            _mm_set1_epi8((char)input[i * in_stride]);
    247        row00 =
    248            blend(_mm_slli_si128(row00, 3), start_pixel_row0, start_pad_mask);
    249      }
    250 
    251      // Pad end pixels to the right, while processing the last pixels in the
    252      // row.
    253      if (is_last_cols16) {
    254        const __m128i end_pixel_row0 =
    255            _mm_set1_epi8((char)input[i * in_stride + filtered_length - 1]);
    256        row01 = blend(_mm_srli_si128(row01, ROW_OFFSET), end_pixel_row0,
    257                      end_pad_mask);
    258      }
    259 
    260      // a2 a3 a4 a5 a6 a7 a8 a9 .... a17
    261      const __m128i row0_1 = _mm_unpacklo_epi64(_mm_srli_si128(row00, 2),
    262                                                _mm_srli_si128(row01, 2));
    263      // a4 a5 a6 a7 a9 10 a11 a12 .... a19
    264      const __m128i row0_2 = _mm_unpacklo_epi64(_mm_srli_si128(row00, 4),
    265                                                _mm_srli_si128(row01, 4));
    266      // a6 a7 a8 a9 a10 a11 a12 a13 .... a21
    267      const __m128i row0_3 = _mm_unpacklo_epi64(_mm_srli_si128(row00, 6),
    268                                                _mm_srli_si128(row01, 6));
    269 
    270      // a0 a2 a4 a6 a8 a10 a12 a14 (each 16 bit)
    271      const __m128i s0 = _mm_and_si128(row00, mask_even);
    272      // a1 a3 a5 a7 a9 a11 a13 a15
    273      const __m128i s1 = _mm_and_si128(_mm_srli_epi16(row00, 8), mask_even);
    274      // a2 a4 a6 a8 a10 a12 a14 a16
    275      const __m128i s2 = _mm_and_si128(row0_1, mask_even);
    276      // a3 a5 a7 a9 a11 a13 a15 a17
    277      const __m128i s3 = _mm_and_si128(_mm_srli_epi16(row0_1, 8), mask_even);
    278      // a4 a6 a8 a10 a12 a14 a16 a18
    279      const __m128i s4 = _mm_and_si128(row0_2, mask_even);
    280      // a5 a7 a9 a11 a13 a15 a17 a19
    281      const __m128i s5 = _mm_and_si128(_mm_srli_epi16(row0_2, 8), mask_even);
    282      // a6 a8 a10 a12 a14 a16 a18 a20
    283      const __m128i s6 = _mm_and_si128(row0_3, mask_even);
    284      // a7 a9 a11 a13 a15 a17 a19 a21
    285      const __m128i s7 = _mm_and_si128(_mm_srli_epi16(row0_3, 8), mask_even);
    286 
    287      // a0a7 a2a9 a4a11 .... a12a19 a14a21
    288      const __m128i s07 = _mm_add_epi16(s0, s7);
    289      // a1a6 a3a8 a5a10 .... a13a18 a15a20
    290      const __m128i s16 = _mm_add_epi16(s1, s6);
    291      // a2a5 a4a7 a6a9  .... a14a17 a16a19
    292      const __m128i s25 = _mm_add_epi16(s2, s5);
    293      // a3a4 a5a6 a7a8  .... a15a16 a17a18
    294      const __m128i s34 = _mm_add_epi16(s3, s4);
    295 
    296      // a0a7 a1a6 a2a9 a3a8 a4a11 a5a10 a6a13 a7a12
    297      const __m128i s1607_low = _mm_unpacklo_epi16(s07, s16);
    298      // a2a5 a3a4 a4a7 a5a6 a6a9 a7a8 a8a11 a9a10
    299      const __m128i s3425_low = _mm_unpacklo_epi16(s25, s34);
    300 
    301      // a8a15 a9a14 a10a17 a11a16 a12a19 a13a18 a14a21 a15a20
    302      const __m128i s1607_high = _mm_unpackhi_epi16(s07, s16);
    303      // a10a13 a11a12 a12a15 a13a14 a14a17 a15a16 a16a19 a17a18
    304      const __m128i s3425_high = _mm_unpackhi_epi16(s25, s34);
    305 
    306      const __m128i r01_0 = _mm_madd_epi16(s3425_low, coeffs_x[1]);
    307      const __m128i r01_1 = _mm_madd_epi16(s1607_low, coeffs_x[0]);
    308      const __m128i r01_2 = _mm_madd_epi16(s3425_high, coeffs_x[1]);
    309      const __m128i r01_3 = _mm_madd_epi16(s1607_high, coeffs_x[0]);
    310 
    311      // Result of first 8 pixels of row0 (a0 to a7).
    312      // r0_0 r0_1 r0_2 r0_3
    313      __m128i r00 = _mm_add_epi32(r01_0, r01_1);
    314      r00 = _mm_add_epi32(r00, round_const_bits);
    315      r00 = _mm_sra_epi32(r00, round_shift_bits);
    316 
    317      // Result of next 8 pixels of row0 (a8 to 15).
    318      // r0_4 r0_5 r0_6 r0_7
    319      __m128i r01 = _mm_add_epi32(r01_2, r01_3);
    320      r01 = _mm_add_epi32(r01, round_const_bits);
    321      r01 = _mm_sra_epi32(r01, round_shift_bits);
    322 
    323      // r0_0 r0_1 r1_2 r0_3 r0_4 r0_5 r0_6 r0_7
    324      const __m128i res_16 = _mm_packs_epi32(r00, r01);
    325      const __m128i res_8 = _mm_packus_epi16(res_16, res_16);
    326      __m128i res = _mm_min_epu8(res_8, clip_pixel);
    327      res = _mm_max_epu8(res, zero);
    328 
    329      // r0_0 r0_1 r1_2 r0_3 r0_4 r0_5 r0_6 r0_7
    330      _mm_storel_epi64((__m128i *)&intbuf[out_idx], res);
    331    }
    332 
    333    int wd_processed = filtered_length - remain_col;
    334    if (remain_col) {
    335      const int in_idx = (in_stride * i);
    336      const int out_idx = (wd_processed / 2) + width2 * i;
    337 
    338      down2_symeven(input + in_idx, filtered_length, intbuf + out_idx,
    339                    wd_processed);
    340    }
    341  }
    342 }
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE