/*
 * Copyright (c) 2024, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>
#include <string.h>  // For memcpy in the non-subpel vertical paths.

#include "aom_dsp/arm/aom_convolve8_neon.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "config/aom_dsp_rtcd.h"

static inline uint8x8_t convolve8_4_h(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2,
                                      uint8x8_t s3, int8x8_t filter) {
  int8x16_t filter_x2 = vcombine_s8(filter, filter);

  uint8x16_t s01 = vcombine_u8(s0, s1);
  uint8x16_t s23 = vcombine_u8(s2, s3);

  int32x4_t sum01 = vusdotq_s32(vdupq_n_s32(0), s01, filter_x2);
  int32x4_t sum23 = vusdotq_s32(vdupq_n_s32(0), s23, filter_x2);

  int32x4_t sum0123 = vpaddq_s32(sum01, sum23);
  int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vdup_n_s16(0));

  // We halved the filter values so -1 from right shift.
  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}

static inline uint8x8_t convolve8_8_h(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2,
                                      uint8x8_t s3, uint8x8_t s4, uint8x8_t s5,
                                      uint8x8_t s6, uint8x8_t s7,
                                      int8x8_t filter) {
  int8x16_t filter_x2 = vcombine_s8(filter, filter);

  uint8x16_t s01 = vcombine_u8(s0, s1);
  uint8x16_t s23 = vcombine_u8(s2, s3);
  uint8x16_t s45 = vcombine_u8(s4, s5);
  uint8x16_t s67 = vcombine_u8(s6, s7);

  int32x4_t sum01 = vusdotq_s32(vdupq_n_s32(0), s01, filter_x2);
  int32x4_t sum23 = vusdotq_s32(vdupq_n_s32(0), s23, filter_x2);
  int32x4_t sum45 = vusdotq_s32(vdupq_n_s32(0), s45, filter_x2);
  int32x4_t sum67 = vusdotq_s32(vdupq_n_s32(0), s67, filter_x2);

  int32x4_t sum0123 = vpaddq_s32(sum01, sum23);
  int32x4_t sum4567 = vpaddq_s32(sum45, sum67);
  int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567));

  // We halved the filter values so -1 from right shift.
  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}

static inline void scaled_convolve_horiz_neon_i8mm(
    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
    const ptrdiff_t dst_stride, const InterpKernel *const x_filter,
    const int x0_q4, const int x_step_q4, int w, int h) {
  DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);

  if (w == 4) {
    do {
      int x_q4 = x0_q4;

      // Process a 4x4 tile.
      for (int r = 0; r < 4; ++r) {
        // Halve filter values (all even) to avoid the need for saturating
        // arithmetic in convolution kernels.
        const int8x8_t filter =
            vshrn_n_s16(vld1q_s16(x_filter[x_q4 & SUBPEL_MASK]), 1);

        const uint8_t *s = &src[x_q4 >> SUBPEL_BITS];
        uint8x8_t s0, s1, s2, s3;
        load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3);

        uint8x8_t d0 = convolve8_4_h(s0, s1, s2, s3, filter);

        store_u8_4x1(&temp[4 * r], d0);

        x_q4 += x_step_q4;
      }

      // Transpose the 4x4 result tile and store.
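      // Each loop iteration above produced the four rows of one output
      // column and wrote them contiguously into temp, so temp holds the 4x4
      // tile transposed; flip it back into raster order before storing.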
      uint8x8_t d01 = vld1_u8(temp + 0);
      uint8x8_t d23 = vld1_u8(temp + 8);

      transpose_elems_inplace_u8_4x4(&d01, &d23);

      store_u8x4_strided_x2(dst + 0 * dst_stride, 2 * dst_stride, d01);
      store_u8x4_strided_x2(dst + 1 * dst_stride, 2 * dst_stride, d23);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h > 0);
    return;
  }

  // w >= 8
  do {
    int x_q4 = x0_q4;
    uint8_t *d = dst;
    int width = w;

    do {
      // Process an 8x8 tile.
      for (int r = 0; r < 8; ++r) {
        // Halve filter values (all even) to avoid the need for saturating
        // arithmetic in convolution kernels.
        const int8x8_t filter =
            vshrn_n_s16(vld1q_s16(x_filter[x_q4 & SUBPEL_MASK]), 1);

        const uint8_t *s = &src[x_q4 >> SUBPEL_BITS];
        uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7;
        load_u8_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);

        uint8x8_t d0 = convolve8_8_h(s0, s1, s2, s3, s4, s5, s6, s7, filter);

        vst1_u8(&temp[r * 8], d0);

        x_q4 += x_step_q4;
      }

      // Transpose the 8x8 result tile and store.
      uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7;
      load_u8_8x8(temp, 8, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);

      transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);

      store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);

      d += 8;
      width -= 8;
    } while (width != 0);

    src += 8 * src_stride;
    dst += 8 * dst_stride;
    h -= 8;
  } while (h > 0);
}
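
// The vertical kernels below transpose their input on the fly: the vzip
// sequences gather one byte from each of the eight source rows so that every
// 32-bit lane holds taps 0-3 (or 4-7) of a single output pixel, ready for
// lane-indexed USDOT accumulation against the two halves of the filter.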
static inline uint8x8_t convolve8_4_v(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2,
                                      uint8x8_t s3, uint8x8_t s4, uint8x8_t s5,
                                      uint8x8_t s6, uint8x8_t s7,
                                      int8x8_t filter) {
  uint8x16_t s01 = vcombine_u8(vzip1_u8(s0, s1), vdup_n_u8(0));
  uint8x16_t s23 = vcombine_u8(vzip1_u8(s2, s3), vdup_n_u8(0));
  uint8x16_t s45 = vcombine_u8(vzip1_u8(s4, s5), vdup_n_u8(0));
  uint8x16_t s67 = vcombine_u8(vzip1_u8(s6, s7), vdup_n_u8(0));

  uint8x16_t s0123 = vreinterpretq_u8_u16(
      vzip1q_u16(vreinterpretq_u16_u8(s01), vreinterpretq_u16_u8(s23)));
  uint8x16_t s4567 = vreinterpretq_u8_u16(
      vzip1q_u16(vreinterpretq_u16_u8(s45), vreinterpretq_u16_u8(s67)));

  int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), s0123, filter, 0);
  sum = vusdotq_lane_s32(sum, s4567, filter, 1);

  // We halved the filter values so -1 from right shift.
  return vqrshrun_n_s16(vcombine_s16(vmovn_s32(sum), vdup_n_s16(0)),
                        FILTER_BITS - 1);
}

static inline uint8x8_t convolve8_8_v(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2,
                                      uint8x8_t s3, uint8x8_t s4, uint8x8_t s5,
                                      uint8x8_t s6, uint8x8_t s7,
                                      int8x8_t filter) {
  uint8x16_t s01 =
      vzip1q_u8(vcombine_u8(s0, vdup_n_u8(0)), vcombine_u8(s1, vdup_n_u8(0)));
  uint8x16_t s23 =
      vzip1q_u8(vcombine_u8(s2, vdup_n_u8(0)), vcombine_u8(s3, vdup_n_u8(0)));
  uint8x16_t s45 =
      vzip1q_u8(vcombine_u8(s4, vdup_n_u8(0)), vcombine_u8(s5, vdup_n_u8(0)));
  uint8x16_t s67 =
      vzip1q_u8(vcombine_u8(s6, vdup_n_u8(0)), vcombine_u8(s7, vdup_n_u8(0)));

  uint8x16_t s0123[2] = {
    vreinterpretq_u8_u16(
        vzip1q_u16(vreinterpretq_u16_u8(s01), vreinterpretq_u16_u8(s23))),
    vreinterpretq_u8_u16(
        vzip2q_u16(vreinterpretq_u16_u8(s01), vreinterpretq_u16_u8(s23)))
  };
  uint8x16_t s4567[2] = {
    vreinterpretq_u8_u16(
        vzip1q_u16(vreinterpretq_u16_u8(s45), vreinterpretq_u16_u8(s67))),
    vreinterpretq_u8_u16(
        vzip2q_u16(vreinterpretq_u16_u8(s45), vreinterpretq_u16_u8(s67)))
  };

  int32x4_t sum0123 = vusdotq_lane_s32(vdupq_n_s32(0), s0123[0], filter, 0);
  sum0123 = vusdotq_lane_s32(sum0123, s4567[0], filter, 1);

  int32x4_t sum4567 = vusdotq_lane_s32(vdupq_n_s32(0), s0123[1], filter, 0);
  sum4567 = vusdotq_lane_s32(sum4567, s4567[1], filter, 1);

  int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567));
  // We halved the filter values so -1 from right shift.
  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}

static inline void scaled_convolve_vert_neon_i8mm(
    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
    const ptrdiff_t dst_stride, const InterpKernel *const y_filter,
    const int y0_q4, const int y_step_q4, int w, int h) {
  int y_q4 = y0_q4;

  if (w == 4) {
    do {
      const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride];

      if (y_q4 & SUBPEL_MASK) {
        // Halve filter values (all even) to avoid the need for saturating
        // arithmetic in convolution kernels.
        const int8x8_t filter =
            vshrn_n_s16(vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]), 1);

        uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7;
        load_u8_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);

        uint8x8_t d0 = convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7, filter);

        store_u8_4x1(dst, d0);
      } else {
        // Memcpy for non-subpel locations.
        memcpy(dst, &s[(SUBPEL_TAPS / 2 - 1) * src_stride], 4);
      }

      y_q4 += y_step_q4;
      dst += dst_stride;
    } while (--h != 0);
    return;
  }

  // w >= 8
  do {
    const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
    uint8_t *d = dst;
    int width = w;

    if (y_q4 & SUBPEL_MASK) {
      // Halve filter values (all even) to avoid the need for saturating
      // arithmetic in convolution kernels.
      const int8x8_t filter =
          vshrn_n_s16(vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]), 1);

      do {
        uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7;
        load_u8_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);

        uint8x8_t d0 = convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7, filter);

        vst1_u8(d, d0);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
    } else {
      // Memcpy for non-subpel locations.
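      // The subpel phase is zero here, so the 8-tap filter reduces to the
      // identity and the centre row of the support can be copied directly;
      // s points at the top of the support, hence the SUBPEL_TAPS / 2 - 1
      // row offset below.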
      s += (SUBPEL_TAPS / 2 - 1) * src_stride;

      do {
        uint8x8_t s0 = vld1_u8(s);
        vst1_u8(d, s0);
        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
    }

    y_q4 += y_step_q4;
    dst += dst_stride;
  } while (--h != 0);
}

void aom_scaled_2d_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const InterpKernel *filter, int x0_q4,
                             int x_step_q4, int y0_q4, int y_step_q4, int w,
                             int h) {
  // A fixed-size intermediate buffer, im_block, places limits on the
  // parameters. 2D filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into the intermediate buffer, im_block.
  //   (2) Interpolate im_block vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the im_block buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  //   original frame (in 1/16th pixel units), i.e. 126 whole pixels.
  // --Must round up to 127 source rows because the block may be located at a
  //   sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails:
  //   127 + 8 = 135.
  // --Require an additional 8 rows for the horiz_w8 transpose tail.
  // When called from the frame scaling function, the smallest scaling factor
  // is x1/4 ==> y_step_q4 = 64. Since w and h are at most 16 in that case,
  // the im_block buffer is still big enough.
  DECLARE_ALIGNED(16, uint8_t, im_block[(135 + 8) * 64]);
  const int im_height =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
  const ptrdiff_t im_stride = 64;

  assert(w <= 64);
  assert(h <= 64);
  assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
  assert(x_step_q4 <= 64);

  // Account for needing SUBPEL_TAPS / 2 - 1 lines prior and SUBPEL_TAPS / 2
  // lines post both horizontally and vertically.
  const ptrdiff_t horiz_offset = SUBPEL_TAPS / 2 - 1;
  const ptrdiff_t vert_offset = (SUBPEL_TAPS / 2 - 1) * src_stride;

  scaled_convolve_horiz_neon_i8mm(src - horiz_offset - vert_offset, src_stride,
                                  im_block, im_stride, filter, x0_q4, x_step_q4,
                                  w, im_height);

  scaled_convolve_vert_neon_i8mm(im_block, im_stride, dst, dst_stride, filter,
                                 y0_q4, y_step_q4, w, h);
}
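
// A minimal usage sketch (illustrative values, not from this file): scaling
// a 64x64 source region down by 2 in each dimension yields a 32x32 output
// block. The step values are in Q4 (16 == unit step), so a x1/2 scale means
// x_step_q4 = y_step_q4 = 32, and filter is an InterpKernel array of 16
// phase-indexed 8-tap kernels:
//
//   aom_scaled_2d_neon_i8mm(src, src_stride, dst, dst_stride, filter,
//                           /*x0_q4=*/0, /*x_step_q4=*/32,
//                           /*y0_q4=*/0, /*y_step_q4=*/32,
//                           /*w=*/32, /*h=*/32);
//
// src should point at the block's top-left sample; the function itself
// reaches back SUBPEL_TAPS / 2 - 1 rows and columns for the filter support.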