tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

wiener_convolve_neon.c (13493B)


      1 /*
      2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <arm_neon.h>
     13 #include <assert.h>
     14 
     15 #include "config/aom_config.h"
     16 #include "config/av1_rtcd.h"
     17 
     18 #include "aom_dsp/arm/mem_neon.h"
     19 #include "aom_dsp/arm/transpose_neon.h"
     20 #include "aom_dsp/txfm_common.h"
     21 #include "aom_ports/mem.h"
     22 #include "av1/common/common.h"
     23 #include "av1/common/restoration.h"
     24 
// Horizontal pass of the 5-tap Wiener filter for 8 output pixels.
// t0..t4 are five horizontally adjacent 8-pixel windows of the source row
// (t2 is the centre window). Returns the 16-bit intermediate result after a
// rounding right-shift by WIENER_ROUND0_BITS, clamped to im_max_val.
static inline uint16x8_t wiener_convolve5_8_2d_h(
    const uint8x8_t t0, const uint8x8_t t1, const uint8x8_t t2,
    const uint8x8_t t3, const uint8x8_t t4, const int16x4_t x_filter,
    const int32x4_t round_vec, const uint16x8_t im_max_val) {
  // Since the Wiener filter is symmetric about the middle tap (tap 2) add
  // mirrored source elements before multiplying filter coefficients.
  int16x8_t s04 = vreinterpretq_s16_u16(vaddl_u8(t0, t4));
  int16x8_t s13 = vreinterpretq_s16_u16(vaddl_u8(t1, t3));
  int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));

  // x_filter[0] = 0. (5-tap filters are 0-padded to 7 taps.) Lane 0 is
  // therefore skipped; round_vec seeds the accumulator with the rounding
  // offset supplied by the caller.
  int32x4_t sum_lo = vmlal_lane_s16(round_vec, vget_low_s16(s04), x_filter, 1);
  sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s13), x_filter, 2);
  sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s2), x_filter, 3);

  int32x4_t sum_hi = vmlal_lane_s16(round_vec, vget_high_s16(s04), x_filter, 1);
  sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s13), x_filter, 2);
  sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s2), x_filter, 3);

  // Narrow to u16 with a saturating rounding shift, then clamp to the
  // maximum representable intermediate value.
  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum_lo, WIENER_ROUND0_BITS),
                                vqrshrun_n_s32(sum_hi, WIENER_ROUND0_BITS));

  return vminq_u16(res, im_max_val);
}
     49 
     50 static inline void convolve_add_src_horiz_5tap_neon(
     51    const uint8_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
     52    ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter,
     53    const int32x4_t round_vec, const uint16x8_t im_max_val) {
     54  do {
     55    const uint8_t *s = src_ptr;
     56    uint16_t *d = dst_ptr;
     57    int width = w;
     58 
     59    do {
     60      uint8x8_t s0, s1, s2, s3, s4;
     61      load_u8_8x5(s, 1, &s0, &s1, &s2, &s3, &s4);
     62 
     63      uint16x8_t d0 = wiener_convolve5_8_2d_h(s0, s1, s2, s3, s4, x_filter,
     64                                              round_vec, im_max_val);
     65 
     66      vst1q_u16(d, d0);
     67 
     68      s += 8;
     69      d += 8;
     70      width -= 8;
     71    } while (width != 0);
     72    src_ptr += src_stride;
     73    dst_ptr += dst_stride;
     74  } while (--h != 0);
     75 }
     76 
// Horizontal pass of the 7-tap Wiener filter for 8 output pixels.
// t0..t6 are seven horizontally adjacent 8-pixel windows of the source row
// (t3 is the centre window). Returns the 16-bit intermediate result after a
// rounding right-shift by WIENER_ROUND0_BITS, clamped to im_max_val.
static inline uint16x8_t wiener_convolve7_8_2d_h(
    const uint8x8_t t0, const uint8x8_t t1, const uint8x8_t t2,
    const uint8x8_t t3, const uint8x8_t t4, const uint8x8_t t5,
    const uint8x8_t t6, const int16x4_t x_filter, const int32x4_t round_vec,
    const uint16x8_t im_max_val) {
  // Since the Wiener filter is symmetric about the middle tap (tap 3) add
  // mirrored source elements before multiplying by filter coefficients.
  int16x8_t s06 = vreinterpretq_s16_u16(vaddl_u8(t0, t6));
  int16x8_t s15 = vreinterpretq_s16_u16(vaddl_u8(t1, t5));
  int16x8_t s24 = vreinterpretq_s16_u16(vaddl_u8(t2, t4));
  int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));

  // round_vec seeds the accumulator with the caller-supplied rounding offset.
  int32x4_t sum_lo = vmlal_lane_s16(round_vec, vget_low_s16(s06), x_filter, 0);
  sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s15), x_filter, 1);
  sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s24), x_filter, 2);
  sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s3), x_filter, 3);

  int32x4_t sum_hi = vmlal_lane_s16(round_vec, vget_high_s16(s06), x_filter, 0);
  sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s15), x_filter, 1);
  sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s24), x_filter, 2);
  sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s3), x_filter, 3);

  // Narrow to u16 with a saturating rounding shift, then clamp to the
  // maximum representable intermediate value.
  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum_lo, WIENER_ROUND0_BITS),
                                vqrshrun_n_s32(sum_hi, WIENER_ROUND0_BITS));

  return vminq_u16(res, im_max_val);
}
    104 
    105 static inline void convolve_add_src_horiz_7tap_neon(
    106    const uint8_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    107    ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter,
    108    const int32x4_t round_vec, const uint16x8_t im_max_val) {
    109  do {
    110    const uint8_t *s = src_ptr;
    111    uint16_t *d = dst_ptr;
    112    int width = w;
    113 
    114    do {
    115      uint8x8_t s0, s1, s2, s3, s4, s5, s6;
    116      load_u8_8x7(s, 1, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
    117 
    118      uint16x8_t d0 = wiener_convolve7_8_2d_h(s0, s1, s2, s3, s4, s5, s6,
    119                                              x_filter, round_vec, im_max_val);
    120 
    121      vst1q_u16(d, d0);
    122 
    123      s += 8;
    124      d += 8;
    125      width -= 8;
    126    } while (width != 0);
    127    src_ptr += src_stride;
    128    dst_ptr += dst_stride;
    129  } while (--h != 0);
    130 }
    131 
// Vertical pass of the 5-tap Wiener filter for 8 columns.
// s0..s4 are five consecutive rows of 16-bit intermediate data (s2 is the
// centre row). Returns 8 output pixels narrowed to 8 bits with unsigned
// saturation after the final downshift.
static inline uint8x8_t wiener_convolve5_8_2d_v(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x4_t y_filter,
    const int32x4_t round_vec) {
  // Since the Wiener filter is symmetric about the middle tap (tap 2) add
  // mirrored source elements before multiplying by filter coefficients.
  int16x8_t s04 = vaddq_s16(s0, s4);
  int16x8_t s13 = vaddq_s16(s1, s3);

  // y_filter[0] = 0 for the 5-tap case, so lane 0 is skipped. round_vec
  // seeds the accumulator with the rounding/offset-correction term.
  int32x4_t sum_lo = vmlal_lane_s16(round_vec, vget_low_s16(s04), y_filter, 1);
  sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s13), y_filter, 2);
  sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s2), y_filter, 3);

  int32x4_t sum_hi = vmlal_lane_s16(round_vec, vget_high_s16(s04), y_filter, 1);
  sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s13), y_filter, 2);
  sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s2), y_filter, 3);

  // Plain (non-rounding) narrowing shift: round_vec already contains the
  // rounding constant for this shift.
  int16x4_t res_lo = vshrn_n_s32(sum_lo, 2 * FILTER_BITS - WIENER_ROUND0_BITS);
  int16x4_t res_hi = vshrn_n_s32(sum_hi, 2 * FILTER_BITS - WIENER_ROUND0_BITS);

  return vqmovun_s16(vcombine_s16(res_lo, res_hi));
}
    154 
    155 static inline void convolve_add_src_vert_5tap_neon(
    156    const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst,
    157    ptrdiff_t dst_stride, int w, int h, const int16x4_t y_filter,
    158    const int32x4_t round_vec) {
    159  do {
    160    const int16_t *s = (int16_t *)src;
    161    uint8_t *d = dst;
    162    int height = h;
    163 
    164    while (height > 3) {
    165      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
    166      load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
    167 
    168      uint8x8_t d0 =
    169          wiener_convolve5_8_2d_v(s0, s1, s2, s3, s4, y_filter, round_vec);
    170      uint8x8_t d1 =
    171          wiener_convolve5_8_2d_v(s1, s2, s3, s4, s5, y_filter, round_vec);
    172      uint8x8_t d2 =
    173          wiener_convolve5_8_2d_v(s2, s3, s4, s5, s6, y_filter, round_vec);
    174      uint8x8_t d3 =
    175          wiener_convolve5_8_2d_v(s3, s4, s5, s6, s7, y_filter, round_vec);
    176 
    177      store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
    178 
    179      s += 4 * src_stride;
    180      d += 4 * dst_stride;
    181      height -= 4;
    182    }
    183 
    184    while (height-- != 0) {
    185      int16x8_t s0, s1, s2, s3, s4;
    186      load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
    187 
    188      uint8x8_t d0 =
    189          wiener_convolve5_8_2d_v(s0, s1, s2, s3, s4, y_filter, round_vec);
    190 
    191      vst1_u8(d, d0);
    192 
    193      d += dst_stride;
    194      s += src_stride;
    195    }
    196 
    197    src += 8;
    198    dst += 8;
    199    w -= 8;
    200  } while (w != 0);
    201 }
    202 
// Vertical pass of the 7-tap Wiener filter for 8 columns.
// s0..s6 are seven consecutive rows of 16-bit intermediate data (s3 is the
// centre row). Returns 8 output pixels narrowed to 8 bits with unsigned
// saturation after the final downshift.
static inline uint8x8_t wiener_convolve7_8_2d_v(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x4_t y_filter, const int32x4_t round_vec) {
  // Since the Wiener filter is symmetric about the middle tap (tap 3) add
  // mirrored source elements before multiplying by filter coefficients.
  int16x8_t s06 = vaddq_s16(s0, s6);
  int16x8_t s15 = vaddq_s16(s1, s5);
  int16x8_t s24 = vaddq_s16(s2, s4);

  // round_vec seeds the accumulator with the rounding/offset-correction term.
  int32x4_t sum_lo = vmlal_lane_s16(round_vec, vget_low_s16(s06), y_filter, 0);
  sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s15), y_filter, 1);
  sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s24), y_filter, 2);
  sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s3), y_filter, 3);

  int32x4_t sum_hi = vmlal_lane_s16(round_vec, vget_high_s16(s06), y_filter, 0);
  sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s15), y_filter, 1);
  sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s24), y_filter, 2);
  sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s3), y_filter, 3);

  // Plain (non-rounding) narrowing shift: round_vec already contains the
  // rounding constant for this shift.
  int16x4_t res_lo = vshrn_n_s32(sum_lo, 2 * FILTER_BITS - WIENER_ROUND0_BITS);
  int16x4_t res_hi = vshrn_n_s32(sum_hi, 2 * FILTER_BITS - WIENER_ROUND0_BITS);

  return vqmovun_s16(vcombine_s16(res_lo, res_hi));
}
    228 
    229 static inline void convolve_add_src_vert_7tap_neon(
    230    const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst,
    231    ptrdiff_t dst_stride, int w, int h, const int16x4_t y_filter,
    232    const int32x4_t round_vec) {
    233  do {
    234    const int16_t *s = (int16_t *)src;
    235    uint8_t *d = dst;
    236    int height = h;
    237 
    238    while (height > 3) {
    239      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9;
    240      load_s16_8x10(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
    241                    &s9);
    242 
    243      uint8x8_t d0 = wiener_convolve7_8_2d_v(s0, s1, s2, s3, s4, s5, s6,
    244                                             y_filter, round_vec);
    245      uint8x8_t d1 = wiener_convolve7_8_2d_v(s1, s2, s3, s4, s5, s6, s7,
    246                                             y_filter, round_vec);
    247      uint8x8_t d2 = wiener_convolve7_8_2d_v(s2, s3, s4, s5, s6, s7, s8,
    248                                             y_filter, round_vec);
    249      uint8x8_t d3 = wiener_convolve7_8_2d_v(s3, s4, s5, s6, s7, s8, s9,
    250                                             y_filter, round_vec);
    251 
    252      store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
    253 
    254      s += 4 * src_stride;
    255      d += 4 * dst_stride;
    256      height -= 4;
    257    }
    258 
    259    while (height-- != 0) {
    260      int16x8_t s0, s1, s2, s3, s4, s5, s6;
    261      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
    262 
    263      uint8x8_t d0 = wiener_convolve7_8_2d_v(s0, s1, s2, s3, s4, s5, s6,
    264                                             y_filter, round_vec);
    265 
    266      vst1_u8(d, d0);
    267 
    268      d += dst_stride;
    269      s += src_stride;
    270    }
    271 
    272    src += 8;
    273    dst += 8;
    274    w -= 8;
    275  } while (w != 0);
    276 }
    277 
    278 static inline int get_wiener_filter_taps(const int16_t *filter) {
    279  assert(filter[7] == 0);
    280  if (filter[0] == 0 && filter[6] == 0) {
    281    return WIENER_WIN_REDUCED;
    282  }
    283  return WIENER_WIN;
    284 }
    285 
// Wiener filter 2D
// Apply horizontal filter and store in a temporary buffer. When applying
// vertical filter, overwrite the original pixel values.
// The "add src" part of the operation is folded into the filters: 128
// (presumably 1 << FILTER_BITS, i.e. unity gain — confirm against
// FILTER_BITS) is added to the centre tap of each filter below.
void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const int16_t *x_filter, int x_step_q4,
                                      const int16_t *y_filter, int y_step_q4,
                                      int w, int h,
                                      const WienerConvolveParams *conv_params) {
  // Fixed-step, 8-bit-only path: the step and conv_params arguments are
  // validated by the asserts below and otherwise unused.
  (void)x_step_q4;
  (void)y_step_q4;
  (void)conv_params;

  assert(w % 8 == 0);
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
  assert(x_step_q4 == 16 && y_step_q4 == 16);
  assert(x_filter[7] == 0 && y_filter[7] == 0);
  // For bd == 8, assert horizontal filtering output will not exceed 15-bit:
  assert(8 + 1 + FILTER_BITS - conv_params->round_0 <= 15);

  // Intermediate buffer for the horizontally filtered, vertically
  // unfiltered rows (h plus the vertical filter's extra rows).
  DECLARE_ALIGNED(16, uint16_t,
                  im_block[(MAX_SB_SIZE + WIENER_WIN - 1) * MAX_SB_SIZE]);

  const int x_filter_taps = get_wiener_filter_taps(x_filter);
  const int y_filter_taps = get_wiener_filter_taps(y_filter);
  // Only the first 4 taps are loaded: the kernels exploit symmetry about
  // the centre tap, so taps 4..6 mirror taps 0..2.
  int16x4_t x_filter_s16 = vld1_s16(x_filter);
  int16x4_t y_filter_s16 = vld1_s16(y_filter);
  // Add 128 to tap 3. (Needed for rounding.) 128ULL << 48 places the value
  // in lane 3 of the 4 x 16-bit vector.
  x_filter_s16 = vadd_s16(x_filter_s16, vcreate_s16(128ULL << 48));
  y_filter_s16 = vadd_s16(y_filter_s16, vcreate_s16(128ULL << 48));

  const int im_stride = MAX_SB_SIZE;
  const int im_h = h + y_filter_taps - 1;
  // Back up the source pointer so the filter window is centred on each
  // output pixel.
  const int horiz_offset = x_filter_taps / 2;
  const int vert_offset = (y_filter_taps / 2) * (int)src_stride;

  const int bd = 8;
  // Clamp for the 15-bit intermediate precision asserted above.
  const uint16x8_t im_max_val =
      vdupq_n_u16((1 << (bd + 1 + FILTER_BITS - WIENER_ROUND0_BITS)) - 1);
  const int32x4_t horiz_round_vec = vdupq_n_s32(1 << (bd + FILTER_BITS - 1));

  // Vertical rounding constant: rounding term for the final shift minus the
  // offset introduced by horiz_round_vec (carried through the second pass).
  const int32x4_t vert_round_vec =
      vdupq_n_s32((1 << (2 * FILTER_BITS - WIENER_ROUND0_BITS - 1)) -
                  (1 << (bd + (2 * FILTER_BITS - WIENER_ROUND0_BITS) - 1)));

  // Horizontal pass into im_block (im_h rows to feed the vertical filter).
  if (x_filter_taps == WIENER_WIN_REDUCED) {
    convolve_add_src_horiz_5tap_neon(src - horiz_offset - vert_offset,
                                     src_stride, im_block, im_stride, w, im_h,
                                     x_filter_s16, horiz_round_vec, im_max_val);
  } else {
    convolve_add_src_horiz_7tap_neon(src - horiz_offset - vert_offset,
                                     src_stride, im_block, im_stride, w, im_h,
                                     x_filter_s16, horiz_round_vec, im_max_val);
  }

  // Vertical pass from im_block to the final 8-bit destination.
  if (y_filter_taps == WIENER_WIN_REDUCED) {
    convolve_add_src_vert_5tap_neon(im_block, im_stride, dst, dst_stride, w, h,
                                    y_filter_s16, vert_round_vec);
  } else {
    convolve_add_src_vert_7tap_neon(im_block, im_stride, dst, dst_stride, w, h,
                                    y_filter_s16, vert_round_vec);
  }
}