tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

warp_plane_neon.c (12786B)


/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "warp_plane_neon.h"

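// Horizontal pass producing 4 output pixels: each pixel gets its own 8-tap
// filter (filter phase stepping by alpha across the row), applied to a
// sliding window of 8 source pixels.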
static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in,
                                                           int sx, int alpha) {
  const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));

  // Load four 8-tap filters, one per output pixel.
  int16x8_t f[4];
  load_filters_4(f, sx, alpha);

  int16x8_t in16_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(in)));
  int16x8_t in16_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(in)));

  int16x8_t m0 = vmulq_s16(f[0], in16_lo);
  int16x8_t m1 = vmulq_s16(f[1], vextq_s16(in16_lo, in16_hi, 1));
  int16x8_t m2 = vmulq_s16(f[2], vextq_s16(in16_lo, in16_hi, 2));
  int16x8_t m3 = vmulq_s16(f[3], vextq_s16(in16_lo, in16_hi, 3));

  int32x4_t m0123_pairs[] = { vpaddlq_s16(m0), vpaddlq_s16(m1), vpaddlq_s16(m2),
                              vpaddlq_s16(m3) };

  int32x4_t tmp_res_low = horizontal_add_4d_s32x4(m0123_pairs);

  tmp_res_low = vaddq_s32(tmp_res_low, add_const);

  uint16x8_t res =
      vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), vdup_n_u16(0));
  return vreinterpretq_s16_u16(res);
}

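// Horizontal pass producing 8 output pixels: same scheme as the 4-pixel
// variant, with eight per-pixel filters and both halves of the result filled.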
static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in,
                                                           int sx, int alpha) {
  const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));

  // Load eight 8-tap filters, one per output pixel.
  int16x8_t f[8];
  load_filters_8(f, sx, alpha);

  int16x8_t in16_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(in)));
  int16x8_t in16_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(in)));

  int16x8_t m0 = vmulq_s16(f[0], in16_lo);
  int16x8_t m1 = vmulq_s16(f[1], vextq_s16(in16_lo, in16_hi, 1));
  int16x8_t m2 = vmulq_s16(f[2], vextq_s16(in16_lo, in16_hi, 2));
  int16x8_t m3 = vmulq_s16(f[3], vextq_s16(in16_lo, in16_hi, 3));
  int16x8_t m4 = vmulq_s16(f[4], vextq_s16(in16_lo, in16_hi, 4));
  int16x8_t m5 = vmulq_s16(f[5], vextq_s16(in16_lo, in16_hi, 5));
  int16x8_t m6 = vmulq_s16(f[6], vextq_s16(in16_lo, in16_hi, 6));
  int16x8_t m7 = vmulq_s16(f[7], vextq_s16(in16_lo, in16_hi, 7));

  int32x4_t m0123_pairs[] = { vpaddlq_s16(m0), vpaddlq_s16(m1), vpaddlq_s16(m2),
                              vpaddlq_s16(m3) };
  int32x4_t m4567_pairs[] = { vpaddlq_s16(m4), vpaddlq_s16(m5), vpaddlq_s16(m6),
                              vpaddlq_s16(m7) };

  int32x4_t tmp_res_low = horizontal_add_4d_s32x4(m0123_pairs);
  int32x4_t tmp_res_high = horizontal_add_4d_s32x4(m4567_pairs);

  tmp_res_low = vaddq_s32(tmp_res_low, add_const);
  tmp_res_high = vaddq_s32(tmp_res_high, add_const);

  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS),
                                vqrshrun_n_s32(tmp_res_high, ROUND0_BITS));
  return vreinterpretq_s16_u16(res);
}

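// Horizontal pass, 4 output pixels, for alpha == 0 and beta == 0: a single
// 8-tap filter, passed in by the caller, is shared by all pixel positions.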
static AOM_FORCE_INLINE int16x8_t
horizontal_filter_4x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) {
  const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));

  int16x8_t in16_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(in)));
  int16x8_t in16_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(in)));

  int16x8_t m0 = vmulq_s16(f_s16, in16_lo);
  int16x8_t m1 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 1));
  int16x8_t m2 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 2));
  int16x8_t m3 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 3));

  int32x4_t m0123_pairs[] = { vpaddlq_s16(m0), vpaddlq_s16(m1), vpaddlq_s16(m2),
                              vpaddlq_s16(m3) };

  int32x4_t tmp_res_low = horizontal_add_4d_s32x4(m0123_pairs);

  tmp_res_low = vaddq_s32(tmp_res_low, add_const);

  uint16x8_t res =
      vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), vdup_n_u16(0));
  return vreinterpretq_s16_u16(res);
}

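// As above for alpha == 0 only: the shared filter for this row is loaded
// from sx.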
static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in,
                                                           int sx) {
  int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]);
  return horizontal_filter_4x1_f1_beta0(in, f_s16);
}

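// 8-pixel counterpart of horizontal_filter_4x1_f1_beta0: one shared filter
// applied at all eight positions.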
static AOM_FORCE_INLINE int16x8_t
horizontal_filter_8x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) {
  const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));

  int16x8_t in16_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(in)));
  int16x8_t in16_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(in)));

  int16x8_t m0 = vmulq_s16(f_s16, in16_lo);
  int16x8_t m1 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 1));
  int16x8_t m2 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 2));
  int16x8_t m3 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 3));
  int16x8_t m4 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 4));
  int16x8_t m5 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 5));
  int16x8_t m6 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 6));
  int16x8_t m7 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 7));

  int32x4_t m0123_pairs[] = { vpaddlq_s16(m0), vpaddlq_s16(m1), vpaddlq_s16(m2),
                              vpaddlq_s16(m3) };
  int32x4_t m4567_pairs[] = { vpaddlq_s16(m4), vpaddlq_s16(m5), vpaddlq_s16(m6),
                              vpaddlq_s16(m7) };

  int32x4_t tmp_res_low = horizontal_add_4d_s32x4(m0123_pairs);
  int32x4_t tmp_res_high = horizontal_add_4d_s32x4(m4567_pairs);

  tmp_res_low = vaddq_s32(tmp_res_low, add_const);
  tmp_res_high = vaddq_s32(tmp_res_high, add_const);

  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS),
                                vqrshrun_n_s32(tmp_res_high, ROUND0_BITS));
  return vreinterpretq_s16_u16(res);
}

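// 8-pixel counterpart of horizontal_filter_4x1_f1.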
static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in,
                                                           int sx) {
  int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]);
  return horizontal_filter_8x1_f1_beta0(in, f_s16);
}

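// Vertical pass, 4 output pixels, gamma == 0: one 8-tap filter selected by sy
// is applied down eight rows of intermediate values, accumulating into 32
// bits.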
static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src,
                                                    int32x4_t *res, int sy) {
  int16x4_t s0 = vget_low_s16(src[0]);
  int16x4_t s1 = vget_low_s16(src[1]);
  int16x4_t s2 = vget_low_s16(src[2]);
  int16x4_t s3 = vget_low_s16(src[3]);
  int16x4_t s4 = vget_low_s16(src[4]);
  int16x4_t s5 = vget_low_s16(src[5]);
  int16x4_t s6 = vget_low_s16(src[6]);
  int16x4_t s7 = vget_low_s16(src[7]);

  int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]);

  int32x4_t m0123 = vmull_lane_s16(s0, vget_low_s16(f), 0);
  m0123 = vmlal_lane_s16(m0123, s1, vget_low_s16(f), 1);
  m0123 = vmlal_lane_s16(m0123, s2, vget_low_s16(f), 2);
  m0123 = vmlal_lane_s16(m0123, s3, vget_low_s16(f), 3);
  m0123 = vmlal_lane_s16(m0123, s4, vget_high_s16(f), 0);
  m0123 = vmlal_lane_s16(m0123, s5, vget_high_s16(f), 1);
  m0123 = vmlal_lane_s16(m0123, s6, vget_high_s16(f), 2);
  m0123 = vmlal_lane_s16(m0123, s7, vget_high_s16(f), 3);

  *res = m0123;
}

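// Vertical pass, 4 output pixels, with per-column filters (phase stepping by
// gamma): the 8x4 block of taps is transposed so each column's taps sit in
// one vector before the multiply-accumulate.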
static AOM_FORCE_INLINE void vertical_filter_4x1_f4(const int16x8_t *src,
                                                    int32x4_t *res, int sy,
                                                    int gamma) {
  int16x8_t s0, s1, s2, s3;
  transpose_elems_s16_4x8(
      vget_low_s16(src[0]), vget_low_s16(src[1]), vget_low_s16(src[2]),
      vget_low_s16(src[3]), vget_low_s16(src[4]), vget_low_s16(src[5]),
      vget_low_s16(src[6]), vget_low_s16(src[7]), &s0, &s1, &s2, &s3);

  int16x8_t f[4];
  load_filters_4(f, sy, gamma);

  int32x4_t m0 = vmull_s16(vget_low_s16(s0), vget_low_s16(f[0]));
  m0 = vmlal_s16(m0, vget_high_s16(s0), vget_high_s16(f[0]));
  int32x4_t m1 = vmull_s16(vget_low_s16(s1), vget_low_s16(f[1]));
  m1 = vmlal_s16(m1, vget_high_s16(s1), vget_high_s16(f[1]));
  int32x4_t m2 = vmull_s16(vget_low_s16(s2), vget_low_s16(f[2]));
  m2 = vmlal_s16(m2, vget_high_s16(s2), vget_high_s16(f[2]));
  int32x4_t m3 = vmull_s16(vget_low_s16(s3), vget_low_s16(f[3]));
  m3 = vmlal_s16(m3, vget_high_s16(s3), vget_high_s16(f[3]));

  int32x4_t m0123_pairs[] = { m0, m1, m2, m3 };

  *res = horizontal_add_4d_s32x4(m0123_pairs);
}

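// Vertical pass, 8 output pixels, gamma == 0: one shared filter; the low and
// high halves of the eight source rows are accumulated into separate 32-bit
// result vectors.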
static AOM_FORCE_INLINE void vertical_filter_8x1_f1(const int16x8_t *src,
                                                    int32x4_t *res_low,
                                                    int32x4_t *res_high,
                                                    int sy) {
  int16x8_t s0 = src[0];
  int16x8_t s1 = src[1];
  int16x8_t s2 = src[2];
  int16x8_t s3 = src[3];
  int16x8_t s4 = src[4];
  int16x8_t s5 = src[5];
  int16x8_t s6 = src[6];
  int16x8_t s7 = src[7];

  int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]);

  int32x4_t m0123 = vmull_lane_s16(vget_low_s16(s0), vget_low_s16(f), 0);
  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s1), vget_low_s16(f), 1);
  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s2), vget_low_s16(f), 2);
  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s3), vget_low_s16(f), 3);
  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s4), vget_high_s16(f), 0);
  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s5), vget_high_s16(f), 1);
  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s6), vget_high_s16(f), 2);
  m0123 = vmlal_lane_s16(m0123, vget_low_s16(s7), vget_high_s16(f), 3);

  int32x4_t m4567 = vmull_lane_s16(vget_high_s16(s0), vget_low_s16(f), 0);
  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s1), vget_low_s16(f), 1);
  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s2), vget_low_s16(f), 2);
  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s3), vget_low_s16(f), 3);
  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s4), vget_high_s16(f), 0);
  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s5), vget_high_s16(f), 1);
  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s6), vget_high_s16(f), 2);
  m4567 = vmlal_lane_s16(m4567, vget_high_s16(s7), vget_high_s16(f), 3);

  *res_low = m0123;
  *res_high = m4567;
}

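// Vertical pass, 8 output pixels, with per-column filters: the 8x8 block is
// transposed in place so each vector holds one column's taps.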
static AOM_FORCE_INLINE void vertical_filter_8x1_f8(const int16x8_t *src,
                                                    int32x4_t *res_low,
                                                    int32x4_t *res_high, int sy,
                                                    int gamma) {
  int16x8_t s0 = src[0];
  int16x8_t s1 = src[1];
  int16x8_t s2 = src[2];
  int16x8_t s3 = src[3];
  int16x8_t s4 = src[4];
  int16x8_t s5 = src[5];
  int16x8_t s6 = src[6];
  int16x8_t s7 = src[7];
  transpose_elems_inplace_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);

  int16x8_t f[8];
  load_filters_8(f, sy, gamma);

  int32x4_t m0 = vmull_s16(vget_low_s16(s0), vget_low_s16(f[0]));
  m0 = vmlal_s16(m0, vget_high_s16(s0), vget_high_s16(f[0]));
  int32x4_t m1 = vmull_s16(vget_low_s16(s1), vget_low_s16(f[1]));
  m1 = vmlal_s16(m1, vget_high_s16(s1), vget_high_s16(f[1]));
  int32x4_t m2 = vmull_s16(vget_low_s16(s2), vget_low_s16(f[2]));
  m2 = vmlal_s16(m2, vget_high_s16(s2), vget_high_s16(f[2]));
  int32x4_t m3 = vmull_s16(vget_low_s16(s3), vget_low_s16(f[3]));
  m3 = vmlal_s16(m3, vget_high_s16(s3), vget_high_s16(f[3]));
  int32x4_t m4 = vmull_s16(vget_low_s16(s4), vget_low_s16(f[4]));
  m4 = vmlal_s16(m4, vget_high_s16(s4), vget_high_s16(f[4]));
  int32x4_t m5 = vmull_s16(vget_low_s16(s5), vget_low_s16(f[5]));
  m5 = vmlal_s16(m5, vget_high_s16(s5), vget_high_s16(f[5]));
  int32x4_t m6 = vmull_s16(vget_low_s16(s6), vget_low_s16(f[6]));
  m6 = vmlal_s16(m6, vget_high_s16(s6), vget_high_s16(f[6]));
  int32x4_t m7 = vmull_s16(vget_low_s16(s7), vget_low_s16(f[7]));
  m7 = vmlal_s16(m7, vget_high_s16(s7), vget_high_s16(f[7]));

  int32x4_t m0123_pairs[] = { m0, m1, m2, m3 };
  int32x4_t m4567_pairs[] = { m4, m5, m6, m7 };

  *res_low = horizontal_add_4d_s32x4(m0123_pairs);
  *res_high = horizontal_add_4d_s32x4(m4567_pairs);
}

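// Standard-bitdepth NEON entry point: the work happens in the shared driver
// from warp_plane_neon.h, which dispatches to the filter helpers above (the
// f1/_beta0 variants cover the cases where alpha, beta or gamma is zero).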
void av1_warp_affine_neon(const int32_t *mat, const uint8_t *ref, int width,
                          int height, int stride, uint8_t *pred, int p_col,
                          int p_row, int p_width, int p_height, int p_stride,
                          int subsampling_x, int subsampling_y,
                          ConvolveParams *conv_params, int16_t alpha,
                          int16_t beta, int16_t gamma, int16_t delta) {
  av1_warp_affine_common(mat, ref, width, height, stride, pred, p_col, p_row,
                         p_width, p_height, p_stride, subsampling_x,
                         subsampling_y, conv_params, alpha, beta, gamma, delta);
}