tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

convolve_neon.h (24355B)


/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_
#define AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_

#include <arm_neon.h>

#include "config/aom_config.h"

#include "aom_dsp/arm/mem_neon.h"
#include "av1/common/convolve.h"
#include "av1/common/filter.h"

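// Vertical pass of a 12-tap 2D convolution, for four pixels. The inputs are
// 16-bit intermediates produced by the horizontal pass; the widened 32-bit
// accumulator is returned unshifted so the caller can batch the narrowing.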
static inline int32x4_t convolve12_4_2d_v(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
    const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) {
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);

  int32x4_t sum = vmull_lane_s16(s0, y_filter_0_3, 0);
  sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1);
  sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2);
  sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3);
  sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0);
  sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1);
  sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2);
  sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3);
  sum = vmlal_lane_s16(sum, s8, y_filter_8_11, 0);
  sum = vmlal_lane_s16(sum, s9, y_filter_8_11, 1);
  sum = vmlal_lane_s16(sum, s10, y_filter_8_11, 2);
  sum = vmlal_lane_s16(sum, s11, y_filter_8_11, 3);

  return sum;
}

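// As above, but producing eight pixels per call and also applying the final
// rounding shift, the offset subtraction and the saturating narrow to 8 bits.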
static inline uint8x8_t convolve12_8_2d_v(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16x8_t s8,
    const int16x8_t s9, const int16x8_t s10, const int16x8_t s11,
    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
    const int16x8_t sub_const) {
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);

  int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_0_3, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s8), y_filter_8_11, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s9), y_filter_8_11, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s10), y_filter_8_11, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s11), y_filter_8_11, 3);

  int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_0_3, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s8), y_filter_8_11, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s9), y_filter_8_11, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s10), y_filter_8_11, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s11), y_filter_8_11, 3);

  int16x8_t res =
      vcombine_s16(vqrshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS),
                   vqrshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS));
  res = vsubq_s16(res, sub_const);

  return vqmovun_s16(res);
}

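// Vertical (second) pass of the 2D separable convolution with a 12-tap
// filter. src_ptr points at the 16-bit intermediate written by the horizontal
// pass, which has already been rounded by ROUND0_BITS and carries a constant
// offset that keeps it non-negative; after the remaining
// (2 * FILTER_BITS - ROUND0_BITS) shift that offset reduces to
// (1 << (bd - 1)), which is what sub_const removes before narrowing.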
static inline void convolve_2d_sr_vert_12tap_neon(
    int16_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w,
    int h, const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) {
  const int bd = 8;
  const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1));

  if (w <= 4) {
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
    load_s16_4x11(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7,
                  &s8, &s9, &s10);
    src_ptr += 11 * src_stride;

    do {
      int16x4_t s11, s12, s13, s14;
      load_s16_4x4(src_ptr, src_stride, &s11, &s12, &s13, &s14);

      int32x4_t d0 = convolve12_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9,
                                       s10, s11, y_filter_0_7, y_filter_8_11);
      int32x4_t d1 = convolve12_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
                                       s11, s12, y_filter_0_7, y_filter_8_11);
      int32x4_t d2 = convolve12_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
                                       s12, s13, y_filter_0_7, y_filter_8_11);
      int32x4_t d3 =
          convolve12_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
                            y_filter_0_7, y_filter_8_11);

      int16x8_t dd01 =
          vcombine_s16(vqrshrn_n_s32(d0, 2 * FILTER_BITS - ROUND0_BITS),
                       vqrshrn_n_s32(d1, 2 * FILTER_BITS - ROUND0_BITS));
      int16x8_t dd23 =
          vcombine_s16(vqrshrn_n_s32(d2, 2 * FILTER_BITS - ROUND0_BITS),
                       vqrshrn_n_s32(d3, 2 * FILTER_BITS - ROUND0_BITS));

      dd01 = vsubq_s16(dd01, sub_const);
      dd23 = vsubq_s16(dd23, sub_const);

      uint8x8_t d01 = vqmovun_s16(dd01);
      uint8x8_t d23 = vqmovun_s16(dd23);

      store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      s7 = s11;
      s8 = s12;
      s9 = s13;
      s10 = s14;
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      h -= 4;
    } while (h != 0);

  } else {
    do {
      int height = h;
      int16_t *s = src_ptr;
      uint8_t *d = dst_ptr;

      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      load_s16_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
                    &s9, &s10);
      s += 11 * src_stride;

      do {
        int16x8_t s11, s12, s13, s14;
        load_s16_8x4(s, src_stride, &s11, &s12, &s13, &s14);

        uint8x8_t d0 =
            convolve12_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
                              y_filter_0_7, y_filter_8_11, sub_const);
        uint8x8_t d1 =
            convolve12_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
                              y_filter_0_7, y_filter_8_11, sub_const);
        uint8x8_t d2 =
            convolve12_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
                              s13, y_filter_0_7, y_filter_8_11, sub_const);
        uint8x8_t d3 =
            convolve12_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
                              s14, y_filter_0_7, y_filter_8_11, sub_const);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s7 = s11;
        s8 = s12;
        s9 = s13;
        s10 = s14;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}

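// Vertical pass of an 8-tap 2D convolution, for four pixels. Only the second
// rounding shift remains to be applied here, since the horizontal pass
// already shifted by ROUND0_BITS.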
static inline int16x4_t convolve8_4_2d_v(const int16x4_t s0, const int16x4_t s1,
                                         const int16x4_t s2, const int16x4_t s3,
                                         const int16x4_t s4, const int16x4_t s5,
                                         const int16x4_t s6, const int16x4_t s7,
                                         const int16x8_t y_filter) {
  const int16x4_t y_filter_lo = vget_low_s16(y_filter);
  const int16x4_t y_filter_hi = vget_high_s16(y_filter);

  int32x4_t sum = vmull_lane_s16(s0, y_filter_lo, 0);
  sum = vmlal_lane_s16(sum, s1, y_filter_lo, 1);
  sum = vmlal_lane_s16(sum, s2, y_filter_lo, 2);
  sum = vmlal_lane_s16(sum, s3, y_filter_lo, 3);
  sum = vmlal_lane_s16(sum, s4, y_filter_hi, 0);
  sum = vmlal_lane_s16(sum, s5, y_filter_hi, 1);
  sum = vmlal_lane_s16(sum, s6, y_filter_hi, 2);
  sum = vmlal_lane_s16(sum, s7, y_filter_hi, 3);

  return vqrshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS);
}

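// As above, but for eight pixels, including the offset subtraction and the
// saturating narrow to 8 bits.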
static inline uint8x8_t convolve8_8_2d_v(const int16x8_t s0, const int16x8_t s1,
                                         const int16x8_t s2, const int16x8_t s3,
                                         const int16x8_t s4, const int16x8_t s5,
                                         const int16x8_t s6, const int16x8_t s7,
                                         const int16x8_t y_filter,
                                         const int16x8_t sub_const) {
  const int16x4_t y_filter_lo = vget_low_s16(y_filter);
  const int16x4_t y_filter_hi = vget_high_s16(y_filter);

  int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_lo, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_lo, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_lo, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_lo, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_hi, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_hi, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_hi, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_hi, 3);

  int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_lo, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_lo, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_lo, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_lo, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_hi, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_hi, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_hi, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_hi, 3);

  int16x8_t res =
      vcombine_s16(vqrshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS),
                   vqrshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS));
  res = vsubq_s16(res, sub_const);

  return vqmovun_s16(res);
}

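// 8-tap vertical pass. On AArch64, where more vector registers are available,
// four rows are produced per loop iteration; the Armv7 path below computes a
// single row at a time to keep register pressure down.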
static inline void convolve_2d_sr_vert_8tap_neon(int16_t *src_ptr,
                                                 int src_stride,
                                                 uint8_t *dst_ptr,
                                                 int dst_stride, int w, int h,
                                                 const int16x8_t y_filter) {
  const int bd = 8;
  const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1));

  if (w <= 4) {
    int16x4_t s0, s1, s2, s3, s4, s5, s6;
    load_s16_4x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
    src_ptr += 7 * src_stride;

    do {
#if AOM_ARCH_AARCH64
      int16x4_t s7, s8, s9, s10;
      load_s16_4x4(src_ptr, src_stride, &s7, &s8, &s9, &s10);

      int16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
      int16x4_t d1 = convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
      int16x4_t d2 = convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
      int16x4_t d3 =
          convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);

      uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), sub_const));
      uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), sub_const));

      store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      h -= 4;
#else   // !AOM_ARCH_AARCH64
      int16x4_t s7 = vld1_s16(src_ptr);
      int16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
      uint8x8_t d01 =
          vqmovun_s16(vsubq_s16(vcombine_s16(d0, vdup_n_s16(0)), sub_const));

      store_u8_4x1(dst_ptr, d01);

      s0 = s1;
      s1 = s2;
      s2 = s3;
      s3 = s4;
      s4 = s5;
      s5 = s6;
      s6 = s7;
      src_ptr += src_stride;
      dst_ptr += dst_stride;
      h--;
#endif  // AOM_ARCH_AARCH64
    } while (h != 0);
  } else {
    // Width is a multiple of 8 and height is a multiple of 4.
    do {
      int height = h;
      int16_t *s = src_ptr;
      uint8_t *d = dst_ptr;

      int16x8_t s0, s1, s2, s3, s4, s5, s6;
      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
      s += 7 * src_stride;

      do {
#if AOM_ARCH_AARCH64
        int16x8_t s7, s8, s9, s10;
        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);

        uint8x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
                                        y_filter, sub_const);
        uint8x8_t d1 = convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8,
                                        y_filter, sub_const);
        uint8x8_t d2 = convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9,
                                        y_filter, sub_const);
        uint8x8_t d3 = convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
                                        y_filter, sub_const);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
#else   // !AOM_ARCH_AARCH64
        int16x8_t s7 = vld1q_s16(s);
        uint8x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
                                        y_filter, sub_const);
        vst1_u8(d, d0);

        s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        s4 = s5;
        s5 = s6;
        s6 = s7;
        s += src_stride;
        d += dst_stride;
        height--;
#endif  // AOM_ARCH_AARCH64
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}

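// 6-tap variant: the non-zero filter taps sit in lanes 1-6 of y_filter, as in
// an 8-tap filter array whose outermost taps are zero.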
static inline int16x4_t convolve6_4_2d_v(const int16x4_t s0, const int16x4_t s1,
                                         const int16x4_t s2, const int16x4_t s3,
                                         const int16x4_t s4, const int16x4_t s5,
                                         const int16x8_t y_filter) {
  const int16x4_t y_filter_lo = vget_low_s16(y_filter);
  const int16x4_t y_filter_hi = vget_high_s16(y_filter);

  int32x4_t sum = vmull_lane_s16(s0, y_filter_lo, 1);
  sum = vmlal_lane_s16(sum, s1, y_filter_lo, 2);
  sum = vmlal_lane_s16(sum, s2, y_filter_lo, 3);
  sum = vmlal_lane_s16(sum, s3, y_filter_hi, 0);
  sum = vmlal_lane_s16(sum, s4, y_filter_hi, 1);
  sum = vmlal_lane_s16(sum, s5, y_filter_hi, 2);

  return vqrshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS);
}

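// Eight-pixel counterpart of convolve6_4_2d_v, folding in the offset
// subtraction and the saturating narrow to 8 bits.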
static inline uint8x8_t convolve6_8_2d_v(const int16x8_t s0, const int16x8_t s1,
                                         const int16x8_t s2, const int16x8_t s3,
                                         const int16x8_t s4, const int16x8_t s5,
                                         const int16x8_t y_filter,
                                         const int16x8_t sub_const) {
  const int16x4_t y_filter_lo = vget_low_s16(y_filter);
  const int16x4_t y_filter_hi = vget_high_s16(y_filter);

  int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_lo, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_lo, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_lo, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_hi, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_hi, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_hi, 2);

  int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_lo, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_lo, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_lo, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_hi, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_hi, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_hi, 2);

  int16x8_t res =
      vcombine_s16(vqrshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS),
                   vqrshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS));
  res = vsubq_s16(res, sub_const);

  return vqmovun_s16(res);
}

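// 6-tap vertical pass, structured like the 8-tap version above: four rows per
// iteration on AArch64, one row per iteration on Armv7.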
static inline void convolve_2d_sr_vert_6tap_neon(int16_t *src_ptr,
                                                 int src_stride,
                                                 uint8_t *dst_ptr,
                                                 int dst_stride, int w, int h,
                                                 const int16x8_t y_filter) {
  const int bd = 8;
  const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1));

  if (w <= 4) {
    int16x4_t s0, s1, s2, s3, s4;
    load_s16_4x5(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4);
    src_ptr += 5 * src_stride;

    do {
#if AOM_ARCH_AARCH64
      int16x4_t s5, s6, s7, s8;
      load_s16_4x4(src_ptr, src_stride, &s5, &s6, &s7, &s8);

      int16x4_t d0 = convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter);
      int16x4_t d1 = convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter);
      int16x4_t d2 = convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter);
      int16x4_t d3 = convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter);

      uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), sub_const));
      uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), sub_const));

      store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      h -= 4;
#else   // !AOM_ARCH_AARCH64
      int16x4_t s5 = vld1_s16(src_ptr);
      int16x4_t d0 = convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter);
      uint8x8_t d01 =
          vqmovun_s16(vsubq_s16(vcombine_s16(d0, vdup_n_s16(0)), sub_const));

      store_u8_4x1(dst_ptr, d01);

      s0 = s1;
      s1 = s2;
      s2 = s3;
      s3 = s4;
      s4 = s5;
      src_ptr += src_stride;
      dst_ptr += dst_stride;
      h--;
#endif  // AOM_ARCH_AARCH64
    } while (h != 0);
  } else {
    // Width is a multiple of 8 and height is a multiple of 4.
    do {
      int height = h;
      int16_t *s = src_ptr;
      uint8_t *d = dst_ptr;

      int16x8_t s0, s1, s2, s3, s4;
      load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
      s += 5 * src_stride;

      do {
#if AOM_ARCH_AARCH64
        int16x8_t s5, s6, s7, s8;
        load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);

        uint8x8_t d0 =
            convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, sub_const);
        uint8x8_t d1 =
            convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter, sub_const);
        uint8x8_t d2 =
            convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter, sub_const);
        uint8x8_t d3 =
            convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter, sub_const);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
#else   // !AOM_ARCH_AARCH64
        int16x8_t s5 = vld1q_s16(s);
        uint8x8_t d0 =
            convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, sub_const);
        vst1_u8(d, d0);

        s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        s4 = s5;
        s += src_stride;
        d += dst_stride;
        height--;
#endif  // AOM_ARCH_AARCH64
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}

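// 4-tap variant; the four taps fit in a single 64-bit filter vector.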
static inline int16x4_t convolve4_4_2d_v(const int16x4_t s0, const int16x4_t s1,
                                         const int16x4_t s2, const int16x4_t s3,
                                         const int16x4_t y_filter) {
  int32x4_t sum = vmull_lane_s16(s0, y_filter, 0);
  sum = vmlal_lane_s16(sum, s1, y_filter, 1);
  sum = vmlal_lane_s16(sum, s2, y_filter, 2);
  sum = vmlal_lane_s16(sum, s3, y_filter, 3);

  return vqrshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS);
}

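// As above, but for eight pixels, with the final offset subtraction and
// saturating narrow folded in.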
static inline uint8x8_t convolve4_8_2d_v(const int16x8_t s0, const int16x8_t s1,
                                         const int16x8_t s2, const int16x8_t s3,
                                         const int16x4_t y_filter,
                                         const int16x8_t sub_const) {
  int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter, 3);

  int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter, 3);

  int16x8_t res =
      vcombine_s16(vqrshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS),
                   vqrshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS));
  res = vsubq_s16(res, sub_const);

  return vqmovun_s16(res);
}

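// 4-tap vertical pass. The caller passes the full 8-tap filter array; the
// four non-zero taps occupy indices 2-5, hence the load from y_filter + 2.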
static inline void convolve_2d_sr_vert_4tap_neon(int16_t *src_ptr,
                                                 int src_stride,
                                                 uint8_t *dst_ptr,
                                                 int dst_stride, int w, int h,
                                                 const int16_t *y_filter) {
  const int bd = 8;
  const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1));

  const int16x4_t filter = vld1_s16(y_filter + 2);

  if (w == 4) {
    int16x4_t s0, s1, s2;
    load_s16_4x3(src_ptr, src_stride, &s0, &s1, &s2);
    src_ptr += 3 * src_stride;

    do {
      int16x4_t s3, s4, s5, s6;
      load_s16_4x4(src_ptr, src_stride, &s3, &s4, &s5, &s6);

      int16x4_t d0 = convolve4_4_2d_v(s0, s1, s2, s3, filter);
      int16x4_t d1 = convolve4_4_2d_v(s1, s2, s3, s4, filter);
      int16x4_t d2 = convolve4_4_2d_v(s2, s3, s4, s5, filter);
      int16x4_t d3 = convolve4_4_2d_v(s3, s4, s5, s6, filter);

      uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), sub_const));
      uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), sub_const));

      store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);

      s0 = s4;
      s1 = s5;
      s2 = s6;

      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    // Width is a multiple of 8 and height is a multiple of 4.
    do {
      int height = h;
      int16_t *s = src_ptr;
      uint8_t *d = dst_ptr;

      int16x8_t s0, s1, s2;
      load_s16_8x3(s, src_stride, &s0, &s1, &s2);
      s += 3 * src_stride;

      do {
        int16x8_t s3, s4, s5, s6;
        load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6);

        uint8x8_t d0 = convolve4_8_2d_v(s0, s1, s2, s3, filter, sub_const);
        uint8x8_t d1 = convolve4_8_2d_v(s1, s2, s3, s4, filter, sub_const);
        uint8x8_t d2 = convolve4_8_2d_v(s2, s3, s4, s5, filter, sub_const);
        uint8x8_t d3 = convolve4_8_2d_v(s3, s4, s5, s6, filter, sub_const);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;

        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}

#endif  // AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_