compound_convolve_neon.h
/*
 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#ifndef AOM_AV1_COMMON_ARM_COMPOUND_CONVOLVE_NEON_H_
#define AOM_AV1_COMMON_ARM_COMPOUND_CONVOLVE_NEON_H_

#include <arm_neon.h>

#include "av1/common/convolve.h"
#include "av1/common/enums.h"
#include "av1/common/filter.h"

static inline void compute_dist_wtd_avg_4x1(uint16x4_t dd0, uint16x4_t d0,
                                            const uint16_t fwd_offset,
                                            const uint16_t bck_offset,
                                            const int16x4_t round_offset,
                                            uint8x8_t *d0_u8) {
  uint32x4_t blend0 = vmull_n_u16(dd0, fwd_offset);
  blend0 = vmlal_n_u16(blend0, d0, bck_offset);

  uint16x4_t avg0 = vshrn_n_u32(blend0, DIST_PRECISION_BITS);

  int16x4_t dst0 = vsub_s16(vreinterpret_s16_u16(avg0), round_offset);

  int16x8_t dst0q = vcombine_s16(dst0, vdup_n_s16(0));

  *d0_u8 = vqrshrun_n_s16(dst0q, FILTER_BITS - ROUND0_BITS);
}

static inline void compute_basic_avg_4x1(uint16x4_t dd0, uint16x4_t d0,
                                         const int16x4_t round_offset,
                                         uint8x8_t *d0_u8) {
  uint16x4_t avg0 = vhadd_u16(dd0, d0);

  int16x4_t dst0 = vsub_s16(vreinterpret_s16_u16(avg0), round_offset);

  int16x8_t dst0q = vcombine_s16(dst0, vdup_n_s16(0));

  *d0_u8 = vqrshrun_n_s16(dst0q, FILTER_BITS - ROUND0_BITS);
}

static inline void compute_dist_wtd_avg_8x1(uint16x8_t dd0, uint16x8_t d0,
                                            const uint16_t fwd_offset,
                                            const uint16_t bck_offset,
                                            const int16x8_t round_offset,
                                            uint8x8_t *d0_u8) {
  uint32x4_t blend0_lo = vmull_n_u16(vget_low_u16(dd0), fwd_offset);
  blend0_lo = vmlal_n_u16(blend0_lo, vget_low_u16(d0), bck_offset);
  uint32x4_t blend0_hi = vmull_n_u16(vget_high_u16(dd0), fwd_offset);
  blend0_hi = vmlal_n_u16(blend0_hi, vget_high_u16(d0), bck_offset);

  uint16x8_t avg0 = vcombine_u16(vshrn_n_u32(blend0_lo, DIST_PRECISION_BITS),
                                 vshrn_n_u32(blend0_hi, DIST_PRECISION_BITS));

  int16x8_t dst0 = vsubq_s16(vreinterpretq_s16_u16(avg0), round_offset);

  *d0_u8 = vqrshrun_n_s16(dst0, FILTER_BITS - ROUND0_BITS);
}

static inline void compute_basic_avg_8x1(uint16x8_t dd0, uint16x8_t d0,
                                         const int16x8_t round_offset,
                                         uint8x8_t *d0_u8) {
  uint16x8_t avg0 = vhaddq_u16(dd0, d0);

  int16x8_t dst0 = vsubq_s16(vreinterpretq_s16_u16(avg0), round_offset);

  *d0_u8 = vqrshrun_n_s16(dst0, FILTER_BITS - ROUND0_BITS);
}

static inline void compute_dist_wtd_avg_4x4(
    uint16x4_t dd0, uint16x4_t dd1, uint16x4_t dd2, uint16x4_t dd3,
    uint16x4_t d0, uint16x4_t d1, uint16x4_t d2, uint16x4_t d3,
    const uint16_t fwd_offset, const uint16_t bck_offset,
    const int16x8_t round_offset, uint8x8_t *d01_u8, uint8x8_t *d23_u8) {
  uint32x4_t blend0 = vmull_n_u16(dd0, fwd_offset);
  blend0 = vmlal_n_u16(blend0, d0, bck_offset);
  uint32x4_t blend1 = vmull_n_u16(dd1, fwd_offset);
  blend1 = vmlal_n_u16(blend1, d1, bck_offset);
  uint32x4_t blend2 = vmull_n_u16(dd2, fwd_offset);
  blend2 = vmlal_n_u16(blend2, d2, bck_offset);
  uint32x4_t blend3 = vmull_n_u16(dd3, fwd_offset);
  blend3 = vmlal_n_u16(blend3, d3, bck_offset);

  uint16x4_t avg0 = vshrn_n_u32(blend0, DIST_PRECISION_BITS);
  uint16x4_t avg1 = vshrn_n_u32(blend1, DIST_PRECISION_BITS);
  uint16x4_t avg2 = vshrn_n_u32(blend2, DIST_PRECISION_BITS);
  uint16x4_t avg3 = vshrn_n_u32(blend3, DIST_PRECISION_BITS);

  int16x8_t dst_01 = vreinterpretq_s16_u16(vcombine_u16(avg0, avg1));
  int16x8_t dst_23 = vreinterpretq_s16_u16(vcombine_u16(avg2, avg3));

  dst_01 = vsubq_s16(dst_01, round_offset);
  dst_23 = vsubq_s16(dst_23, round_offset);

  *d01_u8 = vqrshrun_n_s16(dst_01, FILTER_BITS - ROUND0_BITS);
  *d23_u8 = vqrshrun_n_s16(dst_23, FILTER_BITS - ROUND0_BITS);
}

static inline void compute_basic_avg_4x4(uint16x4_t dd0, uint16x4_t dd1,
                                         uint16x4_t dd2, uint16x4_t dd3,
                                         uint16x4_t d0, uint16x4_t d1,
                                         uint16x4_t d2, uint16x4_t d3,
                                         const int16x8_t round_offset,
                                         uint8x8_t *d01_u8, uint8x8_t *d23_u8) {
  uint16x4_t avg0 = vhadd_u16(dd0, d0);
  uint16x4_t avg1 = vhadd_u16(dd1, d1);
  uint16x4_t avg2 = vhadd_u16(dd2, d2);
  uint16x4_t avg3 = vhadd_u16(dd3, d3);

  int16x8_t dst_01 = vreinterpretq_s16_u16(vcombine_u16(avg0, avg1));
  int16x8_t dst_23 = vreinterpretq_s16_u16(vcombine_u16(avg2, avg3));

  dst_01 = vsubq_s16(dst_01, round_offset);
  dst_23 = vsubq_s16(dst_23, round_offset);

  *d01_u8 = vqrshrun_n_s16(dst_01, FILTER_BITS - ROUND0_BITS);
  *d23_u8 = vqrshrun_n_s16(dst_23, FILTER_BITS - ROUND0_BITS);
}

static inline void compute_dist_wtd_avg_8x4(
    uint16x8_t dd0, uint16x8_t dd1, uint16x8_t dd2, uint16x8_t dd3,
    uint16x8_t d0, uint16x8_t d1, uint16x8_t d2, uint16x8_t d3,
    const uint16_t fwd_offset, const uint16_t bck_offset,
    const int16x8_t round_offset, uint8x8_t *d0_u8, uint8x8_t *d1_u8,
    uint8x8_t *d2_u8, uint8x8_t *d3_u8) {
  uint32x4_t blend0_lo = vmull_n_u16(vget_low_u16(dd0), fwd_offset);
  blend0_lo = vmlal_n_u16(blend0_lo, vget_low_u16(d0), bck_offset);
  uint32x4_t blend0_hi = vmull_n_u16(vget_high_u16(dd0), fwd_offset);
  blend0_hi = vmlal_n_u16(blend0_hi, vget_high_u16(d0), bck_offset);

  uint32x4_t blend1_lo = vmull_n_u16(vget_low_u16(dd1), fwd_offset);
  blend1_lo = vmlal_n_u16(blend1_lo, vget_low_u16(d1), bck_offset);
  uint32x4_t blend1_hi = vmull_n_u16(vget_high_u16(dd1), fwd_offset);
  blend1_hi = vmlal_n_u16(blend1_hi, vget_high_u16(d1), bck_offset);

  uint32x4_t blend2_lo = vmull_n_u16(vget_low_u16(dd2), fwd_offset);
  blend2_lo = vmlal_n_u16(blend2_lo, vget_low_u16(d2), bck_offset);
  uint32x4_t blend2_hi = vmull_n_u16(vget_high_u16(dd2), fwd_offset);
  blend2_hi = vmlal_n_u16(blend2_hi, vget_high_u16(d2), bck_offset);

  uint32x4_t blend3_lo = vmull_n_u16(vget_low_u16(dd3), fwd_offset);
  blend3_lo = vmlal_n_u16(blend3_lo, vget_low_u16(d3), bck_offset);
  uint32x4_t blend3_hi = vmull_n_u16(vget_high_u16(dd3), fwd_offset);
  blend3_hi = vmlal_n_u16(blend3_hi, vget_high_u16(d3), bck_offset);

  uint16x8_t avg0 = vcombine_u16(vshrn_n_u32(blend0_lo, DIST_PRECISION_BITS),
                                 vshrn_n_u32(blend0_hi, DIST_PRECISION_BITS));
  uint16x8_t avg1 = vcombine_u16(vshrn_n_u32(blend1_lo, DIST_PRECISION_BITS),
                                 vshrn_n_u32(blend1_hi, DIST_PRECISION_BITS));
  uint16x8_t avg2 = vcombine_u16(vshrn_n_u32(blend2_lo, DIST_PRECISION_BITS),
                                 vshrn_n_u32(blend2_hi, DIST_PRECISION_BITS));
  uint16x8_t avg3 = vcombine_u16(vshrn_n_u32(blend3_lo, DIST_PRECISION_BITS),
                                 vshrn_n_u32(blend3_hi, DIST_PRECISION_BITS));

  int16x8_t dst0 = vsubq_s16(vreinterpretq_s16_u16(avg0), round_offset);
  int16x8_t dst1 = vsubq_s16(vreinterpretq_s16_u16(avg1), round_offset);
  int16x8_t dst2 = vsubq_s16(vreinterpretq_s16_u16(avg2), round_offset);
  int16x8_t dst3 = vsubq_s16(vreinterpretq_s16_u16(avg3), round_offset);

  *d0_u8 = vqrshrun_n_s16(dst0, FILTER_BITS - ROUND0_BITS);
  *d1_u8 = vqrshrun_n_s16(dst1, FILTER_BITS - ROUND0_BITS);
  *d2_u8 = vqrshrun_n_s16(dst2, FILTER_BITS - ROUND0_BITS);
  *d3_u8 = vqrshrun_n_s16(dst3, FILTER_BITS - ROUND0_BITS);
}

static inline void compute_basic_avg_8x4(uint16x8_t dd0, uint16x8_t dd1,
                                         uint16x8_t dd2, uint16x8_t dd3,
                                         uint16x8_t d0, uint16x8_t d1,
                                         uint16x8_t d2, uint16x8_t d3,
                                         const int16x8_t round_offset,
                                         uint8x8_t *d0_u8, uint8x8_t *d1_u8,
                                         uint8x8_t *d2_u8, uint8x8_t *d3_u8) {
  uint16x8_t avg0 = vhaddq_u16(dd0, d0);
  uint16x8_t avg1 = vhaddq_u16(dd1, d1);
  uint16x8_t avg2 = vhaddq_u16(dd2, d2);
  uint16x8_t avg3 = vhaddq_u16(dd3, d3);

  int16x8_t dst0 = vsubq_s16(vreinterpretq_s16_u16(avg0), round_offset);
  int16x8_t dst1 = vsubq_s16(vreinterpretq_s16_u16(avg1), round_offset);
  int16x8_t dst2 = vsubq_s16(vreinterpretq_s16_u16(avg2), round_offset);
  int16x8_t dst3 = vsubq_s16(vreinterpretq_s16_u16(avg3), round_offset);

  *d0_u8 = vqrshrun_n_s16(dst0, FILTER_BITS - ROUND0_BITS);
  *d1_u8 = vqrshrun_n_s16(dst1, FILTER_BITS - ROUND0_BITS);
  *d2_u8 = vqrshrun_n_s16(dst2, FILTER_BITS - ROUND0_BITS);
  *d3_u8 = vqrshrun_n_s16(dst3, FILTER_BITS - ROUND0_BITS);
}

static inline uint16x4_t convolve6_4_2d_v(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x8_t y_filter, const int32x4_t offset_const) {
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);

  int32x4_t sum = offset_const;
  // Filter values at indices 0 and 7 are 0.
  sum = vmlal_lane_s16(sum, s0, y_filter_0_3, 1);
  sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 2);
  sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 3);
  sum = vmlal_lane_s16(sum, s3, y_filter_4_7, 0);
  sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 1);
  sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 2);

  return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
}

static inline uint16x8_t convolve6_8_2d_v(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t y_filter, const int32x4_t offset_const) {
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);

  int32x4_t sum0 = offset_const;
  // Filter values at indices 0 and 7 are 0.
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), y_filter_0_3, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_4_7, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 2);

  int32x4_t sum1 = offset_const;
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), y_filter_0_3, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_4_7, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 2);

  return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
                      vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
}

static inline void dist_wtd_convolve_2d_vert_6tap_dist_wtd_avg_neon(
    int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride,
    ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) {
  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
  const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);

  const uint16_t fwd_offset = conv_params->fwd_offset;
  const uint16_t bck_offset = conv_params->bck_offset;

  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
  const int dst_stride = conv_params->dst_stride;

  if (w == 4) {
    int16x4_t s0, s1, s2, s3, s4;
    load_s16_4x5(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4);
    src_ptr += 5 * src_stride;

    do {
#if AOM_ARCH_AARCH64
      int16x4_t s5, s6, s7, s8;
      load_s16_4x4(src_ptr, src_stride, &s5, &s6, &s7, &s8);

      uint16x4_t d0 =
          convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
      uint16x4_t d1 =
          convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const);
      uint16x4_t d2 =
          convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const);
      uint16x4_t d3 =
          convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const);

      uint16x4_t dd0, dd1, dd2, dd3;
      load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);

      uint8x8_t d01_u8, d23_u8;
      compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
                               bck_offset, round_offset_vec, &d01_u8, &d23_u8);

      store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8);
      store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8);
      dst8_ptr += 4 * dst8_stride;

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      h -= 4;
#else  // !AOM_ARCH_AARCH64
      int16x4_t s5 = vld1_s16(src_ptr);

      uint16x4_t d0 =
          convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);

      uint16x4_t dd0 = vld1_u16(dst_ptr);

      uint8x8_t d01_u8;
      compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset,
                               vget_low_s16(round_offset_vec), &d01_u8);

      store_u8_4x1(dst8_ptr, d01_u8);
      dst8_ptr += dst8_stride;

      s0 = s1;
      s1 = s2;
      s2 = s3;
      s3 = s4;
      s4 = s5;
      src_ptr += src_stride;
      dst_ptr += dst_stride;
      h--;
#endif  // AOM_ARCH_AARCH64
    } while (h != 0);
  } else {
    do {
      int16_t *s = src_ptr;
      CONV_BUF_TYPE *d = dst_ptr;
      uint8_t *d_u8 = dst8_ptr;
      int height = h;

      int16x8_t s0, s1, s2, s3, s4;
      load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
      s += 5 * src_stride;

      do {
#if AOM_ARCH_AARCH64
        int16x8_t s5, s6, s7, s8;
        load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);

        uint16x8_t d0 =
            convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
        uint16x8_t d1 =
            convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const);
        uint16x8_t d2 =
            convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const);
        uint16x8_t d3 =
            convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const);

        uint16x8_t dd0, dd1, dd2, dd3;
        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);

        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
        compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
                                 bck_offset, round_offset_vec, &d0_u8, &d1_u8,
                                 &d2_u8, &d3_u8);

        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
        d_u8 += 4 * dst8_stride;

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
#else  // !AOM_ARCH_AARCH64
        int16x8_t s5 = vld1q_s16(s);

        uint16x8_t d0 =
            convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);

        uint16x8_t dd0 = vld1q_u16(d);

        uint8x8_t d0_u8;
        compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset,
                                 round_offset_vec, &d0_u8);

        vst1_u8(d_u8, d0_u8);
        d_u8 += dst8_stride;

        s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        s4 = s5;
        s += src_stride;
        d += dst_stride;
        height--;
#endif  // AOM_ARCH_AARCH64
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      dst8_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}

static inline void dist_wtd_convolve_2d_vert_6tap_avg_neon(
    int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride,
    ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) {
  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
  const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);

  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
  const int dst_stride = conv_params->dst_stride;

  if (w == 4) {
    int16x4_t s0, s1, s2, s3, s4;
    load_s16_4x5(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4);
    src_ptr += 5 * src_stride;

    do {
#if AOM_ARCH_AARCH64
      int16x4_t s5, s6, s7, s8;
      load_s16_4x4(src_ptr, src_stride, &s5, &s6, &s7, &s8);

      uint16x4_t d0 =
          convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
      uint16x4_t d1 =
          convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const);
      uint16x4_t d2 =
          convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const);
      uint16x4_t d3 =
          convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const);

      uint16x4_t dd0, dd1, dd2, dd3;
      load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);

      uint8x8_t d01_u8, d23_u8;
      compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
                            round_offset_vec, &d01_u8, &d23_u8);

      store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8);
      store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8);
      dst8_ptr += 4 * dst8_stride;

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      h -= 4;
#else  // !AOM_ARCH_AARCH64
      int16x4_t s5 = vld1_s16(src_ptr);

      uint16x4_t d0 =
          convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);

      uint16x4_t dd0 = vld1_u16(dst_ptr);

      uint8x8_t d01_u8;
      compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01_u8);

      store_u8_4x1(dst8_ptr, d01_u8);
      dst8_ptr += dst8_stride;

      s0 = s1;
      s1 = s2;
      s2 = s3;
      s3 = s4;
      s4 = s5;
      src_ptr += src_stride;
      dst_ptr += dst_stride;
      h--;
#endif  // AOM_ARCH_AARCH64
    } while (h != 0);
  } else {
    do {
      int16_t *s = src_ptr;
      CONV_BUF_TYPE *d = dst_ptr;
      uint8_t *d_u8 = dst8_ptr;
      int height = h;

      int16x8_t s0, s1, s2, s3, s4;
      load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
      s += 5 * src_stride;

      do {
#if AOM_ARCH_AARCH64
        int16x8_t s5, s6, s7, s8;
        load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);

        uint16x8_t d0 =
            convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
        uint16x8_t d1 =
            convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const);
        uint16x8_t d2 =
            convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const);
        uint16x8_t d3 =
            convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const);

        uint16x8_t dd0, dd1, dd2, dd3;
        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);

        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
        compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
                              round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);

        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
        d_u8 += 4 * dst8_stride;

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
#else  // !AOM_ARCH_AARCH64
        int16x8_t s5 = vld1q_s16(s);

        uint16x8_t d0 =
            convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);

        uint16x8_t dd0 = vld1q_u16(d);

        uint8x8_t d0_u8;
        compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8);

        vst1_u8(d_u8, d0_u8);
        d_u8 += dst8_stride;

        s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        s4 = s5;
        s += src_stride;
        d += dst_stride;
        height--;
#endif  // AOM_ARCH_AARCH64
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      dst8_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}

static inline void dist_wtd_convolve_2d_vert_6tap_neon(
    int16_t *src_ptr, const int src_stride, ConvolveParams *conv_params,
    const int16x8_t y_filter, int h, int w) {
  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
  const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);

  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
  const int dst_stride = conv_params->dst_stride;

  if (w == 4) {
    int16x4_t s0, s1, s2, s3, s4;
    load_s16_4x5(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4);
    src_ptr += 5 * src_stride;

    do {
#if AOM_ARCH_AARCH64
      int16x4_t s5, s6, s7, s8;
      load_s16_4x4(src_ptr, src_stride, &s5, &s6, &s7, &s8);

      uint16x4_t d0 =
          convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
      uint16x4_t d1 =
          convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const);
      uint16x4_t d2 =
          convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const);
      uint16x4_t d3 =
          convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const);

      store_u16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      h -= 4;
#else  // !AOM_ARCH_AARCH64
      int16x4_t s5 = vld1_s16(src_ptr);

      uint16x4_t d0 =
          convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);

      vst1_u16(dst_ptr, d0);

      s0 = s1;
      s1 = s2;
      s2 = s3;
      s3 = s4;
      s4 = s5;
      src_ptr += src_stride;
      dst_ptr += dst_stride;
      h--;
#endif  // AOM_ARCH_AARCH64
    } while (h != 0);
  } else {
    do {
      int16_t *s = src_ptr;
      CONV_BUF_TYPE *d = dst_ptr;
      int height = h;

      int16x8_t s0, s1, s2, s3, s4;
      load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
      s += 5 * src_stride;

      do {
#if AOM_ARCH_AARCH64
        int16x8_t s5, s6, s7, s8;
        load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);

        uint16x8_t d0 =
            convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
        uint16x8_t d1 =
            convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const);
        uint16x8_t d2 =
            convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const);
        uint16x8_t d3 =
            convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
#else  // !AOM_ARCH_AARCH64
        int16x8_t s5 = vld1q_s16(s);

        uint16x8_t d0 =
            convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);

        vst1q_u16(d, d0);

        s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        s4 = s5;
        s += src_stride;
        d += dst_stride;
        height--;
#endif  // AOM_ARCH_AARCH64
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}

static inline uint16x4_t convolve8_4_2d_v(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter,
    const int32x4_t offset_const) {
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);

  int32x4_t sum = offset_const;
  sum = vmlal_lane_s16(sum, s0, y_filter_0_3, 0);
  sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1);
  sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2);
  sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3);
  sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0);
  sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1);
  sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2);
  sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3);

  return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
}

static inline uint16x8_t convolve8_8_2d_v(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
    const int32x4_t offset_const) {
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);

  int32x4_t sum0 = offset_const;
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), y_filter_0_3, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3);

  int32x4_t sum1 = offset_const;
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), y_filter_0_3, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3);

  return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
                      vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
}

static inline void dist_wtd_convolve_2d_vert_8tap_dist_wtd_avg_neon(
    int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride,
    ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) {
  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
  const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);

  const uint16_t fwd_offset = conv_params->fwd_offset;
  const uint16_t bck_offset = conv_params->bck_offset;

  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
  const int dst_stride = conv_params->dst_stride;

  if (w == 4) {
    int16x4_t s0, s1, s2, s3, s4, s5, s6;
    load_s16_4x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
    src_ptr += 7 * src_stride;

    do {
#if AOM_ARCH_AARCH64
      int16x4_t s7, s8, s9, s10;
      load_s16_4x4(src_ptr, src_stride, &s7, &s8, &s9, &s10);

      uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
                                       offset_const);
      uint16x4_t d1 = convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
                                       offset_const);
      uint16x4_t d2 = convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
                                       offset_const);
      uint16x4_t d3 = convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
                                       y_filter, offset_const);

      uint16x4_t dd0, dd1, dd2, dd3;
      load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);

      uint8x8_t d01_u8, d23_u8;
      compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
                               bck_offset, round_offset_vec, &d01_u8, &d23_u8);

      store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8);
      store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8);
      dst8_ptr += 4 * dst8_stride;

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      h -= 4;
#else  // !AOM_ARCH_AARCH64
      int16x4_t s7 = vld1_s16(src_ptr);

      uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
                                       offset_const);

      uint16x4_t dd0 = vld1_u16(dst_ptr);

      uint8x8_t d01_u8;
      compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset,
                               vget_low_s16(round_offset_vec), &d01_u8);

      store_u8_4x1(dst8_ptr, d01_u8);
      dst8_ptr += dst8_stride;

      s0 = s1;
      s1 = s2;
      s2 = s3;
      s3 = s4;
      s4 = s5;
      s5 = s6;
      s6 = s7;
      src_ptr += src_stride;
      dst_ptr += dst_stride;
      h--;
#endif  // AOM_ARCH_AARCH64
    } while (h != 0);
  } else {
    do {
      int16_t *s = src_ptr;
      CONV_BUF_TYPE *d = dst_ptr;
      uint8_t *d_u8 = dst8_ptr;
      int height = h;

      int16x8_t s0, s1, s2, s3, s4, s5, s6;
      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
      s += 7 * src_stride;

      do {
#if AOM_ARCH_AARCH64
        int16x8_t s7, s8, s9, s10;
        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);

        uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
                                         y_filter, offset_const);
        uint16x8_t d1 = convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8,
                                         y_filter, offset_const);
        uint16x8_t d2 = convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9,
                                         y_filter, offset_const);
        uint16x8_t d3 = convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
                                         y_filter, offset_const);

        uint16x8_t dd0, dd1, dd2, dd3;
        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);

        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
        compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
                                 bck_offset, round_offset_vec, &d0_u8, &d1_u8,
                                 &d2_u8, &d3_u8);

        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
        d_u8 += 4 * dst8_stride;

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
#else  // !AOM_ARCH_AARCH64
        int16x8_t s7 = vld1q_s16(s);

        uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
                                         y_filter, offset_const);

        uint16x8_t dd0 = vld1q_u16(d);

        uint8x8_t d0_u8;
        compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset,
                                 round_offset_vec, &d0_u8);

        vst1_u8(d_u8, d0_u8);
        d_u8 += dst8_stride;

        s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        s4 = s5;
        s5 = s6;
        s6 = s7;
        s += src_stride;
        d += dst_stride;
        height--;
#endif  // AOM_ARCH_AARCH64
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      dst8_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}

static inline void dist_wtd_convolve_2d_vert_8tap_avg_neon(
    int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride,
    ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) {
  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
  const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);

  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
  const int dst_stride = conv_params->dst_stride;

  if (w == 4) {
    int16x4_t s0, s1, s2, s3, s4, s5, s6;
    load_s16_4x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
    src_ptr += 7 * src_stride;

    do {
#if AOM_ARCH_AARCH64
      int16x4_t s7, s8, s9, s10;
      load_s16_4x4(src_ptr, src_stride, &s7, &s8, &s9, &s10);

      uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
                                       offset_const);
      uint16x4_t d1 = convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
                                       offset_const);
      uint16x4_t d2 = convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
                                       offset_const);
      uint16x4_t d3 = convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
                                       y_filter, offset_const);

      uint16x4_t dd0, dd1, dd2, dd3;
      load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);

      uint8x8_t d01_u8, d23_u8;
      compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
                            round_offset_vec, &d01_u8, &d23_u8);

      store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8);
      store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8);
      dst8_ptr += 4 * dst8_stride;

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      h -= 4;
#else  // !AOM_ARCH_AARCH64
      int16x4_t s7 = vld1_s16(src_ptr);

      uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
                                       offset_const);

      uint16x4_t dd0 = vld1_u16(dst_ptr);

      uint8x8_t d01_u8;
      compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01_u8);

      store_u8_4x1(dst8_ptr, d01_u8);
      dst8_ptr += dst8_stride;

      s0 = s1;
      s1 = s2;
      s2 = s3;
      s3 = s4;
      s4 = s5;
      s5 = s6;
      s6 = s7;
      src_ptr += src_stride;
      dst_ptr += dst_stride;
      h--;
#endif  // AOM_ARCH_AARCH64
    } while (h != 0);
  } else {
    do {
      int16_t *s = src_ptr;
      CONV_BUF_TYPE *d = dst_ptr;
      uint8_t *d_u8 = dst8_ptr;
      int height = h;

      int16x8_t s0, s1, s2, s3, s4, s5, s6;
      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
      s += 7 * src_stride;

      do {
#if AOM_ARCH_AARCH64
        int16x8_t s7, s8, s9, s10;
        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);

        uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
                                         y_filter, offset_const);
        uint16x8_t d1 = convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8,
                                         y_filter, offset_const);
        uint16x8_t d2 = convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9,
                                         y_filter, offset_const);
        uint16x8_t d3 = convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
                                         y_filter, offset_const);

        uint16x8_t dd0, dd1, dd2, dd3;
        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);

        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
        compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
                              round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);

        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
        d_u8 += 4 * dst8_stride;

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
#else  // !AOM_ARCH_AARCH64
        int16x8_t s7 = vld1q_s16(s);

        uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
                                         y_filter, offset_const);

        uint16x8_t dd0 = vld1q_u16(d);

        uint8x8_t d0_u8;
        compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8);

        vst1_u8(d_u8, d0_u8);
        d_u8 += dst8_stride;

        s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        s4 = s5;
        s5 = s6;
        s6 = s7;
        s += src_stride;
        d += dst_stride;
        height--;
#endif  // AOM_ARCH_AARCH64
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      dst8_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}

static inline void dist_wtd_convolve_2d_vert_8tap_neon(
    int16_t *src_ptr, const int src_stride, ConvolveParams *conv_params,
    const int16x8_t y_filter, int h, int w) {
  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
  const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);

  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
  const int dst_stride = conv_params->dst_stride;

  if (w == 4) {
    int16x4_t s0, s1, s2, s3, s4, s5, s6;
    load_s16_4x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
    src_ptr += 7 * src_stride;

    do {
#if AOM_ARCH_AARCH64
      int16x4_t s7, s8, s9, s10;
      load_s16_4x4(src_ptr, src_stride, &s7, &s8, &s9, &s10);

      uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
                                       offset_const);
      uint16x4_t d1 = convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
                                       offset_const);
      uint16x4_t d2 = convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
                                       offset_const);
      uint16x4_t d3 = convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
                                       y_filter, offset_const);

      store_u16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      h -= 4;
#else  // !AOM_ARCH_AARCH64
      int16x4_t s7 = vld1_s16(src_ptr);

      uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
                                       offset_const);

      vst1_u16(dst_ptr, d0);

      s0 = s1;
      s1 = s2;
      s2 = s3;
      s3 = s4;
      s4 = s5;
      s5 = s6;
      s6 = s7;
      src_ptr += src_stride;
      dst_ptr += dst_stride;
      h--;
#endif  // AOM_ARCH_AARCH64
    } while (h != 0);
  } else {
    do {
      int16_t *s = src_ptr;
      CONV_BUF_TYPE *d = dst_ptr;
      int height = h;

      int16x8_t s0, s1, s2, s3, s4, s5, s6;
      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
      s += 7 * src_stride;

      do {
#if AOM_ARCH_AARCH64
        int16x8_t s7, s8, s9, s10;
        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);

        uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
                                         y_filter, offset_const);
        uint16x8_t d1 = convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8,
                                         y_filter, offset_const);
        uint16x8_t d2 = convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9,
                                         y_filter, offset_const);
        uint16x8_t d3 = convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
                                         y_filter, offset_const);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
#else  // !AOM_ARCH_AARCH64
        int16x8_t s7 = vld1q_s16(s);

        uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
                                         y_filter, offset_const);

        vst1q_u16(d, d0);

        s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        s4 = s5;
        s5 = s6;
        s6 = s7;
        s += src_stride;
        d += dst_stride;
        height--;
#endif  // AOM_ARCH_AARCH64
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}

#endif  // AOM_AV1_COMMON_ARM_COMPOUND_CONVOLVE_NEON_H_