compound_convolve_neon.c
/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>

#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "av1/common/arm/compound_convolve_neon.h"
#include "config/aom_config.h"
#include "config/av1_rtcd.h"

static inline int16x4_t convolve4_4_2d_h(const int16x4_t s0, const int16x4_t s1,
                                         const int16x4_t s2, const int16x4_t s3,
                                         const int16x4_t x_filter,
                                         const int16x4_t horiz_const) {
  int16x4_t sum = horiz_const;
  sum = vmla_lane_s16(sum, s0, x_filter, 0);
  sum = vmla_lane_s16(sum, s1, x_filter, 1);
  sum = vmla_lane_s16(sum, s2, x_filter, 2);
  sum = vmla_lane_s16(sum, s3, x_filter, 3);

  // We halved the convolution filter values so -1 from the right shift.
  return vshr_n_s16(sum, ROUND0_BITS - 1);
}

static inline int16x8_t convolve8_8_2d_h(const int16x8_t s0, const int16x8_t s1,
                                         const int16x8_t s2, const int16x8_t s3,
                                         const int16x8_t s4, const int16x8_t s5,
                                         const int16x8_t s6, const int16x8_t s7,
                                         const int16x8_t x_filter,
                                         const int16x8_t horiz_const) {
  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);

  int16x8_t sum = horiz_const;
  sum = vmlaq_lane_s16(sum, s0, x_filter_0_3, 0);
  sum = vmlaq_lane_s16(sum, s1, x_filter_0_3, 1);
  sum = vmlaq_lane_s16(sum, s2, x_filter_0_3, 2);
  sum = vmlaq_lane_s16(sum, s3, x_filter_0_3, 3);
  sum = vmlaq_lane_s16(sum, s4, x_filter_4_7, 0);
  sum = vmlaq_lane_s16(sum, s5, x_filter_4_7, 1);
  sum = vmlaq_lane_s16(sum, s6, x_filter_4_7, 2);
  sum = vmlaq_lane_s16(sum, s7, x_filter_4_7, 3);

  // We halved the convolution filter values so -1 from the right shift.
  return vshrq_n_s16(sum, ROUND0_BITS - 1);
}

static inline void dist_wtd_convolve_2d_horiz_neon(
    const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride,
    const int16_t *x_filter_ptr, const int im_h, int w) {
  const int bd = 8;

  const uint8_t *src_ptr = src;
  int16_t *dst_ptr = im_block;
  int dst_stride = im_stride;
  int height = im_h;

  if (w == 4) {
    // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
    // shifts - which are generally faster than rounding shifts on modern CPUs.
    // (The extra -1 is needed because we halved the filter values.)
    const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)) +
                                             (1 << ((ROUND0_BITS - 1) - 1)));
    // 4-tap filters are used for blocks having width <= 4.
    // Filter values are even, so halve to reduce intermediate precision reqs.
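    // Note (assuming the usual AV1 constants bd == 8, FILTER_BITS == 7 and
    // ROUND0_BITS == 3): the shim above evaluates to (1 << 13) + (1 << 1),
    // i.e. the halved intermediate-range offset plus the rounding constant
    // for a shift by (ROUND0_BITS - 1), which is what lets convolve4_4_2d_h
    // get away with a plain truncating vshr. The 4-tap kernel lives in lanes
    // 2..5 of the 8-tap filter array (the outer taps are zero), hence the
    // x_filter_ptr + 2 load below.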
    const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);

    src_ptr += 2;

    do {
      uint8x8_t t0 = vld1_u8(src_ptr);  // a0 a1 a2 a3 a4 a5 a6 a7
      int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
      int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));

      __builtin_prefetch(dst_ptr);

      int16x4_t s1 = vext_s16(s0, s4, 1);  // a1 a2 a3 a4
      int16x4_t s2 = vext_s16(s0, s4, 2);  // a2 a3 a4 a5
      int16x4_t s3 = vext_s16(s0, s4, 3);  // a3 a4 a5 a6

      int16x4_t d0 = convolve4_4_2d_h(s0, s1, s2, s3, x_filter, horiz_const);

      vst1_s16(dst_ptr, d0);

      src_ptr += src_stride;
      dst_ptr += dst_stride;
    } while (--height != 0);
  } else {
    // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
    // shifts - which are generally faster than rounding shifts on modern CPUs.
    // (The extra -1 is needed because we halved the filter values.)
    const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) +
                                              (1 << ((ROUND0_BITS - 1) - 1)));
    // Filter values are even, so halve to reduce intermediate precision reqs.
    const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);

#if AOM_ARCH_AARCH64
    do {
      const uint8_t *s;
      int16_t *d = dst_ptr;
      int width = w;

      __builtin_prefetch(src_ptr + 0 * src_stride);
      __builtin_prefetch(src_ptr + 1 * src_stride);
      __builtin_prefetch(src_ptr + 2 * src_stride);
      __builtin_prefetch(src_ptr + 3 * src_stride);
      __builtin_prefetch(src_ptr + 4 * src_stride);
      __builtin_prefetch(src_ptr + 5 * src_stride);
      __builtin_prefetch(src_ptr + 6 * src_stride);
      __builtin_prefetch(src_ptr + 7 * src_stride);

      uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
      load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
      transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

      s = src_ptr + 7;

      __builtin_prefetch(dst_ptr + 0 * dst_stride);
      __builtin_prefetch(dst_ptr + 1 * dst_stride);
      __builtin_prefetch(dst_ptr + 2 * dst_stride);
      __builtin_prefetch(dst_ptr + 3 * dst_stride);
      __builtin_prefetch(dst_ptr + 4 * dst_stride);
      __builtin_prefetch(dst_ptr + 5 * dst_stride);
      __builtin_prefetch(dst_ptr + 6 * dst_stride);
      __builtin_prefetch(dst_ptr + 7 * dst_stride);

      do {
        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
        int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
        int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));

        int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7,
                                        x_filter, horiz_const);
        int16x8_t d1 = convolve8_8_2d_h(s1, s2, s3, s4, s5, s6, s7, s8,
                                        x_filter, horiz_const);
        int16x8_t d2 = convolve8_8_2d_h(s2, s3, s4, s5, s6, s7, s8, s9,
                                        x_filter, horiz_const);
        int16x8_t d3 = convolve8_8_2d_h(s3, s4, s5, s6, s7, s8, s9, s10,
                                        x_filter, horiz_const);
        int16x8_t d4 = convolve8_8_2d_h(s4, s5, s6, s7, s8, s9, s10, s11,
                                        x_filter, horiz_const);
        int16x8_t d5 = convolve8_8_2d_h(s5, s6, s7, s8, s9, s10, s11, s12,
                                        x_filter, horiz_const);
        int16x8_t d6 = convolve8_8_2d_h(s6, s7, s8, s9, s10, s11, s12, s13,
                                        x_filter, horiz_const);
        int16x8_t d7 = convolve8_8_2d_h(s7, s8, s9, s10, s11, s12, s13, s14,
                                        x_filter, horiz_const);

        transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
        store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);

        s0 = s8;
        s1 = s9;
        s2 = s10;
        s3 = s11;
        s4 = s12;
        s5 = s13;
        s6 = s14;
        s += 8;
        d += 8;
        width -= 8;
      } while (width > 0);
      src_ptr += 8 * src_stride;
      dst_ptr += 8 * dst_stride;
      height -= 8;
    } while (height > 8);
#endif  // AOM_ARCH_AARCH64

    do {
      const uint8_t *s;
      int16_t *d = dst_ptr;
      int width = w;

      uint8x8_t t0 = vld1_u8(src_ptr);
      int16x8_t s0 =
          vreinterpretq_s16_u16(vmovl_u8(t0));  // a0 a1 a2 a3 a4 a5 a6 a7

      s = src_ptr + 8;
      __builtin_prefetch(dst_ptr);

      do {
        t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t0));

        int16x8_t s1 = vextq_s16(s0, s8, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
        int16x8_t s2 = vextq_s16(s0, s8, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
        int16x8_t s3 = vextq_s16(s0, s8, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
        int16x8_t s4 = vextq_s16(s0, s8, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
        int16x8_t s5 = vextq_s16(s0, s8, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
        int16x8_t s6 = vextq_s16(s0, s8, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
        int16x8_t s7 = vextq_s16(s0, s8, 7);  // a7 a8 a9 a10 a11 a12 a13 a14

        int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7,
                                        x_filter, horiz_const);
        vst1q_s16(d, d0);

        s0 = s8;
        s += 8;
        d += 8;
        width -= 8;
      } while (width > 0);
      src_ptr += src_stride;
      dst_ptr += dst_stride;
    } while (--height != 0);
  }
}

void av1_dist_wtd_convolve_2d_neon(const uint8_t *src, int src_stride,
                                   uint8_t *dst8, int dst8_stride, int w, int h,
                                   const InterpFilterParams *filter_params_x,
                                   const InterpFilterParams *filter_params_y,
                                   const int subpel_x_qn, const int subpel_y_qn,
                                   ConvolveParams *conv_params) {
  assert(w % 4 == 0);
  assert(h % 4 == 0);

  DECLARE_ALIGNED(16, int16_t,
                  im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);

  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
  const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
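  // Note: only 6-tap and 8-tap vertical kernels are implemented below.
  // Shorter filters are stored zero-padded in the 8-tap kernel array, so
  // clamping the tap count up to 6 just routes them through the 6-tap path.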

  const int im_h = h + clamped_y_taps - 1;
  const int im_stride = MAX_SB_SIZE;
  const int vert_offset = clamped_y_taps / 2 - 1;
  const int horiz_offset = filter_params_x->taps / 2 - 1;
  const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);

  dist_wtd_convolve_2d_horiz_neon(src_ptr, src_stride, im_block, im_stride,
                                  x_filter_ptr, im_h, w);

  if (clamped_y_taps == 6) {
    if (conv_params->do_average) {
      if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
        dist_wtd_convolve_2d_vert_6tap_dist_wtd_avg_neon(
            im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h,
            w);
      } else {
        dist_wtd_convolve_2d_vert_6tap_avg_neon(im_block, im_stride, dst8,
                                                dst8_stride, conv_params,
                                                y_filter, h, w);
      }
    } else {
      dist_wtd_convolve_2d_vert_6tap_neon(im_block, im_stride, conv_params,
                                          y_filter, h, w);
    }
  } else {
    if (conv_params->do_average) {
      if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
        dist_wtd_convolve_2d_vert_8tap_dist_wtd_avg_neon(
            im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h,
            w);
      } else {
        dist_wtd_convolve_2d_vert_8tap_avg_neon(im_block, im_stride, dst8,
                                                dst8_stride, conv_params,
                                                y_filter, h, w);
      }
    } else {
      dist_wtd_convolve_2d_vert_8tap_neon(im_block, im_stride, conv_params,
                                          y_filter, h, w);
    }
  }
}

static inline void dist_wtd_convolve_2d_copy_dist_wtd_avg_neon(
    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
    int h, ConvolveParams *conv_params) {
  assert(w % 4 == 0);
  assert(h % 4 == 0);

  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
  const uint16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
                                (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
  const uint16x8_t round_offset_vec = vdupq_n_u16(round_offset);
  const uint8x8_t shift_by_bits = vdup_n_u8(1 << (FILTER_BITS - ROUND0_BITS));

  const uint16_t fwd_offset = conv_params->fwd_offset;
  const uint16_t bck_offset = conv_params->bck_offset;

  CONV_BUF_TYPE *dst = conv_params->dst;
  const int dst_stride = conv_params->dst_stride;
  int height = h;

  if (w == 4) {
    do {
      uint8x8_t s0, s1, s2, s3;
      load_u8_8x4(src, src_stride, &s0, &s1, &s2, &s3);

      uint16x4_t d0 =
          vget_low_u16(vmlal_u8(round_offset_vec, s0, shift_by_bits));
      uint16x4_t d1 =
          vget_low_u16(vmlal_u8(round_offset_vec, s1, shift_by_bits));
      uint16x4_t d2 =
          vget_low_u16(vmlal_u8(round_offset_vec, s2, shift_by_bits));
      uint16x4_t d3 =
          vget_low_u16(vmlal_u8(round_offset_vec, s3, shift_by_bits));

      uint16x4_t dd0, dd1, dd2, dd3;
      load_u16_4x4(dst, dst_stride, &dd0, &dd1, &dd2, &dd3);

      uint8x8_t d01, d23;
      compute_dist_wtd_avg_4x4(
          dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset,
          vreinterpretq_s16_u16(round_offset_vec), &d01, &d23);

      store_u8x4_strided_x2(dst8 + 0 * dst8_stride, dst8_stride, d01);
      store_u8x4_strided_x2(dst8 + 2 * dst8_stride, dst8_stride, d23);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      dst8 += 4 * dst8_stride;
      height -= 4;
    } while (height != 0);
  } else {
    do {
      const uint8_t *s = src;
      CONV_BUF_TYPE *d = dst;
      uint8_t *d_u8 = dst8;
      int width = w;

      do {
        uint8x8_t s0, s1, s2, s3;
        load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3);

        uint16x8_t d0 = vmlal_u8(round_offset_vec, s0, shift_by_bits);
        uint16x8_t d1 = vmlal_u8(round_offset_vec, s1, shift_by_bits);
        uint16x8_t d2 = vmlal_u8(round_offset_vec, s2, shift_by_bits);
        uint16x8_t d3 = vmlal_u8(round_offset_vec, s3, shift_by_bits);

        uint16x8_t dd0, dd1, dd2, dd3;
        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);

        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
        compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
                                 bck_offset,
                                 vreinterpretq_s16_u16(round_offset_vec),
                                 &d0_u8, &d1_u8, &d2_u8, &d3_u8);

        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);

        s += 8;
        d += 8;
        d_u8 += 8;
        width -= 8;
      } while (width != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      dst8 += 4 * dst8_stride;
      height -= 4;
    } while (height != 0);
  }
}

static inline void dist_wtd_convolve_2d_copy_avg_neon(
    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
    int h, ConvolveParams *conv_params) {
  assert(w % 4 == 0);
  assert(h % 4 == 0);

  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
  const uint16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
                                (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
  const uint16x8_t round_offset_vec = vdupq_n_u16(round_offset);
  const uint8x8_t shift_by_bits = vdup_n_u8(1 << (FILTER_BITS - ROUND0_BITS));

  CONV_BUF_TYPE *dst = conv_params->dst;
  const int dst_stride = conv_params->dst_stride;
  int height = h;

  if (w == 4) {
    do {
      uint8x8_t s0, s1, s2, s3;
      load_u8_8x4(src, src_stride, &s0, &s1, &s2, &s3);

      uint16x4_t d0 =
          vget_low_u16(vmlal_u8(round_offset_vec, s0, shift_by_bits));
      uint16x4_t d1 =
          vget_low_u16(vmlal_u8(round_offset_vec, s1, shift_by_bits));
      uint16x4_t d2 =
          vget_low_u16(vmlal_u8(round_offset_vec, s2, shift_by_bits));
      uint16x4_t d3 =
          vget_low_u16(vmlal_u8(round_offset_vec, s3, shift_by_bits));

      uint16x4_t dd0, dd1, dd2, dd3;
      load_u16_4x4(dst, dst_stride, &dd0, &dd1, &dd2, &dd3);

      uint8x8_t d01, d23;
      compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
                            vreinterpretq_s16_u16(round_offset_vec), &d01,
                            &d23);

      store_u8x4_strided_x2(dst8 + 0 * dst8_stride, dst8_stride, d01);
      store_u8x4_strided_x2(dst8 + 2 * dst8_stride, dst8_stride, d23);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      dst8 += 4 * dst8_stride;
      height -= 4;
    } while (height != 0);
  } else {
    do {
      const uint8_t *s = src;
      CONV_BUF_TYPE *d = dst;
      uint8_t *d_u8 = dst8;
      int width = w;

      do {
        uint8x8_t s0, s1, s2, s3;
        load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3);

        uint16x8_t d0 = vmlal_u8(round_offset_vec, s0, shift_by_bits);
        uint16x8_t d1 = vmlal_u8(round_offset_vec, s1, shift_by_bits);
        uint16x8_t d2 = vmlal_u8(round_offset_vec, s2, shift_by_bits);
        uint16x8_t d3 = vmlal_u8(round_offset_vec, s3, shift_by_bits);

        uint16x8_t dd0, dd1, dd2, dd3;
        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);

        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
        compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
                              vreinterpretq_s16_u16(round_offset_vec), &d0_u8,
                              &d1_u8, &d2_u8, &d3_u8);

        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);

        s += 8;
        d += 8;
        d_u8 += 8;
        width -= 8;
      } while (width != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      dst8 += 4 * dst8_stride;
      height -= 4;
    } while (height != 0);
  }
}

static inline void dist_wtd_convolve_2d_copy_neon(const uint8_t *src,
                                                  int src_stride, int w, int h,
                                                  ConvolveParams *conv_params) {
  assert(w % 4 == 0);
  assert(h % 4 == 0);

  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
  const uint16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
                                (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
  const uint16x8_t round_offset_vec = vdupq_n_u16(round_offset);
  const uint8x8_t shift_by_bits = vdup_n_u8(1 << (FILTER_BITS - ROUND0_BITS));

  CONV_BUF_TYPE *dst = conv_params->dst;
  const int dst_stride = conv_params->dst_stride;
  int height = h;

  if (w == 4) {
    do {
      uint8x8_t s0, s1, s2, s3;
      load_u8_8x4(src, src_stride, &s0, &s1, &s2, &s3);

      uint16x4_t d0 =
          vget_low_u16(vmlal_u8(round_offset_vec, s0, shift_by_bits));
      uint16x4_t d1 =
          vget_low_u16(vmlal_u8(round_offset_vec, s1, shift_by_bits));
      uint16x4_t d2 =
          vget_low_u16(vmlal_u8(round_offset_vec, s2, shift_by_bits));
      uint16x4_t d3 =
          vget_low_u16(vmlal_u8(round_offset_vec, s3, shift_by_bits));

      store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
  } else {
    do {
      const uint8_t *s = src;
      CONV_BUF_TYPE *d = dst;
      int width = w;

      do {
        uint8x8_t s0, s1, s2, s3;
        load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3);

        uint16x8_t d0 = vmlal_u8(round_offset_vec, s0, shift_by_bits);
        uint16x8_t d1 = vmlal_u8(round_offset_vec, s1, shift_by_bits);
        uint16x8_t d2 = vmlal_u8(round_offset_vec, s2, shift_by_bits);
        uint16x8_t d3 = vmlal_u8(round_offset_vec, s3, shift_by_bits);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
  }
}

void av1_dist_wtd_convolve_2d_copy_neon(const uint8_t *src, int src_stride,
                                        uint8_t *dst8, int dst8_stride, int w,
                                        int h, ConvolveParams *conv_params) {
  if (conv_params->do_average) {
    if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
      dist_wtd_convolve_2d_copy_dist_wtd_avg_neon(
          src, src_stride, dst8, dst8_stride, w, h, conv_params);
    } else {
      dist_wtd_convolve_2d_copy_avg_neon(src, src_stride, dst8, dst8_stride, w,
                                         h, conv_params);
    }
  } else {
    dist_wtd_convolve_2d_copy_neon(src, src_stride, w, h, conv_params);
  }
}

static inline uint16x4_t convolve4_4_x(const int16x4_t s0, const int16x4_t s1,
                                       const int16x4_t s2, const int16x4_t s3,
                                       const int16x4_t x_filter,
                                       const int16x4_t round_offset) {
  int16x4_t sum = vmul_lane_s16(s0, x_filter, 0);
  sum = vmla_lane_s16(sum, s1, x_filter, 1);
  sum = vmla_lane_s16(sum, s2, x_filter, 2);
  sum = vmla_lane_s16(sum, s3, x_filter, 3);

  // We halved the convolution filter values so -1 from the right shift.
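  // vrsra_n_s16 is a rounding shift right (here by ROUND0_BITS - 1) that
  // accumulates into its first operand, so the shift, its rounding and the
  // compound-buffer offset are all applied by the single instruction below.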
  int16x4_t res = vrsra_n_s16(round_offset, sum, ROUND0_BITS - 1);
  return vreinterpret_u16_s16(res);
}

static inline uint16x8_t convolve8_8_x(const int16x8_t s0, const int16x8_t s1,
                                       const int16x8_t s2, const int16x8_t s3,
                                       const int16x8_t s4, const int16x8_t s5,
                                       const int16x8_t s6, const int16x8_t s7,
                                       const int16x8_t x_filter,
                                       const int16x8_t round_offset) {
  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);

  int16x8_t sum = vmulq_lane_s16(s0, x_filter_0_3, 0);
  sum = vmlaq_lane_s16(sum, s1, x_filter_0_3, 1);
  sum = vmlaq_lane_s16(sum, s2, x_filter_0_3, 2);
  sum = vmlaq_lane_s16(sum, s3, x_filter_0_3, 3);
  sum = vmlaq_lane_s16(sum, s4, x_filter_4_7, 0);
  sum = vmlaq_lane_s16(sum, s5, x_filter_4_7, 1);
  sum = vmlaq_lane_s16(sum, s6, x_filter_4_7, 2);
  sum = vmlaq_lane_s16(sum, s7, x_filter_4_7, 3);

  // We halved the convolution filter values so -1 from the right shift.
  int16x8_t res = vrsraq_n_s16(round_offset, sum, ROUND0_BITS - 1);
  return vreinterpretq_u16_s16(res);
}

static inline void dist_wtd_convolve_x_dist_wtd_avg_neon(
    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
    ConvolveParams *conv_params) {
  assert(w % 4 == 0);
  assert(h % 4 == 0);

  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);

  const uint16_t fwd_offset = conv_params->fwd_offset;
  const uint16_t bck_offset = conv_params->bck_offset;

  // Horizontal filter.
  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  const int horiz_offset = filter_params_x->taps / 2 - 1;
  const uint8_t *src_ptr = src - horiz_offset;
  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
  uint8_t *dst8_ptr = dst8;
  int dst_stride = conv_params->dst_stride;
  int height = h;

  if (w == 4) {
    // 4-tap filters are used for blocks having width <= 4.
    // Filter values are even, so halve to reduce intermediate precision reqs.
    const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);

    src_ptr += 2;

    do {
      uint8x8_t t0 = vld1_u8(src_ptr);  // a0 a1 a2 a3 a4 a5 a6 a7
      int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
      int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));

      __builtin_prefetch(dst_ptr);
      __builtin_prefetch(dst8_ptr);

      int16x4_t s1 = vext_s16(s0, s4, 1);  // a1 a2 a3 a4
      int16x4_t s2 = vext_s16(s0, s4, 2);  // a2 a3 a4 a5
      int16x4_t s3 = vext_s16(s0, s4, 3);  // a3 a4 a5 a6

      uint16x4_t d0 = convolve4_4_x(s0, s1, s2, s3, x_filter,
                                    vget_low_s16(round_offset_vec));

      uint16x4_t dd0 = vld1_u16(dst_ptr);

      uint8x8_t d01;
      compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset,
                               vget_low_s16(round_offset_vec), &d01);

      store_u8_4x1(dst8_ptr, d01);

      src_ptr += src_stride;
      dst_ptr += dst_stride;
      dst8_ptr += dst8_stride;
    } while (--height != 0);
  } else {
    // Filter values are even, so halve to reduce intermediate precision reqs.
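    // Implementation note: on AArch64 the loop below works on 8x8 tiles that
    // are transposed on load, so each vector holds a column of inputs and the
    // horizontal filter becomes the same multiply-accumulate pattern as a
    // vertical pass; the results are transposed back before averaging.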
    const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);

#if AOM_ARCH_AARCH64
    while (height >= 8) {
      const uint8_t *s = src_ptr;
      CONV_BUF_TYPE *d = dst_ptr;
      uint8_t *d_u8 = dst8_ptr;
      int width = w;

      uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
      load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
      transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

      __builtin_prefetch(d + 0 * dst_stride);
      __builtin_prefetch(d + 1 * dst_stride);
      __builtin_prefetch(d + 2 * dst_stride);
      __builtin_prefetch(d + 3 * dst_stride);
      __builtin_prefetch(d + 4 * dst_stride);
      __builtin_prefetch(d + 5 * dst_stride);
      __builtin_prefetch(d + 6 * dst_stride);
      __builtin_prefetch(d + 7 * dst_stride);

      s += 7;

      do {
        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
        int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
        int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));

        uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
                                      round_offset_vec);
        uint16x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
                                      round_offset_vec);
        uint16x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
                                      round_offset_vec);
        uint16x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
                                      round_offset_vec);
        uint16x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11,
                                      x_filter, round_offset_vec);
        uint16x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12,
                                      x_filter, round_offset_vec);
        uint16x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13,
                                      x_filter, round_offset_vec);
        uint16x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14,
                                      x_filter, round_offset_vec);

        transpose_elems_inplace_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);

        uint16x8_t dd0, dd1, dd2, dd3;
        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);

        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
        compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
                                 bck_offset, round_offset_vec, &d0_u8, &d1_u8,
                                 &d2_u8, &d3_u8);

        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);

        uint16x8_t dd4, dd5, dd6, dd7;
        load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);

        uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
        compute_dist_wtd_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, fwd_offset,
                                 bck_offset, round_offset_vec, &d4_u8, &d5_u8,
                                 &d6_u8, &d7_u8);

        store_u8_8x4(d_u8 + 4 * dst8_stride, dst8_stride, d4_u8, d5_u8, d6_u8,
                     d7_u8);

        s0 = s8;
        s1 = s9;
        s2 = s10;
        s3 = s11;
        s4 = s12;
        s5 = s13;
        s6 = s14;
        s += 8;
        d += 8;
        d_u8 += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 8 * src_stride;
      dst_ptr += 8 * dst_stride;
      dst8_ptr += 8 * dst8_stride;
      height -= 8;
    }
#endif  // AOM_ARCH_AARCH64

    while (height > 0) {
      const uint8_t *s = src_ptr;
      CONV_BUF_TYPE *d = dst_ptr;
      uint8_t *d_u8 = dst8_ptr;
      int width = w;

      uint8x8_t t0 = vld1_u8(s);  // a0 a1 a2 a3 a4 a5 a6 a7
      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));

      __builtin_prefetch(d);

      s += 8;

      do {
        t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t0));

        int16x8_t s1 = vextq_s16(s0, s8, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
        int16x8_t s2 = vextq_s16(s0, s8, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
        int16x8_t s3 = vextq_s16(s0, s8, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
        int16x8_t s4 = vextq_s16(s0, s8, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
        int16x8_t s5 = vextq_s16(s0, s8, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
        int16x8_t s6 = vextq_s16(s0, s8, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
        int16x8_t s7 = vextq_s16(s0, s8, 7);  // a7 a8 a9 a10 a11 a12 a13 a14

        uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
                                      round_offset_vec);

        uint16x8_t dd0 = vld1q_u16(d);

        uint8x8_t d0_u8;
        compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset,
                                 round_offset_vec, &d0_u8);

        vst1_u8(d_u8, d0_u8);

        s0 = s8;
        s += 8;
        d += 8;
        d_u8 += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += src_stride;
      dst_ptr += dst_stride;
      dst8_ptr += dst8_stride;
      height--;
    }
  }
}

static inline void dist_wtd_convolve_x_avg_neon(
    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
    ConvolveParams *conv_params) {
  assert(w % 4 == 0);
  assert(h % 4 == 0);

  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);

  // Horizontal filter.
  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  const int horiz_offset = filter_params_x->taps / 2 - 1;
  const uint8_t *src_ptr = src - horiz_offset;
  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
  uint8_t *dst8_ptr = dst8;
  int dst_stride = conv_params->dst_stride;
  int height = h;

  if (w == 4) {
    // 4-tap filters are used for blocks having width <= 4.
    // Filter values are even, so halve to reduce intermediate precision reqs.
    const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);

    src_ptr += 2;

    do {
      uint8x8_t t0 = vld1_u8(src_ptr);  // a0 a1 a2 a3 a4 a5 a6 a7
      int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
      int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));

      __builtin_prefetch(dst_ptr);
      __builtin_prefetch(dst8_ptr);

      int16x4_t s1 = vext_s16(s0, s4, 1);  // a1 a2 a3 a4
      int16x4_t s2 = vext_s16(s0, s4, 2);  // a2 a3 a4 a5
      int16x4_t s3 = vext_s16(s0, s4, 3);  // a3 a4 a5 a6

      uint16x4_t d0 = convolve4_4_x(s0, s1, s2, s3, x_filter,
                                    vget_low_s16(round_offset_vec));

      uint16x4_t dd0 = vld1_u16(dst_ptr);

      uint8x8_t d01;
      compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01);

      store_u8_4x1(dst8_ptr, d01);

      src_ptr += src_stride;
      dst_ptr += dst_stride;
      dst8_ptr += dst8_stride;
    } while (--height != 0);
  } else {
    // Filter values are even, so halve to reduce intermediate precision reqs.
    const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);

#if AOM_ARCH_AARCH64
    while (height >= 8) {
      const uint8_t *s = src_ptr;
      CONV_BUF_TYPE *d = dst_ptr;
      uint8_t *d_u8 = dst8_ptr;
      int width = w;

      uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
      load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
      transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

      __builtin_prefetch(d + 0 * dst_stride);
      __builtin_prefetch(d + 1 * dst_stride);
      __builtin_prefetch(d + 2 * dst_stride);
      __builtin_prefetch(d + 3 * dst_stride);
      __builtin_prefetch(d + 4 * dst_stride);
      __builtin_prefetch(d + 5 * dst_stride);
      __builtin_prefetch(d + 6 * dst_stride);
      __builtin_prefetch(d + 7 * dst_stride);

      s += 7;

      do {
        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
        int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
        int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));

        uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
                                      round_offset_vec);
        uint16x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
                                      round_offset_vec);
        uint16x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
                                      round_offset_vec);
        uint16x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
                                      round_offset_vec);
        uint16x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11,
                                      x_filter, round_offset_vec);
        uint16x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12,
                                      x_filter, round_offset_vec);
        uint16x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13,
                                      x_filter, round_offset_vec);
        uint16x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14,
                                      x_filter, round_offset_vec);

        transpose_elems_inplace_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);

        uint16x8_t dd0, dd1, dd2, dd3;
        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);

        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
        compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
                              round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);

        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);

        uint16x8_t dd4, dd5, dd6, dd7;
        load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);

        uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
        compute_basic_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7,
                              round_offset_vec, &d4_u8, &d5_u8, &d6_u8, &d7_u8);

        store_u8_8x4(d_u8 + 4 * dst8_stride, dst8_stride, d4_u8, d5_u8, d6_u8,
                     d7_u8);

        s0 = s8;
        s1 = s9;
        s2 = s10;
        s3 = s11;
        s4 = s12;
        s5 = s13;
        s6 = s14;
        s += 8;
        d += 8;
        d_u8 += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 8 * src_stride;
      dst_ptr += 8 * dst_stride;
      dst8_ptr += 8 * dst8_stride;
      height -= 8;
    }
#endif  // AOM_ARCH_AARCH64

    while (height > 0) {
      const uint8_t *s = src_ptr;
      CONV_BUF_TYPE *d = dst_ptr;
      uint8_t *d_u8 = dst8_ptr;
      int width = w;

      uint8x8_t t0 = vld1_u8(s);  // a0 a1 a2 a3 a4 a5 a6 a7
      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));

      __builtin_prefetch(d);

      s += 8;

      do {
        t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t0));

        int16x8_t s1 = vextq_s16(s0, s8, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
        int16x8_t s2 = vextq_s16(s0, s8, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
        int16x8_t s3 = vextq_s16(s0, s8, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
        int16x8_t s4 = vextq_s16(s0, s8, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
        int16x8_t s5 = vextq_s16(s0, s8, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
        int16x8_t s6 = vextq_s16(s0, s8, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
        int16x8_t s7 = vextq_s16(s0, s8, 7);  // a7 a8 a9 a10 a11 a12 a13 a14

        uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
                                      round_offset_vec);

        uint16x8_t dd0 = vld1q_u16(d);

        uint8x8_t d0_u8;
        compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8);

        vst1_u8(d_u8, d0_u8);

        s0 = s8;
        s += 8;
        d += 8;
        d_u8 += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += src_stride;
      dst_ptr += dst_stride;
      dst8_ptr += dst8_stride;
      height--;
    }
  }
}

static inline void dist_wtd_convolve_x_neon(
    const uint8_t *src, int src_stride, int w, int h,
    const InterpFilterParams *filter_params_x, const int subpel_x_qn,
    ConvolveParams *conv_params) {
  assert(w % 4 == 0);
  assert(h % 4 == 0);

  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
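  // This variant performs no averaging: it only writes the 16-bit
  // intermediate into the compound buffer (conv_params->dst); the blend with
  // the second prediction happens later, in the companion call that runs
  // with do_average set.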

  // Horizontal filter.
  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  const int horiz_offset = filter_params_x->taps / 2 - 1;
  const uint8_t *src_ptr = src - horiz_offset;
  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
  int dst_stride = conv_params->dst_stride;
  int height = h;

  if (w == 4) {
    // 4-tap filters are used for blocks having width <= 4.
    // Filter values are even, so halve to reduce intermediate precision reqs.
    const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);

    src_ptr += 2;

    do {
      uint8x8_t t0 = vld1_u8(src_ptr);  // a0 a1 a2 a3 a4 a5 a6 a7
      int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
      int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));

      __builtin_prefetch(dst_ptr);

      int16x4_t s1 = vext_s16(s0, s4, 1);  // a1 a2 a3 a4
      int16x4_t s2 = vext_s16(s0, s4, 2);  // a2 a3 a4 a5
      int16x4_t s3 = vext_s16(s0, s4, 3);  // a3 a4 a5 a6

      uint16x4_t d0 = convolve4_4_x(s0, s1, s2, s3, x_filter,
                                    vget_low_s16(round_offset_vec));

      vst1_u16(dst_ptr, d0);

      src_ptr += src_stride;
      dst_ptr += dst_stride;
    } while (--height != 0);
  } else {
    // Filter values are even, so halve to reduce intermediate precision reqs.
    const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);

#if AOM_ARCH_AARCH64
    while (height >= 8) {
      const uint8_t *s = src_ptr;
      CONV_BUF_TYPE *d = dst_ptr;
      int width = w;

      uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
      load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
      transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

      __builtin_prefetch(d + 0 * dst_stride);
      __builtin_prefetch(d + 1 * dst_stride);
      __builtin_prefetch(d + 2 * dst_stride);
      __builtin_prefetch(d + 3 * dst_stride);
      __builtin_prefetch(d + 4 * dst_stride);
      __builtin_prefetch(d + 5 * dst_stride);
      __builtin_prefetch(d + 6 * dst_stride);
      __builtin_prefetch(d + 7 * dst_stride);

      s += 7;

      do {
        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
        int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
        int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));

        uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
                                      round_offset_vec);
        uint16x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
                                      round_offset_vec);
        uint16x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
                                      round_offset_vec);
        uint16x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
                                      round_offset_vec);
        uint16x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11,
                                      x_filter, round_offset_vec);
        uint16x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12,
                                      x_filter, round_offset_vec);
        uint16x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13,
                                      x_filter, round_offset_vec);
        uint16x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14,
                                      x_filter, round_offset_vec);

        transpose_elems_inplace_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);

        store_u16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);

        s0 = s8;
        s1 = s9;
        s2 = s10;
        s3 = s11;
        s4 = s12;
        s5 = s13;
        s6 = s14;
        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 8 * src_stride;
      dst_ptr += 8 * dst_stride;
      height -= 8;
    }
#endif  // AOM_ARCH_AARCH64

    while (height > 0) {
      const uint8_t *s = src_ptr;
      CONV_BUF_TYPE *d = dst_ptr;
      int width = w;

      uint8x8_t t0 = vld1_u8(s);  // a0 a1 a2 a3 a4 a5 a6 a7
      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));

      __builtin_prefetch(d);

      s = src_ptr + 8;

      do {
        t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t0));

        int16x8_t s1 = vextq_s16(s0, s8, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
        int16x8_t s2 = vextq_s16(s0, s8, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
        int16x8_t s3 = vextq_s16(s0, s8, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
        int16x8_t s4 = vextq_s16(s0, s8, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
        int16x8_t s5 = vextq_s16(s0, s8, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
        int16x8_t s6 = vextq_s16(s0, s8, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
        int16x8_t s7 = vextq_s16(s0, s8, 7);  // a7 a8 a9 a10 a11 a12 a13 a14

        uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
                                      round_offset_vec);

        vst1q_u16(d, d0);

        s0 = s8;
        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += src_stride;
      dst_ptr += dst_stride;
      height--;
    }
  }
}

void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride,
                                  uint8_t *dst8, int dst8_stride, int w, int h,
                                  const InterpFilterParams *filter_params_x,
                                  const int subpel_x_qn,
                                  ConvolveParams *conv_params) {
  if (conv_params->do_average) {
    if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
      dist_wtd_convolve_x_dist_wtd_avg_neon(src, src_stride, dst8, dst8_stride,
                                            w, h, filter_params_x, subpel_x_qn,
                                            conv_params);
    } else {
      dist_wtd_convolve_x_avg_neon(src, src_stride, dst8, dst8_stride, w, h,
                                   filter_params_x, subpel_x_qn, conv_params);
    }
  } else {
    dist_wtd_convolve_x_neon(src, src_stride, w, h, filter_params_x,
                             subpel_x_qn, conv_params);
  }
}

static inline uint16x4_t convolve6_4_y(const int16x4_t s0, const int16x4_t s1,
                                       const int16x4_t s2, const int16x4_t s3,
                                       const int16x4_t s4, const int16x4_t s5,
                                       const int16x8_t y_filter,
                                       const int16x4_t round_offset) {
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);

  // Filter values at indices 0 and 7 are 0.
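  // (The 6-tap kernel is stored zero-padded in the 8-tap filter array, so
  // only lanes 1..6 of y_filter carry taps and six multiplies suffice.)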
  int16x4_t sum = vmul_lane_s16(s0, y_filter_0_3, 1);
  sum = vmla_lane_s16(sum, s1, y_filter_0_3, 2);
  sum = vmla_lane_s16(sum, s2, y_filter_0_3, 3);
  sum = vmla_lane_s16(sum, s3, y_filter_4_7, 0);
  sum = vmla_lane_s16(sum, s4, y_filter_4_7, 1);
  sum = vmla_lane_s16(sum, s5, y_filter_4_7, 2);

  // We halved the convolution filter values so -1 from the right shift.
  int16x4_t res = vrsra_n_s16(round_offset, sum, ROUND0_BITS - 1);
  return vreinterpret_u16_s16(res);
}

static inline uint16x8_t convolve6_8_y(const int16x8_t s0, const int16x8_t s1,
                                       const int16x8_t s2, const int16x8_t s3,
                                       const int16x8_t s4, const int16x8_t s5,
                                       const int16x8_t y_filter,
                                       const int16x8_t round_offset) {
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);

  // Filter values at indices 0 and 7 are 0.
  int16x8_t sum = vmulq_lane_s16(s0, y_filter_0_3, 1);
  sum = vmlaq_lane_s16(sum, s1, y_filter_0_3, 2);
  sum = vmlaq_lane_s16(sum, s2, y_filter_0_3, 3);
  sum = vmlaq_lane_s16(sum, s3, y_filter_4_7, 0);
  sum = vmlaq_lane_s16(sum, s4, y_filter_4_7, 1);
  sum = vmlaq_lane_s16(sum, s5, y_filter_4_7, 2);

  // We halved the convolution filter values so -1 from the right shift.
  int16x8_t res = vrsraq_n_s16(round_offset, sum, ROUND0_BITS - 1);
  return vreinterpretq_u16_s16(res);
}

static inline void dist_wtd_convolve_y_6tap_dist_wtd_avg_neon(
    const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr,
    const int dst8_stride, int w, int h, const int16x8_t y_filter,
    ConvolveParams *conv_params) {
  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);

  const uint16_t fwd_offset = conv_params->fwd_offset;
  const uint16_t bck_offset = conv_params->bck_offset;

  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
  const int dst_stride = conv_params->dst_stride;
  int width = w;

  if (w == 4 || h == 4) {
    do {
      const uint8_t *s = src_ptr;
      CONV_BUF_TYPE *d = dst_ptr;
      uint8_t *d_u8 = dst8_ptr;
      int height = h;

      uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
      uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
      uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
      uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
      uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);

      int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
      int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
      int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
      int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
      int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));

      s += 5 * src_stride;

      do {
#if AOM_ARCH_AARCH64
        t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
        t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
        t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
        t3 = load_unaligned_u8_4x1(s + 3 * src_stride);

        int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
        int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
        int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
        int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));

        uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
                                      vget_low_s16(round_offset_vec));
        uint16x4_t d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter,
                                      vget_low_s16(round_offset_vec));
        uint16x4_t d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter,
                                      vget_low_s16(round_offset_vec));
        uint16x4_t d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter,
                                      vget_low_s16(round_offset_vec));

        uint16x4_t dd0, dd1, dd2, dd3;
        load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);

        uint8x8_t d01, d23;
        compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
                                 bck_offset, round_offset_vec, &d01, &d23);

        store_u8x4_strided_x2(d_u8 + 0 * dst8_stride, dst8_stride, d01);
        store_u8x4_strided_x2(d_u8 + 2 * dst8_stride, dst8_stride, d23);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        d_u8 += 4 * dst8_stride;
        height -= 4;
#else   // !AOM_ARCH_AARCH64
        t0 = load_unaligned_u8_4x1(s);
        int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));

        uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
                                      vget_low_s16(round_offset_vec));

        uint16x4_t dd0 = vld1_u16(d);

        uint8x8_t d01;
        compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset,
                                 vget_low_s16(round_offset_vec), &d01);

        store_u8_4x1(d_u8, d01);

        s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        s4 = s5;
        s += src_stride;
        d += dst_stride;
        d_u8 += dst8_stride;
        height--;
#endif  // AOM_ARCH_AARCH64
      } while (height != 0);
      src_ptr += 4;
      dst_ptr += 4;
      dst8_ptr += 4;
      width -= 4;
    } while (width != 0);
  } else {
    do {
      const uint8_t *s = src_ptr + (5 * src_stride);
      CONV_BUF_TYPE *d = dst_ptr;
      uint8_t *d_u8 = dst8_ptr;
      int height = h;

      uint8x8_t t0, t1, t2, t3, t4;
      load_u8_8x5(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4);

      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));

      do {
#if AOM_ARCH_AARCH64
        uint8x8_t t5, t6, t7;
        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t0));
        int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t1));
        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t2));
        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t3));
        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t4));
        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t5));
        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t6));
        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t7));

        uint16x8_t d0 =
            convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
        uint16x8_t d1 =
            convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter, round_offset_vec);
        uint16x8_t d2 =
            convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec);
        uint16x8_t d3 =
            convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter, round_offset_vec);
        uint16x8_t d4 =
            convolve6_8_y(s4, s5, s6, s7, s8, s9, y_filter, round_offset_vec);
        uint16x8_t d5 =
            convolve6_8_y(s5, s6, s7, s8, s9, s10, y_filter, round_offset_vec);
        uint16x8_t d6 =
            convolve6_8_y(s6, s7, s8, s9, s10, s11, y_filter, round_offset_vec);
        uint16x8_t d7 = convolve6_8_y(s7, s8, s9, s10, s11, s12, y_filter,
                                      round_offset_vec);

        uint16x8_t dd0, dd1, dd2, dd3;
        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);

        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
        compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
                                 bck_offset, round_offset_vec, &d0_u8, &d1_u8,
                                 &d2_u8, &d3_u8);

        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
        d_u8 += 4 * dst8_stride;

        uint16x8_t dd4, dd5, dd6, dd7;
        load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);

        uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
        compute_dist_wtd_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, fwd_offset,
                                 bck_offset, round_offset_vec, &d4_u8, &d5_u8,
                                 &d6_u8, &d7_u8);

        store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8);
        d_u8 += 4 * dst8_stride;

        s0 = s8;
        s1 = s9;
        s2 = s10;
        s3 = s11;
        s4 = s12;
        s += 8 * src_stride;
        d += 8 * dst_stride;
        height -= 8;
#else   // !AOM_ARCH_AARCH64
        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));

        uint16x8_t d0 =
            convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);

        s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        s4 = s5;

        uint16x8_t dd0 = vld1q_u16(d);

        uint8x8_t d0_u8;
        compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset,
                                 round_offset_vec, &d0_u8);

        vst1_u8(d_u8, d0_u8);
        d_u8 += dst8_stride;

        s += src_stride;
        d += dst_stride;
        height--;
#endif  // AOM_ARCH_AARCH64
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      dst8_ptr += 8;
      width -= 8;
    } while (width != 0);
  }
}

static inline void dist_wtd_convolve_y_6tap_avg_neon(
    const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr,
    const int dst8_stride, int w, int h, const int16x8_t y_filter,
    ConvolveParams *conv_params) {
  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);

  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
  const int dst_stride = conv_params->dst_stride;
  int width = w;

  if (w == 4 || h == 4) {
    do {
      const uint8_t *s = src_ptr;
      CONV_BUF_TYPE *d = dst_ptr;
      uint8_t *d_u8 = dst8_ptr;
      int height = h;

      uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
      uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
      uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
      uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
      uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);

      int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
      int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
      int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
      int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
      int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));

      s += 5 * src_stride;

      do {
#if AOM_ARCH_AARCH64
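        // The AArch64 branch filters four rows per loop iteration; the
        // Armv7 fallback below processes one row at a time instead.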
        t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
        t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
        t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
        t3 = load_unaligned_u8_4x1(s + 3 * src_stride);

        int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
        int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
        int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
        int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));

        uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
                                      vget_low_s16(round_offset_vec));
        uint16x4_t d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter,
                                      vget_low_s16(round_offset_vec));
        uint16x4_t d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter,
                                      vget_low_s16(round_offset_vec));
        uint16x4_t d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter,
                                      vget_low_s16(round_offset_vec));

        uint16x4_t dd0, dd1, dd2, dd3;
        load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);

        uint8x8_t d01, d23;
        compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
                              round_offset_vec, &d01, &d23);

        store_u8x4_strided_x2(d_u8 + 0 * dst8_stride, dst8_stride, d01);
        store_u8x4_strided_x2(d_u8 + 2 * dst8_stride, dst8_stride, d23);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        d_u8 += 4 * dst8_stride;
        height -= 4;
#else   // !AOM_ARCH_AARCH64
        t0 = load_unaligned_u8_4x1(s);
        int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));

        uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
                                      vget_low_s16(round_offset_vec));

        uint16x4_t dd0 = vld1_u16(d);

        uint8x8_t d01;
        compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01);

        store_u8_4x1(d_u8, d01);

        s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        s4 = s5;
        s += src_stride;
        d += dst_stride;
        d_u8 += dst8_stride;
        height--;
#endif  // AOM_ARCH_AARCH64
      } while (height != 0);
      src_ptr += 4;
      dst_ptr += 4;
      dst8_ptr += 4;
      width -= 4;
    } while (width != 0);
  } else {
    do {
      const uint8_t *s = src_ptr + (5 * src_stride);
      CONV_BUF_TYPE *d = dst_ptr;
      uint8_t *d_u8 = dst8_ptr;
      int height = h;

      uint8x8_t t0, t1, t2, t3, t4;
      load_u8_8x5(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4);

      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));

      do {
#if AOM_ARCH_AARCH64
        uint8x8_t t5, t6, t7;
        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t0));
        int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t1));
        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t2));
        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t3));
        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t4));
        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t5));
        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t6));
        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t7));

        uint16x8_t d0 =
            convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
        uint16x8_t d1 =
            convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter, round_offset_vec);
        uint16x8_t d0 =
            convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
        uint16x8_t d1 =
            convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter, round_offset_vec);
        uint16x8_t d2 =
            convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec);
        uint16x8_t d3 =
            convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter, round_offset_vec);
        uint16x8_t d4 =
            convolve6_8_y(s4, s5, s6, s7, s8, s9, y_filter, round_offset_vec);
        uint16x8_t d5 =
            convolve6_8_y(s5, s6, s7, s8, s9, s10, y_filter, round_offset_vec);
        uint16x8_t d6 =
            convolve6_8_y(s6, s7, s8, s9, s10, s11, y_filter, round_offset_vec);
        uint16x8_t d7 = convolve6_8_y(s7, s8, s9, s10, s11, s12, y_filter,
                                      round_offset_vec);

        uint16x8_t dd0, dd1, dd2, dd3;
        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);

        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
        compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
                              round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);

        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
        d_u8 += 4 * dst8_stride;

        uint16x8_t dd4, dd5, dd6, dd7;
        load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);

        uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
        compute_basic_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7,
                              round_offset_vec, &d4_u8, &d5_u8, &d6_u8, &d7_u8);

        store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8);
        d_u8 += 4 * dst8_stride;

        s0 = s8;
        s1 = s9;
        s2 = s10;
        s3 = s11;
        s4 = s12;
        s += 8 * src_stride;
        d += 8 * dst_stride;
        height -= 8;
#else   // !AOM_ARCH_AARCH64
        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));

        uint16x8_t d0 =
            convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);

        s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        s4 = s5;

        uint16x8_t dd0 = vld1q_u16(d);

        uint8x8_t d0_u8;
        compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8);

        vst1_u8(d_u8, d0_u8);
        d_u8 += dst8_stride;

        s += src_stride;
        d += dst_stride;
        height--;
#endif  // AOM_ARCH_AARCH64
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      dst8_ptr += 8;
      width -= 8;
    } while (width != 0);
  }
}

static inline void dist_wtd_convolve_y_6tap_neon(const uint8_t *src_ptr,
                                                 int src_stride, int w, int h,
                                                 const int16x8_t y_filter,
                                                 ConvolveParams *conv_params) {
  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);

  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
  const int dst_stride = conv_params->dst_stride;
  int width = w;

  if (w == 4 || h == 4) {
    do {
      const uint8_t *s = src_ptr;
      CONV_BUF_TYPE *d = dst_ptr;
      int height = h;

      uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
      uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
      uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
      uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
      uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);

      int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
      int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
      int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
      int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
      int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));

      s += 5 * src_stride;

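      // The 6-tap filter needs taps - 1 = 5 rows of context, so five rows
      // are preloaded above and each loop iteration only fetches the rows
      // that are new.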
      do {
#if AOM_ARCH_AARCH64
        t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
        t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
        t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
        t3 = load_unaligned_u8_4x1(s + 3 * src_stride);

        int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
        int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
        int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
        int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));

        uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
                                      vget_low_s16(round_offset_vec));
        uint16x4_t d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter,
                                      vget_low_s16(round_offset_vec));
        uint16x4_t d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter,
                                      vget_low_s16(round_offset_vec));
        uint16x4_t d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter,
                                      vget_low_s16(round_offset_vec));

        store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
#else   // !AOM_ARCH_AARCH64
        t0 = load_unaligned_u8_4x1(s);
        int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));

        uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
                                      vget_low_s16(round_offset_vec));

        vst1_u16(d, d0);

        s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        s4 = s5;
        s += src_stride;
        d += dst_stride;
        height--;
#endif  // AOM_ARCH_AARCH64
      } while (height != 0);
      src_ptr += 4;
      dst_ptr += 4;
      width -= 4;
    } while (width != 0);
  } else {
    do {
      const uint8_t *s = src_ptr + (5 * src_stride);
      CONV_BUF_TYPE *d = dst_ptr;
      int height = h;

      uint8x8_t t0, t1, t2, t3, t4;
      load_u8_8x5(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4);

      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));

      do {
#if AOM_ARCH_AARCH64
        uint8x8_t t5, t6, t7;
        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t0));
        int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t1));
        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t2));
        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t3));
        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t4));
        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t5));
        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t6));
        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t7));

        uint16x8_t d0 =
            convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
        uint16x8_t d1 =
            convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter, round_offset_vec);
        uint16x8_t d2 =
            convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec);
        uint16x8_t d3 =
            convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter, round_offset_vec);
        uint16x8_t d4 =
            convolve6_8_y(s4, s5, s6, s7, s8, s9, y_filter, round_offset_vec);
        uint16x8_t d5 =
            convolve6_8_y(s5, s6, s7, s8, s9, s10, y_filter, round_offset_vec);
        uint16x8_t d6 =
            convolve6_8_y(s6, s7, s8, s9, s10, s11, y_filter, round_offset_vec);
        uint16x8_t d7 = convolve6_8_y(s7, s8, s9, s10, s11, s12, y_filter,
                                      round_offset_vec);

        store_u16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);

        s0 = s8;
        s1 = s9;
        s2 = s10;
        s3 = s11;
        s4 = s12;
        s += 8 * src_stride;
        d += 8 * dst_stride;
        height -= 8;
#else   // !AOM_ARCH_AARCH64
        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));

        uint16x8_t d0 =
            convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);

        s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        s4 = s5;

        vst1q_u16(d, d0);

        s += src_stride;
        d += dst_stride;
        height--;
#endif  // AOM_ARCH_AARCH64
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      width -= 8;
    } while (width != 0);
  }
}

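// The 8-tap helpers below follow the same per-lane multiply-accumulate
// pattern as the 6-tap versions: one vmul_lane_s16/vmla_lane_s16 per filter
// tap, then a single vrsra_n_s16 that folds the rounding right shift and the
// round_offset addition into one instruction.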
static inline uint16x4_t convolve8_4_y(const int16x4_t s0, const int16x4_t s1,
                                       const int16x4_t s2, const int16x4_t s3,
                                       const int16x4_t s4, const int16x4_t s5,
                                       const int16x4_t s6, const int16x4_t s7,
                                       const int16x8_t y_filter,
                                       const int16x4_t round_offset) {
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);

  int16x4_t sum = vmul_lane_s16(s0, y_filter_0_3, 0);
  sum = vmla_lane_s16(sum, s1, y_filter_0_3, 1);
  sum = vmla_lane_s16(sum, s2, y_filter_0_3, 2);
  sum = vmla_lane_s16(sum, s3, y_filter_0_3, 3);
  sum = vmla_lane_s16(sum, s4, y_filter_4_7, 0);
  sum = vmla_lane_s16(sum, s5, y_filter_4_7, 1);
  sum = vmla_lane_s16(sum, s6, y_filter_4_7, 2);
  sum = vmla_lane_s16(sum, s7, y_filter_4_7, 3);

  // We halved the convolution filter values so -1 from the right shift.
  int16x4_t res = vrsra_n_s16(round_offset, sum, ROUND0_BITS - 1);
  return vreinterpret_u16_s16(res);
}

static inline uint16x8_t convolve8_8_y(const int16x8_t s0, const int16x8_t s1,
                                       const int16x8_t s2, const int16x8_t s3,
                                       const int16x8_t s4, const int16x8_t s5,
                                       const int16x8_t s6, const int16x8_t s7,
                                       const int16x8_t y_filter,
                                       const int16x8_t round_offset) {
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);

  int16x8_t sum = vmulq_lane_s16(s0, y_filter_0_3, 0);
  sum = vmlaq_lane_s16(sum, s1, y_filter_0_3, 1);
  sum = vmlaq_lane_s16(sum, s2, y_filter_0_3, 2);
  sum = vmlaq_lane_s16(sum, s3, y_filter_0_3, 3);
  sum = vmlaq_lane_s16(sum, s4, y_filter_4_7, 0);
  sum = vmlaq_lane_s16(sum, s5, y_filter_4_7, 1);
  sum = vmlaq_lane_s16(sum, s6, y_filter_4_7, 2);
  sum = vmlaq_lane_s16(sum, s7, y_filter_4_7, 3);

  // We halved the convolution filter values so -1 from the right shift.
  int16x8_t res = vrsraq_n_s16(round_offset, sum, ROUND0_BITS - 1);
  return vreinterpretq_u16_s16(res);
}

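// Note: with the usual AV1 constants (FILTER_BITS == 7, ROUND0_BITS == 3,
// COMPOUND_ROUND1_BITS == 7) and bd == 8, offset_bits is 8 + 14 - 3 = 19, so
// the round_offset computed below works out to (1 << 12) + (1 << 11) = 6144.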
static inline void dist_wtd_convolve_y_8tap_dist_wtd_avg_neon(
    const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr,
    const int dst8_stride, int w, int h, const int16x8_t y_filter,
    ConvolveParams *conv_params) {
  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);

  const uint16_t fwd_offset = conv_params->fwd_offset;
  const uint16_t bck_offset = conv_params->bck_offset;

  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
  const int dst_stride = conv_params->dst_stride;
  int width = w;

  if (w == 4 || h == 4) {
    do {
      const uint8_t *s = src_ptr;
      CONV_BUF_TYPE *d = dst_ptr;
      uint8_t *d_u8 = dst8_ptr;
      int height = h;

      __builtin_prefetch(s + 0 * src_stride);
      __builtin_prefetch(s + 1 * src_stride);
      __builtin_prefetch(s + 2 * src_stride);
      __builtin_prefetch(s + 3 * src_stride);

      uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
      uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
      uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
      uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
      uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
      uint8x8_t t5 = load_unaligned_u8_4x1(s + 5 * src_stride);
      uint8x8_t t6 = load_unaligned_u8_4x1(s + 6 * src_stride);

      int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
      int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
      int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
      int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
      int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
      int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
      int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));

      __builtin_prefetch(d + 0 * dst_stride);
      __builtin_prefetch(d + 1 * dst_stride);
      __builtin_prefetch(d + 2 * dst_stride);
      __builtin_prefetch(d + 3 * dst_stride);

      s += 7 * src_stride;

      do {
#if AOM_ARCH_AARCH64
        t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
        t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
        t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
        t3 = load_unaligned_u8_4x1(s + 3 * src_stride);

        int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
        int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
        int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
        int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));

        uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
                                      vget_low_s16(round_offset_vec));
        uint16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
                                      vget_low_s16(round_offset_vec));
        uint16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
                                      vget_low_s16(round_offset_vec));
        uint16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
                                      vget_low_s16(round_offset_vec));

        __builtin_prefetch(d + 0 * dst_stride);
        __builtin_prefetch(d + 1 * dst_stride);
        __builtin_prefetch(d + 2 * dst_stride);
        __builtin_prefetch(d + 3 * dst_stride);

        __builtin_prefetch(d_u8 + 0 * dst8_stride);
        __builtin_prefetch(d_u8 + 1 * dst8_stride);
        __builtin_prefetch(d_u8 + 2 * dst8_stride);
        __builtin_prefetch(d_u8 + 3 * dst8_stride);

        uint16x4_t dd0, dd1, dd2, dd3;
        load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);

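        // fwd_offset and bck_offset sum to (1 << DIST_PRECISION_BITS), so the
        // helper below forms a distance-weighted average of the buffered
        // first-pass prediction (dd0-dd3) and the current convolution
        // (d0-d3), removes round_offset and narrows the result to 8 bits.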
        uint8x8_t d01, d23;
        compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
                                 bck_offset, round_offset_vec, &d01, &d23);

        store_u8x4_strided_x2(d_u8 + 0 * dst8_stride, dst8_stride, d01);
        store_u8x4_strided_x2(d_u8 + 2 * dst8_stride, dst8_stride, d23);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        d_u8 += 4 * dst8_stride;
        height -= 4;
#else   // !AOM_ARCH_AARCH64
        t0 = load_unaligned_u8_4x1(s);
        int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));

        uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
                                      vget_low_s16(round_offset_vec));

        __builtin_prefetch(d);

        uint16x4_t dd0 = vld1_u16(d);

        uint8x8_t d01;
        compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset,
                                 vget_low_s16(round_offset_vec), &d01);

        store_u8_4x1(d_u8, d01);

        s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        s4 = s5;
        s5 = s6;
        s6 = s7;
        s += src_stride;
        d += dst_stride;
        d_u8 += dst8_stride;
        height--;
#endif  // AOM_ARCH_AARCH64
      } while (height != 0);
      src_ptr += 4;
      dst_ptr += 4;
      dst8_ptr += 4;
      width -= 4;
    } while (width != 0);
  } else {
    do {
      const uint8_t *s = src_ptr;
      CONV_BUF_TYPE *d = dst_ptr;
      uint8_t *d_u8 = dst8_ptr;
      int height = h;

      __builtin_prefetch(s + 0 * src_stride);
      __builtin_prefetch(s + 1 * src_stride);
      __builtin_prefetch(s + 2 * src_stride);
      __builtin_prefetch(s + 3 * src_stride);
      __builtin_prefetch(s + 4 * src_stride);
      __builtin_prefetch(s + 5 * src_stride);
      __builtin_prefetch(s + 6 * src_stride);
      __builtin_prefetch(s + 7 * src_stride);

      uint8x8_t t0, t1, t2, t3, t4, t5, t6;
      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);

      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

      s += 7 * src_stride;

      do {
#if AOM_ARCH_AARCH64
        uint8x8_t t7;
        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
        int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
        int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));

        __builtin_prefetch(dst_ptr + 0 * dst_stride);
        __builtin_prefetch(dst_ptr + 1 * dst_stride);
        __builtin_prefetch(dst_ptr + 2 * dst_stride);
        __builtin_prefetch(dst_ptr + 3 * dst_stride);

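        // Eight output rows are convolved per iteration; the averaging
        // helpers handle four rows at a time, so the blend and store below
        // happen in two groups of four.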
        uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
                                      round_offset_vec);
        uint16x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
                                      round_offset_vec);
        uint16x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
                                      round_offset_vec);
        uint16x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
                                      round_offset_vec);
        uint16x8_t d4 = convolve8_8_y(s4, s5, s6, s7, s8, s9, s10, s11,
                                      y_filter, round_offset_vec);
        uint16x8_t d5 = convolve8_8_y(s5, s6, s7, s8, s9, s10, s11, s12,
                                      y_filter, round_offset_vec);
        uint16x8_t d6 = convolve8_8_y(s6, s7, s8, s9, s10, s11, s12, s13,
                                      y_filter, round_offset_vec);
        uint16x8_t d7 = convolve8_8_y(s7, s8, s9, s10, s11, s12, s13, s14,
                                      y_filter, round_offset_vec);

        __builtin_prefetch(d + 0 * dst8_stride);
        __builtin_prefetch(d + 1 * dst8_stride);
        __builtin_prefetch(d + 2 * dst8_stride);
        __builtin_prefetch(d + 3 * dst8_stride);

        uint16x8_t dd0, dd1, dd2, dd3;
        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);

        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
        compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
                                 bck_offset, round_offset_vec, &d0_u8, &d1_u8,
                                 &d2_u8, &d3_u8);

        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
        d_u8 += 4 * dst8_stride;

        uint16x8_t dd4, dd5, dd6, dd7;
        load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);

        uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
        compute_dist_wtd_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, fwd_offset,
                                 bck_offset, round_offset_vec, &d4_u8, &d5_u8,
                                 &d6_u8, &d7_u8);

        store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8);
        d_u8 += 4 * dst8_stride;

        s0 = s8;
        s1 = s9;
        s2 = s10;
        s3 = s11;
        s4 = s12;
        s5 = s13;
        s6 = s14;
        s += 8 * src_stride;
        d += 8 * dst_stride;
        height -= 8;
#else   // !AOM_ARCH_AARCH64
        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));

        __builtin_prefetch(dst_ptr);

        uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
                                      round_offset_vec);

        s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        s4 = s5;
        s5 = s6;
        s6 = s7;

        __builtin_prefetch(d);

        uint16x8_t dd0 = vld1q_u16(d);

        uint8x8_t d0_u8;
        compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset,
                                 round_offset_vec, &d0_u8);

        vst1_u8(d_u8, d0_u8);
        d_u8 += dst8_stride;

        s += src_stride;
        d += dst_stride;
        height--;
#endif  // AOM_ARCH_AARCH64
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      dst8_ptr += 8;
      width -= 8;
    } while (width != 0);
  }
}

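// Basic compound averaging: the two predictions are blended with equal
// weights (rather than the fwd/bck distance weights) before round_offset is
// removed and the result is narrowed to 8 bits.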
static inline void dist_wtd_convolve_y_8tap_avg_neon(
    const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr,
    const int dst8_stride, int w, int h, const int16x8_t y_filter,
    ConvolveParams *conv_params) {
  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);

  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
  const int dst_stride = conv_params->dst_stride;
  int width = w;

  if (w == 4 || h == 4) {
    do {
      const uint8_t *s = src_ptr;
      CONV_BUF_TYPE *d = dst_ptr;
      uint8_t *d_u8 = dst8_ptr;
      int height = h;

      __builtin_prefetch(s + 0 * src_stride);
      __builtin_prefetch(s + 1 * src_stride);
      __builtin_prefetch(s + 2 * src_stride);
      __builtin_prefetch(s + 3 * src_stride);

      uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
      uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
      uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
      uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
      uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
      uint8x8_t t5 = load_unaligned_u8_4x1(s + 5 * src_stride);
      uint8x8_t t6 = load_unaligned_u8_4x1(s + 6 * src_stride);

      int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
      int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
      int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
      int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
      int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
      int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
      int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));

      __builtin_prefetch(d + 0 * dst_stride);
      __builtin_prefetch(d + 1 * dst_stride);
      __builtin_prefetch(d + 2 * dst_stride);
      __builtin_prefetch(d + 3 * dst_stride);

      s += 7 * src_stride;

      do {
#if AOM_ARCH_AARCH64
        t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
        t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
        t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
        t3 = load_unaligned_u8_4x1(s + 3 * src_stride);

        int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
        int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
        int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
        int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));

        uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
                                      vget_low_s16(round_offset_vec));
        uint16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
                                      vget_low_s16(round_offset_vec));
        uint16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
                                      vget_low_s16(round_offset_vec));
        uint16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
                                      vget_low_s16(round_offset_vec));

        __builtin_prefetch(d + 0 * dst_stride);
        __builtin_prefetch(d + 1 * dst_stride);
        __builtin_prefetch(d + 2 * dst_stride);
        __builtin_prefetch(d + 3 * dst_stride);

        __builtin_prefetch(d_u8 + 0 * dst8_stride);
        __builtin_prefetch(d_u8 + 1 * dst8_stride);
        __builtin_prefetch(d_u8 + 2 * dst8_stride);
        __builtin_prefetch(d_u8 + 3 * dst8_stride);

        uint16x4_t dd0, dd1, dd2, dd3;
        load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);

        uint8x8_t d01, d23;
        compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
                              round_offset_vec, &d01, &d23);

        store_u8x4_strided_x2(d_u8 + 0 * dst8_stride, dst8_stride, d01);
        store_u8x4_strided_x2(d_u8 + 2 * dst8_stride, dst8_stride, d23);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        d_u8 += 4 * dst8_stride;
        height -= 4;
#else   // !AOM_ARCH_AARCH64
        t0 = load_unaligned_u8_4x1(s);
        int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));

        uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
                                      vget_low_s16(round_offset_vec));

        __builtin_prefetch(d);

        uint16x4_t dd0 = vld1_u16(d);

        uint8x8_t d01;
        compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01);

        store_u8_4x1(d_u8, d01);

        s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        s4 = s5;
        s5 = s6;
        s6 = s7;
        s += src_stride;
        d += dst_stride;
        d_u8 += dst8_stride;
        height--;
#endif  // AOM_ARCH_AARCH64
      } while (height != 0);
      src_ptr += 4;
      dst_ptr += 4;
      dst8_ptr += 4;
      width -= 4;
    } while (width != 0);
  } else {
    do {
      const uint8_t *s = src_ptr;
      CONV_BUF_TYPE *d = dst_ptr;
      uint8_t *d_u8 = dst8_ptr;
      int height = h;

      __builtin_prefetch(s + 0 * src_stride);
      __builtin_prefetch(s + 1 * src_stride);
      __builtin_prefetch(s + 2 * src_stride);
      __builtin_prefetch(s + 3 * src_stride);
      __builtin_prefetch(s + 4 * src_stride);
      __builtin_prefetch(s + 5 * src_stride);
      __builtin_prefetch(s + 6 * src_stride);
      __builtin_prefetch(s + 7 * src_stride);

      uint8x8_t t0, t1, t2, t3, t4, t5, t6;
      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);

      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

      s += 7 * src_stride;

      do {
#if AOM_ARCH_AARCH64
        uint8x8_t t7;
        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
        int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
        int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));

        __builtin_prefetch(dst_ptr + 0 * dst_stride);
        __builtin_prefetch(dst_ptr + 1 * dst_stride);
        __builtin_prefetch(dst_ptr + 2 * dst_stride);
        __builtin_prefetch(dst_ptr + 3 * dst_stride);

        uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
                                      round_offset_vec);
        uint16x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
                                      round_offset_vec);
        uint16x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
                                      round_offset_vec);
        uint16x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
                                      round_offset_vec);
        uint16x8_t d4 = convolve8_8_y(s4, s5, s6, s7, s8, s9, s10, s11,
                                      y_filter, round_offset_vec);
        uint16x8_t d5 = convolve8_8_y(s5, s6, s7, s8, s9, s10, s11, s12,
                                      y_filter, round_offset_vec);
        uint16x8_t d6 = convolve8_8_y(s6, s7, s8, s9, s10, s11, s12, s13,
                                      y_filter, round_offset_vec);
        uint16x8_t d7 = convolve8_8_y(s7, s8, s9, s10, s11, s12, s13, s14,
                                      y_filter, round_offset_vec);

        __builtin_prefetch(d + 0 * dst8_stride);
        __builtin_prefetch(d + 1 * dst8_stride);
        __builtin_prefetch(d + 2 * dst8_stride);
        __builtin_prefetch(d + 3 * dst8_stride);

        uint16x8_t dd0, dd1, dd2, dd3;
        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);

        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
        compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
                              round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);

        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
        d_u8 += 4 * dst8_stride;

        uint16x8_t dd4, dd5, dd6, dd7;
        load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);

        uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
        compute_basic_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7,
                              round_offset_vec, &d4_u8, &d5_u8, &d6_u8, &d7_u8);

        store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8);
        d_u8 += 4 * dst8_stride;

        s0 = s8;
        s1 = s9;
        s2 = s10;
        s3 = s11;
        s4 = s12;
        s5 = s13;
        s6 = s14;
        s += 8 * src_stride;
        d += 8 * dst_stride;
        height -= 8;
#else   // !AOM_ARCH_AARCH64
        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));

        __builtin_prefetch(dst_ptr);

        uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
                                      round_offset_vec);

        s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        s4 = s5;
        s5 = s6;
        s6 = s7;

        __builtin_prefetch(d);

        uint16x8_t dd0 = vld1q_u16(d);

        uint8x8_t d0_u8;
        compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8);

        vst1_u8(d_u8, d0_u8);
        d_u8 += dst8_stride;

        s += src_stride;
        d += dst_stride;
        height--;
#endif  // AOM_ARCH_AARCH64
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      dst8_ptr += 8;
      width -= 8;
    } while (width != 0);
  }
}

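// First-pass-only variant: convolution results are written to the
// intermediate 16-bit CONV_BUF_TYPE buffer (conv_params->dst) without
// averaging against a previous prediction.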
static inline void dist_wtd_convolve_y_8tap_neon(const uint8_t *src_ptr,
                                                 int src_stride, int w, int h,
                                                 const int16x8_t y_filter,
                                                 ConvolveParams *conv_params) {
  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);

  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
  const int dst_stride = conv_params->dst_stride;
  int width = w;

  if (w == 4 || h == 4) {
    do {
      const uint8_t *s = src_ptr;
      CONV_BUF_TYPE *d = dst_ptr;
      int height = h;

      __builtin_prefetch(s + 0 * src_stride);
      __builtin_prefetch(s + 1 * src_stride);
      __builtin_prefetch(s + 2 * src_stride);
      __builtin_prefetch(s + 3 * src_stride);

      uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
      uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
      uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
      uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
      uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
      uint8x8_t t5 = load_unaligned_u8_4x1(s + 5 * src_stride);
      uint8x8_t t6 = load_unaligned_u8_4x1(s + 6 * src_stride);

      int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
      int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
      int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
      int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
      int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
      int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
      int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));

      __builtin_prefetch(d + 0 * dst_stride);
      __builtin_prefetch(d + 1 * dst_stride);
      __builtin_prefetch(d + 2 * dst_stride);
      __builtin_prefetch(d + 3 * dst_stride);

      s += 7 * src_stride;

      do {
#if AOM_ARCH_AARCH64
        t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
        t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
        t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
        t3 = load_unaligned_u8_4x1(s + 3 * src_stride);

        int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
        int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
        int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
        int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));

        uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
                                      vget_low_s16(round_offset_vec));
        uint16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
                                      vget_low_s16(round_offset_vec));
        uint16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
                                      vget_low_s16(round_offset_vec));
        uint16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
                                      vget_low_s16(round_offset_vec));

        store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
#else   // !AOM_ARCH_AARCH64
        t0 = load_unaligned_u8_4x1(s);
        int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));

        uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
                                      vget_low_s16(round_offset_vec));

        vst1_u16(d, d0);

        s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        s4 = s5;
        s5 = s6;
        s6 = s7;
        s += src_stride;
        d += dst_stride;
        height--;
#endif  // AOM_ARCH_AARCH64
      } while (height != 0);
      src_ptr += 4;
      dst_ptr += 4;
      width -= 4;
    } while (width != 0);
  } else {
    do {
      const uint8_t *s = src_ptr;
      CONV_BUF_TYPE *d = dst_ptr;
      int height = h;

      __builtin_prefetch(s + 0 * src_stride);
      __builtin_prefetch(s + 1 * src_stride);
      __builtin_prefetch(s + 2 * src_stride);
      __builtin_prefetch(s + 3 * src_stride);
      __builtin_prefetch(s + 4 * src_stride);
      __builtin_prefetch(s + 5 * src_stride);
      __builtin_prefetch(s + 6 * src_stride);
      __builtin_prefetch(s + 7 * src_stride);

      uint8x8_t t0, t1, t2, t3, t4, t5, t6;
      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);

      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

      s += 7 * src_stride;

      do {
#if AOM_ARCH_AARCH64
        uint8x8_t t7;
        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
        int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
        int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));

        __builtin_prefetch(dst_ptr + 0 * dst_stride);
        __builtin_prefetch(dst_ptr + 1 * dst_stride);
        __builtin_prefetch(dst_ptr + 2 * dst_stride);
        __builtin_prefetch(dst_ptr + 3 * dst_stride);

        uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
                                      round_offset_vec);
        uint16x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
                                      round_offset_vec);
        uint16x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
                                      round_offset_vec);
        uint16x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
                                      round_offset_vec);
        uint16x8_t d4 = convolve8_8_y(s4, s5, s6, s7, s8, s9, s10, s11,
                                      y_filter, round_offset_vec);
        uint16x8_t d5 = convolve8_8_y(s5, s6, s7, s8, s9, s10, s11, s12,
                                      y_filter, round_offset_vec);
        uint16x8_t d6 = convolve8_8_y(s6, s7, s8, s9, s10, s11, s12, s13,
                                      y_filter, round_offset_vec);
        uint16x8_t d7 = convolve8_8_y(s7, s8, s9, s10, s11, s12, s13, s14,
                                      y_filter, round_offset_vec);

        store_u16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);

        s0 = s8;
        s1 = s9;
        s2 = s10;
        s3 = s11;
        s4 = s12;
        s5 = s13;
        s6 = s14;
        s += 8 * src_stride;
        d += 8 * dst_stride;
        height -= 8;
#else   // !AOM_ARCH_AARCH64
        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));

        __builtin_prefetch(dst_ptr);

        uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
                                      round_offset_vec);

        s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        s4 = s5;
        s5 = s6;
        s6 = s7;

        vst1q_u16(d, d0);

        s += src_stride;
        d += dst_stride;
        height--;
#endif  // AOM_ARCH_AARCH64
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      width -= 8;
    } while (width != 0);
  }
}

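// Entry point: selects between the 6-tap and 8-tap kernels based on the
// actual filter length, and between the no-average, basic-average and
// distance-weighted-average variants based on conv_params. The 6-tap kernels
// are stored as 8-tap arrays with zero end taps, so those paths advance the
// source pointer by one extra row to skip the zero leading tap.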
void av1_dist_wtd_convolve_y_neon(const uint8_t *src, int src_stride,
                                  uint8_t *dst8, int dst8_stride, int w, int h,
                                  const InterpFilterParams *filter_params_y,
                                  const int subpel_y_qn,
                                  ConvolveParams *conv_params) {
  assert(w % 4 == 0);
  assert(h % 4 == 0);

  // Vertical filter.
  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  // Filter values are even, so downshift by 1 to reduce intermediate
  // precision requirements.
  const int16x8_t y_filter = vshrq_n_s16(vld1q_s16(y_filter_ptr), 1);

  const int vert_offset = filter_params_y->taps / 2 - 1;
  const uint8_t *src_ptr = src - (vert_offset * src_stride);

  if (get_filter_tap(filter_params_y, subpel_y_qn) <= 6) {
    if (conv_params->do_average) {
      if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
        dist_wtd_convolve_y_6tap_dist_wtd_avg_neon(
            src_ptr + src_stride, src_stride, dst8, dst8_stride, w, h, y_filter,
            conv_params);
      } else {
        dist_wtd_convolve_y_6tap_avg_neon(src_ptr + src_stride, src_stride,
                                          dst8, dst8_stride, w, h, y_filter,
                                          conv_params);
      }
    } else {
      dist_wtd_convolve_y_6tap_neon(src_ptr + src_stride, src_stride, w, h,
                                    y_filter, conv_params);
    }
  } else {
    if (conv_params->do_average) {
      if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
        dist_wtd_convolve_y_8tap_dist_wtd_avg_neon(src_ptr, src_stride, dst8,
                                                   dst8_stride, w, h, y_filter,
                                                   conv_params);
      } else {
        dist_wtd_convolve_y_8tap_avg_neon(src_ptr, src_stride, dst8,
                                          dst8_stride, w, h, y_filter,
                                          conv_params);
      }
    } else {
      dist_wtd_convolve_y_8tap_neon(src_ptr, src_stride, w, h, y_filter,
                                    conv_params);
    }
  }
}
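
// Usage sketch (illustrative only; the real setup lives in the generic
// convolve / reconinter code, and the parameter names here are
// placeholders). Compound prediction invokes this entry point once per
// reference:
//
//   ConvolveParams conv_params = ...;  // conv_params.dst is the 16-bit
//                                      // CONV_BUF_TYPE scratch buffer.
//   conv_params.do_average = 0;  // First pass: fill conv_params.dst only.
//   av1_dist_wtd_convolve_y_neon(ref0, ref0_stride, dst8, dst8_stride, w, h,
//                                filter_params_y, subpel_y_qn, &conv_params);
//   conv_params.do_average = 1;  // Second pass: blend with the buffered
//                                // prediction and write 8-bit pixels to dst8.
//   av1_dist_wtd_convolve_y_neon(ref1, ref1_stride, dst8, dst8_stride, w, h,
//                                filter_params_y, subpel_y_qn, &conv_params);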