highbd_compound_convolve_sve2.c (60652B)
1 /* 2 * Copyright (c) 2024, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <assert.h> 13 #include <arm_neon.h> 14 15 #include "config/aom_config.h" 16 #include "config/av1_rtcd.h" 17 18 #include "aom_dsp/aom_dsp_common.h" 19 #include "aom_dsp/arm/aom_neon_sve_bridge.h" 20 #include "aom_dsp/arm/aom_neon_sve2_bridge.h" 21 #include "aom_dsp/arm/mem_neon.h" 22 #include "aom_ports/mem.h" 23 #include "av1/common/convolve.h" 24 #include "av1/common/filter.h" 25 #include "av1/common/filter.h" 26 #include "av1/common/arm/highbd_compound_convolve_neon.h" 27 #include "av1/common/arm/highbd_convolve_neon.h" 28 #include "av1/common/arm/highbd_convolve_sve2.h" 29 30 DECLARE_ALIGNED(16, static const uint16_t, kDotProdTbl[32]) = { 31 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 32 4, 5, 6, 7, 5, 6, 7, 0, 6, 7, 0, 1, 7, 0, 1, 2, 33 }; 34 35 static inline uint16x8_t highbd_12_convolve8_8_x(int16x8_t s0[8], 36 int16x8_t filter, 37 int64x2_t offset) { 38 int64x2_t sum[8]; 39 sum[0] = aom_sdotq_s16(offset, s0[0], filter); 40 sum[1] = aom_sdotq_s16(offset, s0[1], filter); 41 sum[2] = aom_sdotq_s16(offset, s0[2], filter); 42 sum[3] = aom_sdotq_s16(offset, s0[3], filter); 43 sum[4] = aom_sdotq_s16(offset, s0[4], filter); 44 sum[5] = aom_sdotq_s16(offset, s0[5], filter); 45 sum[6] = aom_sdotq_s16(offset, s0[6], filter); 46 sum[7] = aom_sdotq_s16(offset, s0[7], filter); 47 48 sum[0] = vpaddq_s64(sum[0], sum[1]); 49 sum[2] = vpaddq_s64(sum[2], sum[3]); 50 sum[4] = vpaddq_s64(sum[4], sum[5]); 51 sum[6] = 
vpaddq_s64(sum[6], sum[7]); 52 53 int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[2])); 54 int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum[4]), vmovn_s64(sum[6])); 55 56 return vcombine_u16(vqrshrun_n_s32(sum0123, ROUND0_BITS + 2), 57 vqrshrun_n_s32(sum4567, ROUND0_BITS + 2)); 58 } 59 60 static inline void highbd_12_dist_wtd_convolve_x_8tap_sve2( 61 const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, 62 int width, int height, const int16_t *x_filter_ptr) { 63 const int64x1_t offset_vec = 64 vcreate_s64((1 << (12 + FILTER_BITS)) + (1 << (12 + FILTER_BITS - 1))); 65 const int64x2_t offset_lo = vcombine_s64(offset_vec, vdup_n_s64(0)); 66 67 const int16x8_t filter = vld1q_s16(x_filter_ptr); 68 69 do { 70 const int16_t *s = (const int16_t *)src; 71 uint16_t *d = dst; 72 int w = width; 73 74 do { 75 int16x8_t s0[8], s1[8], s2[8], s3[8]; 76 load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], 77 &s0[4], &s0[5], &s0[6], &s0[7]); 78 load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], 79 &s1[4], &s1[5], &s1[6], &s1[7]); 80 load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], 81 &s2[4], &s2[5], &s2[6], &s2[7]); 82 load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], 83 &s3[4], &s3[5], &s3[6], &s3[7]); 84 85 uint16x8_t d0 = highbd_12_convolve8_8_x(s0, filter, offset_lo); 86 uint16x8_t d1 = highbd_12_convolve8_8_x(s1, filter, offset_lo); 87 uint16x8_t d2 = highbd_12_convolve8_8_x(s2, filter, offset_lo); 88 uint16x8_t d3 = highbd_12_convolve8_8_x(s3, filter, offset_lo); 89 90 store_u16_8x4(d, dst_stride, d0, d1, d2, d3); 91 92 s += 8; 93 d += 8; 94 w -= 8; 95 } while (w != 0); 96 src += 4 * src_stride; 97 dst += 4 * dst_stride; 98 height -= 4; 99 } while (height != 0); 100 } 101 102 static inline uint16x8_t highbd_convolve8_8_x(int16x8_t s0[8], int16x8_t filter, 103 int64x2_t offset) { 104 int64x2_t sum[8]; 105 sum[0] = aom_sdotq_s16(offset, s0[0], filter); 106 sum[1] = 
aom_sdotq_s16(offset, s0[1], filter); 107 sum[2] = aom_sdotq_s16(offset, s0[2], filter); 108 sum[3] = aom_sdotq_s16(offset, s0[3], filter); 109 sum[4] = aom_sdotq_s16(offset, s0[4], filter); 110 sum[5] = aom_sdotq_s16(offset, s0[5], filter); 111 sum[6] = aom_sdotq_s16(offset, s0[6], filter); 112 sum[7] = aom_sdotq_s16(offset, s0[7], filter); 113 114 sum[0] = vpaddq_s64(sum[0], sum[1]); 115 sum[2] = vpaddq_s64(sum[2], sum[3]); 116 sum[4] = vpaddq_s64(sum[4], sum[5]); 117 sum[6] = vpaddq_s64(sum[6], sum[7]); 118 119 int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[2])); 120 int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum[4]), vmovn_s64(sum[6])); 121 122 return vcombine_u16(vqrshrun_n_s32(sum0123, ROUND0_BITS), 123 vqrshrun_n_s32(sum4567, ROUND0_BITS)); 124 } 125 126 static inline void highbd_dist_wtd_convolve_x_8tap_sve2( 127 const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, 128 int width, int height, const int16_t *x_filter_ptr, const int bd) { 129 const int64x1_t offset_vec = 130 vcreate_s64((1 << (bd + FILTER_BITS)) + (1 << (bd + FILTER_BITS - 1))); 131 const int64x2_t offset_lo = vcombine_s64(offset_vec, vdup_n_s64(0)); 132 133 const int16x8_t filter = vld1q_s16(x_filter_ptr); 134 135 do { 136 const int16_t *s = (const int16_t *)src; 137 uint16_t *d = dst; 138 int w = width; 139 140 do { 141 int16x8_t s0[8], s1[8], s2[8], s3[8]; 142 load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], 143 &s0[4], &s0[5], &s0[6], &s0[7]); 144 load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], 145 &s1[4], &s1[5], &s1[6], &s1[7]); 146 load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], 147 &s2[4], &s2[5], &s2[6], &s2[7]); 148 load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], 149 &s3[4], &s3[5], &s3[6], &s3[7]); 150 151 uint16x8_t d0 = highbd_convolve8_8_x(s0, filter, offset_lo); 152 uint16x8_t d1 = highbd_convolve8_8_x(s1, filter, offset_lo); 153 uint16x8_t d2 = 
highbd_convolve8_8_x(s2, filter, offset_lo); 154 uint16x8_t d3 = highbd_convolve8_8_x(s3, filter, offset_lo); 155 156 store_u16_8x4(d, dst_stride, d0, d1, d2, d3); 157 158 s += 8; 159 d += 8; 160 w -= 8; 161 } while (w != 0); 162 src += 4 * src_stride; 163 dst += 4 * dst_stride; 164 height -= 4; 165 } while (height != 0); 166 } 167 168 // clang-format off 169 DECLARE_ALIGNED(16, static const uint16_t, kDeinterleaveTbl[8]) = { 170 0, 2, 4, 6, 1, 3, 5, 7, 171 }; 172 // clang-format on 173 174 static inline uint16x4_t highbd_12_convolve4_4_x(int16x8_t s0, int16x8_t filter, 175 int64x2_t offset, 176 uint16x8x2_t permute_tbl) { 177 int16x8_t permuted_samples0 = aom_tbl_s16(s0, permute_tbl.val[0]); 178 int16x8_t permuted_samples1 = aom_tbl_s16(s0, permute_tbl.val[1]); 179 180 int64x2_t sum01 = aom_svdot_lane_s16(offset, permuted_samples0, filter, 0); 181 int64x2_t sum23 = aom_svdot_lane_s16(offset, permuted_samples1, filter, 0); 182 183 int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); 184 185 return vqrshrun_n_s32(sum0123, ROUND0_BITS + 2); 186 } 187 188 static inline uint16x8_t highbd_12_convolve4_8_x(int16x8_t s0[4], 189 int16x8_t filter, 190 int64x2_t offset, 191 uint16x8_t tbl) { 192 int64x2_t sum04 = aom_svdot_lane_s16(offset, s0[0], filter, 0); 193 int64x2_t sum15 = aom_svdot_lane_s16(offset, s0[1], filter, 0); 194 int64x2_t sum26 = aom_svdot_lane_s16(offset, s0[2], filter, 0); 195 int64x2_t sum37 = aom_svdot_lane_s16(offset, s0[3], filter, 0); 196 197 int32x4_t sum0415 = vcombine_s32(vmovn_s64(sum04), vmovn_s64(sum15)); 198 int32x4_t sum2637 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37)); 199 200 uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0415, ROUND0_BITS + 2), 201 vqrshrun_n_s32(sum2637, ROUND0_BITS + 2)); 202 return aom_tbl_u16(res, tbl); 203 } 204 205 static inline void highbd_12_dist_wtd_convolve_x_4tap_sve2( 206 const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, 207 int width, int height, const int16_t 
*x_filter_ptr) { 208 const int64x2_t offset = 209 vdupq_n_s64((1 << (12 + FILTER_BITS)) + (1 << (12 + FILTER_BITS - 1))); 210 211 const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); 212 const int16x8_t filter = vcombine_s16(x_filter, vdup_n_s16(0)); 213 214 if (width == 4) { 215 uint16x8x2_t permute_tbl = vld1q_u16_x2(kDotProdTbl); 216 217 const int16_t *s = (const int16_t *)(src); 218 219 do { 220 int16x8_t s0, s1, s2, s3; 221 load_s16_8x4(s, src_stride, &s0, &s1, &s2, &s3); 222 223 uint16x4_t d0 = highbd_12_convolve4_4_x(s0, filter, offset, permute_tbl); 224 uint16x4_t d1 = highbd_12_convolve4_4_x(s1, filter, offset, permute_tbl); 225 uint16x4_t d2 = highbd_12_convolve4_4_x(s2, filter, offset, permute_tbl); 226 uint16x4_t d3 = highbd_12_convolve4_4_x(s3, filter, offset, permute_tbl); 227 228 store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); 229 230 s += 4 * src_stride; 231 dst += 4 * dst_stride; 232 height -= 4; 233 } while (height != 0); 234 } else { 235 uint16x8_t idx = vld1q_u16(kDeinterleaveTbl); 236 237 do { 238 const int16_t *s = (const int16_t *)(src); 239 uint16_t *d = dst; 240 int w = width; 241 242 do { 243 int16x8_t s0[4], s1[4], s2[4], s3[4]; 244 load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); 245 load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); 246 load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); 247 load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); 248 249 uint16x8_t d0 = highbd_12_convolve4_8_x(s0, filter, offset, idx); 250 uint16x8_t d1 = highbd_12_convolve4_8_x(s1, filter, offset, idx); 251 uint16x8_t d2 = highbd_12_convolve4_8_x(s2, filter, offset, idx); 252 uint16x8_t d3 = highbd_12_convolve4_8_x(s3, filter, offset, idx); 253 254 store_u16_8x4(d, dst_stride, d0, d1, d2, d3); 255 256 s += 8; 257 d += 8; 258 w -= 8; 259 } while (w != 0); 260 src += 4 * src_stride; 261 dst += 4 * dst_stride; 262 height -= 4; 263 } while (height != 0); 264 } 265 } 266 267 static inline 
uint16x4_t highbd_convolve4_4_x(int16x8_t s0, int16x8_t filter, 268 int64x2_t offset, 269 uint16x8x2_t permute_tbl) { 270 int16x8_t permuted_samples0 = aom_tbl_s16(s0, permute_tbl.val[0]); 271 int16x8_t permuted_samples1 = aom_tbl_s16(s0, permute_tbl.val[1]); 272 273 int64x2_t sum01 = aom_svdot_lane_s16(offset, permuted_samples0, filter, 0); 274 int64x2_t sum23 = aom_svdot_lane_s16(offset, permuted_samples1, filter, 0); 275 276 int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); 277 278 return vqrshrun_n_s32(sum0123, ROUND0_BITS); 279 } 280 281 static inline uint16x8_t highbd_convolve4_8_x(int16x8_t s0[4], int16x8_t filter, 282 int64x2_t offset, 283 uint16x8_t tbl) { 284 int64x2_t sum04 = aom_svdot_lane_s16(offset, s0[0], filter, 0); 285 int64x2_t sum15 = aom_svdot_lane_s16(offset, s0[1], filter, 0); 286 int64x2_t sum26 = aom_svdot_lane_s16(offset, s0[2], filter, 0); 287 int64x2_t sum37 = aom_svdot_lane_s16(offset, s0[3], filter, 0); 288 289 int32x4_t sum0415 = vcombine_s32(vmovn_s64(sum04), vmovn_s64(sum15)); 290 int32x4_t sum2637 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37)); 291 292 uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0415, ROUND0_BITS), 293 vqrshrun_n_s32(sum2637, ROUND0_BITS)); 294 return aom_tbl_u16(res, tbl); 295 } 296 297 static inline void highbd_dist_wtd_convolve_x_4tap_sve2( 298 const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, 299 int width, int height, const int16_t *x_filter_ptr, const int bd) { 300 const int64x2_t offset = 301 vdupq_n_s64((1 << (bd + FILTER_BITS)) + (1 << (bd + FILTER_BITS - 1))); 302 303 const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); 304 const int16x8_t filter = vcombine_s16(x_filter, vdup_n_s16(0)); 305 306 if (width == 4) { 307 uint16x8x2_t permute_tbl = vld1q_u16_x2(kDotProdTbl); 308 309 const int16_t *s = (const int16_t *)(src); 310 311 do { 312 int16x8_t s0, s1, s2, s3; 313 load_s16_8x4(s, src_stride, &s0, &s1, &s2, &s3); 314 315 uint16x4_t d0 = 
highbd_convolve4_4_x(s0, filter, offset, permute_tbl); 316 uint16x4_t d1 = highbd_convolve4_4_x(s1, filter, offset, permute_tbl); 317 uint16x4_t d2 = highbd_convolve4_4_x(s2, filter, offset, permute_tbl); 318 uint16x4_t d3 = highbd_convolve4_4_x(s3, filter, offset, permute_tbl); 319 320 store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); 321 322 s += 4 * src_stride; 323 dst += 4 * dst_stride; 324 height -= 4; 325 } while (height != 0); 326 } else { 327 uint16x8_t idx = vld1q_u16(kDeinterleaveTbl); 328 329 do { 330 const int16_t *s = (const int16_t *)(src); 331 uint16_t *d = dst; 332 int w = width; 333 334 do { 335 int16x8_t s0[4], s1[4], s2[4], s3[4]; 336 load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); 337 load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); 338 load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); 339 load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); 340 341 uint16x8_t d0 = highbd_convolve4_8_x(s0, filter, offset, idx); 342 uint16x8_t d1 = highbd_convolve4_8_x(s1, filter, offset, idx); 343 uint16x8_t d2 = highbd_convolve4_8_x(s2, filter, offset, idx); 344 uint16x8_t d3 = highbd_convolve4_8_x(s3, filter, offset, idx); 345 346 store_u16_8x4(d, dst_stride, d0, d1, d2, d3); 347 348 s += 8; 349 d += 8; 350 w -= 8; 351 } while (w != 0); 352 src += 4 * src_stride; 353 dst += 4 * dst_stride; 354 height -= 4; 355 } while (height != 0); 356 } 357 } 358 359 void av1_highbd_dist_wtd_convolve_x_sve2( 360 const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, 361 int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, 362 ConvolveParams *conv_params, int bd) { 363 DECLARE_ALIGNED(16, uint16_t, 364 im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); 365 CONV_BUF_TYPE *dst16 = conv_params->dst; 366 const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); 367 368 if (x_filter_taps == 6) { 369 av1_highbd_dist_wtd_convolve_x_neon(src, 
src_stride, dst, dst_stride, w, h, 370 filter_params_x, subpel_x_qn, 371 conv_params, bd); 372 return; 373 } 374 375 int dst16_stride = conv_params->dst_stride; 376 const int im_stride = MAX_SB_SIZE; 377 const int horiz_offset = filter_params_x->taps / 2 - 1; 378 assert(FILTER_BITS == COMPOUND_ROUND1_BITS); 379 380 const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( 381 filter_params_x, subpel_x_qn & SUBPEL_MASK); 382 383 src -= horiz_offset; 384 385 if (bd == 12) { 386 if (conv_params->do_average) { 387 if (x_filter_taps <= 4) { 388 highbd_12_dist_wtd_convolve_x_4tap_sve2(src + 2, src_stride, im_block, 389 im_stride, w, h, x_filter_ptr); 390 } else { 391 highbd_12_dist_wtd_convolve_x_8tap_sve2(src, src_stride, im_block, 392 im_stride, w, h, x_filter_ptr); 393 } 394 395 if (conv_params->use_dist_wtd_comp_avg) { 396 highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, 397 w, h, conv_params); 398 399 } else { 400 highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, 401 conv_params); 402 } 403 } else { 404 if (x_filter_taps <= 4) { 405 highbd_12_dist_wtd_convolve_x_4tap_sve2( 406 src + 2, src_stride, dst16, dst16_stride, w, h, x_filter_ptr); 407 } else { 408 highbd_12_dist_wtd_convolve_x_8tap_sve2( 409 src, src_stride, dst16, dst16_stride, w, h, x_filter_ptr); 410 } 411 } 412 } else { 413 if (conv_params->do_average) { 414 if (x_filter_taps <= 4) { 415 highbd_dist_wtd_convolve_x_4tap_sve2(src + 2, src_stride, im_block, 416 im_stride, w, h, x_filter_ptr, bd); 417 } else { 418 highbd_dist_wtd_convolve_x_8tap_sve2(src, src_stride, im_block, 419 im_stride, w, h, x_filter_ptr, bd); 420 } 421 422 if (conv_params->use_dist_wtd_comp_avg) { 423 highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, 424 h, conv_params, bd); 425 } else { 426 highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, 427 conv_params, bd); 428 } 429 } else { 430 if (x_filter_taps <= 4) { 431 highbd_dist_wtd_convolve_x_4tap_sve2( 
432 src + 2, src_stride, dst16, dst16_stride, w, h, x_filter_ptr, bd); 433 } else { 434 highbd_dist_wtd_convolve_x_8tap_sve2( 435 src, src_stride, dst16, dst16_stride, w, h, x_filter_ptr, bd); 436 } 437 } 438 } 439 } 440 441 static inline uint16x4_t highbd_12_convolve8_4_y(int16x8_t samples_lo[2], 442 int16x8_t samples_hi[2], 443 int16x8_t filter, 444 int64x2_t offset) { 445 int64x2_t sum01 = aom_svdot_lane_s16(offset, samples_lo[0], filter, 0); 446 sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1); 447 448 int64x2_t sum23 = aom_svdot_lane_s16(offset, samples_lo[1], filter, 0); 449 sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1); 450 451 int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); 452 453 return vqrshrun_n_s32(sum0123, ROUND0_BITS + 2); 454 } 455 456 static inline uint16x8_t highbd_12_convolve8_8_y(int16x8_t samples_lo[4], 457 int16x8_t samples_hi[4], 458 int16x8_t filter, 459 int64x2_t offset) { 460 int64x2_t sum01 = aom_svdot_lane_s16(offset, samples_lo[0], filter, 0); 461 sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1); 462 463 int64x2_t sum23 = aom_svdot_lane_s16(offset, samples_lo[1], filter, 0); 464 sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1); 465 466 int64x2_t sum45 = aom_svdot_lane_s16(offset, samples_lo[2], filter, 0); 467 sum45 = aom_svdot_lane_s16(sum45, samples_hi[2], filter, 1); 468 469 int64x2_t sum67 = aom_svdot_lane_s16(offset, samples_lo[3], filter, 0); 470 sum67 = aom_svdot_lane_s16(sum67, samples_hi[3], filter, 1); 471 472 int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); 473 int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); 474 475 return vcombine_u16(vqrshrun_n_s32(sum0123, ROUND0_BITS + 2), 476 vqrshrun_n_s32(sum4567, ROUND0_BITS + 2)); 477 } 478 479 static inline void highbd_12_dist_wtd_convolve_y_8tap_sve2( 480 const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, 481 int width, int height, const int16_t 
*y_filter_ptr) { 482 const int64x2_t offset = 483 vdupq_n_s64((1 << (12 + FILTER_BITS)) + (1 << (12 + FILTER_BITS - 1))); 484 const int16x8_t y_filter = vld1q_s16(y_filter_ptr); 485 486 uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl); 487 // Scale indices by size of the true vector length to avoid reading from an 488 // 'undefined' portion of a vector on a system with SVE vectors > 128-bit. 489 uint16x8_t correction0 = 490 vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL)); 491 merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0); 492 uint16x8_t correction1 = 493 vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL)); 494 merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1); 495 496 uint16x8_t correction2 = 497 vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL)); 498 merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2); 499 500 if (width == 4) { 501 int16_t *s = (int16_t *)src; 502 int16x4_t s0, s1, s2, s3, s4, s5, s6; 503 load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); 504 s += 7 * src_stride; 505 506 // This operation combines a conventional transpose and the sample permute 507 // required before computing the dot product. 508 int16x8_t s0123[2], s1234[2], s2345[2], s3456[2]; 509 transpose_concat_elems_s16_4x4(s0, s1, s2, s3, s0123); 510 transpose_concat_elems_s16_4x4(s1, s2, s3, s4, s1234); 511 transpose_concat_elems_s16_4x4(s2, s3, s4, s5, s2345); 512 transpose_concat_elems_s16_4x4(s3, s4, s5, s6, s3456); 513 514 do { 515 int16x4_t s7, s8, s9, s10; 516 load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); 517 518 int16x8_t s4567[2], s5678[2], s6789[2], s789A[2]; 519 // Transpose and shuffle the 4 lines that were loaded. 520 transpose_concat_elems_s16_4x4(s7, s8, s9, s10, s789A); 521 522 // Merge new data into block from previous iteration. 
523 aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[0], s4567); 524 aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[1], s5678); 525 aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[2], s6789); 526 527 uint16x4_t d0 = highbd_12_convolve8_4_y(s0123, s4567, y_filter, offset); 528 uint16x4_t d1 = highbd_12_convolve8_4_y(s1234, s5678, y_filter, offset); 529 uint16x4_t d2 = highbd_12_convolve8_4_y(s2345, s6789, y_filter, offset); 530 uint16x4_t d3 = highbd_12_convolve8_4_y(s3456, s789A, y_filter, offset); 531 532 store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); 533 534 // Prepare block for next iteration - re-using as much as possible. 535 // Shuffle everything up four rows. 536 s0123[0] = s4567[0]; 537 s0123[1] = s4567[1]; 538 s1234[0] = s5678[0]; 539 s1234[1] = s5678[1]; 540 s2345[0] = s6789[0]; 541 s2345[1] = s6789[1]; 542 s3456[0] = s789A[0]; 543 s3456[1] = s789A[1]; 544 545 s += 4 * src_stride; 546 dst += 4 * dst_stride; 547 height -= 4; 548 } while (height != 0); 549 } else { 550 do { 551 int h = height; 552 int16_t *s = (int16_t *)src; 553 uint16_t *d = dst; 554 555 int16x8_t s0, s1, s2, s3, s4, s5, s6; 556 load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); 557 s += 7 * src_stride; 558 559 // This operation combines a conventional transpose and the sample permute 560 // required before computing the dot product. 561 int16x8_t s0123[4], s1234[4], s2345[4], s3456[4]; 562 transpose_concat_elems_s16_8x4(s0, s1, s2, s3, s0123); 563 transpose_concat_elems_s16_8x4(s1, s2, s3, s4, s1234); 564 transpose_concat_elems_s16_8x4(s2, s3, s4, s5, s2345); 565 transpose_concat_elems_s16_8x4(s3, s4, s5, s6, s3456); 566 567 do { 568 int16x8_t s7, s8, s9, s10; 569 load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); 570 int16x8_t s4567[4], s5678[4], s6789[4], s789A[4]; 571 572 // Transpose and shuffle the 4 lines that were loaded. 573 transpose_concat_elems_s16_8x4(s7, s8, s9, s10, s789A); 574 575 // Merge new data into block from previous iteration. 
576 aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[0], s4567); 577 aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[1], s5678); 578 aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[2], s6789); 579 580 uint16x8_t d0 = highbd_12_convolve8_8_y(s0123, s4567, y_filter, offset); 581 uint16x8_t d1 = highbd_12_convolve8_8_y(s1234, s5678, y_filter, offset); 582 uint16x8_t d2 = highbd_12_convolve8_8_y(s2345, s6789, y_filter, offset); 583 uint16x8_t d3 = highbd_12_convolve8_8_y(s3456, s789A, y_filter, offset); 584 585 store_u16_8x4(d, dst_stride, d0, d1, d2, d3); 586 587 // Prepare block for next iteration - re-using as much as possible. 588 // Shuffle everything up four rows. 589 s0123[0] = s4567[0]; 590 s0123[1] = s4567[1]; 591 s0123[2] = s4567[2]; 592 s0123[3] = s4567[3]; 593 s1234[0] = s5678[0]; 594 s1234[1] = s5678[1]; 595 s1234[2] = s5678[2]; 596 s1234[3] = s5678[3]; 597 s2345[0] = s6789[0]; 598 s2345[1] = s6789[1]; 599 s2345[2] = s6789[2]; 600 s2345[3] = s6789[3]; 601 s3456[0] = s789A[0]; 602 s3456[1] = s789A[1]; 603 s3456[2] = s789A[2]; 604 s3456[3] = s789A[3]; 605 606 s += 4 * src_stride; 607 d += 4 * dst_stride; 608 h -= 4; 609 } while (h != 0); 610 src += 8; 611 dst += 8; 612 width -= 8; 613 } while (width != 0); 614 } 615 } 616 617 static inline uint16x4_t highbd_convolve8_4_y(int16x8_t samples_lo[2], 618 int16x8_t samples_hi[2], 619 int16x8_t filter, 620 int64x2_t offset) { 621 int64x2_t sum01 = aom_svdot_lane_s16(offset, samples_lo[0], filter, 0); 622 sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1); 623 624 int64x2_t sum23 = aom_svdot_lane_s16(offset, samples_lo[1], filter, 0); 625 sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1); 626 627 int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); 628 629 return vqrshrun_n_s32(sum0123, ROUND0_BITS); 630 } 631 632 static inline uint16x8_t highbd_convolve8_8_y(int16x8_t samples_lo[4], 633 int16x8_t samples_hi[4], 634 int16x8_t filter, 635 int64x2_t offset) { 636 int64x2_t sum01 = 
aom_svdot_lane_s16(offset, samples_lo[0], filter, 0); 637 sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1); 638 639 int64x2_t sum23 = aom_svdot_lane_s16(offset, samples_lo[1], filter, 0); 640 sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1); 641 642 int64x2_t sum45 = aom_svdot_lane_s16(offset, samples_lo[2], filter, 0); 643 sum45 = aom_svdot_lane_s16(sum45, samples_hi[2], filter, 1); 644 645 int64x2_t sum67 = aom_svdot_lane_s16(offset, samples_lo[3], filter, 0); 646 sum67 = aom_svdot_lane_s16(sum67, samples_hi[3], filter, 1); 647 648 int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); 649 int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); 650 651 return vcombine_u16(vqrshrun_n_s32(sum0123, ROUND0_BITS), 652 vqrshrun_n_s32(sum4567, ROUND0_BITS)); 653 } 654 655 static inline void highbd_dist_wtd_convolve_y_8tap_sve2( 656 const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, 657 int width, int height, const int16_t *y_filter_ptr, const int bd) { 658 const int64x2_t offset = 659 vdupq_n_s64((1 << (bd + FILTER_BITS)) + (1 << (bd + FILTER_BITS - 1))); 660 const int16x8_t y_filter = vld1q_s16(y_filter_ptr); 661 662 uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl); 663 // Scale indices by size of the true vector length to avoid reading from an 664 // 'undefined' portion of a vector on a system with SVE vectors > 128-bit. 
665 uint16x8_t correction0 = 666 vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL)); 667 merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0); 668 uint16x8_t correction1 = 669 vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL)); 670 merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1); 671 672 uint16x8_t correction2 = 673 vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL)); 674 merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2); 675 676 if (width == 4) { 677 int16_t *s = (int16_t *)src; 678 int16x4_t s0, s1, s2, s3, s4, s5, s6; 679 load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); 680 s += 7 * src_stride; 681 682 // This operation combines a conventional transpose and the sample permute 683 // required before computing the dot product. 684 int16x8_t s0123[2], s1234[2], s2345[2], s3456[2]; 685 transpose_concat_elems_s16_4x4(s0, s1, s2, s3, s0123); 686 transpose_concat_elems_s16_4x4(s1, s2, s3, s4, s1234); 687 transpose_concat_elems_s16_4x4(s2, s3, s4, s5, s2345); 688 transpose_concat_elems_s16_4x4(s3, s4, s5, s6, s3456); 689 690 do { 691 int16x4_t s7, s8, s9, s10; 692 load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); 693 694 int16x8_t s4567[2], s5678[2], s6789[2], s789A[2]; 695 // Transpose and shuffle the 4 lines that were loaded. 696 transpose_concat_elems_s16_4x4(s7, s8, s9, s10, s789A); 697 698 // Merge new data into block from previous iteration. 
699 aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[0], s4567); 700 aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[1], s5678); 701 aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[2], s6789); 702 703 uint16x4_t d0 = highbd_convolve8_4_y(s0123, s4567, y_filter, offset); 704 uint16x4_t d1 = highbd_convolve8_4_y(s1234, s5678, y_filter, offset); 705 uint16x4_t d2 = highbd_convolve8_4_y(s2345, s6789, y_filter, offset); 706 uint16x4_t d3 = highbd_convolve8_4_y(s3456, s789A, y_filter, offset); 707 708 store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); 709 710 // Prepare block for next iteration - re-using as much as possible. 711 // Shuffle everything up four rows. 712 s0123[0] = s4567[0]; 713 s0123[1] = s4567[1]; 714 s1234[0] = s5678[0]; 715 s1234[1] = s5678[1]; 716 s2345[0] = s6789[0]; 717 s2345[1] = s6789[1]; 718 s3456[0] = s789A[0]; 719 s3456[1] = s789A[1]; 720 721 s += 4 * src_stride; 722 dst += 4 * dst_stride; 723 height -= 4; 724 } while (height != 0); 725 } else { 726 do { 727 int h = height; 728 int16_t *s = (int16_t *)src; 729 uint16_t *d = dst; 730 731 int16x8_t s0, s1, s2, s3, s4, s5, s6; 732 load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); 733 s += 7 * src_stride; 734 735 // This operation combines a conventional transpose and the sample permute 736 // required before computing the dot product. 737 int16x8_t s0123[4], s1234[4], s2345[4], s3456[4]; 738 transpose_concat_elems_s16_8x4(s0, s1, s2, s3, s0123); 739 transpose_concat_elems_s16_8x4(s1, s2, s3, s4, s1234); 740 transpose_concat_elems_s16_8x4(s2, s3, s4, s5, s2345); 741 transpose_concat_elems_s16_8x4(s3, s4, s5, s6, s3456); 742 743 do { 744 int16x8_t s7, s8, s9, s10; 745 load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); 746 int16x8_t s4567[4], s5678[4], s6789[4], s789A[4]; 747 748 // Transpose and shuffle the 4 lines that were loaded. 749 transpose_concat_elems_s16_8x4(s7, s8, s9, s10, s789A); 750 751 // Merge new data into block from previous iteration. 
752 aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[0], s4567); 753 aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[1], s5678); 754 aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[2], s6789); 755 756 uint16x8_t d0 = highbd_convolve8_8_y(s0123, s4567, y_filter, offset); 757 uint16x8_t d1 = highbd_convolve8_8_y(s1234, s5678, y_filter, offset); 758 uint16x8_t d2 = highbd_convolve8_8_y(s2345, s6789, y_filter, offset); 759 uint16x8_t d3 = highbd_convolve8_8_y(s3456, s789A, y_filter, offset); 760 761 store_u16_8x4(d, dst_stride, d0, d1, d2, d3); 762 763 // Prepare block for next iteration - re-using as much as possible. 764 // Shuffle everything up four rows. 765 s0123[0] = s4567[0]; 766 s0123[1] = s4567[1]; 767 s0123[2] = s4567[2]; 768 s0123[3] = s4567[3]; 769 s1234[0] = s5678[0]; 770 s1234[1] = s5678[1]; 771 s1234[2] = s5678[2]; 772 s1234[3] = s5678[3]; 773 s2345[0] = s6789[0]; 774 s2345[1] = s6789[1]; 775 s2345[2] = s6789[2]; 776 s2345[3] = s6789[3]; 777 s3456[0] = s789A[0]; 778 s3456[1] = s789A[1]; 779 s3456[2] = s789A[2]; 780 s3456[3] = s789A[3]; 781 782 s += 4 * src_stride; 783 d += 4 * dst_stride; 784 h -= 4; 785 } while (h != 0); 786 src += 8; 787 dst += 8; 788 width -= 8; 789 } while (width != 0); 790 } 791 } 792 793 void av1_highbd_dist_wtd_convolve_y_sve2( 794 const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, 795 int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, 796 ConvolveParams *conv_params, int bd) { 797 DECLARE_ALIGNED(16, uint16_t, 798 im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); 799 CONV_BUF_TYPE *dst16 = conv_params->dst; 800 const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); 801 802 if (y_filter_taps != 8) { 803 av1_highbd_dist_wtd_convolve_y_neon(src, src_stride, dst, dst_stride, w, h, 804 filter_params_y, subpel_y_qn, 805 conv_params, bd); 806 return; 807 } 808 809 int dst16_stride = conv_params->dst_stride; 810 const int im_stride = MAX_SB_SIZE; 811 const int 
vert_offset = filter_params_y->taps / 2 - 1; 812 assert(FILTER_BITS == COMPOUND_ROUND1_BITS); 813 814 const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( 815 filter_params_y, subpel_y_qn & SUBPEL_MASK); 816 817 src -= vert_offset * src_stride; 818 819 if (bd == 12) { 820 if (conv_params->do_average) { 821 highbd_12_dist_wtd_convolve_y_8tap_sve2(src, src_stride, im_block, 822 im_stride, w, h, y_filter_ptr); 823 if (conv_params->use_dist_wtd_comp_avg) { 824 highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, 825 w, h, conv_params); 826 } else { 827 highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, 828 conv_params); 829 } 830 } else { 831 highbd_12_dist_wtd_convolve_y_8tap_sve2(src, src_stride, dst16, 832 dst16_stride, w, h, y_filter_ptr); 833 } 834 } else { 835 if (conv_params->do_average) { 836 highbd_dist_wtd_convolve_y_8tap_sve2(src, src_stride, im_block, im_stride, 837 w, h, y_filter_ptr, bd); 838 if (conv_params->use_dist_wtd_comp_avg) { 839 highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, 840 h, conv_params, bd); 841 } else { 842 highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, 843 conv_params, bd); 844 } 845 } else { 846 highbd_dist_wtd_convolve_y_8tap_sve2(src, src_stride, dst16, dst16_stride, 847 w, h, y_filter_ptr, bd); 848 } 849 } 850 } 851 852 static inline void highbd_12_dist_wtd_convolve_2d_horiz_8tap_sve2( 853 const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, 854 int width, int height, const int16_t *x_filter_ptr) { 855 const int64x2_t offset = vdupq_n_s64(1 << (12 + FILTER_BITS - 2)); 856 const int16x8_t filter = vld1q_s16(x_filter_ptr); 857 858 // We are only doing 8-tap and 4-tap vertical convolutions, therefore we know 859 // that im_h % 4 = 3, so we can do the loop across the whole block 4 rows at 860 // a time and then process the last 3 rows separately. 

  // Main loop: 4 rows per iteration, 8 output pixels per inner iteration.
  do {
    const int16_t *s = (const int16_t *)src;
    uint16_t *d = dst;
    int w = width;

    do {
      // Load 8 shifted copies of each row so each vector holds the 8 input
      // samples for one output pixel.
      int16x8_t s0[8], s1[8], s2[8], s3[8];
      load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                   &s0[4], &s0[5], &s0[6], &s0[7]);
      load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                   &s1[4], &s1[5], &s1[6], &s1[7]);
      load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                   &s2[4], &s2[5], &s2[6], &s2[7]);
      load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                   &s3[4], &s3[5], &s3[6], &s3[7]);

      uint16x8_t d0 = highbd_12_convolve8_8_x(s0, filter, offset);
      uint16x8_t d1 = highbd_12_convolve8_8_x(s1, filter, offset);
      uint16x8_t d2 = highbd_12_convolve8_8_x(s2, filter, offset);
      uint16x8_t d3 = highbd_12_convolve8_8_x(s3, filter, offset);

      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

      s += 8;
      d += 8;
      w -= 8;
    } while (w != 0);
    src += 4 * src_stride;
    dst += 4 * dst_stride;
    height -= 4;
  } while (height > 4);

  // Process final 3 rows.
  const int16_t *s = (const int16_t *)src;
  do {
    int16x8_t s0[8], s1[8], s2[8];
    load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4],
                 &s0[5], &s0[6], &s0[7]);
    load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], &s1[4],
                 &s1[5], &s1[6], &s1[7]);
    load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], &s2[4],
                 &s2[5], &s2[6], &s2[7]);

    uint16x8_t d0 = highbd_12_convolve8_8_x(s0, filter, offset);
    uint16x8_t d1 = highbd_12_convolve8_8_x(s1, filter, offset);
    uint16x8_t d2 = highbd_12_convolve8_8_x(s2, filter, offset);

    store_u16_8x3(dst, dst_stride, d0, d1, d2);
    s += 8;
    dst += 8;
    width -= 8;
  } while (width != 0);
}

// Horizontal 8-tap pass of the 2D compound convolution for bd == 8/10; same
// structure as the 12-bit variant above but the compound offset depends on bd.
static inline void highbd_dist_wtd_convolve_2d_horiz_8tap_sve2(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    int width, int height, const int16_t *x_filter_ptr, const int bd) {
  const int64x2_t offset = vdupq_n_s64(1 << (bd + FILTER_BITS - 2));
  const int16x8_t filter = vld1q_s16(x_filter_ptr);

  // We are only doing 8-tap and 4-tap vertical convolutions, therefore we know
  // that im_h % 4 == 3, so we can do the loop across the whole block 4 rows at
  // a time and then process the last 3 rows separately.

  do {
    const int16_t *s = (const int16_t *)src;
    uint16_t *d = dst;
    int w = width;

    do {
      int16x8_t s0[8], s1[8], s2[8], s3[8];
      load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                   &s0[4], &s0[5], &s0[6], &s0[7]);
      load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                   &s1[4], &s1[5], &s1[6], &s1[7]);
      load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                   &s2[4], &s2[5], &s2[6], &s2[7]);
      load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                   &s3[4], &s3[5], &s3[6], &s3[7]);

      uint16x8_t d0 = highbd_convolve8_8_x(s0, filter, offset);
      uint16x8_t d1 = highbd_convolve8_8_x(s1, filter, offset);
      uint16x8_t d2 = highbd_convolve8_8_x(s2, filter, offset);
      uint16x8_t d3 = highbd_convolve8_8_x(s3, filter, offset);

      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

      s += 8;
      d += 8;
      w -= 8;
    } while (w != 0);
    src += 4 * src_stride;
    dst += 4 * dst_stride;
    height -= 4;
  } while (height > 4);

  // Process final 3 rows.
  const int16_t *s = (const int16_t *)src;
  do {
    int16x8_t s0[8], s1[8], s2[8];
    load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4],
                 &s0[5], &s0[6], &s0[7]);
    load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], &s1[4],
                 &s1[5], &s1[6], &s1[7]);
    load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], &s2[4],
                 &s2[5], &s2[6], &s2[7]);

    uint16x8_t d0 = highbd_convolve8_8_x(s0, filter, offset);
    uint16x8_t d1 = highbd_convolve8_8_x(s1, filter, offset);
    uint16x8_t d2 = highbd_convolve8_8_x(s2, filter, offset);

    store_u16_8x3(dst, dst_stride, d0, d1, d2);
    s += 8;
    dst += 8;
    width -= 8;
  } while (width != 0);
}

// Horizontal 4-tap pass of the 12-bit 2D compound convolution. The four taps
// live at x_filter_ptr[2..5]; they are padded with zeros to an 8-tap vector
// so the dot-product kernels can be shared with the 8-tap path.
static inline void highbd_12_dist_wtd_convolve_2d_horiz_4tap_sve2(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    int width, int height, const int16_t *x_filter_ptr) {
  // 4-tap variant uses a larger offset shift than the 8-tap one (-1 vs -2).
  const int64x2_t offset = vdupq_n_s64(1 << (12 + FILTER_BITS - 1));
  const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
  const int16x8_t filter = vcombine_s16(x_filter, vdup_n_s16(0));

  // We are only doing 8-tap and 4-tap vertical convolutions, therefore we know
  // that im_h % 4 == 3, so we can do the loop across the whole block 4 rows at
  // a time and then process the last 3 rows separately.

  if (width == 4) {
    // Narrow blocks: gather the 4 sliding windows per row with a permute.
    uint16x8x2_t permute_tbl = vld1q_u16_x2(kDotProdTbl);

    const int16_t *s = (const int16_t *)(src);

    do {
      int16x8_t s0, s1, s2, s3;
      load_s16_8x4(s, src_stride, &s0, &s1, &s2, &s3);

      uint16x4_t d0 = highbd_12_convolve4_4_x(s0, filter, offset, permute_tbl);
      uint16x4_t d1 = highbd_12_convolve4_4_x(s1, filter, offset, permute_tbl);
      uint16x4_t d2 = highbd_12_convolve4_4_x(s2, filter, offset, permute_tbl);
      uint16x4_t d3 = highbd_12_convolve4_4_x(s3, filter, offset, permute_tbl);

      store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);

      s += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height > 4);

    // Process final 3 rows.
    int16x8_t s0, s1, s2;
    load_s16_8x3(s, src_stride, &s0, &s1, &s2);

    uint16x4_t d0 = highbd_12_convolve4_4_x(s0, filter, offset, permute_tbl);
    uint16x4_t d1 = highbd_12_convolve4_4_x(s1, filter, offset, permute_tbl);
    uint16x4_t d2 = highbd_12_convolve4_4_x(s2, filter, offset, permute_tbl);

    store_u16_4x3(dst, dst_stride, d0, d1, d2);

  } else {
    // Wide blocks: 8 outputs at a time; idx deinterleaves the paired results.
    uint16x8_t idx = vld1q_u16(kDeinterleaveTbl);

    do {
      const int16_t *s = (const int16_t *)(src);
      uint16_t *d = dst;
      int w = width;

      do {
        int16x8_t s0[4], s1[4], s2[4], s3[4];
        load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
        load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
        load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
        load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);

        uint16x8_t d0 = highbd_12_convolve4_8_x(s0, filter, offset, idx);
        uint16x8_t d1 = highbd_12_convolve4_8_x(s1, filter, offset, idx);
        uint16x8_t d2 = highbd_12_convolve4_8_x(s2, filter, offset, idx);
        uint16x8_t d3 = highbd_12_convolve4_8_x(s3, filter, offset, idx);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        w -= 8;
      } while (w != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height > 4);

    // Process final 3 rows.
    const int16_t *s = (const int16_t *)(src);

    do {
      int16x8_t s0[4], s1[4], s2[4];
      load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
      load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
      load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);

      uint16x8_t d0 = highbd_12_convolve4_8_x(s0, filter, offset, idx);
      uint16x8_t d1 = highbd_12_convolve4_8_x(s1, filter, offset, idx);
      uint16x8_t d2 = highbd_12_convolve4_8_x(s2, filter, offset, idx);

      store_u16_8x3(dst, dst_stride, d0, d1, d2);

      s += 8;
      dst += 8;
      width -= 8;
    } while (width != 0);
  }
}

// Horizontal 4-tap pass of the 2D compound convolution for bd == 8/10; same
// structure as the 12-bit variant above but the compound offset depends on bd.
static inline void highbd_dist_wtd_convolve_2d_horiz_4tap_sve2(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    int width, int height, const int16_t *x_filter_ptr, const int bd) {
  const int64x2_t offset = vdupq_n_s64(1 << (bd + FILTER_BITS - 1));
  const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
  const int16x8_t filter = vcombine_s16(x_filter, vdup_n_s16(0));

  // We are only doing 8-tap and 4-tap vertical convolutions, therefore we know
  // that im_h % 4 == 3, so we can do the loop across the whole block 4 rows at
  // a time and then process the last 3 rows separately.

  if (width == 4) {
    uint16x8x2_t permute_tbl = vld1q_u16_x2(kDotProdTbl);

    const int16_t *s = (const int16_t *)(src);

    do {
      int16x8_t s0, s1, s2, s3;
      load_s16_8x4(s, src_stride, &s0, &s1, &s2, &s3);

      uint16x4_t d0 = highbd_convolve4_4_x(s0, filter, offset, permute_tbl);
      uint16x4_t d1 = highbd_convolve4_4_x(s1, filter, offset, permute_tbl);
      uint16x4_t d2 = highbd_convolve4_4_x(s2, filter, offset, permute_tbl);
      uint16x4_t d3 = highbd_convolve4_4_x(s3, filter, offset, permute_tbl);

      store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);

      s += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height > 4);

    // Process final 3 rows.
    int16x8_t s0, s1, s2;
    load_s16_8x3(s, src_stride, &s0, &s1, &s2);

    uint16x4_t d0 = highbd_convolve4_4_x(s0, filter, offset, permute_tbl);
    uint16x4_t d1 = highbd_convolve4_4_x(s1, filter, offset, permute_tbl);
    uint16x4_t d2 = highbd_convolve4_4_x(s2, filter, offset, permute_tbl);

    store_u16_4x3(dst, dst_stride, d0, d1, d2);
  } else {
    uint16x8_t idx = vld1q_u16(kDeinterleaveTbl);

    do {
      const int16_t *s = (const int16_t *)(src);
      uint16_t *d = dst;
      int w = width;

      do {
        int16x8_t s0[4], s1[4], s2[4], s3[4];
        load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
        load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
        load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
        load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);

        uint16x8_t d0 = highbd_convolve4_8_x(s0, filter, offset, idx);
        uint16x8_t d1 = highbd_convolve4_8_x(s1, filter, offset, idx);
        uint16x8_t d2 = highbd_convolve4_8_x(s2, filter, offset, idx);
        uint16x8_t d3 = highbd_convolve4_8_x(s3, filter, offset, idx);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        w -= 8;
      } while (w != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height > 4);

    // Process final 3 rows.
    const int16_t *s = (const int16_t *)(src);

    do {
      int16x8_t s0[4], s1[4], s2[4];
      load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
      load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
      load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);

      uint16x8_t d0 = highbd_convolve4_8_x(s0, filter, offset, idx);
      uint16x8_t d1 = highbd_convolve4_8_x(s1, filter, offset, idx);
      uint16x8_t d2 = highbd_convolve4_8_x(s2, filter, offset, idx);

      store_u16_8x3(dst, dst_stride, d0, d1, d2);

      s += 8;
      dst += 8;
      width -= 8;
    } while (width != 0);
  }
}

// Second-pass (vertical) 8-tap convolution producing 4 output pixels.
// samples_lo/hi hold the transposed inputs; the 8 taps are applied as two
// 4-tap dot products, with lanes 0 and 1 of `filter` selecting the low and
// high tap halves. The result is narrowed with a rounding shift by
// COMPOUND_ROUND1_BITS and saturated to unsigned 16 bits.
static inline uint16x4_t highbd_convolve8_4_2d_v(int16x8_t samples_lo[2],
                                                 int16x8_t samples_hi[2],
                                                 int16x8_t filter,
                                                 int64x2_t offset) {
  int64x2_t sum01 = aom_svdot_lane_s16(offset, samples_lo[0], filter, 0);
  sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1);

  int64x2_t sum23 = aom_svdot_lane_s16(offset, samples_lo[1], filter, 0);
  sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1);

  // NOTE(review): vmovn_s64 truncates 64 -> 32 bits; presumably the compound
  // offset/rounding bounds guarantee the sums fit in 32 bits — confirm.
  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));

  return vqrshrun_n_s32(sum0123, COMPOUND_ROUND1_BITS);
}

// Second-pass (vertical) 8-tap convolution producing 8 output pixels; see
// highbd_convolve8_4_2d_v for the lane/tap layout.
static inline uint16x8_t highbd_convolve8_8_2d_v(int16x8_t samples_lo[4],
                                                 int16x8_t samples_hi[4],
                                                 int16x8_t filter,
                                                 int64x2_t offset) {
  int64x2_t sum01 = aom_svdot_lane_s16(offset, samples_lo[0], filter, 0);
  sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1);

  int64x2_t sum23 = aom_svdot_lane_s16(offset, samples_lo[1], filter, 0);
  sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1);

  int64x2_t sum45 = aom_svdot_lane_s16(offset, samples_lo[2], filter, 0);
  sum45 = aom_svdot_lane_s16(sum45, samples_hi[2], filter, 1);

  int64x2_t sum67 = aom_svdot_lane_s16(offset, samples_lo[3], filter, 0);
  sum67 = aom_svdot_lane_s16(sum67, samples_hi[3], filter, 1);

  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
  int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67));

  return vcombine_u16(vqrshrun_n_s32(sum0123, COMPOUND_ROUND1_BITS),
                      vqrshrun_n_s32(sum4567, COMPOUND_ROUND1_BITS));
}

// Vertical 8-tap pass of the 2D compound convolution. `offset` is the
// pre-computed compound rounding offset added before the final shift.
static inline void highbd_dist_wtd_convolve_2d_vert_8tap_sve2(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    int width, int height, const int16_t *y_filter_ptr, int offset) {
  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
  const int64x2_t offset_s64 = vdupq_n_s64(offset);

  uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl);
  // Scale indices by size of the true vector length to avoid reading from an
  // 'undefined' portion of a vector on a system with SVE vectors > 128-bit.
  uint16x8_t correction0 =
      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL));
  merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0);

  uint16x8_t correction1 =
      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL));
  merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1);

  uint16x8_t correction2 =
      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL));
  merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2);

  if (width == 4) {
    int16_t *s = (int16_t *)src;
    int16x4_t s0, s1, s2, s3, s4, s5, s6;
    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
    s += 7 * src_stride;

    // This operation combines a conventional transpose and the sample permute
    // required before computing the dot product.
    int16x8_t s0123[2], s1234[2], s2345[2], s3456[2];
    transpose_concat_elems_s16_4x4(s0, s1, s2, s3, s0123);
    transpose_concat_elems_s16_4x4(s1, s2, s3, s4, s1234);
    transpose_concat_elems_s16_4x4(s2, s3, s4, s5, s2345);
    transpose_concat_elems_s16_4x4(s3, s4, s5, s6, s3456);

    do {
      int16x4_t s7, s8, s9, s10;
      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);

      int16x8_t s4567[2], s5678[2], s6789[2], s789A[2];
      // Transpose and shuffle the 4 lines that were loaded.
      transpose_concat_elems_s16_4x4(s7, s8, s9, s10, s789A);

      // Merge new data into block from previous iteration.
      aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[0], s4567);
      aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[1], s5678);
      aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[2], s6789);

      uint16x4_t d0 =
          highbd_convolve8_4_2d_v(s0123, s4567, y_filter, offset_s64);
      uint16x4_t d1 =
          highbd_convolve8_4_2d_v(s1234, s5678, y_filter, offset_s64);
      uint16x4_t d2 =
          highbd_convolve8_4_2d_v(s2345, s6789, y_filter, offset_s64);
      uint16x4_t d3 =
          highbd_convolve8_4_2d_v(s3456, s789A, y_filter, offset_s64);

      store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);

      // Prepare block for next iteration - re-using as much as possible.
      // Shuffle everything up four rows.
      s0123[0] = s4567[0];
      s0123[1] = s4567[1];
      s1234[0] = s5678[0];
      s1234[1] = s5678[1];
      s2345[0] = s6789[0];
      s2345[1] = s6789[1];
      s3456[0] = s789A[0];
      s3456[1] = s789A[1];

      s += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
  } else {
    // Wide blocks: process 8-pixel columns, 4 rows per iteration.
    do {
      int h = height;
      int16_t *s = (int16_t *)src;
      uint16_t *d = dst;

      int16x8_t s0, s1, s2, s3, s4, s5, s6;
      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
      s += 7 * src_stride;

      // This operation combines a conventional transpose and the sample permute
      // required before computing the dot product.
      int16x8_t s0123[4], s1234[4], s2345[4], s3456[4];
      transpose_concat_elems_s16_8x4(s0, s1, s2, s3, s0123);
      transpose_concat_elems_s16_8x4(s1, s2, s3, s4, s1234);
      transpose_concat_elems_s16_8x4(s2, s3, s4, s5, s2345);
      transpose_concat_elems_s16_8x4(s3, s4, s5, s6, s3456);

      do {
        int16x8_t s7, s8, s9, s10;
        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
        int16x8_t s4567[4], s5678[4], s6789[4], s789A[4];

        // Transpose and shuffle the 4 lines that were loaded.
        transpose_concat_elems_s16_8x4(s7, s8, s9, s10, s789A);

        // Merge new data into block from previous iteration.
        aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[0], s4567);
        aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[1], s5678);
        aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[2], s6789);

        uint16x8_t d0 =
            highbd_convolve8_8_2d_v(s0123, s4567, y_filter, offset_s64);
        uint16x8_t d1 =
            highbd_convolve8_8_2d_v(s1234, s5678, y_filter, offset_s64);
        uint16x8_t d2 =
            highbd_convolve8_8_2d_v(s2345, s6789, y_filter, offset_s64);
        uint16x8_t d3 =
            highbd_convolve8_8_2d_v(s3456, s789A, y_filter, offset_s64);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        // Prepare block for next iteration - re-using as much as possible.
        // Shuffle everything up four rows.
        s0123[0] = s4567[0];
        s0123[1] = s4567[1];
        s0123[2] = s4567[2];
        s0123[3] = s4567[3];
        s1234[0] = s5678[0];
        s1234[1] = s5678[1];
        s1234[2] = s5678[2];
        s1234[3] = s5678[3];
        s2345[0] = s6789[0];
        s2345[1] = s6789[1];
        s2345[2] = s6789[2];
        s2345[3] = s6789[3];
        s3456[0] = s789A[0];
        s3456[1] = s789A[1];
        s3456[2] = s789A[2];
        s3456[3] = s789A[3];

        s += 4 * src_stride;
        d += 4 * dst_stride;
        h -= 4;
      } while (h != 0);
      src += 8;
      dst += 8;
      width -= 8;
    } while (width != 0);
  }
}

// Vertical 4-tap convolution of 4 pixels for the 2D compound path, using
// widening multiply-accumulates against the 4 taps in `filter`.
static inline uint16x4_t highbd_convolve4_4_2d_v(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t filter, const int32x4_t offset) {
  int32x4_t sum = vmlal_lane_s16(offset, s0, filter, 0);
  sum = vmlal_lane_s16(sum, s1, filter, 1);
  sum = vmlal_lane_s16(sum, s2, filter, 2);
  sum = vmlal_lane_s16(sum, s3, filter, 3);

  return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
}

// Vertical 4-tap convolution of 8 pixels for the 2D compound path; low and
// high halves are accumulated separately and recombined after rounding.
static inline uint16x8_t highbd_convolve4_8_2d_v(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x4_t filter, const int32x4_t offset) {
  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter, 3);

  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter, 3);

  return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
                      vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
}

// Vertical 4-tap pass of the 2D compound convolution (plain Neon; no SVE2
// dot product needed for 4 taps). `offset` is the compound rounding offset.
static inline void highbd_dist_wtd_convolve_2d_vert_4tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, const int offset) {
  // The 4 active taps sit in the middle of the 8-tap kernel array.
  const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2);
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  if (w == 4) {
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    int16x4_t s0, s1, s2;
    load_s16_4x3(s, src_stride, &s0, &s1, &s2);
    s += 3 * src_stride;

    do {
      int16x4_t s3, s4, s5, s6;
      load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6);

      uint16x4_t d0 =
          highbd_convolve4_4_2d_v(s0, s1, s2, s3, y_filter, offset_vec);
      uint16x4_t d1 =
          highbd_convolve4_4_2d_v(s1, s2, s3, s4, y_filter, offset_vec);
      uint16x4_t d2 =
          highbd_convolve4_4_2d_v(s2, s3, s4, s5, y_filter, offset_vec);
      uint16x4_t d3 =
          highbd_convolve4_4_2d_v(s3, s4, s5, s6, y_filter, offset_vec);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      // Slide the three-row history window down four rows.
      s0 = s4;
      s1 = s5;
      s2 = s6;

      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    do {
      int height = h;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      int16x8_t s0, s1, s2;
      load_s16_8x3(s, src_stride, &s0, &s1, &s2);
      s += 3 * src_stride;

      do {
        int16x8_t s3, s4, s5, s6;
        load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6);

        uint16x8_t d0 =
            highbd_convolve4_8_2d_v(s0, s1, s2, s3, y_filter, offset_vec);
        uint16x8_t d1 =
            highbd_convolve4_8_2d_v(s1, s2, s3, s4, y_filter, offset_vec);
        uint16x8_t d2 =
            highbd_convolve4_8_2d_v(s2, s3, s4, s5, y_filter, offset_vec);
        uint16x8_t d3 =
            highbd_convolve4_8_2d_v(s3, s4, s5, s6, y_filter, offset_vec);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;

        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}

// 2D (horizontal then vertical) high-bitdepth distance-weighted compound
// convolution entry point. Picks 4- or 8-tap kernels per direction, with a
// Neon fallback for 6-tap filters which have no SVE2 path here.
void av1_highbd_dist_wtd_convolve_2d_sve2(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  // im_block: output of the horizontal pass; im_block2: output of the
  // vertical pass when a final compound average into dst is still required.
  DECLARE_ALIGNED(16, uint16_t,
                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
  DECLARE_ALIGNED(16, uint16_t,
                  im_block2[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);

  CONV_BUF_TYPE *dst16 = conv_params->dst;
  int dst16_stride = conv_params->dst_stride;
  const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
  // Filters shorter than 4 taps are run through the 4-tap kernels.
  const int clamped_x_taps = x_filter_taps < 4 ? 4 : x_filter_taps;

  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
  const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps;

  if (x_filter_taps == 6 || y_filter_taps == 6) {
    av1_highbd_dist_wtd_convolve_2d_neon(
        src, src_stride, dst, dst_stride, w, h, filter_params_x,
        filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd);
    return;
  }

  // The intermediate block is taller than the output by the vertical
  // filter support minus one.
  const int im_h = h + clamped_y_taps - 1;
  const int im_stride = MAX_SB_SIZE;
  const int vert_offset = clamped_y_taps / 2 - 1;
  const int horiz_offset = clamped_x_taps / 2 - 1;
  const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const int round_offset_conv_y = (1 << y_offset_bits);

  const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset;

  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  // Horizontal pass into im_block.
  if (bd == 12) {
    if (x_filter_taps <= 4) {
      highbd_12_dist_wtd_convolve_2d_horiz_4tap_sve2(
          src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr);
    } else {
      highbd_12_dist_wtd_convolve_2d_horiz_8tap_sve2(
          src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr);
    }
  } else {
    if (x_filter_taps <= 4) {
      highbd_dist_wtd_convolve_2d_horiz_4tap_sve2(
          src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr, bd);
    } else {
      highbd_dist_wtd_convolve_2d_horiz_8tap_sve2(
          src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr, bd);
    }
  }

  // Vertical pass, then (optionally) compound averaging into dst.
  if (conv_params->do_average) {
    if (y_filter_taps <= 4) {
      highbd_dist_wtd_convolve_2d_vert_4tap_neon(im_block, im_stride, im_block2,
                                                 im_stride, w, h, y_filter_ptr,
                                                 round_offset_conv_y);
    } else {
      highbd_dist_wtd_convolve_2d_vert_8tap_sve2(im_block, im_stride, im_block2,
                                                 im_stride, w, h, y_filter_ptr,
                                                 round_offset_conv_y);
    }
    if (conv_params->use_dist_wtd_comp_avg) {
      if (bd == 12) {
        highbd_12_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride,
                                         w, h, conv_params);

      } else {
        highbd_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w,
                                      h, conv_params, bd);
      }
    } else {
      if (bd == 12) {
        highbd_12_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h,
                                conv_params);

      } else {
        highbd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h,
                             conv_params, bd);
      }
    }
  } else {
    // First compound pass: write straight into the conv buffer.
    if (y_filter_taps <= 4) {
      highbd_dist_wtd_convolve_2d_vert_4tap_neon(
          im_block, im_stride, dst16, dst16_stride, w, h, y_filter_ptr,
          round_offset_conv_y);
    } else {
      highbd_dist_wtd_convolve_2d_vert_8tap_sve2(
          im_block, im_stride, dst16, dst16_stride, w, h, y_filter_ptr,
          round_offset_conv_y);
    }
  }
}