highbd_convolve_sve2.c (68969B)
/*
 * Copyright (c) 2024, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <arm_neon.h>

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/arm/aom_neon_sve_bridge.h"
#include "aom_dsp/arm/aom_neon_sve2_bridge.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_ports/mem.h"
#include "av1/common/convolve.h"
#include "av1/common/filter.h"
#include "av1/common/arm/highbd_convolve_sve2.h"

DECLARE_ALIGNED(16, static const uint16_t, kDotProdTbl[32]) = {
  0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
  4, 5, 6, 7, 5, 6, 7, 0, 6, 7, 0, 1, 7, 0, 1, 2,
};

static inline uint16x4_t convolve12_4_x(
    int16x8_t s0, int16x8_t s1, int16x8_t filter_0_7, int16x8_t filter_4_11,
    const int64x2_t offset, uint16x8x4_t permute_tbl, uint16x4_t max) {
  int16x8_t permuted_samples[6];
  permuted_samples[0] = aom_tbl_s16(s0, permute_tbl.val[0]);
  permuted_samples[1] = aom_tbl_s16(s0, permute_tbl.val[1]);
  permuted_samples[2] = aom_tbl2_s16(s0, s1, permute_tbl.val[2]);
  permuted_samples[3] = aom_tbl2_s16(s0, s1, permute_tbl.val[3]);
  permuted_samples[4] = aom_tbl_s16(s1, permute_tbl.val[0]);
  permuted_samples[5] = aom_tbl_s16(s1, permute_tbl.val[1]);

  int64x2_t sum01 =
      aom_svdot_lane_s16(offset, permuted_samples[0], filter_0_7, 0);
  sum01 = aom_svdot_lane_s16(sum01, permuted_samples[2], filter_0_7, 1);
  sum01 = aom_svdot_lane_s16(sum01, permuted_samples[4], filter_4_11, 1);

  int64x2_t sum23 =
      aom_svdot_lane_s16(offset, permuted_samples[1], filter_0_7, 0);
  sum23 = aom_svdot_lane_s16(sum23, permuted_samples[3], filter_0_7, 1);
  sum23 = aom_svdot_lane_s16(sum23, permuted_samples[5], filter_4_11, 1);

  int32x4_t res0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
  uint16x4_t res = vqrshrun_n_s32(res0123, FILTER_BITS);

  return vmin_u16(res, max);
}

static inline uint16x8_t convolve12_8_x(int16x8_t s0, int16x8_t s1,
                                        int16x8_t s2, int16x8_t filter_0_7,
                                        int16x8_t filter_4_11, int64x2_t offset,
                                        uint16x8x4_t permute_tbl,
                                        uint16x8_t max) {
  int16x8_t permuted_samples[8];
  permuted_samples[0] = aom_tbl_s16(s0, permute_tbl.val[0]);
  permuted_samples[1] = aom_tbl_s16(s0, permute_tbl.val[1]);
  permuted_samples[2] = aom_tbl2_s16(s0, s1, permute_tbl.val[2]);
  permuted_samples[3] = aom_tbl2_s16(s0, s1, permute_tbl.val[3]);
  permuted_samples[4] = aom_tbl_s16(s1, permute_tbl.val[0]);
  permuted_samples[5] = aom_tbl_s16(s1, permute_tbl.val[1]);
  permuted_samples[6] = aom_tbl2_s16(s1, s2, permute_tbl.val[2]);
  permuted_samples[7] = aom_tbl2_s16(s1, s2, permute_tbl.val[3]);

  int64x2_t sum01 =
      aom_svdot_lane_s16(offset, permuted_samples[0], filter_0_7, 0);
  sum01 = aom_svdot_lane_s16(sum01, permuted_samples[2], filter_0_7, 1);
  sum01 = aom_svdot_lane_s16(sum01, permuted_samples[4], filter_4_11, 1);

  int64x2_t sum23 =
      aom_svdot_lane_s16(offset, permuted_samples[1], filter_0_7, 0);
  sum23 = aom_svdot_lane_s16(sum23, permuted_samples[3], filter_0_7, 1);
  sum23 = aom_svdot_lane_s16(sum23, permuted_samples[5], filter_4_11, 1);

  int64x2_t sum45 =
      aom_svdot_lane_s16(offset, permuted_samples[2], filter_0_7, 0);
  sum45 = aom_svdot_lane_s16(sum45, permuted_samples[4], filter_0_7, 1);
  sum45 = aom_svdot_lane_s16(sum45, permuted_samples[6], filter_4_11, 1);

  int64x2_t sum67 =
      aom_svdot_lane_s16(offset, permuted_samples[3], filter_0_7, 0);
  sum67 = aom_svdot_lane_s16(sum67, permuted_samples[5], filter_0_7, 1);
  sum67 = aom_svdot_lane_s16(sum67, permuted_samples[7], filter_4_11, 1);

  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
  int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67));

  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS),
                                vqrshrun_n_s32(sum4567, FILTER_BITS));

  return vminq_u16(res, max);
}

static inline void highbd_convolve_x_sr_12tap_sve2(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    int width, int height, const int16_t *y_filter_ptr,
    ConvolveParams *conv_params, int bd) {
  // This shim allows us to do only one rounding shift instead of two.
  const int64x2_t offset = vdupq_n_s64(1 << (conv_params->round_0 - 1));

  const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
  const int16x8_t y_filter_4_11 = vld1q_s16(y_filter_ptr + 4);

  uint16x8x4_t permute_tbl = vld1q_u16_x4(kDotProdTbl);
  // Scale indices by size of the true vector length to avoid reading from an
  // 'undefined' portion of a vector on a system with SVE vectors > 128-bit.
  uint16x8_t correction0 = vreinterpretq_u16_u64(vcombine_u64(
      vdup_n_u64(0), vdup_n_u64(svcnth() * 0x0001000000000000ULL)));
  permute_tbl.val[2] = vaddq_u16(permute_tbl.val[2], correction0);

  uint16x8_t correction1 = vreinterpretq_u16_u64(
      vcombine_u64(vdup_n_u64(svcnth() * 0x0001000100000000ULL),
                   vdup_n_u64(svcnth() * 0x0001000100010000ULL)));
  permute_tbl.val[3] = vaddq_u16(permute_tbl.val[3], correction1);

  if (width == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src;

    do {
      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
      load_s16_8x4(s, src_stride, &s0, &s2, &s4, &s6);
      load_s16_8x4(s + 8, src_stride, &s1, &s3, &s5, &s7);

      uint16x4_t d0 = convolve12_4_x(s0, s1, y_filter_0_7, y_filter_4_11,
                                     offset, permute_tbl, max);
      uint16x4_t d1 = convolve12_4_x(s2, s3, y_filter_0_7, y_filter_4_11,
                                     offset, permute_tbl, max);
      uint16x4_t d2 = convolve12_4_x(s4, s5, y_filter_0_7, y_filter_4_11,
                                     offset, permute_tbl, max);
      uint16x4_t d3 = convolve12_4_x(s6, s7, y_filter_0_7, y_filter_4_11,
                                     offset, permute_tbl, max);

      store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);

      s += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    do {
      const int16_t *s = (const int16_t *)src;
      uint16_t *d = dst;
      int w = width;

      do {
        int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11;
        load_s16_8x4(s, src_stride, &s0, &s3, &s6, &s9);
        load_s16_8x4(s + 8, src_stride, &s1, &s4, &s7, &s10);
        load_s16_8x4(s + 16, src_stride, &s2, &s5, &s8, &s11);

        uint16x8_t d0 = convolve12_8_x(s0, s1, s2, y_filter_0_7, y_filter_4_11,
                                       offset, permute_tbl, max);
        uint16x8_t d1 = convolve12_8_x(s3, s4, s5, y_filter_0_7, y_filter_4_11,
                                       offset, permute_tbl, max);
        uint16x8_t d2 = convolve12_8_x(s6, s7, s8, y_filter_0_7, y_filter_4_11,
                                       offset, permute_tbl, max);
        uint16x8_t d3 = convolve12_8_x(s9, s10, s11, y_filter_0_7,
                                       y_filter_4_11, offset, permute_tbl, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        w -= 8;
      } while (w != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
  }
}

static inline uint16x8_t convolve8_8_x(int16x8_t s0[8], int16x8_t filter,
                                       int64x2_t offset, uint16x8_t max) {
  int64x2_t sum[8];
  sum[0] = aom_sdotq_s16(offset, s0[0], filter);
  sum[1] = aom_sdotq_s16(offset, s0[1], filter);
  sum[2] = aom_sdotq_s16(offset, s0[2], filter);
  sum[3] = aom_sdotq_s16(offset, s0[3], filter);
  sum[4] = aom_sdotq_s16(offset, s0[4], filter);
  sum[5] = aom_sdotq_s16(offset, s0[5], filter);
  sum[6] = aom_sdotq_s16(offset, s0[6], filter);
  sum[7] = aom_sdotq_s16(offset, s0[7], filter);

  sum[0] = vpaddq_s64(sum[0], sum[1]);
  sum[2] = vpaddq_s64(sum[2], sum[3]);
  sum[4] = vpaddq_s64(sum[4], sum[5]);
  sum[6] = vpaddq_s64(sum[6], sum[7]);

  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[2]));
  int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum[4]), vmovn_s64(sum[6]));

  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS),
                                vqrshrun_n_s32(sum4567, FILTER_BITS));

  return vminq_u16(res, max);
}

static inline void highbd_convolve_x_sr_8tap_sve2(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    int width, int height, const int16_t *y_filter_ptr,
    ConvolveParams *conv_params, int bd) {
  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
  // This shim allows us to do only one rounding shift instead of two.
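  // With ROUND(x, n) = (x + (1 << (n - 1))) >> n, the reference code computes
  // ROUND(ROUND(sum, round_0), FILTER_BITS - round_0). Seeding the accumulator
  // with 1 << (round_0 - 1) folds the first rounding into the dot product, so
  // the single rounding shift by FILTER_BITS in convolve8_8_x (vqrshrun_n_s32)
  // gives the same result, since
  //   ROUND(ROUND(x, r), s) == (x + (1 << (r - 1)) + (1 << (r + s - 1))) >> (r + s)
  // and r + s == FILTER_BITS here.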
  const int64_t offset = 1 << (conv_params->round_0 - 1);
  const int64x2_t offset_lo = vcombine_s64((int64x1_t)(offset), vdup_n_s64(0));

  const int16x8_t filter = vld1q_s16(y_filter_ptr);

  do {
    const int16_t *s = (const int16_t *)src;
    uint16_t *d = dst;
    int w = width;

    do {
      int16x8_t s0[8], s1[8], s2[8], s3[8];
      load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                   &s0[4], &s0[5], &s0[6], &s0[7]);
      load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                   &s1[4], &s1[5], &s1[6], &s1[7]);
      load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                   &s2[4], &s2[5], &s2[6], &s2[7]);
      load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                   &s3[4], &s3[5], &s3[6], &s3[7]);

      uint16x8_t d0 = convolve8_8_x(s0, filter, offset_lo, max);
      uint16x8_t d1 = convolve8_8_x(s1, filter, offset_lo, max);
      uint16x8_t d2 = convolve8_8_x(s2, filter, offset_lo, max);
      uint16x8_t d3 = convolve8_8_x(s3, filter, offset_lo, max);

      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

      s += 8;
      d += 8;
      w -= 8;
    } while (w != 0);
    src += 4 * src_stride;
    dst += 4 * dst_stride;
    height -= 4;
  } while (height != 0);
}

// clang-format off
DECLARE_ALIGNED(16, static const uint16_t, kDeinterleaveTbl[8]) = {
  0, 2, 4, 6, 1, 3, 5, 7,
};
// clang-format on

static inline uint16x4_t convolve4_4_x(int16x8_t s0, int16x8_t filter,
                                       int64x2_t offset,
                                       uint16x8x2_t permute_tbl,
                                       uint16x4_t max) {
  int16x8_t permuted_samples0 = aom_tbl_s16(s0, permute_tbl.val[0]);
  int16x8_t permuted_samples1 = aom_tbl_s16(s0, permute_tbl.val[1]);

  int64x2_t sum01 = aom_svdot_lane_s16(offset, permuted_samples0, filter, 0);
  int64x2_t sum23 = aom_svdot_lane_s16(offset, permuted_samples1, filter, 0);

  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
  uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS);

  return vmin_u16(res, max);
}

static inline uint16x8_t convolve4_8_x(int16x8_t s0[4], int16x8_t filter,
                                       int64x2_t offset, uint16x8_t tbl,
                                       uint16x8_t max) {
  int64x2_t sum04 = aom_svdot_lane_s16(offset, s0[0], filter, 0);
  int64x2_t sum15 = aom_svdot_lane_s16(offset, s0[1], filter, 0);
  int64x2_t sum26 = aom_svdot_lane_s16(offset, s0[2], filter, 0);
  int64x2_t sum37 = aom_svdot_lane_s16(offset, s0[3], filter, 0);

  int32x4_t sum0415 = vcombine_s32(vmovn_s64(sum04), vmovn_s64(sum15));
  int32x4_t sum2637 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37));

  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0415, FILTER_BITS),
                                vqrshrun_n_s32(sum2637, FILTER_BITS));
  res = aom_tbl_u16(res, tbl);

  return vminq_u16(res, max);
}

static inline void highbd_convolve_x_sr_4tap_sve2(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    int width, int height, const int16_t *x_filter_ptr,
    ConvolveParams *conv_params, int bd) {
  // This shim allows us to do only one rounding shift instead of two.
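  // Only the middle four taps of the 8-tap coefficient array are used for
  // 4-tap kernels, so the filter is loaded from x_filter_ptr + 2 below and
  // zero-padded; the caller compensates by passing src already advanced by 2.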
  const int64x2_t offset = vdupq_n_s64(1 << (conv_params->round_0 - 1));

  const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
  const int16x8_t filter = vcombine_s16(x_filter, vdup_n_s16(0));

  if (width == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    uint16x8x2_t permute_tbl = vld1q_u16_x2(kDotProdTbl);

    const int16_t *s = (const int16_t *)(src);

    do {
      int16x8_t s0, s1, s2, s3;
      load_s16_8x4(s, src_stride, &s0, &s1, &s2, &s3);

      uint16x4_t d0 = convolve4_4_x(s0, filter, offset, permute_tbl, max);
      uint16x4_t d1 = convolve4_4_x(s1, filter, offset, permute_tbl, max);
      uint16x4_t d2 = convolve4_4_x(s2, filter, offset, permute_tbl, max);
      uint16x4_t d3 = convolve4_4_x(s3, filter, offset, permute_tbl, max);

      store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);

      s += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
    uint16x8_t idx = vld1q_u16(kDeinterleaveTbl);

    do {
      const int16_t *s = (const int16_t *)(src);
      uint16_t *d = dst;
      int w = width;

      do {
        int16x8_t s0[4], s1[4], s2[4], s3[4];
        load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
        load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
        load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
        load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);

        uint16x8_t d0 = convolve4_8_x(s0, filter, offset, idx, max);
        uint16x8_t d1 = convolve4_8_x(s1, filter, offset, idx, max);
        uint16x8_t d2 = convolve4_8_x(s2, filter, offset, idx, max);
        uint16x8_t d3 = convolve4_8_x(s3, filter, offset, idx, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        w -= 8;
      } while (w != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
  }
}

void av1_highbd_convolve_x_sr_sve2(const uint16_t *src, int src_stride,
                                   uint16_t *dst, int dst_stride, int w, int h,
                                   const InterpFilterParams *filter_params_x,
                                   const int subpel_x_qn,
                                   ConvolveParams *conv_params, int bd) {
  if (w == 2 || h == 2) {
    av1_highbd_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
                               filter_params_x, subpel_x_qn, conv_params, bd);
    return;
  }

  const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);

  if (x_filter_taps == 6) {
    av1_highbd_convolve_x_sr_neon(src, src_stride, dst, dst_stride, w, h,
                                  filter_params_x, subpel_x_qn, conv_params,
                                  bd);
    return;
  }

  const int horiz_offset = filter_params_x->taps / 2 - 1;
  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  src -= horiz_offset;

  if (x_filter_taps == 12) {
    highbd_convolve_x_sr_12tap_sve2(src, src_stride, dst, dst_stride, w, h,
                                    x_filter_ptr, conv_params, bd);
    return;
  }

  if (x_filter_taps == 8) {
    highbd_convolve_x_sr_8tap_sve2(src, src_stride, dst, dst_stride, w, h,
                                   x_filter_ptr, conv_params, bd);
    return;
  }

  highbd_convolve_x_sr_4tap_sve2(src + 2, src_stride, dst, dst_stride, w, h,
                                 x_filter_ptr, conv_params, bd);
}

static inline uint16x4_t highbd_convolve12_4_y(int16x8_t s0[2],
                                               int16x8_t s1[2],
                                               int16x8_t s2[2],
                                               int16x8_t filter_0_7,
                                               int16x8_t filter_4_11,
                                               uint16x4_t max) {
  int64x2_t sum[2];

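  // s0 holds rows 0-3, s1 rows 4-7 and s2 rows 8-11 of the 12-tap window for
  // each output pixel. Lane group 0 of filter_0_7 carries taps 0-3, lane group
  // 1 carries taps 4-7, and lane group 1 of filter_4_11 carries taps 8-11,
  // matching the three dot products below.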
  sum[0] = aom_svdot_lane_s16(vdupq_n_s64(0), s0[0], filter_0_7, 0);
  sum[0] = aom_svdot_lane_s16(sum[0], s1[0], filter_0_7, 1);
  sum[0] = aom_svdot_lane_s16(sum[0], s2[0], filter_4_11, 1);

  sum[1] = aom_svdot_lane_s16(vdupq_n_s64(0), s0[1], filter_0_7, 0);
  sum[1] = aom_svdot_lane_s16(sum[1], s1[1], filter_0_7, 1);
  sum[1] = aom_svdot_lane_s16(sum[1], s2[1], filter_4_11, 1);

  int32x4_t res_s32 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[1]));

  uint16x4_t res = vqrshrun_n_s32(res_s32, FILTER_BITS);

  return vmin_u16(res, max);
}

static inline void highbd_convolve_y_sr_12tap_sve2(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    int width, int height, const int16_t *y_filter_ptr, int bd) {
  const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
  const int16x8_t y_filter_4_11 = vld1q_s16(y_filter_ptr + 4);

  uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl);
  // Scale indices by size of the true vector length to avoid reading from an
  // 'undefined' portion of a vector on a system with SVE vectors > 128-bit.
  uint16x8_t correction0 =
      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL));
  merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0);

  uint16x8_t correction1 =
      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL));
  merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1);

  uint16x8_t correction2 =
      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL));
  merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2);

  const uint16x4_t max = vdup_n_u16((1 << bd) - 1);

  do {
    int16_t *s = (int16_t *)src;
    uint16_t *d = dst;
    int h = height;

    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA;
    load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
                  &s9, &sA);
    s += 11 * src_stride;

    int16x8_t s0123[2], s1234[2], s2345[2], s3456[2], s4567[2], s5678[2],
        s6789[2], s789A[2];
    transpose_concat_elems_s16_4x4(s0, s1, s2, s3, s0123);
    transpose_concat_elems_s16_4x4(s1, s2, s3, s4, s1234);
    transpose_concat_elems_s16_4x4(s2, s3, s4, s5, s2345);
    transpose_concat_elems_s16_4x4(s3, s4, s5, s6, s3456);
    transpose_concat_elems_s16_4x4(s4, s5, s6, s7, s4567);
    transpose_concat_elems_s16_4x4(s5, s6, s7, s8, s5678);
    transpose_concat_elems_s16_4x4(s6, s7, s8, s9, s6789);
    transpose_concat_elems_s16_4x4(s7, s8, s9, sA, s789A);

    do {
      int16x4_t sB, sC, sD, sE;
      load_s16_4x4(s, src_stride, &sB, &sC, &sD, &sE);

      int16x8_t s89AB[2], s9ABC[2], sABCD[2], sBCDE[2];
      transpose_concat_elems_s16_4x4(sB, sC, sD, sE, sBCDE);

      // Use the above transpose and reuse data from the previous loop to get
      // the rest.
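      // Each merge table builds one of s89AB, s9ABC and sABCD by sliding the
      // already-transposed s789A block up by one, two or three rows and
      // pulling the remaining rows from the freshly transposed sBCDE, so only
      // one transpose is needed per four new rows.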
      aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[0], s89AB);
      aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[1], s9ABC);
      aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[2], sABCD);

      uint16x4_t d0 = highbd_convolve12_4_y(s0123, s4567, s89AB, y_filter_0_7,
                                            y_filter_4_11, max);
      uint16x4_t d1 = highbd_convolve12_4_y(s1234, s5678, s9ABC, y_filter_0_7,
                                            y_filter_4_11, max);
      uint16x4_t d2 = highbd_convolve12_4_y(s2345, s6789, sABCD, y_filter_0_7,
                                            y_filter_4_11, max);
      uint16x4_t d3 = highbd_convolve12_4_y(s3456, s789A, sBCDE, y_filter_0_7,
                                            y_filter_4_11, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      // Prepare block for next iteration - re-using as much as possible.
      // Shuffle everything up four rows.
      s0123[0] = s4567[0];
      s0123[1] = s4567[1];
      s1234[0] = s5678[0];
      s1234[1] = s5678[1];
      s2345[0] = s6789[0];
      s2345[1] = s6789[1];
      s3456[0] = s789A[0];
      s3456[1] = s789A[1];
      s4567[0] = s89AB[0];
      s4567[1] = s89AB[1];
      s5678[0] = s9ABC[0];
      s5678[1] = s9ABC[1];
      s6789[0] = sABCD[0];
      s6789[1] = sABCD[1];
      s789A[0] = sBCDE[0];
      s789A[1] = sBCDE[1];

      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
    src += 4;
    dst += 4;
    width -= 4;
  } while (width != 0);
}

static inline uint16x4_t highbd_convolve8_4_y(int16x8_t samples_lo[2],
                                              int16x8_t samples_hi[2],
                                              int16x8_t filter,
                                              uint16x4_t max) {
  int64x2_t sum01 =
      aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[0], filter, 0);
  sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1);

  int64x2_t sum23 =
      aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[1], filter, 0);
  sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1);

  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
  uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS);
  return vmin_u16(res, max);
}

static inline uint16x8_t highbd_convolve8_8_y(int16x8_t samples_lo[4],
                                              int16x8_t samples_hi[4],
                                              int16x8_t filter,
                                              uint16x8_t max) {
  int64x2_t sum01 =
      aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[0], filter, 0);
  sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1);

  int64x2_t sum23 =
      aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[1], filter, 0);
  sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1);

  int64x2_t sum45 =
      aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[2], filter, 0);
  sum45 = aom_svdot_lane_s16(sum45, samples_hi[2], filter, 1);

  int64x2_t sum67 =
      aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[3], filter, 0);
  sum67 = aom_svdot_lane_s16(sum67, samples_hi[3], filter, 1);

  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
  int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67));
  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS),
                                vqrshrun_n_s32(sum4567, FILTER_BITS));
  return vminq_u16(res, max);
}

static void highbd_convolve_y_sr_8tap_sve2(const uint16_t *src,
                                           ptrdiff_t src_stride, uint16_t *dst,
                                           ptrdiff_t dst_stride, int width,
                                           int height, const int16_t *filter_y,
                                           int bd) {
  assert(width >= 4 && height >= 4);

  const int16x8_t y_filter = vld1q_s16(filter_y);

  uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl);
  // Scale indices by size of the true vector length to avoid reading from an
  // 'undefined' portion of a vector on a system with SVE vectors > 128-bit.
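  // The tbl2 helpers select from a pair of vectors, where element indices of
  // at least the vector length (in 16-bit lanes) refer to the second vector.
  // The table lanes that must read from the second input are therefore offset
  // by svcnth(), the true lane count, so the lookup stays correct when the
  // hardware vectors are wider than 128 bits.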
  uint16x8_t correction0 =
      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL));
  merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0);

  uint16x8_t correction1 =
      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL));
  merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1);

  uint16x8_t correction2 =
      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL));
  merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2);

  if (width == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    int16_t *s = (int16_t *)src;

    int16x4_t s0, s1, s2, s3, s4, s5, s6;
    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
    s += 7 * src_stride;

    // This operation combines a conventional transpose and the sample permute
    // required before computing the dot product.
    int16x8_t s0123[2], s1234[2], s2345[2], s3456[2];
    transpose_concat_elems_s16_4x4(s0, s1, s2, s3, s0123);
    transpose_concat_elems_s16_4x4(s1, s2, s3, s4, s1234);
    transpose_concat_elems_s16_4x4(s2, s3, s4, s5, s2345);
    transpose_concat_elems_s16_4x4(s3, s4, s5, s6, s3456);

    do {
      int16x4_t s7, s8, s9, s10;
      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);

      int16x8_t s4567[2], s5678[2], s6789[2], s789A[2];
      // Transpose and shuffle the 4 lines that were loaded.
      transpose_concat_elems_s16_4x4(s7, s8, s9, s10, s789A);

      // Merge new data into block from previous iteration.
      aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[0], s4567);
      aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[1], s5678);
      aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[2], s6789);

      uint16x4_t d0 = highbd_convolve8_4_y(s0123, s4567, y_filter, max);
      uint16x4_t d1 = highbd_convolve8_4_y(s1234, s5678, y_filter, max);
      uint16x4_t d2 = highbd_convolve8_4_y(s2345, s6789, y_filter, max);
      uint16x4_t d3 = highbd_convolve8_4_y(s3456, s789A, y_filter, max);

      store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);

      // Prepare block for next iteration - re-using as much as possible.
      // Shuffle everything up four rows.
      s0123[0] = s4567[0];
      s0123[1] = s4567[1];
      s1234[0] = s5678[0];
      s1234[1] = s5678[1];
      s2345[0] = s6789[0];
      s2345[1] = s6789[1];
      s3456[0] = s789A[0];
      s3456[1] = s789A[1];
      s += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    do {
      int h = height;
      int16_t *s = (int16_t *)src;
      uint16_t *d = dst;

      int16x8_t s0, s1, s2, s3, s4, s5, s6;
      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
      s += 7 * src_stride;

      // This operation combines a conventional transpose and the sample
      // permute required before computing the dot product.
      int16x8_t s0123[4], s1234[4], s2345[4], s3456[4];
      transpose_concat_elems_s16_8x4(s0, s1, s2, s3, s0123);
      transpose_concat_elems_s16_8x4(s1, s2, s3, s4, s1234);
      transpose_concat_elems_s16_8x4(s2, s3, s4, s5, s2345);
      transpose_concat_elems_s16_8x4(s3, s4, s5, s6, s3456);

      do {
        int16x8_t s7, s8, s9, s10;
        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);

        int16x8_t s4567[4], s5678[4], s6789[4], s789A[4];
        // Transpose and shuffle the 4 lines that were loaded.
        transpose_concat_elems_s16_8x4(s7, s8, s9, s10, s789A);

        // Merge new data into block from previous iteration.
        aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[0], s4567);
        aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[1], s5678);
        aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[2], s6789);

        uint16x8_t d0 = highbd_convolve8_8_y(s0123, s4567, y_filter, max);
        uint16x8_t d1 = highbd_convolve8_8_y(s1234, s5678, y_filter, max);
        uint16x8_t d2 = highbd_convolve8_8_y(s2345, s6789, y_filter, max);
        uint16x8_t d3 = highbd_convolve8_8_y(s3456, s789A, y_filter, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        // Prepare block for next iteration - re-using as much as possible.
        // Shuffle everything up four rows.
        s0123[0] = s4567[0];
        s0123[1] = s4567[1];
        s0123[2] = s4567[2];
        s0123[3] = s4567[3];
        s1234[0] = s5678[0];
        s1234[1] = s5678[1];
        s1234[2] = s5678[2];
        s1234[3] = s5678[3];
        s2345[0] = s6789[0];
        s2345[1] = s6789[1];
        s2345[2] = s6789[2];
        s2345[3] = s6789[3];
        s3456[0] = s789A[0];
        s3456[1] = s789A[1];
        s3456[2] = s789A[2];
        s3456[3] = s789A[3];

        s += 4 * src_stride;
        d += 4 * dst_stride;
        h -= 4;
      } while (h != 0);
      src += 8;
      dst += 8;
      width -= 8;
    } while (width != 0);
  }
}

static inline uint16x4_t highbd_convolve4_4_y(int16x8_t samples[2],
                                              int16x8_t filter,
                                              uint16x4_t max) {
  int64x2_t sum01 = aom_svdot_lane_s16(vdupq_n_s64(0), samples[0], filter, 0);
  int64x2_t sum23 = aom_svdot_lane_s16(vdupq_n_s64(0), samples[1], filter, 0);

  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
  uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS);
  return vmin_u16(res, max);
}

static inline uint16x8_t highbd_convolve4_8_y(int16x8_t samples[4],
                                              int16x8_t filter,
                                              uint16x8_t max) {
  int64x2_t sum01 = aom_svdot_lane_s16(vdupq_n_s64(0), samples[0], filter, 0);
  int64x2_t sum23 = aom_svdot_lane_s16(vdupq_n_s64(0), samples[1], filter, 0);
  int64x2_t sum45 = aom_svdot_lane_s16(vdupq_n_s64(0), samples[2], filter, 0);
  int64x2_t sum67 = aom_svdot_lane_s16(vdupq_n_s64(0), samples[3], filter, 0);

  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
  int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67));
  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS),
                                vqrshrun_n_s32(sum4567, FILTER_BITS));
  return vminq_u16(res, max);
}

static void highbd_convolve_y_sr_4tap_sve2(const uint16_t *src,
                                           ptrdiff_t src_stride, uint16_t *dst,
                                           ptrdiff_t dst_stride, int width,
                                           int height, const int16_t *filter_y,
                                           int bd) {
  assert(width >= 4 && height >= 4);

  const int16x8_t y_filter =
      vcombine_s16(vld1_s16(filter_y + 2), vdup_n_s16(0));

  if (width == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    int16_t *s = (int16_t *)src;

    int16x4_t s0, s1, s2;
    load_s16_4x3(s, src_stride, &s0, &s1, &s2);
    s += 3 * src_stride;

    do {
      int16x4_t s3, s4, s5, s6;
      load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6);

      // This operation combines a conventional transpose and the sample
      // permute required before computing the dot product.
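      // transpose_concat_elems_s16_4x4 interleaves four 4-wide rows so that
      // each group of four consecutive 16-bit lanes holds one output column's
      // samples across the four input rows, which is the layout the 16-bit
      // dot-product instruction consumes.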
      int16x8_t s0123[2], s1234[2], s2345[2], s3456[2];
      transpose_concat_elems_s16_4x4(s0, s1, s2, s3, s0123);
      transpose_concat_elems_s16_4x4(s1, s2, s3, s4, s1234);
      transpose_concat_elems_s16_4x4(s2, s3, s4, s5, s2345);
      transpose_concat_elems_s16_4x4(s3, s4, s5, s6, s3456);

      uint16x4_t d0 = highbd_convolve4_4_y(s0123, y_filter, max);
      uint16x4_t d1 = highbd_convolve4_4_y(s1234, y_filter, max);
      uint16x4_t d2 = highbd_convolve4_4_y(s2345, y_filter, max);
      uint16x4_t d3 = highbd_convolve4_4_y(s3456, y_filter, max);

      store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);

      // Shuffle everything up four rows.
      s0 = s4;
      s1 = s5;
      s2 = s6;

      s += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    do {
      int h = height;
      int16_t *s = (int16_t *)src;
      uint16_t *d = dst;

      int16x8_t s0, s1, s2;
      load_s16_8x3(s, src_stride, &s0, &s1, &s2);
      s += 3 * src_stride;

      do {
        int16x8_t s3, s4, s5, s6;
        load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6);

        // This operation combines a conventional transpose and the sample
        // permute required before computing the dot product.
        int16x8_t s0123[4], s1234[4], s2345[4], s3456[4];
        transpose_concat_elems_s16_8x4(s0, s1, s2, s3, s0123);
        transpose_concat_elems_s16_8x4(s1, s2, s3, s4, s1234);
        transpose_concat_elems_s16_8x4(s2, s3, s4, s5, s2345);
        transpose_concat_elems_s16_8x4(s3, s4, s5, s6, s3456);

        uint16x8_t d0 = highbd_convolve4_8_y(s0123, y_filter, max);
        uint16x8_t d1 = highbd_convolve4_8_y(s1234, y_filter, max);
        uint16x8_t d2 = highbd_convolve4_8_y(s2345, y_filter, max);
        uint16x8_t d3 = highbd_convolve4_8_y(s3456, y_filter, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        // Shuffle everything up four rows.
        s0 = s4;
        s1 = s5;
        s2 = s6;

        s += 4 * src_stride;
        d += 4 * dst_stride;
        h -= 4;
      } while (h != 0);
      src += 8;
      dst += 8;
      width -= 8;
    } while (width != 0);
  }
}

void av1_highbd_convolve_y_sr_sve2(const uint16_t *src, int src_stride,
                                   uint16_t *dst, int dst_stride, int w, int h,
                                   const InterpFilterParams *filter_params_y,
                                   const int subpel_y_qn, int bd) {
  if (w == 2 || h == 2) {
    av1_highbd_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
                               filter_params_y, subpel_y_qn, bd);
    return;
  }
  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);

  if (y_filter_taps == 6) {
    av1_highbd_convolve_y_sr_neon(src, src_stride, dst, dst_stride, w, h,
                                  filter_params_y, subpel_y_qn, bd);
    return;
  }

  const int vert_offset = filter_params_y->taps / 2 - 1;
  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  src -= vert_offset * src_stride;

  if (y_filter_taps > 8) {
    highbd_convolve_y_sr_12tap_sve2(src, src_stride, dst, dst_stride, w, h,
                                    y_filter_ptr, bd);
    return;
  }

  if (y_filter_taps == 4) {
    highbd_convolve_y_sr_4tap_sve2(src + 2 * src_stride, src_stride, dst,
                                   dst_stride, w, h, y_filter_ptr, bd);
    return;
  }

  highbd_convolve_y_sr_8tap_sve2(src, src_stride, dst, dst_stride, w, h,
                                 y_filter_ptr, bd);
}

static inline uint16x4_t convolve12_4_2d_h(
    int16x8_t s0, int16x8_t s1, int16x8_t filter_0_7, int16x8_t filter_4_11,
    const int64x2_t offset, int32x4_t shift, uint16x8x4_t permute_tbl) {
  int16x8_t permuted_samples[6];
  permuted_samples[0] = aom_tbl_s16(s0, permute_tbl.val[0]);
  permuted_samples[1] = aom_tbl_s16(s0, permute_tbl.val[1]);
  permuted_samples[2] = aom_tbl2_s16(s0, s1, permute_tbl.val[2]);
  permuted_samples[3] = aom_tbl2_s16(s0, s1, permute_tbl.val[3]);
  permuted_samples[4] = aom_tbl_s16(s1, permute_tbl.val[0]);
  permuted_samples[5] = aom_tbl_s16(s1, permute_tbl.val[1]);

  int64x2_t sum01 =
      aom_svdot_lane_s16(offset, permuted_samples[0], filter_0_7, 0);
  sum01 = aom_svdot_lane_s16(sum01, permuted_samples[2], filter_0_7, 1);
  sum01 = aom_svdot_lane_s16(sum01, permuted_samples[4], filter_4_11, 1);

  int64x2_t sum23 =
      aom_svdot_lane_s16(offset, permuted_samples[1], filter_0_7, 0);
  sum23 = aom_svdot_lane_s16(sum23, permuted_samples[3], filter_0_7, 1);
  sum23 = aom_svdot_lane_s16(sum23, permuted_samples[5], filter_4_11, 1);

  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
  sum0123 = vqrshlq_s32(sum0123, shift);
  return vqmovun_s32(sum0123);
}

static inline uint16x8_t convolve12_8_2d_h(int16x8_t s0, int16x8_t s1,
                                           int16x8_t s2, int16x8_t filter_0_7,
                                           int16x8_t filter_4_11,
                                           int64x2_t offset, int32x4_t shift,
                                           uint16x8x4_t permute_tbl) {
  int16x8_t permuted_samples[8];
  permuted_samples[0] = aom_tbl_s16(s0, permute_tbl.val[0]);
  permuted_samples[1] = aom_tbl_s16(s0, permute_tbl.val[1]);
  permuted_samples[2] = aom_tbl2_s16(s0, s1, permute_tbl.val[2]);
  permuted_samples[3] = aom_tbl2_s16(s0, s1, permute_tbl.val[3]);
  permuted_samples[4] = aom_tbl_s16(s1, permute_tbl.val[0]);
  permuted_samples[5] = aom_tbl_s16(s1, permute_tbl.val[1]);
  permuted_samples[6] = aom_tbl2_s16(s1, s2, permute_tbl.val[2]);
  permuted_samples[7] = aom_tbl2_s16(s1, s2, permute_tbl.val[3]);

  int64x2_t sum01 =
      aom_svdot_lane_s16(offset, permuted_samples[0], filter_0_7, 0);
  sum01 = aom_svdot_lane_s16(sum01, permuted_samples[2], filter_0_7, 1);
  sum01 = aom_svdot_lane_s16(sum01, permuted_samples[4], filter_4_11, 1);

  int64x2_t sum23 =
      aom_svdot_lane_s16(offset, permuted_samples[1], filter_0_7, 0);
  sum23 = aom_svdot_lane_s16(sum23, permuted_samples[3], filter_0_7, 1);
  sum23 = aom_svdot_lane_s16(sum23, permuted_samples[5], filter_4_11, 1);

  int64x2_t sum45 =
      aom_svdot_lane_s16(offset, permuted_samples[2], filter_0_7, 0);
  sum45 = aom_svdot_lane_s16(sum45, permuted_samples[4], filter_0_7, 1);
  sum45 = aom_svdot_lane_s16(sum45, permuted_samples[6], filter_4_11, 1);

  int64x2_t sum67 =
      aom_svdot_lane_s16(offset, permuted_samples[3], filter_0_7, 0);
  sum67 = aom_svdot_lane_s16(sum67, permuted_samples[5], filter_0_7, 1);
  sum67 = aom_svdot_lane_s16(sum67, permuted_samples[7], filter_4_11, 1);

  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
  int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67));

  sum0123 = vqrshlq_s32(sum0123, shift);
  sum4567 = vqrshlq_s32(sum4567, shift);

  return vcombine_u16(vqmovun_s32(sum0123), vqmovun_s32(sum4567));
}

static inline void highbd_convolve_2d_sr_horiz_12tap_sve2(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    int width, int height, const int16_t *y_filter_ptr,
    ConvolveParams *conv_params, const int x_offset) {
  const int64x2_t offset = vdupq_n_s64(x_offset);
  const int32x4_t shift = vdupq_n_s32(-conv_params->round_0);

  const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
  const int16x8_t y_filter_4_11 = vld1q_s16(y_filter_ptr + 4);

  uint16x8x4_t permute_tbl = vld1q_u16_x4(kDotProdTbl);
  // Scale indices by size of the true vector length to avoid reading from an
  // 'undefined' portion of a vector on a system with SVE vectors > 128-bit.
  uint16x8_t correction0 = vreinterpretq_u16_u64(vcombine_u64(
      vdup_n_u64(0), vdup_n_u64(svcnth() * 0x0001000000000000ULL)));
  permute_tbl.val[2] = vaddq_u16(permute_tbl.val[2], correction0);

  uint16x8_t correction1 = vreinterpretq_u16_u64(
      vcombine_u64(vdup_n_u64(svcnth() * 0x0001000100000000ULL),
                   vdup_n_u64(svcnth() * 0x0001000100010000ULL)));
  permute_tbl.val[3] = vaddq_u16(permute_tbl.val[3], correction1);

  if (width == 4) {
    const int16_t *s = (const int16_t *)src;

    do {
      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
      load_s16_8x4(s, src_stride, &s0, &s2, &s4, &s6);
      load_s16_8x4(s + 8, src_stride, &s1, &s3, &s5, &s7);

      uint16x4_t d0 = convolve12_4_2d_h(s0, s1, y_filter_0_7, y_filter_4_11,
                                        offset, shift, permute_tbl);
      uint16x4_t d1 = convolve12_4_2d_h(s2, s3, y_filter_0_7, y_filter_4_11,
                                        offset, shift, permute_tbl);
      uint16x4_t d2 = convolve12_4_2d_h(s4, s5, y_filter_0_7, y_filter_4_11,
                                        offset, shift, permute_tbl);
      uint16x4_t d3 = convolve12_4_2d_h(s6, s7, y_filter_0_7, y_filter_4_11,
                                        offset, shift, permute_tbl);

      store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);

      dst += 4 * dst_stride;
      s += 4 * src_stride;
      height -= 4;
    } while (height > 0);
  } else {
    do {
      const int16_t *s = (const int16_t *)src;
      uint16_t *d = dst;
      int w = width;

      do {
        int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11;
        load_s16_8x4(s, src_stride, &s0, &s3, &s6, &s9);
        load_s16_8x4(s + 8, src_stride, &s1, &s4, &s7, &s10);
        load_s16_8x4(s + 16, src_stride, &s2, &s5, &s8, &s11);

        uint16x8_t d0 = convolve12_8_2d_h(s0, s1, s2, y_filter_0_7,
                                          y_filter_4_11, offset, shift,
                                          permute_tbl);
        uint16x8_t d1 = convolve12_8_2d_h(s3, s4, s5, y_filter_0_7,
                                          y_filter_4_11, offset, shift,
                                          permute_tbl);
        uint16x8_t d2 = convolve12_8_2d_h(s6, s7, s8, y_filter_0_7,
                                          y_filter_4_11, offset, shift,
                                          permute_tbl);
        uint16x8_t d3 = convolve12_8_2d_h(s9, s10, s11, y_filter_0_7,
                                          y_filter_4_11, offset, shift,
                                          permute_tbl);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        w -= 8;
      } while (w != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height > 0);
  }
}

static inline uint16x8_t convolve8_8_2d_h(int16x8_t s0[8], int16x8_t filter,
                                          int64x2_t offset, int32x4_t shift) {
  int64x2_t sum[8];
  sum[0] = aom_sdotq_s16(offset, s0[0], filter);
  sum[1] = aom_sdotq_s16(offset, s0[1], filter);
  sum[2] = aom_sdotq_s16(offset, s0[2], filter);
  sum[3] = aom_sdotq_s16(offset, s0[3], filter);
  sum[4] = aom_sdotq_s16(offset, s0[4], filter);
  sum[5] = aom_sdotq_s16(offset, s0[5], filter);
  sum[6] = aom_sdotq_s16(offset, s0[6], filter);
  sum[7] = aom_sdotq_s16(offset, s0[7], filter);

  sum[0] = vpaddq_s64(sum[0], sum[1]);
  sum[2] = vpaddq_s64(sum[2], sum[3]);
  sum[4] = vpaddq_s64(sum[4], sum[5]);
  sum[6] = vpaddq_s64(sum[6], sum[7]);

  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[2]));
  int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum[4]), vmovn_s64(sum[6]));

  sum0123 = vqrshlq_s32(sum0123, shift);
  sum4567 = vqrshlq_s32(sum4567, shift);

  return vcombine_u16(vqmovun_s32(sum0123), vqmovun_s32(sum4567));
}

static inline void highbd_convolve_2d_sr_horiz_8tap_sve2(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    int width, int height, const int16_t *y_filter_ptr,
    ConvolveParams *conv_params, const int x_offset) {
  const int64x2_t offset = vdupq_n_s64(x_offset);
  const int64x2_t offset_lo = vcombine_s64(vget_low_s64(offset), vdup_n_s64(0));
  const int32x4_t shift = vdupq_n_s32(-conv_params->round_0);

  const int16x8_t filter = vld1q_s16(y_filter_ptr);

  do {
    const int16_t *s = (const int16_t *)src;
    uint16_t *d = dst;
    int w = width;

    do {
      int16x8_t s0[8], s1[8], s2[8], s3[8];
      load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                   &s0[4], &s0[5], &s0[6], &s0[7]);
      load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                   &s1[4], &s1[5], &s1[6], &s1[7]);
      load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                   &s2[4], &s2[5], &s2[6], &s2[7]);
      load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                   &s3[4], &s3[5], &s3[6], &s3[7]);

      uint16x8_t d0 = convolve8_8_2d_h(s0, filter, offset_lo, shift);
      uint16x8_t d1 = convolve8_8_2d_h(s1, filter, offset_lo, shift);
      uint16x8_t d2 = convolve8_8_2d_h(s2, filter, offset_lo, shift);
      uint16x8_t d3 = convolve8_8_2d_h(s3, filter, offset_lo, shift);

      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

      s += 8;
      d += 8;
      w -= 8;
    } while (w != 0);
    src += 4 * src_stride;
    dst += 4 * dst_stride;
    height -= 4;
  } while (height > 0);
}

static inline uint16x4_t convolve4_4_2d_h(int16x8_t s0, int16x8_t filter,
                                          int64x2_t offset, int32x4_t shift,
                                          uint16x8x2_t permute_tbl) {
  int16x8_t permuted_samples0 = aom_tbl_s16(s0, permute_tbl.val[0]);
  int16x8_t permuted_samples1 = aom_tbl_s16(s0, permute_tbl.val[1]);

  int64x2_t sum01 = aom_svdot_lane_s16(offset, permuted_samples0, filter, 0);
  int64x2_t sum23 = aom_svdot_lane_s16(offset, permuted_samples1, filter, 0);

  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
  sum0123 = vqrshlq_s32(sum0123, shift);
  return vqmovun_s32(sum0123);
}

static inline uint16x8_t convolve4_8_2d_h(int16x8_t s0[8], int16x8_t filter,
                                          int64x2_t offset, int32x4_t shift,
                                          uint16x8_t tbl) {
  int64x2_t sum04 = aom_svdot_lane_s16(offset, s0[0], filter, 0);
  int64x2_t sum15 = aom_svdot_lane_s16(offset, s0[1], filter, 0);
  int64x2_t sum26 = aom_svdot_lane_s16(offset, s0[2], filter, 0);
  int64x2_t sum37 = aom_svdot_lane_s16(offset, s0[3], filter, 0);

  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum04), vmovn_s64(sum15));
  int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37));

  sum0123 = vqrshlq_s32(sum0123, shift);
  sum4567 = vqrshlq_s32(sum4567, shift);

  uint16x8_t res = vcombine_u16(vqmovun_s32(sum0123), vqmovun_s32(sum4567));
  return aom_tbl_u16(res, tbl);
}

static inline void highbd_convolve_2d_sr_horiz_4tap_sve2(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    int width, int height, const int16_t *x_filter_ptr,
    ConvolveParams *conv_params, const int x_offset) {
  const int64x2_t offset = vdupq_n_s64(x_offset);
  const int32x4_t shift = vdupq_n_s32(-conv_params->round_0);

  const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
  const int16x8_t filter = vcombine_s16(x_filter, vdup_n_s16(0));

  if (width == 4) {
    const int16_t *s = (const int16_t *)(src);

    uint16x8x2_t permute_tbl = vld1q_u16_x2(kDotProdTbl);

    do {
      int16x8_t s0, s1, s2, s3;
      load_s16_8x4(s, src_stride, &s0, &s1, &s2, &s3);

      uint16x4_t d0 = convolve4_4_2d_h(s0, filter, offset, shift, permute_tbl);
      uint16x4_t d1 = convolve4_4_2d_h(s1, filter, offset, shift, permute_tbl);
      uint16x4_t d2 = convolve4_4_2d_h(s2, filter, offset, shift, permute_tbl);
      uint16x4_t d3 = convolve4_4_2d_h(s3, filter, offset, shift, permute_tbl);

      store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);

      s += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height > 0);
  } else {
    uint16x8_t idx = vld1q_u16(kDeinterleaveTbl);

    do {
      const int16_t *s = (const int16_t *)(src);
      uint16_t *d = dst;
      int w = width;

      do {
        int16x8_t s0[8], s1[8], s2[8], s3[8];
        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                     &s0[4], &s0[5], &s0[6], &s0[7]);
        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                     &s1[4], &s1[5], &s1[6], &s1[7]);
        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                     &s2[4], &s2[5], &s2[6], &s2[7]);
        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                     &s3[4], &s3[5], &s3[6], &s3[7]);

        uint16x8_t d0 = convolve4_8_2d_h(s0, filter, offset, shift, idx);
        uint16x8_t d1 = convolve4_8_2d_h(s1, filter, offset, shift, idx);
        uint16x8_t d2 = convolve4_8_2d_h(s2, filter, offset, shift, idx);
        uint16x8_t d3 = convolve4_8_2d_h(s3, filter, offset, shift, idx);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        w -= 8;
      } while (w != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height > 0);
  }
}

static inline uint16x4_t highbd_convolve12_4_2d_v(
    int16x8_t s0[2], int16x8_t s1[2], int16x8_t s2[2], int16x8_t filter_0_7,
    int16x8_t filter_4_11, int32x4_t shift, int64x2_t offset, uint16x4_t max) {
  int64x2_t sum01 = aom_svdot_lane_s16(offset, s0[0], filter_0_7, 0);
  sum01 = aom_svdot_lane_s16(sum01, s1[0], filter_0_7, 1);
  sum01 = aom_svdot_lane_s16(sum01, s2[0], filter_4_11, 1);

  int64x2_t sum23 = aom_svdot_lane_s16(offset, s0[1], filter_0_7, 0);
  sum23 = aom_svdot_lane_s16(sum23, s1[1], filter_0_7, 1);
  sum23 = aom_svdot_lane_s16(sum23, s2[1], filter_4_11, 1);

  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
  sum0123 = vshlq_s32(sum0123, shift);

  uint16x4_t res = vqmovun_s32(sum0123);

  return vmin_u16(res, max);
}

static inline void highbd_convolve_2d_sr_vert_12tap_sve2(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    int width, int height, const int16_t *y_filter_ptr,
    ConvolveParams *conv_params, int bd, const int y_offset) {
  const int64x2_t offset = vdupq_n_s64(y_offset);
  const int32x4_t shift = vdupq_n_s32(-conv_params->round_1);

  const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
  const int16x8_t y_filter_4_11 = vld1q_s16(y_filter_ptr + 4);

  uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl);
  // Scale indices by size of the true vector length to avoid reading from an
  // 'undefined' portion of a vector on a system with SVE vectors > 128-bit.
  uint16x8_t correction0 =
      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL));
  merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0);

  uint16x8_t correction1 =
      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL));
  merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1);

  uint16x8_t correction2 =
      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL));
  merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2);

  const uint16x4_t max = vdup_n_u16((1 << bd) - 1);

  do {
    int16_t *s = (int16_t *)src;
    uint16_t *d = (uint16_t *)dst;
    int h = height;

    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA;
    load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
                  &s9, &sA);
    s += 11 * src_stride;

    int16x8_t s0123[2], s1234[2], s2345[2], s3456[2], s4567[2], s5678[2],
        s6789[2], s789A[2];
    // This operation combines a conventional transpose and the sample permute
    // required before computing the dot product.
    transpose_concat_elems_s16_4x4(s0, s1, s2, s3, s0123);
    transpose_concat_elems_s16_4x4(s1, s2, s3, s4, s1234);
    transpose_concat_elems_s16_4x4(s2, s3, s4, s5, s2345);
    transpose_concat_elems_s16_4x4(s3, s4, s5, s6, s3456);
    transpose_concat_elems_s16_4x4(s4, s5, s6, s7, s4567);
    transpose_concat_elems_s16_4x4(s5, s6, s7, s8, s5678);
    transpose_concat_elems_s16_4x4(s6, s7, s8, s9, s6789);
    transpose_concat_elems_s16_4x4(s7, s8, s9, sA, s789A);

    do {
      int16x4_t sB, sC, sD, sE;
      load_s16_4x4(s, src_stride, &sB, &sC, &sD, &sE);

      int16x8_t s89AB[2], s9ABC[2], sABCD[2], sBCDE[2];
      transpose_concat_elems_s16_4x4(sB, sC, sD, sE, sBCDE);

      // Use the above transpose and reuse data from the previous loop to get
      // the rest.
      aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[0], s89AB);
      aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[1], s9ABC);
      aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[2], sABCD);

      uint16x4_t d0 = highbd_convolve12_4_2d_v(
          s0123, s4567, s89AB, y_filter_0_7, y_filter_4_11, shift, offset, max);
      uint16x4_t d1 = highbd_convolve12_4_2d_v(
          s1234, s5678, s9ABC, y_filter_0_7, y_filter_4_11, shift, offset, max);
      uint16x4_t d2 = highbd_convolve12_4_2d_v(
          s2345, s6789, sABCD, y_filter_0_7, y_filter_4_11, shift, offset, max);
      uint16x4_t d3 = highbd_convolve12_4_2d_v(
          s3456, s789A, sBCDE, y_filter_0_7, y_filter_4_11, shift, offset, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      // Prepare block for next iteration - re-using as much as possible.
      // Shuffle everything up four rows.
      s0123[0] = s4567[0];
      s0123[1] = s4567[1];
      s1234[0] = s5678[0];
      s1234[1] = s5678[1];
      s2345[0] = s6789[0];
      s2345[1] = s6789[1];
      s3456[0] = s789A[0];
      s3456[1] = s789A[1];
      s4567[0] = s89AB[0];
      s4567[1] = s89AB[1];
      s5678[0] = s9ABC[0];
      s5678[1] = s9ABC[1];
      s6789[0] = sABCD[0];
      s6789[1] = sABCD[1];
      s789A[0] = sBCDE[0];
      s789A[1] = sBCDE[1];

      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
    src += 4;
    dst += 4;
    width -= 4;
  } while (width != 0);
}

static inline uint16x4_t highbd_convolve8_4_2d_v(
    int16x8_t samples_lo[2], int16x8_t samples_hi[2], int16x8_t filter,
    int32x4_t shift, int64x2_t offset, uint16x4_t max) {
  int64x2_t sum01 = aom_svdot_lane_s16(offset, samples_lo[0], filter, 0);
  sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1);

  int64x2_t sum23 = aom_svdot_lane_s16(offset, samples_lo[1], filter, 0);
  sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1);

  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
  sum0123 = vshlq_s32(sum0123, shift);

  uint16x4_t res = vqmovun_s32(sum0123);
  return vmin_u16(res, max);
}

static inline uint16x8_t highbd_convolve8_8_2d_v(
    int16x8_t samples_lo[4], int16x8_t samples_hi[4], int16x8_t filter,
    int32x4_t shift, int64x2_t offset, uint16x8_t max) {
  int64x2_t sum01 = aom_svdot_lane_s16(offset, samples_lo[0], filter, 0);
  sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1);

  int64x2_t sum23 = aom_svdot_lane_s16(offset, samples_lo[1], filter, 0);
  sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1);

  int64x2_t sum45 = aom_svdot_lane_s16(offset, samples_lo[2], filter, 0);
  sum45 = aom_svdot_lane_s16(sum45, samples_hi[2], filter, 1);

  int64x2_t sum67 = aom_svdot_lane_s16(offset, samples_lo[3], filter, 0);
  sum67 = aom_svdot_lane_s16(sum67, samples_hi[3], filter, 1);

  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
  int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67));

  sum0123 = vshlq_s32(sum0123, shift);
  sum4567 = vshlq_s32(sum4567, shift);

  uint16x8_t res = vcombine_u16(vqmovun_s32(sum0123), vqmovun_s32(sum4567));
  return vminq_u16(res, max);
}

static void highbd_convolve_2d_sr_vert_8tap_sve2(
    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
    ptrdiff_t dst_stride, int width, int height, const int16_t *filter_y,
    ConvolveParams *conv_params, int bd, const int y_offset) {
  assert(width >= 4 && height >= 4);
  const int64x2_t offset = vdupq_n_s64(y_offset);
  const int32x4_t shift = vdupq_n_s32(-conv_params->round_1);
  const int16x8_t y_filter = vld1q_s16(filter_y);

  uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl);
  // Scale indices by size of the true vector length to avoid reading from an
  // 'undefined' portion of a vector on a system with SVE vectors > 128-bit.
  uint16x8_t correction0 =
      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL));
  merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0);

  uint16x8_t correction1 =
      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL));
  merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1);

  uint16x8_t correction2 =
      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL));
  merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2);

  if (width == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    int16_t *s = (int16_t *)src;

    int16x4_t s0, s1, s2, s3, s4, s5, s6;
    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
    s += 7 * src_stride;

    // This operation combines a conventional transpose and the sample permute
    // required before computing the dot product.
    int16x8_t s0123[2], s1234[2], s2345[2], s3456[2];
    transpose_concat_elems_s16_4x4(s0, s1, s2, s3, s0123);
    transpose_concat_elems_s16_4x4(s1, s2, s3, s4, s1234);
    transpose_concat_elems_s16_4x4(s2, s3, s4, s5, s2345);
    transpose_concat_elems_s16_4x4(s3, s4, s5, s6, s3456);

    do {
      int16x4_t s7, s8, s9, s10;
      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);

      int16x8_t s4567[2], s5678[2], s6789[2], s789A[2];
      // Transpose and shuffle the 4 lines that were loaded.
      transpose_concat_elems_s16_4x4(s7, s8, s9, s10, s789A);

      // Merge new data into block from previous iteration.
      aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[0], s4567);
      aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[1], s5678);
      aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[2], s6789);

      uint16x4_t d0 =
          highbd_convolve8_4_2d_v(s0123, s4567, y_filter, shift, offset, max);
      uint16x4_t d1 =
          highbd_convolve8_4_2d_v(s1234, s5678, y_filter, shift, offset, max);
      uint16x4_t d2 =
          highbd_convolve8_4_2d_v(s2345, s6789, y_filter, shift, offset, max);
      uint16x4_t d3 =
          highbd_convolve8_4_2d_v(s3456, s789A, y_filter, shift, offset, max);

      store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);

      // Prepare block for next iteration - re-using as much as possible.
      // Shuffle everything up four rows.
      s0123[0] = s4567[0];
      s0123[1] = s4567[1];
      s1234[0] = s5678[0];
      s1234[1] = s5678[1];
      s2345[0] = s6789[0];
      s2345[1] = s6789[1];
      s3456[0] = s789A[0];
      s3456[1] = s789A[1];

      s += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    do {
      int h = height;
      int16_t *s = (int16_t *)src;
      uint16_t *d = dst;

      int16x8_t s0, s1, s2, s3, s4, s5, s6;
      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
      s += 7 * src_stride;

      // This operation combines a conventional transpose and the sample
      // permute required before computing the dot product.
      int16x8_t s0123[4], s1234[4], s2345[4], s3456[4];
      transpose_concat_elems_s16_8x4(s0, s1, s2, s3, s0123);
      transpose_concat_elems_s16_8x4(s1, s2, s3, s4, s1234);
      transpose_concat_elems_s16_8x4(s2, s3, s4, s5, s2345);
      transpose_concat_elems_s16_8x4(s3, s4, s5, s6, s3456);

      do {
        int16x8_t s7, s8, s9, s10;
        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);

        int16x8_t s4567[4], s5678[4], s6789[4], s789A[4];
        // Transpose and shuffle the 4 lines that were loaded.
        transpose_concat_elems_s16_8x4(s7, s8, s9, s10, s789A);

        // Merge new data into block from previous iteration.
        aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[0], s4567);
        aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[1], s5678);
        aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[2], s6789);

        uint16x8_t d0 =
            highbd_convolve8_8_2d_v(s0123, s4567, y_filter, shift, offset, max);
        uint16x8_t d1 =
            highbd_convolve8_8_2d_v(s1234, s5678, y_filter, shift, offset, max);
        uint16x8_t d2 =
            highbd_convolve8_8_2d_v(s2345, s6789, y_filter, shift, offset, max);
        uint16x8_t d3 =
            highbd_convolve8_8_2d_v(s3456, s789A, y_filter, shift, offset, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        // Prepare block for next iteration - re-using as much as possible.
        // Shuffle everything up four rows.
        s0123[0] = s4567[0];
        s0123[1] = s4567[1];
        s0123[2] = s4567[2];
        s0123[3] = s4567[3];
        s1234[0] = s5678[0];
        s1234[1] = s5678[1];
        s1234[2] = s5678[2];
        s1234[3] = s5678[3];
        s2345[0] = s6789[0];
        s2345[1] = s6789[1];
        s2345[2] = s6789[2];
        s2345[3] = s6789[3];
        s3456[0] = s789A[0];
        s3456[1] = s789A[1];
        s3456[2] = s789A[2];
        s3456[3] = s789A[3];

        s += 4 * src_stride;
        d += 4 * dst_stride;
        h -= 4;
      } while (h != 0);
      src += 8;
      dst += 8;
      width -= 8;
    } while (width != 0);
  }
}

static inline uint16x4_t highbd_convolve4_4_2d_v(int16x8_t samples[2],
                                                 int16x8_t filter,
                                                 int32x4_t shift,
                                                 int64x2_t offset,
                                                 uint16x4_t max) {
  int64x2_t sum01 = aom_svdot_lane_s16(offset, samples[0], filter, 0);
  int64x2_t sum23 = aom_svdot_lane_s16(offset, samples[1], filter, 0);

  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
  sum0123 = vshlq_s32(sum0123, shift);

  uint16x4_t res = vqmovun_s32(sum0123);
  return vmin_u16(res, max);
}

static inline uint16x8_t highbd_convolve4_8_2d_v(int16x8_t samples[4],
                                                 int16x8_t filter,
                                                 int32x4_t shift,
                                                 int64x2_t offset,
                                                 uint16x8_t max) {
  int64x2_t sum01 = aom_svdot_lane_s16(offset, samples[0], filter, 0);
  int64x2_t sum23 = aom_svdot_lane_s16(offset, samples[1], filter, 0);
  int64x2_t sum45 = aom_svdot_lane_s16(offset, samples[2], filter, 0);
  int64x2_t sum67 = aom_svdot_lane_s16(offset, samples[3], filter, 0);

  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
  int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67));

  sum0123 = vshlq_s32(sum0123, shift);
  sum4567 = vshlq_s32(sum4567, shift);

  uint16x8_t res = vcombine_u16(vqmovun_s32(sum0123), vqmovun_s32(sum4567));
  return vminq_u16(res, max);
}

static void highbd_convolve_2d_sr_vert_4tap_sve2(
    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
    ptrdiff_t dst_stride, int width, int height, const int16_t *filter_y,
static void highbd_convolve_2d_sr_vert_4tap_sve2(
    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
    ptrdiff_t dst_stride, int width, int height, const int16_t *filter_y,
    ConvolveParams *conv_params, int bd, const int y_offset) {
  assert(width >= 4 && height >= 4);
  const int64x2_t offset = vdupq_n_s64(y_offset);
  const int32x4_t shift = vdupq_n_s32(-conv_params->round_1);

  const int16x8_t y_filter =
      vcombine_s16(vld1_s16(filter_y + 2), vdup_n_s16(0));

  if (width == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    int16_t *s = (int16_t *)(src);

    int16x4_t s0, s1, s2;
    load_s16_4x3(s, src_stride, &s0, &s1, &s2);
    s += 3 * src_stride;

    do {
      int16x4_t s3, s4, s5, s6;
      load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6);

      // This operation combines a conventional transpose and the sample
      // permute required before computing the dot product.
      int16x8_t s0123[2], s1234[2], s2345[2], s3456[2];
      transpose_concat_elems_s16_4x4(s0, s1, s2, s3, s0123);
      transpose_concat_elems_s16_4x4(s1, s2, s3, s4, s1234);
      transpose_concat_elems_s16_4x4(s2, s3, s4, s5, s2345);
      transpose_concat_elems_s16_4x4(s3, s4, s5, s6, s3456);

      uint16x4_t d0 =
          highbd_convolve4_4_2d_v(s0123, y_filter, shift, offset, max);
      uint16x4_t d1 =
          highbd_convolve4_4_2d_v(s1234, y_filter, shift, offset, max);
      uint16x4_t d2 =
          highbd_convolve4_4_2d_v(s2345, y_filter, shift, offset, max);
      uint16x4_t d3 =
          highbd_convolve4_4_2d_v(s3456, y_filter, shift, offset, max);

      store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);

      // Shuffle everything up four rows.
      s0 = s4;
      s1 = s5;
      s2 = s6;

      s += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    do {
      int h = height;
      int16_t *s = (int16_t *)(src);
      uint16_t *d = dst;

      int16x8_t s0, s1, s2;
      load_s16_8x3(s, src_stride, &s0, &s1, &s2);
      s += 3 * src_stride;

      do {
        int16x8_t s3, s4, s5, s6;
        load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6);

        // This operation combines a conventional transpose and the sample
        // permute required before computing the dot product.
        int16x8_t s0123[4], s1234[4], s2345[4], s3456[4];
        transpose_concat_elems_s16_8x4(s0, s1, s2, s3, s0123);
        transpose_concat_elems_s16_8x4(s1, s2, s3, s4, s1234);
        transpose_concat_elems_s16_8x4(s2, s3, s4, s5, s2345);
        transpose_concat_elems_s16_8x4(s3, s4, s5, s6, s3456);

        uint16x8_t d0 =
            highbd_convolve4_8_2d_v(s0123, y_filter, shift, offset, max);
        uint16x8_t d1 =
            highbd_convolve4_8_2d_v(s1234, y_filter, shift, offset, max);
        uint16x8_t d2 =
            highbd_convolve4_8_2d_v(s2345, y_filter, shift, offset, max);
        uint16x8_t d3 =
            highbd_convolve4_8_2d_v(s3456, y_filter, shift, offset, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        // Shuffle everything up four rows.
        s0 = s4;
        s1 = s5;
        s2 = s6;

        s += 4 * src_stride;
        d += 4 * dst_stride;
        h -= 4;
      } while (h != 0);
      src += 8;
      dst += 8;
      width -= 8;
    } while (width != 0);
  }
}

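// The 2D path below runs as two separable passes through the intermediate
// buffer im_block: a horizontal pass filters im_h = h + taps - 1 rows of the
// source (the extra rows are the vertical filter's context), then a vertical
// pass filters im_block into dst. x_offset biases the horizontal output so
// the intermediate values stay non-negative and fit in the uint16_t im_block;
// y_offset removes that bias again (scaled by the vertical filter gain) and
// folds in the rounding term for round_1. Blocks with a dimension of 2 fall
// back to the C code and 6-tap filters fall back to the Neon implementation,
// since only 4-, 8- and 12-tap kernels are provided here.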
void av1_highbd_convolve_2d_sr_sve2(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_x,
                                    const InterpFilterParams *filter_params_y,
                                    const int subpel_x_qn,
                                    const int subpel_y_qn,
                                    ConvolveParams *conv_params, int bd) {
  if (w == 2 || h == 2) {
    av1_highbd_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
                                filter_params_x, filter_params_y, subpel_x_qn,
                                subpel_y_qn, conv_params, bd);
    return;
  }

  DECLARE_ALIGNED(16, uint16_t,
                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
  const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);

  if (x_filter_taps == 6 || y_filter_taps == 6) {
    av1_highbd_convolve_2d_sr_neon(src, src_stride, dst, dst_stride, w, h,
                                   filter_params_x, filter_params_y,
                                   subpel_x_qn, subpel_y_qn, conv_params, bd);
    return;
  }

  const int clamped_x_taps = x_filter_taps < 4 ? 4 : x_filter_taps;
  const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps;

  const int im_stride = MAX_SB_SIZE;
  const int vert_offset = clamped_y_taps / 2 - 1;
  const int horiz_offset = clamped_x_taps / 2 - 1;
  const int x_offset = (1 << (bd + FILTER_BITS - 1));
  const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  // The extra shim of (1 << (conv_params->round_1 - 1)) allows us to do a
  // simple shift left instead of a rounding saturating shift left.
  const int y_offset =
      (1 << (conv_params->round_1 - 1)) - (1 << (y_offset_bits - 1));

  const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset;

  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  const int im_h = h + clamped_y_taps - 1;

  if (x_filter_taps > 8) {
    highbd_convolve_2d_sr_horiz_12tap_sve2(src_ptr, src_stride, im_block,
                                           im_stride, w, im_h, x_filter_ptr,
                                           conv_params, x_offset);

    highbd_convolve_2d_sr_vert_12tap_sve2(im_block, im_stride, dst, dst_stride,
                                          w, h, y_filter_ptr, conv_params, bd,
                                          y_offset);
    return;
  }

  if (x_filter_taps <= 4) {
    highbd_convolve_2d_sr_horiz_4tap_sve2(src_ptr, src_stride, im_block,
                                          im_stride, w, im_h, x_filter_ptr,
                                          conv_params, x_offset);
  } else {
    highbd_convolve_2d_sr_horiz_8tap_sve2(src_ptr, src_stride, im_block,
                                          im_stride, w, im_h, x_filter_ptr,
                                          conv_params, x_offset);
  }

  if (y_filter_taps <= 4) {
    highbd_convolve_2d_sr_vert_4tap_sve2(im_block, im_stride, dst, dst_stride,
                                         w, h, y_filter_ptr, conv_params, bd,
                                         y_offset);
  } else {
    highbd_convolve_2d_sr_vert_8tap_sve2(im_block, im_stride, dst, dst_stride,
                                         w, h, y_filter_ptr, conv_params, bd,
                                         y_offset);
  }
}
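
// Worked example of the offset bookkeeping above, assuming 10-bit input with
// the common rounding split round_0 = 3 and round_1 = 11 (the exact values
// come from conv_params and can differ, e.g. for 12-bit input):
//   x_offset      = 1 << (10 + FILTER_BITS - 1)  = 1 << 16
//   y_offset_bits = 10 + 2 * FILTER_BITS - 3     = 21
//   y_offset      = (1 << 10) - (1 << 20)
// After the horizontal pass the bias becomes (1 << 16) >> round_0 = 1 << 13,
// and the vertical filter (gain 1 << FILTER_BITS) scales it to 1 << 20, which
// is exactly what the -(1 << (y_offset_bits - 1)) term in y_offset cancels;
// the remaining +(1 << 10) is the rounding constant for the round_1 shift.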