convolve_neon_i8mm.c (57512B)
1 /* 2 * Copyright (c) 2023, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <arm_neon.h> 13 14 #include "config/aom_config.h" 15 #include "config/av1_rtcd.h" 16 17 #include "aom_dsp/aom_dsp_common.h" 18 #include "aom_dsp/arm/mem_neon.h" 19 #include "aom_dsp/arm/transpose_neon.h" 20 #include "aom_ports/mem.h" 21 #include "av1/common/arm/convolve_neon.h" 22 #include "av1/common/arm/convolve_neon_i8mm.h" 23 #include "av1/common/convolve.h" 24 #include "av1/common/filter.h" 25 26 DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = { 27 // Shift left and insert new last column in transposed 4x4 block. 28 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, 29 // Shift left and insert two new columns in transposed 4x4 block. 30 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, 31 // Shift left and insert three new columns in transposed 4x4 block. 32 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 33 }; 34 35 static inline int16x4_t convolve12_4_x(uint8x16_t samples[2], 36 const int8x16_t filter[2], 37 const uint8x16_t permute_tbl, 38 const int32x4_t horiz_const) { 39 // Permute samples ready for matrix multiply. 40 // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } 41 // { 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 } 42 uint8x16_t perm_samples[2] = { vqtbl1q_u8(samples[0], permute_tbl), 43 vqtbl1q_u8(samples[1], permute_tbl) }; 44 45 // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix 46 // (filter), destructively accumulating into the destination register. 47 int32x4_t sum = vusmmlaq_s32(horiz_const, perm_samples[0], filter[0]); 48 sum = vusmmlaq_s32(sum, perm_samples[1], filter[1]); 49 50 return vshrn_n_s32(sum, 1); 51 } 52 53 static inline uint8x8_t convolve12_8_x(uint8x16_t samples[2], 54 const int8x16_t filter[2], 55 const uint8x16x2_t permute_tbl, 56 const int32x4_t horiz_const) { 57 // Permute samples ready for matrix multiply. 58 // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } 59 // { 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 } 60 // { 6, 7, 8, 9, 10, 11, 12, 13, 8, 9, 10, 11, 12, 13, 14, 15 } 61 // { 10, 11, 12, 13, 14, 15, 16, 17, 12, 13, 14, 15, 16, 17, 18, 19 } 62 uint8x16_t perm_samples[4] = { vqtbl1q_u8(samples[0], permute_tbl.val[0]), 63 vqtbl1q_u8(samples[0], permute_tbl.val[1]), 64 vqtbl1q_u8(samples[1], permute_tbl.val[0]), 65 vqtbl1q_u8(samples[1], permute_tbl.val[1]) }; 66 67 // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix 68 // (filter), destructively accumulating into the destination register. 69 int32x4_t sum0123 = vusmmlaq_s32(horiz_const, perm_samples[0], filter[0]); 70 int32x4_t sum4567 = vusmmlaq_s32(horiz_const, perm_samples[1], filter[0]); 71 sum0123 = vusmmlaq_s32(sum0123, perm_samples[2], filter[1]); 72 sum4567 = vusmmlaq_s32(sum4567, perm_samples[3], filter[1]); 73 74 // Narrow and re-pack. 75 int16x8_t sum_s16 = 76 vcombine_s16(vshrn_n_s32(sum0123, 1), vshrn_n_s32(sum4567, 1)); 77 return vqrshrun_n_s16(sum_s16, FILTER_BITS - 1); 78 } 79 80 static inline void convolve_x_sr_12tap_neon_i8mm(const uint8_t *src, 81 int src_stride, uint8_t *dst, 82 int dst_stride, int w, int h, 83 const int16_t *x_filter_ptr) { 84 // The no-op filter should never be used here. 85 assert(x_filter_ptr[5] != 128); 86 87 // Split 12-tap filter into two 6-tap filters, masking the top two elements. 88 // { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 } 89 const int8x8_t mask = vcreate_s8(0x0000ffffffffffff); 90 const int8x8_t filter_0 = vand_s8(vmovn_s16(vld1q_s16(x_filter_ptr)), mask); 91 const int8x8_t filter_1 = 92 vext_s8(vmovn_s16(vld1q_s16(x_filter_ptr + 4)), vdup_n_s8(0), 2); 93 94 // Stagger each 6-tap filter to enable use of matrix multiply instructions. 95 // { f0, f1, f2, f3, f4, f5, 0, 0, 0, f0, f1, f2, f3, f4, f5, 0 } 96 const int8x16_t filter[2] = { 97 vcombine_s8(filter_0, vext_s8(filter_0, filter_0, 7)), 98 vcombine_s8(filter_1, vext_s8(filter_1, filter_1, 7)) 99 }; 100 101 // A shim of 1 << (ROUND0_BITS - 1) enables us to simplify computation in the 102 // convolution kernels: Adding this shim enables us to use a single rounding 103 // right shift by FILTER_BITS instead of two rounding right shifts: first by 104 // ROUND0_BITS, and then subsequently by FILTER_BITS - ROUND0_BITS. 105 const int32x4_t horiz_const = vdupq_n_s32(1 << (ROUND0_BITS - 1)); 106 107 if (w <= 4) { 108 const uint8x16_t permute_tbl = vld1q_u8(kMatMulPermuteTbl); 109 110 do { 111 uint8x16_t s0[2], s1[2], s2[2], s3[2]; 112 load_u8_16x4(src, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); 113 load_u8_16x4(src + 6, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); 114 115 int16x4_t d0 = convolve12_4_x(s0, filter, permute_tbl, horiz_const); 116 int16x4_t d1 = convolve12_4_x(s1, filter, permute_tbl, horiz_const); 117 int16x4_t d2 = convolve12_4_x(s2, filter, permute_tbl, horiz_const); 118 int16x4_t d3 = convolve12_4_x(s3, filter, permute_tbl, horiz_const); 119 120 uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); 121 uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); 122 123 store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); 124 store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); 125 126 dst += 4 * dst_stride; 127 src += 4 * src_stride; 128 h -= 4; 129 } while (h != 0); 130 } else { 131 const uint8x16x2_t permute_tbl = vld1q_u8_x2(kMatMulPermuteTbl); 132 133 do { 134 const uint8_t *s = src; 135 uint8_t *d = dst; 136 int width = w; 137 138 do { 139 uint8x16_t s0[2], s1[2], s2[2], s3[2]; 140 load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); 141 load_u8_16x4(s + 6, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); 142 143 uint8x8_t d0 = convolve12_8_x(s0, filter, permute_tbl, horiz_const); 144 uint8x8_t d1 = convolve12_8_x(s1, filter, permute_tbl, horiz_const); 145 uint8x8_t d2 = convolve12_8_x(s2, filter, permute_tbl, horiz_const); 146 uint8x8_t d3 = convolve12_8_x(s3, filter, permute_tbl, horiz_const); 147 148 store_u8_8x4(d, dst_stride, d0, d1, d2, d3); 149 150 s += 8; 151 d += 8; 152 width -= 8; 153 } while (width != 0); 154 src += 4 * src_stride; 155 dst += 4 * dst_stride; 156 h -= 4; 157 } while (h != 0); 158 } 159 } 160 161 static inline uint8x8_t convolve8_8_x(uint8x16_t samples, const int8x8_t filter, 162 const uint8x16x3_t permute_tbl, 163 const int32x4_t horiz_const) { 164 // Permute samples ready for dot product. 165 // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } 166 // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } 167 // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } 168 uint8x16_t perm_samples[3] = { vqtbl1q_u8(samples, permute_tbl.val[0]), 169 vqtbl1q_u8(samples, permute_tbl.val[1]), 170 vqtbl1q_u8(samples, permute_tbl.val[2]) }; 171 172 int32x4_t sum0123 = vusdotq_lane_s32(horiz_const, perm_samples[0], filter, 0); 173 sum0123 = vusdotq_lane_s32(sum0123, perm_samples[1], filter, 1); 174 175 int32x4_t sum4567 = vusdotq_lane_s32(horiz_const, perm_samples[1], filter, 0); 176 sum4567 = vusdotq_lane_s32(sum4567, perm_samples[2], filter, 1); 177 178 int16x8_t sum_s16 = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567)); 179 // We halved the convolution filter values so - 1 from the right shift. 180 return vqrshrun_n_s16(sum_s16, FILTER_BITS - 1); 181 } 182 183 static inline void convolve_x_sr_8tap_neon_i8mm( 184 const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, 185 ptrdiff_t dst_stride, int width, int height, const int16_t *filter_x, 186 const int32x4_t horiz_const) { 187 // Filter values are even, so halve to reduce intermediate precision reqs. 188 const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(filter_x), 1); 189 const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); 190 191 do { 192 const uint8_t *s = src; 193 uint8_t *d = dst; 194 int w = width; 195 196 do { 197 uint8x16_t s0, s1, s2, s3; 198 load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); 199 200 uint8x8_t d0 = convolve8_8_x(s0, x_filter, permute_tbl, horiz_const); 201 uint8x8_t d1 = convolve8_8_x(s1, x_filter, permute_tbl, horiz_const); 202 uint8x8_t d2 = convolve8_8_x(s2, x_filter, permute_tbl, horiz_const); 203 uint8x8_t d3 = convolve8_8_x(s3, x_filter, permute_tbl, horiz_const); 204 205 store_u8_8x4(d, dst_stride, d0, d1, d2, d3); 206 207 s += 8; 208 d += 8; 209 w -= 8; 210 } while (w != 0); 211 src += 4 * src_stride; 212 dst += 4 * dst_stride; 213 height -= 4; 214 } while (height != 0); 215 } 216 217 static inline int16x4_t convolve6_4_x(uint8x16_t samples, 218 const int8x16_t filter, 219 const uint8x16_t permute_tbl, 220 const int32x4_t horiz_const) { 221 // Permute samples ready for matrix multiply. 222 // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } 223 uint8x16_t perm_samples = vqtbl1q_u8(samples, permute_tbl); 224 225 // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix 226 // (filter), destructively accumulating into the destination register. 227 int32x4_t sum = vusmmlaq_s32(horiz_const, perm_samples, filter); 228 229 // Further narrowing and packing is performed by the caller. 230 return vmovn_s32(sum); 231 } 232 233 static inline uint8x8_t convolve6_8_x(uint8x16_t samples, 234 const int8x16_t filter, 235 const uint8x16x2_t permute_tbl, 236 const int32x4_t horiz_const) { 237 // Permute samples ready for matrix multiply. 238 // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } 239 // { 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 } 240 uint8x16_t perm_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), 241 vqtbl1q_u8(samples, permute_tbl.val[1]) }; 242 243 // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix 244 // (filter), destructively accumulating into the destination register. 245 int32x4_t sum0123 = vusmmlaq_s32(horiz_const, perm_samples[0], filter); 246 int32x4_t sum4567 = vusmmlaq_s32(horiz_const, perm_samples[1], filter); 247 248 int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567)); 249 // We halved the convolution filter values so - 1 from the right shift. 250 return vqrshrun_n_s16(sum, FILTER_BITS - 1); 251 } 252 253 static inline void convolve_x_sr_6tap_neon_i8mm( 254 const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, 255 ptrdiff_t dst_stride, int width, int height, const int16_t *filter_x, 256 const int32x4_t horiz_const) { 257 // Filter values are even, so halve to reduce intermediate precision reqs. 258 const int8x8_t x_filter_s8 = vshrn_n_s16(vld1q_s16(filter_x), 1); 259 // Stagger the filter for use with the matrix multiply instructions. 260 // { f0, f1, f2, f3, f4, f5, 0, 0, 0, f0, f1, f2, f3, f4, f5, 0 } 261 const int8x16_t x_filter = 262 vcombine_s8(vext_s8(x_filter_s8, x_filter_s8, 1), x_filter_s8); 263 264 if (width == 4) { 265 const uint8x16_t permute_tbl = vld1q_u8(kMatMulPermuteTbl); 266 do { 267 uint8x16_t s0, s1, s2, s3; 268 load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); 269 270 int16x4_t t0 = convolve6_4_x(s0, x_filter, permute_tbl, horiz_const); 271 int16x4_t t1 = convolve6_4_x(s1, x_filter, permute_tbl, horiz_const); 272 int16x4_t t2 = convolve6_4_x(s2, x_filter, permute_tbl, horiz_const); 273 int16x4_t t3 = convolve6_4_x(s3, x_filter, permute_tbl, horiz_const); 274 // We halved the filter values so -1 from right shift. 275 uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); 276 uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); 277 278 store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); 279 store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); 280 281 src += 4 * src_stride; 282 dst += 4 * dst_stride; 283 height -= 4; 284 } while (height != 0); 285 } else { 286 const uint8x16x2_t permute_tbl = vld1q_u8_x2(kMatMulPermuteTbl); 287 do { 288 const uint8_t *s = src; 289 uint8_t *d = dst; 290 int w = width; 291 292 do { 293 uint8x16_t s0, s1, s2, s3; 294 load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); 295 296 uint8x8_t d0 = convolve6_8_x(s0, x_filter, permute_tbl, horiz_const); 297 uint8x8_t d1 = convolve6_8_x(s1, x_filter, permute_tbl, horiz_const); 298 uint8x8_t d2 = convolve6_8_x(s2, x_filter, permute_tbl, horiz_const); 299 uint8x8_t d3 = convolve6_8_x(s3, x_filter, permute_tbl, horiz_const); 300 301 store_u8_8x4(d, dst_stride, d0, d1, d2, d3); 302 303 s += 8; 304 d += 8; 305 w -= 8; 306 } while (w != 0); 307 src += 4 * src_stride; 308 dst += 4 * dst_stride; 309 height -= 4; 310 } while (height != 0); 311 } 312 } 313 314 void av1_convolve_x_sr_neon_i8mm(const uint8_t *src, int src_stride, 315 uint8_t *dst, int dst_stride, int w, int h, 316 const InterpFilterParams *filter_params_x, 317 const int subpel_x_qn, 318 ConvolveParams *conv_params) { 319 if (w == 2 || h == 2) { 320 av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x, 321 subpel_x_qn, conv_params); 322 return; 323 } 324 325 const uint8_t horiz_offset = filter_params_x->taps / 2 - 1; 326 src -= horiz_offset; 327 328 const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( 329 filter_params_x, subpel_x_qn & SUBPEL_MASK); 330 331 int filter_taps = get_filter_tap(filter_params_x, subpel_x_qn & SUBPEL_MASK); 332 333 // A shim of 1 << (ROUND0_BITS - 1) enables us to simplify computation in the 334 // convolution kernels: Adding this shim enables us to use a single rounding 335 // right shift by FILTER_BITS instead of two rounding right shifts: first by 336 // ROUND0_BITS, and then subsequently by FILTER_BITS - ROUND0_BITS. 337 // Halve the total because we will halve the filter values. 338 const int32x4_t horiz_const = vdupq_n_s32((1 << ((ROUND0_BITS - 1)) / 2)); 339 340 if (filter_taps <= 6) { 341 convolve_x_sr_6tap_neon_i8mm(src + 1, src_stride, dst, dst_stride, w, h, 342 x_filter_ptr, horiz_const); 343 return; 344 } 345 346 if (filter_taps > 8) { 347 convolve_x_sr_12tap_neon_i8mm(src, src_stride, dst, dst_stride, w, h, 348 x_filter_ptr); 349 return; 350 } 351 352 convolve_x_sr_8tap_neon_i8mm(src, src_stride, dst, dst_stride, w, h, 353 x_filter_ptr, horiz_const); 354 } 355 356 static inline int16x4_t convolve12_4_y(const uint8x16_t s0, const uint8x16_t s1, 357 const uint8x16_t s2, 358 const int8x8_t filters_0_7, 359 const int8x8_t filters_4_11) { 360 int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), s0, filters_0_7, 0); 361 sum = vusdotq_lane_s32(sum, s1, filters_0_7, 1); 362 sum = vusdotq_lane_s32(sum, s2, filters_4_11, 1); 363 364 // Further narrowing and packing is performed by the caller. 365 return vshrn_n_s32(sum, 1); 366 } 367 368 static inline uint8x8_t convolve12_8_y( 369 const uint8x16_t s0_lo, const uint8x16_t s0_hi, const uint8x16_t s1_lo, 370 const uint8x16_t s1_hi, const uint8x16_t s2_lo, const uint8x16_t s2_hi, 371 const int8x8_t filters_0_7, const int8x8_t filters_4_11) { 372 int32x4_t sum0123 = vusdotq_lane_s32(vdupq_n_s32(0), s0_lo, filters_0_7, 0); 373 sum0123 = vusdotq_lane_s32(sum0123, s1_lo, filters_0_7, 1); 374 sum0123 = vusdotq_lane_s32(sum0123, s2_lo, filters_4_11, 1); 375 376 int32x4_t sum4567 = vusdotq_lane_s32(vdupq_n_s32(0), s0_hi, filters_0_7, 0); 377 sum4567 = vusdotq_lane_s32(sum4567, s1_hi, filters_0_7, 1); 378 sum4567 = vusdotq_lane_s32(sum4567, s2_hi, filters_4_11, 1); 379 380 // Narrow and re-pack. 381 int16x8_t sum = 382 vcombine_s16(vshrn_n_s32(sum0123, 1), vshrn_n_s32(sum4567, 1)); 383 return vqrshrun_n_s16(sum, FILTER_BITS - 1); 384 } 385 386 static inline void convolve_y_sr_12tap_neon_i8mm(const uint8_t *src_ptr, 387 int src_stride, 388 uint8_t *dst_ptr, 389 int dst_stride, int w, int h, 390 const int16_t *y_filter_ptr) { 391 // The no-op filter should never be used here. 392 assert(y_filter_ptr[5] != 128); 393 394 const int8x8_t filter_0_7 = vmovn_s16(vld1q_s16(y_filter_ptr)); 395 const int8x8_t filter_4_11 = vmovn_s16(vld1q_s16(y_filter_ptr + 4)); 396 397 const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); 398 399 if (w == 4) { 400 uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA; 401 load_u8_8x11(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, 402 &s8, &s9, &sA); 403 src_ptr += 11 * src_stride; 404 405 // This operation combines a conventional transpose and the sample permute 406 // (see horizontal case) required before computing the dot product. 407 uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s789A; 408 transpose_concat_elems_u8_4x4(s0, s1, s2, s3, &s0123); 409 transpose_concat_elems_u8_4x4(s1, s2, s3, s4, &s1234); 410 transpose_concat_elems_u8_4x4(s2, s3, s4, s5, &s2345); 411 transpose_concat_elems_u8_4x4(s3, s4, s5, s6, &s3456); 412 transpose_concat_elems_u8_4x4(s4, s5, s6, s7, &s4567); 413 transpose_concat_elems_u8_4x4(s5, s6, s7, s8, &s5678); 414 transpose_concat_elems_u8_4x4(s6, s7, s8, s9, &s6789); 415 transpose_concat_elems_u8_4x4(s7, s8, s9, sA, &s789A); 416 417 do { 418 uint8x8_t sB, sC, sD, sE; 419 load_u8_8x4(src_ptr, src_stride, &sB, &sC, &sD, &sE); 420 421 uint8x16_t s89AB, s9ABC, sABCD, sBCDE; 422 transpose_concat_elems_u8_4x4(sB, sC, sD, sE, &sBCDE); 423 424 // Merge new data into block from previous iteration. 425 uint8x16x2_t samples_LUT = { { s789A, sBCDE } }; 426 s89AB = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); 427 s9ABC = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); 428 sABCD = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); 429 430 int16x4_t d0 = 431 convolve12_4_y(s0123, s4567, s89AB, filter_0_7, filter_4_11); 432 int16x4_t d1 = 433 convolve12_4_y(s1234, s5678, s9ABC, filter_0_7, filter_4_11); 434 int16x4_t d2 = 435 convolve12_4_y(s2345, s6789, sABCD, filter_0_7, filter_4_11); 436 int16x4_t d3 = 437 convolve12_4_y(s3456, s789A, sBCDE, filter_0_7, filter_4_11); 438 uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); 439 uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); 440 441 store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); 442 store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); 443 444 // Prepare block for next iteration - re-using as much as possible. 445 // Shuffle everything up four rows. 446 s0123 = s4567; 447 s1234 = s5678; 448 s2345 = s6789; 449 s3456 = s789A; 450 s4567 = s89AB; 451 s5678 = s9ABC; 452 s6789 = sABCD; 453 s789A = sBCDE; 454 455 src_ptr += 4 * src_stride; 456 dst_ptr += 4 * dst_stride; 457 h -= 4; 458 } while (h != 0); 459 } else { 460 do { 461 int height = h; 462 const uint8_t *s = src_ptr; 463 uint8_t *d = dst_ptr; 464 465 uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA; 466 load_u8_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, 467 &s9, &sA); 468 s += 11 * src_stride; 469 470 // This operation combines a conventional transpose and the sample 471 // permute (see horizontal case) required before computing the dot 472 // product. 473 uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, 474 s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, 475 s6789_hi, s789A_lo, s789A_hi; 476 transpose_concat_elems_u8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); 477 transpose_concat_elems_u8_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); 478 transpose_concat_elems_u8_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); 479 transpose_concat_elems_u8_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); 480 transpose_concat_elems_u8_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi); 481 transpose_concat_elems_u8_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi); 482 transpose_concat_elems_u8_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi); 483 transpose_concat_elems_u8_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi); 484 485 do { 486 uint8x8_t sB, sC, sD, sE; 487 load_u8_8x4(s, src_stride, &sB, &sC, &sD, &sE); 488 489 uint8x16_t s89AB_lo, s89AB_hi, s9ABC_lo, s9ABC_hi, sABCD_lo, sABCD_hi, 490 sBCDE_lo, sBCDE_hi; 491 transpose_concat_elems_u8_8x4(sB, sC, sD, sE, &sBCDE_lo, &sBCDE_hi); 492 493 // Merge new data into block from previous iteration. 494 uint8x16x2_t samples_LUT_lo = { { s789A_lo, sBCDE_lo } }; 495 s89AB_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[0]); 496 s9ABC_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[1]); 497 sABCD_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[2]); 498 499 uint8x16x2_t samples_LUT_hi = { { s789A_hi, sBCDE_hi } }; 500 s89AB_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[0]); 501 s9ABC_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[1]); 502 sABCD_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[2]); 503 504 uint8x8_t d0 = 505 convolve12_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, s89AB_lo, 506 s89AB_hi, filter_0_7, filter_4_11); 507 uint8x8_t d1 = 508 convolve12_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, s9ABC_lo, 509 s9ABC_hi, filter_0_7, filter_4_11); 510 uint8x8_t d2 = 511 convolve12_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, sABCD_lo, 512 sABCD_hi, filter_0_7, filter_4_11); 513 uint8x8_t d3 = 514 convolve12_8_y(s3456_lo, s3456_hi, s789A_lo, s789A_hi, sBCDE_lo, 515 sBCDE_hi, filter_0_7, filter_4_11); 516 517 store_u8_8x4(d, dst_stride, d0, d1, d2, d3); 518 519 // Prepare block for next iteration - re-using as much as possible. 520 // Shuffle everything up four rows. 521 s0123_lo = s4567_lo; 522 s0123_hi = s4567_hi; 523 s1234_lo = s5678_lo; 524 s1234_hi = s5678_hi; 525 s2345_lo = s6789_lo; 526 s2345_hi = s6789_hi; 527 s3456_lo = s789A_lo; 528 s3456_hi = s789A_hi; 529 s4567_lo = s89AB_lo; 530 s4567_hi = s89AB_hi; 531 s5678_lo = s9ABC_lo; 532 s5678_hi = s9ABC_hi; 533 s6789_lo = sABCD_lo; 534 s6789_hi = sABCD_hi; 535 s789A_lo = sBCDE_lo; 536 s789A_hi = sBCDE_hi; 537 538 s += 4 * src_stride; 539 d += 4 * dst_stride; 540 height -= 4; 541 } while (height != 0); 542 src_ptr += 8; 543 dst_ptr += 8; 544 w -= 8; 545 } while (w != 0); 546 } 547 } 548 549 static inline int16x4_t convolve8_4_y(const uint8x16_t s0, const uint8x16_t s1, 550 const int8x8_t filters) { 551 int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), s0, filters, 0); 552 sum = vusdotq_lane_s32(sum, s1, filters, 1); 553 554 // Further narrowing and packing is performed by the caller. 555 return vmovn_s32(sum); 556 } 557 558 static inline uint8x8_t convolve8_8_y(const uint8x16_t s0_lo, 559 const uint8x16_t s0_hi, 560 const uint8x16_t s1_lo, 561 const uint8x16_t s1_hi, 562 const int8x8_t filters) { 563 int32x4_t sum0123 = vusdotq_lane_s32(vdupq_n_s32(0), s0_lo, filters, 0); 564 sum0123 = vusdotq_lane_s32(sum0123, s1_lo, filters, 1); 565 566 int32x4_t sum4567 = vusdotq_lane_s32(vdupq_n_s32(0), s0_hi, filters, 0); 567 sum4567 = vusdotq_lane_s32(sum4567, s1_hi, filters, 1); 568 569 // Narrow and re-pack. 570 int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567)); 571 // We halved the filter values so -1 from right shift. 572 return vqrshrun_n_s16(sum, FILTER_BITS - 1); 573 } 574 575 static inline void convolve_y_sr_8tap_neon_i8mm(const uint8_t *src_ptr, 576 int src_stride, 577 uint8_t *dst_ptr, 578 int dst_stride, int w, int h, 579 const int16_t *y_filter_ptr) { 580 // Filter values are even, so halve to reduce intermediate precision reqs. 581 const int8x8_t filter = vshrn_n_s16(vld1q_s16(y_filter_ptr), 1); 582 583 const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); 584 585 if (w == 4) { 586 uint8x8_t s0, s1, s2, s3, s4, s5, s6; 587 load_u8_8x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); 588 src_ptr += 7 * src_stride; 589 590 // This operation combines a conventional transpose and the sample permute 591 // (see horizontal case) required before computing the dot product. 592 uint8x16_t s0123, s1234, s2345, s3456; 593 transpose_concat_elems_u8_4x4(s0, s1, s2, s3, &s0123); 594 transpose_concat_elems_u8_4x4(s1, s2, s3, s4, &s1234); 595 transpose_concat_elems_u8_4x4(s2, s3, s4, s5, &s2345); 596 transpose_concat_elems_u8_4x4(s3, s4, s5, s6, &s3456); 597 598 do { 599 uint8x8_t s7, s8, s9, sA; 600 load_u8_8x4(src_ptr, src_stride, &s7, &s8, &s9, &sA); 601 602 uint8x16_t s4567, s5678, s6789, s789A; 603 transpose_concat_elems_u8_4x4(s7, s8, s9, sA, &s789A); 604 605 // Merge new data into block from previous iteration. 606 uint8x16x2_t samples_LUT = { { s3456, s789A } }; 607 s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); 608 s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); 609 s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); 610 611 int16x4_t d0 = convolve8_4_y(s0123, s4567, filter); 612 int16x4_t d1 = convolve8_4_y(s1234, s5678, filter); 613 int16x4_t d2 = convolve8_4_y(s2345, s6789, filter); 614 int16x4_t d3 = convolve8_4_y(s3456, s789A, filter); 615 // We halved the filter values so -1 from right shift. 616 uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); 617 uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); 618 619 store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); 620 store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); 621 622 // Prepare block for next iteration - re-using as much as possible. 623 // Shuffle everything up four rows. 624 s0123 = s4567; 625 s1234 = s5678; 626 s2345 = s6789; 627 s3456 = s789A; 628 629 src_ptr += 4 * src_stride; 630 dst_ptr += 4 * dst_stride; 631 h -= 4; 632 } while (h != 0); 633 } else { 634 do { 635 int height = h; 636 const uint8_t *s = src_ptr; 637 uint8_t *d = dst_ptr; 638 639 uint8x8_t s0, s1, s2, s3, s4, s5, s6; 640 load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); 641 s += 7 * src_stride; 642 643 // This operation combines a conventional transpose and the sample 644 // permute (see horizontal case) required before computing the dot 645 // product. 646 uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, 647 s3456_lo, s3456_hi; 648 transpose_concat_elems_u8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); 649 transpose_concat_elems_u8_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); 650 transpose_concat_elems_u8_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); 651 transpose_concat_elems_u8_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); 652 653 do { 654 uint8x8_t s7, s8, s9, sA; 655 load_u8_8x4(s, src_stride, &s7, &s8, &s9, &sA); 656 657 uint8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi, 658 s789A_lo, s789A_hi; 659 transpose_concat_elems_u8_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi); 660 661 // Merge new data into block from previous iteration. 662 uint8x16x2_t samples_LUT_lo = { { s3456_lo, s789A_lo } }; 663 s4567_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[0]); 664 s5678_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[1]); 665 s6789_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[2]); 666 667 uint8x16x2_t samples_LUT_hi = { { s3456_hi, s789A_hi } }; 668 s4567_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[0]); 669 s5678_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[1]); 670 s6789_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[2]); 671 672 uint8x8_t d0 = 673 convolve8_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, filter); 674 uint8x8_t d1 = 675 convolve8_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, filter); 676 uint8x8_t d2 = 677 convolve8_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, filter); 678 uint8x8_t d3 = 679 convolve8_8_y(s3456_lo, s3456_hi, s789A_lo, s789A_hi, filter); 680 681 store_u8_8x4(d, dst_stride, d0, d1, d2, d3); 682 683 // Prepare block for next iteration - re-using as much as possible. 684 // Shuffle everything up four rows. 685 s0123_lo = s4567_lo; 686 s0123_hi = s4567_hi; 687 s1234_lo = s5678_lo; 688 s1234_hi = s5678_hi; 689 s2345_lo = s6789_lo; 690 s2345_hi = s6789_hi; 691 s3456_lo = s789A_lo; 692 s3456_hi = s789A_hi; 693 694 s += 4 * src_stride; 695 d += 4 * dst_stride; 696 height -= 4; 697 } while (height != 0); 698 src_ptr += 8; 699 dst_ptr += 8; 700 w -= 8; 701 } while (w != 0); 702 } 703 } 704 705 static inline int16x4_t convolve4_4_y(const uint8x16_t s0, 706 const int8x8_t filters) { 707 int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), s0, filters, 0); 708 709 // Further narrowing and packing is performed by the caller. 710 return vmovn_s32(sum); 711 } 712 713 static inline uint8x8_t convolve4_8_y(const uint8x16_t s0, const uint8x16_t s1, 714 const int8x8_t filters) { 715 int32x4_t sum0123 = vusdotq_lane_s32(vdupq_n_s32(0), s0, filters, 0); 716 int32x4_t sum4567 = vusdotq_lane_s32(vdupq_n_s32(0), s1, filters, 0); 717 718 // Narrow and re-pack. 719 int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567)); 720 // We halved the filter values so -1 from right shift. 721 return vqrshrun_n_s16(sum, FILTER_BITS - 1); 722 } 723 724 static inline void convolve_y_sr_4tap_neon_i8mm(const uint8_t *src_ptr, 725 int src_stride, 726 uint8_t *dst_ptr, 727 int dst_stride, int w, int h, 728 const int16_t *y_filter_ptr) { 729 // Filter values are even, so halve to reduce intermediate precision reqs. 730 const int16x8_t filter_s16 = 731 vcombine_s16(vld1_s16(y_filter_ptr + 2), vdup_n_s16(0)); 732 const int8x8_t filter = vshrn_n_s16(filter_s16, 1); 733 const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); 734 uint8x16x2_t samples_LUT; 735 736 if (w == 4) { 737 uint8x8_t s0, s1, s2, s3; 738 load_u8_8x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); 739 src_ptr += 4 * src_stride; 740 741 // This operation combines a conventional transpose and the sample permute 742 // required before computing the dot product. 743 uint8x16_t s0123; 744 transpose_concat_elems_u8_4x4(s0, s1, s2, s3, &s0123); 745 746 do { 747 uint8x8_t s4, s5, s6, s7; 748 load_u8_8x4(src_ptr, src_stride, &s4, &s5, &s6, &s7); 749 750 uint8x16_t s4567; 751 transpose_concat_elems_u8_4x4(s4, s5, s6, s7, &s4567); 752 753 // Merge new data into block from previous iteration. 754 samples_LUT.val[0] = s0123; 755 samples_LUT.val[1] = s4567; 756 uint8x16_t s1234 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); 757 uint8x16_t s2345 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); 758 uint8x16_t s3456 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); 759 760 int16x4_t d0 = convolve4_4_y(s0123, filter); 761 int16x4_t d1 = convolve4_4_y(s1234, filter); 762 int16x4_t d2 = convolve4_4_y(s2345, filter); 763 int16x4_t d3 = convolve4_4_y(s3456, filter); 764 // We halved the filter values so -1 from right shift. 765 uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); 766 uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); 767 768 store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); 769 store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); 770 771 // Prepare block for next iteration - re-using as much as possible. 772 // Shuffle everything up four rows. 773 s0123 = s4567; 774 775 src_ptr += 4 * src_stride; 776 dst_ptr += 4 * dst_stride; 777 h -= 4; 778 } while (h != 0); 779 } else { 780 do { 781 int height = h; 782 const uint8_t *s = src_ptr; 783 uint8_t *d = dst_ptr; 784 785 uint8x8_t s0, s1, s2, s3; 786 load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3); 787 s += 4 * src_stride; 788 789 // This operation combines a conventional transpose and the sample permute 790 // required before computing the dot product. 791 uint8x16_t s0123_lo, s0123_hi; 792 transpose_concat_elems_u8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); 793 794 do { 795 uint8x8_t s4, s5, s6, s7; 796 load_u8_8x4(s, src_stride, &s4, &s5, &s6, &s7); 797 798 uint8x16_t s4567_lo, s4567_hi; 799 transpose_concat_elems_u8_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi); 800 801 // Merge new data into block from previous iteration. 802 samples_LUT.val[0] = s0123_lo; 803 samples_LUT.val[1] = s4567_lo; 804 uint8x16_t s1234_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); 805 uint8x16_t s2345_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); 806 uint8x16_t s3456_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); 807 808 samples_LUT.val[0] = s0123_hi; 809 samples_LUT.val[1] = s4567_hi; 810 uint8x16_t s1234_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); 811 uint8x16_t s2345_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); 812 uint8x16_t s3456_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); 813 814 uint8x8_t d0 = convolve4_8_y(s0123_lo, s0123_hi, filter); 815 uint8x8_t d1 = convolve4_8_y(s1234_lo, s1234_hi, filter); 816 uint8x8_t d2 = convolve4_8_y(s2345_lo, s2345_hi, filter); 817 uint8x8_t d3 = convolve4_8_y(s3456_lo, s3456_hi, filter); 818 819 store_u8_8x4(d, dst_stride, d0, d1, d2, d3); 820 821 // Prepare block for next iteration - re-using as much as possible. 822 // Shuffle everything up four rows. 823 s0123_lo = s4567_lo; 824 s0123_hi = s4567_hi; 825 826 s += 4 * src_stride; 827 d += 4 * dst_stride; 828 height -= 4; 829 } while (height != 0); 830 src_ptr += 8; 831 dst_ptr += 8; 832 w -= 8; 833 } while (w != 0); 834 } 835 } 836 837 void av1_convolve_y_sr_neon_i8mm(const uint8_t *src, int src_stride, 838 uint8_t *dst, int dst_stride, int w, int h, 839 const InterpFilterParams *filter_params_y, 840 const int subpel_y_qn) { 841 if (w == 2 || h == 2) { 842 av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_y, 843 subpel_y_qn); 844 return; 845 } 846 847 const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); 848 const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( 849 filter_params_y, subpel_y_qn & SUBPEL_MASK); 850 851 if (y_filter_taps <= 4) { 852 convolve_y_sr_4tap_neon_i8mm(src - src_stride, src_stride, dst, dst_stride, 853 w, h, y_filter_ptr); 854 } else if (y_filter_taps == 12) { 855 convolve_y_sr_12tap_neon_i8mm(src - 5 * src_stride, src_stride, dst, 856 dst_stride, w, h, y_filter_ptr); 857 } else { 858 // 6-tap or 8-tap. 859 convolve_y_sr_8tap_neon_i8mm(src - 3 * src_stride, src_stride, dst, 860 dst_stride, w, h, y_filter_ptr); 861 } 862 } 863 864 static inline int16x8_t convolve8_8_2d_h(uint8x16_t samples, 865 const int8x8_t filters, 866 const uint8x16x3_t permute_tbl, 867 const int32x4_t horiz_const) { 868 // Permute samples ready for dot product. 869 // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } 870 // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } 871 // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } 872 uint8x16_t perm_samples[3] = { vqtbl1q_u8(samples, permute_tbl.val[0]), 873 vqtbl1q_u8(samples, permute_tbl.val[1]), 874 vqtbl1q_u8(samples, permute_tbl.val[2]) }; 875 876 int32x4_t sum0123 = 877 vusdotq_lane_s32(horiz_const, perm_samples[0], filters, 0); 878 sum0123 = vusdotq_lane_s32(sum0123, perm_samples[1], filters, 1); 879 880 int32x4_t sum4567 = 881 vusdotq_lane_s32(horiz_const, perm_samples[1], filters, 0); 882 sum4567 = vusdotq_lane_s32(sum4567, perm_samples[2], filters, 1); 883 884 // Narrow and re-pack. 885 // We halved the convolution filter values so -1 from the right shift. 886 return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1), 887 vshrn_n_s32(sum4567, ROUND0_BITS - 1)); 888 } 889 890 static inline void convolve_2d_sr_horiz_8tap_neon_i8mm( 891 const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w, 892 int im_h, const int16_t *x_filter_ptr) { 893 // Filter values are even, so halve to reduce intermediate precision reqs. 894 const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); 895 896 const int bd = 8; 897 // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding 898 // shifts - which are generally faster than rounding shifts on modern CPUs. 899 // The outermost -1 is needed because we halved the filter values. 900 const int32x4_t horiz_const = vdupq_n_s32((1 << (bd + FILTER_BITS - 2)) + 901 (1 << ((ROUND0_BITS - 1) - 1))); 902 903 const uint8_t *src_ptr = src; 904 int16_t *dst_ptr = im_block; 905 int dst_stride = im_stride; 906 int height = im_h; 907 908 const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); 909 do { 910 const uint8_t *s = src_ptr; 911 int16_t *d = dst_ptr; 912 int width = w; 913 914 do { 915 uint8x16_t s0, s1, s2, s3; 916 load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); 917 918 int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const); 919 int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, permute_tbl, horiz_const); 920 int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, permute_tbl, horiz_const); 921 int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, permute_tbl, horiz_const); 922 923 store_s16_8x4(d, dst_stride, d0, d1, d2, d3); 924 925 s += 8; 926 d += 8; 927 width -= 8; 928 } while (width != 0); 929 src_ptr += 4 * src_stride; 930 dst_ptr += 4 * dst_stride; 931 height -= 4; 932 } while (height > 4); 933 934 do { 935 const uint8_t *s = src_ptr; 936 int16_t *d = dst_ptr; 937 int width = w; 938 939 do { 940 uint8x16_t s0 = vld1q_u8(s); 941 int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const); 942 vst1q_s16(d, d0); 943 944 s += 8; 945 d += 8; 946 width -= 8; 947 } while (width != 0); 948 src_ptr += src_stride; 949 dst_ptr += dst_stride; 950 } while (--height != 0); 951 } 952 953 static inline int16x4_t convolve4_4_2d_h(const uint8x16_t samples, 954 const int8x8_t filters, 955 const uint8x16_t permute_tbl, 956 const int32x4_t horiz_const) { 957 // Permute samples ready for dot product. 958 // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } 959 uint8x16_t perm_samples = vqtbl1q_u8(samples, permute_tbl); 960 961 int32x4_t sum = vusdotq_lane_s32(horiz_const, perm_samples, filters, 0); 962 963 // We halved the convolution filter values so -1 from the right shift. 964 return vshrn_n_s32(sum, ROUND0_BITS - 1); 965 } 966 967 static inline int16x8_t convolve4_8_2d_h(const uint8x16_t samples, 968 const int8x8_t filters, 969 const uint8x16x2_t permute_tbl, 970 const int32x4_t horiz_const) { 971 // Permute samples ready for dot product. 972 // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } 973 // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } 974 uint8x16_t perm_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), 975 vqtbl1q_u8(samples, permute_tbl.val[1]) }; 976 977 int32x4_t sum0123 = 978 vusdotq_lane_s32(horiz_const, perm_samples[0], filters, 0); 979 int32x4_t sum4567 = 980 vusdotq_lane_s32(horiz_const, perm_samples[1], filters, 0); 981 982 // Narrow and re-pack. 983 // We halved the filter values so -1 from right shift. 984 return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1), 985 vshrn_n_s32(sum4567, ROUND0_BITS - 1)); 986 } 987 988 static inline void convolve_2d_sr_horiz_4tap_neon_i8mm( 989 const uint8_t *src, int src_stride, int16_t *dst, int dst_stride, int width, 990 int height, const int16_t *filter_x) { 991 const int bd = 8; 992 const int16x4_t x_filter = vld1_s16(filter_x + 2); 993 // All 4-tap and bilinear filter values are even, so halve them to reduce 994 // intermediate precision requirements. 995 const int8x8_t filter = vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1); 996 997 // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding 998 // shifts - which are generally faster than rounding shifts on modern CPUs. 999 // Halve the total because we halved the filter values. 1000 const int32x4_t horiz_const = vdupq_n_s32( 1001 (((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))) / 2)); 1002 1003 if (width == 4) { 1004 const uint8x16_t perm_tbl = vld1q_u8(kDotProdPermuteTbl); 1005 do { 1006 uint8x16_t s0, s1, s2, s3; 1007 load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); 1008 1009 int16x4_t d0 = convolve4_4_2d_h(s0, filter, perm_tbl, horiz_const); 1010 int16x4_t d1 = convolve4_4_2d_h(s1, filter, perm_tbl, horiz_const); 1011 int16x4_t d2 = convolve4_4_2d_h(s2, filter, perm_tbl, horiz_const); 1012 int16x4_t d3 = convolve4_4_2d_h(s3, filter, perm_tbl, horiz_const); 1013 1014 store_s16_4x4(dst, dst_stride, d0, d1, d2, d3); 1015 1016 src += 4 * src_stride; 1017 dst += 4 * dst_stride; 1018 height -= 4; 1019 } while (height > 4); 1020 1021 do { 1022 uint8x16_t s0 = vld1q_u8(src); 1023 int16x4_t d0 = convolve4_4_2d_h(s0, filter, perm_tbl, horiz_const); 1024 vst1_s16(dst, d0); 1025 1026 src += src_stride; 1027 dst += dst_stride; 1028 } while (--height != 0); 1029 } else { 1030 const uint8x16x2_t perm_tbl = vld1q_u8_x2(kDotProdPermuteTbl); 1031 do { 1032 int w = width; 1033 const uint8_t *s = src; 1034 int16_t *d = dst; 1035 1036 do { 1037 uint8x16_t s0, s1, s2, s3; 1038 load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); 1039 1040 int16x8_t d0 = convolve4_8_2d_h(s0, filter, perm_tbl, horiz_const); 1041 int16x8_t d1 = convolve4_8_2d_h(s1, filter, perm_tbl, horiz_const); 1042 int16x8_t d2 = convolve4_8_2d_h(s2, filter, perm_tbl, horiz_const); 1043 int16x8_t d3 = convolve4_8_2d_h(s3, filter, perm_tbl, horiz_const); 1044 1045 store_s16_8x4(d, dst_stride, d0, d1, d2, d3); 1046 1047 s += 8; 1048 d += 8; 1049 w -= 8; 1050 } while (w != 0); 1051 src += 4 * src_stride; 1052 dst += 4 * dst_stride; 1053 height -= 4; 1054 } while (height > 4); 1055 1056 do { 1057 const uint8_t *s = src; 1058 int16_t *d = dst; 1059 int w = width; 1060 1061 do { 1062 uint8x16_t s0 = vld1q_u8(s); 1063 int16x8_t d0 = convolve4_8_2d_h(s0, filter, perm_tbl, horiz_const); 1064 vst1q_s16(d, d0); 1065 1066 s += 8; 1067 d += 8; 1068 w -= 8; 1069 } while (w != 0); 1070 src += src_stride; 1071 dst += dst_stride; 1072 } while (--height != 0); 1073 } 1074 } 1075 1076 static inline int16x4_t convolve6_4_2d_h(uint8x16_t samples, 1077 const int8x16_t filter, 1078 const uint8x16_t permute_tbl, 1079 const int32x4_t horiz_const) { 1080 // Permute samples ready for matrix multiply. 1081 // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } 1082 uint8x16_t perm_samples = vqtbl1q_u8(samples, permute_tbl); 1083 1084 // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix 1085 // (filter), destructively accumulating into the destination register. 1086 int32x4_t sum = vusmmlaq_s32(horiz_const, perm_samples, filter); 1087 1088 // We halved the convolution filter values so -1 from the right shift. 1089 return vshrn_n_s32(sum, ROUND0_BITS - 1); 1090 } 1091 1092 static inline int16x8_t convolve6_8_2d_h(uint8x16_t samples, 1093 const int8x16_t filter, 1094 const uint8x16x2_t permute_tbl, 1095 const int32x4_t horiz_const) { 1096 // Permute samples ready for matrix multiply. 1097 // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } 1098 // { 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 } 1099 uint8x16_t perm_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), 1100 vqtbl1q_u8(samples, permute_tbl.val[1]) }; 1101 1102 // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix 1103 // (filter), destructively accumulating into the destination register. 1104 int32x4_t sum0123 = vusmmlaq_s32(horiz_const, perm_samples[0], filter); 1105 int32x4_t sum4567 = vusmmlaq_s32(horiz_const, perm_samples[1], filter); 1106 1107 // Narrow and re-pack. 1108 // We halved the convolution filter values so -1 from the right shift. 1109 return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1), 1110 vshrn_n_s32(sum4567, ROUND0_BITS - 1)); 1111 } 1112 1113 static inline void convolve_2d_sr_6tap_neon_i8mm(const uint8_t *src, 1114 int src_stride, uint8_t *dst, 1115 int dst_stride, int w, int h, 1116 const int16_t *x_filter_ptr, 1117 const int16_t *y_filter_ptr) { 1118 const int16x8_t y_filter = vld1q_s16(y_filter_ptr); 1119 // Filter values are even, so halve to reduce intermediate precision reqs. 1120 const int8x8_t x_filter_s8 = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); 1121 // Stagger the filter for use with the matrix multiply instructions. 1122 // { f0, f1, f2, f3, f4, f5, 0, 0, 0, f0, f1, f2, f3, f4, f5, 0 } 1123 const int8x16_t x_filter = 1124 vcombine_s8(vext_s8(x_filter_s8, x_filter_s8, 1), x_filter_s8); 1125 1126 const int bd = 8; 1127 // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding 1128 // shifts in convolution kernels - which are generally faster than rounding 1129 // shifts on modern CPUs. The outermost -1 is needed because we halved the 1130 // filter values. 1131 const int32x4_t horiz_const = vdupq_n_s32((1 << (bd + FILTER_BITS - 2)) + 1132 (1 << ((ROUND0_BITS - 1) - 1))); 1133 const int16x8_t vert_const = vdupq_n_s16(1 << (bd - 1)); 1134 const uint8x16x2_t permute_tbl = vld1q_u8_x2(kMatMulPermuteTbl); 1135 1136 do { 1137 const uint8_t *s = src; 1138 uint8_t *d = dst; 1139 int height = h; 1140 1141 uint8x16_t h_s0, h_s1, h_s2, h_s3, h_s4; 1142 load_u8_16x5(s, src_stride, &h_s0, &h_s1, &h_s2, &h_s3, &h_s4); 1143 s += 5 * src_stride; 1144 1145 int16x8_t v_s0 = convolve6_8_2d_h(h_s0, x_filter, permute_tbl, horiz_const); 1146 int16x8_t v_s1 = convolve6_8_2d_h(h_s1, x_filter, permute_tbl, horiz_const); 1147 int16x8_t v_s2 = convolve6_8_2d_h(h_s2, x_filter, permute_tbl, horiz_const); 1148 int16x8_t v_s3 = convolve6_8_2d_h(h_s3, x_filter, permute_tbl, horiz_const); 1149 int16x8_t v_s4 = convolve6_8_2d_h(h_s4, x_filter, permute_tbl, horiz_const); 1150 1151 do { 1152 uint8x16_t h_s5, h_s6, h_s7, h_s8; 1153 load_u8_16x4(s, src_stride, &h_s5, &h_s6, &h_s7, &h_s8); 1154 1155 int16x8_t v_s5 = 1156 convolve6_8_2d_h(h_s5, x_filter, permute_tbl, horiz_const); 1157 int16x8_t v_s6 = 1158 convolve6_8_2d_h(h_s6, x_filter, permute_tbl, horiz_const); 1159 int16x8_t v_s7 = 1160 convolve6_8_2d_h(h_s7, x_filter, permute_tbl, horiz_const); 1161 int16x8_t v_s8 = 1162 convolve6_8_2d_h(h_s8, x_filter, permute_tbl, horiz_const); 1163 1164 uint8x8_t d0 = convolve6_8_2d_v(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, 1165 y_filter, vert_const); 1166 uint8x8_t d1 = convolve6_8_2d_v(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, 1167 y_filter, vert_const); 1168 uint8x8_t d2 = convolve6_8_2d_v(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7, 1169 y_filter, vert_const); 1170 uint8x8_t d3 = convolve6_8_2d_v(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8, 1171 y_filter, vert_const); 1172 1173 store_u8_8x4(d, dst_stride, d0, d1, d2, d3); 1174 1175 v_s0 = v_s4; 1176 v_s1 = v_s5; 1177 v_s2 = v_s6; 1178 v_s3 = v_s7; 1179 v_s4 = v_s8; 1180 1181 s += 4 * src_stride; 1182 d += 4 * dst_stride; 1183 height -= 4; 1184 } while (height != 0); 1185 src += 8; 1186 dst += 8; 1187 w -= 8; 1188 } while (w != 0); 1189 } 1190 1191 static inline void convolve_2d_sr_6tap_4tap_neon_i8mm( 1192 const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, 1193 int h, const int16_t *x_filter_ptr, const int16_t *y_filter_ptr) { 1194 const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2); 1195 // Filter values are even, so halve to reduce intermediate precision reqs. 1196 const int8x8_t x_filter_s8 = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); 1197 // Stagger the filter for use with the matrix multiply instructions. 1198 // { f0, f1, f2, f3, f4, f5, 0, 0, 0, f0, f1, f2, f3, f4, f5, 0 } 1199 const int8x16_t x_filter = 1200 vcombine_s8(vext_s8(x_filter_s8, x_filter_s8, 1), x_filter_s8); 1201 1202 const int bd = 8; 1203 // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding 1204 // shifts - which are generally faster than rounding shifts on modern CPUs. 1205 // Halve the total because we halved the filter values. 1206 const int32x4_t horiz_const = vdupq_n_s32( 1207 ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))) / 2); 1208 const int16x8_t vert_const = vdupq_n_s16(1 << (bd - 1)); 1209 1210 if (w == 4) { 1211 const uint8x16_t permute_tbl = vld1q_u8(kMatMulPermuteTbl); 1212 uint8x16_t h_s0, h_s1, h_s2; 1213 load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2); 1214 1215 int16x4_t v_s0 = convolve6_4_2d_h(h_s0, x_filter, permute_tbl, horiz_const); 1216 int16x4_t v_s1 = convolve6_4_2d_h(h_s1, x_filter, permute_tbl, horiz_const); 1217 int16x4_t v_s2 = convolve6_4_2d_h(h_s2, x_filter, permute_tbl, horiz_const); 1218 1219 src += 3 * src_stride; 1220 1221 do { 1222 uint8x16_t h_s3, h_s4, h_s5, h_s6; 1223 load_u8_16x4(src, src_stride, &h_s3, &h_s4, &h_s5, &h_s6); 1224 1225 int16x4_t v_s3 = 1226 convolve6_4_2d_h(h_s3, x_filter, permute_tbl, horiz_const); 1227 int16x4_t v_s4 = 1228 convolve6_4_2d_h(h_s4, x_filter, permute_tbl, horiz_const); 1229 int16x4_t v_s5 = 1230 convolve6_4_2d_h(h_s5, x_filter, permute_tbl, horiz_const); 1231 int16x4_t v_s6 = 1232 convolve6_4_2d_h(h_s6, x_filter, permute_tbl, horiz_const); 1233 1234 int16x4_t d0 = convolve4_4_2d_v(v_s0, v_s1, v_s2, v_s3, y_filter); 1235 int16x4_t d1 = convolve4_4_2d_v(v_s1, v_s2, v_s3, v_s4, y_filter); 1236 int16x4_t d2 = convolve4_4_2d_v(v_s2, v_s3, v_s4, v_s5, y_filter); 1237 int16x4_t d3 = convolve4_4_2d_v(v_s3, v_s4, v_s5, v_s6, y_filter); 1238 1239 uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), vert_const)); 1240 uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), vert_const)); 1241 1242 store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); 1243 store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); 1244 1245 v_s0 = v_s4; 1246 v_s1 = v_s5; 1247 v_s2 = v_s6; 1248 1249 src += 4 * src_stride; 1250 dst += 4 * dst_stride; 1251 h -= 4; 1252 } while (h != 0); 1253 } else { 1254 const uint8x16x2_t permute_tbl = vld1q_u8_x2(kMatMulPermuteTbl); 1255 1256 do { 1257 int height = h; 1258 const uint8_t *s = src; 1259 uint8_t *d = dst; 1260 1261 uint8x16_t h_s0, h_s1, h_s2; 1262 load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2); 1263 1264 int16x8_t v_s0 = 1265 convolve6_8_2d_h(h_s0, x_filter, permute_tbl, horiz_const); 1266 int16x8_t v_s1 = 1267 convolve6_8_2d_h(h_s1, x_filter, permute_tbl, horiz_const); 1268 int16x8_t v_s2 = 1269 convolve6_8_2d_h(h_s2, x_filter, permute_tbl, horiz_const); 1270 1271 s += 3 * src_stride; 1272 1273 do { 1274 uint8x16_t h_s3, h_s4, h_s5, h_s6; 1275 load_u8_16x4(s, src_stride, &h_s3, &h_s4, &h_s5, &h_s6); 1276 1277 int16x8_t v_s3 = 1278 convolve6_8_2d_h(h_s3, x_filter, permute_tbl, horiz_const); 1279 int16x8_t v_s4 = 1280 convolve6_8_2d_h(h_s4, x_filter, permute_tbl, horiz_const); 1281 int16x8_t v_s5 = 1282 convolve6_8_2d_h(h_s5, x_filter, permute_tbl, horiz_const); 1283 int16x8_t v_s6 = 1284 convolve6_8_2d_h(h_s6, x_filter, permute_tbl, horiz_const); 1285 1286 uint8x8_t d0 = 1287 convolve4_8_2d_v(v_s0, v_s1, v_s2, v_s3, y_filter, vert_const); 1288 uint8x8_t d1 = 1289 convolve4_8_2d_v(v_s1, v_s2, v_s3, v_s4, y_filter, vert_const); 1290 uint8x8_t d2 = 1291 convolve4_8_2d_v(v_s2, v_s3, v_s4, v_s5, y_filter, vert_const); 1292 uint8x8_t d3 = 1293 convolve4_8_2d_v(v_s3, v_s4, v_s5, v_s6, y_filter, vert_const); 1294 1295 store_u8_8x4(d, dst_stride, d0, d1, d2, d3); 1296 1297 v_s0 = v_s4; 1298 v_s1 = v_s5; 1299 v_s2 = v_s6; 1300 1301 s += 4 * src_stride; 1302 d += 4 * dst_stride; 1303 height -= 4; 1304 } while (height != 0); 1305 src += 8; 1306 dst += 8; 1307 w -= 8; 1308 } while (w != 0); 1309 } 1310 } 1311 1312 void av1_convolve_2d_sr_neon_i8mm(const uint8_t *src, int src_stride, 1313 uint8_t *dst, int dst_stride, int w, int h, 1314 const InterpFilterParams *filter_params_x, 1315 const InterpFilterParams *filter_params_y, 1316 const int subpel_x_qn, const int subpel_y_qn, 1317 ConvolveParams *conv_params) { 1318 if (w == 2 || h == 2) { 1319 av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h, 1320 filter_params_x, filter_params_y, subpel_x_qn, 1321 subpel_y_qn, conv_params); 1322 return; 1323 } 1324 1325 const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); 1326 const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); 1327 const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps; 1328 const int im_h = h + clamped_y_taps - 1; 1329 const int im_stride = MAX_SB_SIZE; 1330 const int vert_offset = clamped_y_taps / 2 - 1; 1331 const int horiz_offset = filter_params_x->taps / 2 - 1; 1332 const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset; 1333 1334 const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( 1335 filter_params_x, subpel_x_qn & SUBPEL_MASK); 1336 const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( 1337 filter_params_y, subpel_y_qn & SUBPEL_MASK); 1338 1339 if (filter_params_x->taps > 8) { 1340 DECLARE_ALIGNED(16, int16_t, 1341 im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); 1342 1343 const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); 1344 const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8); 1345 1346 convolve_2d_sr_horiz_12tap_neon_i8mm(src_ptr, src_stride, im_block, 1347 im_stride, w, im_h, x_filter_ptr); 1348 1349 convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride, w, h, 1350 y_filter_0_7, y_filter_8_11); 1351 } else { 1352 DECLARE_ALIGNED(16, int16_t, 1353 im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); 1354 1355 if (x_filter_taps == 6 && y_filter_taps == 6) { 1356 convolve_2d_sr_6tap_neon_i8mm(src_ptr + 1, src_stride, dst, dst_stride, w, 1357 h, x_filter_ptr, y_filter_ptr); 1358 return; 1359 } 1360 1361 // Used for both 6, 4 and 4, 4 horiz, vert filter tap combinations. 1362 if (x_filter_taps <= 6 && y_filter_taps <= 4) { 1363 convolve_2d_sr_6tap_4tap_neon_i8mm(src_ptr + 1, src_stride, dst, 1364 dst_stride, w, h, x_filter_ptr, 1365 y_filter_ptr); 1366 return; 1367 } 1368 1369 if (x_filter_taps <= 4) { 1370 convolve_2d_sr_horiz_4tap_neon_i8mm(src_ptr + 2, src_stride, im_block, 1371 im_stride, w, im_h, x_filter_ptr); 1372 } else { 1373 convolve_2d_sr_horiz_8tap_neon_i8mm(src_ptr, src_stride, im_block, 1374 im_stride, w, im_h, x_filter_ptr); 1375 } 1376 1377 const int16x8_t y_filter = vld1q_s16(y_filter_ptr); 1378 1379 if (clamped_y_taps <= 4) { 1380 convolve_2d_sr_vert_4tap_neon(im_block, im_stride, dst, dst_stride, w, h, 1381 y_filter_ptr); 1382 } else if (clamped_y_taps == 6) { 1383 convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, w, h, 1384 y_filter); 1385 } else { 1386 convolve_2d_sr_vert_8tap_neon(im_block, im_stride, dst, dst_stride, w, h, 1387 y_filter); 1388 } 1389 } 1390 }