convolve_neon_dotprod.c (68967B)
1 /* 2 * Copyright (c) 2023, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <arm_neon.h> 13 14 #include "config/aom_config.h" 15 #include "config/av1_rtcd.h" 16 17 #include "aom_dsp/aom_dsp_common.h" 18 #include "aom_dsp/arm/mem_neon.h" 19 #include "aom_dsp/arm/transpose_neon.h" 20 #include "aom_ports/mem.h" 21 #include "av1/common/arm/convolve_neon.h" 22 #include "av1/common/convolve.h" 23 #include "av1/common/filter.h" 24 25 DECLARE_ALIGNED(16, static const uint8_t, kDotProdPermuteTbl[48]) = { 26 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 27 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 28 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 29 }; 30 31 DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = { 32 // Shift left and insert new last column in transposed 4x4 block. 33 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, 34 // Shift left and insert two new columns in transposed 4x4 block. 35 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, 36 // Shift left and insert three new columns in transposed 4x4 block. 37 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 38 }; 39 40 static inline int16x4_t convolve12_4_x(uint8x16_t samples, 41 const int8x16_t filter, 42 const uint8x16x3_t permute_tbl) { 43 // Transform sample range to [-128, 127] for 8-bit signed dot product. 44 int8x16_t samples_128 = 45 vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); 46 47 // Permute samples ready for dot product. 
48 // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } 49 // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } 50 // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } 51 int8x16_t perm_samples[3] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), 52 vqtbl1q_s8(samples_128, permute_tbl.val[1]), 53 vqtbl1q_s8(samples_128, permute_tbl.val[2]) }; 54 55 // Dot product constants: 56 // Accumulate into 128 << FILTER_BITS to account for range transform. 57 // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding 58 // right shift by FILTER_BITS - instead of a first rounding right shift by 59 // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - 60 // ROUND0_BITS. 61 int32x4_t acc = 62 vdupq_n_s32((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1)))); 63 64 int32x4_t sum = vdotq_laneq_s32(acc, perm_samples[0], filter, 0); 65 sum = vdotq_laneq_s32(sum, perm_samples[1], filter, 1); 66 sum = vdotq_laneq_s32(sum, perm_samples[2], filter, 2); 67 68 return vshrn_n_s32(sum, 1); 69 } 70 71 static inline uint8x8_t convolve12_8_x(uint8x16_t samples[2], 72 const int8x16_t filter, 73 const uint8x16x3_t permute_tbl) { 74 // Transform sample range to [-128, 127] for 8-bit signed dot product. 75 int8x16_t samples_128[2] = { 76 vreinterpretq_s8_u8(vsubq_u8(samples[0], vdupq_n_u8(128))), 77 vreinterpretq_s8_u8(vsubq_u8(samples[1], vdupq_n_u8(128))) 78 }; 79 80 // Permute samples ready for dot product. 
81 // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } 82 // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } 83 // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } 84 // {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 } 85 int8x16_t perm_samples[4] = { vqtbl1q_s8(samples_128[0], permute_tbl.val[0]), 86 vqtbl1q_s8(samples_128[0], permute_tbl.val[1]), 87 vqtbl1q_s8(samples_128[0], permute_tbl.val[2]), 88 vqtbl1q_s8(samples_128[1], 89 permute_tbl.val[2]) }; 90 91 // Dot product constants: 92 // Accumulate into 128 << FILTER_BITS to account for range transform. 93 // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding 94 // right shift by FILTER_BITS - instead of a first rounding right shift by 95 // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - 96 // ROUND0_BITS. 97 int32x4_t acc = 98 vdupq_n_s32((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1)))); 99 100 int32x4_t sum0123 = vdotq_laneq_s32(acc, perm_samples[0], filter, 0); 101 sum0123 = vdotq_laneq_s32(sum0123, perm_samples[1], filter, 1); 102 sum0123 = vdotq_laneq_s32(sum0123, perm_samples[2], filter, 2); 103 104 int32x4_t sum4567 = vdotq_laneq_s32(acc, perm_samples[1], filter, 0); 105 sum4567 = vdotq_laneq_s32(sum4567, perm_samples[2], filter, 1); 106 sum4567 = vdotq_laneq_s32(sum4567, perm_samples[3], filter, 2); 107 108 // Narrow and re-pack. 109 int16x8_t sum_s16 = 110 vcombine_s16(vshrn_n_s32(sum0123, 1), vshrn_n_s32(sum4567, 1)); 111 return vqrshrun_n_s16(sum_s16, FILTER_BITS - 1); 112 } 113 114 static inline void convolve_x_sr_12tap_neon_dotprod( 115 const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, 116 int h, const int16_t *x_filter_ptr) { 117 // The no-op filter should never be used here. 
118 assert(x_filter_ptr[5] != 128); 119 120 const int16x8_t filter_0_7 = vld1q_s16(x_filter_ptr); 121 const int16x4_t filter_8_11 = vld1_s16(x_filter_ptr + 8); 122 const int16x8_t filter_8_15 = vcombine_s16(filter_8_11, vdup_n_s16(0)); 123 const int8x16_t filter = 124 vcombine_s8(vmovn_s16(filter_0_7), vmovn_s16(filter_8_15)); 125 126 const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); 127 128 if (w <= 4) { 129 do { 130 uint8x16_t s0, s1, s2, s3; 131 load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); 132 133 int16x4_t d0 = convolve12_4_x(s0, filter, permute_tbl); 134 int16x4_t d1 = convolve12_4_x(s1, filter, permute_tbl); 135 int16x4_t d2 = convolve12_4_x(s2, filter, permute_tbl); 136 int16x4_t d3 = convolve12_4_x(s3, filter, permute_tbl); 137 138 uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); 139 uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); 140 141 store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); 142 store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); 143 144 dst += 4 * dst_stride; 145 src += 4 * src_stride; 146 h -= 4; 147 } while (h != 0); 148 } else { 149 do { 150 const uint8_t *s = src; 151 uint8_t *d = dst; 152 int width = w; 153 154 do { 155 uint8x16_t s0[2], s1[2], s2[2], s3[2]; 156 load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); 157 load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); 158 159 uint8x8_t d0 = convolve12_8_x(s0, filter, permute_tbl); 160 uint8x8_t d1 = convolve12_8_x(s1, filter, permute_tbl); 161 uint8x8_t d2 = convolve12_8_x(s2, filter, permute_tbl); 162 uint8x8_t d3 = convolve12_8_x(s3, filter, permute_tbl); 163 164 store_u8_8x4(d + 0 * dst_stride, dst_stride, d0, d1, d2, d3); 165 166 s += 8; 167 d += 8; 168 width -= 8; 169 } while (width != 0); 170 src += 4 * src_stride; 171 dst += 4 * dst_stride; 172 h -= 4; 173 } while (h != 0); 174 } 175 } 176 177 static inline int16x4_t convolve4_4_x(const uint8x16_t samples, 178 const 
int8x8_t filters, 179 const uint8x16_t permute_tbl) { 180 // Transform sample range to [-128, 127] for 8-bit signed dot product. 181 int8x16_t samples_128 = 182 vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); 183 184 // Permute samples ready for dot product. 185 // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } 186 int8x16_t perm_samples = vqtbl1q_s8(samples_128, permute_tbl); 187 188 // Dot product constants: 189 // Accumulate into 128 << FILTER_BITS to account for range transform. 190 // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding 191 // right shift by FILTER_BITS - instead of a first rounding right shift by 192 // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - 193 // ROUND0_BITS. Halve the total because we halved the filter values. 194 int32x4_t acc = 195 vdupq_n_s32(((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1)))) / 2); 196 int32x4_t sum = vdotq_lane_s32(acc, perm_samples, filters, 0); 197 198 // Further narrowing and packing is performed by the caller. 199 return vmovn_s32(sum); 200 } 201 202 static inline uint8x8_t convolve4_8_x(const uint8x16_t samples, 203 const int8x8_t filters, 204 const uint8x16x2_t permute_tbl) { 205 // Transform sample range to [-128, 127] for 8-bit signed dot product. 206 int8x16_t samples_128 = 207 vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); 208 209 // Permute samples ready for dot product. 210 // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } 211 // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } 212 int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), 213 vqtbl1q_s8(samples_128, permute_tbl.val[1]) }; 214 215 // Dot product constants: 216 // Accumulate into 128 << FILTER_BITS to account for range transform. 
217 // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding 218 // right shift by FILTER_BITS - instead of a first rounding right shift by 219 // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - 220 // ROUND0_BITS. Halve the total because we halved the filter values. 221 int32x4_t acc = 222 vdupq_n_s32(((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1)))) / 2); 223 224 int32x4_t sum0123 = vdotq_lane_s32(acc, perm_samples[0], filters, 0); 225 int32x4_t sum4567 = vdotq_lane_s32(acc, perm_samples[1], filters, 0); 226 227 // Narrow and re-pack. 228 int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567)); 229 // We halved the filter values so -1 from right shift. 230 return vqrshrun_n_s16(sum, FILTER_BITS - 1); 231 } 232 233 static inline void convolve_x_sr_4tap_neon_dotprod( 234 const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, 235 ptrdiff_t dst_stride, int width, int height, const int16_t *filter_x) { 236 const int16x4_t x_filter = vld1_s16(filter_x + 2); 237 // All 4-tap and bilinear filter values are even, so halve them to reduce 238 // intermediate precision requirements. 239 const int8x8_t filter = vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1); 240 241 if (width == 4) { 242 const uint8x16_t permute_tbl = vld1q_u8(kDotProdPermuteTbl); 243 244 do { 245 uint8x16_t s0, s1, s2, s3; 246 load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); 247 248 int16x4_t t0 = convolve4_4_x(s0, filter, permute_tbl); 249 int16x4_t t1 = convolve4_4_x(s1, filter, permute_tbl); 250 int16x4_t t2 = convolve4_4_x(s2, filter, permute_tbl); 251 int16x4_t t3 = convolve4_4_x(s3, filter, permute_tbl); 252 // We halved the filter values so -1 from right shift. 
253 uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); 254 uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); 255 256 store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); 257 store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); 258 259 src += 4 * src_stride; 260 dst += 4 * dst_stride; 261 height -= 4; 262 } while (height != 0); 263 } else { 264 const uint8x16x2_t permute_tbl = vld1q_u8_x2(kDotProdPermuteTbl); 265 266 do { 267 const uint8_t *s = src; 268 uint8_t *d = dst; 269 int w = width; 270 271 do { 272 uint8x16_t s0, s1, s2, s3; 273 load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); 274 275 uint8x8_t d0 = convolve4_8_x(s0, filter, permute_tbl); 276 uint8x8_t d1 = convolve4_8_x(s1, filter, permute_tbl); 277 uint8x8_t d2 = convolve4_8_x(s2, filter, permute_tbl); 278 uint8x8_t d3 = convolve4_8_x(s3, filter, permute_tbl); 279 280 store_u8_8x4(d, dst_stride, d0, d1, d2, d3); 281 282 s += 8; 283 d += 8; 284 w -= 8; 285 } while (w != 0); 286 src += 4 * src_stride; 287 dst += 4 * dst_stride; 288 height -= 4; 289 } while (height != 0); 290 } 291 } 292 293 static inline uint8x8_t convolve8_8_x(uint8x16_t samples, const int8x8_t filter, 294 const uint8x16x3_t permute_tbl) { 295 // Transform sample range to [-128, 127] for 8-bit signed dot product. 296 int8x16_t samples_128 = 297 vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); 298 299 // Permute samples ready for dot product. */ 300 // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } 301 // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } 302 // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } 303 int8x16_t perm_samples[3] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), 304 vqtbl1q_s8(samples_128, permute_tbl.val[1]), 305 vqtbl1q_s8(samples_128, permute_tbl.val[2]) }; 306 307 // Dot product constants: 308 // Accumulate into 128 << FILTER_BITS to account for range transform. 
309 // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding 310 // right shift by FILTER_BITS - instead of a first rounding right shift by 311 // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - 312 // ROUND0_BITS. Halve the total because we halved the filter values. 313 int32x4_t acc = 314 vdupq_n_s32(((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1)))) / 2); 315 316 int32x4_t sum0123 = vdotq_lane_s32(acc, perm_samples[0], filter, 0); 317 sum0123 = vdotq_lane_s32(sum0123, perm_samples[1], filter, 1); 318 319 int32x4_t sum4567 = vdotq_lane_s32(acc, perm_samples[1], filter, 0); 320 sum4567 = vdotq_lane_s32(sum4567, perm_samples[2], filter, 1); 321 322 // Narrow and re-pack. 323 int16x8_t sum_s16 = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567)); 324 // We halved the convolution filter values so - 1 from the right shift. 325 return vqrshrun_n_s16(sum_s16, FILTER_BITS - 1); 326 } 327 328 void av1_convolve_x_sr_neon_dotprod(const uint8_t *src, int src_stride, 329 uint8_t *dst, int dst_stride, int w, int h, 330 const InterpFilterParams *filter_params_x, 331 const int subpel_x_qn, 332 ConvolveParams *conv_params) { 333 if (w == 2 || h == 2) { 334 av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x, 335 subpel_x_qn, conv_params); 336 return; 337 } 338 339 const uint8_t horiz_offset = filter_params_x->taps / 2 - 1; 340 src -= horiz_offset; 341 342 const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( 343 filter_params_x, subpel_x_qn & SUBPEL_MASK); 344 345 int filter_taps = get_filter_tap(filter_params_x, subpel_x_qn & SUBPEL_MASK); 346 347 if (filter_taps > 8) { 348 convolve_x_sr_12tap_neon_dotprod(src, src_stride, dst, dst_stride, w, h, 349 x_filter_ptr); 350 return; 351 } 352 353 if (filter_taps <= 4) { 354 convolve_x_sr_4tap_neon_dotprod(src + 2, src_stride, dst, dst_stride, w, h, 355 x_filter_ptr); 356 return; 357 } 358 359 const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr); 360 
361 const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); 362 // Filter values are even, so halve to reduce intermediate precision reqs. 363 const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1); 364 365 do { 366 int width = w; 367 const uint8_t *s = src; 368 uint8_t *d = dst; 369 370 do { 371 uint8x16_t s0, s1, s2, s3; 372 load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); 373 374 uint8x8_t d0 = convolve8_8_x(s0, x_filter, permute_tbl); 375 uint8x8_t d1 = convolve8_8_x(s1, x_filter, permute_tbl); 376 uint8x8_t d2 = convolve8_8_x(s2, x_filter, permute_tbl); 377 uint8x8_t d3 = convolve8_8_x(s3, x_filter, permute_tbl); 378 379 store_u8_8x4(d, dst_stride, d0, d1, d2, d3); 380 381 s += 8; 382 d += 8; 383 width -= 8; 384 } while (width != 0); 385 src += 4 * src_stride; 386 dst += 4 * dst_stride; 387 h -= 4; 388 } while (h != 0); 389 } 390 391 static inline int16x4_t convolve12_4_y(const int8x16_t s0, const int8x16_t s1, 392 const int8x16_t s2, 393 const int8x8_t filters_0_7, 394 const int8x8_t filters_4_11) { 395 // The sample range transform and permutation are performed by the caller. 396 // Accumulate into 128 << FILTER_BITS to account for range transform. 397 const int32x4_t acc = vdupq_n_s32(128 << FILTER_BITS); 398 int32x4_t sum = vdotq_lane_s32(acc, s0, filters_0_7, 0); 399 sum = vdotq_lane_s32(sum, s1, filters_0_7, 1); 400 sum = vdotq_lane_s32(sum, s2, filters_4_11, 1); 401 402 // Further narrowing and packing is performed by the caller. 403 return vshrn_n_s32(sum, 1); 404 } 405 406 static inline uint8x8_t convolve12_8_y( 407 const int8x16_t s0_lo, const int8x16_t s0_hi, const int8x16_t s1_lo, 408 const int8x16_t s1_hi, const int8x16_t s2_lo, const int8x16_t s2_hi, 409 const int8x8_t filters_0_7, const int8x8_t filters_4_11) { 410 // The sample range transform and permutation are performed by the caller. 411 // Accumulate into 128 << FILTER_BITS to account for range transform. 
412 const int32x4_t acc = vdupq_n_s32(128 << FILTER_BITS); 413 414 int32x4_t sum0123 = vdotq_lane_s32(acc, s0_lo, filters_0_7, 0); 415 sum0123 = vdotq_lane_s32(sum0123, s1_lo, filters_0_7, 1); 416 sum0123 = vdotq_lane_s32(sum0123, s2_lo, filters_4_11, 1); 417 418 int32x4_t sum4567 = vdotq_lane_s32(acc, s0_hi, filters_0_7, 0); 419 sum4567 = vdotq_lane_s32(sum4567, s1_hi, filters_0_7, 1); 420 sum4567 = vdotq_lane_s32(sum4567, s2_hi, filters_4_11, 1); 421 422 // Narrow and re-pack. 423 int16x8_t sum = 424 vcombine_s16(vshrn_n_s32(sum0123, 1), vshrn_n_s32(sum4567, 1)); 425 return vqrshrun_n_s16(sum, FILTER_BITS - 1); 426 } 427 428 static inline void convolve_y_sr_12tap_neon_dotprod( 429 const uint8_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, 430 int w, int h, const int16_t *y_filter_ptr) { 431 // The no-op filter should never be used here. 432 assert(y_filter_ptr[5] != 128); 433 434 const int8x8_t filter_0_7 = vmovn_s16(vld1q_s16(y_filter_ptr)); 435 const int8x8_t filter_4_11 = vmovn_s16(vld1q_s16(y_filter_ptr + 4)); 436 437 const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); 438 439 if (w == 4) { 440 uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, tA; 441 load_u8_8x11(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, 442 &t8, &t9, &tA); 443 src_ptr += 11 * src_stride; 444 445 // Transform sample range to [-128, 127] for 8-bit signed dot product. 
446 int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); 447 int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); 448 int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); 449 int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); 450 int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); 451 int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); 452 int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); 453 int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); 454 int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); 455 int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); 456 int8x8_t sA = vreinterpret_s8_u8(vsub_u8(tA, vdup_n_u8(128))); 457 458 int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s789A; 459 transpose_concat_elems_s8_4x4(s0, s1, s2, s3, &s0123); 460 transpose_concat_elems_s8_4x4(s1, s2, s3, s4, &s1234); 461 transpose_concat_elems_s8_4x4(s2, s3, s4, s5, &s2345); 462 transpose_concat_elems_s8_4x4(s3, s4, s5, s6, &s3456); 463 transpose_concat_elems_s8_4x4(s4, s5, s6, s7, &s4567); 464 transpose_concat_elems_s8_4x4(s5, s6, s7, s8, &s5678); 465 transpose_concat_elems_s8_4x4(s6, s7, s8, s9, &s6789); 466 transpose_concat_elems_s8_4x4(s7, s8, s9, sA, &s789A); 467 468 do { 469 uint8x8_t tB, tC, tD, tE; 470 load_u8_8x4(src_ptr, src_stride, &tB, &tC, &tD, &tE); 471 472 int8x8_t sB = vreinterpret_s8_u8(vsub_u8(tB, vdup_n_u8(128))); 473 int8x8_t sC = vreinterpret_s8_u8(vsub_u8(tC, vdup_n_u8(128))); 474 int8x8_t sD = vreinterpret_s8_u8(vsub_u8(tD, vdup_n_u8(128))); 475 int8x8_t sE = vreinterpret_s8_u8(vsub_u8(tE, vdup_n_u8(128))); 476 477 int8x16_t s89AB, s9ABC, sABCD, sBCDE; 478 transpose_concat_elems_s8_4x4(sB, sC, sD, sE, &sBCDE); 479 480 // Merge new data into block from previous iteration. 
481 int8x16x2_t samples_LUT = { { s789A, sBCDE } }; 482 s89AB = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); 483 s9ABC = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); 484 sABCD = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); 485 486 int16x4_t d0 = 487 convolve12_4_y(s0123, s4567, s89AB, filter_0_7, filter_4_11); 488 int16x4_t d1 = 489 convolve12_4_y(s1234, s5678, s9ABC, filter_0_7, filter_4_11); 490 int16x4_t d2 = 491 convolve12_4_y(s2345, s6789, sABCD, filter_0_7, filter_4_11); 492 int16x4_t d3 = 493 convolve12_4_y(s3456, s789A, sBCDE, filter_0_7, filter_4_11); 494 uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); 495 uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); 496 497 store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); 498 store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); 499 500 // Prepare block for next iteration - re-using as much as possible. 501 // Shuffle everything up four rows. 502 s0123 = s4567; 503 s1234 = s5678; 504 s2345 = s6789; 505 s3456 = s789A; 506 s4567 = s89AB; 507 s5678 = s9ABC; 508 s6789 = sABCD; 509 s789A = sBCDE; 510 511 src_ptr += 4 * src_stride; 512 dst_ptr += 4 * dst_stride; 513 h -= 4; 514 } while (h != 0); 515 } else { 516 do { 517 int height = h; 518 const uint8_t *s = src_ptr; 519 uint8_t *d = dst_ptr; 520 521 uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, tA; 522 load_u8_8x11(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8, 523 &t9, &tA); 524 s += 11 * src_stride; 525 526 // Transform sample range to [-128, 127] for 8-bit signed dot product. 
527 int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); 528 int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); 529 int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); 530 int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); 531 int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); 532 int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); 533 int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); 534 int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); 535 int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); 536 int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); 537 int8x8_t sA = vreinterpret_s8_u8(vsub_u8(tA, vdup_n_u8(128))); 538 539 // This operation combines a conventional transpose and the sample 540 // permute (see horizontal case) required before computing the dot 541 // product. 542 int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, 543 s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, 544 s6789_hi, s789A_lo, s789A_hi; 545 transpose_concat_elems_s8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); 546 transpose_concat_elems_s8_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); 547 transpose_concat_elems_s8_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); 548 transpose_concat_elems_s8_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); 549 transpose_concat_elems_s8_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi); 550 transpose_concat_elems_s8_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi); 551 transpose_concat_elems_s8_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi); 552 transpose_concat_elems_s8_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi); 553 554 do { 555 uint8x8_t tB, tC, tD, tE; 556 load_u8_8x4(s, src_stride, &tB, &tC, &tD, &tE); 557 558 int8x8_t sB = vreinterpret_s8_u8(vsub_u8(tB, vdup_n_u8(128))); 559 int8x8_t sC = vreinterpret_s8_u8(vsub_u8(tC, vdup_n_u8(128))); 560 int8x8_t sD = vreinterpret_s8_u8(vsub_u8(tD, vdup_n_u8(128))); 561 int8x8_t sE = 
vreinterpret_s8_u8(vsub_u8(tE, vdup_n_u8(128))); 562 563 int8x16_t s89AB_lo, s89AB_hi, s9ABC_lo, s9ABC_hi, sABCD_lo, sABCD_hi, 564 sBCDE_lo, sBCDE_hi; 565 transpose_concat_elems_s8_8x4(sB, sC, sD, sE, &sBCDE_lo, &sBCDE_hi); 566 567 // Merge new data into block from previous iteration. 568 int8x16x2_t samples_LUT_lo = { { s789A_lo, sBCDE_lo } }; 569 s89AB_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[0]); 570 s9ABC_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[1]); 571 sABCD_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[2]); 572 573 int8x16x2_t samples_LUT_hi = { { s789A_hi, sBCDE_hi } }; 574 s89AB_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[0]); 575 s9ABC_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[1]); 576 sABCD_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[2]); 577 578 uint8x8_t d0 = 579 convolve12_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, s89AB_lo, 580 s89AB_hi, filter_0_7, filter_4_11); 581 uint8x8_t d1 = 582 convolve12_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, s9ABC_lo, 583 s9ABC_hi, filter_0_7, filter_4_11); 584 uint8x8_t d2 = 585 convolve12_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, sABCD_lo, 586 sABCD_hi, filter_0_7, filter_4_11); 587 uint8x8_t d3 = 588 convolve12_8_y(s3456_lo, s3456_hi, s789A_lo, s789A_hi, sBCDE_lo, 589 sBCDE_hi, filter_0_7, filter_4_11); 590 591 store_u8_8x4(d, dst_stride, d0, d1, d2, d3); 592 593 // Prepare block for next iteration - re-using as much as possible. 594 // Shuffle everything up four rows. 
595 s0123_lo = s4567_lo; 596 s0123_hi = s4567_hi; 597 s1234_lo = s5678_lo; 598 s1234_hi = s5678_hi; 599 s2345_lo = s6789_lo; 600 s2345_hi = s6789_hi; 601 s3456_lo = s789A_lo; 602 s3456_hi = s789A_hi; 603 s4567_lo = s89AB_lo; 604 s4567_hi = s89AB_hi; 605 s5678_lo = s9ABC_lo; 606 s5678_hi = s9ABC_hi; 607 s6789_lo = sABCD_lo; 608 s6789_hi = sABCD_hi; 609 s789A_lo = sBCDE_lo; 610 s789A_hi = sBCDE_hi; 611 612 s += 4 * src_stride; 613 d += 4 * dst_stride; 614 height -= 4; 615 } while (height != 0); 616 src_ptr += 8; 617 dst_ptr += 8; 618 w -= 8; 619 } while (w != 0); 620 } 621 } 622 623 static inline int16x4_t convolve8_4_y(const int8x16_t s0, const int8x16_t s1, 624 const int8x8_t filters) { 625 // The sample range transform and permutation are performed by the caller. 626 // Accumulate into 128 << FILTER_BITS to account for range transform. 627 // (- 1 since we halved the filters.) 628 const int32x4_t acc = vdupq_n_s32(128 << (FILTER_BITS - 1)); 629 int32x4_t sum = vdotq_lane_s32(acc, s0, filters, 0); 630 sum = vdotq_lane_s32(sum, s1, filters, 1); 631 632 // Further narrowing and packing is performed by the caller. 633 return vmovn_s32(sum); 634 } 635 636 static inline uint8x8_t convolve8_8_y(const int8x16_t s0_lo, 637 const int8x16_t s0_hi, 638 const int8x16_t s1_lo, 639 const int8x16_t s1_hi, 640 const int8x8_t filters) { 641 // The sample range transform and permutation are performed by the caller. 642 // Accumulate into 128 << FILTER_BITS to account for range transform. 643 // (- 1 since we halved the filters.) 644 const int32x4_t acc = vdupq_n_s32(128 << (FILTER_BITS - 1)); 645 646 int32x4_t sum0123 = vdotq_lane_s32(acc, s0_lo, filters, 0); 647 sum0123 = vdotq_lane_s32(sum0123, s1_lo, filters, 1); 648 649 int32x4_t sum4567 = vdotq_lane_s32(acc, s0_hi, filters, 0); 650 sum4567 = vdotq_lane_s32(sum4567, s1_hi, filters, 1); 651 652 // Narrow and re-pack. 
653 int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567)); 654 // We halved the filter values so -1 from right shift. 655 return vqrshrun_n_s16(sum, FILTER_BITS - 1); 656 } 657 658 static inline void convolve_y_sr_8tap_neon_dotprod( 659 const uint8_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, 660 int w, int h, const int16_t *y_filter_ptr) { 661 // Filter values are even, so halve to reduce intermediate precision reqs. 662 const int8x8_t filter = vshrn_n_s16(vld1q_s16(y_filter_ptr), 1); 663 664 const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); 665 666 if (w == 4) { 667 uint8x8_t t0, t1, t2, t3, t4, t5, t6; 668 load_u8_8x7(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); 669 src_ptr += 7 * src_stride; 670 671 // Transform sample range to [-128, 127] for 8-bit signed dot product. 672 int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); 673 int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); 674 int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); 675 int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); 676 int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); 677 int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); 678 int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); 679 680 int8x16_t s0123, s1234, s2345, s3456; 681 transpose_concat_elems_s8_4x4(s0, s1, s2, s3, &s0123); 682 transpose_concat_elems_s8_4x4(s1, s2, s3, s4, &s1234); 683 transpose_concat_elems_s8_4x4(s2, s3, s4, s5, &s2345); 684 transpose_concat_elems_s8_4x4(s3, s4, s5, s6, &s3456); 685 686 do { 687 uint8x8_t t7, t8, t9, tA; 688 load_u8_8x4(src_ptr, src_stride, &t7, &t8, &t9, &tA); 689 690 int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); 691 int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); 692 int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); 693 int8x8_t sA = vreinterpret_s8_u8(vsub_u8(tA, vdup_n_u8(128))); 694 695 int8x16_t s4567, 
          s5678, s6789, s789A;
      transpose_concat_elems_s8_4x4(s7, s8, s9, sA, &s789A);

      // Merge new data into block from previous iteration.
      // The merge table shifts the previously transposed block left by one,
      // two or three columns and inserts columns taken from the four new
      // rows, avoiding a full re-transpose (see kDotProdMergeBlockTbl).
      int8x16x2_t samples_LUT = { { s3456, s789A } };
      s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
      s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
      s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

      int16x4_t d0 = convolve8_4_y(s0123, s4567, filter);
      int16x4_t d1 = convolve8_4_y(s1234, s5678, filter);
      int16x4_t d2 = convolve8_4_y(s2345, s6789, filter);
      int16x4_t d3 = convolve8_4_y(s3456, s789A, filter);
      // We halved the filter values so -1 from right shift.
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);

      store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);

      // Prepare block for next iteration - re-using as much as possible.
      // Shuffle everything up four rows.
      s0123 = s4567;
      s1234 = s5678;
      s2345 = s6789;
      s3456 = s789A;

      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    // Wider blocks: process in 8-pixel-wide columns, 4 rows at a time, with
    // separate lo/hi halves of the transposed sample blocks.
    do {
      int height = h;
      const uint8_t *s = src_ptr;
      uint8_t *d = dst_ptr;

      uint8x8_t t0, t1, t2, t3, t4, t5, t6;
      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
      s += 7 * src_stride;

      // Transform sample range to [-128, 127] for 8-bit signed dot product.
      int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
      int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
      int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
      int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
      int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
      int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
      int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));

      // This operation combines a conventional transpose and the sample
      // permute (see horizontal case) required before computing the dot
      // product.
      int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
          s3456_lo, s3456_hi;
      transpose_concat_elems_s8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
      transpose_concat_elems_s8_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
      transpose_concat_elems_s8_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
      transpose_concat_elems_s8_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);

      do {
        uint8x8_t t7, t8, t9, tA;
        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &tA);

        int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
        int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
        int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
        int8x8_t sA = vreinterpret_s8_u8(vsub_u8(tA, vdup_n_u8(128)));

        int8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi,
            s789A_lo, s789A_hi;
        transpose_concat_elems_s8_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi);

        // Merge new data into block from previous iteration.
        int8x16x2_t samples_LUT_lo = { { s3456_lo, s789A_lo } };
        s4567_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[0]);
        s5678_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[1]);
        s6789_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[2]);

        int8x16x2_t samples_LUT_hi = { { s3456_hi, s789A_hi } };
        s4567_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[0]);
        s5678_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[1]);
        s6789_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[2]);

        uint8x8_t d0 =
            convolve8_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, filter);
        uint8x8_t d1 =
            convolve8_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, filter);
        uint8x8_t d2 =
            convolve8_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, filter);
        uint8x8_t d3 =
            convolve8_8_y(s3456_lo, s3456_hi, s789A_lo, s789A_hi, filter);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        // Prepare block for next iteration - re-using as much as possible.
        // Shuffle everything up four rows.
        s0123_lo = s4567_lo;
        s0123_hi = s4567_hi;
        s1234_lo = s5678_lo;
        s1234_hi = s5678_hi;
        s2345_lo = s6789_lo;
        s2345_hi = s6789_hi;
        s3456_lo = s789A_lo;
        s3456_hi = s789A_hi;

        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}

// Compute 4 output pixels of a 4-tap vertical convolution with a single
// SDOT instruction on pre-transposed, range-shifted samples.
static inline int16x4_t convolve4_4_y(const int8x16_t s0,
                                      const int8x8_t filters) {
  // The sample range transform and permutation are performed by the caller.
  // Accumulate into 128 << FILTER_BITS to account for range transform.
  // (- 1 since we halved the filters.)
  const int32x4_t acc = vdupq_n_s32(128 << (FILTER_BITS - 1));

  int32x4_t sum = vdotq_lane_s32(acc, s0, filters, 0);

  // Further narrowing and packing is performed by the caller.
  return vmovn_s32(sum);
}

// Compute 8 output pixels (two SDOT accumulations) of a 4-tap vertical
// convolution and narrow to 8-bit with saturation.
static inline uint8x8_t convolve4_8_y(const int8x16_t s0, const int8x16_t s1,
                                      const int8x8_t filters) {
  // The sample range transform and permutation are performed by the caller.
  // Accumulate into 128 << FILTER_BITS to account for range transform.
  // (- 1 since we halved the filters.)
  const int32x4_t acc = vdupq_n_s32(128 << (FILTER_BITS - 1));

  int32x4_t sum0123 = vdotq_lane_s32(acc, s0, filters, 0);
  int32x4_t sum4567 = vdotq_lane_s32(acc, s1, filters, 0);

  // Narrow and re-pack.
  int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567));
  // We halved the filter values so -1 from right shift.
  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}

// 4-tap vertical single-round convolution using 8-bit signed dot-product
// instructions. Processes 4 rows per iteration, transposing the samples once
// and then merging in each new group of 4 rows via table lookups.
static inline void convolve_y_sr_4tap_neon_dotprod(
    const uint8_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr) {
  // Filter values are even, so halve to reduce intermediate precision reqs.
  // The four taps are loaded from offset 2 and the upper half zeroed.
  const int16x8_t filter_s16 =
      vcombine_s16(vld1_s16(y_filter_ptr + 2), vdup_n_s16(0));
  const int8x8_t filter = vshrn_n_s16(filter_s16, 1);
  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl);
  // Scratch pair reused as the source of the vqtbl2q_s8 merges below.
  int8x16x2_t samples_LUT;

  if (w == 4) {
    uint8x8_t t0, t1, t2, t3;
    load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3);
    src_ptr += 4 * src_stride;

    // Transform sample range to [-128, 127] for 8-bit signed dot product.
    int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
    int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
    int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
    int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));

    // This operation combines a conventional transpose and the sample permute
    // required before computing the dot product.
    int8x16_t s0123;
    transpose_concat_elems_s8_4x4(s0, s1, s2, s3, &s0123);

    do {
      uint8x8_t t4, t5, t6, t7;
      load_u8_8x4(src_ptr, src_stride, &t4, &t5, &t6, &t7);

      int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
      int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
      int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));
      int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));

      int8x16_t s4567;
      transpose_concat_elems_s8_4x4(s4, s5, s6, s7, &s4567);

      // Merge new data into block from previous iteration.
      samples_LUT.val[0] = s0123;
      samples_LUT.val[1] = s4567;
      int8x16_t s1234 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
      int8x16_t s2345 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
      int8x16_t s3456 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

      int16x4_t d0 = convolve4_4_y(s0123, filter);
      int16x4_t d1 = convolve4_4_y(s1234, filter);
      int16x4_t d2 = convolve4_4_y(s2345, filter);
      int16x4_t d3 = convolve4_4_y(s3456, filter);
      // We halved the filter values so -1 from right shift.
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);

      store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);

      // Prepare block for next iteration - re-using as much as possible.
      // Shuffle everything up four rows.
      s0123 = s4567;

      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    // Wider blocks: process in 8-pixel-wide columns, 4 rows at a time.
    do {
      int height = h;
      const uint8_t *s = src_ptr;
      uint8_t *d = dst_ptr;

      uint8x8_t t0, t1, t2, t3;
      load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
      s += 4 * src_stride;

      // Transform sample range to [-128, 127] for 8-bit signed dot product.
      int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
      int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
      int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
      int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));

      // This operation combines a conventional transpose and the sample permute
      // required before computing the dot product.
      int8x16_t s0123_lo, s0123_hi;
      transpose_concat_elems_s8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);

      do {
        uint8x8_t t4, t5, t6, t7;
        load_u8_8x4(s, src_stride, &t4, &t5, &t6, &t7);

        int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
        int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
        int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));
        int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));

        int8x16_t s4567_lo, s4567_hi;
        transpose_concat_elems_s8_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi);

        // Merge new data into block from previous iteration.
        samples_LUT.val[0] = s0123_lo;
        samples_LUT.val[1] = s4567_lo;
        int8x16_t s1234_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
        int8x16_t s2345_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
        int8x16_t s3456_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

        samples_LUT.val[0] = s0123_hi;
        samples_LUT.val[1] = s4567_hi;
        int8x16_t s1234_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
        int8x16_t s2345_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
        int8x16_t s3456_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

        uint8x8_t d0 = convolve4_8_y(s0123_lo, s0123_hi, filter);
        uint8x8_t d1 = convolve4_8_y(s1234_lo, s1234_hi, filter);
        uint8x8_t d2 = convolve4_8_y(s2345_lo, s2345_hi, filter);
        uint8x8_t d3 = convolve4_8_y(s3456_lo, s3456_hi, filter);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        // Prepare block for next iteration - re-using as much as possible.
        // Shuffle everything up four rows.
        s0123_lo = s4567_lo;
        s0123_hi = s4567_hi;

        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}

// Dispatch for the vertical single-round convolution: falls back to C for
// 2-wide or 2-high blocks, then selects the 4-, 6-/8- or 12-tap SIMD path.
// The source pointer is rewound by (taps / 2 - 1) rows for each path.
void av1_convolve_y_sr_neon_dotprod(const uint8_t *src, int src_stride,
                                    uint8_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_y,
                                    const int subpel_y_qn) {
  if (w == 2 || h == 2) {
    av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_y,
                        subpel_y_qn);
    return;
  }

  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  if (y_filter_taps <= 4) {
    convolve_y_sr_4tap_neon_dotprod(src - src_stride, src_stride, dst,
                                    dst_stride, w, h, y_filter_ptr);
  } else if (y_filter_taps == 12) {
    convolve_y_sr_12tap_neon_dotprod(src - 5 * src_stride, src_stride, dst,
                                     dst_stride, w, h, y_filter_ptr);
  } else {
    // 6-tap or 8-tap.
    convolve_y_sr_8tap_neon_dotprod(src - 3 * src_stride, src_stride, dst,
                                    dst_stride, w, h, y_filter_ptr);
  }
}

// Compute 4 output pixels of a 12-tap horizontal convolution for the 2D
// first pass; returns 16-bit intermediate values.
static inline int16x4_t convolve12_4_2d_h(uint8x16_t samples,
                                          const int8x16_t filters,
                                          const int32x4_t horiz_const,
                                          const uint8x16x3_t permute_tbl) {
  // Transform sample range to [-128, 127] for 8-bit signed dot product.
  int8x16_t samples_128 =
      vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));

  // Permute samples ready for dot product.
  // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
  // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
  // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
  int8x16_t perm_samples[3] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
                                vqtbl1q_s8(samples_128, permute_tbl.val[1]),
                                vqtbl1q_s8(samples_128, permute_tbl.val[2]) };

  // Accumulate dot product into 'correction' to account for range transform.
  int32x4_t sum = vdotq_laneq_s32(horiz_const, perm_samples[0], filters, 0);
  sum = vdotq_laneq_s32(sum, perm_samples[1], filters, 1);
  sum = vdotq_laneq_s32(sum, perm_samples[2], filters, 2);

  // Narrow and re-pack.
  // Note: unlike the shorter filters, the 12-tap values are not halved, so
  // the full ROUND0_BITS shift applies.
  return vshrn_n_s32(sum, ROUND0_BITS);
}

// Compute 8 output pixels of a 12-tap horizontal convolution for the 2D
// first pass. samples[0] holds pixels 0-15, samples[1] pixels 4-19 (see the
// staggered loads in the caller).
static inline int16x8_t convolve12_8_2d_h(uint8x16_t samples[2],
                                          const int8x16_t filters,
                                          const int32x4_t correction,
                                          const uint8x16x3_t permute_tbl) {
  // Transform sample range to [-128, 127] for 8-bit signed dot product.
  int8x16_t samples_128[2] = {
    vreinterpretq_s8_u8(vsubq_u8(samples[0], vdupq_n_u8(128))),
    vreinterpretq_s8_u8(vsubq_u8(samples[1], vdupq_n_u8(128)))
  };

  // Permute samples ready for dot product.
  // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
  // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
  // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
  // {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 }
  int8x16_t perm_samples[4] = { vqtbl1q_s8(samples_128[0], permute_tbl.val[0]),
                                vqtbl1q_s8(samples_128[0], permute_tbl.val[1]),
                                vqtbl1q_s8(samples_128[0], permute_tbl.val[2]),
                                vqtbl1q_s8(samples_128[1],
                                           permute_tbl.val[2]) };

  // Accumulate dot product into 'correction' to account for range transform.
  int32x4_t sum0123 = vdotq_laneq_s32(correction, perm_samples[0], filters, 0);
  sum0123 = vdotq_laneq_s32(sum0123, perm_samples[1], filters, 1);
  sum0123 = vdotq_laneq_s32(sum0123, perm_samples[2], filters, 2);

  int32x4_t sum4567 = vdotq_laneq_s32(correction, perm_samples[1], filters, 0);
  sum4567 = vdotq_laneq_s32(sum4567, perm_samples[2], filters, 1);
  sum4567 = vdotq_laneq_s32(sum4567, perm_samples[3], filters, 2);

  // Narrow and re-pack.
  return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS),
                      vshrn_n_s32(sum4567, ROUND0_BITS));
}

// Horizontal (first) pass of the 12-tap 2D convolution: writes 16-bit
// intermediate values to dst_ptr for the vertical pass to consume.
static inline void convolve_2d_sr_horiz_12tap_neon_dotprod(
    const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
    const int dst_stride, int w, int h, const int16x8_t x_filter_0_7,
    const int16x4_t x_filter_8_11) {
  // The no-op filter should never be used here.
  assert(vgetq_lane_s16(x_filter_0_7, 5) != 128);

  const int bd = 8;

  // Narrow filter values to 8-bit.
  const int16x8x2_t x_filter_s16 = {
    { x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) }
  };
  const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]),
                                         vmovn_s16(x_filter_s16.val[1]));

  // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
  // shifts - which are generally faster than rounding shifts on modern CPUs.
  const int32_t horiz_const =
      ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
  // Dot product constants.
  const int32x4_t correction = vdupq_n_s32((128 << FILTER_BITS) + horiz_const);
  const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl);

  if (w <= 4) {
    do {
      uint8x16_t s0, s1, s2, s3;
      load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);

      int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, correction, permute_tbl);
      int16x4_t d1 = convolve12_4_2d_h(s1, x_filter, correction, permute_tbl);
      int16x4_t d2 = convolve12_4_2d_h(s2, x_filter, correction, permute_tbl);
      int16x4_t d3 = convolve12_4_2d_h(s3, x_filter, correction, permute_tbl);

      store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);

      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      h -= 4;
    } while (h > 4);

    // Process the remaining (up to 4) rows one at a time.
    do {
      uint8x16_t s0 = vld1q_u8(src_ptr);
      int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, correction, permute_tbl);
      vst1_s16(dst_ptr, d0);

      src_ptr += src_stride;
      dst_ptr += dst_stride;
    } while (--h != 0);

  } else {
    do {
      const uint8_t *s = src_ptr;
      int16_t *d = dst_ptr;
      int width = w;

      do {
        uint8x16_t s0[2], s1[2], s2[2], s3[2];
        load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
        load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);

        int16x8_t d0 = convolve12_8_2d_h(s0, x_filter, correction, permute_tbl);
        int16x8_t d1 = convolve12_8_2d_h(s1, x_filter, correction, permute_tbl);
        int16x8_t d2 = convolve12_8_2d_h(s2, x_filter, correction, permute_tbl);
        int16x8_t d3 = convolve12_8_2d_h(s3, x_filter, correction, permute_tbl);

        store_s16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      h -= 4;
    } while (h > 4);

    // Process the remaining (up to 4) rows one at a time.
    do {
      const uint8_t *s = src_ptr;
      int16_t *d = dst_ptr;
      int width = w;

      do {
        uint8x16_t s0[2];
        s0[0] = vld1q_u8(s);
        s0[1] = vld1q_u8(s + 4);
        int16x8_t d0 = convolve12_8_2d_h(s0, x_filter, correction, permute_tbl);
        vst1q_s16(d, d0);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += src_stride;
      dst_ptr += dst_stride;
    } while (--h != 0);
  }
}

// Compute 4 output pixels of a 4-tap horizontal convolution for the 2D
// first pass; returns 16-bit intermediate values.
static inline int16x4_t convolve4_4_2d_h(const uint8x16_t samples,
                                         const int8x8_t filters,
                                         const uint8x16_t permute_tbl,
                                         const int32x4_t correction) {
  // Transform sample range to [-128, 127] for 8-bit signed dot product.
  int8x16_t samples_128 =
      vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));

  // Permute samples ready for dot product.
  // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
  int8x16_t perm_samples = vqtbl1q_s8(samples_128, permute_tbl);

  // Accumulate into 'correction' to account for range transform.
  int32x4_t sum = vdotq_lane_s32(correction, perm_samples, filters, 0);

  // We halved the convolution filter values so -1 from the right shift.
  return vshrn_n_s32(sum, ROUND0_BITS - 1);
}

// Compute 8 output pixels of a 4-tap horizontal convolution for the 2D
// first pass; returns 16-bit intermediate values.
static inline int16x8_t convolve4_8_2d_h(const uint8x16_t samples,
                                         const int8x8_t filters,
                                         const uint8x16x2_t permute_tbl,
                                         const int32x4_t correction) {
  // Transform sample range to [-128, 127] for 8-bit signed dot product.
  int8x16_t samples_128 =
      vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));

  // Permute samples ready for dot product.
  // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
  // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
  int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
                                vqtbl1q_s8(samples_128, permute_tbl.val[1]) };

  // Accumulate into 'correction' to account for range transform.
  int32x4_t sum0123 = vdotq_lane_s32(correction, perm_samples[0], filters, 0);
  int32x4_t sum4567 = vdotq_lane_s32(correction, perm_samples[1], filters, 0);

  // Narrow and re-pack.
  // We halved the filter values so -1 from right shift.
  return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1),
                      vshrn_n_s32(sum4567, ROUND0_BITS - 1));
}

// Horizontal (first) pass of the 4-tap/bilinear 2D convolution: writes
// 16-bit intermediate values to dst for the vertical pass to consume.
static inline void convolve_2d_sr_horiz_4tap_neon_dotprod(
    const uint8_t *src, ptrdiff_t src_stride, int16_t *dst,
    ptrdiff_t dst_stride, int w, int h, const int16_t *filter_x) {
  const int bd = 8;
  const int16x4_t x_filter = vld1_s16(filter_x + 2);
  // All 4-tap and bilinear filter values are even, so halve them to reduce
  // intermediate precision requirements.
  const int8x8_t filter = vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);

  // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
  // shifts - which are generally faster than rounding shifts on modern CPUs.
  const int32_t horiz_const =
      ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
  // Accumulate into 128 << FILTER_BITS to account for range transform.
  // Halve the total because we halved the filter values.
  const int32x4_t correction =
      vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2);

  if (w == 4) {
    const uint8x16_t permute_tbl = vld1q_u8(kDotProdPermuteTbl);

    do {
      uint8x16_t s0, s1, s2, s3;
      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);

      int16x4_t d0 = convolve4_4_2d_h(s0, filter, permute_tbl, correction);
      int16x4_t d1 = convolve4_4_2d_h(s1, filter, permute_tbl, correction);
      int16x4_t d2 = convolve4_4_2d_h(s2, filter, permute_tbl, correction);
      int16x4_t d3 = convolve4_4_2d_h(s3, filter, permute_tbl, correction);

      store_s16_4x4(dst, dst_stride, d0, d1, d2, d3);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h > 4);

    // Process the remaining (up to 4) rows one at a time.
    do {
      uint8x16_t s0 = vld1q_u8(src);
      int16x4_t d0 = convolve4_4_2d_h(s0, filter, permute_tbl, correction);
      vst1_s16(dst, d0);

      src += src_stride;
      dst += dst_stride;
    } while (--h != 0);
  } else {
    const uint8x16x2_t permute_tbl = vld1q_u8_x2(kDotProdPermuteTbl);
    do {
      const uint8_t *s = src;
      int16_t *d = dst;
      int width = w;

      do {
        uint8x16_t s0, s1, s2, s3;
        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

        int16x8_t d0 = convolve4_8_2d_h(s0, filter, permute_tbl, correction);
        int16x8_t d1 = convolve4_8_2d_h(s1, filter, permute_tbl, correction);
        int16x8_t d2 = convolve4_8_2d_h(s2, filter, permute_tbl, correction);
        int16x8_t d3 = convolve4_8_2d_h(s3, filter, permute_tbl, correction);

        store_s16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h > 4);

    // Process the remaining (up to 4) rows one at a time.
    do {
      const uint8_t *s = src;
      int16_t *d = dst;
      int width = w;

      do {
        uint8x16_t s0 = vld1q_u8(s);
        int16x8_t d0 = convolve4_8_2d_h(s0, filter, permute_tbl, correction);
        vst1q_s16(d,
                  d0);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src += src_stride;
      dst += dst_stride;
    } while (--h != 0);
  }
}

// Compute 8 output pixels of an 8-tap horizontal convolution for the 2D
// first pass; returns 16-bit intermediate values.
static inline int16x8_t convolve8_8_2d_h(uint8x16_t samples,
                                         const int8x8_t filters,
                                         const int32x4_t correction,
                                         const uint8x16x3_t permute_tbl) {
  // Transform sample range to [-128, 127] for 8-bit signed dot product.
  int8x16_t samples_128 =
      vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));

  // Permute samples ready for dot product.
  // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
  // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
  // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
  int8x16_t perm_samples[3] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
                                vqtbl1q_s8(samples_128, permute_tbl.val[1]),
                                vqtbl1q_s8(samples_128, permute_tbl.val[2]) };

  // Accumulate dot product into 'correction' to account for range transform.
  int32x4_t sum0123 = vdotq_lane_s32(correction, perm_samples[0], filters, 0);
  sum0123 = vdotq_lane_s32(sum0123, perm_samples[1], filters, 1);

  int32x4_t sum4567 = vdotq_lane_s32(correction, perm_samples[1], filters, 0);
  sum4567 = vdotq_lane_s32(sum4567, perm_samples[2], filters, 1);

  // Narrow and re-pack.
  // We halved the convolution filter values so -1 from the right shift.
  return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1),
                      vshrn_n_s32(sum4567, ROUND0_BITS - 1));
}

// Horizontal (first) pass of the 8-tap 2D convolution: writes 16-bit
// intermediate values to im_block for the vertical pass to consume.
static inline void convolve_2d_sr_horiz_8tap_neon_dotprod(
    const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w,
    int im_h, const int16_t *x_filter_ptr) {
  const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr);
  // Filter values are even, so halve to reduce intermediate precision reqs.
  const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1);

  const int bd = 8;
  // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
  // shifts - which are generally faster than rounding shifts on modern CPUs.
  const int32_t horiz_const =
      ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
  // Halve the total because we halved the filter values.
  const int32x4_t correction =
      vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2);

  const uint8_t *src_ptr = src;
  int16_t *dst_ptr = im_block;
  int dst_stride = im_stride;
  int height = im_h;

  const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl);
  do {
    const uint8_t *s = src_ptr;
    int16_t *d = dst_ptr;
    int width = w;

    do {
      uint8x16_t s0, s1, s2, s3;
      load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

      int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, correction, permute_tbl);
      int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, correction, permute_tbl);
      int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, correction, permute_tbl);
      int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, correction, permute_tbl);

      store_s16_8x4(d, dst_stride, d0, d1, d2, d3);

      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src_ptr += 4 * src_stride;
    dst_ptr += 4 * dst_stride;
    height -= 4;
  } while (height > 4);

  // Process the remaining (up to 4) rows one at a time.
  do {
    const uint8_t *s = src_ptr;
    int16_t *d = dst_ptr;
    int width = w;

    do {
      uint8x16_t s0 = vld1q_u8(s);
      int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, correction, permute_tbl);
      vst1q_s16(d, d0);

      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  } while (--height != 0);
}

// 2D convolution specialised for a 6-tap vertical filter: the horizontal
// pass output is kept in registers and fed straight into the vertical pass
// (no intermediate im_block).
static inline void convolve_2d_sr_6tap_neon_dotprod(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const int16_t *x_filter_ptr, const int16_t *y_filter_ptr) {
  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
  // Filter values are even, so halve to reduce intermediate precision reqs.
  const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);

  const int bd = 8;
  // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
  // shifts - which are generally faster than rounding shifts on modern CPUs.
  const int32_t horiz_const =
      ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
  // Accumulate into 128 << FILTER_BITS to account for range transform.
  // Halve the total because we halved the filter values.
  const int32x4_t correction =
      vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2);
  const int16x8_t vert_const = vdupq_n_s16(1 << (bd - 1));
  const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl);

  do {
    const uint8_t *s = src;
    uint8_t *d = dst;
    int height = h;

    uint8x16_t h_s0, h_s1, h_s2, h_s3, h_s4;
    load_u8_16x5(s, src_stride, &h_s0, &h_s1, &h_s2, &h_s3, &h_s4);
    s += 5 * src_stride;

    int16x8_t v_s0 = convolve8_8_2d_h(h_s0, x_filter, correction, permute_tbl);
    int16x8_t v_s1 = convolve8_8_2d_h(h_s1, x_filter, correction, permute_tbl);
    int16x8_t v_s2 = convolve8_8_2d_h(h_s2, x_filter, correction, permute_tbl);
    int16x8_t v_s3 = convolve8_8_2d_h(h_s3, x_filter, correction, permute_tbl);
    int16x8_t v_s4 = convolve8_8_2d_h(h_s4, x_filter, correction, permute_tbl);

    do {
      uint8x16_t h_s5, h_s6, h_s7, h_s8;
      load_u8_16x4(s, src_stride, &h_s5, &h_s6, &h_s7, &h_s8);

      int16x8_t v_s5 =
          convolve8_8_2d_h(h_s5, x_filter, correction, permute_tbl);
      int16x8_t v_s6 =
          convolve8_8_2d_h(h_s6, x_filter, correction, permute_tbl);
      int16x8_t v_s7 =
          convolve8_8_2d_h(h_s7, x_filter, correction, permute_tbl);
      int16x8_t v_s8 =
          convolve8_8_2d_h(h_s8, x_filter, correction, permute_tbl);

      uint8x8_t d0 = convolve6_8_2d_v(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5,
                                      y_filter, vert_const);
      uint8x8_t d1 = convolve6_8_2d_v(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6,
                                      y_filter, vert_const);
      uint8x8_t d2 = convolve6_8_2d_v(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7,
                                      y_filter, vert_const);
      uint8x8_t d3 = convolve6_8_2d_v(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8,
                                      y_filter, vert_const);

      store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

      // Slide the vertical source window up four rows for the next
      // iteration.
      v_s0 = v_s4;
      v_s1 = v_s5;
      v_s2 = v_s6;
      v_s3 = v_s7;
      v_s4 = v_s8;

      s += 4 * src_stride;
      d += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
    src += 8;
    dst += 8;
    w -= 8;
  } while (w != 0);
}

// 2D convolution specialised for 4-tap (or bilinear) horizontal and
// vertical filters; both passes are fused and kept in registers.
static inline void convolve_2d_sr_4tap_neon_dotprod(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const int16_t *x_filter_ptr, const int16_t *y_filter_ptr) {
  const int bd = 8;
  const int16x8_t vert_const = vdupq_n_s16(1 << (bd - 1));

  const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2);
  const int16x4_t x_filter_s16 = vld1_s16(x_filter_ptr + 2);
  // All 4-tap and bilinear filter values are even, so halve them to reduce
  // intermediate precision requirements.
  const int8x8_t x_filter =
      vshrn_n_s16(vcombine_s16(x_filter_s16, vdup_n_s16(0)), 1);

  // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
  // shifts - which are generally faster than rounding shifts on modern CPUs.
  const int32_t horiz_const =
      ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
  // Accumulate into 128 << FILTER_BITS to account for range transform.
  // Halve the total because we halved the filter values.
  const int32x4_t correction =
      vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2);

  if (w == 4) {
    const uint8x16_t permute_tbl = vld1q_u8(kDotProdPermuteTbl);

    uint8x16_t h_s0, h_s1, h_s2;
    load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2);

    int16x4_t v_s0 = convolve4_4_2d_h(h_s0, x_filter, permute_tbl, correction);
    int16x4_t v_s1 = convolve4_4_2d_h(h_s1, x_filter, permute_tbl, correction);
    int16x4_t v_s2 = convolve4_4_2d_h(h_s2, x_filter, permute_tbl, correction);

    src += 3 * src_stride;

    do {
      uint8x16_t h_s3, h_s4, h_s5, h_s6;
      load_u8_16x4(src, src_stride, &h_s3, &h_s4, &h_s5, &h_s6);

      int16x4_t v_s3 =
          convolve4_4_2d_h(h_s3, x_filter, permute_tbl, correction);
      int16x4_t v_s4 =
          convolve4_4_2d_h(h_s4, x_filter, permute_tbl, correction);
      int16x4_t v_s5 =
          convolve4_4_2d_h(h_s5, x_filter, permute_tbl, correction);
      int16x4_t v_s6 =
          convolve4_4_2d_h(h_s6, x_filter, permute_tbl, correction);

      int16x4_t d0 = convolve4_4_2d_v(v_s0, v_s1, v_s2, v_s3, y_filter);
      int16x4_t d1 = convolve4_4_2d_v(v_s1, v_s2, v_s3, v_s4, y_filter);
      int16x4_t d2 = convolve4_4_2d_v(v_s2, v_s3, v_s4, v_s5, y_filter);
      int16x4_t d3 = convolve4_4_2d_v(v_s3, v_s4, v_s5, v_s6, y_filter);

      // Subtract the vertical range offset before narrowing to 8 bits.
      uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), vert_const));
      uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), vert_const));

      store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);

      v_s0 = v_s4;
      v_s1 = v_s5;
      v_s2 = v_s6;

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint8x16x2_t permute_tbl = vld1q_u8_x2(kDotProdPermuteTbl);

    do {
      int height = h;
      const uint8_t *s = src;
      uint8_t *d = dst;

      uint8x16_t h_s0, h_s1, h_s2;
      // NOTE(review): loads via 'src' rather than 's'; equivalent here since
      // s == src at this point of every outer iteration.
      load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2);

      int16x8_t v_s0 =
          convolve4_8_2d_h(h_s0, x_filter, permute_tbl, correction);
      int16x8_t v_s1 =
          convolve4_8_2d_h(h_s1, x_filter, permute_tbl, correction);
      int16x8_t v_s2 =
          convolve4_8_2d_h(h_s2, x_filter, permute_tbl, correction);

      s += 3 * src_stride;

      do {
        uint8x16_t h_s3, h_s4, h_s5, h_s6;
        load_u8_16x4(s, src_stride, &h_s3, &h_s4, &h_s5, &h_s6);

        int16x8_t v_s3 =
            convolve4_8_2d_h(h_s3, x_filter, permute_tbl, correction);
        int16x8_t v_s4 =
            convolve4_8_2d_h(h_s4, x_filter, permute_tbl, correction);
        int16x8_t v_s5 =
            convolve4_8_2d_h(h_s5, x_filter, permute_tbl, correction);
        int16x8_t v_s6 =
            convolve4_8_2d_h(h_s6, x_filter, permute_tbl, correction);

        uint8x8_t d0 =
            convolve4_8_2d_v(v_s0, v_s1, v_s2, v_s3, y_filter, vert_const);
        uint8x8_t d1 =
            convolve4_8_2d_v(v_s1, v_s2, v_s3, v_s4, y_filter, vert_const);
        uint8x8_t d2 =
            convolve4_8_2d_v(v_s2, v_s3, v_s4, v_s5, y_filter, vert_const);
        uint8x8_t d3 =
            convolve4_8_2d_v(v_s3, v_s4, v_s5, v_s6, y_filter, vert_const);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        v_s0 = v_s4;
        v_s1 = v_s5;
        v_s2 = v_s6;

        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}

// Dispatch for the 2D single-round convolution: falls back to C for
// 2-wide or 2-high blocks, then selects a fused or two-pass SIMD path.
void av1_convolve_2d_sr_neon_dotprod(const uint8_t *src, int src_stride,
                                     uint8_t *dst, int dst_stride, int w, int h,
                                     const InterpFilterParams *filter_params_x,
                                     const InterpFilterParams *filter_params_y,
                                     const int subpel_x_qn,
                                     const int subpel_y_qn,
                                     ConvolveParams *conv_params) {
  if (w == 2 || h == 2) {
    av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
                         filter_params_x, filter_params_y, subpel_x_qn,
                         subpel_y_qn, conv_params);
    return;
  }

const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); 1604 const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); 1605 const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps; 1606 const int im_h = h + clamped_y_taps - 1; 1607 const int im_stride = MAX_SB_SIZE; 1608 const int vert_offset = clamped_y_taps / 2 - 1; 1609 const int horiz_offset = filter_params_x->taps / 2 - 1; 1610 const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset; 1611 1612 const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( 1613 filter_params_x, subpel_x_qn & SUBPEL_MASK); 1614 const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( 1615 filter_params_y, subpel_y_qn & SUBPEL_MASK); 1616 1617 if (filter_params_x->taps > 8) { 1618 DECLARE_ALIGNED(16, int16_t, 1619 im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); 1620 1621 const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr); 1622 const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8); 1623 const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); 1624 const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8); 1625 1626 convolve_2d_sr_horiz_12tap_neon_dotprod(src_ptr, src_stride, im_block, 1627 im_stride, w, im_h, x_filter_0_7, 1628 x_filter_8_11); 1629 1630 convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride, w, h, 1631 y_filter_0_7, y_filter_8_11); 1632 } else { 1633 if (x_filter_taps >= 6 && y_filter_taps == 6) { 1634 convolve_2d_sr_6tap_neon_dotprod(src_ptr, src_stride, dst, dst_stride, w, 1635 h, x_filter_ptr, y_filter_ptr); 1636 return; 1637 } 1638 1639 if (x_filter_taps <= 4 && y_filter_taps <= 4) { 1640 convolve_2d_sr_4tap_neon_dotprod(src_ptr + 2, src_stride, dst, dst_stride, 1641 w, h, x_filter_ptr, y_filter_ptr); 1642 return; 1643 } 1644 1645 DECLARE_ALIGNED(16, int16_t, 1646 im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); 1647 1648 if (x_filter_taps <= 4) { 1649 convolve_2d_sr_horiz_4tap_neon_dotprod(src_ptr + 
2, src_stride, im_block, 1650 im_stride, w, im_h, x_filter_ptr); 1651 } else { 1652 convolve_2d_sr_horiz_8tap_neon_dotprod(src_ptr, src_stride, im_block, 1653 im_stride, w, im_h, x_filter_ptr); 1654 } 1655 1656 const int16x8_t y_filter = vld1q_s16(y_filter_ptr); 1657 1658 if (clamped_y_taps <= 4) { 1659 convolve_2d_sr_vert_4tap_neon(im_block, im_stride, dst, dst_stride, w, h, 1660 y_filter_ptr); 1661 } else if (clamped_y_taps == 6) { 1662 convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, w, h, 1663 y_filter); 1664 } else { 1665 convolve_2d_sr_vert_8tap_neon(im_block, im_stride, dst, dst_stride, w, h, 1666 y_filter); 1667 } 1668 } 1669 }