aom_convolve8_neon_dotprod.c (21250B)
/*
 * Copyright (c) 2014 The WebM project authors. All rights reserved.
 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>
#include <string.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/arm/aom_convolve8_neon.h"
#include "aom_dsp/arm/aom_filter.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_ports/mem.h"

// Filter values always sum to 128.
#define FILTER_WEIGHT 128

DECLARE_ALIGNED(16, static const uint8_t, kDotProdPermuteTbl[48]) = {
  0, 1, 2,  3,  1, 2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,
  4, 5, 6,  7,  5, 6,  7,  8,  6,  7,  8,  9,  7,  8,  9,  10,
  8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
};

DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = {
  // Shift left and insert new last column in transposed 4x4 block.
  1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
  // Shift left and insert two new columns in transposed 4x4 block.
  2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
  // Shift left and insert three new columns in transposed 4x4 block.
  3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
};

static inline int16x4_t convolve8_4_h(const uint8x16_t samples,
                                      const int8x8_t filters,
                                      const uint8x16x2_t permute_tbl) {
  // Transform sample range to [-128, 127] for 8-bit signed dot product.
  int8x16_t samples_128 =
      vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));

  // Permute samples ready for dot product.
  // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
  // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
  int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
                                vqtbl1q_s8(samples_128, permute_tbl.val[1]) };

  // Accumulate into 128 * FILTER_WEIGHT to account for range transform.
  // (Divide by 2 since we halved the filter values.)
  int32x4_t acc = vdupq_n_s32(128 * FILTER_WEIGHT / 2);
  int32x4_t sum = vdotq_lane_s32(acc, perm_samples[0], filters, 0);
  sum = vdotq_lane_s32(sum, perm_samples[1], filters, 1);

  // Further narrowing and packing is performed by the caller.
  return vmovn_s32(sum);
}

static inline uint8x8_t convolve8_8_h(const uint8x16_t samples,
                                      const int8x8_t filters,
                                      const uint8x16x3_t permute_tbl) {
  // Transform sample range to [-128, 127] for 8-bit signed dot product.
  int8x16_t samples_128 =
      vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));

  // Permute samples ready for dot product.
  // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
  // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
  // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
  int8x16_t perm_samples[3] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
                                vqtbl1q_s8(samples_128, permute_tbl.val[1]),
                                vqtbl1q_s8(samples_128, permute_tbl.val[2]) };

  // Accumulate into 128 * FILTER_WEIGHT to account for range transform.
  // (Divide by 2 since we halved the filter values.)
  int32x4_t acc = vdupq_n_s32(128 * FILTER_WEIGHT / 2);
  // First 4 output values.
  int32x4_t sum0 = vdotq_lane_s32(acc, perm_samples[0], filters, 0);
  sum0 = vdotq_lane_s32(sum0, perm_samples[1], filters, 1);
  // Second 4 output values.
  int32x4_t sum1 = vdotq_lane_s32(acc, perm_samples[1], filters, 0);
  sum1 = vdotq_lane_s32(sum1, perm_samples[2], filters, 1);

  // Narrow and re-pack.
  int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
  // We halved the filter values so -1 from right shift.
  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}

static inline void convolve8_horiz_8tap_neon_dotprod(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, const int16_t *filter_x, int w, int h) {
  // Filter values are even, so halve to reduce intermediate precision reqs.
  const int8x8_t filter = vshrn_n_s16(vld1q_s16(filter_x), 1);

  if (w == 4) {
    const uint8x16x2_t perm_tbl = vld1q_u8_x2(kDotProdPermuteTbl);
    do {
      uint8x16_t s0, s1, s2, s3;
      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);

      int16x4_t d0 = convolve8_4_h(s0, filter, perm_tbl);
      int16x4_t d1 = convolve8_4_h(s1, filter, perm_tbl);
      int16x4_t d2 = convolve8_4_h(s2, filter, perm_tbl);
      int16x4_t d3 = convolve8_4_h(s3, filter, perm_tbl);
      // We halved the filter values so -1 from right shift.
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);

      store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h > 0);
  } else {
    const uint8x16x3_t perm_tbl = vld1q_u8_x3(kDotProdPermuteTbl);

    do {
      int width = w;
      const uint8_t *s = src;
      uint8_t *d = dst;
      do {
        uint8x16_t s0, s1, s2, s3;
        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

        uint8x8_t d0 = convolve8_8_h(s0, filter, perm_tbl);
        uint8x8_t d1 = convolve8_8_h(s1, filter, perm_tbl);
        uint8x8_t d2 = convolve8_8_h(s2, filter, perm_tbl);
        uint8x8_t d3 = convolve8_8_h(s3, filter, perm_tbl);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h > 0);
  }
}

static inline int16x4_t convolve4_4_h(const uint8x16_t samples,
                                      const int8x8_t filters,
                                      const uint8x16_t permute_tbl) {
  // Transform sample range to [-128, 127] for 8-bit signed dot product.
  int8x16_t samples_128 =
      vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));

  // Permute samples ready for dot product.
  // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
  int8x16_t perm_samples = vqtbl1q_s8(samples_128, permute_tbl);

  // Accumulate into 128 * FILTER_WEIGHT to account for range transform.
  // (Divide by 2 since we halved the filter values.)
  int32x4_t acc = vdupq_n_s32(128 * FILTER_WEIGHT / 2);
  int32x4_t sum = vdotq_lane_s32(acc, perm_samples, filters, 0);

  // Further narrowing and packing is performed by the caller.
  return vmovn_s32(sum);
}

static inline uint8x8_t convolve4_8_h(const uint8x16_t samples,
                                      const int8x8_t filters,
                                      const uint8x16x2_t permute_tbl) {
  // Transform sample range to [-128, 127] for 8-bit signed dot product.
  int8x16_t samples_128 =
      vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));

  // Permute samples ready for dot product.
  // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
  // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
  int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
                                vqtbl1q_s8(samples_128, permute_tbl.val[1]) };

  // Accumulate into 128 * FILTER_WEIGHT to account for range transform.
  // (Divide by 2 since we halved the filter values.)
  int32x4_t acc = vdupq_n_s32(128 * FILTER_WEIGHT / 2);
  // First 4 output values.
  int32x4_t sum0 = vdotq_lane_s32(acc, perm_samples[0], filters, 0);
  // Second 4 output values.
  int32x4_t sum1 = vdotq_lane_s32(acc, perm_samples[1], filters, 0);

  // Narrow and re-pack.
  int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
  // We halved the filter values so -1 from right shift.
  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}

static inline void convolve8_horiz_4tap_neon_dotprod(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, const int16_t *filter_x, int width, int height) {
  const int16x4_t x_filter = vld1_s16(filter_x + 2);
  // All 4-tap and bilinear filter values are even, so halve them to reduce
  // intermediate precision requirements.
  const int8x8_t filter = vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);

  if (width == 4) {
    const uint8x16_t permute_tbl = vld1q_u8(kDotProdPermuteTbl);

    do {
      uint8x16_t s0, s1, s2, s3;
      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);

      int16x4_t t0 = convolve4_4_h(s0, filter, permute_tbl);
      int16x4_t t1 = convolve4_4_h(s1, filter, permute_tbl);
      int16x4_t t2 = convolve4_4_h(s2, filter, permute_tbl);
      int16x4_t t3 = convolve4_4_h(s3, filter, permute_tbl);
      // We halved the filter values so -1 from right shift.
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);

      store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height > 0);
  } else {
    const uint8x16x2_t permute_tbl = vld1q_u8_x2(kDotProdPermuteTbl);

    do {
      const uint8_t *s = src;
      uint8_t *d = dst;
      int w = width;

      do {
        uint8x16_t s0, s1, s2, s3;
        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

        uint8x8_t d0 = convolve4_8_h(s0, filter, permute_tbl);
        uint8x8_t d1 = convolve4_8_h(s1, filter, permute_tbl);
        uint8x8_t d2 = convolve4_8_h(s2, filter, permute_tbl);
        uint8x8_t d3 = convolve4_8_h(s3, filter, permute_tbl);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        w -= 8;
      } while (w != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height > 0);
  }
}

void aom_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const int16_t *filter_x, int x_step_q4,
                                      const int16_t *filter_y, int y_step_q4,
                                      int w, int h) {
  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);

  (void)x_step_q4;
  (void)filter_y;
  (void)y_step_q4;

  src -= ((SUBPEL_TAPS / 2) - 1);

  int filter_taps = get_filter_taps_convolve8(filter_x);

  if (filter_taps == 2) {
    convolve8_horiz_2tap_neon(src + 3, src_stride, dst, dst_stride, filter_x, w,
                              h);
  } else if (filter_taps == 4) {
    convolve8_horiz_4tap_neon_dotprod(src + 2, src_stride, dst, dst_stride,
                                      filter_x, w, h);
  } else {
    convolve8_horiz_8tap_neon_dotprod(src, src_stride, dst, dst_stride,
                                      filter_x, w, h);
  }
}

static inline int16x4_t convolve8_4_v(const int8x16_t samples_lo,
                                      const int8x16_t samples_hi,
                                      const int8x8_t filters) {
  // The sample range transform and permutation are performed by the caller.

  // Accumulate into 128 * FILTER_WEIGHT to account for range transform.
  // (Divide by 2 since we halved the filter values.)
  int32x4_t acc = vdupq_n_s32(128 * FILTER_WEIGHT / 2);
  int32x4_t sum = vdotq_lane_s32(acc, samples_lo, filters, 0);
  sum = vdotq_lane_s32(sum, samples_hi, filters, 1);

  // Further narrowing and packing is performed by the caller.
  return vmovn_s32(sum);
}

static inline uint8x8_t convolve8_8_v(const int8x16_t samples0_lo,
                                      const int8x16_t samples0_hi,
                                      const int8x16_t samples1_lo,
                                      const int8x16_t samples1_hi,
                                      const int8x8_t filters) {
  // The sample range transform and permutation are performed by the caller.

  // Accumulate into 128 * FILTER_WEIGHT to account for range transform.
  // (Divide by 2 since we halved the filter values.)
  int32x4_t acc = vdupq_n_s32(128 * FILTER_WEIGHT / 2);
  // First 4 output values.
  int32x4_t sum0 = vdotq_lane_s32(acc, samples0_lo, filters, 0);
  sum0 = vdotq_lane_s32(sum0, samples0_hi, filters, 1);
  // Second 4 output values.
  int32x4_t sum1 = vdotq_lane_s32(acc, samples1_lo, filters, 0);
  sum1 = vdotq_lane_s32(sum1, samples1_hi, filters, 1);

  // Narrow and re-pack.
  int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
  // We halved the filter values so -1 from right shift.
  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}

static inline void convolve8_vert_8tap_neon_dotprod(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, const int16_t *filter_y, int w, int h) {
  // Filter values are even, so halve to reduce intermediate precision reqs.
  const int8x8_t filter = vshrn_n_s16(vld1q_s16(filter_y), 1);
  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl);
  int8x16x2_t samples_LUT;

  if (w == 4) {
    uint8x8_t t0, t1, t2, t3, t4, t5, t6;
    load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
    src += 7 * src_stride;

    // Clamp sample range to [-128, 127] for 8-bit signed dot product.
    int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
    int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
    int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
    int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
    int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
    int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
    int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));

    // This operation combines a conventional transpose and the sample permute
    // (see horizontal case) required before computing the dot product.
    int8x16_t s0123, s1234, s2345, s3456;
    transpose_concat_elems_s8_4x4(s0, s1, s2, s3, &s0123);
    transpose_concat_elems_s8_4x4(s1, s2, s3, s4, &s1234);
    transpose_concat_elems_s8_4x4(s2, s3, s4, s5, &s2345);
    transpose_concat_elems_s8_4x4(s3, s4, s5, s6, &s3456);

    do {
      uint8x8_t t7, t8, t9, t10;
      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);

      int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
      int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
      int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
      int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128)));

      int8x16_t s4567, s5678, s6789, s78910;
      transpose_concat_elems_s8_4x4(s7, s8, s9, s10, &s78910);

      // Merge new data into block from previous iteration.
      samples_LUT.val[0] = s3456;
      samples_LUT.val[1] = s78910;
      s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
      s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
      s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

      int16x4_t d0 = convolve8_4_v(s0123, s4567, filter);
      int16x4_t d1 = convolve8_4_v(s1234, s5678, filter);
      int16x4_t d2 = convolve8_4_v(s2345, s6789, filter);
      int16x4_t d3 = convolve8_4_v(s3456, s78910, filter);
      // We halved the filter values so -1 from right shift.
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);

      store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);

      // Prepare block for next iteration - re-using as much as possible.
      // Shuffle everything up four rows.
      s0123 = s4567;
      s1234 = s5678;
      s2345 = s6789;
      s3456 = s78910;

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    do {
      int height = h;
      const uint8_t *s = src;
      uint8_t *d = dst;

      uint8x8_t t0, t1, t2, t3, t4, t5, t6;
      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
      s += 7 * src_stride;

      // Clamp sample range to [-128, 127] for 8-bit signed dot product.
      int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
      int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
      int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
      int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
      int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
      int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
      int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));

      // This operation combines a conventional transpose and the sample permute
      // (see horizontal case) required before computing the dot product.
      int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
          s3456_lo, s3456_hi;
      transpose_concat_elems_s8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
      transpose_concat_elems_s8_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
      transpose_concat_elems_s8_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
      transpose_concat_elems_s8_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);

      do {
        uint8x8_t t7, t8, t9, t10;
        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);

        int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
        int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
        int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
        int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128)));

        int8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi,
            s78910_lo, s78910_hi;
        transpose_concat_elems_s8_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);

        // Merge new data into block from previous iteration.
        samples_LUT.val[0] = s3456_lo;
        samples_LUT.val[1] = s78910_lo;
        s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
        s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
        s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

        samples_LUT.val[0] = s3456_hi;
        samples_LUT.val[1] = s78910_hi;
        s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
        s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
        s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

        uint8x8_t d0 =
            convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filter);
        uint8x8_t d1 =
            convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filter);
        uint8x8_t d2 =
            convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filter);
        uint8x8_t d3 =
            convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filter);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        // Prepare block for next iteration - re-using as much as possible.
        // Shuffle everything up four rows.
        s0123_lo = s4567_lo;
        s0123_hi = s4567_hi;
        s1234_lo = s5678_lo;
        s1234_hi = s5678_hi;
        s2345_lo = s6789_lo;
        s2345_hi = s6789_hi;
        s3456_lo = s78910_lo;
        s3456_hi = s78910_hi;

        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}

void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
                                     uint8_t *dst, ptrdiff_t dst_stride,
                                     const int16_t *filter_x, int x_step_q4,
                                     const int16_t *filter_y, int y_step_q4,
                                     int w, int h) {
  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);

  (void)filter_x;
  (void)x_step_q4;
  (void)y_step_q4;

  src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;

  int filter_taps = get_filter_taps_convolve8(filter_y);

  if (filter_taps == 2) {
    convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst, dst_stride,
                             filter_y, w, h);
  } else if (filter_taps == 4) {
    convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst, dst_stride,
                             filter_y, w, h);
  } else {
    convolve8_vert_8tap_neon_dotprod(src, src_stride, dst, dst_stride, filter_y,
                                     w, h);
  }
}