aom_convolve8_neon_i8mm.c (24525B)
/*
 * Copyright (c) 2014 The WebM project authors. All rights reserved.
 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>
#include <string.h>

#include "config/aom_config.h"

#include "aom/aom_integer.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/arm/aom_convolve8_neon.h"
#include "aom_dsp/arm/aom_filter.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_ports/mem.h"

// Sample permute indices for the 6-tap horizontal USMMLA kernels: each
// 8-byte half of a vector is a sliding window of consecutive source samples,
// forming the 2x8 "samples" operand of the matrix multiply.
DECLARE_ALIGNED(16, static const uint8_t, kMatMul6PermuteTbl[32]) = {
  // clang-format off
  0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9,
  4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13
  // clang-format on
};

// Sample permute indices for the 8-tap horizontal USMMLA kernels. Windows
// start at sample 1 because tap 0 is applied separately by the callers (see
// convolve8_4_h / convolve8_8_h).
DECLARE_ALIGNED(16, static const uint8_t, kMatMul8PermuteTbl[32]) = {
  // clang-format off
  1, 2, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 9, 10,
  5, 6, 7, 8, 9, 10, 11, 12, 7, 8, 9, 10, 11, 12, 13, 14
  // clang-format on
};

// Staggers taps 1-7 of the 8-tap filter for the USMMLA kernels. Index 16 is
// out of range for a single-register TBL lookup and therefore yields zero.
DECLARE_ALIGNED(16, static const uint8_t, kMatMul8FilterPermuteTbl[16]) = {
  // clang-format off
  1, 2, 3, 4, 5, 6, 7, 16, 16, 1, 2, 3, 4, 5, 6, 7
  // clang-format on
};

// TBL indices used by the vertical kernels to merge four new transposed
// source rows (second LUT register, indices 16+) into the rolling 4x4 block
// held from the previous iteration (first LUT register, indices 0-15).
DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = {
  // Shift left and insert new last column in transposed 4x4 block.
  1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
  // Shift left and insert two new columns in transposed 4x4 block.
  2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
  // Shift left and insert three new columns in transposed 4x4 block.
  3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
};

// Compute 4 horizontal outputs using taps 1-7 of the 8-tap filter via the
// USMMLA (i8mm) matrix-multiply instruction. Tap 0 is handled by the caller.
static inline int16x4_t convolve8_4_h(const uint8x16_t samples,
                                      const int8x16_t filters,
                                      const uint8x16_t permute_tbl) {
  // Permute samples ready for matrix multiply.
  // { 1, 2, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 9, 10 }
  uint8x16_t perm_samples = vqtbl1q_u8(samples, permute_tbl);

  // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix
  // (filter), destructively accumulating into the destination register.
  int32x4_t sum = vusmmlaq_s32(vdupq_n_s32(0), perm_samples, filters);

  // Tap 0, as well as further narrowing and packing, is applied by the caller.
  return vmovn_s32(sum);
}

// Compute 8 horizontal outputs of the 8-tap filter. Taps 1-7 are applied via
// USMMLA; tap 0 is applied here with a widening multiply-subtract, where f0
// holds the negated (and halved) tap-0 value supplied by the caller.
static inline uint8x8_t convolve8_8_h(const uint8x16_t samples,
                                      const int8x16_t filters,
                                      const uint8x8_t f0,
                                      const uint8x16x2_t permute_tbl) {
  // Permute samples ready for matrix multiply.
  // { 1, 2, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 9, 10 }
  // { 5, 6, 7, 8, 9, 10, 11, 12, 7, 8, 9, 10, 11, 12, 13, 14 }
  uint8x16_t perm_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
                                 vqtbl1q_u8(samples, permute_tbl.val[1]) };

  // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix
  // (filter), destructively accumulating into the destination register.
  int32x4_t sum0123 = vusmmlaq_s32(vdupq_n_s32(0), perm_samples[0], filters);
  int32x4_t sum4567 = vusmmlaq_s32(vdupq_n_s32(0), perm_samples[1], filters);

  // Narrow and re-pack.
  int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567));
  // Apply tap 0 and accumulate.
  sum = vreinterpretq_s16_u16(
      vmlsl_u8(vreinterpretq_u16_s16(sum), vget_low_u8(samples), f0));

  // We halved the filter values so -1 from right shift.
  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}

// 8-tap horizontal convolution, processing the frame in 4-row tiles (and
// 8-column strips for w > 4). w and h are assumed to be multiples of 4.
static inline void convolve8_horiz_8tap_neon_i8mm(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, const int16_t *filter_x, int w, int h) {
  // Filter values are even, so halve to reduce intermediate precision reqs.
  const int8x8_t filter_s8 = vshrn_n_s16(vld1q_s16(filter_x), 1);
  // Stagger the filter for use with the matrix multiply instructions.
  // { f1, f2, f3, f4, f5, f6, f7, 0, 0, f1, f2, f3, f4, f5, f6, f7 }
  const uint8x16_t filter_idx = vld1q_u8(kMatMul8FilterPermuteTbl);
  const int8x16_t filter =
      vqtbl1q_s8(vcombine_s8(filter_s8, vdup_n_s8(0)), filter_idx);

  // Since f0 is always negative and samples are unsigned, subtract (unsigned)
  // s0 * -f0 to avoid signed overflow.
  const uint8x8_t f0 = vdup_n_u8(-filter_x[0] >> 1);

  if (w == 4) {
    const uint8x16_t perm_tbl = vld1q_u8(kMatMul8PermuteTbl);

    do {
      uint8x16_t s0, s1, s2, s3;
      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
      // Reload the leading 4 samples of each row pair for the tap-0 pass.
      uint8x8_t s01 = load_u8_4x2(src + 0 * src_stride, src_stride);
      uint8x8_t s23 = load_u8_4x2(src + 2 * src_stride, src_stride);

      int16x4_t t0 = convolve8_4_h(s0, filter, perm_tbl);
      int16x4_t t1 = convolve8_4_h(s1, filter, perm_tbl);
      int16x4_t t2 = convolve8_4_h(s2, filter, perm_tbl);
      int16x4_t t3 = convolve8_4_h(s3, filter, perm_tbl);
      // Apply tap 0 and accumulate.
      int16x8_t t01 = vcombine_s16(t0, t1);
      int16x8_t t23 = vcombine_s16(t2, t3);
      t01 =
          vreinterpretq_s16_u16(vmlsl_u8(vreinterpretq_u16_s16(t01), s01, f0));
      t23 =
          vreinterpretq_s16_u16(vmlsl_u8(vreinterpretq_u16_s16(t23), s23, f0));
      // We halved the filter values so -1 from right shift.
      uint8x8_t d01 = vqrshrun_n_s16(t01, FILTER_BITS - 1);
      uint8x8_t d23 = vqrshrun_n_s16(t23, FILTER_BITS - 1);

      store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h > 0);
  } else {
    const uint8x16x2_t perm_tbl = vld1q_u8_x2(kMatMul8PermuteTbl);

    do {
      int width = w;
      const uint8_t *s = src;
      uint8_t *d = dst;
      do {
        uint8x16_t s0, s1, s2, s3;
        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

        uint8x8_t d0 = convolve8_8_h(s0, filter, f0, perm_tbl);
        uint8x8_t d1 = convolve8_8_h(s1, filter, f0, perm_tbl);
        uint8x8_t d2 = convolve8_8_h(s2, filter, f0, perm_tbl);
        uint8x8_t d3 = convolve8_8_h(s3, filter, f0, perm_tbl);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h > 0);
  }
}

// Compute 4 horizontal outputs of a 6-tap filter via USMMLA. Unlike the
// 8-tap variant, no separate tap-0 pass is needed.
static inline int16x4_t convolve6_4_h(const uint8x16_t samples,
                                      const int8x16_t filter,
                                      const uint8x16_t permute_tbl) {
  // Permute samples ready for matrix multiply.
  // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 }
  uint8x16_t perm_samples = vqtbl1q_u8(samples, permute_tbl);

  // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix
  // (filter), destructively accumulating into the destination register.
  int32x4_t sum = vusmmlaq_s32(vdupq_n_s32(0), perm_samples, filter);

  // Further narrowing and packing is performed by the caller.
  return vmovn_s32(sum);
}

// Compute 8 horizontal outputs of a 6-tap filter via USMMLA.
static inline uint8x8_t convolve6_8_h(const uint8x16_t samples,
                                      const int8x16_t filter,
                                      const uint8x16x2_t permute_tbl) {
  // Permute samples ready for matrix multiply.
  // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 }
  // { 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 }
  uint8x16_t perm_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
                                 vqtbl1q_u8(samples, permute_tbl.val[1]) };

  // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix
  // (filter), destructively accumulating into the destination register.
  int32x4_t sum0123 = vusmmlaq_s32(vdupq_n_s32(0), perm_samples[0], filter);
  int32x4_t sum4567 = vusmmlaq_s32(vdupq_n_s32(0), perm_samples[1], filter);

  // Narrow and re-pack.
  int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567));
  // We halved the filter values so -1 from right shift.
  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}

// 6-tap horizontal convolution, processing the frame in 4-row tiles (and
// 8-column strips for width > 4). The caller passes filter_x pointing at an
// 8-tap array whose outer taps are zero.
static inline void convolve8_horiz_6tap_neon_i8mm(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, const int16_t *filter_x, int width, int height) {
  // Filter values are even, so halve to reduce intermediate precision reqs.
  const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(filter_x), 1);
  // Stagger the filter for use with the matrix multiply instructions.
  // { f0, f1, f2, f3, f4, f5, 0, 0, 0, f0, f1, f2, f3, f4, f5, 0 }
  const int8x16_t filter =
      vcombine_s8(vext_s8(x_filter, x_filter, 1), x_filter);

  if (width == 4) {
    const uint8x16_t perm_tbl = vld1q_u8(kMatMul6PermuteTbl);
    do {
      uint8x16_t s0, s1, s2, s3;
      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);

      int16x4_t t0 = convolve6_4_h(s0, filter, perm_tbl);
      int16x4_t t1 = convolve6_4_h(s1, filter, perm_tbl);
      int16x4_t t2 = convolve6_4_h(s2, filter, perm_tbl);
      int16x4_t t3 = convolve6_4_h(s3, filter, perm_tbl);
      // We halved the filter values so -1 from right shift.
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);

      store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height > 0);
  } else {
    const uint8x16x2_t perm_tbl = vld1q_u8_x2(kMatMul6PermuteTbl);

    do {
      int w = width;
      const uint8_t *s = src;
      uint8_t *d = dst;
      do {
        uint8x16_t s0, s1, s2, s3;
        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

        uint8x8_t d0 = convolve6_8_h(s0, filter, perm_tbl);
        uint8x8_t d1 = convolve6_8_h(s1, filter, perm_tbl);
        uint8x8_t d2 = convolve6_8_h(s2, filter, perm_tbl);
        uint8x8_t d3 = convolve6_8_h(s3, filter, perm_tbl);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        w -= 8;
      } while (w != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height > 0);
  }
}

// Public entry point: horizontal 8-tap sub-pel filter, dispatching to a
// 2/6/8-tap specialization based on the actual number of non-zero taps.
void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int x_step_q4,
                                   const int16_t *filter_y, int y_step_q4,
                                   int w, int h) {
  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);

  (void)x_step_q4;
  (void)filter_y;
  (void)y_step_q4;

  // Centre the filter: taps reach (SUBPEL_TAPS / 2 - 1) samples to the left.
  src -= ((SUBPEL_TAPS / 2) - 1);

  int filter_taps = get_filter_taps_convolve8(filter_x);

  if (filter_taps == 2) {
    convolve8_horiz_2tap_neon(src + 3, src_stride, dst, dst_stride, filter_x, w,
                              h);
  } else if (filter_taps <= 6) {
    convolve8_horiz_6tap_neon_i8mm(src + 1, src_stride, dst, dst_stride,
                                   filter_x, w, h);
  } else {
    convolve8_horiz_8tap_neon_i8mm(src, src_stride, dst, dst_stride, filter_x,
                                   w, h);
  }
}

// Compute 4 vertical outputs of the 8-tap filter from pre-transposed sample
// blocks (taps 0-3 in samples_lo, taps 4-7 in samples_hi) using USDOT.
static inline int16x4_t convolve8_4_v(const uint8x16_t samples_lo,
                                      const uint8x16_t samples_hi,
                                      const int8x8_t filters) {
  // Sample permutation is performed by the caller.
  int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0);
  sum = vusdotq_lane_s32(sum, samples_hi, filters, 1);

  // Further narrowing and packing is performed by the caller.
  return vmovn_s32(sum);
}

// Compute 8 vertical outputs of the 8-tap filter: samples0_* feed the first
// 4 output columns and samples1_* the second 4.
static inline uint8x8_t convolve8_8_v(const uint8x16_t samples0_lo,
                                      const uint8x16_t samples0_hi,
                                      const uint8x16_t samples1_lo,
                                      const uint8x16_t samples1_hi,
                                      const int8x8_t filters) {
  // Sample permutation is performed by the caller.

  // First 4 output values.
  int32x4_t sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filters, 0);
  sum0 = vusdotq_lane_s32(sum0, samples0_hi, filters, 1);
  // Second 4 output values.
  int32x4_t sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filters, 0);
  sum1 = vusdotq_lane_s32(sum1, samples1_hi, filters, 1);

  // Narrow and re-pack.
  int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
  // We halved the filter values so -1 from right shift.
  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}

// 8-tap vertical convolution. Source rows are transposed into 4x4 blocks so
// USDOT can operate along columns; each iteration reuses the previous
// transposed block via kDotProdMergeBlockTbl instead of re-transposing.
static inline void convolve8_vert_8tap_neon_i8mm(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, const int16_t *filter_y, int w, int h) {
  // Filter values are even, so halve to reduce intermediate precision reqs.
  const int8x8_t filter = vshrn_n_s16(vld1q_s16(filter_y), 1);
  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl);
  uint8x16x2_t samples_LUT;

  if (w == 4) {
    uint8x8_t s0, s1, s2, s3, s4, s5, s6;
    load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
    src += 7 * src_stride;

    // This operation combines a conventional transpose and the sample permute
    // required before computing the dot product.
    uint8x16_t s0123, s1234, s2345, s3456;
    transpose_concat_elems_u8_4x4(s0, s1, s2, s3, &s0123);
    transpose_concat_elems_u8_4x4(s1, s2, s3, s4, &s1234);
    transpose_concat_elems_u8_4x4(s2, s3, s4, s5, &s2345);
    transpose_concat_elems_u8_4x4(s3, s4, s5, s6, &s3456);

    do {
      uint8x8_t s7, s8, s9, s10;
      load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);

      uint8x16_t s4567, s5678, s6789, s78910;
      transpose_concat_elems_u8_4x4(s7, s8, s9, s10, &s78910);

      // Merge new data into block from previous iteration.
      samples_LUT.val[0] = s3456;
      samples_LUT.val[1] = s78910;
      s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
      s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
      s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);

      int16x4_t d0 = convolve8_4_v(s0123, s4567, filter);
      int16x4_t d1 = convolve8_4_v(s1234, s5678, filter);
      int16x4_t d2 = convolve8_4_v(s2345, s6789, filter);
      int16x4_t d3 = convolve8_4_v(s3456, s78910, filter);
      // We halved the filter values so -1 from right shift.
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);

      store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);

      // Prepare block for next iteration - re-using as much as possible.
      // Shuffle everything up four rows.
      s0123 = s4567;
      s1234 = s5678;
      s2345 = s6789;
      s3456 = s78910;

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    do {
      int height = h;
      const uint8_t *s = src;
      uint8_t *d = dst;

      uint8x8_t s0, s1, s2, s3, s4, s5, s6;
      load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
      s += 7 * src_stride;

      // This operation combines a conventional transpose and the sample permute
      // required before computing the dot product.
      uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
          s3456_lo, s3456_hi;
      transpose_concat_elems_u8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
      transpose_concat_elems_u8_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
      transpose_concat_elems_u8_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
      transpose_concat_elems_u8_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);

      do {
        uint8x8_t s7, s8, s9, s10;
        load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);

        uint8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi,
            s78910_lo, s78910_hi;
        transpose_concat_elems_u8_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);

        // Merge new data into block from previous iteration.
        samples_LUT.val[0] = s3456_lo;
        samples_LUT.val[1] = s78910_lo;
        s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
        s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
        s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);

        samples_LUT.val[0] = s3456_hi;
        samples_LUT.val[1] = s78910_hi;
        s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
        s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
        s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);

        uint8x8_t d0 =
            convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filter);
        uint8x8_t d1 =
            convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filter);
        uint8x8_t d2 =
            convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filter);
        uint8x8_t d3 =
            convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filter);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        // Prepare block for next iteration - re-using as much as possible.
        // Shuffle everything up four rows.
        s0123_lo = s4567_lo;
        s0123_hi = s4567_hi;
        s1234_lo = s5678_lo;
        s1234_hi = s5678_hi;
        s2345_lo = s6789_lo;
        s2345_hi = s6789_hi;
        s3456_lo = s78910_lo;
        s3456_hi = s78910_hi;

        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}

// Compute 4 vertical outputs of a 4-tap filter from one pre-transposed
// sample block using a single USDOT.
static inline int16x4_t convolve4_4_v(const uint8x16_t samples,
                                      const int8x8_t filters) {
  // Sample permutation is performed by the caller.
  int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), samples, filters, 0);

  // Further narrowing and packing is performed by the caller.
  return vmovn_s32(sum);
}

// Compute 8 vertical outputs of a 4-tap filter: samples0 feeds the first 4
// output columns and samples1 the second 4.
static inline uint8x8_t convolve4_8_v(const uint8x16_t samples0,
                                      const uint8x16_t samples1,
                                      const int8x8_t filters) {
  // Sample permutation is performed by the caller.

  // First 4 output values.
  int32x4_t sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0, filters, 0);
  // Second 4 output values.
  int32x4_t sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1, filters, 0);

  // Narrow and re-pack.
  int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
  // We halved the filter values so -1 from right shift.
  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}

// 4-tap vertical convolution. The 4 non-zero taps live at filter_y[2..5];
// the same transposed-block reuse scheme as the 8-tap path is used, but only
// one rolling block per 4/8-column strip is needed.
static inline void convolve8_vert_4tap_neon_i8mm(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, const int16_t *filter_y, int w, int h) {
  // Filter values are even, so halve to reduce intermediate precision reqs.
  const int16x8_t filter_s16 =
      vcombine_s16(vld1_s16(filter_y + 2), vdup_n_s16(0));
  const int8x8_t filter = vshrn_n_s16(filter_s16, 1);
  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl);
  uint8x16x2_t samples_LUT;

  if (w == 4) {
    uint8x8_t s0, s1, s2, s3;
    load_u8_8x4(src, src_stride, &s0, &s1, &s2, &s3);
    src += 4 * src_stride;

    // This operation combines a conventional transpose and the sample permute
    // required before computing the dot product.
    uint8x16_t s0123;
    transpose_concat_elems_u8_4x4(s0, s1, s2, s3, &s0123);

    do {
      uint8x8_t s4, s5, s6, s7;
      load_u8_8x4(src, src_stride, &s4, &s5, &s6, &s7);

      uint8x16_t s4567;
      transpose_concat_elems_u8_4x4(s4, s5, s6, s7, &s4567);

      // Merge new data into block from previous iteration.
      samples_LUT.val[0] = s0123;
      samples_LUT.val[1] = s4567;
      uint8x16_t s1234 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
      uint8x16_t s2345 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
      uint8x16_t s3456 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);

      int16x4_t d0 = convolve4_4_v(s0123, filter);
      int16x4_t d1 = convolve4_4_v(s1234, filter);
      int16x4_t d2 = convolve4_4_v(s2345, filter);
      int16x4_t d3 = convolve4_4_v(s3456, filter);
      // We halved the filter values so -1 from right shift.
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);

      store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);

      // Prepare block for next iteration - re-using as much as possible.
      // Shuffle everything up four rows.
      s0123 = s4567;

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    do {
      int height = h;
      const uint8_t *s = src;
      uint8_t *d = dst;

      uint8x8_t s0, s1, s2, s3;
      load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3);
      s += 4 * src_stride;

      // This operation combines a conventional transpose and the sample permute
      // required before computing the dot product.
      uint8x16_t s0123_lo, s0123_hi;
      transpose_concat_elems_u8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);

      do {
        uint8x8_t s4, s5, s6, s7;
        load_u8_8x4(s, src_stride, &s4, &s5, &s6, &s7);

        uint8x16_t s4567_lo, s4567_hi;
        transpose_concat_elems_u8_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi);

        // Merge new data into block from previous iteration.
        samples_LUT.val[0] = s0123_lo;
        samples_LUT.val[1] = s4567_lo;
        uint8x16_t s1234_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
        uint8x16_t s2345_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
        uint8x16_t s3456_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);

        samples_LUT.val[0] = s0123_hi;
        samples_LUT.val[1] = s4567_hi;
        uint8x16_t s1234_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
        uint8x16_t s2345_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
        uint8x16_t s3456_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);

        uint8x8_t d0 = convolve4_8_v(s0123_lo, s0123_hi, filter);
        uint8x8_t d1 = convolve4_8_v(s1234_lo, s1234_hi, filter);
        uint8x8_t d2 = convolve4_8_v(s2345_lo, s2345_hi, filter);
        uint8x8_t d3 = convolve4_8_v(s3456_lo, s3456_hi, filter);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        // Prepare block for next iteration - re-using as much as possible.
        // Shuffle everything up four rows.
        s0123_lo = s4567_lo;
        s0123_hi = s4567_hi;

        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}

// Public entry point: vertical 8-tap sub-pel filter, dispatching to a
// 2/4/8-tap specialization based on the actual number of non-zero taps.
void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4, int w,
                                  int h) {
  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);

  (void)filter_x;
  (void)x_step_q4;
  (void)y_step_q4;

  // Centre the filter: taps reach (SUBPEL_TAPS / 2 - 1) rows above src.
  src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;

  int filter_taps = get_filter_taps_convolve8(filter_y);

  if (filter_taps == 2) {
    convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst, dst_stride,
                             filter_y, w, h);
  } else if (filter_taps == 4) {
    convolve8_vert_4tap_neon_i8mm(src + 2 * src_stride, src_stride, dst,
                                  dst_stride, filter_y, w, h);
  } else {
    convolve8_vert_8tap_neon_i8mm(src, src_stride, dst, dst_stride, filter_y, w,
                                  h);
  }
}