av1_convolve_scale_neon.c
/*
 * Copyright (c) 2024, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>
#include <stdint.h>

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "av1/common/arm/convolve_scale_neon.h"
#include "av1/common/convolve.h"
#include "av1/common/filter.h"

static inline int16x4_t convolve8_4_h(const int16x4_t s0, const int16x4_t s1,
                                      const int16x4_t s2, const int16x4_t s3,
                                      const int16x4_t s4, const int16x4_t s5,
                                      const int16x4_t s6, const int16x4_t s7,
                                      const int16x8_t filter,
                                      const int32x4_t horiz_const) {
  int16x4_t filter_lo = vget_low_s16(filter);
  int16x4_t filter_hi = vget_high_s16(filter);

  int32x4_t sum = horiz_const;
  sum = vmlal_lane_s16(sum, s0, filter_lo, 0);
  sum = vmlal_lane_s16(sum, s1, filter_lo, 1);
  sum = vmlal_lane_s16(sum, s2, filter_lo, 2);
  sum = vmlal_lane_s16(sum, s3, filter_lo, 3);
  sum = vmlal_lane_s16(sum, s4, filter_hi, 0);
  sum = vmlal_lane_s16(sum, s5, filter_hi, 1);
  sum = vmlal_lane_s16(sum, s6, filter_hi, 2);
  sum = vmlal_lane_s16(sum, s7, filter_hi, 3);

  return vshrn_n_s32(sum, ROUND0_BITS);
}

static inline int16x8_t convolve8_8_h(const int16x8_t s0, const int16x8_t s1,
                                      const int16x8_t s2, const int16x8_t s3,
                                      const int16x8_t s4, const int16x8_t s5,
                                      const int16x8_t s6, const int16x8_t s7,
                                      const int16x8_t filter,
                                      const int16x8_t horiz_const) {
  int16x4_t filter_lo = vget_low_s16(filter);
  int16x4_t filter_hi = vget_high_s16(filter);

  int16x8_t sum = horiz_const;
  sum = vmlaq_lane_s16(sum, s0, filter_lo, 0);
  sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
  sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
  sum = vmlaq_lane_s16(sum, s3, filter_lo, 3);
  sum = vmlaq_lane_s16(sum, s4, filter_hi, 0);
  sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
  sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
  sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);

  return vshrq_n_s16(sum, ROUND0_BITS - 1);
}

static inline void convolve_horiz_scale_8tap_neon(const uint8_t *src,
                                                  int src_stride, int16_t *dst,
                                                  int dst_stride, int w, int h,
                                                  const int16_t *x_filter,
                                                  const int subpel_x_qn,
                                                  const int x_step_qn) {
  DECLARE_ALIGNED(16, int16_t, temp[8 * 8]);
  const int bd = 8;

  if (w == 4) {
    // The shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
    // shifts.
    const int32x4_t horiz_offset =
        vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));

    do {
      int x_qn = subpel_x_qn;

      // Process a 4x4 tile.
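      // x_qn is a Q(SCALE_SUBPEL_BITS) fixed-point source position: the
      // integer part selects the starting source pixel for each output
      // column and the fractional part selects the sub-pixel filter phase.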
      for (int r = 0; r < 4; ++r) {
        const uint8_t *const s = &src[x_qn >> SCALE_SUBPEL_BITS];

        const ptrdiff_t filter_offset =
            SUBPEL_TAPS * ((x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
        const int16x8_t filter = vld1q_s16(x_filter + filter_offset);

        uint8x8_t t0, t1, t2, t3;
        load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);

        transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);

        int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
        int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
        int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
        int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
        int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
        int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
        int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
        int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));

        int16x4_t d0 =
            convolve8_4_h(s0, s1, s2, s3, s4, s5, s6, s7, filter, horiz_offset);

        vst1_s16(&temp[r * 4], d0);
        x_qn += x_step_qn;
      }

      // Transpose the 4x4 result tile and store.
      int16x4_t d0, d1, d2, d3;
      load_s16_4x4(temp, 4, &d0, &d1, &d2, &d3);

      transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3);

      store_s16_4x4(dst, dst_stride, d0, d1, d2, d3);

      dst += 4 * dst_stride;
      src += 4 * src_stride;
      h -= 4;
    } while (h > 0);
  } else {
    // The shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
    // shifts. The additional -1 is needed because we are halving the filter
    // values.
    const int16x8_t horiz_offset =
        vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) + (1 << (ROUND0_BITS - 2)));

    do {
      int x_qn = subpel_x_qn;
      int16_t *d = dst;
      int width = w;

      do {
        // Process an 8x8 tile.
        for (int r = 0; r < 8; ++r) {
          const uint8_t *const s = &src[(x_qn >> SCALE_SUBPEL_BITS)];

          const ptrdiff_t filter_offset =
              SUBPEL_TAPS * ((x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
          int16x8_t filter = vld1q_s16(x_filter + filter_offset);
          // Filter values are all even so halve them to allow convolution
          // kernel computations to stay in 16-bit element types.
          filter = vshrq_n_s16(filter, 1);

          uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
          load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

          transpose_elems_u8_8x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2,
                                 &t3, &t4, &t5, &t6, &t7);

          int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
          int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
          int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
          int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
          int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
          int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
          int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
          int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));

          int16x8_t d0 = convolve8_8_h(s0, s1, s2, s3, s4, s5, s6, s7, filter,
                                       horiz_offset);

          vst1q_s16(&temp[r * 8], d0);

          x_qn += x_step_qn;
        }

        // Transpose the 8x8 result tile and store.
        int16x8_t d0, d1, d2, d3, d4, d5, d6, d7;
        load_s16_8x8(temp, 8, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);

        transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6,
                                        &d7);

        store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);

        d += 8;
        width -= 8;
      } while (width != 0);

      dst += 8 * dst_stride;
      src += 8 * src_stride;
      h -= 8;
    } while (h > 0);
  }
}

static inline int16x4_t convolve6_4_h(const int16x4_t s0, const int16x4_t s1,
                                      const int16x4_t s2, const int16x4_t s3,
                                      const int16x4_t s4, const int16x4_t s5,
                                      const int16x8_t filter,
                                      const int32x4_t horiz_const) {
  int16x4_t filter_lo = vget_low_s16(filter);
  int16x4_t filter_hi = vget_high_s16(filter);

  int32x4_t sum = horiz_const;
  // Filter values at indices 0 and 7 are 0.
  sum = vmlal_lane_s16(sum, s0, filter_lo, 1);
  sum = vmlal_lane_s16(sum, s1, filter_lo, 2);
  sum = vmlal_lane_s16(sum, s2, filter_lo, 3);
  sum = vmlal_lane_s16(sum, s3, filter_hi, 0);
  sum = vmlal_lane_s16(sum, s4, filter_hi, 1);
  sum = vmlal_lane_s16(sum, s5, filter_hi, 2);

  return vshrn_n_s32(sum, ROUND0_BITS);
}

static inline int16x8_t convolve6_8_h(const int16x8_t s0, const int16x8_t s1,
                                      const int16x8_t s2, const int16x8_t s3,
                                      const int16x8_t s4, const int16x8_t s5,
                                      const int16x8_t filter,
                                      const int16x8_t horiz_const) {
  int16x4_t filter_lo = vget_low_s16(filter);
  int16x4_t filter_hi = vget_high_s16(filter);

  int16x8_t sum = horiz_const;
  // Filter values at indices 0 and 7 are 0.
  sum = vmlaq_lane_s16(sum, s0, filter_lo, 1);
  sum = vmlaq_lane_s16(sum, s1, filter_lo, 2);
  sum = vmlaq_lane_s16(sum, s2, filter_lo, 3);
  sum = vmlaq_lane_s16(sum, s3, filter_hi, 0);
  sum = vmlaq_lane_s16(sum, s4, filter_hi, 1);
  sum = vmlaq_lane_s16(sum, s5, filter_hi, 2);

  // We halved the filter values so subtract 1 from the right shift.
  return vshrq_n_s16(sum, ROUND0_BITS - 1);
}

static inline void convolve_horiz_scale_6tap_neon(const uint8_t *src,
                                                  int src_stride, int16_t *dst,
                                                  int dst_stride, int w, int h,
                                                  const int16_t *x_filter,
                                                  const int subpel_x_qn,
                                                  const int x_step_qn) {
  DECLARE_ALIGNED(16, int16_t, temp[8 * 8]);
  const int bd = 8;

  if (w == 4) {
    // The shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
    // shifts.
    const int32x4_t horiz_offset =
        vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));

    do {
      int x_qn = subpel_x_qn;

      // Process a 4x4 tile.
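      // Filter taps 0 and 7 are zero for 6-tap filters, so the convolution
      // inputs below start one element later than in the 8-tap path.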
      for (int r = 0; r < 4; ++r) {
        const uint8_t *const s = &src[x_qn >> SCALE_SUBPEL_BITS];

        const ptrdiff_t filter_offset =
            SUBPEL_TAPS * ((x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
        const int16x8_t filter = vld1q_s16(x_filter + filter_offset);

        uint8x8_t t0, t1, t2, t3;
        load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);

        transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);

        int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
        int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
        int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
        int16x4_t s3 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
        int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
        int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));

        int16x4_t d0 =
            convolve6_4_h(s0, s1, s2, s3, s4, s5, filter, horiz_offset);

        vst1_s16(&temp[r * 4], d0);
        x_qn += x_step_qn;
      }

      // Transpose the 4x4 result tile and store.
      int16x4_t d0, d1, d2, d3;
      load_s16_4x4(temp, 4, &d0, &d1, &d2, &d3);

      transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3);

      store_s16_4x4(dst, dst_stride, d0, d1, d2, d3);

      dst += 4 * dst_stride;
      src += 4 * src_stride;
      h -= 4;
    } while (h > 0);
  } else {
    // The shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
    // shifts. The additional -1 is needed because we are halving the filter
    // values.
    const int16x8_t horiz_offset =
        vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) + (1 << (ROUND0_BITS - 2)));

    do {
      int x_qn = subpel_x_qn;
      int16_t *d = dst;
      int width = w;

      do {
        // Process an 8x8 tile.
        for (int r = 0; r < 8; ++r) {
          const uint8_t *const s = &src[(x_qn >> SCALE_SUBPEL_BITS)];

          const ptrdiff_t filter_offset =
              SUBPEL_TAPS * ((x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
          int16x8_t filter = vld1q_s16(x_filter + filter_offset);
          // Filter values are all even so halve them to allow convolution
          // kernel computations to stay in 16-bit element types.
          filter = vshrq_n_s16(filter, 1);

          uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
          load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

          transpose_elems_u8_8x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2,
                                 &t3, &t4, &t5, &t6, &t7);

          int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t1));
          int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t2));
          int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t3));
          int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t4));
          int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t5));
          int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t6));

          int16x8_t d0 =
              convolve6_8_h(s0, s1, s2, s3, s4, s5, filter, horiz_offset);

          vst1q_s16(&temp[r * 8], d0);

          x_qn += x_step_qn;
        }

        // Transpose the 8x8 result tile and store.
        int16x8_t d0, d1, d2, d3, d4, d5, d6, d7;
        load_s16_8x8(temp, 8, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);

        transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6,
                                        &d7);

        store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);

        d += 8;
        width -= 8;
      } while (width != 0);

      dst += 8 * dst_stride;
      src += 8 * src_stride;
      h -= 8;
    } while (h > 0);
  }
}

static inline void convolve_horiz_scale_2_8tap_neon(
    const uint8_t *src, int src_stride, int16_t *dst, int dst_stride, int w,
    int h, const int16_t *x_filter) {
  const int bd = 8;

  if (w == 4) {
    // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
    // shifts - which are generally faster than rounding shifts on modern CPUs.
    const int32x4_t horiz_offset =
        vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
    const int16x8_t filter = vld1q_s16(x_filter);

    do {
      uint8x16_t t0, t1, t2, t3;
      load_u8_16x4(src, src_stride, &t0, &t1, &t2, &t3);
      transpose_elems_inplace_u8_16x4(&t0, &t1, &t2, &t3);

      int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0)));
      int16x8_t tt1 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t1)));
      int16x8_t tt2 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t2)));
      int16x8_t tt3 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t3)));
      int16x8_t tt4 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0)));
      int16x8_t tt5 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t1)));
      int16x8_t tt6 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t2)));
      int16x8_t tt7 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t3)));

      int16x4_t s0 = vget_low_s16(tt0);
      int16x4_t s1 = vget_low_s16(tt1);
      int16x4_t s2 = vget_low_s16(tt2);
      int16x4_t s3 = vget_low_s16(tt3);
      int16x4_t s4 = vget_high_s16(tt0);
      int16x4_t s5 = vget_high_s16(tt1);
      int16x4_t s6 = vget_high_s16(tt2);
      int16x4_t s7 = vget_high_s16(tt3);
      int16x4_t s8 = vget_low_s16(tt4);
      int16x4_t s9 = vget_low_s16(tt5);
      int16x4_t s10 = vget_low_s16(tt6);
      int16x4_t s11 = vget_low_s16(tt7);
      int16x4_t s12 = vget_high_s16(tt4);
      int16x4_t s13 = vget_high_s16(tt5);

      int16x4_t d0 =
          convolve8_4_h(s0, s1, s2, s3, s4, s5, s6, s7, filter, horiz_offset);
      int16x4_t d1 =
          convolve8_4_h(s2, s3, s4, s5, s6, s7, s8, s9, filter, horiz_offset);
      int16x4_t d2 = convolve8_4_h(s4, s5, s6, s7, s8, s9, s10, s11, filter,
                                   horiz_offset);
      int16x4_t d3 = convolve8_4_h(s6, s7, s8, s9, s10, s11, s12, s13, filter,
                                   horiz_offset);

      transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3);

      store_s16_4x4(dst, dst_stride, d0, d1, d2, d3);

      dst += 4 * dst_stride;
      src += 4 * src_stride;
      h -= 4;
    } while (h > 0);
  } else {
    // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
    // shifts - which are generally faster than rounding shifts on modern CPUs.
    // The additional -1 is needed because we are halving the filter values.
    const int16x8_t horiz_offset =
        vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) + (1 << (ROUND0_BITS - 2)));
    // Filter values are all even so halve them to allow convolution
    // kernel computations to stay in 16-bit element types.
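    // Halving the (even) taps together with the offset keeps the worst-case
    // accumulation for 8-bit inputs within the int16_t range.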
    const int16x8_t filter = vshrq_n_s16(vld1q_s16(x_filter), 1);

    do {
      const uint8_t *s = src;
      int16_t *d = dst;
      int width = w;

      uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
      load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
      transpose_elems_u8_8x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2, &t3,
                             &t4, &t5, &t6, &t7);

      s += 8;

      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
      int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));

      do {
        uint8x8_t t8, t9, t10, t11, t12, t13, t14, t15;
        load_u8_8x8(s, src_stride, &t8, &t9, &t10, &t11, &t12, &t13, &t14,
                    &t15);
        transpose_elems_u8_8x8(t8, t9, t10, t11, t12, t13, t14, t15, &t8, &t9,
                               &t10, &t11, &t12, &t13, &t14, &t15);

        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11));
        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12));
        int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t13));
        int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t14));
        int16x8_t s15 = vreinterpretq_s16_u16(vmovl_u8(t15));

        int16x8_t d0 =
            convolve8_8_h(s0, s1, s2, s3, s4, s5, s6, s7, filter, horiz_offset);
        int16x8_t d1 =
            convolve8_8_h(s2, s3, s4, s5, s6, s7, s8, s9, filter, horiz_offset);
        int16x8_t d2 = convolve8_8_h(s4, s5, s6, s7, s8, s9, s10, s11, filter,
                                     horiz_offset);
        int16x8_t d3 = convolve8_8_h(s6, s7, s8, s9, s10, s11, s12, s13, filter,
                                     horiz_offset);

        transpose_elems_inplace_s16_8x4(&d0, &d1, &d2, &d3);

        store_s16_4x8(d, dst_stride, vget_low_s16(d0), vget_low_s16(d1),
                      vget_low_s16(d2), vget_low_s16(d3), vget_high_s16(d0),
                      vget_high_s16(d1), vget_high_s16(d2), vget_high_s16(d3));

        s0 = s8;
        s1 = s9;
        s2 = s10;
        s3 = s11;
        s4 = s12;
        s5 = s13;
        s6 = s14;
        s7 = s15;

        s += 8;
        d += 4;
        width -= 4;
      } while (width != 0);

      dst += 8 * dst_stride;
      src += 8 * src_stride;
      h -= 8;
    } while (h > 0);
  }
}

static inline void convolve_horiz_scale_2_6tap_neon(
    const uint8_t *src, int src_stride, int16_t *dst, int dst_stride, int w,
    int h, const int16_t *x_filter) {
  const int bd = 8;

  if (w == 4) {
    // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
    // shifts - which are generally faster than rounding shifts on modern CPUs.
    const int32x4_t horiz_offset =
        vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
    const int16x8_t filter = vld1q_s16(x_filter);

    do {
      uint8x16_t t0, t1, t2, t3;
      load_u8_16x4(src, src_stride, &t0, &t1, &t2, &t3);
      transpose_elems_inplace_u8_16x4(&t0, &t1, &t2, &t3);

      int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t1)));
      int16x8_t tt1 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t2)));
      int16x8_t tt2 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t3)));
      int16x8_t tt3 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0)));
      int16x8_t tt4 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0)));
      int16x8_t tt5 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t1)));
      int16x8_t tt6 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t2)));
      int16x8_t tt7 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t3)));

      int16x4_t s0 = vget_low_s16(tt0);
      int16x4_t s1 = vget_low_s16(tt1);
      int16x4_t s2 = vget_low_s16(tt2);
      int16x4_t s3 = vget_high_s16(tt3);
      int16x4_t s4 = vget_high_s16(tt0);
      int16x4_t s5 = vget_high_s16(tt1);
      int16x4_t s6 = vget_high_s16(tt2);
      int16x4_t s7 = vget_low_s16(tt4);
      int16x4_t s8 = vget_low_s16(tt5);
      int16x4_t s9 = vget_low_s16(tt6);
      int16x4_t s10 = vget_low_s16(tt7);
      int16x4_t s11 = vget_high_s16(tt4);

      int16x4_t d0 =
          convolve6_4_h(s0, s1, s2, s3, s4, s5, filter, horiz_offset);
      int16x4_t d1 =
          convolve6_4_h(s2, s3, s4, s5, s6, s7, filter, horiz_offset);
      int16x4_t d2 =
          convolve6_4_h(s4, s5, s6, s7, s8, s9, filter, horiz_offset);
      int16x4_t d3 =
          convolve6_4_h(s6, s7, s8, s9, s10, s11, filter, horiz_offset);

      transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3);

      store_s16_4x4(dst, dst_stride, d0, d1, d2, d3);

      dst += 4 * dst_stride;
      src += 4 * src_stride;
      h -= 4;
    } while (h > 0);
  } else {
    // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
    // shifts - which are generally faster than rounding shifts on modern CPUs.
    // The additional -1 is needed because we are halving the filter values.
    const int16x8_t horiz_offset =
        vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) + (1 << (ROUND0_BITS - 2)));
    // Filter values are all even so halve them to allow convolution
    // kernel computations to stay in 16-bit element types.
    const int16x8_t filter = vshrq_n_s16(vld1q_s16(x_filter), 1);

    do {
      const uint8_t *s = src;
      int16_t *d = dst;
      int width = w;

      uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
      load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
      transpose_elems_u8_8x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2, &t3,
                             &t4, &t5, &t6, &t7);

      s += 8;

      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t1));
      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t2));
      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t3));
      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t4));
      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t5));
      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t6));
      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t7));

      do {
        uint8x8_t t8, t9, t10, t11, t12, t13, t14, t15;
        load_u8_8x8(s, src_stride, &t8, &t9, &t10, &t11, &t12, &t13, &t14,
                    &t15);
        transpose_elems_u8_8x8(t8, t9, t10, t11, t12, t13, t14, t15, &t8, &t9,
                               &t10, &t11, &t12, &t13, &t14, &t15);

        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t8));
        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t9));
        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t10));
        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t11));
        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t12));
        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t13));
        int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t14));
        int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t15));

        int16x8_t d0 =
            convolve6_8_h(s0, s1, s2, s3, s4, s5, filter, horiz_offset);
        int16x8_t d1 =
            convolve6_8_h(s2, s3, s4, s5, s6, s7, filter, horiz_offset);
        int16x8_t d2 =
            convolve6_8_h(s4, s5, s6, s7, s8, s9, filter, horiz_offset);
        int16x8_t d3 =
            convolve6_8_h(s6, s7, s8, s9, s10, s11, filter, horiz_offset);

        transpose_elems_inplace_s16_8x4(&d0, &d1, &d2, &d3);

        store_s16_4x8(d, dst_stride, vget_low_s16(d0), vget_low_s16(d1),
                      vget_low_s16(d2), vget_low_s16(d3), vget_high_s16(d0),
                      vget_high_s16(d1), vget_high_s16(d2), vget_high_s16(d3));

        s0 = s8;
        s1 = s9;
        s2 = s10;
        s3 = s11;
        s4 = s12;
        s5 = s13;
        s6 = s14;

        s += 8;
        d += 4;
        width -= 4;
      } while (width != 0);

      dst += 8 * dst_stride;
      src += 8 * src_stride;
      h -= 8;
    } while (h > 0);
  }
}

void av1_convolve_2d_scale_neon(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride, int w, int h,
                                const InterpFilterParams *filter_params_x,
                                const InterpFilterParams *filter_params_y,
                                const int subpel_x_qn, const int x_step_qn,
                                const int subpel_y_qn, const int y_step_qn,
                                ConvolveParams *conv_params) {
  if (w < 4 || h < 4) {
    av1_convolve_2d_scale_c(src, src_stride, dst, dst_stride, w, h,
                            filter_params_x, filter_params_y, subpel_x_qn,
                            x_step_qn, subpel_y_qn, y_step_qn, conv_params);
    return;
  }

  // 8-tap filters are used for the interpolation.
  assert(filter_params_y->taps <= 8 && filter_params_x->taps <= 8);

  DECLARE_ALIGNED(32, int16_t,
                  im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
  int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
             filter_params_y->taps;
  int im_stride = MAX_SB_SIZE;
  CONV_BUF_TYPE *dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;

  // Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2
  // lines post both horizontally and vertically.
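  // im_h, computed above, is the number of intermediate rows the vertical
  // pass will read: the span of source rows covered by the scaled row
  // positions plus the vertical filter taps.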
  const ptrdiff_t horiz_offset = filter_params_x->taps / 2 - 1;
  const ptrdiff_t vert_offset = (filter_params_y->taps / 2 - 1) * src_stride;

  // Horizontal filter

  if (x_step_qn != 2 * (1 << SCALE_SUBPEL_BITS)) {
    if (filter_params_x->interp_filter == MULTITAP_SHARP) {
      convolve_horiz_scale_8tap_neon(
          src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
          im_h, filter_params_x->filter_ptr, subpel_x_qn, x_step_qn);
    } else {
      convolve_horiz_scale_6tap_neon(
          src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
          im_h, filter_params_x->filter_ptr, subpel_x_qn, x_step_qn);
    }
  } else {
    assert(subpel_x_qn < (1 << SCALE_SUBPEL_BITS));
    // The filter index is calculated using the
    // ((subpel_x_qn + x * x_step_qn) & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS
    // equation, where the values of x are from 0 to w - 1. If x_step_qn is a
    // multiple of (1 << SCALE_SUBPEL_BITS) we can leave it out of the
    // equation.
    const ptrdiff_t filter_offset =
        SUBPEL_TAPS * ((subpel_x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
    const int16_t *x_filter = filter_params_x->filter_ptr + filter_offset;

    // The source index is calculated using the equation
    // (subpel_x_qn + x * x_step_qn) >> SCALE_SUBPEL_BITS, where the values of
    // x are from 0 to w - 1. If subpel_x_qn < (1 << SCALE_SUBPEL_BITS) and
    // x_step_qn % (1 << SCALE_SUBPEL_BITS) == 0, the source index can be
    // determined using the value x * (x_step_qn / (1 << SCALE_SUBPEL_BITS)).
    if (filter_params_x->interp_filter == MULTITAP_SHARP) {
      convolve_horiz_scale_2_8tap_neon(src - horiz_offset - vert_offset,
                                       src_stride, im_block, im_stride, w, im_h,
                                       x_filter);
    } else {
      convolve_horiz_scale_2_6tap_neon(src - horiz_offset - vert_offset,
                                       src_stride, im_block, im_stride, w, im_h,
                                       x_filter);
    }
  }

  // Vertical filter
  if (filter_params_y->interp_filter == MULTITAP_SHARP) {
    if (UNLIKELY(conv_params->is_compound)) {
      if (conv_params->do_average) {
        if (conv_params->use_dist_wtd_comp_avg) {
          compound_dist_wtd_convolve_vert_scale_8tap_neon(
              im_block, im_stride, dst, dst_stride, dst16, dst16_stride, w, h,
              filter_params_y->filter_ptr, conv_params, subpel_y_qn, y_step_qn);
        } else {
          compound_avg_convolve_vert_scale_8tap_neon(
              im_block, im_stride, dst, dst_stride, dst16, dst16_stride, w, h,
              filter_params_y->filter_ptr, subpel_y_qn, y_step_qn);
        }
      } else {
        compound_convolve_vert_scale_8tap_neon(
            im_block, im_stride, dst16, dst16_stride, w, h,
            filter_params_y->filter_ptr, subpel_y_qn, y_step_qn);
      }
    } else {
      convolve_vert_scale_8tap_neon(im_block, im_stride, dst, dst_stride, w, h,
                                    filter_params_y->filter_ptr, subpel_y_qn,
                                    y_step_qn);
    }
  } else {
    if (UNLIKELY(conv_params->is_compound)) {
      if (conv_params->do_average) {
        if (conv_params->use_dist_wtd_comp_avg) {
          compound_dist_wtd_convolve_vert_scale_6tap_neon(
              im_block + im_stride, im_stride, dst, dst_stride, dst16,
              dst16_stride, w, h, filter_params_y->filter_ptr, conv_params,
              subpel_y_qn, y_step_qn);
        } else {
          compound_avg_convolve_vert_scale_6tap_neon(
              im_block + im_stride, im_stride, dst, dst_stride, dst16,
              dst16_stride, w, h, filter_params_y->filter_ptr, subpel_y_qn,
              y_step_qn);
        }
      } else {
        compound_convolve_vert_scale_6tap_neon(
            im_block + im_stride, im_stride, dst16, dst16_stride, w, h,
            filter_params_y->filter_ptr, subpel_y_qn,
            y_step_qn);
      }
    } else {
      convolve_vert_scale_6tap_neon(
          im_block + im_stride, im_stride, dst, dst_stride, w, h,
          filter_params_y->filter_ptr, subpel_y_qn, y_step_qn);
    }
  }
}