convolve_neon.c
/*
 *
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <arm_neon.h>

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_ports/mem.h"
#include "av1/common/convolve.h"
#include "av1/common/filter.h"
#include "av1/common/arm/convolve_neon.h"

static inline int16x4_t convolve12_4_x(const int16x4_t s0, const int16x4_t s1,
                                       const int16x4_t s2, const int16x4_t s3,
                                       const int16x4_t s4, const int16x4_t s5,
                                       const int16x4_t s6, const int16x4_t s7,
                                       const int16x4_t s8, const int16x4_t s9,
                                       const int16x4_t s10, const int16x4_t s11,
                                       const int16x8_t x_filter_0_7,
                                       const int16x4_t x_filter_8_11,
                                       const int32x4_t horiz_const) {
  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7);
  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7);

  int32x4_t sum = horiz_const;
  sum = vmlal_lane_s16(sum, s0, x_filter_0_3, 0);
  sum = vmlal_lane_s16(sum, s1, x_filter_0_3, 1);
  sum = vmlal_lane_s16(sum, s2, x_filter_0_3, 2);
  sum = vmlal_lane_s16(sum, s3, x_filter_0_3, 3);
  sum = vmlal_lane_s16(sum, s4, x_filter_4_7, 0);
  sum = vmlal_lane_s16(sum, s5, x_filter_4_7, 1);
  sum = vmlal_lane_s16(sum, s6, x_filter_4_7, 2);
  sum = vmlal_lane_s16(sum, s7, x_filter_4_7, 3);
  sum = vmlal_lane_s16(sum, s8, x_filter_8_11, 0);
  sum = vmlal_lane_s16(sum, s9, x_filter_8_11, 1);
  sum = vmlal_lane_s16(sum, s10, x_filter_8_11, 2);
  sum = vmlal_lane_s16(sum, s11, x_filter_8_11, 3);

  return vqrshrn_n_s32(sum, FILTER_BITS);
}

static inline void convolve_x_sr_12tap_neon(const uint8_t *src_ptr,
                                            int src_stride, uint8_t *dst_ptr,
                                            const int dst_stride, int w, int h,
                                            const int16_t *x_filter_ptr) {
  const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
  const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);

  // A shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding right
  // shift by FILTER_BITS - instead of a first rounding right shift by
  // ROUND0_BITS, followed by a second rounding right shift by FILTER_BITS -
  // ROUND0_BITS.
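  // Concretely (a sketch; in libaom's 8-bit path FILTER_BITS == 7 and
  // ROUND0_BITS == 3), the two-stage rounding
  //   t = (sum + (1 << (ROUND0_BITS - 1))) >> ROUND0_BITS
  //   r = (t + (1 << (FILTER_BITS - ROUND0_BITS - 1)))
  //           >> (FILTER_BITS - ROUND0_BITS)
  // telescopes to
  //   r = (sum + (1 << (ROUND0_BITS - 1)) + (1 << (FILTER_BITS - 1)))
  //           >> FILTER_BITS
  // which is exactly what vqrshrn_n_s32(sum + horiz_const, FILTER_BITS)
  // computes, since the rounding shift itself supplies the
  // 1 << (FILTER_BITS - 1) term.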
  const int32x4_t horiz_const = vdupq_n_s32(1 << (ROUND0_BITS - 1));

#if AOM_ARCH_AARCH64
  do {
    const uint8_t *s = src_ptr;
    uint8_t *d = dst_ptr;
    int width = w;

    uint8x8_t t0, t1, t2, t3;
    load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
    transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);

    int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
    int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
    int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
    int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));

    load_u8_8x4(s + 8, src_stride, &t0, &t1, &t2, &t3);
    transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);

    int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));

    s += 11;

    do {
      load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
      transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);

      int16x4_t s11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
      int16x4_t s12 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
      int16x4_t s13 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
      int16x4_t s14 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));

      int16x4_t d0 =
          convolve12_4_x(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
                         x_filter_0_7, x_filter_8_11, horiz_const);
      int16x4_t d1 =
          convolve12_4_x(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
                         x_filter_0_7, x_filter_8_11, horiz_const);
      int16x4_t d2 =
          convolve12_4_x(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
                         x_filter_0_7, x_filter_8_11, horiz_const);
      int16x4_t d3 =
          convolve12_4_x(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
                         x_filter_0_7, x_filter_8_11, horiz_const);

      transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3);

      uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1));
      uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3));

      store_u8x4_strided_x2(d, dst_stride, d01);
      store_u8x4_strided_x2(d + 2 * dst_stride, dst_stride, d23);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      s7 = s11;
      s8 = s12;
      s9 = s13;
      s10 = s14;
      s += 4;
      d += 4;
      width -= 4;
    } while (width != 0);
    src_ptr += 4 * src_stride;
    dst_ptr += 4 * dst_stride;
    h -= 4;
  } while (h != 0);

#else  // !AOM_ARCH_AARCH64
  do {
    const uint8_t *s = src_ptr;
    uint8_t *d = dst_ptr;
    int width = w;

    do {
      uint8x16_t t0 = vld1q_u8(s);
      int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0)));
      int16x8_t tt8 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0)));

      int16x4_t s0 = vget_low_s16(tt0);
      int16x4_t s4 = vget_high_s16(tt0);
      int16x4_t s8 = vget_low_s16(tt8);
      int16x4_t s12 = vget_high_s16(tt8);

      int16x4_t s1 = vext_s16(s0, s4, 1);  // a1 a2 a3 a4
      int16x4_t s2 = vext_s16(s0, s4, 2);  // a2 a3 a4 a5
      int16x4_t s3 = vext_s16(s0, s4, 3);  // a3 a4 a5 a6
      int16x4_t s5 = vext_s16(s4, s8, 1);  // a5 a6 a7 a8
      int16x4_t s6 = vext_s16(s4, s8, 2);  // a6 a7 a8 a9
      int16x4_t s7 = vext_s16(s4, s8, 3);  // a7 a8 a9 a10
      int16x4_t s9 = vext_s16(s8, s12, 1);   // a9 a10 a11 a12
      int16x4_t s10 = vext_s16(s8, s12, 2);  // a10 a11 a12 a13
      int16x4_t s11 = vext_s16(s8, s12, 3);  // a11 a12 a13 a14

      int16x4_t d0 =
          convolve12_4_x(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
                         x_filter_0_7, x_filter_8_11, horiz_const);

      uint8x8_t dd0 = vqmovun_s16(vcombine_s16(d0, vdup_n_s16(0)));

      store_u8_4x1(d, dd0);

      s += 4;
      d += 4;
      width -= 4;
    } while (width != 0);
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  } while (--h != 0);
#endif  // AOM_ARCH_AARCH64
}

static inline uint8x8_t convolve4_8_x(const int16x8_t s0, const int16x8_t s1,
                                      const int16x8_t s2, const int16x8_t s3,
                                      const int16x4_t filter,
                                      int16x8_t horiz_const) {
  int16x8_t sum = horiz_const;
  sum = vmlaq_lane_s16(sum, s0, filter, 0);
  sum = vmlaq_lane_s16(sum, s1, filter, 1);
  sum = vmlaq_lane_s16(sum, s2, filter, 2);
  sum = vmlaq_lane_s16(sum, s3, filter, 3);
  // We halved the filter values so -1 from the right shift.
  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}

static inline void convolve_x_sr_4tap_neon(const uint8_t *src_ptr,
                                           int src_stride, uint8_t *dst_ptr,
                                           const int dst_stride, int w, int h,
                                           const int16_t *x_filter_ptr) {
  // All filter values are even, halve to reduce intermediate precision
  // requirements.
  const int16x4_t filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);

  // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
  // rounding right shift by FILTER_BITS - instead of a first rounding right
  // shift by ROUND0_BITS, followed by a second rounding right shift by
  // FILTER_BITS - ROUND0_BITS.
  // The outermost -1 is needed because we will halve the filter values.
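  // A quick sanity check (a sketch, assuming 8-bit input and FILTER_BITS ==
  // 7): the taps of each kernel sum to 1 << FILTER_BITS == 128 and are all
  // even, so halving them keeps every tap an exact integer, halves every
  // partial sum (keeping the accumulation comfortably inside int16_t), and
  // simply drops the final shift from FILTER_BITS to FILTER_BITS - 1.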
  const int16x8_t horiz_const = vdupq_n_s16(1 << ((ROUND0_BITS - 1) - 1));

  if (w == 4) {
    do {
      uint8x8_t t01[4];
      t01[0] = load_unaligned_u8(src_ptr + 0, src_stride);
      t01[1] = load_unaligned_u8(src_ptr + 1, src_stride);
      t01[2] = load_unaligned_u8(src_ptr + 2, src_stride);
      t01[3] = load_unaligned_u8(src_ptr + 3, src_stride);

      int16x8_t s01[4];
      s01[0] = vreinterpretq_s16_u16(vmovl_u8(t01[0]));
      s01[1] = vreinterpretq_s16_u16(vmovl_u8(t01[1]));
      s01[2] = vreinterpretq_s16_u16(vmovl_u8(t01[2]));
      s01[3] = vreinterpretq_s16_u16(vmovl_u8(t01[3]));

      uint8x8_t d01 =
          convolve4_8_x(s01[0], s01[1], s01[2], s01[3], filter, horiz_const);

      store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);

      src_ptr += 2 * src_stride;
      dst_ptr += 2 * dst_stride;
      h -= 2;
    } while (h != 0);
  } else {
    do {
      int width = w;
      const uint8_t *s = src_ptr;
      uint8_t *d = dst_ptr;

      do {
        uint8x8_t t0[4], t1[4];
        load_u8_8x4(s + 0 * src_stride, 1, &t0[0], &t0[1], &t0[2], &t0[3]);
        load_u8_8x4(s + 1 * src_stride, 1, &t1[0], &t1[1], &t1[2], &t1[3]);

        int16x8_t s0[4], s1[4];
        s0[0] = vreinterpretq_s16_u16(vmovl_u8(t0[0]));
        s0[1] = vreinterpretq_s16_u16(vmovl_u8(t0[1]));
        s0[2] = vreinterpretq_s16_u16(vmovl_u8(t0[2]));
        s0[3] = vreinterpretq_s16_u16(vmovl_u8(t0[3]));

        s1[0] = vreinterpretq_s16_u16(vmovl_u8(t1[0]));
        s1[1] = vreinterpretq_s16_u16(vmovl_u8(t1[1]));
        s1[2] = vreinterpretq_s16_u16(vmovl_u8(t1[2]));
        s1[3] = vreinterpretq_s16_u16(vmovl_u8(t1[3]));

        uint8x8_t d0 =
            convolve4_8_x(s0[0], s0[1], s0[2], s0[3], filter, horiz_const);
        uint8x8_t d1 =
            convolve4_8_x(s1[0], s1[1], s1[2], s1[3], filter, horiz_const);

        store_u8_8x2(d, dst_stride, d0, d1);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 2 * src_stride;
      dst_ptr += 2 * dst_stride;
      h -= 2;
    } while (h != 0);
  }
}

static inline uint8x8_t convolve8_8_x(const int16x8_t s0, const int16x8_t s1,
                                      const int16x8_t s2, const int16x8_t s3,
                                      const int16x8_t s4, const int16x8_t s5,
                                      const int16x8_t s6, const int16x8_t s7,
                                      const int16x8_t filter,
                                      const int16x8_t horiz_const) {
  const int16x4_t filter_lo = vget_low_s16(filter);
  const int16x4_t filter_hi = vget_high_s16(filter);

  int16x8_t sum = horiz_const;
  sum = vmlaq_lane_s16(sum, s0, filter_lo, 0);
  sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
  sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
  sum = vmlaq_lane_s16(sum, s3, filter_lo, 3);
  sum = vmlaq_lane_s16(sum, s4, filter_hi, 0);
  sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
  sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
  sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);

  // We halved the convolution filter values so -1 from the right shift.
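  // vqrshrun_n_s16 rounds, shifts right and saturates to the unsigned 8-bit
  // range in a single instruction, so no separate clamp is needed.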
  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}

void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilterParams *filter_params_x,
                            const int subpel_x_qn,
                            ConvolveParams *conv_params) {
  if (w == 2 || h == 2) {
    av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
                        subpel_x_qn, conv_params);
    return;
  }

  const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
  src -= horiz_offset;

  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  int filter_taps = get_filter_tap(filter_params_x, subpel_x_qn & SUBPEL_MASK);

  if (filter_taps > 8) {
    convolve_x_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h,
                             x_filter_ptr);
    return;
  }

  if (filter_taps <= 4) {
    convolve_x_sr_4tap_neon(src + 2, src_stride, dst, dst_stride, w, h,
                            x_filter_ptr);
    return;
  }

  // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
  // rounding right shift by FILTER_BITS - instead of a first rounding right
  // shift by ROUND0_BITS, followed by a second rounding right shift by
  // FILTER_BITS - ROUND0_BITS.
  // The outermost -1 is needed because we will halve the filter values.
  const int16x8_t horiz_const = vdupq_n_s16(1 << ((ROUND0_BITS - 1) - 1));

  // Filter values are even so halve to reduce precision requirements.
  const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);

#if AOM_ARCH_AARCH64
  while (h >= 8) {
    uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
    load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

    transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
    int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
    int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
    int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
    int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
    int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
    int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
    int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

    int width = w;
    const uint8_t *s = src + 7;
    uint8_t *d = dst;

    __builtin_prefetch(d + 0 * dst_stride);
    __builtin_prefetch(d + 1 * dst_stride);
    __builtin_prefetch(d + 2 * dst_stride);
    __builtin_prefetch(d + 3 * dst_stride);
    __builtin_prefetch(d + 4 * dst_stride);
    __builtin_prefetch(d + 5 * dst_stride);
    __builtin_prefetch(d + 6 * dst_stride);
    __builtin_prefetch(d + 7 * dst_stride);

    do {
      uint8x8_t t8, t9, t10, t11, t12, t13, t14;
      load_u8_8x8(s, src_stride, &t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14);

      transpose_elems_inplace_u8_8x8(&t7, &t8, &t9, &t10, &t11, &t12, &t13,
                                     &t14);
      int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
      int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
      int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
      int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
      int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11));
      int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12));
      int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t13));
      int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t14));

      uint8x8_t d0 =
          convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, horiz_const);
      uint8x8_t d1 =
          convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter, horiz_const);
      uint8x8_t d2 =
          convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter, horiz_const);
      uint8x8_t d3 =
          convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter, horiz_const);
      uint8x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
                                   horiz_const);
      uint8x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
                                   horiz_const);
      uint8x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
                                   horiz_const);
      uint8x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14,
                                   x_filter, horiz_const);

      transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);

      store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);

      s0 = s8;
      s1 = s9;
      s2 = s10;
      s3 = s11;
      s4 = s12;
      s5 = s13;
      s6 = s14;
      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src += 8 * src_stride;
    dst += 8 * dst_stride;
    h -= 8;
  }
#endif  // AOM_ARCH_AARCH64

  while (h-- != 0) {
    uint8x8_t t0 = vld1_u8(src);  // a0 a1 a2 a3 a4 a5 a6 a7
    int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));

    int width = w;
    const uint8_t *s = src + 8;
    uint8_t *d = dst;

    __builtin_prefetch(d);

    do {
      uint8x8_t t8 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
      int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));

      int16x8_t s1 = vextq_s16(s0, s8, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
      int16x8_t s2 = vextq_s16(s0, s8, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
      int16x8_t s3 = vextq_s16(s0, s8, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
      int16x8_t s4 = vextq_s16(s0, s8, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
      int16x8_t s5 = vextq_s16(s0, s8, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
      int16x8_t s6 = vextq_s16(s0, s8, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
      int16x8_t s7 = vextq_s16(s0, s8, 7);  // a7 a8 a9 a10 a11 a12 a13 a14

      uint8x8_t d0 =
          convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, horiz_const);

      vst1_u8(d, d0);

      s0 = s8;
      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src += src_stride;
    dst += dst_stride;
  }
}

static inline uint8x8_t convolve4_8_y(const int16x8_t s0, const int16x8_t s1,
                                      const int16x8_t s2, const int16x8_t s3,
                                      const int16x4_t filter) {
  int16x8_t sum = vmulq_lane_s16(s0, filter, 0);
  sum = vmlaq_lane_s16(sum, s1, filter, 1);
  sum = vmlaq_lane_s16(sum, s2, filter, 2);
  sum = vmlaq_lane_s16(sum, s3, filter, 3);

  // We halved the filter values so -1 from the right shift.
  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}

static inline void convolve_y_sr_4tap_neon(const uint8_t *src,
                                           const int src_stride, uint8_t *dst,
                                           const int dst_stride, int w, int h,
                                           const int16_t *filter_y) {
  // All filter values are even, halve to reduce intermediate precision
  // requirements.
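  // Rough headroom check (a sketch, assuming 8-bit input): since the taps sum
  // to 128 and some are negative, the positive taps alone can sum to more
  // than 128, so a full-precision partial sum can exceed 255 * 128 = 32640
  // and overflow int16_t; halving every tap halves each partial sum and keeps
  // the whole accumulation in range.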
  const int16x4_t filter = vshr_n_s16(vld1_s16(filter_y + 2), 1);

  if (w == 4) {
    uint8x8_t t01 = load_unaligned_u8(src + 0 * src_stride, src_stride);
    uint8x8_t t12 = load_unaligned_u8(src + 1 * src_stride, src_stride);

    int16x8_t s01 = vreinterpretq_s16_u16(vmovl_u8(t01));
    int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12));

    src += 2 * src_stride;

    do {
      uint8x8_t t23 = load_unaligned_u8(src + 0 * src_stride, src_stride);
      uint8x8_t t34 = load_unaligned_u8(src + 1 * src_stride, src_stride);
      uint8x8_t t45 = load_unaligned_u8(src + 2 * src_stride, src_stride);
      uint8x8_t t56 = load_unaligned_u8(src + 3 * src_stride, src_stride);

      int16x8_t s23 = vreinterpretq_s16_u16(vmovl_u8(t23));
      int16x8_t s34 = vreinterpretq_s16_u16(vmovl_u8(t34));
      int16x8_t s45 = vreinterpretq_s16_u16(vmovl_u8(t45));
      int16x8_t s56 = vreinterpretq_s16_u16(vmovl_u8(t56));

      uint8x8_t d01 = convolve4_8_y(s01, s12, s23, s34, filter);
      uint8x8_t d23 = convolve4_8_y(s23, s34, s45, s56, filter);

      store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);

      s01 = s45;
      s12 = s56;

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    do {
      uint8x8_t t0, t1, t2;
      load_u8_8x3(src, src_stride, &t0, &t1, &t2);

      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));

      int height = h;
      const uint8_t *s = src + 3 * src_stride;
      uint8_t *d = dst;

      do {
        uint8x8_t t3;
        load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);

        int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t0));
        int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t1));
        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t2));
        int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t3));

        uint8x8_t d0 = convolve4_8_y(s0, s1, s2, s3, filter);
        uint8x8_t d1 = convolve4_8_y(s1, s2, s3, s4, filter);
        uint8x8_t d2 = convolve4_8_y(s2, s3, s4, s5, filter);
        uint8x8_t d3 = convolve4_8_y(s3, s4, s5, s6, filter);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;

        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}

static inline int16x4_t convolve6_4_y(const int16x4_t s0, const int16x4_t s1,
                                      const int16x4_t s2, const int16x4_t s3,
                                      const int16x4_t s4, const int16x4_t s5,
                                      const int16x8_t y_filter_0_7) {
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);

  // Filter values at indices 0 and 7 are 0.
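  // 6-tap kernels are stored as 8-tap kernels whose first and last taps are
  // zero, which is why the accumulation below starts at lane 1 and stops at
  // lane 6.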
  int16x4_t sum = vmul_lane_s16(s0, y_filter_0_3, 1);
  sum = vmla_lane_s16(sum, s1, y_filter_0_3, 2);
  sum = vmla_lane_s16(sum, s2, y_filter_0_3, 3);
  sum = vmla_lane_s16(sum, s3, y_filter_4_7, 0);
  sum = vmla_lane_s16(sum, s4, y_filter_4_7, 1);
  sum = vmla_lane_s16(sum, s5, y_filter_4_7, 2);

  return sum;
}

static inline uint8x8_t convolve6_8_y(const int16x8_t s0, const int16x8_t s1,
                                      const int16x8_t s2, const int16x8_t s3,
                                      const int16x8_t s4, const int16x8_t s5,
                                      const int16x8_t y_filters) {
  const int16x4_t y_filter_lo = vget_low_s16(y_filters);
  const int16x4_t y_filter_hi = vget_high_s16(y_filters);

  // Filter values at indices 0 and 7 are 0.
  int16x8_t sum = vmulq_lane_s16(s0, y_filter_lo, 1);
  sum = vmlaq_lane_s16(sum, s1, y_filter_lo, 2);
  sum = vmlaq_lane_s16(sum, s2, y_filter_lo, 3);
  sum = vmlaq_lane_s16(sum, s3, y_filter_hi, 0);
  sum = vmlaq_lane_s16(sum, s4, y_filter_hi, 1);
  sum = vmlaq_lane_s16(sum, s5, y_filter_hi, 2);
  // We halved the convolution filter values so -1 from the right shift.
  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}

static inline void convolve_y_sr_6tap_neon(const uint8_t *src_ptr,
                                           int src_stride, uint8_t *dst_ptr,
                                           const int dst_stride, int w, int h,
                                           const int16x8_t y_filter) {
  if (w <= 4) {
    uint8x8_t t0 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride);
    uint8x8_t t1 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride);
    uint8x8_t t2 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride);
    uint8x8_t t3 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride);
    uint8x8_t t4 = load_unaligned_u8_4x1(src_ptr + 4 * src_stride);

    int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
    int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
    int16x4_t s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4)));

    src_ptr += 5 * src_stride;

    do {
#if AOM_ARCH_AARCH64
      uint8x8_t t5 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride);
      uint8x8_t t6 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride);
      uint8x8_t t7 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride);
      uint8x8_t t8 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride);

      int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));
      int16x4_t s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6)));
      int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7)));
      int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8)));

      int16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter);
      int16x4_t d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter);
      int16x4_t d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter);
      int16x4_t d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter);

      // We halved the convolution filter values so -1 from the right shift.
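      // Two 4-wide rows are packed into each 8-lane vector so that a single
      // rounding-narrowing shift produces two rows of output at once.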
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);

      store_u8x4_strided_x2(dst_ptr, dst_stride, d01);
      store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      h -= 4;
#else  // !AOM_ARCH_AARCH64
      uint8x8_t t5 = load_unaligned_u8_4x1(src_ptr);
      int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));

      int16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter);
      // We halved the convolution filter values so -1 from the right shift.
      uint8x8_t d01 =
          vqrshrun_n_s16(vcombine_s16(d0, vdup_n_s16(0)), FILTER_BITS - 1);

      store_u8_4x1(dst_ptr, d01);

      s0 = s1;
      s1 = s2;
      s2 = s3;
      s3 = s4;
      s4 = s5;
      src_ptr += src_stride;
      dst_ptr += dst_stride;
      h--;
#endif  // AOM_ARCH_AARCH64
    } while (h != 0);

  } else {
    do {
      const uint8_t *s = src_ptr;
      uint8_t *d = dst_ptr;
      int height = h;

      uint8x8_t t0, t1, t2, t3, t4;
      load_u8_8x5(s, src_stride, &t0, &t1, &t2, &t3, &t4);

      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));

      s += 5 * src_stride;

      do {
#if AOM_ARCH_AARCH64
        uint8x8_t t5, t6, t7, t8;
        load_u8_8x4(s, src_stride, &t5, &t6, &t7, &t8);

        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));

        uint8x8_t d0 = convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter);
        uint8x8_t d1 = convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter);
        uint8x8_t d2 = convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter);
        uint8x8_t d3 = convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
#else  // !AOM_ARCH_AARCH64
        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));

        uint8x8_t d0 = convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter);

        vst1_u8(d, d0);

        s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        s4 = s5;
        s += src_stride;
        d += dst_stride;
        height--;
#endif  // AOM_ARCH_AARCH64
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}

static inline int16x4_t convolve8_4_y(const int16x4_t s0, const int16x4_t s1,
                                      const int16x4_t s2, const int16x4_t s3,
                                      const int16x4_t s4, const int16x4_t s5,
                                      const int16x4_t s6, const int16x4_t s7,
                                      const int16x8_t filter) {
  const int16x4_t filter_lo = vget_low_s16(filter);
  const int16x4_t filter_hi = vget_high_s16(filter);

  int16x4_t sum = vmul_lane_s16(s0, filter_lo, 0);
  sum = vmla_lane_s16(sum, s1, filter_lo, 1);
  sum = vmla_lane_s16(sum, s2, filter_lo, 2);
  sum = vmla_lane_s16(sum, s3, filter_lo, 3);
  sum = vmla_lane_s16(sum, s4, filter_hi, 0);
  sum = vmla_lane_s16(sum, s5, filter_hi, 1);
  sum = vmla_lane_s16(sum, s6, filter_hi, 2);
  sum = vmla_lane_s16(sum, s7, filter_hi, 3);
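  // No shift here: the caller packs two of these 4-wide sums into one vector
  // and applies the single rounding-narrowing shift to both at once.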
  return sum;
}

static inline uint8x8_t convolve8_8_y(const int16x8_t s0, const int16x8_t s1,
                                      const int16x8_t s2, const int16x8_t s3,
                                      const int16x8_t s4, const int16x8_t s5,
                                      const int16x8_t s6, const int16x8_t s7,
                                      const int16x8_t filter) {
  const int16x4_t filter_lo = vget_low_s16(filter);
  const int16x4_t filter_hi = vget_high_s16(filter);

  int16x8_t sum = vmulq_lane_s16(s0, filter_lo, 0);
  sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
  sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
  sum = vmlaq_lane_s16(sum, s3, filter_lo, 3);
  sum = vmlaq_lane_s16(sum, s4, filter_hi, 0);
  sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
  sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
  sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);

  // We halved the convolution filter values so -1 from the right shift.
  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}

static inline void convolve_y_sr_8tap_neon(const uint8_t *src_ptr,
                                           int src_stride, uint8_t *dst_ptr,
                                           const int dst_stride, int w, int h,
                                           const int16x8_t y_filter) {
  if (w <= 4) {
    uint8x8_t t0 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride);
    uint8x8_t t1 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride);
    uint8x8_t t2 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride);
    uint8x8_t t3 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride);
    uint8x8_t t4 = load_unaligned_u8_4x1(src_ptr + 4 * src_stride);
    uint8x8_t t5 = load_unaligned_u8_4x1(src_ptr + 5 * src_stride);
    uint8x8_t t6 = load_unaligned_u8_4x1(src_ptr + 6 * src_stride);

    int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
    int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
    int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
    int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
    int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
    int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
    int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));

    src_ptr += 7 * src_stride;

    do {
#if AOM_ARCH_AARCH64
      uint8x8_t t7 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride);
      uint8x8_t t8 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride);
      uint8x8_t t9 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride);
      uint8x8_t t10 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride);

      int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t7)));
      int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t8)));
      int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t9)));
      int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t10)));

      int16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
      int16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
      int16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
      int16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);

      // We halved the convolution filter values so -1 from the right shift.
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);

      store_u8x4_strided_x2(dst_ptr, dst_stride, d01);
      store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      h -= 4;
#else  // !AOM_ARCH_AARCH64
      uint8x8_t t7 = load_unaligned_u8_4x1(src_ptr);
      int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t7)));

      int16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
      // We halved the convolution filter values so -1 from the right shift.
      uint8x8_t d01 =
          vqrshrun_n_s16(vcombine_s16(d0, vdup_n_s16(0)), FILTER_BITS - 1);

      store_u8_4x1(dst_ptr, d01);

      s0 = s1;
      s1 = s2;
      s2 = s3;
      s3 = s4;
      s4 = s5;
      s5 = s6;
      s6 = s7;
      src_ptr += src_stride;
      dst_ptr += dst_stride;
      h--;
#endif  // AOM_ARCH_AARCH64
    } while (h != 0);
  } else {
    do {
      const uint8_t *s = src_ptr;
      uint8_t *d = dst_ptr;
      int height = h;

      uint8x8_t t0, t1, t2, t3, t4, t5, t6;
      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);

      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

      s += 7 * src_stride;

      do {
#if AOM_ARCH_AARCH64
        uint8x8_t t7, t8, t9, t10;
        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);

        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));

        uint8x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
        uint8x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
        uint8x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
        uint8x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
#else  // !AOM_ARCH_AARCH64
        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));

        uint8x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);

        vst1_u8(d, d0);

        s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        s4 = s5;
        s5 = s6;
        s6 = s7;
        s += src_stride;
        d += dst_stride;
        height--;
#endif  // AOM_ARCH_AARCH64
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}

static inline int16x4_t convolve12_4_y(const int16x4_t s0, const int16x4_t s1,
                                       const int16x4_t s2, const int16x4_t s3,
                                       const int16x4_t s4, const int16x4_t s5,
                                       const int16x4_t s6, const int16x4_t s7,
                                       const int16x4_t s8, const int16x4_t s9,
                                       const int16x4_t s10, const int16x4_t s11,
                                       const int16x8_t y_filter_0_7,
                                       const int16x4_t y_filter_8_11) {
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
  int16x4_t sum;

  sum = vmul_lane_s16(s0, y_filter_0_3, 0);
  sum = vmla_lane_s16(sum, s1, y_filter_0_3, 1);
  sum = vmla_lane_s16(sum, s2, y_filter_0_3, 2);
  sum = vmla_lane_s16(sum, s3, y_filter_0_3, 3);
  sum = vmla_lane_s16(sum, s4, y_filter_4_7, 0);

  sum = vmla_lane_s16(sum, s7, y_filter_4_7, 3);
  sum = vmla_lane_s16(sum, s8, y_filter_8_11, 0);
  sum = vmla_lane_s16(sum, s9, y_filter_8_11, 1);
  sum = vmla_lane_s16(sum, s10, y_filter_8_11, 2);
  sum = vmla_lane_s16(sum, s11, y_filter_8_11, 3);

  // Saturating addition is required for the largest filter taps to avoid
  // overflow (while staying in 16-bit elements).
  sum = vqadd_s16(sum, vmul_lane_s16(s5, y_filter_4_7, 1));
  sum = vqadd_s16(sum, vmul_lane_s16(s6, y_filter_4_7, 2));

  return sum;
}

static inline uint8x8_t convolve12_8_y(const int16x8_t s0, const int16x8_t s1,
                                       const int16x8_t s2, const int16x8_t s3,
                                       const int16x8_t s4, const int16x8_t s5,
                                       const int16x8_t s6, const int16x8_t s7,
                                       const int16x8_t s8, const int16x8_t s9,
                                       const int16x8_t s10, const int16x8_t s11,
                                       const int16x8_t y_filter_0_7,
                                       const int16x4_t y_filter_8_11) {
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
  int16x8_t sum;

  sum = vmulq_lane_s16(s0, y_filter_0_3, 0);
  sum = vmlaq_lane_s16(sum, s1, y_filter_0_3, 1);
  sum = vmlaq_lane_s16(sum, s2, y_filter_0_3, 2);
  sum = vmlaq_lane_s16(sum, s3, y_filter_0_3, 3);
  sum = vmlaq_lane_s16(sum, s4, y_filter_4_7, 0);

  sum = vmlaq_lane_s16(sum, s7, y_filter_4_7, 3);
  sum = vmlaq_lane_s16(sum, s8, y_filter_8_11, 0);
  sum = vmlaq_lane_s16(sum, s9, y_filter_8_11, 1);
  sum = vmlaq_lane_s16(sum, s10, y_filter_8_11, 2);
  sum = vmlaq_lane_s16(sum, s11, y_filter_8_11, 3);

  // Saturating addition is required for the largest filter taps to avoid
  // overflow (while staying in 16-bit elements).
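  // The centre taps (indices 5 and 6) are the largest, so their products are
  // folded in last with saturating adds; the smaller outer-tap products are
  // accumulated first with plain multiply-accumulates, where the partial sums
  // stay in range.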
  sum = vqaddq_s16(sum, vmulq_lane_s16(s5, y_filter_4_7, 1));
  sum = vqaddq_s16(sum, vmulq_lane_s16(s6, y_filter_4_7, 2));

  return vqrshrun_n_s16(sum, FILTER_BITS);
}

static inline void convolve_y_sr_12tap_neon(const uint8_t *src_ptr,
                                            int src_stride, uint8_t *dst_ptr,
                                            int dst_stride, int w, int h,
                                            const int16_t *y_filter_ptr) {
  const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
  const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8);

  if (w <= 4) {
    uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    load_u8_8x11(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7,
                 &t8, &t9, &t10);
    int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
    int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
    int16x4_t s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4)));
    int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));
    int16x4_t s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6)));
    int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7)));
    int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8)));
    int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t9)));
    int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t10)));

    src_ptr += 11 * src_stride;

    do {
      uint8x8_t t11, t12, t13, t14;
      load_u8_8x4(src_ptr, src_stride, &t11, &t12, &t13, &t14);

      int16x4_t s11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t11)));
      int16x4_t s12 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t12)));
      int16x4_t s13 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t13)));
      int16x4_t s14 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t14)));

      int16x4_t d0 = convolve12_4_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
                                    s11, y_filter_0_7, y_filter_8_11);
      int16x4_t d1 = convolve12_4_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
                                    s11, s12, y_filter_0_7, y_filter_8_11);
      int16x4_t d2 = convolve12_4_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
                                    s12, s13, y_filter_0_7, y_filter_8_11);
      int16x4_t d3 = convolve12_4_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
                                    s13, s14, y_filter_0_7, y_filter_8_11);

      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);

      store_u8x4_strided_x2(dst_ptr, dst_stride, d01);
      store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      s7 = s11;
      s8 = s12;
      s9 = s13;
      s10 = s14;
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      h -= 4;
    } while (h != 0);

  } else {
    do {
      const uint8_t *s = src_ptr;
      uint8_t *d = dst_ptr;
      int height = h;

      uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
      load_u8_8x11(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8,
                   &t9, &t10);
      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
      int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
      int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
      int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
      int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));

      s += 11 * src_stride;

      do {
        uint8x8_t t11, t12, t13, t14;
        load_u8_8x4(s, src_stride, &t11, &t12, &t13, &t14);

        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11));
        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12));
        int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t13));
        int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t14));

        uint8x8_t d0 = convolve12_8_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9,
                                      s10, s11, y_filter_0_7, y_filter_8_11);
        uint8x8_t d1 = convolve12_8_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
                                      s11, s12, y_filter_0_7, y_filter_8_11);
        uint8x8_t d2 = convolve12_8_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
                                      s12, s13, y_filter_0_7, y_filter_8_11);
        uint8x8_t d3 = convolve12_8_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
                                      s13, s14, y_filter_0_7, y_filter_8_11);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s7 = s11;
        s8 = s12;
        s9 = s13;
        s10 = s14;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}

void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilterParams *filter_params_y,
                            const int subpel_y_qn) {
  if (w == 2 || h == 2) {
    av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_y,
                        subpel_y_qn);
    return;
  }

  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
  const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps;
  const int vert_offset = clamped_y_taps / 2 - 1;

  src -= vert_offset * src_stride;

  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  if (y_filter_taps > 8) {
    convolve_y_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h,
                             y_filter_ptr);
    return;
  }

  // Filter values are even so halve to reduce precision requirements.
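  // (The 12-tap path above keeps full-precision taps - its kernels are not
  // guaranteed to be all-even, so it relies on saturating adds instead.)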
  const int16x8_t y_filter = vshrq_n_s16(vld1q_s16(y_filter_ptr), 1);

  if (y_filter_taps <= 4) {
    convolve_y_sr_4tap_neon(src, src_stride, dst, dst_stride, w, h,
                            y_filter_ptr);
  } else if (y_filter_taps == 6) {
    convolve_y_sr_6tap_neon(src, src_stride, dst, dst_stride, w, h, y_filter);
  } else {
    convolve_y_sr_8tap_neon(src, src_stride, dst, dst_stride, w, h, y_filter);
  }
}

static inline int16x4_t convolve12_4_2d_h(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
    const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
    const int16x8_t x_filter_0_7, const int16x4_t x_filter_8_11,
    const int32x4_t horiz_const) {
  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7);
  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7);

  int32x4_t sum = horiz_const;
  sum = vmlal_lane_s16(sum, s0, x_filter_0_3, 0);
  sum = vmlal_lane_s16(sum, s1, x_filter_0_3, 1);
  sum = vmlal_lane_s16(sum, s2, x_filter_0_3, 2);
  sum = vmlal_lane_s16(sum, s3, x_filter_0_3, 3);
  sum = vmlal_lane_s16(sum, s4, x_filter_4_7, 0);
  sum = vmlal_lane_s16(sum, s5, x_filter_4_7, 1);
  sum = vmlal_lane_s16(sum, s6, x_filter_4_7, 2);
  sum = vmlal_lane_s16(sum, s7, x_filter_4_7, 3);
  sum = vmlal_lane_s16(sum, s8, x_filter_8_11, 0);
  sum = vmlal_lane_s16(sum, s9, x_filter_8_11, 1);
  sum = vmlal_lane_s16(sum, s10, x_filter_8_11, 2);
  sum = vmlal_lane_s16(sum, s11, x_filter_8_11, 3);

  return vshrn_n_s32(sum, ROUND0_BITS);
}

static inline void convolve_2d_sr_horiz_12tap_neon(
    const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
    const int dst_stride, int w, int h, const int16x8_t x_filter_0_7,
    const int16x4_t x_filter_8_11) {
  const int bd = 8;
  // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts -
  // which are generally faster than rounding shifts on modern CPUs.
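  // The 1 << (bd + FILTER_BITS - 1) term is the usual intermediate offset that
  // keeps the stage-1 sums non-negative; the vertical pass accounts for it
  // when narrowing back to 8 bits. With bd == 8 that offset is 1 << 14, and
  // folding in 1 << (ROUND0_BITS - 1) lets the plain (non-rounding) shift
  // below behave like a rounding one.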
  const int32x4_t horiz_const =
      vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));

#if AOM_ARCH_AARCH64
  do {
    const uint8_t *s = src_ptr;
    int16_t *d = dst_ptr;
    int width = w;

    uint8x8_t t0, t1, t2, t3;
    load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
    transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);

    int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
    int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
    int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
    int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));

    load_u8_8x4(s + 8, src_stride, &t0, &t1, &t2, &t3);
    transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);

    int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));

    s += 11;

    do {
      load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
      transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);

      int16x4_t s11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
      int16x4_t s12 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
      int16x4_t s13 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
      int16x4_t s14 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));

      int16x4_t d0 =
          convolve12_4_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
                            x_filter_0_7, x_filter_8_11, horiz_const);
      int16x4_t d1 =
          convolve12_4_2d_h(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
                            x_filter_0_7, x_filter_8_11, horiz_const);
      int16x4_t d2 =
          convolve12_4_2d_h(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
                            x_filter_0_7, x_filter_8_11, horiz_const);
      int16x4_t d3 =
          convolve12_4_2d_h(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
                            x_filter_0_7, x_filter_8_11, horiz_const);

      transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3);
      store_s16_4x4(d, dst_stride, d0, d1, d2, d3);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      s7 = s11;
      s8 = s12;
      s9 = s13;
      s10 = s14;
      s += 4;
      d += 4;
      width -= 4;
    } while (width != 0);
    src_ptr += 4 * src_stride;
    dst_ptr += 4 * dst_stride;
    h -= 4;
  } while (h > 4);
#endif  // AOM_ARCH_AARCH64

  do {
    const uint8_t *s = src_ptr;
    int16_t *d = dst_ptr;
    int width = w;

    do {
      uint8x16_t t0 = vld1q_u8(s);
      int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0)));
      int16x8_t tt1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0)));

      int16x4_t s0 = vget_low_s16(tt0);
      int16x4_t s4 = vget_high_s16(tt0);
      int16x4_t s8 = vget_low_s16(tt1);
      int16x4_t s12 = vget_high_s16(tt1);

      int16x4_t s1 = vext_s16(s0, s4, 1);  // a1 a2 a3 a4
      int16x4_t s2 = vext_s16(s0, s4, 2);  // a2 a3 a4 a5
      int16x4_t s3 = vext_s16(s0, s4, 3);  // a3 a4 a5 a6
      int16x4_t s5 = vext_s16(s4, s8, 1);  // a5 a6 a7 a8
      int16x4_t s6 = vext_s16(s4, s8, 2);  // a6 a7 a8 a9
      int16x4_t s7 = vext_s16(s4, s8, 3);  // a7 a8 a9 a10
      int16x4_t s9 = vext_s16(s8, s12, 1);   // a9 a10 a11 a12
      int16x4_t s10 = vext_s16(s8, s12, 2);  // a10 a11 a12 a13
      int16x4_t s11 = vext_s16(s8, s12, 3);  // a11 a12 a13 a14

      int16x4_t d0 =
          convolve12_4_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
                            x_filter_0_7, x_filter_8_11, horiz_const);
      vst1_s16(d, d0);

      s += 4;
      d += 4;
      width -= 4;
    } while (width != 0);
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  } while (--h != 0);
}

static inline int16x8_t convolve4_8_2d_h(const int16x8_t s0, const int16x8_t s1,
                                         const int16x8_t s2, const int16x8_t s3,
                                         const int16x4_t filter,
                                         const int16x8_t horiz_const) {
  int16x8_t sum = vmlaq_lane_s16(horiz_const, s0, filter, 0);
  sum = vmlaq_lane_s16(sum, s1, filter, 1);
  sum = vmlaq_lane_s16(sum, s2, filter, 2);
  sum = vmlaq_lane_s16(sum, s3, filter, 3);
  // We halved the filter values so -1 from the right shift.
  return vshrq_n_s16(sum, ROUND0_BITS - 1);
}

static inline void convolve_2d_sr_horiz_4tap_neon(
    const uint8_t *src, ptrdiff_t src_stride, int16_t *dst,
    ptrdiff_t dst_stride, int w, int h, const int16_t *filter_x) {
  const int bd = 8;
  // All filter values are even, halve to reduce intermediate precision
  // requirements.
  const int16x4_t filter = vshr_n_s16(vld1_s16(filter_x + 2), 1);

  // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
  // shifts - which are generally faster than rounding shifts on modern CPUs.
  // (The extra -1 is needed because we halved the filter values.)
  const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) +
                                            (1 << ((ROUND0_BITS - 1) - 1)));

  if (w == 4) {
    do {
      uint8x8_t t01[4];
      t01[0] = load_unaligned_u8(src + 0, (int)src_stride);
      t01[1] = load_unaligned_u8(src + 1, (int)src_stride);
      t01[2] = load_unaligned_u8(src + 2, (int)src_stride);
      t01[3] = load_unaligned_u8(src + 3, (int)src_stride);

      int16x8_t s01[4];
      s01[0] = vreinterpretq_s16_u16(vmovl_u8(t01[0]));
      s01[1] = vreinterpretq_s16_u16(vmovl_u8(t01[1]));
      s01[2] = vreinterpretq_s16_u16(vmovl_u8(t01[2]));
      s01[3] = vreinterpretq_s16_u16(vmovl_u8(t01[3]));

      int16x8_t d01 =
          convolve4_8_2d_h(s01[0], s01[1], s01[2], s01[3], filter, horiz_const);

      store_s16x4_strided_x2(dst, (int)dst_stride, d01);

      src += 2 * src_stride;
      dst += 2 * dst_stride;
      h -= 2;
    } while (h > 0);
  } else {
    do {
      int width = w;
      const uint8_t *s = src;
      int16_t *d = dst;

      do {
        uint8x8_t t0[4], t1[4];
        load_u8_8x4(s + 0 * src_stride, 1, &t0[0], &t0[1], &t0[2], &t0[3]);
        load_u8_8x4(s + 1 * src_stride, 1, &t1[0], &t1[1], &t1[2], &t1[3]);

        int16x8_t s0[4];
        s0[0] = vreinterpretq_s16_u16(vmovl_u8(t0[0]));
        s0[1] = vreinterpretq_s16_u16(vmovl_u8(t0[1]));
        s0[2] = vreinterpretq_s16_u16(vmovl_u8(t0[2]));
        s0[3] = vreinterpretq_s16_u16(vmovl_u8(t0[3]));

        int16x8_t s1[4];
        s1[0] = vreinterpretq_s16_u16(vmovl_u8(t1[0]));
        s1[1] = vreinterpretq_s16_u16(vmovl_u8(t1[1]));
        s1[2] = vreinterpretq_s16_u16(vmovl_u8(t1[2]));
        s1[3] = vreinterpretq_s16_u16(vmovl_u8(t1[3]));

        int16x8_t d0 =
            convolve4_8_2d_h(s0[0], s0[1], s0[2], s0[3], filter, horiz_const);
        int16x8_t d1 =
            convolve4_8_2d_h(s1[0], s1[1], s1[2], s1[3], filter, horiz_const);

        store_s16_8x2(d, dst_stride, d0, d1);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src += 2 * src_stride;
      dst += 2 * dst_stride;
      h -= 2;
    } while (h > 2);

    do {
      const uint8_t *s = src;
      int16_t *d = dst;
      int width = w;

      do {
        uint8x8_t t0[4];
        load_u8_8x4(s, 1, &t0[0], &t0[1], &t0[2], &t0[3]);

        int16x8_t s0[4];
        s0[0] = vreinterpretq_s16_u16(vmovl_u8(t0[0]));
        s0[1] = vreinterpretq_s16_u16(vmovl_u8(t0[1]));
        s0[2] = vreinterpretq_s16_u16(vmovl_u8(t0[2]));
        s0[3] = vreinterpretq_s16_u16(vmovl_u8(t0[3]));

        int16x8_t d0 =
            convolve4_8_2d_h(s0[0], s0[1], s0[2], s0[3], filter, horiz_const);

        vst1q_s16(d, d0);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src += src_stride;
      dst += dst_stride;
    } while (--h != 0);
  }
}

static inline int16x8_t convolve8_8_2d_h(const int16x8_t s0, const int16x8_t s1,
                                         const int16x8_t s2, const int16x8_t s3,
                                         const int16x8_t s4, const int16x8_t s5,
                                         const int16x8_t s6, const int16x8_t s7,
                                         const int16x8_t filter,
                                         const int16x8_t horiz_const) {
  const int16x4_t filter_lo = vget_low_s16(filter);
  const int16x4_t filter_hi = vget_high_s16(filter);

  int16x8_t sum = horiz_const;
  sum = vmlaq_lane_s16(sum, s0, filter_lo, 0);
  sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
  sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
  sum = vmlaq_lane_s16(sum, s3, filter_lo, 3);
  sum = vmlaq_lane_s16(sum, s4, filter_hi, 0);
  sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
  sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
  sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);

  // We halved the convolution filter values so -1 from the right shift.
  return vshrq_n_s16(sum, ROUND0_BITS - 1);
}

static inline void convolve_2d_sr_horiz_8tap_neon(
    const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w,
    int im_h, const int16_t *x_filter_ptr) {
  const int bd = 8;

  const uint8_t *src_ptr = src;
  int16_t *dst_ptr = im_block;
  int dst_stride = im_stride;
  int height = im_h;

  // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
  // shifts - which are generally faster than rounding shifts on modern CPUs.
  // (The extra -1 is needed because we halved the filter values.)
  const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) +
                                            (1 << ((ROUND0_BITS - 1) - 1)));
  // Filter values are even, so halve to reduce intermediate precision reqs.
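  // Note both horiz_const terms are halved relative to the 12-tap path:
  // halving every filter tap halves the whole linear sum, so the intermediate
  // offset 1 << (bd + FILTER_BITS - 1) and the rounding shim must be halved to
  // match, and the stage-1 shift drops to ROUND0_BITS - 1.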
  const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);

#if AOM_ARCH_AARCH64
  while (height > 8) {
    const uint8_t *s = src_ptr;
    int16_t *d = dst_ptr;
    int width = w;

    uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
    load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
    transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

    int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
    int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
    int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
    int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
    int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
    int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
    int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

    s += 7;

    do {
      load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

      transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

      int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
      int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
      int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
      int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
      int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
      int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
      int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
      int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));

      int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
                                      horiz_const);
      int16x8_t d1 = convolve8_8_2d_h(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
                                      horiz_const);
      int16x8_t d2 = convolve8_8_2d_h(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
                                      horiz_const);
      int16x8_t d3 = convolve8_8_2d_h(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
                                      horiz_const);
      int16x8_t d4 = convolve8_8_2d_h(s4, s5, s6, s7, s8, s9, s10, s11,
                                      x_filter, horiz_const);
      int16x8_t d5 = convolve8_8_2d_h(s5, s6, s7, s8, s9, s10, s11, s12,
                                      x_filter, horiz_const);
      int16x8_t d6 = convolve8_8_2d_h(s6, s7, s8, s9, s10, s11, s12, s13,
                                      x_filter, horiz_const);
      int16x8_t d7 = convolve8_8_2d_h(s7, s8, s9, s10, s11, s12, s13, s14,
                                      x_filter, horiz_const);

      transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);

      store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);

      s0 = s8;
      s1 = s9;
      s2 = s10;
      s3 = s11;
      s4 = s12;
      s5 = s13;
      s6 = s14;
      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src_ptr += 8 * src_stride;
    dst_ptr += 8 * dst_stride;
    height -= 8;
  }
#endif  // AOM_ARCH_AARCH64

  do {
    const uint8_t *s = src_ptr;
    int16_t *d = dst_ptr;
    int width = w;

    uint8x8_t t0 = vld1_u8(s);  // a0 a1 a2 a3 a4 a5 a6 a7
    int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));

    do {
      uint8x8_t t1 = vld1_u8(s + 8);  // a8 a9 a10 a11 a12 a13 a14 a15
      int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));

      int16x8_t s1 = vextq_s16(s0, s8, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
      int16x8_t s2 = vextq_s16(s0, s8, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
      int16x8_t s3 = vextq_s16(s0, s8, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
      int16x8_t s4 = vextq_s16(s0, s8, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
      int16x8_t s5 = vextq_s16(s0, s8, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
      int16x8_t s6 = vextq_s16(s0, s8, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
      int16x8_t s7 = vextq_s16(s0, s8, 7);  // a7 a8 a9 a10 a11 a12 a13 a14

      int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
                                      horiz_const);

      vst1q_s16(d, d0);

      s0 = s8;
      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  } while (--height != 0);
}

void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
                             const InterpFilterParams *filter_params_x,
                             const InterpFilterParams *filter_params_y,
                             const int subpel_x_qn, const int subpel_y_qn,
                             ConvolveParams *conv_params) {
  // Fall back to the C implementation for the narrow 2xN and Nx2 block cases.
  if (w == 2 || h == 2) {
    av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
                         filter_params_x, filter_params_y, subpel_x_qn,
                         subpel_y_qn, conv_params);
    return;
  }

  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
  const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
  const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps;
  const int im_h = h + clamped_y_taps - 1;
  const int im_stride = MAX_SB_SIZE;
  const int vert_offset = clamped_y_taps / 2 - 1;
  const int horiz_offset = filter_params_x->taps / 2 - 1;
  const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;

  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  if (filter_params_x->taps > 8) {
    DECLARE_ALIGNED(16, int16_t,
                    im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);

    const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
    const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);
    const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
    const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8);

    convolve_2d_sr_horiz_12tap_neon(src_ptr, src_stride, im_block, im_stride, w,
                                    im_h, x_filter_0_7, x_filter_8_11);

    convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride, w, h,
                                   y_filter_0_7, y_filter_8_11);
  } else {
    DECLARE_ALIGNED(16, int16_t,
                    im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);

    if (x_filter_taps <= 4) {
      convolve_2d_sr_horiz_4tap_neon(src_ptr + 2, src_stride, im_block,
                                     im_stride, w, im_h, x_filter_ptr);
    } else {
      convolve_2d_sr_horiz_8tap_neon(src_ptr, src_stride, im_block, im_stride,
                                     w, im_h, x_filter_ptr);
    }

    const int16x8_t y_filter = vld1q_s16(y_filter_ptr);

    if (clamped_y_taps <= 4) {
      convolve_2d_sr_vert_4tap_neon(im_block, im_stride, dst, dst_stride, w, h,
                                    y_filter_ptr);
    } else if (clamped_y_taps == 6) {
      convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, w, h,
                                    y_filter);
    } else {
      convolve_2d_sr_vert_8tap_neon(im_block, im_stride, dst, dst_stride, w, h,
                                    y_filter);
    }
  }
}

void av1_convolve_x_sr_intrabc_neon(const uint8_t *src, int src_stride,
                                    uint8_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_x,
                                    const int subpel_x_qn,
                                    ConvolveParams *conv_params) {
  assert(subpel_x_qn == 8);
  assert(filter_params_x->taps == 2);
  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
  (void)filter_params_x;
  (void)subpel_x_qn;
  (void)conv_params;
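
  // Added note: subpel_x_qn == 8 selects the exact half-pel position, where
  // the 2-tap bilinear kernel is { 64, 64 }. The full-precision result
  // (64 * a + 64 * b + 64) >> FILTER_BITS therefore reduces to the unsigned
  // rounding halving add (a + b + 1) >> 1, i.e. vrhadd_u8/vrhaddq_u8, so no
  // widening arithmetic is needed anywhere in this function.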
  if (w <= 4) {
    do {
      uint8x8_t s0_0 = vld1_u8(src);
      uint8x8_t s0_1 = vld1_u8(src + 1);
      uint8x8_t s1_0 = vld1_u8(src + src_stride);
      uint8x8_t s1_1 = vld1_u8(src + src_stride + 1);

      uint8x8_t d0 = vrhadd_u8(s0_0, s0_1);
      uint8x8_t d1 = vrhadd_u8(s1_0, s1_1);

      if (w == 2) {
        store_u8_2x1(dst + 0 * dst_stride, d0);
        store_u8_2x1(dst + 1 * dst_stride, d1);
      } else {
        store_u8_4x1(dst + 0 * dst_stride, d0);
        store_u8_4x1(dst + 1 * dst_stride, d1);
      }

      src += 2 * src_stride;
      dst += 2 * dst_stride;
      h -= 2;
    } while (h != 0);
  } else if (w == 8) {
    do {
      uint8x8_t s0_0 = vld1_u8(src);
      uint8x8_t s0_1 = vld1_u8(src + 1);
      uint8x8_t s1_0 = vld1_u8(src + src_stride);
      uint8x8_t s1_1 = vld1_u8(src + src_stride + 1);

      uint8x8_t d0 = vrhadd_u8(s0_0, s0_1);
      uint8x8_t d1 = vrhadd_u8(s1_0, s1_1);

      vst1_u8(dst, d0);
      vst1_u8(dst + dst_stride, d1);

      src += 2 * src_stride;
      dst += 2 * dst_stride;
      h -= 2;
    } while (h != 0);
  } else {
    do {
      const uint8_t *src_ptr = src;
      uint8_t *dst_ptr = dst;
      int width = w;

      do {
        uint8x16_t s0 = vld1q_u8(src_ptr);
        uint8x16_t s1 = vld1q_u8(src_ptr + 1);

        uint8x16_t d0 = vrhaddq_u8(s0, s1);

        vst1q_u8(dst_ptr, d0);

        src_ptr += 16;
        dst_ptr += 16;
        width -= 16;
      } while (width != 0);
      src += src_stride;
      dst += dst_stride;
    } while (--h != 0);
  }
}

void av1_convolve_y_sr_intrabc_neon(const uint8_t *src, int src_stride,
                                    uint8_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_y,
                                    const int subpel_y_qn) {
  assert(subpel_y_qn == 8);
  assert(filter_params_y->taps == 2);
  (void)filter_params_y;
  (void)subpel_y_qn;

  if (w <= 4) {
    do {
      uint8x8_t s0 = load_unaligned_u8_4x1(src);
      uint8x8_t s1 = load_unaligned_u8_4x1(src + src_stride);
      uint8x8_t s2 = load_unaligned_u8_4x1(src + 2 * src_stride);

      uint8x8_t d0 = vrhadd_u8(s0, s1);
      uint8x8_t d1 = vrhadd_u8(s1, s2);

      if (w == 2) {
        store_u8_2x1(dst + 0 * dst_stride, d0);
        store_u8_2x1(dst + 1 * dst_stride, d1);
      } else {
        store_u8_4x1(dst + 0 * dst_stride, d0);
        store_u8_4x1(dst + 1 * dst_stride, d1);
      }

      src += 2 * src_stride;
      dst += 2 * dst_stride;
      h -= 2;
    } while (h != 0);
  } else if (w == 8) {
    do {
      uint8x8_t s0 = vld1_u8(src);
      uint8x8_t s1 = vld1_u8(src + src_stride);
      uint8x8_t s2 = vld1_u8(src + 2 * src_stride);

      uint8x8_t d0 = vrhadd_u8(s0, s1);
      uint8x8_t d1 = vrhadd_u8(s1, s2);

      vst1_u8(dst, d0);
      vst1_u8(dst + dst_stride, d1);

      src += 2 * src_stride;
      dst += 2 * dst_stride;
      h -= 2;
    } while (h != 0);
  } else {
    do {
      const uint8_t *src_ptr = src;
      uint8_t *dst_ptr = dst;
      int height = h;

      do {
        uint8x16_t s0 = vld1q_u8(src_ptr);
        uint8x16_t s1 = vld1q_u8(src_ptr + src_stride);

        uint8x16_t d0 = vrhaddq_u8(s0, s1);

        vst1q_u8(dst_ptr, d0);

        src_ptr += src_stride;
        dst_ptr += dst_stride;
      } while (--height != 0);
      src += 16;
      dst += 16;
      w -= 16;
    } while (w != 0);
  }
}
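
// A minimal scalar sketch (added for illustration; the helper name is
// hypothetical and not part of the original file) of what the half-pel
// intrabc 2D path below computes per output pixel: the horizontal pass adds
// each pixel to its right neighbour, the vertical pass adds each
// intermediate row to the next, and one rounding shift by 2 restores 8-bit
// precision.
static inline uint8_t intrabc_2d_halfpel_scalar_ref(const uint8_t *s,
                                                    int stride) {
  const int sum = s[0] + s[1] + s[stride] + s[stride + 1];
  return (uint8_t)((sum + 2) >> 2);
}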
void av1_convolve_2d_sr_intrabc_neon(const uint8_t *src, int src_stride,
                                     uint8_t *dst, int dst_stride, int w, int h,
                                     const InterpFilterParams *filter_params_x,
                                     const InterpFilterParams *filter_params_y,
                                     const int subpel_x_qn,
                                     const int subpel_y_qn,
                                     ConvolveParams *conv_params) {
  assert(subpel_x_qn == 8);
  assert(subpel_y_qn == 8);
  assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
  (void)filter_params_x;
  (void)subpel_x_qn;
  (void)filter_params_y;
  (void)subpel_y_qn;
  (void)conv_params;

  uint16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  int im_h = h + 1;
  int im_stride = w;
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);

  uint16_t *im = im_block;

  // Horizontal filter.
  if (w <= 4) {
    do {
      uint8x8_t s0 = vld1_u8(src);
      uint8x8_t s1 = vld1_u8(src + 1);

      uint16x4_t sum = vget_low_u16(vaddl_u8(s0, s1));

      // Safe to store the whole vector, the im buffer is big enough.
      vst1_u16(im, sum);

      src += src_stride;
      im += im_stride;
    } while (--im_h != 0);
  } else {
    do {
      const uint8_t *src_ptr = src;
      uint16_t *im_ptr = im;
      int width = w;

      do {
        uint8x8_t s0 = vld1_u8(src_ptr);
        uint8x8_t s1 = vld1_u8(src_ptr + 1);

        uint16x8_t sum = vaddl_u8(s0, s1);

        vst1q_u16(im_ptr, sum);

        src_ptr += 8;
        im_ptr += 8;
        width -= 8;
      } while (width != 0);
      src += src_stride;
      im += im_stride;
    } while (--im_h != 0);
  }

  im = im_block;

  // Vertical filter.
  if (w <= 4) {
    do {
      uint16x4_t s0 = vld1_u16(im);
      uint16x4_t s1 = vld1_u16(im + im_stride);
      uint16x4_t s2 = vld1_u16(im + 2 * im_stride);

      uint16x4_t sum0 = vadd_u16(s0, s1);
      uint16x4_t sum1 = vadd_u16(s1, s2);

      // Both passes are kept at full precision, so a single rounding shift
      // by 2 computes (a + b + c + d + 2) >> 2 for the four source pixels.
      uint8x8_t d0 = vqrshrn_n_u16(vcombine_u16(sum0, vdup_n_u16(0)), 2);
      uint8x8_t d1 = vqrshrn_n_u16(vcombine_u16(sum1, vdup_n_u16(0)), 2);

      if (w == 2) {
        store_u8_2x1(dst + 0 * dst_stride, d0);
        store_u8_2x1(dst + 1 * dst_stride, d1);
      } else {
        store_u8_4x1(dst + 0 * dst_stride, d0);
        store_u8_4x1(dst + 1 * dst_stride, d1);
      }

      im += 2 * im_stride;
      dst += 2 * dst_stride;
      h -= 2;
    } while (h != 0);
  } else {
    do {
      uint16_t *im_ptr = im;
      uint8_t *dst_ptr = dst;
      int height = h;

      do {
        uint16x8_t s0 = vld1q_u16(im_ptr);
        uint16x8_t s1 = vld1q_u16(im_ptr + im_stride);

        uint16x8_t sum = vaddq_u16(s0, s1);
        uint8x8_t d0 = vqrshrn_n_u16(sum, 2);

        vst1_u8(dst_ptr, d0);

        im_ptr += im_stride;
        dst_ptr += dst_stride;
      } while (--height != 0);
      im += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}