highbd_compound_convolve_neon.c (75719B)
1 /* 2 * Copyright (c) 2023, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <assert.h> 13 #include <arm_neon.h> 14 15 #include "config/aom_config.h" 16 #include "config/av1_rtcd.h" 17 18 #include "aom_dsp/aom_dsp_common.h" 19 #include "aom_dsp/arm/mem_neon.h" 20 #include "aom_ports/mem.h" 21 #include "av1/common/convolve.h" 22 #include "av1/common/filter.h" 23 #include "av1/common/arm/highbd_compound_convolve_neon.h" 24 #include "av1/common/arm/highbd_convolve_neon.h" 25 26 static inline uint16x4_t highbd_12_convolve6_4( 27 const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, 28 const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, 29 const int16x8_t filter, const int32x4_t offset) { 30 // Values at indices 0 and 7 of y_filter are zero. 31 const int16x4_t filter_0_3 = vget_low_s16(filter); 32 const int16x4_t filter_4_7 = vget_high_s16(filter); 33 34 int32x4_t sum = vmlal_lane_s16(offset, s0, filter_0_3, 1); 35 sum = vmlal_lane_s16(sum, s1, filter_0_3, 2); 36 sum = vmlal_lane_s16(sum, s2, filter_0_3, 3); 37 sum = vmlal_lane_s16(sum, s3, filter_4_7, 0); 38 sum = vmlal_lane_s16(sum, s4, filter_4_7, 1); 39 sum = vmlal_lane_s16(sum, s5, filter_4_7, 2); 40 41 return vqshrun_n_s32(sum, ROUND0_BITS + 2); 42 } 43 44 static inline uint16x4_t highbd_convolve6_4( 45 const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, 46 const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, 47 const int16x8_t filter, const int32x4_t offset) { 48 // Values at indices 0 and 7 of y_filter are zero. 
49 const int16x4_t filter_0_3 = vget_low_s16(filter); 50 const int16x4_t filter_4_7 = vget_high_s16(filter); 51 52 int32x4_t sum = vmlal_lane_s16(offset, s0, filter_0_3, 1); 53 sum = vmlal_lane_s16(sum, s1, filter_0_3, 2); 54 sum = vmlal_lane_s16(sum, s2, filter_0_3, 3); 55 sum = vmlal_lane_s16(sum, s3, filter_4_7, 0); 56 sum = vmlal_lane_s16(sum, s4, filter_4_7, 1); 57 sum = vmlal_lane_s16(sum, s5, filter_4_7, 2); 58 59 return vqshrun_n_s32(sum, ROUND0_BITS); 60 } 61 62 static inline uint16x8_t highbd_12_convolve6_8( 63 const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, 64 const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, 65 const int16x8_t filter, const int32x4_t offset) { 66 // Values at indices 0 and 7 of y_filter are zero. 67 const int16x4_t filter_0_3 = vget_low_s16(filter); 68 const int16x4_t filter_4_7 = vget_high_s16(filter); 69 70 int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter_0_3, 1); 71 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 2); 72 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 3); 73 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_4_7, 0); 74 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 1); 75 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 2); 76 77 int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter_0_3, 1); 78 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 2); 79 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 3); 80 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_4_7, 0); 81 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 1); 82 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 2); 83 84 return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS + 2), 85 vqshrun_n_s32(sum1, ROUND0_BITS + 2)); 86 } 87 88 static inline uint16x8_t highbd_convolve6_8( 89 const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, 90 const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, 91 const int16x8_t filter, 
const int32x4_t offset) { 92 // Values at indices 0 and 7 of y_filter are zero. 93 const int16x4_t filter_0_3 = vget_low_s16(filter); 94 const int16x4_t filter_4_7 = vget_high_s16(filter); 95 96 int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter_0_3, 1); 97 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 2); 98 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 3); 99 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_4_7, 0); 100 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 1); 101 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 2); 102 103 int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter_0_3, 1); 104 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 2); 105 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 3); 106 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_4_7, 0); 107 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 1); 108 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 2); 109 110 return vcombine_u16(vqshrun_n_s32(sum0, 3), vqshrun_n_s32(sum1, ROUND0_BITS)); 111 } 112 113 static inline void highbd_12_dist_wtd_convolve_x_6tap_neon( 114 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, 115 int w, int h, const int16_t *x_filter_ptr, const int offset) { 116 const int32x4_t offset_vec = vdupq_n_s32(offset); 117 118 const int16x8_t x_filter = vld1q_s16(x_filter_ptr); 119 120 int height = h; 121 122 do { 123 int width = w; 124 const int16_t *s = (const int16_t *)src_ptr; 125 uint16_t *d = dst_ptr; 126 127 do { 128 int16x8_t s0[6], s1[6], s2[6], s3[6]; 129 load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], 130 &s0[4], &s0[5]); 131 load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], 132 &s1[4], &s1[5]); 133 load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], 134 &s2[4], &s2[5]); 135 load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], 136 &s3[4], &s3[5]); 137 
138 uint16x8_t d0 = highbd_12_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4], 139 s0[5], x_filter, offset_vec); 140 uint16x8_t d1 = highbd_12_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4], 141 s1[5], x_filter, offset_vec); 142 uint16x8_t d2 = highbd_12_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4], 143 s2[5], x_filter, offset_vec); 144 uint16x8_t d3 = highbd_12_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4], 145 s3[5], x_filter, offset_vec); 146 147 store_u16_8x4(d, dst_stride, d0, d1, d2, d3); 148 149 s += 8; 150 d += 8; 151 width -= 8; 152 } while (width != 0); 153 src_ptr += 4 * src_stride; 154 dst_ptr += 4 * dst_stride; 155 height -= 4; 156 } while (height != 0); 157 } 158 159 static inline void highbd_dist_wtd_convolve_x_6tap_neon( 160 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, 161 int w, int h, const int16_t *x_filter_ptr, const int offset) { 162 const int32x4_t offset_vec = vdupq_n_s32(offset); 163 164 const int16x8_t x_filter = vld1q_s16(x_filter_ptr); 165 166 int height = h; 167 168 do { 169 int width = w; 170 const int16_t *s = (const int16_t *)src_ptr; 171 uint16_t *d = dst_ptr; 172 173 do { 174 int16x8_t s0[6], s1[6], s2[6], s3[6]; 175 load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], 176 &s0[4], &s0[5]); 177 load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], 178 &s1[4], &s1[5]); 179 load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], 180 &s2[4], &s2[5]); 181 load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], 182 &s3[4], &s3[5]); 183 184 uint16x8_t d0 = highbd_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4], 185 s0[5], x_filter, offset_vec); 186 uint16x8_t d1 = highbd_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4], 187 s1[5], x_filter, offset_vec); 188 uint16x8_t d2 = highbd_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4], 189 s2[5], x_filter, offset_vec); 190 uint16x8_t d3 = highbd_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4], 191 s3[5], x_filter, 
offset_vec); 192 193 store_u16_8x4(d, dst_stride, d0, d1, d2, d3); 194 195 s += 8; 196 d += 8; 197 width -= 8; 198 } while (width != 0); 199 src_ptr += 4 * src_stride; 200 dst_ptr += 4 * dst_stride; 201 height -= 4; 202 } while (height != 0); 203 } 204 205 static inline uint16x4_t highbd_12_convolve8_4( 206 const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, 207 const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, 208 const int16x4_t s6, const int16x4_t s7, const int16x8_t filter, 209 const int32x4_t offset) { 210 const int16x4_t filter_0_3 = vget_low_s16(filter); 211 const int16x4_t filter_4_7 = vget_high_s16(filter); 212 213 int32x4_t sum = vmlal_lane_s16(offset, s0, filter_0_3, 0); 214 sum = vmlal_lane_s16(sum, s1, filter_0_3, 1); 215 sum = vmlal_lane_s16(sum, s2, filter_0_3, 2); 216 sum = vmlal_lane_s16(sum, s3, filter_0_3, 3); 217 sum = vmlal_lane_s16(sum, s4, filter_4_7, 0); 218 sum = vmlal_lane_s16(sum, s5, filter_4_7, 1); 219 sum = vmlal_lane_s16(sum, s6, filter_4_7, 2); 220 sum = vmlal_lane_s16(sum, s7, filter_4_7, 3); 221 222 return vqshrun_n_s32(sum, ROUND0_BITS + 2); 223 } 224 225 static inline uint16x4_t highbd_convolve8_4( 226 const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, 227 const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, 228 const int16x4_t s6, const int16x4_t s7, const int16x8_t filter, 229 const int32x4_t offset) { 230 const int16x4_t filter_0_3 = vget_low_s16(filter); 231 const int16x4_t filter_4_7 = vget_high_s16(filter); 232 233 int32x4_t sum = vmlal_lane_s16(offset, s0, filter_0_3, 0); 234 sum = vmlal_lane_s16(sum, s1, filter_0_3, 1); 235 sum = vmlal_lane_s16(sum, s2, filter_0_3, 2); 236 sum = vmlal_lane_s16(sum, s3, filter_0_3, 3); 237 sum = vmlal_lane_s16(sum, s4, filter_4_7, 0); 238 sum = vmlal_lane_s16(sum, s5, filter_4_7, 1); 239 sum = vmlal_lane_s16(sum, s6, filter_4_7, 2); 240 sum = vmlal_lane_s16(sum, s7, filter_4_7, 3); 241 242 return vqshrun_n_s32(sum, ROUND0_BITS); 243 } 244 245 static inline 
uint16x8_t highbd_12_convolve8_8( 246 const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, 247 const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, 248 const int16x8_t s6, const int16x8_t s7, const int16x8_t filter, 249 const int32x4_t offset) { 250 const int16x4_t filter_0_3 = vget_low_s16(filter); 251 const int16x4_t filter_4_7 = vget_high_s16(filter); 252 253 int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter_0_3, 0); 254 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 1); 255 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 2); 256 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_0_3, 3); 257 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 0); 258 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 1); 259 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_4_7, 2); 260 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_4_7, 3); 261 262 int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter_0_3, 0); 263 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 1); 264 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 2); 265 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_0_3, 3); 266 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 0); 267 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 1); 268 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_4_7, 2); 269 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_4_7, 3); 270 271 return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS + 2), 272 vqshrun_n_s32(sum1, ROUND0_BITS + 2)); 273 } 274 275 static inline uint16x8_t highbd_convolve8_8( 276 const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, 277 const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, 278 const int16x8_t s6, const int16x8_t s7, const int16x8_t filter, 279 const int32x4_t offset) { 280 const int16x4_t filter_0_3 = vget_low_s16(filter); 281 const int16x4_t filter_4_7 = vget_high_s16(filter); 282 283 int32x4_t sum0 = 
vmlal_lane_s16(offset, vget_low_s16(s0), filter_0_3, 0); 284 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 1); 285 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 2); 286 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_0_3, 3); 287 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 0); 288 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 1); 289 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_4_7, 2); 290 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_4_7, 3); 291 292 int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter_0_3, 0); 293 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 1); 294 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 2); 295 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_0_3, 3); 296 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 0); 297 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 1); 298 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_4_7, 2); 299 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_4_7, 3); 300 301 return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS), 302 vqshrun_n_s32(sum1, ROUND0_BITS)); 303 } 304 305 static inline uint16x4_t highbd_12_convolve4_4_x(const int16x4_t s[4], 306 const int16x4_t x_filter, 307 const int32x4_t offset) { 308 int32x4_t sum = vmlal_lane_s16(offset, s[0], x_filter, 0); 309 sum = vmlal_lane_s16(sum, s[1], x_filter, 1); 310 sum = vmlal_lane_s16(sum, s[2], x_filter, 2); 311 sum = vmlal_lane_s16(sum, s[3], x_filter, 3); 312 313 return vqshrun_n_s32(sum, 5); 314 } 315 316 static inline uint16x4_t highbd_convolve4_4_x(const int16x4_t s[4], 317 const int16x4_t x_filter, 318 const int32x4_t offset) { 319 int32x4_t sum = vmlal_lane_s16(offset, s[0], x_filter, 0); 320 sum = vmlal_lane_s16(sum, s[1], x_filter, 1); 321 sum = vmlal_lane_s16(sum, s[2], x_filter, 2); 322 sum = vmlal_lane_s16(sum, s[3], x_filter, 3); 323 324 return vqshrun_n_s32(sum, ROUND0_BITS); 325 } 326 327 static inline 
void highbd_12_dist_wtd_convolve_x_neon( 328 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, 329 int w, int h, const int16_t *x_filter_ptr, const int offset) { 330 const int32x4_t offset_vec = vdupq_n_s32(offset); 331 332 if (w == 4) { 333 // 4-tap filters are used for blocks having width == 4. 334 const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); 335 const int16_t *s = (const int16_t *)(src_ptr + 2); 336 uint16_t *d = dst_ptr; 337 338 do { 339 int16x4_t s0[4], s1[4], s2[4], s3[4]; 340 load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); 341 load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); 342 load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); 343 load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); 344 345 uint16x4_t d0 = highbd_12_convolve4_4_x(s0, x_filter, offset_vec); 346 uint16x4_t d1 = highbd_12_convolve4_4_x(s1, x_filter, offset_vec); 347 uint16x4_t d2 = highbd_12_convolve4_4_x(s2, x_filter, offset_vec); 348 uint16x4_t d3 = highbd_12_convolve4_4_x(s3, x_filter, offset_vec); 349 350 store_u16_4x4(d, dst_stride, d0, d1, d2, d3); 351 352 s += 4 * src_stride; 353 d += 4 * dst_stride; 354 h -= 4; 355 } while (h != 0); 356 } else { 357 const int16x8_t x_filter = vld1q_s16(x_filter_ptr); 358 int height = h; 359 360 do { 361 int width = w; 362 const int16_t *s = (const int16_t *)src_ptr; 363 uint16_t *d = dst_ptr; 364 365 do { 366 int16x8_t s0[8], s1[8], s2[8], s3[8]; 367 load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], 368 &s0[4], &s0[5], &s0[6], &s0[7]); 369 load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], 370 &s1[4], &s1[5], &s1[6], &s1[7]); 371 load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], 372 &s2[4], &s2[5], &s2[6], &s2[7]); 373 load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], 374 &s3[4], &s3[5], &s3[6], &s3[7]); 375 376 uint16x8_t d0 = 377 highbd_12_convolve8_8(s0[0], s0[1], 
s0[2], s0[3], s0[4], s0[5], 378 s0[6], s0[7], x_filter, offset_vec); 379 uint16x8_t d1 = 380 highbd_12_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5], 381 s1[6], s1[7], x_filter, offset_vec); 382 uint16x8_t d2 = 383 highbd_12_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5], 384 s2[6], s2[7], x_filter, offset_vec); 385 uint16x8_t d3 = 386 highbd_12_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5], 387 s3[6], s3[7], x_filter, offset_vec); 388 389 store_u16_8x4(d, dst_stride, d0, d1, d2, d3); 390 391 s += 8; 392 d += 8; 393 width -= 8; 394 } while (width != 0); 395 src_ptr += 4 * src_stride; 396 dst_ptr += 4 * dst_stride; 397 height -= 4; 398 } while (height != 0); 399 } 400 } 401 402 static inline void highbd_dist_wtd_convolve_x_neon( 403 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, 404 int w, int h, const int16_t *x_filter_ptr, const int offset) { 405 const int32x4_t offset_vec = vdupq_n_s32(offset); 406 407 if (w == 4) { 408 // 4-tap filters are used for blocks having width == 4. 
409 const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); 410 const int16_t *s = (const int16_t *)(src_ptr + 2); 411 uint16_t *d = dst_ptr; 412 413 do { 414 int16x4_t s0[4], s1[4], s2[4], s3[4]; 415 load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); 416 load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); 417 load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); 418 load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); 419 420 uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset_vec); 421 uint16x4_t d1 = highbd_convolve4_4_x(s1, x_filter, offset_vec); 422 uint16x4_t d2 = highbd_convolve4_4_x(s2, x_filter, offset_vec); 423 uint16x4_t d3 = highbd_convolve4_4_x(s3, x_filter, offset_vec); 424 425 store_u16_4x4(d, dst_stride, d0, d1, d2, d3); 426 427 s += 4 * src_stride; 428 d += 4 * dst_stride; 429 h -= 4; 430 } while (h != 0); 431 } else { 432 const int16x8_t x_filter = vld1q_s16(x_filter_ptr); 433 int height = h; 434 435 do { 436 int width = w; 437 const int16_t *s = (const int16_t *)src_ptr; 438 uint16_t *d = dst_ptr; 439 440 do { 441 int16x8_t s0[8], s1[8], s2[8], s3[8]; 442 load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], 443 &s0[4], &s0[5], &s0[6], &s0[7]); 444 load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], 445 &s1[4], &s1[5], &s1[6], &s1[7]); 446 load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], 447 &s2[4], &s2[5], &s2[6], &s2[7]); 448 load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], 449 &s3[4], &s3[5], &s3[6], &s3[7]); 450 451 uint16x8_t d0 = 452 highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], s0[6], 453 s0[7], x_filter, offset_vec); 454 uint16x8_t d1 = 455 highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5], s1[6], 456 s1[7], x_filter, offset_vec); 457 uint16x8_t d2 = 458 highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5], s2[6], 459 s2[7], x_filter, offset_vec); 460 uint16x8_t d3 = 461 
highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5], s3[6], 462 s3[7], x_filter, offset_vec); 463 464 store_u16_8x4(d, dst_stride, d0, d1, d2, d3); 465 466 s += 8; 467 d += 8; 468 width -= 8; 469 } while (width != 0); 470 src_ptr += 4 * src_stride; 471 dst_ptr += 4 * dst_stride; 472 height -= 4; 473 } while (height != 0); 474 } 475 } 476 477 void av1_highbd_dist_wtd_convolve_x_neon( 478 const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, 479 int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, 480 ConvolveParams *conv_params, int bd) { 481 DECLARE_ALIGNED(16, uint16_t, 482 im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); 483 CONV_BUF_TYPE *dst16 = conv_params->dst; 484 const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); 485 int dst16_stride = conv_params->dst_stride; 486 const int im_stride = MAX_SB_SIZE; 487 const int horiz_offset = filter_params_x->taps / 2 - 1; 488 assert(FILTER_BITS == COMPOUND_ROUND1_BITS); 489 const int offset_convolve = (1 << (conv_params->round_0 - 1)) + 490 (1 << (bd + FILTER_BITS)) + 491 (1 << (bd + FILTER_BITS - 1)); 492 493 const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( 494 filter_params_x, subpel_x_qn & SUBPEL_MASK); 495 496 src -= horiz_offset; 497 498 // horizontal filter 499 if (bd == 12) { 500 if (conv_params->do_average) { 501 if (x_filter_taps <= 6 && w != 4) { 502 highbd_12_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, im_block, 503 im_stride, w, h, x_filter_ptr, 504 offset_convolve); 505 } else { 506 highbd_12_dist_wtd_convolve_x_neon(src, src_stride, im_block, im_stride, 507 w, h, x_filter_ptr, offset_convolve); 508 } 509 if (conv_params->use_dist_wtd_comp_avg) { 510 highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, 511 w, h, conv_params); 512 } else { 513 highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, 514 conv_params); 515 } 516 } else { 517 if (x_filter_taps <= 6 && w != 4) { 518 
highbd_12_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, dst16, 519 dst16_stride, w, h, 520 x_filter_ptr, offset_convolve); 521 } else { 522 highbd_12_dist_wtd_convolve_x_neon(src, src_stride, dst16, dst16_stride, 523 w, h, x_filter_ptr, offset_convolve); 524 } 525 } 526 } else { 527 if (conv_params->do_average) { 528 if (x_filter_taps <= 6 && w != 4) { 529 highbd_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, im_block, 530 im_stride, w, h, x_filter_ptr, 531 offset_convolve); 532 } else { 533 highbd_dist_wtd_convolve_x_neon(src, src_stride, im_block, im_stride, w, 534 h, x_filter_ptr, offset_convolve); 535 } 536 if (conv_params->use_dist_wtd_comp_avg) { 537 highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, 538 h, conv_params, bd); 539 } else { 540 highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, 541 conv_params, bd); 542 } 543 } else { 544 if (x_filter_taps <= 6 && w != 4) { 545 highbd_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, dst16, 546 dst16_stride, w, h, x_filter_ptr, 547 offset_convolve); 548 } else { 549 highbd_dist_wtd_convolve_x_neon(src, src_stride, dst16, dst16_stride, w, 550 h, x_filter_ptr, offset_convolve); 551 } 552 } 553 } 554 } 555 556 static inline void highbd_12_dist_wtd_convolve_y_6tap_neon( 557 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, 558 int w, int h, const int16_t *y_filter_ptr, const int offset) { 559 const int16x8_t y_filter = vld1q_s16(y_filter_ptr); 560 const int32x4_t offset_vec = vdupq_n_s32(offset); 561 562 if (w == 4) { 563 const int16_t *s = (const int16_t *)src_ptr; 564 uint16_t *d = dst_ptr; 565 566 int16x4_t s0, s1, s2, s3, s4; 567 load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); 568 s += 5 * src_stride; 569 570 do { 571 int16x4_t s5, s6, s7, s8; 572 load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8); 573 574 uint16x4_t d0 = 575 highbd_12_convolve6_4(s0, s1, s2, s3, s4, s5, y_filter, offset_vec); 576 uint16x4_t d1 = 577 
highbd_12_convolve6_4(s1, s2, s3, s4, s5, s6, y_filter, offset_vec); 578 uint16x4_t d2 = 579 highbd_12_convolve6_4(s2, s3, s4, s5, s6, s7, y_filter, offset_vec); 580 uint16x4_t d3 = 581 highbd_12_convolve6_4(s3, s4, s5, s6, s7, s8, y_filter, offset_vec); 582 583 store_u16_4x4(d, dst_stride, d0, d1, d2, d3); 584 585 s0 = s4; 586 s1 = s5; 587 s2 = s6; 588 s3 = s7; 589 s4 = s8; 590 s += 4 * src_stride; 591 d += 4 * dst_stride; 592 h -= 4; 593 } while (h != 0); 594 } else { 595 do { 596 int height = h; 597 const int16_t *s = (const int16_t *)src_ptr; 598 uint16_t *d = dst_ptr; 599 600 int16x8_t s0, s1, s2, s3, s4; 601 load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); 602 s += 5 * src_stride; 603 604 do { 605 int16x8_t s5, s6, s7, s8; 606 load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8); 607 608 uint16x8_t d0 = 609 highbd_12_convolve6_8(s0, s1, s2, s3, s4, s5, y_filter, offset_vec); 610 uint16x8_t d1 = 611 highbd_12_convolve6_8(s1, s2, s3, s4, s5, s6, y_filter, offset_vec); 612 uint16x8_t d2 = 613 highbd_12_convolve6_8(s2, s3, s4, s5, s6, s7, y_filter, offset_vec); 614 uint16x8_t d3 = 615 highbd_12_convolve6_8(s3, s4, s5, s6, s7, s8, y_filter, offset_vec); 616 617 store_u16_8x4(d, dst_stride, d0, d1, d2, d3); 618 619 s0 = s4; 620 s1 = s5; 621 s2 = s6; 622 s3 = s7; 623 s4 = s8; 624 s += 4 * src_stride; 625 d += 4 * dst_stride; 626 height -= 4; 627 } while (height != 0); 628 src_ptr += 8; 629 dst_ptr += 8; 630 w -= 8; 631 } while (w != 0); 632 } 633 } 634 635 static inline void highbd_dist_wtd_convolve_y_6tap_neon( 636 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, 637 int w, int h, const int16_t *y_filter_ptr, const int offset) { 638 const int16x8_t y_filter = vld1q_s16(y_filter_ptr); 639 const int32x4_t offset_vec = vdupq_n_s32(offset); 640 641 if (w == 4) { 642 const int16_t *s = (const int16_t *)src_ptr; 643 uint16_t *d = dst_ptr; 644 645 int16x4_t s0, s1, s2, s3, s4; 646 load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); 647 s += 5 
* src_stride; 648 649 do { 650 int16x4_t s5, s6, s7, s8; 651 load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8); 652 653 uint16x4_t d0 = 654 highbd_convolve6_4(s0, s1, s2, s3, s4, s5, y_filter, offset_vec); 655 uint16x4_t d1 = 656 highbd_convolve6_4(s1, s2, s3, s4, s5, s6, y_filter, offset_vec); 657 uint16x4_t d2 = 658 highbd_convolve6_4(s2, s3, s4, s5, s6, s7, y_filter, offset_vec); 659 uint16x4_t d3 = 660 highbd_convolve6_4(s3, s4, s5, s6, s7, s8, y_filter, offset_vec); 661 662 store_u16_4x4(d, dst_stride, d0, d1, d2, d3); 663 664 s0 = s4; 665 s1 = s5; 666 s2 = s6; 667 s3 = s7; 668 s4 = s8; 669 s += 4 * src_stride; 670 d += 4 * dst_stride; 671 h -= 4; 672 } while (h != 0); 673 } else { 674 do { 675 int height = h; 676 const int16_t *s = (const int16_t *)src_ptr; 677 uint16_t *d = dst_ptr; 678 679 int16x8_t s0, s1, s2, s3, s4; 680 load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); 681 s += 5 * src_stride; 682 683 do { 684 int16x8_t s5, s6, s7, s8; 685 load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8); 686 687 uint16x8_t d0 = 688 highbd_convolve6_8(s0, s1, s2, s3, s4, s5, y_filter, offset_vec); 689 uint16x8_t d1 = 690 highbd_convolve6_8(s1, s2, s3, s4, s5, s6, y_filter, offset_vec); 691 uint16x8_t d2 = 692 highbd_convolve6_8(s2, s3, s4, s5, s6, s7, y_filter, offset_vec); 693 uint16x8_t d3 = 694 highbd_convolve6_8(s3, s4, s5, s6, s7, s8, y_filter, offset_vec); 695 696 store_u16_8x4(d, dst_stride, d0, d1, d2, d3); 697 698 s0 = s4; 699 s1 = s5; 700 s2 = s6; 701 s3 = s7; 702 s4 = s8; 703 s += 4 * src_stride; 704 d += 4 * dst_stride; 705 height -= 4; 706 } while (height != 0); 707 src_ptr += 8; 708 dst_ptr += 8; 709 w -= 8; 710 } while (w != 0); 711 } 712 } 713 714 static inline uint16x4_t highbd_12_convolve4_4( 715 const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, 716 const int16x4_t s3, const int16x4_t filter, const int32x4_t offset) { 717 int32x4_t sum = vmlal_lane_s16(offset, s0, filter, 0); 718 sum = vmlal_lane_s16(sum, s1, filter, 1); 719 sum = 
vmlal_lane_s16(sum, s2, filter, 2); 720 sum = vmlal_lane_s16(sum, s3, filter, 3); 721 722 return vqshrun_n_s32(sum, ROUND0_BITS + 2); 723 } 724 725 static inline uint16x8_t highbd_12_convolve4_8( 726 const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, 727 const int16x8_t s3, const int16x4_t filter, const int32x4_t offset) { 728 int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter, 0); 729 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter, 1); 730 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter, 2); 731 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter, 3); 732 733 int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter, 0); 734 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter, 1); 735 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter, 2); 736 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter, 3); 737 738 return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS + 2), 739 vqshrun_n_s32(sum1, ROUND0_BITS + 2)); 740 } 741 742 static inline void highbd_12_dist_wtd_convolve_y_4tap_neon( 743 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, 744 int w, int h, const int16_t *y_filter_ptr, const int offset) { 745 const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2); 746 const int32x4_t offset_vec = vdupq_n_s32(offset); 747 748 if (w == 4) { 749 const int16_t *s = (const int16_t *)src_ptr; 750 uint16_t *d = dst_ptr; 751 752 int16x4_t s0, s1, s2; 753 load_s16_4x3(s, src_stride, &s0, &s1, &s2); 754 s += 3 * src_stride; 755 756 do { 757 int16x4_t s3, s4, s5, s6; 758 load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6); 759 760 uint16x4_t d0 = 761 highbd_12_convolve4_4(s0, s1, s2, s3, y_filter, offset_vec); 762 uint16x4_t d1 = 763 highbd_12_convolve4_4(s1, s2, s3, s4, y_filter, offset_vec); 764 uint16x4_t d2 = 765 highbd_12_convolve4_4(s2, s3, s4, s5, y_filter, offset_vec); 766 uint16x4_t d3 = 767 highbd_12_convolve4_4(s3, s4, s5, s6, y_filter, offset_vec); 768 769 store_u16_4x4(d, dst_stride, d0, d1, d2, d3); 
770 771 s0 = s4; 772 s1 = s5; 773 s2 = s6; 774 775 s += 4 * src_stride; 776 d += 4 * dst_stride; 777 h -= 4; 778 } while (h != 0); 779 } else { 780 do { 781 int height = h; 782 const int16_t *s = (const int16_t *)src_ptr; 783 uint16_t *d = dst_ptr; 784 785 int16x8_t s0, s1, s2; 786 load_s16_8x3(s, src_stride, &s0, &s1, &s2); 787 s += 3 * src_stride; 788 789 do { 790 int16x8_t s3, s4, s5, s6; 791 load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6); 792 793 uint16x8_t d0 = 794 highbd_12_convolve4_8(s0, s1, s2, s3, y_filter, offset_vec); 795 uint16x8_t d1 = 796 highbd_12_convolve4_8(s1, s2, s3, s4, y_filter, offset_vec); 797 uint16x8_t d2 = 798 highbd_12_convolve4_8(s2, s3, s4, s5, y_filter, offset_vec); 799 uint16x8_t d3 = 800 highbd_12_convolve4_8(s3, s4, s5, s6, y_filter, offset_vec); 801 802 store_u16_8x4(d, dst_stride, d0, d1, d2, d3); 803 804 s0 = s4; 805 s1 = s5; 806 s2 = s6; 807 808 s += 4 * src_stride; 809 d += 4 * dst_stride; 810 height -= 4; 811 } while (height != 0); 812 src_ptr += 8; 813 dst_ptr += 8; 814 w -= 8; 815 } while (w != 0); 816 } 817 } 818 819 static inline uint16x4_t highbd_convolve4_4( 820 const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, 821 const int16x4_t s3, const int16x4_t filter, const int32x4_t offset) { 822 int32x4_t sum = vmlal_lane_s16(offset, s0, filter, 0); 823 sum = vmlal_lane_s16(sum, s1, filter, 1); 824 sum = vmlal_lane_s16(sum, s2, filter, 2); 825 sum = vmlal_lane_s16(sum, s3, filter, 3); 826 827 return vqshrun_n_s32(sum, ROUND0_BITS); 828 } 829 830 static inline uint16x8_t highbd_convolve4_8( 831 const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, 832 const int16x8_t s3, const int16x4_t filter, const int32x4_t offset) { 833 int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter, 0); 834 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter, 1); 835 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter, 2); 836 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter, 3); 837 838 int32x4_t sum1 = 
vmlal_lane_s16(offset, vget_high_s16(s0), filter, 0); 839 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter, 1); 840 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter, 2); 841 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter, 3); 842 843 return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS), 844 vqshrun_n_s32(sum1, ROUND0_BITS)); 845 } 846 847 static inline void highbd_dist_wtd_convolve_y_4tap_neon( 848 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, 849 int w, int h, const int16_t *y_filter_ptr, const int offset) { 850 const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2); 851 const int32x4_t offset_vec = vdupq_n_s32(offset); 852 853 if (w == 4) { 854 const int16_t *s = (const int16_t *)src_ptr; 855 uint16_t *d = dst_ptr; 856 857 int16x4_t s0, s1, s2; 858 load_s16_4x3(s, src_stride, &s0, &s1, &s2); 859 s += 3 * src_stride; 860 861 do { 862 int16x4_t s3, s4, s5, s6; 863 load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6); 864 865 uint16x4_t d0 = highbd_convolve4_4(s0, s1, s2, s3, y_filter, offset_vec); 866 uint16x4_t d1 = highbd_convolve4_4(s1, s2, s3, s4, y_filter, offset_vec); 867 uint16x4_t d2 = highbd_convolve4_4(s2, s3, s4, s5, y_filter, offset_vec); 868 uint16x4_t d3 = highbd_convolve4_4(s3, s4, s5, s6, y_filter, offset_vec); 869 870 store_u16_4x4(d, dst_stride, d0, d1, d2, d3); 871 872 s0 = s4; 873 s1 = s5; 874 s2 = s6; 875 876 s += 4 * src_stride; 877 d += 4 * dst_stride; 878 h -= 4; 879 } while (h != 0); 880 } else { 881 do { 882 int height = h; 883 const int16_t *s = (const int16_t *)src_ptr; 884 uint16_t *d = dst_ptr; 885 886 int16x8_t s0, s1, s2; 887 load_s16_8x3(s, src_stride, &s0, &s1, &s2); 888 s += 3 * src_stride; 889 890 do { 891 int16x8_t s3, s4, s5, s6; 892 load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6); 893 894 uint16x8_t d0 = 895 highbd_convolve4_8(s0, s1, s2, s3, y_filter, offset_vec); 896 uint16x8_t d1 = 897 highbd_convolve4_8(s1, s2, s3, s4, y_filter, offset_vec); 898 uint16x8_t d2 = 899 
highbd_convolve4_8(s2, s3, s4, s5, y_filter, offset_vec); 900 uint16x8_t d3 = 901 highbd_convolve4_8(s3, s4, s5, s6, y_filter, offset_vec); 902 903 store_u16_8x4(d, dst_stride, d0, d1, d2, d3); 904 905 s0 = s4; 906 s1 = s5; 907 s2 = s6; 908 909 s += 4 * src_stride; 910 d += 4 * dst_stride; 911 height -= 4; 912 } while (height != 0); 913 src_ptr += 8; 914 dst_ptr += 8; 915 w -= 8; 916 } while (w != 0); 917 } 918 } 919 920 static inline void highbd_12_dist_wtd_convolve_y_8tap_neon( 921 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, 922 int w, int h, const int16_t *y_filter_ptr, const int offset) { 923 const int16x8_t y_filter = vld1q_s16(y_filter_ptr); 924 const int32x4_t offset_vec = vdupq_n_s32(offset); 925 926 if (w == 4) { 927 const int16_t *s = (const int16_t *)src_ptr; 928 uint16_t *d = dst_ptr; 929 930 int16x4_t s0, s1, s2, s3, s4, s5, s6; 931 load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); 932 s += 7 * src_stride; 933 934 do { 935 int16x4_t s7, s8, s9, s10; 936 load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); 937 938 uint16x4_t d0 = highbd_12_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, 939 y_filter, offset_vec); 940 uint16x4_t d1 = highbd_12_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, 941 y_filter, offset_vec); 942 uint16x4_t d2 = highbd_12_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, 943 y_filter, offset_vec); 944 uint16x4_t d3 = highbd_12_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, 945 y_filter, offset_vec); 946 947 store_u16_4x4(d, dst_stride, d0, d1, d2, d3); 948 949 s0 = s4; 950 s1 = s5; 951 s2 = s6; 952 s3 = s7; 953 s4 = s8; 954 s5 = s9; 955 s6 = s10; 956 s += 4 * src_stride; 957 d += 4 * dst_stride; 958 h -= 4; 959 } while (h != 0); 960 } else { 961 do { 962 int height = h; 963 const int16_t *s = (const int16_t *)src_ptr; 964 uint16_t *d = dst_ptr; 965 966 int16x8_t s0, s1, s2, s3, s4, s5, s6; 967 load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); 968 s += 7 * src_stride; 969 970 do { 971 
int16x8_t s7, s8, s9, s10; 972 load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); 973 974 uint16x8_t d0 = highbd_12_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, 975 y_filter, offset_vec); 976 uint16x8_t d1 = highbd_12_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, 977 y_filter, offset_vec); 978 uint16x8_t d2 = highbd_12_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, 979 y_filter, offset_vec); 980 uint16x8_t d3 = highbd_12_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, 981 y_filter, offset_vec); 982 983 store_u16_8x4(d, dst_stride, d0, d1, d2, d3); 984 985 s0 = s4; 986 s1 = s5; 987 s2 = s6; 988 s3 = s7; 989 s4 = s8; 990 s5 = s9; 991 s6 = s10; 992 s += 4 * src_stride; 993 d += 4 * dst_stride; 994 height -= 4; 995 } while (height != 0); 996 src_ptr += 8; 997 dst_ptr += 8; 998 w -= 8; 999 } while (w != 0); 1000 } 1001 } 1002 static inline void highbd_dist_wtd_convolve_y_8tap_neon( 1003 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, 1004 int w, int h, const int16_t *y_filter_ptr, const int offset) { 1005 const int16x8_t y_filter = vld1q_s16(y_filter_ptr); 1006 const int32x4_t offset_vec = vdupq_n_s32(offset); 1007 1008 if (w == 4) { 1009 const int16_t *s = (const int16_t *)src_ptr; 1010 uint16_t *d = dst_ptr; 1011 1012 int16x4_t s0, s1, s2, s3, s4, s5, s6; 1013 load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); 1014 s += 7 * src_stride; 1015 1016 do { 1017 int16x4_t s7, s8, s9, s10; 1018 load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); 1019 1020 uint16x4_t d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, 1021 y_filter, offset_vec); 1022 uint16x4_t d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, 1023 y_filter, offset_vec); 1024 uint16x4_t d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, 1025 y_filter, offset_vec); 1026 uint16x4_t d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, 1027 y_filter, offset_vec); 1028 1029 store_u16_4x4(d, dst_stride, d0, d1, d2, d3); 1030 1031 s0 = s4; 1032 s1 = s5; 1033 s2 = s6; 
      // Slide the 7-row history window down by 4 rows.
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    do {
      int height = h;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      int16x8_t s0, s1, s2, s3, s4, s5, s6;
      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
      s += 7 * src_stride;

      do {
        int16x8_t s7, s8, s9, s10;
        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);

        uint16x8_t d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7,
                                           y_filter, offset_vec);
        uint16x8_t d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8,
                                           y_filter, offset_vec);
        uint16x8_t d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9,
                                           y_filter, offset_vec);
        uint16x8_t d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10,
                                           y_filter, offset_vec);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}

// Public entry point for the vertical-only distance-weighted compound
// convolution. Selects the 4/6/8-tap kernel and the 12-bit vs. generic
// implementation, then (when do_average is set) applies the compound
// averaging stage from the intermediate buffer into `dst`.
void av1_highbd_dist_wtd_convolve_y_neon(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
    ConvolveParams *conv_params, int bd) {
  DECLARE_ALIGNED(16, uint16_t,
                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
  CONV_BUF_TYPE *dst16 = conv_params->dst;
  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
  int dst16_stride = conv_params->dst_stride;
  const int im_stride = MAX_SB_SIZE;
  const int vert_offset = filter_params_y->taps / 2 - 1;
  assert(FILTER_BITS == COMPOUND_ROUND1_BITS);
  // Combined rounding shim plus the compound-offset terms added before the
  // narrowing shift in the convolve kernels.
  const int round_offset_conv = (1 << (conv_params->round_0 - 1)) +
                                (1 << (bd + FILTER_BITS)) +
                                (1 << (bd + FILTER_BITS - 1));

  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  src -= vert_offset * src_stride;

  if (bd == 12) {
    if (conv_params->do_average) {
      // Convolve into the intermediate buffer, then average into dst.
      if (y_filter_taps <= 4) {
        // Shorter kernels start further down the source block.
        highbd_12_dist_wtd_convolve_y_4tap_neon(
            src + 2 * src_stride, src_stride, im_block, im_stride, w, h,
            y_filter_ptr, round_offset_conv);
      } else if (y_filter_taps == 6) {
        highbd_12_dist_wtd_convolve_y_6tap_neon(
            src + src_stride, src_stride, im_block, im_stride, w, h,
            y_filter_ptr, round_offset_conv);
      } else {
        highbd_12_dist_wtd_convolve_y_8tap_neon(src, src_stride, im_block,
                                                im_stride, w, h, y_filter_ptr,
                                                round_offset_conv);
      }
      if (conv_params->use_dist_wtd_comp_avg) {
        highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride,
                                         w, h, conv_params);
      } else {
        highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
                                conv_params);
      }
    } else {
      // First reference pass: write directly into the CONV_BUF.
      if (y_filter_taps <= 4) {
        highbd_12_dist_wtd_convolve_y_4tap_neon(
            src + 2 * src_stride, src_stride, dst16, dst16_stride, w, h,
            y_filter_ptr, round_offset_conv);
      } else if (y_filter_taps == 6) {
        highbd_12_dist_wtd_convolve_y_6tap_neon(
            src + src_stride, src_stride, dst16, dst16_stride, w, h,
            y_filter_ptr, round_offset_conv);
      } else {
        highbd_12_dist_wtd_convolve_y_8tap_neon(
            src, src_stride, dst16, dst16_stride, w, h, y_filter_ptr,
            round_offset_conv);
      }
    }
  } else {
    if (conv_params->do_average) {
      if (y_filter_taps <= 4) {
        highbd_dist_wtd_convolve_y_4tap_neon(src + 2 * src_stride, src_stride,
                                             im_block, im_stride, w, h,
                                             y_filter_ptr, round_offset_conv);
      } else if (y_filter_taps == 6) {
        highbd_dist_wtd_convolve_y_6tap_neon(src + src_stride, src_stride,
                                             im_block, im_stride, w, h,
                                             y_filter_ptr, round_offset_conv);
      } else {
        highbd_dist_wtd_convolve_y_8tap_neon(src, src_stride, im_block,
                                             im_stride, w, h, y_filter_ptr,
                                             round_offset_conv);
      }
      if (conv_params->use_dist_wtd_comp_avg) {
        highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w,
                                      h, conv_params, bd);
      } else {
        highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
                             conv_params, bd);
      }
    } else {
      if (y_filter_taps <= 4) {
        highbd_dist_wtd_convolve_y_4tap_neon(src + 2 * src_stride, src_stride,
                                             dst16, dst16_stride, w, h,
                                             y_filter_ptr, round_offset_conv);
      } else if (y_filter_taps == 6) {
        highbd_dist_wtd_convolve_y_6tap_neon(src + src_stride, src_stride,
                                             dst16, dst16_stride, w, h,
                                             y_filter_ptr, round_offset_conv);
      } else {
        highbd_dist_wtd_convolve_y_8tap_neon(src, src_stride, dst16,
                                             dst16_stride, w, h, y_filter_ptr,
                                             round_offset_conv);
      }
    }
  }
}

// Shift-and-offset "copy" kernel used by the 2d_copy compound path: each
// source sample is left-shifted by round_bits and biased by offset.
static inline void highbd_2d_copy_neon(const uint16_t *src_ptr, int src_stride,
                                       uint16_t *dst_ptr, int dst_stride, int w,
                                       int h, const int round_bits,
                                       const int offset) {
  if (w <= 4) {
    const int16x4_t round_shift_s16 = vdup_n_s16(round_bits);
    const uint16x4_t offset_u16 = vdup_n_u16(offset);

    for (int y = 0; y < h; ++y) {
      const uint16x4_t s = vld1_u16(src_ptr + y * src_stride);
      uint16x4_t d = vshl_u16(s, round_shift_s16);
      d = vadd_u16(d, offset_u16);
      if (w == 2) {
        // Width-2 blocks only store the first two lanes.
        store_u16_2x1(dst_ptr + y * dst_stride, d);
      } else {
        vst1_u16(dst_ptr + y * dst_stride, d);
      }
    }
  } else {
    const int16x8_t round_shift_s16 = vdupq_n_s16(round_bits);
    const uint16x8_t offset_u16 = vdupq_n_u16(offset);

    for (int y = 0; y < h; ++y) {
      for (int x = 0; x < w; x += 8) {
        const uint16x8_t s = vld1q_u16(src_ptr + y * src_stride + x);
        uint16x8_t d = vshlq_u16(s,
round_shift_s16); 1209 d = vaddq_u16(d, offset_u16); 1210 vst1q_u16(dst_ptr + y * dst_stride + x, d); 1211 } 1212 } 1213 } 1214 } 1215 1216 void av1_highbd_dist_wtd_convolve_2d_copy_neon(const uint16_t *src, 1217 int src_stride, uint16_t *dst, 1218 int dst_stride, int w, int h, 1219 ConvolveParams *conv_params, 1220 int bd) { 1221 DECLARE_ALIGNED(16, uint16_t, 1222 im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); 1223 1224 const int im_stride = MAX_SB_SIZE; 1225 CONV_BUF_TYPE *dst16 = conv_params->dst; 1226 int dst16_stride = conv_params->dst_stride; 1227 const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; 1228 const int round_offset = (1 << (offset_bits - conv_params->round_1)) + 1229 (1 << (offset_bits - conv_params->round_1 - 1)); 1230 const int round_bits = 1231 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; 1232 assert(round_bits >= 0); 1233 1234 if (conv_params->do_average) { 1235 highbd_2d_copy_neon(src, src_stride, im_block, im_stride, w, h, round_bits, 1236 round_offset); 1237 } else { 1238 highbd_2d_copy_neon(src, src_stride, dst16, dst16_stride, w, h, round_bits, 1239 round_offset); 1240 } 1241 1242 if (conv_params->do_average) { 1243 if (conv_params->use_dist_wtd_comp_avg) { 1244 if (bd == 12) { 1245 highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, 1246 w, h, conv_params); 1247 } else { 1248 highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, 1249 h, conv_params, bd); 1250 } 1251 } else { 1252 if (bd == 12) { 1253 highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, 1254 conv_params); 1255 } else { 1256 highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, 1257 conv_params, bd); 1258 } 1259 } 1260 } 1261 } 1262 1263 static inline uint16x4_t highbd_convolve6_4_2d_v( 1264 const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, 1265 const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, 1266 const int16x8_t y_filter, const int32x4_t offset) 
{ 1267 // Values at indices 0 and 7 of y_filter are zero. 1268 const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); 1269 const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); 1270 1271 int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_0_3, 1); 1272 sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 2); 1273 sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 3); 1274 sum = vmlal_lane_s16(sum, s3, y_filter_4_7, 0); 1275 sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 1); 1276 sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 2); 1277 1278 return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS); 1279 } 1280 1281 static inline uint16x8_t highbd_convolve6_8_2d_v( 1282 const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, 1283 const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, 1284 const int16x8_t y_filter, const int32x4_t offset) { 1285 // Values at indices 0 and 7 of y_filter are zero. 1286 const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); 1287 const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); 1288 1289 int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_0_3, 1); 1290 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 2); 1291 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 3); 1292 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_4_7, 0); 1293 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 1); 1294 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 2); 1295 1296 int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_0_3, 1); 1297 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 2); 1298 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 3); 1299 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_4_7, 0); 1300 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 1); 1301 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 2); 1302 1303 return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS), 1304 vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS)); 
1305 } 1306 1307 static inline void highbd_dist_wtd_convolve_2d_vert_6tap_neon( 1308 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, 1309 int w, int h, const int16_t *y_filter_ptr, int offset) { 1310 const int16x8_t y_filter = vld1q_s16(y_filter_ptr); 1311 const int32x4_t offset_vec = vdupq_n_s32(offset); 1312 1313 if (w == 4) { 1314 const int16_t *s = (const int16_t *)src_ptr; 1315 uint16_t *d = dst_ptr; 1316 1317 int16x4_t s0, s1, s2, s3, s4; 1318 load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); 1319 s += 5 * src_stride; 1320 1321 do { 1322 int16x4_t s5, s6, s7, s8; 1323 load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8); 1324 1325 uint16x4_t d0 = 1326 highbd_convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_vec); 1327 uint16x4_t d1 = 1328 highbd_convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_vec); 1329 uint16x4_t d2 = 1330 highbd_convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_vec); 1331 uint16x4_t d3 = 1332 highbd_convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_vec); 1333 1334 store_u16_4x4(d, dst_stride, d0, d1, d2, d3); 1335 1336 s0 = s4; 1337 s1 = s5; 1338 s2 = s6; 1339 s3 = s7; 1340 s4 = s8; 1341 s += 4 * src_stride; 1342 d += 4 * dst_stride; 1343 h -= 4; 1344 } while (h != 0); 1345 } else { 1346 do { 1347 int height = h; 1348 const int16_t *s = (const int16_t *)src_ptr; 1349 uint16_t *d = dst_ptr; 1350 1351 int16x8_t s0, s1, s2, s3, s4; 1352 load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); 1353 s += 5 * src_stride; 1354 1355 do { 1356 int16x8_t s5, s6, s7, s8; 1357 load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8); 1358 1359 uint16x8_t d0 = highbd_convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, 1360 y_filter, offset_vec); 1361 uint16x8_t d1 = highbd_convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, 1362 y_filter, offset_vec); 1363 uint16x8_t d2 = highbd_convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, 1364 y_filter, offset_vec); 1365 uint16x8_t d3 = highbd_convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, 1366 
y_filter, offset_vec); 1367 1368 store_u16_8x4(d, dst_stride, d0, d1, d2, d3); 1369 1370 s0 = s4; 1371 s1 = s5; 1372 s2 = s6; 1373 s3 = s7; 1374 s4 = s8; 1375 s += 4 * src_stride; 1376 d += 4 * dst_stride; 1377 height -= 4; 1378 } while (height != 0); 1379 src_ptr += 8; 1380 dst_ptr += 8; 1381 w -= 8; 1382 } while (w != 0); 1383 } 1384 } 1385 1386 static inline uint16x4_t highbd_convolve8_4_2d_v( 1387 const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, 1388 const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, 1389 const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter, 1390 const int32x4_t offset) { 1391 const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); 1392 const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); 1393 1394 int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_0_3, 0); 1395 sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1); 1396 sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2); 1397 sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3); 1398 sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0); 1399 sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1); 1400 sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2); 1401 sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3); 1402 1403 return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS); 1404 } 1405 1406 static inline uint16x8_t highbd_convolve8_8_2d_v( 1407 const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, 1408 const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, 1409 const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter, 1410 const int32x4_t offset) { 1411 const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); 1412 const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); 1413 1414 int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_0_3, 0); 1415 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1); 1416 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2); 1417 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3); 1418 sum0 = 
vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0); 1419 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1); 1420 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2); 1421 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3); 1422 1423 int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_0_3, 0); 1424 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1); 1425 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2); 1426 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3); 1427 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0); 1428 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1); 1429 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2); 1430 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3); 1431 1432 return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS), 1433 vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS)); 1434 } 1435 1436 static inline void highbd_dist_wtd_convolve_2d_vert_8tap_neon( 1437 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, 1438 int w, int h, const int16_t *y_filter_ptr, int offset) { 1439 const int16x8_t y_filter = vld1q_s16(y_filter_ptr); 1440 const int32x4_t offset_vec = vdupq_n_s32(offset); 1441 1442 if (w <= 4) { 1443 const int16_t *s = (const int16_t *)src_ptr; 1444 uint16_t *d = dst_ptr; 1445 1446 int16x4_t s0, s1, s2, s3, s4, s5, s6; 1447 load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); 1448 s += 7 * src_stride; 1449 1450 do { 1451 int16x4_t s7, s8, s9, s10; 1452 load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); 1453 1454 uint16x4_t d0 = highbd_convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, 1455 y_filter, offset_vec); 1456 uint16x4_t d1 = highbd_convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, 1457 y_filter, offset_vec); 1458 uint16x4_t d2 = highbd_convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, 1459 y_filter, offset_vec); 1460 uint16x4_t d3 = 
highbd_convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, 1461 y_filter, offset_vec); 1462 1463 store_u16_4x4(d, dst_stride, d0, d1, d2, d3); 1464 1465 s0 = s4; 1466 s1 = s5; 1467 s2 = s6; 1468 s3 = s7; 1469 s4 = s8; 1470 s5 = s9; 1471 s6 = s10; 1472 s += 4 * src_stride; 1473 d += 4 * dst_stride; 1474 h -= 4; 1475 } while (h != 0); 1476 } else { 1477 do { 1478 int height = h; 1479 const int16_t *s = (const int16_t *)src_ptr; 1480 uint16_t *d = dst_ptr; 1481 1482 int16x8_t s0, s1, s2, s3, s4, s5, s6; 1483 load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); 1484 s += 7 * src_stride; 1485 1486 do { 1487 int16x8_t s7, s8, s9, s10; 1488 load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); 1489 1490 uint16x8_t d0 = highbd_convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, 1491 y_filter, offset_vec); 1492 uint16x8_t d1 = highbd_convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, 1493 y_filter, offset_vec); 1494 uint16x8_t d2 = highbd_convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, 1495 y_filter, offset_vec); 1496 uint16x8_t d3 = highbd_convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, 1497 y_filter, offset_vec); 1498 1499 store_u16_8x4(d, dst_stride, d0, d1, d2, d3); 1500 1501 s0 = s4; 1502 s1 = s5; 1503 s2 = s6; 1504 s3 = s7; 1505 s4 = s8; 1506 s5 = s9; 1507 s6 = s10; 1508 s += 4 * src_stride; 1509 d += 4 * dst_stride; 1510 height -= 4; 1511 } while (height != 0); 1512 src_ptr += 8; 1513 dst_ptr += 8; 1514 w -= 8; 1515 } while (w != 0); 1516 } 1517 } 1518 1519 static inline void highbd_12_dist_wtd_convolve_2d_horiz_6tap_neon( 1520 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, 1521 int w, int h, const int16_t *x_filter_ptr, const int offset) { 1522 // The smallest block height is 4, and the horizontal convolution needs to 1523 // process an extra (filter_taps/2 - 1) lines for the vertical convolution. 
  assert(h >= 5);
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  const int16x8_t x_filter = vld1q_s16(x_filter_ptr);

  int height = h;

  // Main loop: four rows per iteration while more than four rows remain.
  do {
    int width = w;
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    do {
      int16x8_t s0[6], s1[6], s2[6], s3[6];
      load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                   &s0[4], &s0[5]);
      load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                   &s1[4], &s1[5]);
      load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                   &s2[4], &s2[5]);
      load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                   &s3[4], &s3[5]);

      uint16x8_t d0 = highbd_12_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
                                            s0[5], x_filter, offset_vec);
      uint16x8_t d1 = highbd_12_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4],
                                            s1[5], x_filter, offset_vec);
      uint16x8_t d2 = highbd_12_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4],
                                            s2[5], x_filter, offset_vec);
      uint16x8_t d3 = highbd_12_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4],
                                            s3[5], x_filter, offset_vec);

      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src_ptr += 4 * src_stride;
    dst_ptr += 4 * dst_stride;
    height -= 4;
  } while (height > 4);

  // Tail loop: remaining rows, one at a time.
  do {
    int width = w;
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    do {
      int16x8_t s0[6];
      load_s16_8x6(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5]);

      uint16x8_t d0 = highbd_12_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
                                            s0[5], x_filter, offset_vec);
      vst1q_u16(d, d0);

      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  } while (--height != 0);
}

// First (horizontal) pass of the 2D compound convolution, 6-tap kernel,
// for bit depths other than 12.
static inline void highbd_dist_wtd_convolve_2d_horiz_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, const int offset) {
  // The smallest block height is 4, and the horizontal convolution needs to
  // process an extra (filter_taps/2 - 1) lines for the vertical convolution.
  assert(h >= 5);
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  const int16x8_t x_filter = vld1q_s16(x_filter_ptr);

  int height = h;

  // Main loop: four rows per iteration while more than four rows remain.
  do {
    int width = w;
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    do {
      int16x8_t s0[6], s1[6], s2[6], s3[6];
      load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                   &s0[4], &s0[5]);
      load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                   &s1[4], &s1[5]);
      load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                   &s2[4], &s2[5]);
      load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                   &s3[4], &s3[5]);

      uint16x8_t d0 = highbd_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
                                         s0[5], x_filter, offset_vec);
      uint16x8_t d1 = highbd_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4],
                                         s1[5], x_filter, offset_vec);
      uint16x8_t d2 = highbd_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4],
                                         s2[5], x_filter, offset_vec);
      uint16x8_t d3 = highbd_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4],
                                         s3[5], x_filter, offset_vec);

      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src_ptr += 4 * src_stride;
    dst_ptr += 4 * dst_stride;
    height -= 4;
  } while (height > 4);

  // Tail loop: remaining rows, one at a time.
  do {
    int width = w;
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    do {
      int16x8_t s0[6];
      load_s16_8x6(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5]);

      uint16x8_t d0 = highbd_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
                                         s0[5], x_filter, offset_vec);
      vst1q_u16(d, d0);

      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  } while (--height != 0);
}

// First (horizontal) pass of the 2D compound convolution, 12-bit variant,
// handling both the width-4 (4-tap) and wider (8-tap) cases.
static inline void highbd_12_dist_wtd_convolve_2d_horiz_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, const int offset) {
  // The smallest block height is 4, and the horizontal convolution needs to
  // process an extra (filter_taps/2 - 1) lines for the vertical convolution.
  assert(h >= 5);
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  if (w == 4) {
    // 4-tap filters are used for blocks having width == 4.
    const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
    const int16_t *s = (const int16_t *)(src_ptr + 1);
    uint16_t *d = dst_ptr;

    do {
      int16x4_t s0[4], s1[4], s2[4], s3[4];
      load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
      load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
      load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
      load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);

      uint16x4_t d0 = highbd_12_convolve4_4_x(s0, x_filter, offset_vec);
      uint16x4_t d1 = highbd_12_convolve4_4_x(s1, x_filter, offset_vec);
      uint16x4_t d2 = highbd_12_convolve4_4_x(s2, x_filter, offset_vec);
      uint16x4_t d3 = highbd_12_convolve4_4_x(s3, x_filter, offset_vec);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h > 4);

    do {
      int16x4_t s0[4];
      load_s16_4x4(s, 1, &s0[0], &s0[1], &s0[2], &s0[3]);

      uint16x4_t d0 = highbd_12_convolve4_4_x(s0, x_filter, offset_vec);
      vst1_u16(d, d0);

      s += src_stride;
      d +=
          dst_stride;
    } while (--h != 0);
  } else {
    const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
    int height = h;

    // Main loop: four rows per iteration while more than four rows remain.
    do {
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        int16x8_t s0[8], s1[8], s2[8], s3[8];
        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                     &s0[4], &s0[5], &s0[6], &s0[7]);
        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                     &s1[4], &s1[5], &s1[6], &s1[7]);
        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                     &s2[4], &s2[5], &s2[6], &s2[7]);
        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                     &s3[4], &s3[5], &s3[6], &s3[7]);

        uint16x8_t d0 =
            highbd_12_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5],
                                  s0[6], s0[7], x_filter, offset_vec);
        uint16x8_t d1 =
            highbd_12_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5],
                                  s1[6], s1[7], x_filter, offset_vec);
        uint16x8_t d2 =
            highbd_12_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5],
                                  s2[6], s2[7], x_filter, offset_vec);
        uint16x8_t d3 =
            highbd_12_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5],
                                  s3[6], s3[7], x_filter, offset_vec);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      height -= 4;
    } while (height > 4);

    // Tail loop: remaining rows, one at a time.
    do {
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        int16x8_t s0[8];
        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                     &s0[4], &s0[5], &s0[6], &s0[7]);

        uint16x8_t d0 =
            highbd_12_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5],
                                  s0[6], s0[7], x_filter, offset_vec);
        vst1q_u16(d, d0);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += src_stride;
      dst_ptr += dst_stride;
    } while (--height != 0);
  }
}

// First (horizontal) pass of the 2D compound convolution for bit depths
// other than 12, handling width-4 (4-tap) and wider (8-tap) cases.
static inline void highbd_dist_wtd_convolve_2d_horiz_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, const int offset) {
  // The smallest block height is 4, and the horizontal convolution needs to
  // process an extra (filter_taps/2 - 1) lines for the vertical convolution.
  assert(h >= 5);
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  if (w == 4) {
    // 4-tap filters are used for blocks having width == 4.
    const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
    const int16_t *s = (const int16_t *)(src_ptr + 1);
    uint16_t *d = dst_ptr;

    do {
      int16x4_t s0[4], s1[4], s2[4], s3[4];
      load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
      load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
      load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
      load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);

      uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset_vec);
      uint16x4_t d1 = highbd_convolve4_4_x(s1, x_filter, offset_vec);
      uint16x4_t d2 = highbd_convolve4_4_x(s2, x_filter, offset_vec);
      uint16x4_t d3 = highbd_convolve4_4_x(s3, x_filter, offset_vec);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h > 4);

    do {
      int16x4_t s0[4];
      load_s16_4x4(s, 1, &s0[0], &s0[1], &s0[2], &s0[3]);

      uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset_vec);
      vst1_u16(d, d0);

      s += src_stride;
      d += dst_stride;
    } while (--h != 0);
  } else {
    const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
    int height = h;

    // Main loop: four rows per iteration while more than four rows remain.
    do {
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        int16x8_t s0[8], s1[8], s2[8], s3[8];
        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                     &s0[4], &s0[5], &s0[6], &s0[7]);
        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                     &s1[4], &s1[5], &s1[6], &s1[7]);
        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                     &s2[4], &s2[5], &s2[6], &s2[7]);
        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                     &s3[4], &s3[5], &s3[6], &s3[7]);

        uint16x8_t d0 =
            highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], s0[6],
                               s0[7], x_filter, offset_vec);
        uint16x8_t d1 =
            highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5], s1[6],
                               s1[7], x_filter, offset_vec);
        uint16x8_t d2 =
            highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5], s2[6],
                               s2[7], x_filter, offset_vec);
        uint16x8_t d3 =
            highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5], s3[6],
                               s3[7], x_filter, offset_vec);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      height -= 4;
    } while (height > 4);

    // Tail loop: remaining rows, one at a time.
    do {
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        int16x8_t s0[8];
        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                     &s0[4], &s0[5], &s0[6], &s0[7]);

        uint16x8_t d0 =
            highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], s0[6],
                               s0[7], x_filter, offset_vec);
        vst1q_u16(d, d0);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += src_stride;
      dst_ptr += dst_stride;
    } while (--height != 0);
  }
}

void av1_highbd_dist_wtd_convolve_2d_neon(
    const uint16_t *src, int
src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  // im_block holds the horizontal-pass intermediate; im_block2 receives the
  // vertical-pass output when compound averaging is performed afterwards.
  DECLARE_ALIGNED(16, uint16_t,
                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
  DECLARE_ALIGNED(16, uint16_t,
                  im_block2[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);

  CONV_BUF_TYPE *dst16 = conv_params->dst;
  int dst16_stride = conv_params->dst_stride;
  // Filters with fewer than 6 taps are serviced by the 6-tap code paths, so
  // clamp the tap counts used for sizing/offset computation to at least 6.
  const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
  const int clamped_x_taps = x_filter_taps < 6 ? 6 : x_filter_taps;
  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
  const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;

  // The horizontal pass must produce (taps - 1) extra rows for the vertical
  // pass.
  const int im_h = h + clamped_y_taps - 1;
  const int im_stride = MAX_SB_SIZE;
  const int vert_offset = clamped_y_taps / 2 - 1;
  const int horiz_offset = clamped_x_taps / 2 - 1;
  // The extra shim of (1 << (conv_params->round_0 - 1)) allows us to use a
  // faster non-rounding non-saturating left shift.
  const int round_offset_conv_x =
      (1 << (bd + FILTER_BITS - 1)) + (1 << (conv_params->round_0 - 1));
  const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const int round_offset_conv_y = (1 << y_offset_bits);

  // Back up to the first sample the filters touch: (taps/2 - 1) rows above
  // and columns left of the block origin.
  const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset;

  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  // horizontal filter
  // bd == 12 uses dedicated kernels (presumably different shift amounts for
  // the wider intermediate range — see the _12_ helpers). The 6-tap fast path
  // is skipped for w == 4, which is handled by the 4-tap path inside the
  // generic routine.
  if (bd == 12) {
    if (x_filter_taps <= 6 && w != 4) {
      highbd_12_dist_wtd_convolve_2d_horiz_6tap_neon(
          src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr,
          round_offset_conv_x);
    } else {
      highbd_12_dist_wtd_convolve_2d_horiz_neon(
          src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr,
          round_offset_conv_x);
    }
  } else {
    if (x_filter_taps <= 6 && w != 4) {
      highbd_dist_wtd_convolve_2d_horiz_6tap_neon(
          src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr,
          round_offset_conv_x);
    } else {
      highbd_dist_wtd_convolve_2d_horiz_neon(src_ptr, src_stride, im_block,
                                             im_stride, w, im_h, x_filter_ptr,
                                             round_offset_conv_x);
    }
  }

  // vertical filter
  // When averaging, write into im_block2 so the compound average below can
  // combine it with conv_params->dst; otherwise write straight to dst16.
  if (y_filter_taps <= 6) {
    if (conv_params->do_average) {
      highbd_dist_wtd_convolve_2d_vert_6tap_neon(im_block, im_stride, im_block2,
                                                 im_stride, w, h, y_filter_ptr,
                                                 round_offset_conv_y);
    } else {
      highbd_dist_wtd_convolve_2d_vert_6tap_neon(
          im_block, im_stride, dst16, dst16_stride, w, h, y_filter_ptr,
          round_offset_conv_y);
    }
  } else {
    if (conv_params->do_average) {
      highbd_dist_wtd_convolve_2d_vert_8tap_neon(im_block, im_stride, im_block2,
                                                 im_stride, w, h, y_filter_ptr,
                                                 round_offset_conv_y);
    } else {
      highbd_dist_wtd_convolve_2d_vert_8tap_neon(
          im_block, im_stride, dst16, dst16_stride, w, h, y_filter_ptr,
          round_offset_conv_y);
    }
  }

  // Do the compound averaging outside the loop, avoids branching within the
  // main loop
  if (conv_params->do_average) {
    if (conv_params->use_dist_wtd_comp_avg) {
      // Distance-weighted average of im_block2 with the reference prediction.
      if (bd == 12) {
        highbd_12_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride,
                                         w, h, conv_params);
      } else {
        highbd_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w,
                                      h, conv_params, bd);
      }
    } else {
      // Plain (unweighted) compound average.
      if (bd == 12) {
        highbd_12_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h,
                                conv_params);
      } else {
        highbd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h,
                             conv_params, bd);
      }
    }
  }
}