highbd_convolve_neon.c (81586B)
1 /* 2 * Copyright (c) 2023, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <assert.h> 13 #include <arm_neon.h> 14 15 #include "config/aom_config.h" 16 #include "config/av1_rtcd.h" 17 18 #include "aom_dsp/aom_dsp_common.h" 19 #include "aom_dsp/arm/mem_neon.h" 20 #include "aom_ports/mem.h" 21 #include "av1/common/convolve.h" 22 #include "av1/common/filter.h" 23 24 static inline uint16x4_t highbd_convolve6_4_y( 25 const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, 26 const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, 27 const int16x8_t y_filter, const uint16x4_t max) { 28 // Values at indices 0 and 7 of y_filter are zero. 29 const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); 30 const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); 31 32 int32x4_t sum = vmull_lane_s16(s0, y_filter_0_3, 1); 33 sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 2); 34 sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 3); 35 sum = vmlal_lane_s16(sum, s3, y_filter_4_7, 0); 36 sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 1); 37 sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 2); 38 39 uint16x4_t res = vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS); 40 return vmin_u16(res, max); 41 } 42 43 static inline uint16x8_t highbd_convolve6_8_y( 44 const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, 45 const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, 46 const int16x8_t y_filter, const uint16x8_t max) { 47 // Values at indices 0 and 7 of y_filter are zero. 
48 const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); 49 const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); 50 51 int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_0_3, 1); 52 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 2); 53 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 3); 54 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_4_7, 0); 55 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 1); 56 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 2); 57 58 int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_0_3, 1); 59 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 2); 60 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 3); 61 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_4_7, 0); 62 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 1); 63 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 2); 64 65 uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS), 66 vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS)); 67 return vminq_u16(res, max); 68 } 69 70 static inline void highbd_convolve_y_sr_6tap_neon( 71 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, 72 int w, int h, const int16_t *y_filter_ptr, const int bd) { 73 const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); 74 75 if (w == 4) { 76 const uint16x4_t max = vdup_n_u16((1 << bd) - 1); 77 const int16_t *s = (const int16_t *)(src_ptr + src_stride); 78 uint16_t *d = dst_ptr; 79 80 int16x4_t s0, s1, s2, s3, s4; 81 load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); 82 s += 5 * src_stride; 83 84 do { 85 int16x4_t s5, s6, s7, s8; 86 load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8); 87 88 uint16x4_t d0 = 89 highbd_convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter_0_7, max); 90 uint16x4_t d1 = 91 highbd_convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter_0_7, max); 92 uint16x4_t d2 = 93 highbd_convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter_0_7, max); 94 
uint16x4_t d3 = 95 highbd_convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter_0_7, max); 96 97 store_u16_4x4(d, dst_stride, d0, d1, d2, d3); 98 99 s0 = s4; 100 s1 = s5; 101 s2 = s6; 102 s3 = s7; 103 s4 = s8; 104 s += 4 * src_stride; 105 d += 4 * dst_stride; 106 h -= 4; 107 } while (h != 0); 108 } else { 109 const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); 110 // Width is a multiple of 8 and height is a multiple of 4. 111 do { 112 int height = h; 113 const int16_t *s = (const int16_t *)(src_ptr + src_stride); 114 uint16_t *d = dst_ptr; 115 116 int16x8_t s0, s1, s2, s3, s4; 117 load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); 118 s += 5 * src_stride; 119 120 do { 121 int16x8_t s5, s6, s7, s8; 122 load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8); 123 124 uint16x8_t d0 = 125 highbd_convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter_0_7, max); 126 uint16x8_t d1 = 127 highbd_convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter_0_7, max); 128 uint16x8_t d2 = 129 highbd_convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter_0_7, max); 130 uint16x8_t d3 = 131 highbd_convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter_0_7, max); 132 133 store_u16_8x4(d, dst_stride, d0, d1, d2, d3); 134 135 s0 = s4; 136 s1 = s5; 137 s2 = s6; 138 s3 = s7; 139 s4 = s8; 140 s += 4 * src_stride; 141 d += 4 * dst_stride; 142 height -= 4; 143 } while (height != 0); 144 145 src_ptr += 8; 146 dst_ptr += 8; 147 w -= 8; 148 } while (w != 0); 149 } 150 } 151 152 static inline uint16x4_t highbd_convolve8_4_y( 153 const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, 154 const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, 155 const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter, 156 const uint16x4_t max) { 157 const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); 158 const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); 159 160 int32x4_t sum = vmull_lane_s16(s0, y_filter_0_3, 0); 161 sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1); 162 sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2); 163 sum = 
vmlal_lane_s16(sum, s3, y_filter_0_3, 3); 164 sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0); 165 sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1); 166 sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2); 167 sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3); 168 169 uint16x4_t res = vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS); 170 return vmin_u16(res, max); 171 } 172 173 static inline uint16x8_t highbd_convolve8_8_y( 174 const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, 175 const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, 176 const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter, 177 const uint16x8_t max) { 178 const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); 179 const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); 180 181 int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_0_3, 0); 182 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1); 183 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2); 184 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3); 185 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0); 186 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1); 187 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2); 188 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3); 189 190 int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_0_3, 0); 191 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1); 192 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2); 193 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3); 194 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0); 195 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1); 196 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2); 197 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3); 198 199 uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS), 200 vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS)); 
201 return vminq_u16(res, max); 202 } 203 204 static inline void highbd_convolve_y_sr_8tap_neon( 205 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, 206 int w, int h, const int16_t *y_filter_ptr, int bd) { 207 const int16x8_t y_filter = vld1q_s16(y_filter_ptr); 208 209 if (w == 4) { 210 const uint16x4_t max = vdup_n_u16((1 << bd) - 1); 211 const int16_t *s = (const int16_t *)src_ptr; 212 uint16_t *d = dst_ptr; 213 214 int16x4_t s0, s1, s2, s3, s4, s5, s6; 215 load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); 216 s += 7 * src_stride; 217 218 do { 219 int16x4_t s7, s8, s9, s10; 220 load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); 221 222 uint16x4_t d0 = 223 highbd_convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, max); 224 uint16x4_t d1 = 225 highbd_convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, max); 226 uint16x4_t d2 = 227 highbd_convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, max); 228 uint16x4_t d3 = 229 highbd_convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, max); 230 231 store_u16_4x4(d, dst_stride, d0, d1, d2, d3); 232 233 s0 = s4; 234 s1 = s5; 235 s2 = s6; 236 s3 = s7; 237 s4 = s8; 238 s5 = s9; 239 s6 = s10; 240 s += 4 * src_stride; 241 d += 4 * dst_stride; 242 h -= 4; 243 } while (h != 0); 244 } else { 245 const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); 246 247 do { 248 int height = h; 249 const int16_t *s = (const int16_t *)src_ptr; 250 uint16_t *d = dst_ptr; 251 252 int16x8_t s0, s1, s2, s3, s4, s5, s6; 253 load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); 254 s += 7 * src_stride; 255 256 do { 257 int16x8_t s7, s8, s9, s10; 258 load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); 259 260 uint16x8_t d0 = 261 highbd_convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, max); 262 uint16x8_t d1 = 263 highbd_convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, max); 264 uint16x8_t d2 = 265 highbd_convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, max); 266 
uint16x8_t d3 = highbd_convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, 267 y_filter, max); 268 269 store_u16_8x4(d, dst_stride, d0, d1, d2, d3); 270 271 s0 = s4; 272 s1 = s5; 273 s2 = s6; 274 s3 = s7; 275 s4 = s8; 276 s5 = s9; 277 s6 = s10; 278 s += 4 * src_stride; 279 d += 4 * dst_stride; 280 height -= 4; 281 } while (height != 0); 282 src_ptr += 8; 283 dst_ptr += 8; 284 w -= 8; 285 } while (w != 0); 286 } 287 } 288 289 static inline uint16x4_t highbd_convolve12_4_y( 290 const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, 291 const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, 292 const int16x4_t s6, const int16x4_t s7, const int16x4_t s8, 293 const int16x4_t s9, const int16x4_t s10, const int16x4_t s11, 294 const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11, 295 const uint16x4_t max) { 296 const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7); 297 const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7); 298 299 int32x4_t sum = vmull_lane_s16(s0, y_filter_0_3, 0); 300 sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1); 301 sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2); 302 sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3); 303 sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0); 304 sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1); 305 sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2); 306 sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3); 307 sum = vmlal_lane_s16(sum, s8, y_filter_8_11, 0); 308 sum = vmlal_lane_s16(sum, s9, y_filter_8_11, 1); 309 sum = vmlal_lane_s16(sum, s10, y_filter_8_11, 2); 310 sum = vmlal_lane_s16(sum, s11, y_filter_8_11, 3); 311 312 uint16x4_t res = vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS); 313 return vmin_u16(res, max); 314 } 315 316 static inline uint16x8_t highbd_convolve12_8_y( 317 const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, 318 const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, 319 const int16x8_t s6, const int16x8_t s7, const int16x8_t s8, 320 const int16x8_t s9, const int16x8_t s10, const 
int16x8_t s11, 321 const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11, 322 const uint16x8_t max) { 323 const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7); 324 const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7); 325 326 int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_0_3, 0); 327 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1); 328 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2); 329 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3); 330 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0); 331 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1); 332 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2); 333 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3); 334 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s8), y_filter_8_11, 0); 335 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s9), y_filter_8_11, 1); 336 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s10), y_filter_8_11, 2); 337 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s11), y_filter_8_11, 3); 338 339 int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_0_3, 0); 340 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1); 341 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2); 342 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3); 343 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0); 344 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1); 345 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2); 346 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3); 347 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s8), y_filter_8_11, 0); 348 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s9), y_filter_8_11, 1); 349 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s10), y_filter_8_11, 2); 350 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s11), y_filter_8_11, 3); 351 352 uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS), 353 
vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS)); 354 return vminq_u16(res, max); 355 } 356 357 static inline void highbd_convolve_y_sr_12tap_neon( 358 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, 359 int w, int h, const int16_t *y_filter_ptr, int bd) { 360 const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); 361 const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8); 362 363 if (w == 4) { 364 const uint16x4_t max = vdup_n_u16((1 << bd) - 1); 365 const int16_t *s = (const int16_t *)src_ptr; 366 uint16_t *d = dst_ptr; 367 368 int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; 369 load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, 370 &s9, &s10); 371 s += 11 * src_stride; 372 373 do { 374 int16x4_t s11, s12, s13, s14; 375 load_s16_4x4(s, src_stride, &s11, &s12, &s13, &s14); 376 377 uint16x4_t d0 = 378 highbd_convolve12_4_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, 379 s11, y_filter_0_7, y_filter_8_11, max); 380 uint16x4_t d1 = 381 highbd_convolve12_4_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, 382 s12, y_filter_0_7, y_filter_8_11, max); 383 uint16x4_t d2 = 384 highbd_convolve12_4_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, 385 s13, y_filter_0_7, y_filter_8_11, max); 386 uint16x4_t d3 = 387 highbd_convolve12_4_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, 388 s14, y_filter_0_7, y_filter_8_11, max); 389 390 store_u16_4x4(d, dst_stride, d0, d1, d2, d3); 391 392 s0 = s4; 393 s1 = s5; 394 s2 = s6; 395 s3 = s7; 396 s4 = s8; 397 s5 = s9; 398 s6 = s10; 399 s7 = s11; 400 s8 = s12; 401 s9 = s13; 402 s10 = s14; 403 s += 4 * src_stride; 404 d += 4 * dst_stride; 405 h -= 4; 406 } while (h != 0); 407 } else { 408 const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); 409 410 do { 411 int height = h; 412 const int16_t *s = (const int16_t *)src_ptr; 413 uint16_t *d = dst_ptr; 414 415 int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; 416 load_s16_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, 
&s8, 417 &s9, &s10); 418 s += 11 * src_stride; 419 420 do { 421 int16x8_t s11, s12, s13, s14; 422 load_s16_8x4(s, src_stride, &s11, &s12, &s13, &s14); 423 424 uint16x8_t d0 = 425 highbd_convolve12_8_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, 426 s11, y_filter_0_7, y_filter_8_11, max); 427 uint16x8_t d1 = 428 highbd_convolve12_8_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, 429 s12, y_filter_0_7, y_filter_8_11, max); 430 uint16x8_t d2 = 431 highbd_convolve12_8_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, 432 s13, y_filter_0_7, y_filter_8_11, max); 433 uint16x8_t d3 = 434 highbd_convolve12_8_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, 435 s13, s14, y_filter_0_7, y_filter_8_11, max); 436 437 store_u16_8x4(d, dst_stride, d0, d1, d2, d3); 438 439 s0 = s4; 440 s1 = s5; 441 s2 = s6; 442 s3 = s7; 443 s4 = s8; 444 s5 = s9; 445 s6 = s10; 446 s7 = s11; 447 s8 = s12; 448 s9 = s13; 449 s10 = s14; 450 s += 4 * src_stride; 451 d += 4 * dst_stride; 452 height -= 4; 453 } while (height != 0); 454 455 src_ptr += 8; 456 dst_ptr += 8; 457 w -= 8; 458 } while (w != 0); 459 } 460 } 461 462 void av1_highbd_convolve_y_sr_neon(const uint16_t *src, int src_stride, 463 uint16_t *dst, int dst_stride, int w, int h, 464 const InterpFilterParams *filter_params_y, 465 const int subpel_y_qn, int bd) { 466 if (w == 2 || h == 2) { 467 av1_highbd_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, 468 filter_params_y, subpel_y_qn, bd); 469 return; 470 } 471 const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); 472 const int vert_offset = filter_params_y->taps / 2 - 1; 473 const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( 474 filter_params_y, subpel_y_qn & SUBPEL_MASK); 475 476 src -= vert_offset * src_stride; 477 478 if (y_filter_taps > 8) { 479 highbd_convolve_y_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h, 480 y_filter_ptr, bd); 481 return; 482 } 483 if (y_filter_taps < 8) { 484 highbd_convolve_y_sr_6tap_neon(src, src_stride, dst, 
dst_stride, w, h, 485 y_filter_ptr, bd); 486 return; 487 } 488 489 highbd_convolve_y_sr_8tap_neon(src, src_stride, dst, dst_stride, w, h, 490 y_filter_ptr, bd); 491 } 492 493 static inline uint16x8_t highbd_convolve6_8_x(const int16x8_t s[6], 494 const int16x8_t x_filter, 495 const int32x4_t offset, 496 const uint16x8_t max) { 497 // Values at indices 0 and 7 of y_filter are zero. 498 const int16x4_t x_filter_0_3 = vget_low_s16(x_filter); 499 const int16x4_t x_filter_4_7 = vget_high_s16(x_filter); 500 501 int32x4_t sum0 = offset; 502 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[0]), x_filter_0_3, 1); 503 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 2); 504 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 3); 505 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_4_7, 0); 506 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 1); 507 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 2); 508 509 int32x4_t sum1 = offset; 510 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[0]), x_filter_0_3, 1); 511 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 2); 512 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 3); 513 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_4_7, 0); 514 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 1); 515 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 2); 516 517 uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS), 518 vqrshrun_n_s32(sum1, FILTER_BITS)); 519 return vminq_u16(res, max); 520 } 521 522 static inline void highbd_convolve_x_sr_6tap_neon( 523 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, 524 int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params, 525 int bd) { 526 const int16x8_t x_filter = vld1q_s16(x_filter_ptr); 527 const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); 528 // This shim allows to do only one rounding shift instead of two. 
529 const int32x4_t offset = vdupq_n_s32(1 << (conv_params->round_0 - 1)); 530 531 int height = h; 532 533 do { 534 int width = w; 535 const int16_t *s = (const int16_t *)src_ptr; 536 uint16_t *d = dst_ptr; 537 538 do { 539 int16x8_t s0[6], s1[6], s2[6], s3[6]; 540 load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], 541 &s0[4], &s0[5]); 542 load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], 543 &s1[4], &s1[5]); 544 load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], 545 &s2[4], &s2[5]); 546 load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], 547 &s3[4], &s3[5]); 548 549 uint16x8_t d0 = highbd_convolve6_8_x(s0, x_filter, offset, max); 550 uint16x8_t d1 = highbd_convolve6_8_x(s1, x_filter, offset, max); 551 uint16x8_t d2 = highbd_convolve6_8_x(s2, x_filter, offset, max); 552 uint16x8_t d3 = highbd_convolve6_8_x(s3, x_filter, offset, max); 553 554 store_u16_8x4(d, dst_stride, d0, d1, d2, d3); 555 556 s += 8; 557 d += 8; 558 width -= 8; 559 } while (width != 0); 560 561 src_ptr += 4 * src_stride; 562 dst_ptr += 4 * dst_stride; 563 height -= 4; 564 } while (height != 0); 565 } 566 567 static inline uint16x4_t highbd_convolve4_4_x(const int16x4_t s[4], 568 const int16x4_t x_filter, 569 const int32x4_t offset, 570 const uint16x4_t max) { 571 int32x4_t sum = offset; 572 sum = vmlal_lane_s16(sum, s[0], x_filter, 0); 573 sum = vmlal_lane_s16(sum, s[1], x_filter, 1); 574 sum = vmlal_lane_s16(sum, s[2], x_filter, 2); 575 sum = vmlal_lane_s16(sum, s[3], x_filter, 3); 576 577 uint16x4_t res = vqrshrun_n_s32(sum, FILTER_BITS); 578 return vmin_u16(res, max); 579 } 580 581 static inline uint16x8_t highbd_convolve8_8_x(const int16x8_t s[8], 582 const int16x8_t x_filter, 583 const int32x4_t offset, 584 const uint16x8_t max) { 585 const int16x4_t x_filter_0_3 = vget_low_s16(x_filter); 586 const int16x4_t x_filter_4_7 = vget_high_s16(x_filter); 587 588 int32x4_t sum0 = offset; 589 sum0 = vmlal_lane_s16(sum0, 
vget_low_s16(s[0]), x_filter_0_3, 0); 590 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 1); 591 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 2); 592 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_0_3, 3); 593 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 0); 594 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 1); 595 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[6]), x_filter_4_7, 2); 596 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[7]), x_filter_4_7, 3); 597 598 int32x4_t sum1 = offset; 599 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[0]), x_filter_0_3, 0); 600 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 1); 601 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 2); 602 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_0_3, 3); 603 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 0); 604 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 1); 605 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[6]), x_filter_4_7, 2); 606 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[7]), x_filter_4_7, 3); 607 608 uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS), 609 vqrshrun_n_s32(sum1, FILTER_BITS)); 610 return vminq_u16(res, max); 611 } 612 613 static inline void highbd_convolve_x_sr_neon(const uint16_t *src_ptr, 614 int src_stride, uint16_t *dst_ptr, 615 int dst_stride, int w, int h, 616 const int16_t *x_filter_ptr, 617 ConvolveParams *conv_params, 618 int bd) { 619 // This shim allows to do only one rounding shift instead of two. 620 const int32x4_t offset = vdupq_n_s32(1 << (conv_params->round_0 - 1)); 621 622 if (w == 4) { 623 const uint16x4_t max = vdup_n_u16((1 << bd) - 1); 624 // 4-tap filters are used for blocks having width == 4. 
625 const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); 626 const int16_t *s = (const int16_t *)(src_ptr + 2); 627 uint16_t *d = dst_ptr; 628 629 do { 630 int16x4_t s0[4], s1[4], s2[4], s3[4]; 631 load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); 632 load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); 633 load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); 634 load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); 635 636 uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset, max); 637 uint16x4_t d1 = highbd_convolve4_4_x(s1, x_filter, offset, max); 638 uint16x4_t d2 = highbd_convolve4_4_x(s2, x_filter, offset, max); 639 uint16x4_t d3 = highbd_convolve4_4_x(s3, x_filter, offset, max); 640 641 store_u16_4x4(d, dst_stride, d0, d1, d2, d3); 642 643 s += 4 * src_stride; 644 d += 4 * dst_stride; 645 h -= 4; 646 } while (h != 0); 647 } else { 648 const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); 649 const int16x8_t x_filter = vld1q_s16(x_filter_ptr); 650 int height = h; 651 652 do { 653 int width = w; 654 const int16_t *s = (const int16_t *)src_ptr; 655 uint16_t *d = dst_ptr; 656 657 do { 658 int16x8_t s0[8], s1[8], s2[8], s3[8]; 659 load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], 660 &s0[4], &s0[5], &s0[6], &s0[7]); 661 load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], 662 &s1[4], &s1[5], &s1[6], &s1[7]); 663 load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], 664 &s2[4], &s2[5], &s2[6], &s2[7]); 665 load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], 666 &s3[4], &s3[5], &s3[6], &s3[7]); 667 668 uint16x8_t d0 = highbd_convolve8_8_x(s0, x_filter, offset, max); 669 uint16x8_t d1 = highbd_convolve8_8_x(s1, x_filter, offset, max); 670 uint16x8_t d2 = highbd_convolve8_8_x(s2, x_filter, offset, max); 671 uint16x8_t d3 = highbd_convolve8_8_x(s3, x_filter, offset, max); 672 673 store_u16_8x4(d, dst_stride, d0, d1, d2, d3); 674 675 s += 
8; 676 d += 8; 677 width -= 8; 678 } while (width != 0); 679 src_ptr += 4 * src_stride; 680 dst_ptr += 4 * dst_stride; 681 height -= 4; 682 } while (height != 0); 683 } 684 } 685 686 static inline uint16x4_t highbd_convolve12_4_x(const int16x4_t s[12], 687 const int16x8_t x_filter_0_7, 688 const int16x4_t x_filter_8_11, 689 const int32x4_t offset, 690 const uint16x4_t max) { 691 const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7); 692 const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7); 693 694 int32x4_t sum = offset; 695 sum = vmlal_lane_s16(sum, s[0], x_filter_0_3, 0); 696 sum = vmlal_lane_s16(sum, s[1], x_filter_0_3, 1); 697 sum = vmlal_lane_s16(sum, s[2], x_filter_0_3, 2); 698 sum = vmlal_lane_s16(sum, s[3], x_filter_0_3, 3); 699 sum = vmlal_lane_s16(sum, s[4], x_filter_4_7, 0); 700 sum = vmlal_lane_s16(sum, s[5], x_filter_4_7, 1); 701 sum = vmlal_lane_s16(sum, s[6], x_filter_4_7, 2); 702 sum = vmlal_lane_s16(sum, s[7], x_filter_4_7, 3); 703 sum = vmlal_lane_s16(sum, s[8], x_filter_8_11, 0); 704 sum = vmlal_lane_s16(sum, s[9], x_filter_8_11, 1); 705 sum = vmlal_lane_s16(sum, s[10], x_filter_8_11, 2); 706 sum = vmlal_lane_s16(sum, s[11], x_filter_8_11, 3); 707 708 uint16x4_t res = vqrshrun_n_s32(sum, FILTER_BITS); 709 return vmin_u16(res, max); 710 } 711 712 static inline uint16x8_t highbd_convolve12_8_x(const int16x8_t s[12], 713 const int16x8_t x_filter_0_7, 714 const int16x4_t x_filter_8_11, 715 const int32x4_t offset, 716 const uint16x8_t max) { 717 const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7); 718 const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7); 719 720 int32x4_t sum0 = offset; 721 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[0]), x_filter_0_3, 0); 722 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 1); 723 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 2); 724 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_0_3, 3); 725 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 0); 
726 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 1); 727 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[6]), x_filter_4_7, 2); 728 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[7]), x_filter_4_7, 3); 729 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[8]), x_filter_8_11, 0); 730 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[9]), x_filter_8_11, 1); 731 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[10]), x_filter_8_11, 2); 732 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[11]), x_filter_8_11, 3); 733 734 int32x4_t sum1 = offset; 735 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[0]), x_filter_0_3, 0); 736 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 1); 737 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 2); 738 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_0_3, 3); 739 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 0); 740 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 1); 741 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[6]), x_filter_4_7, 2); 742 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[7]), x_filter_4_7, 3); 743 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[8]), x_filter_8_11, 0); 744 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[9]), x_filter_8_11, 1); 745 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[10]), x_filter_8_11, 2); 746 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[11]), x_filter_8_11, 3); 747 748 uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS), 749 vqrshrun_n_s32(sum1, FILTER_BITS)); 750 return vminq_u16(res, max); 751 } 752 753 static inline void highbd_convolve_x_sr_12tap_neon( 754 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, 755 int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params, 756 int bd) { 757 // This shim allows to do only one rounding shift instead of two. 
758 const int32x4_t offset = vdupq_n_s32(1 << (conv_params->round_0 - 1)); 759 const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr); 760 const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8); 761 762 if (w == 4) { 763 const uint16x4_t max = vdup_n_u16((1 << bd) - 1); 764 const int16_t *s = (const int16_t *)src_ptr; 765 uint16_t *d = dst_ptr; 766 767 do { 768 int16x4_t s0[12], s1[12], s2[12], s3[12]; 769 load_s16_4x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], 770 &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10], 771 &s0[11]); 772 load_s16_4x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], 773 &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10], 774 &s1[11]); 775 load_s16_4x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], 776 &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10], 777 &s2[11]); 778 load_s16_4x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], 779 &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10], 780 &s3[11]); 781 782 uint16x4_t d0 = 783 highbd_convolve12_4_x(s0, x_filter_0_7, x_filter_8_11, offset, max); 784 uint16x4_t d1 = 785 highbd_convolve12_4_x(s1, x_filter_0_7, x_filter_8_11, offset, max); 786 uint16x4_t d2 = 787 highbd_convolve12_4_x(s2, x_filter_0_7, x_filter_8_11, offset, max); 788 uint16x4_t d3 = 789 highbd_convolve12_4_x(s3, x_filter_0_7, x_filter_8_11, offset, max); 790 791 store_u16_4x4(d, dst_stride, d0, d1, d2, d3); 792 793 s += 4 * src_stride; 794 d += 4 * dst_stride; 795 h -= 4; 796 } while (h != 0); 797 } else { 798 const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); 799 int height = h; 800 801 do { 802 int width = w; 803 const int16_t *s = (const int16_t *)src_ptr; 804 uint16_t *d = dst_ptr; 805 806 do { 807 int16x8_t s0[12], s1[12], s2[12], s3[12]; 808 load_s16_8x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], 809 &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10], 810 &s0[11]); 811 load_s16_8x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], 812 
&s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10], 813 &s1[11]); 814 load_s16_8x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], 815 &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10], 816 &s2[11]); 817 load_s16_8x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], 818 &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10], 819 &s3[11]); 820 821 uint16x8_t d0 = 822 highbd_convolve12_8_x(s0, x_filter_0_7, x_filter_8_11, offset, max); 823 uint16x8_t d1 = 824 highbd_convolve12_8_x(s1, x_filter_0_7, x_filter_8_11, offset, max); 825 uint16x8_t d2 = 826 highbd_convolve12_8_x(s2, x_filter_0_7, x_filter_8_11, offset, max); 827 uint16x8_t d3 = 828 highbd_convolve12_8_x(s3, x_filter_0_7, x_filter_8_11, offset, max); 829 830 store_u16_8x4(d, dst_stride, d0, d1, d2, d3); 831 832 s += 8; 833 d += 8; 834 width -= 8; 835 } while (width != 0); 836 src_ptr += 4 * src_stride; 837 dst_ptr += 4 * dst_stride; 838 height -= 4; 839 } while (height != 0); 840 } 841 } 842 843 void av1_highbd_convolve_x_sr_neon(const uint16_t *src, int src_stride, 844 uint16_t *dst, int dst_stride, int w, int h, 845 const InterpFilterParams *filter_params_x, 846 const int subpel_x_qn, 847 ConvolveParams *conv_params, int bd) { 848 if (w == 2 || h == 2) { 849 av1_highbd_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, 850 filter_params_x, subpel_x_qn, conv_params, bd); 851 return; 852 } 853 const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); 854 const int horiz_offset = filter_params_x->taps / 2 - 1; 855 const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( 856 filter_params_x, subpel_x_qn & SUBPEL_MASK); 857 858 src -= horiz_offset; 859 860 if (x_filter_taps > 8) { 861 highbd_convolve_x_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h, 862 x_filter_ptr, conv_params, bd); 863 return; 864 } 865 if (x_filter_taps <= 6 && w != 4) { 866 highbd_convolve_x_sr_6tap_neon(src + 1, src_stride, dst, dst_stride, w, h, 867 x_filter_ptr, conv_params, 
                                   bd);
    return;
  }

  highbd_convolve_x_sr_neon(src, src_stride, dst, dst_stride, w, h,
                            x_filter_ptr, conv_params, bd);
}

// Vertical (second) pass of a high bitdepth 2D 6-tap convolution for a
// column of 4 pixels: six sample rows are multiplied by taps 1..6 of
// y_filter, the intermediate rounding offset is folded into the accumulator,
// the result is shifted down (round_shift holds -round_1, so vshlq performs
// a right shift) and clamped to [0, (1 << bd) - 1] via max.
static inline uint16x4_t highbd_convolve6_4_2d_v(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x8_t y_filter, const int32x4_t round_shift,
    const int32x4_t offset, const uint16x4_t max) {
  // Values at indices 0 and 7 of y_filter are zero.
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);

  int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_0_3, 1);
  sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 2);
  sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 3);
  sum = vmlal_lane_s16(sum, s3, y_filter_4_7, 0);
  sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 1);
  sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 2);

  sum = vshlq_s32(sum, round_shift);
  // Saturating narrow to unsigned 16-bit, then clamp to the bitdepth max.
  uint16x4_t res = vqmovun_s32(sum);
  return vmin_u16(res, max);
}

// As highbd_convolve6_4_2d_v, but for a row of 8 pixels (two 32-bit
// accumulators, low and high halves).
static inline uint16x8_t highbd_convolve6_8_2d_v(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t y_filter, const int32x4_t round_shift,
    const int32x4_t offset, const uint16x8_t max) {
  // Values at indices 0 and 7 of y_filter are zero.
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);

  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_0_3, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_4_7, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 2);

  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_0_3, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_4_7, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 2);

  sum0 = vshlq_s32(sum0, round_shift);
  sum1 = vshlq_s32(sum1, round_shift);

  uint16x8_t res = vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
  return vminq_u16(res, max);
}

// Vertical (second) pass of the high bitdepth 2D convolution with a 6-tap
// y filter. The block is processed in 4x4 (w == 4) or 8x4 tiles, keeping a
// five-row window of loaded samples and sliding it down four rows per
// iteration.
static inline void highbd_convolve_2d_sr_vert_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params,
    int bd, const int offset) {
  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
  const int32x4_t offset_s32 = vdupq_n_s32(offset);
  const int round1_shift = conv_params->round_1;
  // Negated so that vshlq_s32 in the helpers performs a right shift.
  const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_shift);

  if (w == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;
    int16x4_t s0, s1, s2, s3, s4;
    load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
    s += 5 * src_stride;

    do {
      int16x4_t s5, s6, s7, s8;
      load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8);

      uint16x4_t d0 = highbd_convolve6_4_2d_v(
          s0, s1, s2, s3, s4, s5, y_filter, round1_shift_s32, offset_s32, max);
      uint16x4_t d1 = highbd_convolve6_4_2d_v(
          s1, s2, s3, s4, s5, s6, y_filter, round1_shift_s32, offset_s32, max);
      uint16x4_t d2 = highbd_convolve6_4_2d_v(
          s2, s3, s4, s5, s6, s7, y_filter, round1_shift_s32, offset_s32, max);
      uint16x4_t d3 = highbd_convolve6_4_2d_v(
          s3, s4, s5, s6, s7, s8, y_filter, round1_shift_s32, offset_s32, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      // Slide the five-row window down by four rows.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    do {
      int height = h;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;
      int16x8_t s0, s1, s2, s3, s4;
      load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
      s += 5 * src_stride;

      do {
        int16x8_t s5, s6, s7, s8;
        load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);

        uint16x8_t d0 =
            highbd_convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter,
                                    round1_shift_s32, offset_s32, max);
        uint16x8_t d1 =
            highbd_convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter,
                                    round1_shift_s32, offset_s32, max);
        uint16x8_t d2 =
            highbd_convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter,
                                    round1_shift_s32, offset_s32, max);
        uint16x8_t d3 =
            highbd_convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter,
                                    round1_shift_s32, offset_s32, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        // Slide the five-row window down by four rows.
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}

// Vertical pass of a high bitdepth 2D 8-tap convolution for a column of 4
// pixels; all eight taps are applied. Signature continues on the next span.
static inline uint16x4_t highbd_convolve8_4_2d_v(
    const int16x4_t s0, const
int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter,
    const int32x4_t round_shift, const int32x4_t offset, const uint16x4_t max) {
  const int16x4_t y_filter_lo = vget_low_s16(y_filter);
  const int16x4_t y_filter_hi = vget_high_s16(y_filter);

  // Accumulate all eight taps on top of the rounding offset.
  int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_lo, 0);
  sum = vmlal_lane_s16(sum, s1, y_filter_lo, 1);
  sum = vmlal_lane_s16(sum, s2, y_filter_lo, 2);
  sum = vmlal_lane_s16(sum, s3, y_filter_lo, 3);
  sum = vmlal_lane_s16(sum, s4, y_filter_hi, 0);
  sum = vmlal_lane_s16(sum, s5, y_filter_hi, 1);
  sum = vmlal_lane_s16(sum, s6, y_filter_hi, 2);
  sum = vmlal_lane_s16(sum, s7, y_filter_hi, 3);

  // round_shift holds -round_1, so this is a right shift.
  sum = vshlq_s32(sum, round_shift);
  uint16x4_t res = vqmovun_s32(sum);
  return vmin_u16(res, max);
}

// As highbd_convolve8_4_2d_v, but for a row of 8 pixels (two 32-bit
// accumulators, low and high halves).
static inline uint16x8_t highbd_convolve8_8_2d_v(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
    const int32x4_t round_shift, const int32x4_t offset, const uint16x8_t max) {
  const int16x4_t y_filter_lo = vget_low_s16(y_filter);
  const int16x4_t y_filter_hi = vget_high_s16(y_filter);

  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_lo, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_lo, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_lo, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_lo, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_hi, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_hi, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_hi, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_hi, 3);

  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_lo, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_lo, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_lo, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_lo, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_hi, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_hi, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_hi, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_hi, 3);

  sum0 = vshlq_s32(sum0, round_shift);
  sum1 = vshlq_s32(sum1, round_shift);

  uint16x8_t res = vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
  return vminq_u16(res, max);
}

// Vertical (second) pass of the high bitdepth 2D convolution with an 8-tap
// y filter. The block is processed in 4x4 (w == 4) or 8x4 tiles, keeping a
// seven-row window of loaded samples and sliding it down four rows per
// iteration.
static inline void highbd_convolve_2d_sr_vert_8tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params,
    int bd, const int offset) {
  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
  const int32x4_t offset_s32 = vdupq_n_s32(offset);
  const int round1_shift = conv_params->round_1;
  // Negated so that vshlq_s32 in the helpers performs a right shift.
  const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_shift);

  if (w == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    int16x4_t s0, s1, s2, s3, s4, s5, s6;
    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
    s += 7 * src_stride;

    do {
      int16x4_t s7, s8, s9, s10;
      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);

      uint16x4_t d0 =
          highbd_convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
                                  round1_shift_s32, offset_s32, max);
      uint16x4_t d1 =
          highbd_convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
                                  round1_shift_s32, offset_s32, max);
      uint16x4_t d2 =
          highbd_convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
                                  round1_shift_s32, offset_s32, max);
      uint16x4_t d3 =
          highbd_convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
                                  round1_shift_s32, offset_s32, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      // Slide the seven-row window down by four rows.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    do {
      int height = h;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      int16x8_t s0, s1, s2, s3, s4, s5, s6;
      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
      s += 7 * src_stride;

      do {
        int16x8_t s7, s8, s9, s10;
        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);

        uint16x8_t d0 =
            highbd_convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
                                    round1_shift_s32, offset_s32, max);
        uint16x8_t d1 =
            highbd_convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
                                    round1_shift_s32, offset_s32, max);
        uint16x8_t d2 =
            highbd_convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
                                    round1_shift_s32, offset_s32, max);
        uint16x8_t d3 =
            highbd_convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
                                    round1_shift_s32, offset_s32, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        // Slide the seven-row window down by four rows.
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}

// Vertical pass of a high bitdepth 2D 12-tap convolution for a column of 4
// pixels; all twelve taps are applied. Signature continues on the next span.
static inline uint16x4_t highbd_convolve12_4_2d_v(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16x4_t
s8,
    const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
    const int32x4_t round_shift, const int32x4_t offset, const uint16x4_t max) {
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);

  // Accumulate all twelve taps on top of the rounding offset.
  int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_0_3, 0);
  sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1);
  sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2);
  sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3);
  sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0);
  sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1);
  sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2);
  sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3);
  sum = vmlal_lane_s16(sum, s8, y_filter_8_11, 0);
  sum = vmlal_lane_s16(sum, s9, y_filter_8_11, 1);
  sum = vmlal_lane_s16(sum, s10, y_filter_8_11, 2);
  sum = vmlal_lane_s16(sum, s11, y_filter_8_11, 3);

  // round_shift holds -round_1, so this is a right shift.
  sum = vshlq_s32(sum, round_shift);
  uint16x4_t res = vqmovun_s32(sum);
  return vmin_u16(res, max);
}

// As highbd_convolve12_4_2d_v, but for a row of 8 pixels (two 32-bit
// accumulators, low and high halves).
static inline uint16x8_t highbd_convolve12_8_2d_v(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16x8_t s8,
    const int16x8_t s9, const int16x8_t s10, const int16x8_t s11,
    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
    const int32x4_t round_shift, const int32x4_t offset, const uint16x8_t max) {
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);

  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_0_3, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s8), y_filter_8_11, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s9), y_filter_8_11, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s10), y_filter_8_11, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s11), y_filter_8_11, 3);

  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_0_3, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s8), y_filter_8_11, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s9), y_filter_8_11, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s10), y_filter_8_11, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s11), y_filter_8_11, 3);

  sum0 = vshlq_s32(sum0, round_shift);
  sum1 = vshlq_s32(sum1, round_shift);

  uint16x8_t res = vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
  return vminq_u16(res, max);
}

// Vertical (second) pass of the high bitdepth 2D convolution with a 12-tap
// y filter. The block is processed in 4x4 (w == 4) or 8x4 tiles, keeping an
// eleven-row window of loaded samples and sliding it down four rows per
// iteration.
static inline void highbd_convolve_2d_sr_vert_12tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params,
    const int bd, const int offset) {
  const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
  const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8);
  const int32x4_t offset_s32 = vdupq_n_s32(offset);
  const int round1_shift = conv_params->round_1;
  // Negated so that vshlq_s32 in the helpers performs a right shift.
  const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_shift);

  if (w == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
    load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
                  &s9, &s10);
    s += 11 * src_stride;

    do {
      int16x4_t s11, s12, s13, s14;
      load_s16_4x4(s, src_stride, &s11, &s12, &s13, &s14);

      uint16x4_t d0 = highbd_convolve12_4_2d_v(
          s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, y_filter_0_7,
          y_filter_8_11, round1_shift_s32, offset_s32, max);
      uint16x4_t d1 = highbd_convolve12_4_2d_v(
          s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, y_filter_0_7,
          y_filter_8_11, round1_shift_s32, offset_s32, max);
      uint16x4_t d2 = highbd_convolve12_4_2d_v(
          s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, y_filter_0_7,
          y_filter_8_11, round1_shift_s32, offset_s32, max);
      uint16x4_t d3 = highbd_convolve12_4_2d_v(
          s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, y_filter_0_7,
          y_filter_8_11, round1_shift_s32, offset_s32, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      // Slide the eleven-row window down by four rows.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      s7 = s11;
      s8 = s12;
      s9 = s13;
      s10 = s14;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    do {
      int height = h;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      load_s16_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
                    &s9, &s10);
      s += 11 * src_stride;

      do {
        int16x8_t s11, s12, s13, s14;
        load_s16_8x4(s, src_stride, &s11, &s12, &s13, &s14);

        uint16x8_t d0 = highbd_convolve12_8_2d_v(
            s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, y_filter_0_7,
            y_filter_8_11, round1_shift_s32, offset_s32, max);
        uint16x8_t d1 = highbd_convolve12_8_2d_v(
            s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, y_filter_0_7,
            y_filter_8_11, round1_shift_s32, offset_s32, max);
        uint16x8_t d2 = highbd_convolve12_8_2d_v(
            s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, y_filter_0_7,
            y_filter_8_11, round1_shift_s32, offset_s32, max);
        uint16x8_t d3 = highbd_convolve12_8_2d_v(
            s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, y_filter_0_7,
            y_filter_8_11, round1_shift_s32, offset_s32, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        // Slide the eleven-row window down by four rows.
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s7 = s11;
        s8 = s12;
        s9 = s13;
        s10 = s14;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);

      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}

// Horizontal (first) pass of a high bitdepth 2D 6-tap convolution for a row
// of 8 pixels. Taps 1..6 of x_filter are applied on top of the intermediate
// rounding offset; the result is rounded-shifted down by round_0 (shift_s32
// holds -round_0) and saturating-narrowed to u16. No bitdepth clamp here:
// the output is an intermediate value for the vertical pass.
static inline uint16x8_t highbd_convolve6_8_2d_h(const int16x8_t s[6],
                                                 const int16x8_t x_filter,
                                                 const int32x4_t shift_s32,
                                                 const int32x4_t offset) {
  // Values at indices 0 and 7 of x_filter are zero.
  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);

  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s[0]), x_filter_0_3, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_4_7, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 2);

  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s[0]), x_filter_0_3, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_4_7, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 2);

  sum0 = vqrshlq_s32(sum0, shift_s32);
  sum1 = vqrshlq_s32(sum1, shift_s32);

  return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
}

// Horizontal (first) pass of the high bitdepth 2D convolution with a 6-tap
// x filter, processed in 8x4 tiles with a single-row tail loop.
static inline void highbd_convolve_2d_sr_horiz_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
    const int offset) {
  // The smallest block height processed by the SIMD functions is 4, and the
  // horizontal convolution needs to process an extra (filter_taps/2 - 1) lines
  // for the vertical convolution.
  assert(h >= 5);
  const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0);
  const int32x4_t offset_s32 = vdupq_n_s32(offset);

  const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
  int height = h;

  // Main loop: 8x4 tiles while more than four rows remain.
  do {
    int width = w;
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    do {
      int16x8_t s0[6], s1[6], s2[6], s3[6];
      load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                   &s0[4], &s0[5]);
      load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                   &s1[4], &s1[5]);
      load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                   &s2[4], &s2[5]);
      load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                   &s3[4], &s3[5]);

      uint16x8_t d0 =
          highbd_convolve6_8_2d_h(s0, x_filter, shift_s32, offset_s32);
      uint16x8_t d1 =
          highbd_convolve6_8_2d_h(s1, x_filter, shift_s32, offset_s32);
      uint16x8_t d2 =
          highbd_convolve6_8_2d_h(s2, x_filter, shift_s32, offset_s32);
      uint16x8_t d3 =
          highbd_convolve6_8_2d_h(s3, x_filter, shift_s32, offset_s32);

      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src_ptr += 4 * src_stride;
    dst_ptr += 4 * dst_stride;
    height -= 4;
  } while (height > 4);
  // Tail loop: remaining rows one at a time.
  do {
    int width = w;
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    do {
      int16x8_t s0[6];
      load_s16_8x6(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5]);

      uint16x8_t d0 =
          highbd_convolve6_8_2d_h(s0, x_filter, shift_s32, offset_s32);
      vst1q_u16(d, d0);

      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  } while (--height != 0);
}

// Horizontal pass of a 4-tap convolution for a row of 4 pixels. As with the
// other 2D horizontal helpers, the output is an un-clamped intermediate
// value (rounded-shifted by round_0 via shift_s32, which holds -round_0).
static inline uint16x4_t highbd_convolve4_4_2d_h(const int16x4_t s[4],
                                                 const int16x4_t x_filter,
                                                 const int32x4_t shift_s32,
                                                 const int32x4_t offset) {
  int32x4_t sum = vmlal_lane_s16(offset, s[0], x_filter, 0);
  sum = vmlal_lane_s16(sum, s[1], x_filter, 1);
  sum = vmlal_lane_s16(sum, s[2], x_filter, 2);
  sum = vmlal_lane_s16(sum, s[3], x_filter, 3);

  sum = vqrshlq_s32(sum, shift_s32);
  return vqmovun_s32(sum);
}

// Horizontal pass of an 8-tap convolution for a row of 8 pixels (two 32-bit
// accumulators, low and high halves).
static inline uint16x8_t highbd_convolve8_8_2d_h(const int16x8_t s[8],
                                                 const int16x8_t x_filter,
                                                 const int32x4_t shift_s32,
                                                 const int32x4_t offset) {
  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);

  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s[0]), x_filter_0_3, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_0_3, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[6]), x_filter_4_7, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[7]), x_filter_4_7, 3);

  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s[0]), x_filter_0_3, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_0_3, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[6]), x_filter_4_7, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[7]), x_filter_4_7, 3);

  sum0 = vqrshlq_s32(sum0, shift_s32);
  sum1 = vqrshlq_s32(sum1, shift_s32);

  return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
}

// Horizontal (first) pass of the high bitdepth 2D convolution for 8-tap (and
// narrower) x filters. For w == 4 a 4-tap kernel is used (with the source
// shifted right by one sample); wider blocks use the full 8-tap kernel in
// 8x4 tiles. Tail loops handle the final rows one at a time.
static inline void highbd_convolve_2d_sr_horiz_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
    const int offset) {
  // The smallest block height processed by the SIMD functions is 4, and the
  // horizontal convolution needs to process an extra (filter_taps/2 - 1) lines
  // for the vertical convolution.
  assert(h >= 5);
  const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0);
  const int32x4_t offset_s32 = vdupq_n_s32(offset);

  if (w == 4) {
    // 4-tap filters are used for blocks having width <= 4.
    const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
    const int16_t *s = (const int16_t *)(src_ptr + 1);
    uint16_t *d = dst_ptr;

    do {
      int16x4_t s0[4], s1[4], s2[4], s3[4];
      load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
      load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
      load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
      load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);

      uint16x4_t d0 =
          highbd_convolve4_4_2d_h(s0, x_filter, shift_s32, offset_s32);
      uint16x4_t d1 =
          highbd_convolve4_4_2d_h(s1, x_filter, shift_s32, offset_s32);
      uint16x4_t d2 =
          highbd_convolve4_4_2d_h(s2, x_filter, shift_s32, offset_s32);
      uint16x4_t d3 =
          highbd_convolve4_4_2d_h(s3, x_filter, shift_s32, offset_s32);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h > 4);

    // Tail: remaining rows one at a time.
    do {
      int16x4_t s0[4];
      load_s16_4x4(s, 1, &s0[0], &s0[1], &s0[2], &s0[3]);

      uint16x4_t d0 =
          highbd_convolve4_4_2d_h(s0, x_filter, shift_s32, offset_s32);

      vst1_u16(d, d0);

      s += src_stride;
      d += dst_stride;
    } while (--h != 0);
  } else {
    const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
    int height = h;

    do {
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        int16x8_t s0[8], s1[8], s2[8], s3[8];
        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                     &s0[4], &s0[5], &s0[6], &s0[7]);
        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                     &s1[4], &s1[5], &s1[6], &s1[7]);
        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                     &s2[4], &s2[5], &s2[6], &s2[7]);
        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                     &s3[4], &s3[5], &s3[6], &s3[7]);

        uint16x8_t d0 =
            highbd_convolve8_8_2d_h(s0, x_filter, shift_s32, offset_s32);
        uint16x8_t d1 =
            highbd_convolve8_8_2d_h(s1, x_filter, shift_s32, offset_s32);
        uint16x8_t d2 =
            highbd_convolve8_8_2d_h(s2, x_filter, shift_s32, offset_s32);
        uint16x8_t d3 =
            highbd_convolve8_8_2d_h(s3, x_filter, shift_s32, offset_s32);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      height -= 4;
    } while (height > 4);

    // Tail: remaining rows one at a time.
    do {
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        int16x8_t s0[8];
        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                     &s0[4], &s0[5], &s0[6], &s0[7]);

        uint16x8_t d0 =
            highbd_convolve8_8_2d_h(s0, x_filter, shift_s32, offset_s32);
        vst1q_u16(d, d0);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += src_stride;
      dst_ptr += dst_stride;
    } while (--height != 0);
  }
}

// Horizontal pass of a 12-tap convolution for a row of 4 pixels; output is an
// un-clamped intermediate value (rounded-shifted by round_0 via shift_s32).
static inline uint16x4_t highbd_convolve12_4_2d_h(const int16x4_t s[12],
                                                  const int16x8_t x_filter_0_7,
                                                  const int16x4_t x_filter_8_11,
                                                  const int32x4_t shift_s32,
                                                  const int32x4_t offset) {
  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7);
  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7);

  int32x4_t sum = vmlal_lane_s16(offset, s[0], x_filter_0_3, 0);
  sum = vmlal_lane_s16(sum, s[1], x_filter_0_3, 1);
  sum = vmlal_lane_s16(sum, s[2], x_filter_0_3, 2);
  sum = vmlal_lane_s16(sum, s[3], x_filter_0_3, 3);
  sum = vmlal_lane_s16(sum, s[4], x_filter_4_7, 0);
  sum = vmlal_lane_s16(sum, s[5], x_filter_4_7, 1);
  sum = vmlal_lane_s16(sum, s[6], x_filter_4_7, 2);
  sum = vmlal_lane_s16(sum, s[7], x_filter_4_7, 3);
  sum = vmlal_lane_s16(sum, s[8], x_filter_8_11, 0);
  sum = vmlal_lane_s16(sum, s[9], x_filter_8_11, 1);
  sum = vmlal_lane_s16(sum, s[10], x_filter_8_11, 2);
  sum = vmlal_lane_s16(sum, s[11], x_filter_8_11, 3);

  sum = vqrshlq_s32(sum, shift_s32);
  return vqmovun_s32(sum);
}

// As highbd_convolve12_4_2d_h, but for a row of 8 pixels (two 32-bit
// accumulators, low and high halves).
static inline uint16x8_t highbd_convolve12_8_2d_h(const int16x8_t s[12],
                                                  const int16x8_t x_filter_0_7,
                                                  const int16x4_t x_filter_8_11,
                                                  const int32x4_t shift_s32,
                                                  const int32x4_t offset) {
  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7);
  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7);

  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s[0]), x_filter_0_3, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_0_3, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[6]), x_filter_4_7, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[7]), x_filter_4_7, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[8]), x_filter_8_11, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[9]), x_filter_8_11, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[10]), x_filter_8_11, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[11]), x_filter_8_11, 3);

  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s[0]), x_filter_0_3, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_0_3, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[6]), x_filter_4_7, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[7]), x_filter_4_7, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[8]), x_filter_8_11, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[9]), x_filter_8_11, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[10]), x_filter_8_11, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[11]), x_filter_8_11, 3);

  sum0 = vqrshlq_s32(sum0, shift_s32);
  sum1 = vqrshlq_s32(sum1, shift_s32);

  return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
}

// Horizontal (first) pass of the high bitdepth 2D convolution with a 12-tap
// x filter. Body continues on the next span.
static inline void highbd_convolve_2d_sr_horiz_12tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
    const int offset) {
  // The smallest block height processed by the SIMD functions is 4, and the
  // horizontal convolution needs to process an extra (filter_taps/2 - 1) lines
  // for the vertical convolution.
  assert(h >= 5);
  const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0);
  const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
  const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);
  const int32x4_t offset_s32 = vdupq_n_s32(offset);

  if (w == 4) {
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    // Process four rows per iteration while more than four rows remain.
    do {
      int16x4_t s0[12], s1[12], s2[12], s3[12];
      load_s16_4x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                    &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10],
                    &s0[11]);
      load_s16_4x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                    &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10],
                    &s1[11]);
      load_s16_4x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                    &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10],
                    &s2[11]);
      load_s16_4x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                    &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10],
                    &s3[11]);

      uint16x4_t d0 = highbd_convolve12_4_2d_h(s0, x_filter_0_7, x_filter_8_11,
                                               shift_s32, offset_s32);
      uint16x4_t d1 = highbd_convolve12_4_2d_h(s1, x_filter_0_7, x_filter_8_11,
                                               shift_s32, offset_s32);
      uint16x4_t d2 = highbd_convolve12_4_2d_h(s2, x_filter_0_7, x_filter_8_11,
                                               shift_s32, offset_s32);
      uint16x4_t d3 = highbd_convolve12_4_2d_h(s3, x_filter_0_7, x_filter_8_11,
                                               shift_s32, offset_s32);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h > 4);

    // Process the remaining (1 to 4) rows one at a time.
    do {
      int16x4_t s0[12];
      load_s16_4x12(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5],
                    &s0[6], &s0[7], &s0[8], &s0[9], &s0[10], &s0[11]);

      uint16x4_t d0 = highbd_convolve12_4_2d_h(s0, x_filter_0_7, x_filter_8_11,
                                               shift_s32, offset_s32);

      vst1_u16(d, d0);

      s += src_stride;
      d += dst_stride;
    } while (--h != 0);
  } else {
    int height = h;

    // Process blocks of 4 rows x 8 columns while more than four rows remain.
    do {
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        int16x8_t s0[12], s1[12], s2[12], s3[12];
        load_s16_8x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                      &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10],
                      &s0[11]);
        load_s16_8x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                      &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10],
                      &s1[11]);
        load_s16_8x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                      &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10],
                      &s2[11]);
        load_s16_8x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                      &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10],
                      &s3[11]);

        uint16x8_t d0 = highbd_convolve12_8_2d_h(
            s0, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
        uint16x8_t d1 = highbd_convolve12_8_2d_h(
            s1, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
        uint16x8_t d2 = highbd_convolve12_8_2d_h(
            s2, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
        uint16x8_t d3 = highbd_convolve12_8_2d_h(
            s3, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      height -= 4;
    } while (height > 4);

    // Process the remaining (1 to 4) rows one at a time, 8 columns per
    // inner iteration.
    do {
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        int16x8_t s0[12];
        load_s16_8x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                      &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10],
                      &s0[11]);

        uint16x8_t d0 = highbd_convolve12_8_2d_h(
            s0, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
        vst1q_u16(d, d0);

        s += 8;
        d += 8;
        width -= 8;
      } while (width > 0);
      src_ptr += src_stride;
      dst_ptr += dst_stride;
    } while (--height != 0);
  }
}

// High bit-depth 2D (horizontal then vertical) sub-pixel convolution entry
// point. Filters src into an intermediate buffer with the x filter, then
// filters that buffer into dst with the y filter.
void av1_highbd_convolve_2d_sr_neon(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_x,
                                    const InterpFilterParams *filter_params_y,
                                    const int subpel_x_qn,
                                    const int subpel_y_qn,
                                    ConvolveParams *conv_params, int bd) {
  // Blocks with a dimension of 2 are not handled by the SIMD paths below;
  // fall back to the C implementation.
  if (w == 2 || h == 2) {
    av1_highbd_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
                                filter_params_x, filter_params_y, subpel_x_qn,
                                subpel_y_qn, conv_params, bd);
    return;
  }
  DECLARE_ALIGNED(16, uint16_t,
                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
  const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
  // The specialized kernels below handle a minimum of 6 taps.
  const int clamped_x_taps = x_filter_taps < 6 ? 6 : x_filter_taps;

  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
  const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
  // The horizontal pass must produce (taps - 1) extra rows for the vertical
  // pass.
  const int im_h = h + clamped_y_taps - 1;
  const int im_stride = MAX_SB_SIZE;
  const int vert_offset = clamped_y_taps / 2 - 1;
  const int horiz_offset = clamped_x_taps / 2 - 1;
  const int x_offset_initial = (1 << (bd + FILTER_BITS - 1));
  const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  // The extra shim of (1 << (conv_params->round_1 - 1)) allows us to do a
  // simple shift left instead of a rounding saturating shift left.
1826 const int y_offset = 1827 (1 << (conv_params->round_1 - 1)) - (1 << (y_offset_bits - 1)); 1828 1829 const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset; 1830 1831 const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( 1832 filter_params_x, subpel_x_qn & SUBPEL_MASK); 1833 const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( 1834 filter_params_y, subpel_y_qn & SUBPEL_MASK); 1835 1836 if (x_filter_taps > 8) { 1837 highbd_convolve_2d_sr_horiz_12tap_neon(src_ptr, src_stride, im_block, 1838 im_stride, w, im_h, x_filter_ptr, 1839 conv_params, x_offset_initial); 1840 1841 highbd_convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride, 1842 w, h, y_filter_ptr, conv_params, bd, 1843 y_offset); 1844 return; 1845 } 1846 if (x_filter_taps <= 6 && w != 4) { 1847 highbd_convolve_2d_sr_horiz_6tap_neon(src_ptr, src_stride, im_block, 1848 im_stride, w, im_h, x_filter_ptr, 1849 conv_params, x_offset_initial); 1850 } else { 1851 highbd_convolve_2d_sr_horiz_neon(src_ptr, src_stride, im_block, im_stride, 1852 w, im_h, x_filter_ptr, conv_params, 1853 x_offset_initial); 1854 } 1855 1856 if (y_filter_taps <= 6) { 1857 highbd_convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, 1858 w, h, y_filter_ptr, conv_params, bd, 1859 y_offset); 1860 } else { 1861 highbd_convolve_2d_sr_vert_8tap_neon(im_block, im_stride, dst, dst_stride, 1862 w, h, y_filter_ptr, conv_params, bd, 1863 y_offset); 1864 } 1865 } 1866 1867 // Filter used is [64, 64]. 
1868 void av1_highbd_convolve_x_sr_intrabc_neon( 1869 const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, 1870 int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, 1871 ConvolveParams *conv_params, int bd) { 1872 assert(subpel_x_qn == 8); 1873 assert(filter_params_x->taps == 2); 1874 assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS); 1875 (void)filter_params_x; 1876 (void)subpel_x_qn; 1877 (void)conv_params; 1878 (void)bd; 1879 1880 if (w <= 4) { 1881 do { 1882 uint16x4_t s0 = vld1_u16(src); 1883 uint16x4_t s1 = vld1_u16(src + 1); 1884 1885 uint16x4_t d0 = vrhadd_u16(s0, s1); 1886 1887 if (w == 2) { 1888 store_u16_2x1(dst, d0); 1889 } else { 1890 vst1_u16(dst, d0); 1891 } 1892 1893 src += src_stride; 1894 dst += dst_stride; 1895 } while (--h != 0); 1896 } else { 1897 do { 1898 const uint16_t *src_ptr = src; 1899 uint16_t *dst_ptr = dst; 1900 int width = w; 1901 1902 do { 1903 uint16x8_t s0 = vld1q_u16(src_ptr); 1904 uint16x8_t s1 = vld1q_u16(src_ptr + 1); 1905 1906 uint16x8_t d0 = vrhaddq_u16(s0, s1); 1907 1908 vst1q_u16(dst_ptr, d0); 1909 1910 src_ptr += 8; 1911 dst_ptr += 8; 1912 width -= 8; 1913 } while (width != 0); 1914 src += src_stride; 1915 dst += dst_stride; 1916 } while (--h != 0); 1917 } 1918 } 1919 1920 // Filter used is [64, 64]. 
1921 void av1_highbd_convolve_y_sr_intrabc_neon( 1922 const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, 1923 int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, 1924 int bd) { 1925 assert(subpel_y_qn == 8); 1926 assert(filter_params_y->taps == 2); 1927 (void)filter_params_y; 1928 (void)subpel_y_qn; 1929 (void)bd; 1930 1931 if (w <= 4) { 1932 do { 1933 uint16x4_t s0 = vld1_u16(src); 1934 uint16x4_t s1 = vld1_u16(src + src_stride); 1935 1936 uint16x4_t d0 = vrhadd_u16(s0, s1); 1937 1938 if (w == 2) { 1939 store_u16_2x1(dst, d0); 1940 } else { 1941 vst1_u16(dst, d0); 1942 } 1943 1944 src += src_stride; 1945 dst += dst_stride; 1946 } while (--h != 0); 1947 } else { 1948 do { 1949 const uint16_t *src_ptr = src; 1950 uint16_t *dst_ptr = dst; 1951 int height = h; 1952 1953 do { 1954 uint16x8_t s0 = vld1q_u16(src_ptr); 1955 uint16x8_t s1 = vld1q_u16(src_ptr + src_stride); 1956 1957 uint16x8_t d0 = vrhaddq_u16(s0, s1); 1958 1959 vst1q_u16(dst_ptr, d0); 1960 1961 src_ptr += src_stride; 1962 dst_ptr += dst_stride; 1963 } while (--height != 0); 1964 src += 8; 1965 dst += 8; 1966 w -= 8; 1967 } while (w != 0); 1968 } 1969 } 1970 1971 // Both horizontal and vertical passes use the same 2-tap filter: [64, 64]. 
void av1_highbd_convolve_2d_sr_intrabc_neon(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  assert(subpel_x_qn == 8);
  assert(subpel_y_qn == 8);
  assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
  (void)filter_params_x;
  (void)subpel_x_qn;
  (void)filter_params_y;
  (void)subpel_y_qn;
  (void)conv_params;
  (void)bd;

  // Intermediate buffer holding the un-normalized horizontal sums; the
  // vertical pass needs one extra row below the block, hence im_h = h + 1.
  DECLARE_ALIGNED(16, uint16_t,
                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
  int im_h = h + 1;
  int im_stride = MAX_SB_SIZE;

  // +1 rounding term, applied via the second halving add in the vertical
  // pass.
  uint16x8_t vert_offset = vdupq_n_u16(1);

  uint16_t *im = im_block;

  // Horizontal filter: store s[x] + s[x + 1] without any shift; the
  // normalization is folded into the vertical pass.
  if (w <= 4) {
    do {
      uint16x4_t s0 = vld1_u16(src);
      uint16x4_t s1 = vld1_u16(src + 1);

      uint16x4_t d0 = vadd_u16(s0, s1);

      // Safe to store the whole vector, the im buffer is big enough.
      vst1_u16(im, d0);

      src += src_stride;
      im += im_stride;
    } while (--im_h != 0);
  } else {
    do {
      const uint16_t *src_ptr = src;
      uint16_t *im_ptr = im;
      int width = w;

      do {
        uint16x8_t s0 = vld1q_u16(src_ptr);
        uint16x8_t s1 = vld1q_u16(src_ptr + 1);

        uint16x8_t d0 = vaddq_u16(s0, s1);

        vst1q_u16(im_ptr, d0);

        src_ptr += 8;
        im_ptr += 8;
        width -= 8;
      } while (width != 0);
      src += src_stride;
      im += im_stride;
    } while (--im_h != 0);
  }

  im = im_block;

  // Vertical filter: for horizontal sums a (this row) and b (next row),
  // compute (((a + b) >> 1) + 1) >> 1, which equals (a + b + 2) >> 2 -- the
  // correctly rounded average of the four source pixels. Splitting the work
  // into two halving adds keeps the intermediate within 16 bits.
  if (w <= 4) {
    do {
      uint16x4_t s0 = vld1_u16(im);
      uint16x4_t s1 = vld1_u16(im + im_stride);

      uint16x4_t d0 = vhadd_u16(s0, s1);
      d0 = vhadd_u16(d0, vget_low_u16(vert_offset));

      if (w == 2) {
        store_u16_2x1(dst, d0);
      } else {
        vst1_u16(dst, d0);
      }

      im += im_stride;
      dst += dst_stride;
    } while (--h != 0);
  } else {
    do {
      uint16_t *im_ptr = im;
      uint16_t *dst_ptr = dst;
      int height = h;

      do {
        uint16x8_t s0 = vld1q_u16(im_ptr);
        uint16x8_t s1 = vld1q_u16(im_ptr + im_stride);

        uint16x8_t d0 = vhaddq_u16(s0, s1);
        d0 = vhaddq_u16(d0, vert_offset);

        vst1q_u16(dst_ptr, d0);

        im_ptr += im_stride;
        dst_ptr += dst_stride;
      } while (--height != 0);
      im += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}