convolve_neon.h (24355B)
/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_
#define AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_

#include <arm_neon.h>

#include "config/aom_config.h"

#include "aom_dsp/arm/mem_neon.h"
#include "av1/common/convolve.h"
#include "av1/common/filter.h"

// Vertical-pass ("2d_v") helpers for the 2D separable convolution.
//
// The inputs are 16-bit intermediate values produced by the horizontal pass.
// Accumulation is done in 32 bits, the result is rounded and narrowed by
// (2 * FILTER_BITS - ROUND0_BITS), then a bit-depth-dependent offset
// (sub_const = 1 << (bd - 1), with bd == 8 here) is subtracted before
// saturating to unsigned 8 bits.
// NOTE(review): sub_const presumably cancels an offset added during the
// horizontal pass — confirm against the corresponding horizontal kernels.

// Apply a 12-tap vertical filter to one 4-wide column of rows s0..s11.
// Returns the raw 32-bit accumulators; the caller performs the rounding
// shift and offset removal.
static inline int32x4_t convolve12_4_2d_v(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
    const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) {
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);

  int32x4_t sum = vmull_lane_s16(s0, y_filter_0_3, 0);
  sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1);
  sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2);
  sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3);
  sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0);
  sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1);
  sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2);
  sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3);
  sum = vmlal_lane_s16(sum, s8, y_filter_8_11, 0);
  sum = vmlal_lane_s16(sum, s9, y_filter_8_11, 1);
  sum = vmlal_lane_s16(sum, s10, y_filter_8_11, 2);
  sum = vmlal_lane_s16(sum, s11, y_filter_8_11, 3);

  return sum;
}

// Apply a 12-tap vertical filter to one 8-wide column of rows s0..s11,
// then round, remove the sub_const offset and saturate to u8.
static inline uint8x8_t convolve12_8_2d_v(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16x8_t s8,
    const int16x8_t s9, const int16x8_t s10, const int16x8_t s11,
    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
    const int16x8_t sub_const) {
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);

  // Low half of each 8-wide row.
  int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_0_3, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s8), y_filter_8_11, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s9), y_filter_8_11, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s10), y_filter_8_11, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s11), y_filter_8_11, 3);

  // High half of each 8-wide row.
  int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_0_3, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s8), y_filter_8_11, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s9), y_filter_8_11, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s10), y_filter_8_11, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s11), y_filter_8_11, 3);

  // Saturating rounding shift down to 16 bits, then remove the offset and
  // saturate to u8.
  int16x8_t res =
      vcombine_s16(vqrshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS),
                   vqrshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS));
  res = vsubq_s16(res, sub_const);

  return vqmovun_s16(res);
}

// 12-tap vertical pass of the 2D convolution for 8-bit output.
// src_ptr points at 16-bit intermediates from the horizontal pass.
// NOTE(review): both paths process four output rows per iteration, so h must
// be a multiple of 4; the wide path additionally requires w % 8 == 0 —
// callers are expected to guarantee this.
static inline void convolve_2d_sr_vert_12tap_neon(
    int16_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w,
    int h, const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) {
  const int bd = 8;
  const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1));

  if (w <= 4) {
    // Prime the sliding window with the first 11 rows; each loop iteration
    // loads 4 new rows and produces 4 output rows.
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
    load_s16_4x11(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7,
                  &s8, &s9, &s10);
    src_ptr += 11 * src_stride;

    do {
      int16x4_t s11, s12, s13, s14;
      load_s16_4x4(src_ptr, src_stride, &s11, &s12, &s13, &s14);

      int32x4_t d0 = convolve12_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9,
                                       s10, s11, y_filter_0_7, y_filter_8_11);
      int32x4_t d1 = convolve12_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
                                       s11, s12, y_filter_0_7, y_filter_8_11);
      int32x4_t d2 = convolve12_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
                                       s12, s13, y_filter_0_7, y_filter_8_11);
      int32x4_t d3 =
          convolve12_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
                            y_filter_0_7, y_filter_8_11);

      int16x8_t dd01 =
          vcombine_s16(vqrshrn_n_s32(d0, 2 * FILTER_BITS - ROUND0_BITS),
                       vqrshrn_n_s32(d1, 2 * FILTER_BITS - ROUND0_BITS));
      int16x8_t dd23 =
          vcombine_s16(vqrshrn_n_s32(d2, 2 * FILTER_BITS - ROUND0_BITS),
                       vqrshrn_n_s32(d3, 2 * FILTER_BITS - ROUND0_BITS));

      dd01 = vsubq_s16(dd01, sub_const);
      dd23 = vsubq_s16(dd23, sub_const);

      uint8x8_t d01 = vqmovun_s16(dd01);
      uint8x8_t d23 = vqmovun_s16(dd23);

      store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);

      // Slide the 12-row window down by 4 rows.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      s7 = s11;
      s8 = s12;
      s9 = s13;
      s10 = s14;
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      h -= 4;
    } while (h != 0);

  } else {
    // Process the block in 8-wide column strips, full height per strip.
    do {
      int height = h;
      int16_t *s = src_ptr;
      uint8_t *d = dst_ptr;

      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      load_s16_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
                    &s9, &s10);
      s += 11 * src_stride;

      do {
        int16x8_t s11, s12, s13, s14;
        load_s16_8x4(s, src_stride, &s11, &s12, &s13, &s14);

        uint8x8_t d0 =
            convolve12_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
                              y_filter_0_7, y_filter_8_11, sub_const);
        uint8x8_t d1 =
            convolve12_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
                              y_filter_0_7, y_filter_8_11, sub_const);
        uint8x8_t d2 =
            convolve12_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
                              s13, y_filter_0_7, y_filter_8_11, sub_const);
        uint8x8_t d3 =
            convolve12_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
                              s14, y_filter_0_7, y_filter_8_11, sub_const);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        // Slide the 12-row window down by 4 rows.
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s7 = s11;
        s8 = s12;
        s9 = s13;
        s10 = s14;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}

// Apply an 8-tap vertical filter to one 4-wide column of rows s0..s7.
// Returns the rounded 16-bit result (offset removal is left to the caller).
static inline int16x4_t convolve8_4_2d_v(const int16x4_t s0, const int16x4_t s1,
                                         const int16x4_t s2, const int16x4_t s3,
                                         const int16x4_t s4, const int16x4_t s5,
                                         const int16x4_t s6, const int16x4_t s7,
                                         const int16x8_t y_filter) {
  const int16x4_t y_filter_lo = vget_low_s16(y_filter);
  const int16x4_t y_filter_hi = vget_high_s16(y_filter);

  int32x4_t sum = vmull_lane_s16(s0, y_filter_lo, 0);
  sum = vmlal_lane_s16(sum, s1, y_filter_lo, 1);
  sum = vmlal_lane_s16(sum, s2, y_filter_lo, 2);
  sum = vmlal_lane_s16(sum, s3, y_filter_lo, 3);
  sum = vmlal_lane_s16(sum, s4, y_filter_hi, 0);
  sum = vmlal_lane_s16(sum, s5, y_filter_hi, 1);
  sum = vmlal_lane_s16(sum, s6, y_filter_hi, 2);
  sum = vmlal_lane_s16(sum, s7, y_filter_hi, 3);

  return vqrshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS);
}

// Apply an 8-tap vertical filter to one 8-wide column of rows s0..s7, then
// round, remove the sub_const offset and saturate to u8.
static inline uint8x8_t convolve8_8_2d_v(const int16x8_t s0, const int16x8_t s1,
                                         const int16x8_t s2, const int16x8_t s3,
                                         const int16x8_t s4, const int16x8_t s5,
                                         const int16x8_t s6, const int16x8_t s7,
                                         const int16x8_t y_filter,
                                         const int16x8_t sub_const) {
  const int16x4_t y_filter_lo = vget_low_s16(y_filter);
  const int16x4_t y_filter_hi = vget_high_s16(y_filter);

  // Low half of each 8-wide row.
  int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_lo, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_lo, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_lo, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_lo, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_hi, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_hi, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_hi, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_hi, 3);

  // High half of each 8-wide row.
  int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_lo, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_lo, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_lo, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_lo, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_hi, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_hi, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_hi, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_hi, 3);

  int16x8_t res =
      vcombine_s16(vqrshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS),
                   vqrshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS));
  res = vsubq_s16(res, sub_const);

  return vqmovun_s16(res);
}

// 8-tap vertical pass of the 2D convolution for 8-bit output.
// On AArch64 four rows are produced per iteration; the Armv7 fallback
// processes one row at a time.
static inline void convolve_2d_sr_vert_8tap_neon(int16_t *src_ptr,
                                                 int src_stride,
                                                 uint8_t *dst_ptr,
                                                 int dst_stride, int w, int h,
                                                 const int16x8_t y_filter) {
  const int bd = 8;
  const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1));

  if (w <= 4) {
    // Prime the sliding window with the first 7 rows.
    int16x4_t s0, s1, s2, s3, s4, s5, s6;
    load_s16_4x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
    src_ptr += 7 * src_stride;

    do {
#if AOM_ARCH_AARCH64
      int16x4_t s7, s8, s9, s10;
      load_s16_4x4(src_ptr, src_stride, &s7, &s8, &s9, &s10);

      int16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
      int16x4_t d1 = convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
      int16x4_t d2 = convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
      int16x4_t d3 =
          convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);

      uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), sub_const));
      uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), sub_const));

      store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      h -= 4;
#else   // !AOM_ARCH_AARCH64
      // Single-row fallback: the unused upper half of the vector pair is
      // filled with zeros and discarded by the 4-byte store.
      int16x4_t s7 = vld1_s16(src_ptr);
      int16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
      uint8x8_t d01 =
          vqmovun_s16(vsubq_s16(vcombine_s16(d0, vdup_n_s16(0)), sub_const));

      store_u8_4x1(dst_ptr, d01);

      s0 = s1;
      s1 = s2;
      s2 = s3;
      s3 = s4;
      s4 = s5;
      s5 = s6;
      s6 = s7;
      src_ptr += src_stride;
      dst_ptr += dst_stride;
      h--;
#endif  // AOM_ARCH_AARCH64
    } while (h != 0);
  } else {
    // Width is a multiple of 8 and height is a multiple of 4.
    do {
      int height = h;
      int16_t *s = src_ptr;
      uint8_t *d = dst_ptr;

      int16x8_t s0, s1, s2, s3, s4, s5, s6;
      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
      s += 7 * src_stride;

      do {
#if AOM_ARCH_AARCH64
        int16x8_t s7, s8, s9, s10;
        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);

        uint8x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
                                        y_filter, sub_const);
        uint8x8_t d1 = convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8,
                                        y_filter, sub_const);
        uint8x8_t d2 = convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9,
                                        y_filter, sub_const);
        uint8x8_t d3 = convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
                                        y_filter, sub_const);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
#else   // !AOM_ARCH_AARCH64
        int16x8_t s7 = vld1q_s16(s);
        uint8x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
                                        y_filter, sub_const);
        vst1_u8(d, d0);

        s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        s4 = s5;
        s5 = s6;
        s6 = s7;
        s += src_stride;
        d += dst_stride;
        height--;
#endif  // AOM_ARCH_AARCH64
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}

// Apply a 6-tap vertical filter to one 4-wide column of rows s0..s5.
// The six taps occupy lanes 1..6 of the 8-element filter (lanes 0 and 7 are
// not referenced).
static inline int16x4_t convolve6_4_2d_v(const int16x4_t s0, const int16x4_t s1,
                                         const int16x4_t s2, const int16x4_t s3,
                                         const int16x4_t s4, const int16x4_t s5,
                                         const int16x8_t y_filter) {
  const int16x4_t y_filter_lo = vget_low_s16(y_filter);
  const int16x4_t y_filter_hi = vget_high_s16(y_filter);

  int32x4_t sum = vmull_lane_s16(s0, y_filter_lo, 1);
  sum = vmlal_lane_s16(sum, s1, y_filter_lo, 2);
  sum = vmlal_lane_s16(sum, s2, y_filter_lo, 3);
  sum = vmlal_lane_s16(sum, s3, y_filter_hi, 0);
  sum = vmlal_lane_s16(sum, s4, y_filter_hi, 1);
  sum = vmlal_lane_s16(sum, s5, y_filter_hi, 2);

  return vqrshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS);
}

// Apply a 6-tap vertical filter to one 8-wide column of rows s0..s5, then
// round, remove the sub_const offset and saturate to u8. Taps are in lanes
// 1..6 of y_filter.
static inline uint8x8_t convolve6_8_2d_v(const int16x8_t s0, const int16x8_t s1,
                                         const int16x8_t s2, const int16x8_t s3,
                                         const int16x8_t s4, const int16x8_t s5,
                                         const int16x8_t y_filter,
                                         const int16x8_t sub_const) {
  const int16x4_t y_filter_lo = vget_low_s16(y_filter);
  const int16x4_t y_filter_hi = vget_high_s16(y_filter);

  // Low half of each 8-wide row.
  int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_lo, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_lo, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_lo, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_hi, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_hi, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_hi, 2);

  // High half of each 8-wide row.
  int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_lo, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_lo, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_lo, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_hi, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_hi, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_hi, 2);

  int16x8_t res =
      vcombine_s16(vqrshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS),
                   vqrshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS));
  res = vsubq_s16(res, sub_const);

  return vqmovun_s16(res);
}

// 6-tap vertical pass of the 2D convolution for 8-bit output.
// On AArch64 four rows are produced per iteration; the Armv7 fallback
// processes one row at a time.
static inline void convolve_2d_sr_vert_6tap_neon(int16_t *src_ptr,
                                                 int src_stride,
                                                 uint8_t *dst_ptr,
                                                 int dst_stride, int w, int h,
                                                 const int16x8_t y_filter) {
  const int bd = 8;
  const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1));

  if (w <= 4) {
    // Prime the sliding window with the first 5 rows.
    int16x4_t s0, s1, s2, s3, s4;
    load_s16_4x5(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4);
    src_ptr += 5 * src_stride;

    do {
#if AOM_ARCH_AARCH64
      int16x4_t s5, s6, s7, s8;
      load_s16_4x4(src_ptr, src_stride, &s5, &s6, &s7, &s8);

      int16x4_t d0 = convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter);
      int16x4_t d1 = convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter);
      int16x4_t d2 = convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter);
      int16x4_t d3 = convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter);

      uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), sub_const));
      uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), sub_const));

      store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      h -= 4;
#else   // !AOM_ARCH_AARCH64
      // Single-row fallback; the zero upper half is discarded by the
      // 4-byte store.
      int16x4_t s5 = vld1_s16(src_ptr);
      int16x4_t d0 = convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter);
      uint8x8_t d01 =
          vqmovun_s16(vsubq_s16(vcombine_s16(d0, vdup_n_s16(0)), sub_const));

      store_u8_4x1(dst_ptr, d01);

      s0 = s1;
      s1 = s2;
      s2 = s3;
      s3 = s4;
      s4 = s5;
      src_ptr += src_stride;
      dst_ptr += dst_stride;
      h--;
#endif  // AOM_ARCH_AARCH64
    } while (h != 0);
  } else {
    // Width is a multiple of 8 and height is a multiple of 4.
    do {
      int height = h;
      int16_t *s = src_ptr;
      uint8_t *d = dst_ptr;

      int16x8_t s0, s1, s2, s3, s4;
      load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
      s += 5 * src_stride;

      do {
#if AOM_ARCH_AARCH64
        int16x8_t s5, s6, s7, s8;
        load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);

        uint8x8_t d0 =
            convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, sub_const);
        uint8x8_t d1 =
            convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter, sub_const);
        uint8x8_t d2 =
            convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter, sub_const);
        uint8x8_t d3 =
            convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter, sub_const);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
#else   // !AOM_ARCH_AARCH64
        int16x8_t s5 = vld1q_s16(s);
        uint8x8_t d0 =
            convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, sub_const);
        vst1_u8(d, d0);

        s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        s4 = s5;
        s += src_stride;
        d += dst_stride;
        height--;
#endif  // AOM_ARCH_AARCH64
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}

// Apply a 4-tap vertical filter to one 4-wide column of rows s0..s3.
// Returns the rounded 16-bit result (offset removal is left to the caller).
static inline int16x4_t convolve4_4_2d_v(const int16x4_t s0, const int16x4_t s1,
                                         const int16x4_t s2, const int16x4_t s3,
                                         const int16x4_t y_filter) {
  int32x4_t sum = vmull_lane_s16(s0, y_filter, 0);
  sum = vmlal_lane_s16(sum, s1, y_filter, 1);
  sum = vmlal_lane_s16(sum, s2, y_filter, 2);
  sum = vmlal_lane_s16(sum, s3, y_filter, 3);

  return vqrshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS);
}

// Apply a 4-tap vertical filter to one 8-wide column of rows s0..s3, then
// round, remove the sub_const offset and saturate to u8.
static inline uint8x8_t convolve4_8_2d_v(const int16x8_t s0, const int16x8_t s1,
                                         const int16x8_t s2, const int16x8_t s3,
                                         const int16x4_t y_filter,
                                         const int16x8_t sub_const) {
  int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter, 3);

  int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter, 3);

  int16x8_t res =
      vcombine_s16(vqrshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS),
                   vqrshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS));
  res = vsubq_s16(res, sub_const);

  return vqmovun_s16(res);
}

// 4-tap vertical pass of the 2D convolution for 8-bit output.
// The four taps are loaded from y_filter[2..5], i.e. the middle of the
// 8-element filter array.
static inline void convolve_2d_sr_vert_4tap_neon(int16_t *src_ptr,
                                                 int src_stride,
                                                 uint8_t *dst_ptr,
                                                 int dst_stride, int w, int h,
                                                 const int16_t *y_filter) {
  const int bd = 8;
  const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1));

  const int16x4_t filter = vld1_s16(y_filter + 2);

  if (w == 4) {
    // Prime the sliding window with the first 3 rows; four output rows are
    // produced per iteration (h must be a multiple of 4).
    int16x4_t s0, s1, s2;
    load_s16_4x3(src_ptr, src_stride, &s0, &s1, &s2);
    src_ptr += 3 * src_stride;

    do {
      int16x4_t s3, s4, s5, s6;
      load_s16_4x4(src_ptr, src_stride, &s3, &s4, &s5, &s6);

      int16x4_t d0 = convolve4_4_2d_v(s0, s1, s2, s3, filter);
      int16x4_t d1 = convolve4_4_2d_v(s1, s2, s3, s4, filter);
      int16x4_t d2 = convolve4_4_2d_v(s2, s3, s4, s5, filter);
      int16x4_t d3 = convolve4_4_2d_v(s3, s4, s5, s6, filter);

      uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), sub_const));
      uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), sub_const));

      store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);

      s0 = s4;
      s1 = s5;
      s2 = s6;

      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    // Width is a multiple of 8 and height is a multiple of 4.
    do {
      int height = h;
      int16_t *s = src_ptr;
      uint8_t *d = dst_ptr;

      int16x8_t s0, s1, s2;
      load_s16_8x3(s, src_stride, &s0, &s1, &s2);
      s += 3 * src_stride;

      do {
        int16x8_t s3, s4, s5, s6;
        load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6);

        uint8x8_t d0 = convolve4_8_2d_v(s0, s1, s2, s3, filter, sub_const);
        uint8x8_t d1 = convolve4_8_2d_v(s1, s2, s3, s4, filter, sub_const);
        uint8x8_t d2 = convolve4_8_2d_v(s2, s3, s4, s5, filter, sub_const);
        uint8x8_t d3 = convolve4_8_2d_v(s3, s4, s5, s6, filter, sub_const);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;

        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}

#endif  // AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_