aom_convolve8_neon.c (16318B)
/*
 *  Copyright (c) 2014 The WebM project authors. All rights reserved.
 *  Copyright (c) 2023, Alliance for Open Media. All rights reserved.
 *
 *  This source code is subject to the terms of the BSD 2 Clause License and
 *  the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 *  was not distributed with this source code in the LICENSE file, you can
 *  obtain it at www.aomedia.org/license/software. If the Alliance for Open
 *  Media Patent License 1.0 was not distributed with this source code in the
 *  PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>
#include <string.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/arm/aom_convolve8_neon.h"
#include "aom_dsp/arm/aom_filter.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_ports/mem.h"

static inline void convolve8_horiz_8tap_neon(const uint8_t *src,
                                             ptrdiff_t src_stride,
                                             uint8_t *dst,
                                             ptrdiff_t dst_stride,
                                             const int16_t *filter_x, int w,
                                             int h) {
  // All filter values are even so halve them to reduce intermediate precision
  // requirements.
  const int16x8_t filter = vshrq_n_s16(vld1q_s16(filter_x), 1);

  if (h == 4) {
    uint8x8_t t0, t1, t2, t3;
    load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
    transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);

    int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
    int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
    int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));

    src += 7;

    do {
      load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
      transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);

      int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
      int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
      int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
      int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));

      int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
      int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter);
      int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter);
      int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter);
      // We halved the filter values so -1 from right shift.
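      // (Sub-pel kernels sum to 1 << FILTER_BITS = 128; with every tap halved
      // the sum is 64, so rounding and narrowing with a shift of
      // FILTER_BITS - 1 yields the same 8-bit result.)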
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);

      transpose_elems_inplace_u8_4x4(&d01, &d23);

      store_u8x4_strided_x2(dst + 0 * dst_stride, 2 * dst_stride, d01);
      store_u8x4_strided_x2(dst + 1 * dst_stride, 2 * dst_stride, d23);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;

      src += 4;
      dst += 4;
      w -= 4;
    } while (w != 0);
  } else {
    if (w == 4) {
      do {
        uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

        int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
                    &t7);
        transpose_elems_u8_4x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2,
                               &t3);

        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));

        uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
        uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
        uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
        uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);

        transpose_elems_inplace_u8_8x4(&d0, &d1, &d2, &d3);

        store_u8x4_strided_x2(dst + 0 * dst_stride, 4 * dst_stride, d0);
        store_u8x4_strided_x2(dst + 1 * dst_stride, 4 * dst_stride, d1);
        store_u8x4_strided_x2(dst + 2 * dst_stride, 4 * dst_stride, d2);
        store_u8x4_strided_x2(dst + 3 * dst_stride, 4 * dst_stride, d3);

        src += 8 * src_stride;
        dst += 8 * dst_stride;
        h -= 8;
      } while (h > 0);
    } else {
      do {
        int width = w;
        const uint8_t *s = src;
        uint8_t *d = dst;

        uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

        int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        s += 7;

        do {
          load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6,
                                         &t7);

          int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
          int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
          int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
          int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
          int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
          int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
          int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
          int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));

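          // With the inputs transposed, the horizontal 8-tap filter is applied
          // across registers s0..s14 exactly like a vertical filter; the
          // outputs are transposed back to row order before storing.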
          uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
          uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
          uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
          uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
          uint8x8_t d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filter);
          uint8x8_t d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filter);
          uint8x8_t d6 =
              convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filter);
          uint8x8_t d7 =
              convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filter);

          transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6,
                                         &d7);

          store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);

          s0 = s8;
          s1 = s9;
          s2 = s10;
          s3 = s11;
          s4 = s12;
          s5 = s13;
          s6 = s14;

          s += 8;
          d += 8;
          width -= 8;
        } while (width != 0);
        src += 8 * src_stride;
        dst += 8 * dst_stride;
        h -= 8;
      } while (h > 0);
    }
  }
}

static inline void convolve8_horiz_4tap_neon(const uint8_t *src,
                                             ptrdiff_t src_stride,
                                             uint8_t *dst,
                                             ptrdiff_t dst_stride,
                                             const int16_t *filter_x, int w,
                                             int h) {
  // All filter values are even, halve to reduce intermediate precision
  // requirements.
  const int16x4_t filter = vshr_n_s16(vld1_s16(filter_x + 2), 1);

  if (w == 4) {
    do {
      uint8x8_t t01[4];

      t01[0] = load_unaligned_u8(src + 0, (int)src_stride);
      t01[1] = load_unaligned_u8(src + 1, (int)src_stride);
      t01[2] = load_unaligned_u8(src + 2, (int)src_stride);
      t01[3] = load_unaligned_u8(src + 3, (int)src_stride);

      int16x8_t s01[4];
      s01[0] = vreinterpretq_s16_u16(vmovl_u8(t01[0]));
      s01[1] = vreinterpretq_s16_u16(vmovl_u8(t01[1]));
      s01[2] = vreinterpretq_s16_u16(vmovl_u8(t01[2]));
      s01[3] = vreinterpretq_s16_u16(vmovl_u8(t01[3]));

      uint8x8_t d01 = convolve4_8(s01[0], s01[1], s01[2], s01[3], filter);

      store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);

      src += 2 * src_stride;
      dst += 2 * dst_stride;
      h -= 2;
    } while (h > 0);
  } else {
    do {
      int width = w;
      const uint8_t *s = src;
      uint8_t *d = dst;

      do {
        uint8x8_t t0[4], t1[4];
        load_u8_8x4(s + 0 * src_stride, 1, &t0[0], &t0[1], &t0[2], &t0[3]);
        load_u8_8x4(s + 1 * src_stride, 1, &t1[0], &t1[1], &t1[2], &t1[3]);

        int16x8_t s0[4], s1[4];
        s0[0] = vreinterpretq_s16_u16(vmovl_u8(t0[0]));
        s0[1] = vreinterpretq_s16_u16(vmovl_u8(t0[1]));
        s0[2] = vreinterpretq_s16_u16(vmovl_u8(t0[2]));
        s0[3] = vreinterpretq_s16_u16(vmovl_u8(t0[3]));

        s1[0] = vreinterpretq_s16_u16(vmovl_u8(t1[0]));
        s1[1] = vreinterpretq_s16_u16(vmovl_u8(t1[1]));
        s1[2] = vreinterpretq_s16_u16(vmovl_u8(t1[2]));
        s1[3] = vreinterpretq_s16_u16(vmovl_u8(t1[3]));

        uint8x8_t d0 = convolve4_8(s0[0], s0[1], s0[2], s0[3], filter);
        uint8x8_t d1 = convolve4_8(s1[0], s1[1], s1[2], s1[3], filter);

        store_u8_8x2(d, dst_stride, d0, d1);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src += 2 * src_stride;
      dst += 2 * dst_stride;
      h -= 2;
    } while (h > 0);
  }
}

void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4, int w,
                              int h) {
  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);

  (void)x_step_q4;
  (void)filter_y;
  (void)y_step_q4;

  src -= ((SUBPEL_TAPS / 2) - 1);

  int filter_taps = get_filter_taps_convolve8(filter_x);

  if (filter_taps == 2) {
    convolve8_horiz_2tap_neon(src + 3, src_stride, dst, dst_stride, filter_x, w,
                              h);
  } else if (filter_taps == 4) {
    convolve8_horiz_4tap_neon(src + 2, src_stride, dst, dst_stride, filter_x, w,
                              h);
  } else {
    convolve8_horiz_8tap_neon(src, src_stride, dst, dst_stride, filter_x, w, h);
  }
}

static inline void convolve8_vert_8tap_neon(const uint8_t *src,
                                            ptrdiff_t src_stride, uint8_t *dst,
                                            ptrdiff_t dst_stride,
                                            const int16_t *filter_y, int w,
                                            int h) {
  // All filter values are even so halve them to reduce intermediate precision
  // requirements.
  const int16x8_t filter = vshrq_n_s16(vld1q_s16(filter_y), 1);

  if (w == 4) {
    uint8x8_t t0, t1, t2, t3, t4, t5, t6;
    load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);

    int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
    int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
    int16x4_t s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4)));
    int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));
    int16x4_t s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6)));

    src += 7 * src_stride;

    do {
      load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);

      int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
      int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
      int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
      int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));

      int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
      int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter);
      int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter);
      int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter);
      // We halved the filter values so -1 from right shift.
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);

      store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    do {
      uint8x8_t t0, t1, t2, t3, t4, t5, t6;
      load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);

      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

      int height = h;
      const uint8_t *s = src + 7 * src_stride;
      uint8_t *d = dst;

      do {
        load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);

        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));

        uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
        uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
        uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
        uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;

        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}

void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int x_step_q4,
                             const int16_t *filter_y, int y_step_q4, int w,
                             int h) {
  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);

  (void)filter_x;
  (void)x_step_q4;
  (void)y_step_q4;

  src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;

  int filter_taps = get_filter_taps_convolve8(filter_y);

  if (filter_taps == 2) {
    convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst, dst_stride,
                             filter_y, w, h);
  } else if (filter_taps == 4) {
    convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst, dst_stride,
                             filter_y, w, h);
  } else {
    convolve8_vert_8tap_neon(src, src_stride, dst, dst_stride, filter_y, w, h);
  }
}
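Below is a minimal caller-side sketch, separate from the file above, showing how the horizontal entry point can be driven. The kernel values, buffer sizes and the example_horiz_filter wrapper are illustrative assumptions rather than libaom tables; the only requirements taken from the code are an even-valued 8-tap kernel summing to 1 << FILTER_BITS, a 4-byte-aligned destination whose stride is a multiple of 4, and 3 pixels of left context plus 4 pixels of right context around each source row.

/* Usage sketch (assumes a libaom build providing the NEON implementation and
 * the RTCD prototypes in config/aom_dsp_rtcd.h). */
#include <stdint.h>
#include <string.h>

#include "config/aom_dsp_rtcd.h"

#define EX_W 16
#define EX_H 16
#define EX_BORDER 3                          /* (SUBPEL_TAPS / 2) - 1 */
#define EX_SRC_STRIDE (EX_BORDER + EX_W + 4) /* + 4 pixels of right context */

void example_horiz_filter(void) {
  /* Source block padded so the 8-tap window never reads out of bounds. */
  uint8_t src[EX_H * EX_SRC_STRIDE];
  memset(src, 128, sizeof(src));

  /* Destination must be 4-byte aligned and its stride a multiple of 4. */
  uint32_t dst_storage[EX_H * EX_W / 4];
  uint8_t *dst = (uint8_t *)dst_storage;

  /* Illustrative even-valued 8-tap kernel summing to 128 (1 << FILTER_BITS). */
  const int16_t filter_x[8] = { 0, 2, -14, 76, 76, -14, 2, 0 };

  /* x_step_q4, filter_y and y_step_q4 are ignored by this path; 16 denotes a
   * unit step, and filter_x is passed for both filters to keep them non-NULL. */
  aom_convolve8_horiz_neon(src + EX_BORDER, EX_SRC_STRIDE, dst, EX_W, filter_x,
                           16, filter_x, 16, EX_W, EX_H);
}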