highbd_sadxd_neon.c (25781B)
/*
 * Copyright (c) 2023 The WebM project authors. All rights reserved.
 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

// Multi-reference sum-of-absolute-differences (SAD) kernels for 16-bit
// samples, written with Arm NEON intrinsics.
//
// Layout of this file:
//   * "x4d" kernels compare one source block against four reference blocks
//     and write four totals; "x3d" kernels compare against three and write
//     three.
//   * "_small_" kernels are the 4- and 8-sample-wide variants used for the
//     shortest blocks; "_large_" kernels accumulate through sad8_neon() into
//     32-bit lanes.
//   * The HBD_SAD_* macros near the end of each section stamp out the
//     non-static aom_highbd_sad<W>x<H>x{3,4}d_neon() entry points, one per
//     block size.
//
// In every kernel the uint8_t* arguments are passed through
// CONVERT_TO_SHORTPTR (defined outside this file) and the resulting
// const uint16_t* is what is actually dereferenced, so strides are applied in
// units of uint16_t elements.

#include <arm_neon.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"

// SAD of a 4-sample-wide, h-row source block against four reference blocks.
//   src_ptr / src_stride:  source handle and its row pitch.
//   ref_ptr[0..3]:         the four reference handles; all share ref_stride.
//   res[0..3]:             receives the four results in one 128-bit store.
//   h:                     row count. The do/while body always runs once, so
//                          h must be at least 1.
static inline void highbd_sad4xhx4d_small_neon(const uint8_t *src_ptr,
                                               int src_stride,
                                               const uint8_t *const ref_ptr[4],
                                               int ref_stride, uint32_t res[4],
                                               int h) {
  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
  const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);

  // One 4 x 32-bit accumulator per reference.
  uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
                        vdupq_n_u32(0) };

  int i = 0;
  do {
    // A row is exactly four 16-bit samples, i.e. one 64-bit load each.
    uint16x4_t s = vld1_u16(src16_ptr + i * src_stride);
    uint16x4_t r0 = vld1_u16(ref16_ptr0 + i * ref_stride);
    uint16x4_t r1 = vld1_u16(ref16_ptr1 + i * ref_stride);
    uint16x4_t r2 = vld1_u16(ref16_ptr2 + i * ref_stride);
    uint16x4_t r3 = vld1_u16(ref16_ptr3 + i * ref_stride);

    // vabal_u16: widening absolute difference, accumulated into 32-bit lanes.
    sum[0] = vabal_u16(sum[0], s, r0);
    sum[1] = vabal_u16(sum[1], s, r1);
    sum[2] = vabal_u16(sum[2], s, r2);
    sum[3] = vabal_u16(sum[3], s, r3);

  } while (++i < h);

  // horizontal_add_4d_u32x4() is defined outside this file; presumably it
  // reduces each of the four vectors to its lane total and packs the four
  // totals into one vector.
  vst1q_u32(res, horizontal_add_4d_u32x4(sum));
}

// SAD of an 8-sample-wide, h-row source block against four reference blocks,
// accumulating in 16-bit lanes and widening only once after the loop.
//
// NOTE(review): vabaq_u16 does not widen, so each 16-bit lane holds the sum of
// h absolute differences and wraps if that exceeds 65535. Every use of this
// kernel in this file passes h <= 16; presumably that is safe for the sample
// bit depths in use (e.g. up to 12-bit) - confirm before using it with a
// larger h.
//
// Parameters are as for highbd_sad4xhx4d_small_neon(); h must be at least 1.
static inline void highbd_sad8xhx4d_small_neon(const uint8_t *src_ptr,
                                               int src_stride,
                                               const uint8_t *const ref_ptr[4],
                                               int ref_stride, uint32_t res[4],
                                               int h) {
  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
  const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);

  uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                        vdupq_n_u16(0) };
  uint32x4_t sum_u32[4];

  int i = 0;
  do {
    uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride);

    // vabaq_u16: per-lane absolute difference added to a same-width lane.
    sum[0] = vabaq_u16(sum[0], s, vld1q_u16(ref16_ptr0 + i * ref_stride));
    sum[1] = vabaq_u16(sum[1], s, vld1q_u16(ref16_ptr1 + i * ref_stride));
    sum[2] = vabaq_u16(sum[2], s, vld1q_u16(ref16_ptr2 + i * ref_stride));
    sum[3] = vabaq_u16(sum[3], s, vld1q_u16(ref16_ptr3 + i * ref_stride));

  } while (++i < h);

  // Pairwise-add adjacent 16-bit lanes into 32-bit lanes before the final
  // reduction.
  sum_u32[0] = vpaddlq_u16(sum[0]);
  sum_u32[1] = vpaddlq_u16(sum[1]);
  sum_u32[2] = vpaddlq_u16(sum[2]);
  sum_u32[3] = vpaddlq_u16(sum[3]);
  vst1q_u32(res, horizontal_add_4d_u32x4(sum_u32));
}

// Adds the absolute differences of eight 16-bit lanes to *sad_sum.
// vabdq_u16 produces the per-lane |src - ref|; vpadalq_u16 then adds each
// adjacent pair, widened to 32 bits, into the four accumulator lanes.
static inline void sad8_neon(uint16x8_t src, uint16x8_t ref,
                             uint32x4_t *const sad_sum) {
  uint16x8_t abs_diff = vabdq_u16(src, ref);
  *sad_sum = vpadalq_u16(*sad_sum, abs_diff);
}

#if !CONFIG_REALTIME_ONLY
// SAD of an 8-sample-wide, h-row source block against four reference blocks,
// accumulating in 32-bit lanes via sad8_neon(). Only compiled when
// CONFIG_REALTIME_ONLY is 0; its single user in this file is the 8x32 entry
// point, which is guarded the same way.
//
// Parameters are as for highbd_sad4xhx4d_small_neon(); h must be at least 1.
static inline void highbd_sad8xhx4d_large_neon(const uint8_t *src_ptr,
                                               int src_stride,
                                               const uint8_t *const ref_ptr[4],
                                               int ref_stride, uint32_t res[4],
                                               int h) {
  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
  const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);

  uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
                        vdupq_n_u32(0) };

  int i = 0;
  do {
    // The source row is loaded once and compared with all four references.
    uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride);
    sad8_neon(s, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum[0]);
    sad8_neon(s, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum[1]);
    sad8_neon(s, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum[2]);
    sad8_neon(s, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum[3]);

  } while (++i < h);

  vst1q_u32(res, horizontal_add_4d_u32x4(sum));
}
#endif  // !CONFIG_REALTIME_ONLY

// SAD of a 16-sample-wide, h-row source block against four reference blocks.
// Each row is handled as two 8-lane halves: columns 0-7 accumulate into
// sum_lo[], columns 8-15 into sum_hi[]; the two sets are added after the loop.
//
// Parameters are as for highbd_sad4xhx4d_small_neon(); h must be at least 1.
static inline void highbd_sad16xhx4d_large_neon(const uint8_t *src_ptr,
                                                int src_stride,
                                                const uint8_t *const ref_ptr[4],
                                                int ref_stride, uint32_t res[4],
                                                int h) {
  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
  const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);

  // Two independent accumulator sets; presumably kept separate to shorten the
  // dependency chain on any one register.
  uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
                           vdupq_n_u32(0) };
  uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
                           vdupq_n_u32(0) };
  uint32x4_t sum[4];

  int i = 0;
  do {
    // Columns 0-7 of row i.
    uint16x8_t s0 = vld1q_u16(src16_ptr + i * src_stride);
    sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum_lo[0]);
    sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum_lo[1]);
    sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum_lo[2]);
    sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum_lo[3]);

    // Columns 8-15 of row i.
    uint16x8_t s1 = vld1q_u16(src16_ptr + i * src_stride + 8);
    sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + 8), &sum_hi[0]);
    sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + 8), &sum_hi[1]);
    sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + 8), &sum_hi[2]);
    sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + 8), &sum_hi[3]);

  } while (++i < h);

  sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
  sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
  sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
  sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);

  vst1q_u32(res, horizontal_add_4d_u32x4(sum));
}

// SAD of a w-sample-wide, h-row source block against four reference blocks.
// The inner loop consumes 32 columns per iteration (four 8-lane loads) and
// always runs at least once, so w is expected to be a positive multiple of
// 32; the wrappers below pass 32, 64 and 128. h must be at least 1.
//
// Other parameters are as for highbd_sad4xhx4d_small_neon().
static inline void highbd_sadwxhx4d_large_neon(const uint8_t *src_ptr,
                                               int src_stride,
                                               const uint8_t *const ref_ptr[4],
                                               int ref_stride, uint32_t res[4],
                                               int w, int h) {
  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
  const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);

  // The 8-column groups alternate between the two accumulator sets:
  // offsets +0 and +16 go to sum_lo[], offsets +8 and +24 go to sum_hi[].
  uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
                           vdupq_n_u32(0) };
  uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
                           vdupq_n_u32(0) };
  uint32x4_t sum[4];

  int i = 0;
  do {
    int j = 0;
    do {
      // Columns j .. j+7.
      uint16x8_t s0 = vld1q_u16(src16_ptr + i * src_stride + j);
      sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride + j), &sum_lo[0]);
      sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride + j), &sum_lo[1]);
      sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride + j), &sum_lo[2]);
      sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride + j), &sum_lo[3]);

      // Columns j+8 .. j+15.
      uint16x8_t s1 = vld1q_u16(src16_ptr + i * src_stride + j + 8);
      sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 8), &sum_hi[0]);
      sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 8), &sum_hi[1]);
      sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 8), &sum_hi[2]);
      sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 8), &sum_hi[3]);

      // Columns j+16 .. j+23.
      uint16x8_t s2 = vld1q_u16(src16_ptr + i * src_stride + j + 16);
      sad8_neon(s2, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 16),
                &sum_lo[0]);
      sad8_neon(s2, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 16),
                &sum_lo[1]);
      sad8_neon(s2, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 16),
                &sum_lo[2]);
      sad8_neon(s2, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 16),
                &sum_lo[3]);

      // Columns j+24 .. j+31.
      uint16x8_t s3 = vld1q_u16(src16_ptr + i * src_stride + j + 24);
      sad8_neon(s3, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 24),
                &sum_hi[0]);
      sad8_neon(s3, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 24),
                &sum_hi[1]);
      sad8_neon(s3, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 24),
                &sum_hi[2]);
      sad8_neon(s3, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 24),
                &sum_hi[3]);

      j += 32;
    } while (j < w);

  } while (++i < h);

  sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
  sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
  sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
  sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);

  vst1q_u32(res, horizontal_add_4d_u32x4(sum));
}

// Fixed-width front ends for highbd_sadwxhx4d_large_neon(). They exist so the
// generator macros below can build a kernel name from the block width by
// token pasting.

// 128-wide: forwards with w = 128.
static inline void highbd_sad128xhx4d_large_neon(
    const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4],
    int ref_stride, uint32_t res[4], int h) {
  highbd_sadwxhx4d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res,
                              128, h);
}

// 64-wide: forwards with w = 64.
static inline void highbd_sad64xhx4d_large_neon(const uint8_t *src_ptr,
                                                int src_stride,
                                                const uint8_t *const ref_ptr[4],
                                                int ref_stride, uint32_t res[4],
                                                int h) {
  highbd_sadwxhx4d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 64,
                              h);
}

// 32-wide: forwards with w = 32.
static inline void highbd_sad32xhx4d_large_neon(const uint8_t *src_ptr,
                                                int src_stride,
                                                const uint8_t *const ref_ptr[4],
                                                int ref_stride, uint32_t res[4],
                                                int h) {
  highbd_sadwxhx4d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 32,
                              h);
}

// Defines the non-static function aom_highbd_sad<w>x<h>x4d_neon(), which
// forwards to highbd_sad<w>xhx4d_small_neon() with the row count fixed to h.
// (No comments are placed inside the macro body: a trailing backslash would
// splice the following line into a // comment.)
#define HBD_SAD_WXH_4D_SMALL_NEON(w, h)                                      \
  void aom_highbd_sad##w##x##h##x4d_neon(                                    \
      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
      int ref_stride, uint32_t sad_array[4]) {                               \
    highbd_sad##w##xhx4d_small_neon(src, src_stride, ref_array, ref_stride,  \
                                    sad_array, (h));                         \
  }

// Same as HBD_SAD_WXH_4D_SMALL_NEON but forwards to
// highbd_sad<w>xhx4d_large_neon().
#define HBD_SAD_WXH_4D_LARGE_NEON(w, h)                                      \
  void aom_highbd_sad##w##x##h##x4d_neon(                                    \
      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
      int ref_stride, uint32_t sad_array[4]) {                               \
    highbd_sad##w##xhx4d_large_neon(src, src_stride, ref_array, ref_stride,  \
                                    sad_array, (h));                         \
  }

// Four-reference entry points. Widths 4 and 8 use the small kernels (the
// 8-wide one only up to h = 16); everything else uses the large kernels.
HBD_SAD_WXH_4D_SMALL_NEON(4, 4)
HBD_SAD_WXH_4D_SMALL_NEON(4, 8)

HBD_SAD_WXH_4D_SMALL_NEON(8, 4)
HBD_SAD_WXH_4D_SMALL_NEON(8, 8)
HBD_SAD_WXH_4D_SMALL_NEON(8, 16)

HBD_SAD_WXH_4D_LARGE_NEON(16, 8)
HBD_SAD_WXH_4D_LARGE_NEON(16, 16)
HBD_SAD_WXH_4D_LARGE_NEON(16, 32)

HBD_SAD_WXH_4D_LARGE_NEON(32, 16)
HBD_SAD_WXH_4D_LARGE_NEON(32, 32)
HBD_SAD_WXH_4D_LARGE_NEON(32, 64)

HBD_SAD_WXH_4D_LARGE_NEON(64, 32)
HBD_SAD_WXH_4D_LARGE_NEON(64, 64)
HBD_SAD_WXH_4D_LARGE_NEON(64, 128)

HBD_SAD_WXH_4D_LARGE_NEON(128, 64)
HBD_SAD_WXH_4D_LARGE_NEON(128, 128)

// Additional block sizes that are only built when CONFIG_REALTIME_ONLY is 0.
#if !CONFIG_REALTIME_ONLY
HBD_SAD_WXH_4D_SMALL_NEON(4, 16)

HBD_SAD_WXH_4D_LARGE_NEON(8, 32)

HBD_SAD_WXH_4D_LARGE_NEON(16, 4)
HBD_SAD_WXH_4D_LARGE_NEON(16, 64)

HBD_SAD_WXH_4D_LARGE_NEON(32, 8)

HBD_SAD_WXH_4D_LARGE_NEON(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

// Defines aom_highbd_sad_skip_<w>x<h>x4d_neon(). The small kernel is run with
// both strides doubled and the row count halved, so only rows 0, 2, 4, ... are
// read; each of the four results is then doubled with a left shift by one.
#define HBD_SAD_SKIP_WXH_4D_SMALL_NEON(w, h)                                 \
  void aom_highbd_sad_skip_##w##x##h##x4d_neon(                              \
      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
      int ref_stride, uint32_t sad_array[4]) {                               \
    highbd_sad##w##xhx4d_small_neon(src, 2 * src_stride, ref_array,          \
                                    2 * ref_stride, sad_array, ((h) >> 1));  \
    sad_array[0] <<= 1;                                                      \
    sad_array[1] <<= 1;                                                      \
    sad_array[2] <<= 1;                                                      \
    sad_array[3] <<= 1;                                                      \
  }

// Same as HBD_SAD_SKIP_WXH_4D_SMALL_NEON but built on the large kernels.
#define HBD_SAD_SKIP_WXH_4D_LARGE_NEON(w, h)                                 \
  void aom_highbd_sad_skip_##w##x##h##x4d_neon(                              \
      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
      int ref_stride, uint32_t sad_array[4]) {                               \
    highbd_sad##w##xhx4d_large_neon(src, 2 * src_stride, ref_array,          \
                                    2 * ref_stride, sad_array, ((h) >> 1));  \
    sad_array[0] <<= 1;                                                      \
    sad_array[1] <<= 1;                                                      \
    sad_array[2] <<= 1;                                                      \
    sad_array[3] <<= 1;                                                      \
  }

// Row-skipping four-reference entry points. Because the kernel sees h / 2
// rows, the 8-wide small kernel is usable up to a nominal height of 32 here.
HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 16)

HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 16)
HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 32)

HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 16)
HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 32)
HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 64)

HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 32)
HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 64)
HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 128)

HBD_SAD_SKIP_WXH_4D_LARGE_NEON(128, 64)
HBD_SAD_SKIP_WXH_4D_LARGE_NEON(128, 128)

#if !CONFIG_REALTIME_ONLY
HBD_SAD_SKIP_WXH_4D_SMALL_NEON(4, 16)

HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 32)

HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 64)

HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

// SAD of a 4-sample-wide, h-row source block against three reference blocks.
// The signature keeps the four-element array declarations of the x4d kernels,
// but only ref_ptr[0..2] are read and only res[0..2] are written; ref_ptr[3]
// and res[3] are not touched. h must be at least 1.
static inline void highbd_sad4xhx3d_small_neon(const uint8_t *src_ptr,
                                               int src_stride,
                                               const uint8_t *const ref_ptr[4],
                                               int ref_stride, uint32_t res[4],
                                               int h) {
  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);

  uint32x4_t sum[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };

  int i = 0;
  do {
    uint16x4_t s = vld1_u16(src16_ptr + i * src_stride);
    uint16x4_t r0 = vld1_u16(ref16_ptr0 + i * ref_stride);
    uint16x4_t r1 = vld1_u16(ref16_ptr1 + i * ref_stride);
    uint16x4_t r2 = vld1_u16(ref16_ptr2 + i * ref_stride);

    // Widening absolute-difference accumulate into 32-bit lanes.
    sum[0] = vabal_u16(sum[0], s, r0);
    sum[1] = vabal_u16(sum[1], s, r1);
    sum[2] = vabal_u16(sum[2], s, r2);

  } while (++i < h);

  // horizontal_add_u32x4() is defined outside this file; presumably it
  // returns the sum of the four lanes.
  res[0] = horizontal_add_u32x4(sum[0]);
  res[1] = horizontal_add_u32x4(sum[1]);
  res[2] = horizontal_add_u32x4(sum[2]);
}

// SAD of an 8-sample-wide, h-row source block against three reference blocks,
// accumulating in 16-bit lanes and widening once after the loop.
//
// NOTE(review): as with the x4d small kernel, the non-widening vabaq_u16
// accumulation wraps if a lane total exceeds 65535. Every use in this file
// passes h <= 16; presumably safe for the sample bit depths in use - confirm
// before using it with a larger h.
//
// Only ref_ptr[0..2] are read and only res[0..2] are written. h must be at
// least 1.
static inline void highbd_sad8xhx3d_small_neon(const uint8_t *src_ptr,
                                               int src_stride,
                                               const uint8_t *const ref_ptr[4],
                                               int ref_stride, uint32_t res[4],
                                               int h) {
  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);

  uint16x8_t sum[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };

  int i = 0;
  do {
    uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride);

    sum[0] = vabaq_u16(sum[0], s, vld1q_u16(ref16_ptr0 + i * ref_stride));
    sum[1] = vabaq_u16(sum[1], s, vld1q_u16(ref16_ptr1 + i * ref_stride));
    sum[2] = vabaq_u16(sum[2], s, vld1q_u16(ref16_ptr2 + i * ref_stride));

  } while (++i < h);

  // Widen pairwise to 32-bit lanes, then reduce each vector.
  res[0] = horizontal_add_u32x4(vpaddlq_u16(sum[0]));
  res[1] = horizontal_add_u32x4(vpaddlq_u16(sum[1]));
  res[2] = horizontal_add_u32x4(vpaddlq_u16(sum[2]));
}

#if !CONFIG_REALTIME_ONLY
// SAD of an 8-sample-wide, h-row source block against three reference blocks,
// accumulating in 32-bit lanes via sad8_neon(). Only compiled when
// CONFIG_REALTIME_ONLY is 0; its single user in this file is the 8x32 x3d
// entry point, which is guarded the same way.
//
// Only ref_ptr[0..2] are read and only res[0..2] are written. h must be at
// least 1.
static inline void highbd_sad8xhx3d_large_neon(const uint8_t *src_ptr,
                                               int src_stride,
                                               const uint8_t *const ref_ptr[4],
                                               int ref_stride, uint32_t res[4],
                                               int h) {
  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);

  uint32x4_t sum[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };

  int i = 0;
  do {
    uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride);
    uint16x8_t r0 = vld1q_u16(ref16_ptr0 + i * ref_stride);
    uint16x8_t r1 = vld1q_u16(ref16_ptr1 + i * ref_stride);
    uint16x8_t r2 = vld1q_u16(ref16_ptr2 + i * ref_stride);

    sad8_neon(s, r0, &sum[0]);
    sad8_neon(s, r1, &sum[1]);
    sad8_neon(s, r2, &sum[2]);

  } while (++i < h);

  res[0] = horizontal_add_u32x4(sum[0]);
  res[1] = horizontal_add_u32x4(sum[1]);
  res[2] = horizontal_add_u32x4(sum[2]);
}
#endif  // !CONFIG_REALTIME_ONLY

// SAD of a 16-sample-wide, h-row source block against three reference blocks.
// Columns 0-7 accumulate into sum_lo[], columns 8-15 into sum_hi[]; the two
// are added just before the final reduction.
//
// Only ref_ptr[0..2] are read and only res[0..2] are written. h must be at
// least 1.
static inline void highbd_sad16xhx3d_large_neon(const uint8_t *src_ptr,
                                                int src_stride,
                                                const uint8_t *const ref_ptr[4],
                                                int ref_stride, uint32_t res[4],
                                                int h) {
  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);

  uint32x4_t sum_lo[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
  uint32x4_t sum_hi[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };

  int i = 0;
  do {
    // Columns 0-7 of row i.
    uint16x8_t s0 = vld1q_u16(src16_ptr + i * src_stride);
    sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum_lo[0]);
    sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum_lo[1]);
    sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum_lo[2]);

    // Columns 8-15 of row i.
    uint16x8_t s1 = vld1q_u16(src16_ptr + i * src_stride + 8);
    sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + 8), &sum_hi[0]);
    sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + 8), &sum_hi[1]);
    sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + 8), &sum_hi[2]);

  } while (++i < h);

  res[0] = horizontal_add_u32x4(vaddq_u32(sum_lo[0], sum_hi[0]));
  res[1] = horizontal_add_u32x4(vaddq_u32(sum_lo[1], sum_hi[1]));
  res[2] = horizontal_add_u32x4(vaddq_u32(sum_lo[2], sum_hi[2]));
}

// SAD of a w-sample-wide, h-row source block against three reference blocks.
// The inner loop consumes 32 columns per iteration and always runs at least
// once, so w is expected to be a positive multiple of 32; the wrappers below
// pass 32, 64 and 128. h must be at least 1.
//
// Only ref_ptr[0..2] are read and only res[0..2] are written.
static inline void highbd_sadwxhx3d_large_neon(const uint8_t *src_ptr,
                                               int src_stride,
                                               const uint8_t *const ref_ptr[4],
                                               int ref_stride, uint32_t res[4],
                                               int w, int h) {
  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);

  // The 8-column groups alternate between the two accumulator sets:
  // offsets +0 and +16 go to sum_lo[], offsets +8 and +24 go to sum_hi[].
  uint32x4_t sum_lo[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
  uint32x4_t sum_hi[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
  uint32x4_t sum[3];

  int i = 0;
  do {
    int j = 0;
    do {
      // Columns j .. j+7.
      uint16x8_t s0 = vld1q_u16(src16_ptr + i * src_stride + j);
      sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride + j), &sum_lo[0]);
      sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride + j), &sum_lo[1]);
      sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride + j), &sum_lo[2]);

      // Columns j+8 .. j+15.
      uint16x8_t s1 = vld1q_u16(src16_ptr + i * src_stride + j + 8);
      sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 8), &sum_hi[0]);
      sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 8), &sum_hi[1]);
      sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 8), &sum_hi[2]);

      // Columns j+16 .. j+23.
      uint16x8_t s2 = vld1q_u16(src16_ptr + i * src_stride + j + 16);
      sad8_neon(s2, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 16),
                &sum_lo[0]);
      sad8_neon(s2, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 16),
                &sum_lo[1]);
      sad8_neon(s2, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 16),
                &sum_lo[2]);

      // Columns j+24 .. j+31.
      uint16x8_t s3 = vld1q_u16(src16_ptr + i * src_stride + j + 24);
      sad8_neon(s3, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 24),
                &sum_hi[0]);
      sad8_neon(s3, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 24),
                &sum_hi[1]);
      sad8_neon(s3, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 24),
                &sum_hi[2]);

      j += 32;
    } while (j < w);

  } while (++i < h);

  sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
  sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
  sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);

  res[0] = horizontal_add_u32x4(sum[0]);
  res[1] = horizontal_add_u32x4(sum[1]);
  res[2] = horizontal_add_u32x4(sum[2]);
}

// Fixed-width front ends for highbd_sadwxhx3d_large_neon(). They exist so the
// generator macros below can build a kernel name from the block width by
// token pasting.

// 128-wide: forwards with w = 128.
static inline void highbd_sad128xhx3d_large_neon(
    const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4],
    int ref_stride, uint32_t res[4], int h) {
  highbd_sadwxhx3d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res,
                              128, h);
}

// 64-wide: forwards with w = 64.
static inline void highbd_sad64xhx3d_large_neon(const uint8_t *src_ptr,
                                                int src_stride,
                                                const uint8_t *const ref_ptr[4],
                                                int ref_stride, uint32_t res[4],
                                                int h) {
  highbd_sadwxhx3d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 64,
                              h);
}

// 32-wide: forwards with w = 32.
static inline void highbd_sad32xhx3d_large_neon(const uint8_t *src_ptr,
                                                int src_stride,
                                                const uint8_t *const ref_ptr[4],
                                                int ref_stride, uint32_t res[4],
                                                int h) {
  highbd_sadwxhx3d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 32,
                              h);
}

// Defines the non-static function aom_highbd_sad<w>x<h>x3d_neon(), which
// forwards to highbd_sad<w>xhx3d_small_neon() with the row count fixed to h.
// The generated function writes sad_array[0..2] only.
#define HBD_SAD_WXH_3D_SMALL_NEON(w, h)                                      \
  void aom_highbd_sad##w##x##h##x3d_neon(                                    \
      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
      int ref_stride, uint32_t sad_array[4]) {                               \
    highbd_sad##w##xhx3d_small_neon(src, src_stride, ref_array, ref_stride,  \
                                    sad_array, (h));                         \
  }

// Same as HBD_SAD_WXH_3D_SMALL_NEON but forwards to
// highbd_sad<w>xhx3d_large_neon().
#define HBD_SAD_WXH_3D_LARGE_NEON(w, h)                                      \
  void aom_highbd_sad##w##x##h##x3d_neon(                                    \
      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
      int ref_stride, uint32_t sad_array[4]) {                               \
    highbd_sad##w##xhx3d_large_neon(src, src_stride, ref_array, ref_stride,  \
                                    sad_array, (h));                         \
  }

// Three-reference entry points; the size list mirrors the x4d one above.
HBD_SAD_WXH_3D_SMALL_NEON(4, 4)
HBD_SAD_WXH_3D_SMALL_NEON(4, 8)

HBD_SAD_WXH_3D_SMALL_NEON(8, 4)
HBD_SAD_WXH_3D_SMALL_NEON(8, 8)
HBD_SAD_WXH_3D_SMALL_NEON(8, 16)

HBD_SAD_WXH_3D_LARGE_NEON(16, 8)
HBD_SAD_WXH_3D_LARGE_NEON(16, 16)
HBD_SAD_WXH_3D_LARGE_NEON(16, 32)

HBD_SAD_WXH_3D_LARGE_NEON(32, 16)
HBD_SAD_WXH_3D_LARGE_NEON(32, 32)
HBD_SAD_WXH_3D_LARGE_NEON(32, 64)

HBD_SAD_WXH_3D_LARGE_NEON(64, 32)
HBD_SAD_WXH_3D_LARGE_NEON(64, 64)
HBD_SAD_WXH_3D_LARGE_NEON(64, 128)

HBD_SAD_WXH_3D_LARGE_NEON(128, 64)
HBD_SAD_WXH_3D_LARGE_NEON(128, 128)

// Additional block sizes that are only built when CONFIG_REALTIME_ONLY is 0.
#if !CONFIG_REALTIME_ONLY
HBD_SAD_WXH_3D_SMALL_NEON(4, 16)

HBD_SAD_WXH_3D_LARGE_NEON(8, 32)

HBD_SAD_WXH_3D_LARGE_NEON(16, 4)
HBD_SAD_WXH_3D_LARGE_NEON(16, 64)

HBD_SAD_WXH_3D_LARGE_NEON(32, 8)

HBD_SAD_WXH_3D_LARGE_NEON(64, 16)
#endif  // !CONFIG_REALTIME_ONLY