/* highbd_sad_neon.c */
/*
 * Copyright (c) 2023 The WebM project authors. All rights reserved.
 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"

// Sum of absolute differences (SAD) for a 4-wide, h-tall high-bitdepth block.
// src_ptr/ref_ptr arrive through the common uint8_t* SAD interface but are
// really uint16_t sample buffers; CONVERT_TO_SHORTPTR recovers the typed
// pointers. Strides are in units of uint16_t samples.
static inline uint32_t highbd_sad4xh_small_neon(const uint8_t *src_ptr,
                                                int src_stride,
                                                const uint8_t *ref_ptr,
                                                int ref_stride, int h) {
  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
  uint32x4_t sum = vdupq_n_u32(0);

  int i = h;
  do {
    uint16x4_t s = vld1_u16(src16_ptr);
    uint16x4_t r = vld1_u16(ref16_ptr);
    // Widening absolute-difference-accumulate into 32-bit lanes, so the
    // accumulator cannot overflow for any practical h.
    sum = vabal_u16(sum, s, r);

    src16_ptr += src_stride;
    ref16_ptr += ref_stride;
  } while (--i != 0);

  return horizontal_add_u32x4(sum);
}

// SAD for an 8-wide, h-tall block, accumulating in 16-bit lanes. Only used
// for the "small" heights instantiated below (h <= 16): each lane then sums
// at most 16 absolute differences of 12-bit samples (16 * 4095 < 65535), so
// the uint16x8_t accumulator cannot wrap.
static inline uint32_t highbd_sad8xh_small_neon(const uint8_t *src_ptr,
                                                int src_stride,
                                                const uint8_t *ref_ptr,
                                                int ref_stride, int h) {
  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
  uint16x8_t sum = vdupq_n_u16(0);

  int i = h;
  do {
    uint16x8_t s = vld1q_u16(src16_ptr);
    uint16x8_t r = vld1q_u16(ref16_ptr);
    sum = vabaq_u16(sum, s, r);

    src16_ptr += src_stride;
    ref16_ptr += ref_stride;
  } while (--i != 0);

  return horizontal_add_u16x8(sum);
}

#if !CONFIG_REALTIME_ONLY
// SAD for an 8-wide block that is too tall for 16-bit accumulation: the
// per-row absolute differences are widened into a 32-bit accumulator every
// iteration via vabd + vpadal. Only needed for 8x32, which is a
// !CONFIG_REALTIME_ONLY block size.
static inline uint32_t highbd_sad8xh_large_neon(const uint8_t *src_ptr,
                                                int src_stride,
                                                const uint8_t *ref_ptr,
                                                int ref_stride, int h) {
  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
  uint32x4_t sum_u32 = vdupq_n_u32(0);

  int i = h;
  do {
    uint16x8_t s = vld1q_u16(src16_ptr);
    uint16x8_t r = vld1q_u16(ref16_ptr);
    uint16x8_t sum_u16 = vabdq_u16(s, r);
    // Pairwise add-and-accumulate widens the eight 16-bit diffs into the
    // four 32-bit accumulator lanes.
    sum_u32 = vpadalq_u16(sum_u32, sum_u16);

    src16_ptr += src_stride;
    ref16_ptr += ref_stride;
  } while (--i != 0);

  return horizontal_add_u32x4(sum_u32);
}
#endif  // !CONFIG_REALTIME_ONLY

// SAD for a 16-wide, h-tall block. The two 8-lane columns use independent
// 32-bit accumulators to shorten the dependency chain; they are merged once
// after the loop.
static inline uint32_t highbd_sad16xh_large_neon(const uint8_t *src_ptr,
                                                 int src_stride,
                                                 const uint8_t *ref_ptr,
                                                 int ref_stride, int h) {
  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };

  int i = h;
  do {
    uint16x8_t s0 = vld1q_u16(src16_ptr);
    uint16x8_t r0 = vld1q_u16(ref16_ptr);
    uint16x8_t diff0 = vabdq_u16(s0, r0);
    sum[0] = vpadalq_u16(sum[0], diff0);

    uint16x8_t s1 = vld1q_u16(src16_ptr + 8);
    uint16x8_t r1 = vld1q_u16(ref16_ptr + 8);
    uint16x8_t diff1 = vabdq_u16(s1, r1);
    sum[1] = vpadalq_u16(sum[1], diff1);

    src16_ptr += src_stride;
    ref16_ptr += ref_stride;
  } while (--i != 0);

  sum[0] = vaddq_u32(sum[0], sum[1]);
  return horizontal_add_u32x4(sum[0]);
}

// Generic SAD for wide blocks: w must be a multiple of 32 (the inner loop
// consumes 32 samples per step). Four independent accumulators, merged after
// both loops finish.
static inline uint32_t highbd_sadwxh_large_neon(const uint8_t *src_ptr,
                                                int src_stride,
                                                const uint8_t *ref_ptr,
                                                int ref_stride, int w, int h) {
  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
  uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
                        vdupq_n_u32(0) };

  int i = h;
  do {
    int j = 0;
    do {
      uint16x8_t s0 = vld1q_u16(src16_ptr + j);
      uint16x8_t r0 = vld1q_u16(ref16_ptr + j);
      uint16x8_t diff0 = vabdq_u16(s0, r0);
      sum[0] = vpadalq_u16(sum[0], diff0);

      uint16x8_t s1 = vld1q_u16(src16_ptr + j + 8);
      uint16x8_t r1 = vld1q_u16(ref16_ptr + j + 8);
      uint16x8_t diff1 = vabdq_u16(s1, r1);
      sum[1] = vpadalq_u16(sum[1], diff1);

      uint16x8_t s2 = vld1q_u16(src16_ptr + j + 16);
      uint16x8_t r2 = vld1q_u16(ref16_ptr + j + 16);
      uint16x8_t diff2 = vabdq_u16(s2, r2);
      sum[2] = vpadalq_u16(sum[2], diff2);

      uint16x8_t s3 = vld1q_u16(src16_ptr + j + 24);
      uint16x8_t r3 = vld1q_u16(ref16_ptr + j + 24);
      uint16x8_t diff3 = vabdq_u16(s3, r3);
      sum[3] = vpadalq_u16(sum[3], diff3);

      j += 32;
    } while (j < w);

    src16_ptr += src_stride;
    ref16_ptr += ref_stride;
  } while (--i != 0);

  sum[0] = vaddq_u32(sum[0], sum[1]);
  sum[2] = vaddq_u32(sum[2], sum[3]);
  sum[0] = vaddq_u32(sum[0], sum[2]);

  return horizontal_add_u32x4(sum[0]);
}

// Fixed-width wrappers over highbd_sadwxh_large_neon so the macros below can
// paste a uniform highbd_sad<W>xh_large_neon name for every width.
static inline unsigned int highbd_sad128xh_large_neon(const uint8_t *src_ptr,
                                                      int src_stride,
                                                      const uint8_t *ref_ptr,
                                                      int ref_stride, int h) {
  return highbd_sadwxh_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, 128,
                                  h);
}

static inline unsigned int highbd_sad64xh_large_neon(const uint8_t *src_ptr,
                                                     int src_stride,
                                                     const uint8_t *ref_ptr,
                                                     int ref_stride, int h) {
  return highbd_sadwxh_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64,
                                  h);
}

static inline unsigned int highbd_sad32xh_large_neon(const uint8_t *src_ptr,
                                                     int src_stride,
                                                     const uint8_t *ref_ptr,
                                                     int ref_stride, int h) {
  return highbd_sadwxh_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32,
                                  h);
}

// Generate the public aom_highbd_sad<W>x<H>_neon entry points. The
// SMALL/LARGE split selects the 16-bit vs. 32-bit accumulation kernels
// defined above.
#define HBD_SAD_WXH_SMALL_NEON(w, h)                                      \
  unsigned int aom_highbd_sad##w##x##h##_neon(                            \
      const uint8_t *src, int src_stride, const uint8_t *ref,             \
      int ref_stride) {                                                   \
    return highbd_sad##w##xh_small_neon(src, src_stride, ref, ref_stride, \
                                        (h));                             \
  }

#define HBD_SAD_WXH_LARGE_NEON(w, h)                                      \
  unsigned int aom_highbd_sad##w##x##h##_neon(                            \
      const uint8_t *src, int src_stride, const uint8_t *ref,             \
      int ref_stride) {                                                   \
    return highbd_sad##w##xh_large_neon(src, src_stride, ref, ref_stride, \
                                        (h));                             \
  }

HBD_SAD_WXH_SMALL_NEON(4, 4)
HBD_SAD_WXH_SMALL_NEON(4, 8)

HBD_SAD_WXH_SMALL_NEON(8, 4)
HBD_SAD_WXH_SMALL_NEON(8, 8)
HBD_SAD_WXH_SMALL_NEON(8, 16)

HBD_SAD_WXH_LARGE_NEON(16, 8)
HBD_SAD_WXH_LARGE_NEON(16, 16)
HBD_SAD_WXH_LARGE_NEON(16, 32)

HBD_SAD_WXH_LARGE_NEON(32, 16)
HBD_SAD_WXH_LARGE_NEON(32, 32)
HBD_SAD_WXH_LARGE_NEON(32, 64)

HBD_SAD_WXH_LARGE_NEON(64, 32)
HBD_SAD_WXH_LARGE_NEON(64, 64)
HBD_SAD_WXH_LARGE_NEON(64, 128)

HBD_SAD_WXH_LARGE_NEON(128, 64)
HBD_SAD_WXH_LARGE_NEON(128, 128)

#if !CONFIG_REALTIME_ONLY
HBD_SAD_WXH_SMALL_NEON(4, 16)

HBD_SAD_WXH_LARGE_NEON(8, 32)

HBD_SAD_WXH_LARGE_NEON(16, 4)
HBD_SAD_WXH_LARGE_NEON(16, 64)

HBD_SAD_WXH_LARGE_NEON(32, 8)

HBD_SAD_WXH_LARGE_NEON(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

// "Skip" variants approximate the SAD from every other row: double both
// strides, halve the height, then double the partial sum to compensate.
#define HBD_SAD_SKIP_WXH_SMALL_NEON(w, h)                            \
  unsigned int aom_highbd_sad_skip_##w##x##h##_neon(                 \
      const uint8_t *src, int src_stride, const uint8_t *ref,        \
      int ref_stride) {                                              \
    return 2 * highbd_sad##w##xh_small_neon(src, 2 * src_stride, ref, \
                                            2 * ref_stride, (h) / 2); \
  }

#define HBD_SAD_SKIP_WXH_LARGE_NEON(w, h)                            \
  unsigned int aom_highbd_sad_skip_##w##x##h##_neon(                 \
      const uint8_t *src, int src_stride, const uint8_t *ref,        \
      int ref_stride) {                                              \
    return 2 * highbd_sad##w##xh_large_neon(src, 2 * src_stride, ref, \
                                            2 * ref_stride, (h) / 2); \
  }

HBD_SAD_SKIP_WXH_SMALL_NEON(8, 16)

HBD_SAD_SKIP_WXH_LARGE_NEON(16, 16)
HBD_SAD_SKIP_WXH_LARGE_NEON(16, 32)

HBD_SAD_SKIP_WXH_LARGE_NEON(32, 16)
HBD_SAD_SKIP_WXH_LARGE_NEON(32, 32)
HBD_SAD_SKIP_WXH_LARGE_NEON(32, 64)

HBD_SAD_SKIP_WXH_LARGE_NEON(64, 32)
HBD_SAD_SKIP_WXH_LARGE_NEON(64, 64)
HBD_SAD_SKIP_WXH_LARGE_NEON(64, 128)

HBD_SAD_SKIP_WXH_LARGE_NEON(128, 64)
HBD_SAD_SKIP_WXH_LARGE_NEON(128, 128)

#if !CONFIG_REALTIME_ONLY
HBD_SAD_SKIP_WXH_SMALL_NEON(4, 16)

HBD_SAD_SKIP_WXH_SMALL_NEON(8, 32)

HBD_SAD_SKIP_WXH_LARGE_NEON(16, 64)

HBD_SAD_SKIP_WXH_LARGE_NEON(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

// Averaging-prediction SAD: the reference is first combined with
// second_pred via a rounding halving add (vrhaddq_u16), i.e. the compound
// prediction average, and the SAD is taken against that. second_pred is a
// contiguous w*h block, so its pointer advances by the row width each
// iteration. 8-wide variant; 32-bit accumulation throughout.
static inline uint32_t highbd_sad8xh_avg_neon(const uint8_t *src_ptr,
                                              int src_stride,
                                              const uint8_t *ref_ptr,
                                              int ref_stride, int h,
                                              const uint8_t *second_pred) {
  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
  const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
  uint32x4_t sum = vdupq_n_u32(0);

  int i = h;
  do {
    uint16x8_t s = vld1q_u16(src16_ptr);
    uint16x8_t r = vld1q_u16(ref16_ptr);
    uint16x8_t p = vld1q_u16(pred16_ptr);

    uint16x8_t avg = vrhaddq_u16(r, p);
    uint16x8_t diff = vabdq_u16(s, avg);
    sum = vpadalq_u16(sum, diff);

    src16_ptr += src_stride;
    ref16_ptr += ref_stride;
    pred16_ptr += 8;
  } while (--i != 0);

  return horizontal_add_u32x4(sum);
}

// 16-wide averaging-prediction SAD; two independent accumulators as in
// highbd_sad16xh_large_neon.
static inline uint32_t highbd_sad16xh_avg_neon(const uint8_t *src_ptr,
                                               int src_stride,
                                               const uint8_t *ref_ptr,
                                               int ref_stride, int h,
                                               const uint8_t *second_pred) {
  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
  const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };

  int i = h;
  do {
    uint16x8_t s0, s1, r0, r1, p0, p1;
    uint16x8_t avg0, avg1, diff0, diff1;

    s0 = vld1q_u16(src16_ptr);
    r0 = vld1q_u16(ref16_ptr);
    p0 = vld1q_u16(pred16_ptr);
    avg0 = vrhaddq_u16(r0, p0);
    diff0 = vabdq_u16(s0, avg0);
    sum[0] = vpadalq_u16(sum[0], diff0);

    s1 = vld1q_u16(src16_ptr + 8);
    r1 = vld1q_u16(ref16_ptr + 8);
    p1 = vld1q_u16(pred16_ptr + 8);
    avg1 = vrhaddq_u16(r1, p1);
    diff1 = vabdq_u16(s1, avg1);
    sum[1] = vpadalq_u16(sum[1], diff1);

    src16_ptr += src_stride;
    ref16_ptr += ref_stride;
    pred16_ptr += 16;
  } while (--i != 0);

  sum[0] = vaddq_u32(sum[0], sum[1]);
  return horizontal_add_u32x4(sum[0]);
}

// Generic averaging-prediction SAD for wide blocks; w must be a multiple of
// 32, mirroring highbd_sadwxh_large_neon.
static inline uint32_t highbd_sadwxh_avg_neon(const uint8_t *src_ptr,
                                              int src_stride,
                                              const uint8_t *ref_ptr,
                                              int ref_stride, int w, int h,
                                              const uint8_t *second_pred) {
  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
  const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
  uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
                        vdupq_n_u32(0) };

  int i = h;
  do {
    int j = 0;
    do {
      uint16x8_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3;
      uint16x8_t avg0, avg1, avg2, avg3, diff0, diff1, diff2, diff3;

      s0 = vld1q_u16(src16_ptr + j);
      r0 = vld1q_u16(ref16_ptr + j);
      p0 = vld1q_u16(pred16_ptr + j);
      avg0 = vrhaddq_u16(r0, p0);
      diff0 = vabdq_u16(s0, avg0);
      sum[0] = vpadalq_u16(sum[0], diff0);

      s1 = vld1q_u16(src16_ptr + j + 8);
      r1 = vld1q_u16(ref16_ptr + j + 8);
      p1 = vld1q_u16(pred16_ptr + j + 8);
      avg1 = vrhaddq_u16(r1, p1);
      diff1 = vabdq_u16(s1, avg1);
      sum[1] = vpadalq_u16(sum[1], diff1);

      s2 = vld1q_u16(src16_ptr + j + 16);
      r2 = vld1q_u16(ref16_ptr + j + 16);
      p2 = vld1q_u16(pred16_ptr + j + 16);
      avg2 = vrhaddq_u16(r2, p2);
      diff2 = vabdq_u16(s2, avg2);
      sum[2] = vpadalq_u16(sum[2], diff2);

      s3 = vld1q_u16(src16_ptr + j + 24);
      r3 = vld1q_u16(ref16_ptr + j + 24);
      p3 = vld1q_u16(pred16_ptr + j + 24);
      avg3 = vrhaddq_u16(r3, p3);
      diff3 = vabdq_u16(s3, avg3);
      sum[3] = vpadalq_u16(sum[3], diff3);

      j += 32;
    } while (j < w);

    src16_ptr += src_stride;
    ref16_ptr += ref_stride;
    pred16_ptr += w;
  } while (--i != 0);

  sum[0] = vaddq_u32(sum[0], sum[1]);
  sum[2] = vaddq_u32(sum[2], sum[3]);
  sum[0] = vaddq_u32(sum[0], sum[2]);

  return horizontal_add_u32x4(sum[0]);
}

// Fixed-width wrappers, as for the plain SAD kernels above.
static inline unsigned int highbd_sad128xh_avg_neon(
    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
    int ref_stride, int h, const uint8_t *second_pred) {
  return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 128,
                                h, second_pred);
}

static inline unsigned int highbd_sad64xh_avg_neon(const uint8_t *src_ptr,
                                                   int src_stride,
                                                   const uint8_t *ref_ptr,
                                                   int ref_stride, int h,
                                                   const uint8_t *second_pred) {
  return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h,
                                second_pred);
}

static inline unsigned int highbd_sad32xh_avg_neon(const uint8_t *src_ptr,
                                                   int src_stride,
                                                   const uint8_t *ref_ptr,
                                                   int ref_stride, int h,
                                                   const uint8_t *second_pred) {
  return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h,
                                second_pred);
}

// Generate the public aom_highbd_sad<W>x<H>_avg_neon entry points.
#define HBD_SAD_WXH_AVG_NEON(w, h)                                           \
  uint32_t aom_highbd_sad##w##x##h##_avg_neon(                               \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *second_pred) {                                          \
    return highbd_sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \
                                      second_pred);                          \
  }

HBD_SAD_WXH_AVG_NEON(8, 8)
HBD_SAD_WXH_AVG_NEON(8, 16)

HBD_SAD_WXH_AVG_NEON(16, 8)
HBD_SAD_WXH_AVG_NEON(16, 16)
HBD_SAD_WXH_AVG_NEON(16, 32)

HBD_SAD_WXH_AVG_NEON(32, 16)
HBD_SAD_WXH_AVG_NEON(32, 32)
HBD_SAD_WXH_AVG_NEON(32, 64)

HBD_SAD_WXH_AVG_NEON(64, 32)
HBD_SAD_WXH_AVG_NEON(64, 64)
HBD_SAD_WXH_AVG_NEON(64, 128)

HBD_SAD_WXH_AVG_NEON(128, 64)
HBD_SAD_WXH_AVG_NEON(128, 128)

// Block sizes only used by non-realtime encoder paths.
#if !CONFIG_REALTIME_ONLY
HBD_SAD_WXH_AVG_NEON(8, 32)

HBD_SAD_WXH_AVG_NEON(16, 64)

HBD_SAD_WXH_AVG_NEON(32, 8)

HBD_SAD_WXH_AVG_NEON(64, 16)
#endif  // !CONFIG_REALTIME_ONLY