/* sad_neon.c */
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"

// Sum of absolute differences for a 128-wide block of height 'h'.
// Eight u16 accumulators (one per 16-byte chunk) prevent overflow for large
// 'h' and keep the UADALP pipes busy on both 2- and 4-pipe Neon cores.
static inline unsigned int sad128xh_neon(const uint8_t *src_ptr, int src_stride,
                                         const uint8_t *ref_ptr, int ref_stride,
                                         int h) {
  uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                        vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                        vdupq_n_u16(0), vdupq_n_u16(0) };

  int i = h;
  do {
    // One 16-byte chunk per accumulator across the 128-pixel row.
    for (int j = 0; j < 8; j++) {
      const uint8x16_t s = vld1q_u8(src_ptr + 16 * j);
      const uint8x16_t r = vld1q_u8(ref_ptr + 16 * j);
      sum[j] = vpadalq_u8(sum[j], vabdq_u8(s, r));
    }

    src_ptr += src_stride;
    ref_ptr += ref_stride;
  } while (--i != 0);

  // Widen to u32 and fold all eight partial sums before the final reduction.
  uint32x4_t total = vpaddlq_u16(sum[0]);
  for (int j = 1; j < 8; j++) {
    total = vpadalq_u16(total, sum[j]);
  }

  return horizontal_add_u32x4(total);
}

// Sum of absolute differences for a 64-wide block of height 'h'.
static inline unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride,
                                        const uint8_t *ref_ptr, int ref_stride,
                                        int h) {
  // Four accumulators, one per 16-byte chunk of the 64-pixel row.
  uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                        vdupq_n_u16(0) };

  int i = h;
  do {
    for (int j = 0; j < 4; j++) {
      const uint8x16_t s = vld1q_u8(src_ptr + 16 * j);
      const uint8x16_t r = vld1q_u8(ref_ptr + 16 * j);
      sum[j] = vpadalq_u8(sum[j], vabdq_u8(s, r));
    }

    src_ptr += src_stride;
    ref_ptr += ref_stride;
  } while (--i != 0);

  uint32x4_t total = vpaddlq_u16(sum[0]);
  total = vpadalq_u16(total, sum[1]);
  total = vpadalq_u16(total, sum[2]);
  total = vpadalq_u16(total, sum[3]);

  return horizontal_add_u32x4(total);
}

// Sum of absolute differences for a 32-wide block of height 'h'.
static inline unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride,
                                        const uint8_t *ref_ptr, int ref_stride,
                                        int h) {
  uint16x8_t sum[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };

  int i = h;
  do {
    const uint8x16_t s_lo = vld1q_u8(src_ptr);
    const uint8x16_t r_lo = vld1q_u8(ref_ptr);
    sum[0] = vpadalq_u8(sum[0], vabdq_u8(s_lo, r_lo));

    const uint8x16_t s_hi = vld1q_u8(src_ptr + 16);
    const uint8x16_t r_hi = vld1q_u8(ref_ptr + 16);
    sum[1] = vpadalq_u8(sum[1], vabdq_u8(s_hi, r_hi));

    src_ptr += src_stride;
    ref_ptr += ref_stride;
  } while (--i != 0);

  // 32*h*255 fits in the u16 lanes for all supported heights, so the two
  // accumulators can be combined before widening.
  return horizontal_add_u16x8(vaddq_u16(sum[0], sum[1]));
}

// Sum of absolute differences for a 16-wide block of height 'h'.
static inline unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride,
                                        const uint8_t *ref_ptr, int ref_stride,
                                        int h) {
  uint16x8_t sum = vdupq_n_u16(0);

  int i = h;
  do {
    const uint8x16_t s = vld1q_u8(src_ptr);
    const uint8x16_t r = vld1q_u8(ref_ptr);

    sum = vpadalq_u8(sum, vabdq_u8(s, r));

    src_ptr += src_stride;
    ref_ptr += ref_stride;
  } while (--i != 0);

  return horizontal_add_u16x8(sum);
}

// Sum of absolute differences for an 8-wide block of height 'h'.
static inline unsigned int sad8xh_neon(const uint8_t *src_ptr, int src_stride,
                                       const uint8_t *ref_ptr, int ref_stride,
                                       int h) {
  uint16x8_t sum = vdupq_n_u16(0);

  int i = h;
  do {
    const uint8x8_t s = vld1_u8(src_ptr);
    const uint8x8_t r = vld1_u8(ref_ptr);

    // Absolute-difference-and-accumulate widens u8 -> u16 in one step.
    sum = vabal_u8(sum, s, r);

    src_ptr += src_stride;
    ref_ptr += ref_stride;
  } while (--i != 0);

  return horizontal_add_u16x8(sum);
}

// Sum of absolute differences for a 4-wide block of height 'h'.
// 'h' is assumed even: two rows are packed into each 8-byte vector.
static inline unsigned int sad4xh_neon(const uint8_t *src_ptr, int src_stride,
                                       const uint8_t *ref_ptr, int ref_stride,
                                       int h) {
  uint16x8_t sum = vdupq_n_u16(0);

  int i = h / 2;
  do {
    const uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
    const uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);

    sum = vabal_u8(sum, s, r);

    src_ptr += 2 * src_stride;
    ref_ptr += 2 * ref_stride;
  } while (--i != 0);

  return horizontal_add_u16x8(sum);
}

#define SAD_WXH_NEON(w, h)                                                   \
  unsigned int aom_sad##w##x##h##_neon(const uint8_t *src, int src_stride,   \
                                       const uint8_t *ref, int ref_stride) { \
    return sad##w##xh_neon(src, src_stride, ref, ref_stride, (h));           \
  }

SAD_WXH_NEON(4, 4)
SAD_WXH_NEON(4, 8)

SAD_WXH_NEON(8, 4)
SAD_WXH_NEON(8, 8)
SAD_WXH_NEON(8, 16)

SAD_WXH_NEON(16, 8)
SAD_WXH_NEON(16, 16)
SAD_WXH_NEON(16, 32)

SAD_WXH_NEON(32, 16)
SAD_WXH_NEON(32, 32)
SAD_WXH_NEON(32, 64)

SAD_WXH_NEON(64, 32)
SAD_WXH_NEON(64, 64)
SAD_WXH_NEON(64, 128)

SAD_WXH_NEON(128, 64)
SAD_WXH_NEON(128, 128)

#if !CONFIG_REALTIME_ONLY
SAD_WXH_NEON(4, 16)
SAD_WXH_NEON(8, 32)
SAD_WXH_NEON(16, 4)
SAD_WXH_NEON(16, 64)
SAD_WXH_NEON(32, 8)
SAD_WXH_NEON(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

#undef SAD_WXH_NEON

// "Skip" variants sample every other row (doubled strides, half height) and
// scale the result by 2 to approximate the full-block SAD.
#define SAD_SKIP_WXH_NEON(w, h)                                              \
  unsigned int aom_sad_skip_##w##x##h##_neon(                                \
      const uint8_t *src, int src_stride, const uint8_t *ref,                \
      int ref_stride) {                                                      \
    return 2 *                                                               \
           sad##w##xh_neon(src, 2 * src_stride, ref, 2 * ref_stride,         \
                           (h) / 2);                                         \
  }

SAD_SKIP_WXH_NEON(8, 16)

SAD_SKIP_WXH_NEON(16, 16)
SAD_SKIP_WXH_NEON(16, 32)

SAD_SKIP_WXH_NEON(32, 16)
SAD_SKIP_WXH_NEON(32, 32)
SAD_SKIP_WXH_NEON(32, 64)

SAD_SKIP_WXH_NEON(64, 32)
SAD_SKIP_WXH_NEON(64, 64)
SAD_SKIP_WXH_NEON(64, 128)

SAD_SKIP_WXH_NEON(128, 64)
SAD_SKIP_WXH_NEON(128, 128)

#if !CONFIG_REALTIME_ONLY
SAD_SKIP_WXH_NEON(4, 16)
SAD_SKIP_WXH_NEON(8, 32)
SAD_SKIP_WXH_NEON(16, 64)
SAD_SKIP_WXH_NEON(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

#undef SAD_SKIP_WXH_NEON

// Compound-prediction SAD for a 128-wide block: the reference is first
// averaged (rounding) with 'second_pred' before the absolute difference.
// Eight u16 accumulators prevent overflow for large 'h' and keep the UADALP
// pipes busy on both 2- and 4-pipe Neon cores.
static inline unsigned int sad128xh_avg_neon(const uint8_t *src_ptr,
                                             int src_stride,
                                             const uint8_t *ref_ptr,
                                             int ref_stride, int h,
                                             const uint8_t *second_pred) {
  uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                        vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                        vdupq_n_u16(0), vdupq_n_u16(0) };

  int i = h;
  do {
    for (int j = 0; j < 8; j++) {
      const uint8x16_t s = vld1q_u8(src_ptr + 16 * j);
      const uint8x16_t r = vld1q_u8(ref_ptr + 16 * j);
      const uint8x16_t p = vld1q_u8(second_pred + 16 * j);
      const uint8x16_t avg = vrhaddq_u8(r, p);
      sum[j] = vpadalq_u8(sum[j], vabdq_u8(s, avg));
    }

    src_ptr += src_stride;
    ref_ptr += ref_stride;
    second_pred += 128;  // second_pred rows are stored contiguously.
  } while (--i != 0);

  uint32x4_t total = vpaddlq_u16(sum[0]);
  for (int j = 1; j < 8; j++) {
    total = vpadalq_u16(total, sum[j]);
  }

  return horizontal_add_u32x4(total);
}

// Compound-prediction SAD for a 64-wide block of height 'h'.
static inline unsigned int sad64xh_avg_neon(const uint8_t *src_ptr,
                                            int src_stride,
                                            const uint8_t *ref_ptr,
                                            int ref_stride, int h,
                                            const uint8_t *second_pred) {
  uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                        vdupq_n_u16(0) };

  int i = h;
  do {
    for (int j = 0; j < 4; j++) {
      const uint8x16_t s = vld1q_u8(src_ptr + 16 * j);
      const uint8x16_t r = vld1q_u8(ref_ptr + 16 * j);
      const uint8x16_t p = vld1q_u8(second_pred + 16 * j);
      const uint8x16_t avg = vrhaddq_u8(r, p);
      sum[j] = vpadalq_u8(sum[j], vabdq_u8(s, avg));
    }

    src_ptr += src_stride;
    ref_ptr += ref_stride;
    second_pred += 64;
  } while (--i != 0);

  uint32x4_t total = vpaddlq_u16(sum[0]);
  total = vpadalq_u16(total, sum[1]);
  total = vpadalq_u16(total, sum[2]);
  total = vpadalq_u16(total, sum[3]);

  return horizontal_add_u32x4(total);
}

// Compound-prediction SAD for a 32-wide block of height 'h'.
static inline unsigned int sad32xh_avg_neon(const uint8_t *src_ptr,
                                            int src_stride,
                                            const uint8_t *ref_ptr,
                                            int ref_stride, int h,
                                            const uint8_t *second_pred) {
  uint16x8_t sum[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };

  int i = h;
  do {
    const uint8x16_t s_lo = vld1q_u8(src_ptr);
    const uint8x16_t r_lo = vld1q_u8(ref_ptr);
    const uint8x16_t p_lo = vld1q_u8(second_pred);
    sum[0] = vpadalq_u8(sum[0], vabdq_u8(s_lo, vrhaddq_u8(r_lo, p_lo)));

    const uint8x16_t s_hi = vld1q_u8(src_ptr + 16);
    const uint8x16_t r_hi = vld1q_u8(ref_ptr + 16);
    const uint8x16_t p_hi = vld1q_u8(second_pred + 16);
    sum[1] = vpadalq_u8(sum[1], vabdq_u8(s_hi, vrhaddq_u8(r_hi, p_hi)));

    src_ptr += src_stride;
    ref_ptr += ref_stride;
    second_pred += 32;
  } while (--i != 0);

  return horizontal_add_u16x8(vaddq_u16(sum[0], sum[1]));
}

// Compound-prediction SAD for a 16-wide block of height 'h'.
static inline unsigned int sad16xh_avg_neon(const uint8_t *src_ptr,
                                            int src_stride,
                                            const uint8_t *ref_ptr,
                                            int ref_stride, int h,
                                            const uint8_t *second_pred) {
  uint16x8_t sum = vdupq_n_u16(0);

  int i = h;
  do {
    const uint8x16_t s = vld1q_u8(src_ptr);
    const uint8x16_t r = vld1q_u8(ref_ptr);
    const uint8x16_t p = vld1q_u8(second_pred);

    const uint8x16_t avg = vrhaddq_u8(r, p);
    sum = vpadalq_u8(sum, vabdq_u8(s, avg));

    src_ptr += src_stride;
    ref_ptr += ref_stride;
    second_pred += 16;
  } while (--i != 0);

  return horizontal_add_u16x8(sum);
}

// Compound-prediction SAD for an 8-wide block of height 'h'.
static inline unsigned int sad8xh_avg_neon(const uint8_t *src_ptr,
                                           int src_stride,
                                           const uint8_t *ref_ptr,
                                           int ref_stride, int h,
                                           const uint8_t *second_pred) {
  uint16x8_t sum = vdupq_n_u16(0);

  int i = h;
  do {
    const uint8x8_t s = vld1_u8(src_ptr);
    const uint8x8_t r = vld1_u8(ref_ptr);
    const uint8x8_t p = vld1_u8(second_pred);

    const uint8x8_t avg = vrhadd_u8(r, p);
    sum = vabal_u8(sum, s, avg);

    src_ptr += src_stride;
    ref_ptr += ref_stride;
    second_pred += 8;
  } while (--i != 0);

  return horizontal_add_u16x8(sum);
}

#define SAD_WXH_AVG_NEON(w, h)                                                \
  unsigned int aom_sad##w##x##h##_avg_neon(const uint8_t *src, int src_stride, \
                                           const uint8_t *ref, int ref_stride, \
                                           const uint8_t *second_pred) {      \
    return sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h),         \
                               second_pred);                                  \
  }

SAD_WXH_AVG_NEON(8, 8)
SAD_WXH_AVG_NEON(8, 16)

SAD_WXH_AVG_NEON(16, 8)
SAD_WXH_AVG_NEON(16, 16)
SAD_WXH_AVG_NEON(16, 32)

SAD_WXH_AVG_NEON(32, 16)
SAD_WXH_AVG_NEON(32, 32)
SAD_WXH_AVG_NEON(32, 64)

SAD_WXH_AVG_NEON(64, 32)
SAD_WXH_AVG_NEON(64, 64)
SAD_WXH_AVG_NEON(64, 128)

SAD_WXH_AVG_NEON(128, 64)
SAD_WXH_AVG_NEON(128, 128)

#if !CONFIG_REALTIME_ONLY
SAD_WXH_AVG_NEON(8, 32)
SAD_WXH_AVG_NEON(16, 64)
SAD_WXH_AVG_NEON(32, 8)
SAD_WXH_AVG_NEON(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

#undef SAD_WXH_AVG_NEON