sadxd_neon.c (17679B)
1 /* 2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <arm_neon.h> 13 14 #include "config/aom_config.h" 15 #include "config/aom_dsp_rtcd.h" 16 17 #include "aom/aom_integer.h" 18 #include "aom_dsp/arm/mem_neon.h" 19 #include "aom_dsp/arm/sum_neon.h" 20 21 static inline void sad16_neon(uint8x16_t src, uint8x16_t ref, 22 uint16x8_t *const sad_sum) { 23 uint8x16_t abs_diff = vabdq_u8(src, ref); 24 *sad_sum = vpadalq_u8(*sad_sum, abs_diff); 25 } 26 27 static inline void sadwxhx3d_large_neon(const uint8_t *src, int src_stride, 28 const uint8_t *const ref[3], 29 int ref_stride, uint32_t res[3], int w, 30 int h, int h_overflow) { 31 uint32x4_t sum[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; 32 int h_limit = h > h_overflow ? h_overflow : h; 33 34 int ref_offset = 0; 35 int i = 0; 36 do { 37 uint16x8_t sum_lo[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; 38 uint16x8_t sum_hi[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; 39 40 do { 41 int j = 0; 42 do { 43 const uint8x16_t s0 = vld1q_u8(src + j); 44 sad16_neon(s0, vld1q_u8(ref[0] + ref_offset + j), &sum_lo[0]); 45 sad16_neon(s0, vld1q_u8(ref[1] + ref_offset + j), &sum_lo[1]); 46 sad16_neon(s0, vld1q_u8(ref[2] + ref_offset + j), &sum_lo[2]); 47 48 const uint8x16_t s1 = vld1q_u8(src + j + 16); 49 sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + j + 16), &sum_hi[0]); 50 sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + j + 16), &sum_hi[1]); 51 sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + j + 16), &sum_hi[2]); 52 53 j += 32; 54 } while (j < w); 55 56 src += src_stride; 57 ref_offset += ref_stride; 58 } while (++i < h_limit); 59 60 sum[0] = vpadalq_u16(sum[0], sum_lo[0]); 61 sum[0] = vpadalq_u16(sum[0], sum_hi[0]); 62 sum[1] = vpadalq_u16(sum[1], sum_lo[1]); 63 sum[1] = vpadalq_u16(sum[1], sum_hi[1]); 64 sum[2] = vpadalq_u16(sum[2], sum_lo[2]); 65 sum[2] = vpadalq_u16(sum[2], sum_hi[2]); 66 67 h_limit += h_overflow; 68 } while (i < h); 69 70 res[0] = horizontal_add_u32x4(sum[0]); 71 res[1] = horizontal_add_u32x4(sum[1]); 72 res[2] = horizontal_add_u32x4(sum[2]); 73 } 74 75 static inline void sad128xhx3d_neon(const uint8_t *src, int src_stride, 76 const uint8_t *const ref[3], int ref_stride, 77 uint32_t res[3], int h) { 78 sadwxhx3d_large_neon(src, src_stride, ref, ref_stride, res, 128, h, 32); 79 } 80 81 static inline void sad64xhx3d_neon(const uint8_t *src, int src_stride, 82 const uint8_t *const ref[3], int ref_stride, 83 uint32_t res[3], int h) { 84 sadwxhx3d_large_neon(src, src_stride, ref, ref_stride, res, 64, h, 64); 85 } 86 87 static inline void sad32xhx3d_neon(const uint8_t *src, int src_stride, 88 const uint8_t *const ref[3], int ref_stride, 89 uint32_t res[3], int h) { 90 uint16x8_t sum_lo[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; 91 uint16x8_t sum_hi[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; 92 93 int ref_offset = 0; 94 int i = h; 95 do { 96 const uint8x16_t s0 = vld1q_u8(src); 97 sad16_neon(s0, vld1q_u8(ref[0] + ref_offset), &sum_lo[0]); 98 sad16_neon(s0, vld1q_u8(ref[1] + ref_offset), &sum_lo[1]); 99 sad16_neon(s0, vld1q_u8(ref[2] + ref_offset), &sum_lo[2]); 100 101 const uint8x16_t s1 = vld1q_u8(src + 16); 102 sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + 16), &sum_hi[0]); 103 sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + 16), &sum_hi[1]); 104 sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + 16), &sum_hi[2]); 105 106 src += src_stride; 107 ref_offset += ref_stride; 108 } while (--i != 0); 109 110 res[0] = horizontal_long_add_u16x8(sum_lo[0], sum_hi[0]); 111 res[1] = horizontal_long_add_u16x8(sum_lo[1], sum_hi[1]); 112 res[2] = horizontal_long_add_u16x8(sum_lo[2], sum_hi[2]); 113 } 114 115 static inline void sad16xhx3d_neon(const uint8_t *src, int src_stride, 116 const uint8_t *const ref[3], int ref_stride, 117 uint32_t res[3], int h) { 118 uint16x8_t sum[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; 119 120 int ref_offset = 0; 121 int i = h; 122 do { 123 const uint8x16_t s = vld1q_u8(src); 124 sad16_neon(s, vld1q_u8(ref[0] + ref_offset), &sum[0]); 125 sad16_neon(s, vld1q_u8(ref[1] + ref_offset), &sum[1]); 126 sad16_neon(s, vld1q_u8(ref[2] + ref_offset), &sum[2]); 127 128 src += src_stride; 129 ref_offset += ref_stride; 130 } while (--i != 0); 131 132 res[0] = horizontal_add_u16x8(sum[0]); 133 res[1] = horizontal_add_u16x8(sum[1]); 134 res[2] = horizontal_add_u16x8(sum[2]); 135 } 136 137 static inline void sad8xhx3d_neon(const uint8_t *src, int src_stride, 138 const uint8_t *const ref[3], int ref_stride, 139 uint32_t res[3], int h) { 140 uint16x8_t sum[3]; 141 142 uint8x8_t s = vld1_u8(src); 143 sum[0] = vabdl_u8(s, vld1_u8(ref[0])); 144 sum[1] = vabdl_u8(s, vld1_u8(ref[1])); 145 sum[2] = vabdl_u8(s, vld1_u8(ref[2])); 146 147 src += src_stride; 148 int ref_offset = ref_stride; 149 int i = h - 1; 150 do { 151 s = vld1_u8(src); 152 sum[0] = vabal_u8(sum[0], s, vld1_u8(ref[0] + ref_offset)); 153 sum[1] = vabal_u8(sum[1], s, vld1_u8(ref[1] + ref_offset)); 154 sum[2] = vabal_u8(sum[2], s, vld1_u8(ref[2] + ref_offset)); 155 156 src += src_stride; 157 ref_offset += ref_stride; 158 } while (--i != 0); 159 160 res[0] = horizontal_add_u16x8(sum[0]); 161 res[1] = horizontal_add_u16x8(sum[1]); 162 res[2] = horizontal_add_u16x8(sum[2]); 163 } 164 165 static inline void sad4xhx3d_neon(const uint8_t *src, int src_stride, 166 const uint8_t *const ref[3], int ref_stride, 167 uint32_t res[3], int h) { 168 assert(h % 2 == 0); 169 uint16x8_t sum[3]; 170 171 uint8x8_t s = load_unaligned_u8(src, src_stride); 172 uint8x8_t r0 = load_unaligned_u8(ref[0], ref_stride); 173 uint8x8_t r1 = load_unaligned_u8(ref[1], ref_stride); 174 uint8x8_t r2 = load_unaligned_u8(ref[2], ref_stride); 175 176 sum[0] = vabdl_u8(s, r0); 177 sum[1] = vabdl_u8(s, r1); 178 sum[2] = vabdl_u8(s, r2); 179 180 src += 2 * src_stride; 181 int ref_offset = 2 * ref_stride; 182 int i = (h / 2) - 1; 183 do { 184 s = load_unaligned_u8(src, src_stride); 185 r0 = load_unaligned_u8(ref[0] + ref_offset, ref_stride); 186 r1 = load_unaligned_u8(ref[1] + ref_offset, ref_stride); 187 r2 = load_unaligned_u8(ref[2] + ref_offset, ref_stride); 188 189 sum[0] = vabal_u8(sum[0], s, r0); 190 sum[1] = vabal_u8(sum[1], s, r1); 191 sum[2] = vabal_u8(sum[2], s, r2); 192 193 src += 2 * src_stride; 194 ref_offset += 2 * ref_stride; 195 } while (--i != 0); 196 197 res[0] = horizontal_add_u16x8(sum[0]); 198 res[1] = horizontal_add_u16x8(sum[1]); 199 res[2] = horizontal_add_u16x8(sum[2]); 200 } 201 202 #define SAD_WXH_3D_NEON(w, h) \ 203 void aom_sad##w##x##h##x3d_neon(const uint8_t *src, int src_stride, \ 204 const uint8_t *const ref[4], int ref_stride, \ 205 uint32_t res[4]) { \ 206 sad##w##xhx3d_neon(src, src_stride, ref, ref_stride, res, (h)); \ 207 } 208 209 SAD_WXH_3D_NEON(4, 4) 210 SAD_WXH_3D_NEON(4, 8) 211 212 SAD_WXH_3D_NEON(8, 4) 213 SAD_WXH_3D_NEON(8, 8) 214 SAD_WXH_3D_NEON(8, 16) 215 216 SAD_WXH_3D_NEON(16, 8) 217 SAD_WXH_3D_NEON(16, 16) 218 SAD_WXH_3D_NEON(16, 32) 219 220 SAD_WXH_3D_NEON(32, 16) 221 SAD_WXH_3D_NEON(32, 32) 222 SAD_WXH_3D_NEON(32, 64) 223 224 SAD_WXH_3D_NEON(64, 32) 225 SAD_WXH_3D_NEON(64, 64) 226 SAD_WXH_3D_NEON(64, 128) 227 228 SAD_WXH_3D_NEON(128, 64) 229 SAD_WXH_3D_NEON(128, 128) 230 231 #if !CONFIG_REALTIME_ONLY 232 SAD_WXH_3D_NEON(4, 16) 233 SAD_WXH_3D_NEON(8, 32) 234 SAD_WXH_3D_NEON(16, 4) 235 SAD_WXH_3D_NEON(16, 64) 236 SAD_WXH_3D_NEON(32, 8) 237 SAD_WXH_3D_NEON(64, 16) 238 #endif // !CONFIG_REALTIME_ONLY 239 240 #undef SAD_WXH_3D_NEON 241 242 static inline void sadwxhx4d_large_neon(const uint8_t *src, int src_stride, 243 const uint8_t *const ref[4], 244 int ref_stride, uint32_t res[4], int w, 245 int h, int h_overflow) { 246 uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), 247 vdupq_n_u32(0) }; 248 int h_limit = h > h_overflow ? h_overflow : h; 249 250 int ref_offset = 0; 251 int i = 0; 252 do { 253 uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), 254 vdupq_n_u16(0) }; 255 uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), 256 vdupq_n_u16(0) }; 257 258 do { 259 int j = 0; 260 do { 261 const uint8x16_t s0 = vld1q_u8(src + j); 262 sad16_neon(s0, vld1q_u8(ref[0] + ref_offset + j), &sum_lo[0]); 263 sad16_neon(s0, vld1q_u8(ref[1] + ref_offset + j), &sum_lo[1]); 264 sad16_neon(s0, vld1q_u8(ref[2] + ref_offset + j), &sum_lo[2]); 265 sad16_neon(s0, vld1q_u8(ref[3] + ref_offset + j), &sum_lo[3]); 266 267 const uint8x16_t s1 = vld1q_u8(src + j + 16); 268 sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + j + 16), &sum_hi[0]); 269 sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + j + 16), &sum_hi[1]); 270 sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + j + 16), &sum_hi[2]); 271 sad16_neon(s1, vld1q_u8(ref[3] + ref_offset + j + 16), &sum_hi[3]); 272 273 j += 32; 274 } while (j < w); 275 276 src += src_stride; 277 ref_offset += ref_stride; 278 } while (++i < h_limit); 279 280 sum[0] = vpadalq_u16(sum[0], sum_lo[0]); 281 sum[0] = vpadalq_u16(sum[0], sum_hi[0]); 282 sum[1] = vpadalq_u16(sum[1], sum_lo[1]); 283 sum[1] = vpadalq_u16(sum[1], sum_hi[1]); 284 sum[2] = vpadalq_u16(sum[2], sum_lo[2]); 285 sum[2] = vpadalq_u16(sum[2], sum_hi[2]); 286 sum[3] = vpadalq_u16(sum[3], sum_lo[3]); 287 sum[3] = vpadalq_u16(sum[3], sum_hi[3]); 288 289 h_limit += h_overflow; 290 } while (i < h); 291 292 vst1q_u32(res, horizontal_add_4d_u32x4(sum)); 293 } 294 295 static inline void sad128xhx4d_neon(const uint8_t *src, int src_stride, 296 const uint8_t *const ref[4], int ref_stride, 297 uint32_t res[4], int h) { 298 sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, res, 128, h, 32); 299 } 300 301 static inline void sad64xhx4d_neon(const uint8_t *src, int src_stride, 302 const uint8_t *const ref[4], int ref_stride, 303 uint32_t res[4], int h) { 304 sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, res, 64, h, 64); 305 } 306 307 static inline void sad32xhx4d_neon(const uint8_t *src, int src_stride, 308 const uint8_t *const ref[4], int ref_stride, 309 uint32_t res[4], int h) { 310 uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), 311 vdupq_n_u16(0) }; 312 uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), 313 vdupq_n_u16(0) }; 314 315 int ref_offset = 0; 316 int i = h; 317 do { 318 const uint8x16_t s0 = vld1q_u8(src); 319 sad16_neon(s0, vld1q_u8(ref[0] + ref_offset), &sum_lo[0]); 320 sad16_neon(s0, vld1q_u8(ref[1] + ref_offset), &sum_lo[1]); 321 sad16_neon(s0, vld1q_u8(ref[2] + ref_offset), &sum_lo[2]); 322 sad16_neon(s0, vld1q_u8(ref[3] + ref_offset), &sum_lo[3]); 323 324 const uint8x16_t s1 = vld1q_u8(src + 16); 325 sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + 16), &sum_hi[0]); 326 sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + 16), &sum_hi[1]); 327 sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + 16), &sum_hi[2]); 328 sad16_neon(s1, vld1q_u8(ref[3] + ref_offset + 16), &sum_hi[3]); 329 330 src += src_stride; 331 ref_offset += ref_stride; 332 } while (--i != 0); 333 334 vst1q_u32(res, horizontal_long_add_4d_u16x8(sum_lo, sum_hi)); 335 } 336 337 static inline void sad16xhx4d_neon(const uint8_t *src, int src_stride, 338 const uint8_t *const ref[4], int ref_stride, 339 uint32_t res[4], int h) { 340 uint16x8_t sum_u16[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), 341 vdupq_n_u16(0) }; 342 uint32x4_t sum_u32[4]; 343 344 int ref_offset = 0; 345 int i = h; 346 do { 347 const uint8x16_t s = vld1q_u8(src); 348 sad16_neon(s, vld1q_u8(ref[0] + ref_offset), &sum_u16[0]); 349 sad16_neon(s, vld1q_u8(ref[1] + ref_offset), &sum_u16[1]); 350 sad16_neon(s, vld1q_u8(ref[2] + ref_offset), &sum_u16[2]); 351 sad16_neon(s, vld1q_u8(ref[3] + ref_offset), &sum_u16[3]); 352 353 src += src_stride; 354 ref_offset += ref_stride; 355 } while (--i != 0); 356 357 sum_u32[0] = vpaddlq_u16(sum_u16[0]); 358 sum_u32[1] = vpaddlq_u16(sum_u16[1]); 359 sum_u32[2] = vpaddlq_u16(sum_u16[2]); 360 sum_u32[3] = vpaddlq_u16(sum_u16[3]); 361 362 vst1q_u32(res, horizontal_add_4d_u32x4(sum_u32)); 363 } 364 365 static inline void sad8xhx4d_neon(const uint8_t *src, int src_stride, 366 const uint8_t *const ref[4], int ref_stride, 367 uint32_t res[4], int h) { 368 uint16x8_t sum[4]; 369 370 uint8x8_t s = vld1_u8(src); 371 sum[0] = vabdl_u8(s, vld1_u8(ref[0])); 372 sum[1] = vabdl_u8(s, vld1_u8(ref[1])); 373 sum[2] = vabdl_u8(s, vld1_u8(ref[2])); 374 sum[3] = vabdl_u8(s, vld1_u8(ref[3])); 375 376 src += src_stride; 377 int ref_offset = ref_stride; 378 int i = h - 1; 379 do { 380 s = vld1_u8(src); 381 sum[0] = vabal_u8(sum[0], s, vld1_u8(ref[0] + ref_offset)); 382 sum[1] = vabal_u8(sum[1], s, vld1_u8(ref[1] + ref_offset)); 383 sum[2] = vabal_u8(sum[2], s, vld1_u8(ref[2] + ref_offset)); 384 sum[3] = vabal_u8(sum[3], s, vld1_u8(ref[3] + ref_offset)); 385 386 src += src_stride; 387 ref_offset += ref_stride; 388 } while (--i != 0); 389 390 vst1q_u32(res, horizontal_add_4d_u16x8(sum)); 391 } 392 393 static inline void sad4xhx4d_neon(const uint8_t *src, int src_stride, 394 const uint8_t *const ref[4], int ref_stride, 395 uint32_t res[4], int h) { 396 uint16x8_t sum[4]; 397 398 uint8x8_t s = load_unaligned_u8(src, src_stride); 399 uint8x8_t r0 = load_unaligned_u8(ref[0], ref_stride); 400 uint8x8_t r1 = load_unaligned_u8(ref[1], ref_stride); 401 uint8x8_t r2 = load_unaligned_u8(ref[2], ref_stride); 402 uint8x8_t r3 = load_unaligned_u8(ref[3], ref_stride); 403 404 sum[0] = vabdl_u8(s, r0); 405 sum[1] = vabdl_u8(s, r1); 406 sum[2] = vabdl_u8(s, r2); 407 sum[3] = vabdl_u8(s, r3); 408 409 src += 2 * src_stride; 410 int ref_offset = 2 * ref_stride; 411 int i = h / 2; 412 while (--i != 0) { 413 s = load_unaligned_u8(src, src_stride); 414 r0 = load_unaligned_u8(ref[0] + ref_offset, ref_stride); 415 r1 = load_unaligned_u8(ref[1] + ref_offset, ref_stride); 416 r2 = load_unaligned_u8(ref[2] + ref_offset, ref_stride); 417 r3 = load_unaligned_u8(ref[3] + ref_offset, ref_stride); 418 419 sum[0] = vabal_u8(sum[0], s, r0); 420 sum[1] = vabal_u8(sum[1], s, r1); 421 sum[2] = vabal_u8(sum[2], s, r2); 422 sum[3] = vabal_u8(sum[3], s, r3); 423 424 src += 2 * src_stride; 425 ref_offset += 2 * ref_stride; 426 } 427 428 vst1q_u32(res, horizontal_add_4d_u16x8(sum)); 429 } 430 431 #define SAD_WXH_4D_NEON(w, h) \ 432 void aom_sad##w##x##h##x4d_neon(const uint8_t *src, int src_stride, \ 433 const uint8_t *const ref[4], int ref_stride, \ 434 uint32_t res[4]) { \ 435 sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, res, (h)); \ 436 } 437 438 SAD_WXH_4D_NEON(4, 4) 439 SAD_WXH_4D_NEON(4, 8) 440 441 SAD_WXH_4D_NEON(8, 4) 442 SAD_WXH_4D_NEON(8, 8) 443 SAD_WXH_4D_NEON(8, 16) 444 445 SAD_WXH_4D_NEON(16, 8) 446 SAD_WXH_4D_NEON(16, 16) 447 SAD_WXH_4D_NEON(16, 32) 448 449 SAD_WXH_4D_NEON(32, 16) 450 SAD_WXH_4D_NEON(32, 32) 451 SAD_WXH_4D_NEON(32, 64) 452 453 SAD_WXH_4D_NEON(64, 32) 454 SAD_WXH_4D_NEON(64, 64) 455 SAD_WXH_4D_NEON(64, 128) 456 457 SAD_WXH_4D_NEON(128, 64) 458 SAD_WXH_4D_NEON(128, 128) 459 460 #if !CONFIG_REALTIME_ONLY 461 SAD_WXH_4D_NEON(4, 16) 462 SAD_WXH_4D_NEON(8, 32) 463 SAD_WXH_4D_NEON(16, 4) 464 SAD_WXH_4D_NEON(16, 64) 465 SAD_WXH_4D_NEON(32, 8) 466 SAD_WXH_4D_NEON(64, 16) 467 #endif // !CONFIG_REALTIME_ONLY 468 469 #undef SAD_WXH_4D_NEON 470 471 #define SAD_SKIP_WXH_4D_NEON(w, h) \ 472 void aom_sad_skip_##w##x##h##x4d_neon(const uint8_t *src, int src_stride, \ 473 const uint8_t *const ref[4], \ 474 int ref_stride, uint32_t res[4]) { \ 475 sad##w##xhx4d_neon(src, 2 * src_stride, ref, 2 * ref_stride, res, \ 476 ((h) >> 1)); \ 477 res[0] <<= 1; \ 478 res[1] <<= 1; \ 479 res[2] <<= 1; \ 480 res[3] <<= 1; \ 481 } 482 483 SAD_SKIP_WXH_4D_NEON(8, 16) 484 485 SAD_SKIP_WXH_4D_NEON(16, 16) 486 SAD_SKIP_WXH_4D_NEON(16, 32) 487 488 SAD_SKIP_WXH_4D_NEON(32, 16) 489 SAD_SKIP_WXH_4D_NEON(32, 32) 490 SAD_SKIP_WXH_4D_NEON(32, 64) 491 492 SAD_SKIP_WXH_4D_NEON(64, 32) 493 SAD_SKIP_WXH_4D_NEON(64, 64) 494 SAD_SKIP_WXH_4D_NEON(64, 128) 495 496 SAD_SKIP_WXH_4D_NEON(128, 64) 497 SAD_SKIP_WXH_4D_NEON(128, 128) 498 499 #if !CONFIG_REALTIME_ONLY 500 SAD_SKIP_WXH_4D_NEON(4, 16) 501 SAD_SKIP_WXH_4D_NEON(8, 32) 502 SAD_SKIP_WXH_4D_NEON(16, 64) 503 SAD_SKIP_WXH_4D_NEON(64, 16) 504 #endif // !CONFIG_REALTIME_ONLY 505 506 #undef SAD_SKIP_WXH_4D_NEON