tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

sad_neon.c (16899B)


      1 /*
      2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <arm_neon.h>
     13 
     14 #include "config/aom_config.h"
     15 #include "config/aom_dsp_rtcd.h"
     16 
     17 #include "aom/aom_integer.h"
     18 #include "aom_dsp/arm/mem_neon.h"
     19 #include "aom_dsp/arm/sum_neon.h"
     20 
     21 static inline unsigned int sad128xh_neon(const uint8_t *src_ptr, int src_stride,
     22                                         const uint8_t *ref_ptr, int ref_stride,
     23                                         int h) {
     24  // We use 8 accumulators to prevent overflow for large values of 'h', as well
     25  // as enabling optimal UADALP instruction throughput on CPUs that have either
     26  // 2 or 4 Neon pipes.
     27  uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
     28                        vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
     29                        vdupq_n_u16(0), vdupq_n_u16(0) };
     30 
     31  int i = h;
     32  do {
     33    uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7;
     34    uint8x16_t r0, r1, r2, r3, r4, r5, r6, r7;
     35    uint8x16_t diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
     36 
     37    s0 = vld1q_u8(src_ptr);
     38    r0 = vld1q_u8(ref_ptr);
     39    diff0 = vabdq_u8(s0, r0);
     40    sum[0] = vpadalq_u8(sum[0], diff0);
     41 
     42    s1 = vld1q_u8(src_ptr + 16);
     43    r1 = vld1q_u8(ref_ptr + 16);
     44    diff1 = vabdq_u8(s1, r1);
     45    sum[1] = vpadalq_u8(sum[1], diff1);
     46 
     47    s2 = vld1q_u8(src_ptr + 32);
     48    r2 = vld1q_u8(ref_ptr + 32);
     49    diff2 = vabdq_u8(s2, r2);
     50    sum[2] = vpadalq_u8(sum[2], diff2);
     51 
     52    s3 = vld1q_u8(src_ptr + 48);
     53    r3 = vld1q_u8(ref_ptr + 48);
     54    diff3 = vabdq_u8(s3, r3);
     55    sum[3] = vpadalq_u8(sum[3], diff3);
     56 
     57    s4 = vld1q_u8(src_ptr + 64);
     58    r4 = vld1q_u8(ref_ptr + 64);
     59    diff4 = vabdq_u8(s4, r4);
     60    sum[4] = vpadalq_u8(sum[4], diff4);
     61 
     62    s5 = vld1q_u8(src_ptr + 80);
     63    r5 = vld1q_u8(ref_ptr + 80);
     64    diff5 = vabdq_u8(s5, r5);
     65    sum[5] = vpadalq_u8(sum[5], diff5);
     66 
     67    s6 = vld1q_u8(src_ptr + 96);
     68    r6 = vld1q_u8(ref_ptr + 96);
     69    diff6 = vabdq_u8(s6, r6);
     70    sum[6] = vpadalq_u8(sum[6], diff6);
     71 
     72    s7 = vld1q_u8(src_ptr + 112);
     73    r7 = vld1q_u8(ref_ptr + 112);
     74    diff7 = vabdq_u8(s7, r7);
     75    sum[7] = vpadalq_u8(sum[7], diff7);
     76 
     77    src_ptr += src_stride;
     78    ref_ptr += ref_stride;
     79  } while (--i != 0);
     80 
     81  uint32x4_t sum_u32 = vpaddlq_u16(sum[0]);
     82  sum_u32 = vpadalq_u16(sum_u32, sum[1]);
     83  sum_u32 = vpadalq_u16(sum_u32, sum[2]);
     84  sum_u32 = vpadalq_u16(sum_u32, sum[3]);
     85  sum_u32 = vpadalq_u16(sum_u32, sum[4]);
     86  sum_u32 = vpadalq_u16(sum_u32, sum[5]);
     87  sum_u32 = vpadalq_u16(sum_u32, sum[6]);
     88  sum_u32 = vpadalq_u16(sum_u32, sum[7]);
     89 
     90  return horizontal_add_u32x4(sum_u32);
     91 }
     92 
     93 static inline unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride,
     94                                        const uint8_t *ref_ptr, int ref_stride,
     95                                        int h) {
     96  uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
     97                        vdupq_n_u16(0) };
     98 
     99  int i = h;
    100  do {
    101    uint8x16_t s0, s1, s2, s3, r0, r1, r2, r3;
    102    uint8x16_t diff0, diff1, diff2, diff3;
    103 
    104    s0 = vld1q_u8(src_ptr);
    105    r0 = vld1q_u8(ref_ptr);
    106    diff0 = vabdq_u8(s0, r0);
    107    sum[0] = vpadalq_u8(sum[0], diff0);
    108 
    109    s1 = vld1q_u8(src_ptr + 16);
    110    r1 = vld1q_u8(ref_ptr + 16);
    111    diff1 = vabdq_u8(s1, r1);
    112    sum[1] = vpadalq_u8(sum[1], diff1);
    113 
    114    s2 = vld1q_u8(src_ptr + 32);
    115    r2 = vld1q_u8(ref_ptr + 32);
    116    diff2 = vabdq_u8(s2, r2);
    117    sum[2] = vpadalq_u8(sum[2], diff2);
    118 
    119    s3 = vld1q_u8(src_ptr + 48);
    120    r3 = vld1q_u8(ref_ptr + 48);
    121    diff3 = vabdq_u8(s3, r3);
    122    sum[3] = vpadalq_u8(sum[3], diff3);
    123 
    124    src_ptr += src_stride;
    125    ref_ptr += ref_stride;
    126  } while (--i != 0);
    127 
    128  uint32x4_t sum_u32 = vpaddlq_u16(sum[0]);
    129  sum_u32 = vpadalq_u16(sum_u32, sum[1]);
    130  sum_u32 = vpadalq_u16(sum_u32, sum[2]);
    131  sum_u32 = vpadalq_u16(sum_u32, sum[3]);
    132 
    133  return horizontal_add_u32x4(sum_u32);
    134 }
    135 
    136 static inline unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride,
    137                                        const uint8_t *ref_ptr, int ref_stride,
    138                                        int h) {
    139  uint16x8_t sum[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
    140 
    141  int i = h;
    142  do {
    143    uint8x16_t s0 = vld1q_u8(src_ptr);
    144    uint8x16_t r0 = vld1q_u8(ref_ptr);
    145    uint8x16_t diff0 = vabdq_u8(s0, r0);
    146    sum[0] = vpadalq_u8(sum[0], diff0);
    147 
    148    uint8x16_t s1 = vld1q_u8(src_ptr + 16);
    149    uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
    150    uint8x16_t diff1 = vabdq_u8(s1, r1);
    151    sum[1] = vpadalq_u8(sum[1], diff1);
    152 
    153    src_ptr += src_stride;
    154    ref_ptr += ref_stride;
    155  } while (--i != 0);
    156 
    157  return horizontal_add_u16x8(vaddq_u16(sum[0], sum[1]));
    158 }
    159 
    160 static inline unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride,
    161                                        const uint8_t *ref_ptr, int ref_stride,
    162                                        int h) {
    163  uint16x8_t sum = vdupq_n_u16(0);
    164 
    165  int i = h;
    166  do {
    167    uint8x16_t s = vld1q_u8(src_ptr);
    168    uint8x16_t r = vld1q_u8(ref_ptr);
    169 
    170    uint8x16_t diff = vabdq_u8(s, r);
    171    sum = vpadalq_u8(sum, diff);
    172 
    173    src_ptr += src_stride;
    174    ref_ptr += ref_stride;
    175  } while (--i != 0);
    176 
    177  return horizontal_add_u16x8(sum);
    178 }
    179 
    180 static inline unsigned int sad8xh_neon(const uint8_t *src_ptr, int src_stride,
    181                                       const uint8_t *ref_ptr, int ref_stride,
    182                                       int h) {
    183  uint16x8_t sum = vdupq_n_u16(0);
    184 
    185  int i = h;
    186  do {
    187    uint8x8_t s = vld1_u8(src_ptr);
    188    uint8x8_t r = vld1_u8(ref_ptr);
    189 
    190    sum = vabal_u8(sum, s, r);
    191 
    192    src_ptr += src_stride;
    193    ref_ptr += ref_stride;
    194  } while (--i != 0);
    195 
    196  return horizontal_add_u16x8(sum);
    197 }
    198 
    199 static inline unsigned int sad4xh_neon(const uint8_t *src_ptr, int src_stride,
    200                                       const uint8_t *ref_ptr, int ref_stride,
    201                                       int h) {
    202  uint16x8_t sum = vdupq_n_u16(0);
    203 
    204  int i = h / 2;
    205  do {
    206    uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
    207    uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);
    208 
    209    sum = vabal_u8(sum, s, r);
    210 
    211    src_ptr += 2 * src_stride;
    212    ref_ptr += 2 * ref_stride;
    213  } while (--i != 0);
    214 
    215  return horizontal_add_u16x8(sum);
    216 }
    217 
// Instantiates aom_sad<w>x<h>_neon(): the full-block SAD for a w-by-h block,
// dispatching to the width-specialized sad<w>xh_neon() helper above with the
// block height baked in.
#define SAD_WXH_NEON(w, h)                                                   \
 unsigned int aom_sad##w##x##h##_neon(const uint8_t *src, int src_stride,   \
                                      const uint8_t *ref, int ref_stride) { \
   return sad##w##xh_neon(src, src_stride, ref, ref_stride, (h));           \
 }

SAD_WXH_NEON(4, 4)
SAD_WXH_NEON(4, 8)

SAD_WXH_NEON(8, 4)
SAD_WXH_NEON(8, 8)
SAD_WXH_NEON(8, 16)

SAD_WXH_NEON(16, 8)
SAD_WXH_NEON(16, 16)
SAD_WXH_NEON(16, 32)

SAD_WXH_NEON(32, 16)
SAD_WXH_NEON(32, 32)
SAD_WXH_NEON(32, 64)

SAD_WXH_NEON(64, 32)
SAD_WXH_NEON(64, 64)
SAD_WXH_NEON(64, 128)

SAD_WXH_NEON(128, 64)
SAD_WXH_NEON(128, 128)

// The non-square "extended partition" sizes are only needed outside
// realtime-only builds.
#if !CONFIG_REALTIME_ONLY
SAD_WXH_NEON(4, 16)
SAD_WXH_NEON(8, 32)
SAD_WXH_NEON(16, 4)
SAD_WXH_NEON(16, 64)
SAD_WXH_NEON(32, 8)
SAD_WXH_NEON(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

#undef SAD_WXH_NEON
    256 
// Instantiates aom_sad_skip_<w>x<h>_neon(): an approximate SAD that samples
// only every other row (both strides doubled, half the height) and scales the
// result by 2 to estimate the full-block SAD at roughly half the cost.
#define SAD_SKIP_WXH_NEON(w, h)                                                \
 unsigned int aom_sad_skip_##w##x##h##_neon(                                  \
     const uint8_t *src, int src_stride, const uint8_t *ref,                  \
     int ref_stride) {                                                        \
   return 2 *                                                                 \
          sad##w##xh_neon(src, 2 * src_stride, ref, 2 * ref_stride, (h) / 2); \
 }

SAD_SKIP_WXH_NEON(8, 16)

SAD_SKIP_WXH_NEON(16, 16)
SAD_SKIP_WXH_NEON(16, 32)

SAD_SKIP_WXH_NEON(32, 16)
SAD_SKIP_WXH_NEON(32, 32)
SAD_SKIP_WXH_NEON(32, 64)

SAD_SKIP_WXH_NEON(64, 32)
SAD_SKIP_WXH_NEON(64, 64)
SAD_SKIP_WXH_NEON(64, 128)

SAD_SKIP_WXH_NEON(128, 64)
SAD_SKIP_WXH_NEON(128, 128)

#if !CONFIG_REALTIME_ONLY
SAD_SKIP_WXH_NEON(4, 16)
SAD_SKIP_WXH_NEON(8, 32)
SAD_SKIP_WXH_NEON(16, 64)
SAD_SKIP_WXH_NEON(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

#undef SAD_SKIP_WXH_NEON
    289 
    290 static inline unsigned int sad128xh_avg_neon(const uint8_t *src_ptr,
    291                                             int src_stride,
    292                                             const uint8_t *ref_ptr,
    293                                             int ref_stride, int h,
    294                                             const uint8_t *second_pred) {
    295  // We use 8 accumulators to prevent overflow for large values of 'h', as well
    296  // as enabling optimal UADALP instruction throughput on CPUs that have either
    297  // 2 or 4 Neon pipes.
    298  uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
    299                        vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
    300                        vdupq_n_u16(0), vdupq_n_u16(0) };
    301 
    302  int i = h;
    303  do {
    304    uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7;
    305    uint8x16_t r0, r1, r2, r3, r4, r5, r6, r7;
    306    uint8x16_t p0, p1, p2, p3, p4, p5, p6, p7;
    307    uint8x16_t avg0, avg1, avg2, avg3, avg4, avg5, avg6, avg7;
    308    uint8x16_t diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    309 
    310    s0 = vld1q_u8(src_ptr);
    311    r0 = vld1q_u8(ref_ptr);
    312    p0 = vld1q_u8(second_pred);
    313    avg0 = vrhaddq_u8(r0, p0);
    314    diff0 = vabdq_u8(s0, avg0);
    315    sum[0] = vpadalq_u8(sum[0], diff0);
    316 
    317    s1 = vld1q_u8(src_ptr + 16);
    318    r1 = vld1q_u8(ref_ptr + 16);
    319    p1 = vld1q_u8(second_pred + 16);
    320    avg1 = vrhaddq_u8(r1, p1);
    321    diff1 = vabdq_u8(s1, avg1);
    322    sum[1] = vpadalq_u8(sum[1], diff1);
    323 
    324    s2 = vld1q_u8(src_ptr + 32);
    325    r2 = vld1q_u8(ref_ptr + 32);
    326    p2 = vld1q_u8(second_pred + 32);
    327    avg2 = vrhaddq_u8(r2, p2);
    328    diff2 = vabdq_u8(s2, avg2);
    329    sum[2] = vpadalq_u8(sum[2], diff2);
    330 
    331    s3 = vld1q_u8(src_ptr + 48);
    332    r3 = vld1q_u8(ref_ptr + 48);
    333    p3 = vld1q_u8(second_pred + 48);
    334    avg3 = vrhaddq_u8(r3, p3);
    335    diff3 = vabdq_u8(s3, avg3);
    336    sum[3] = vpadalq_u8(sum[3], diff3);
    337 
    338    s4 = vld1q_u8(src_ptr + 64);
    339    r4 = vld1q_u8(ref_ptr + 64);
    340    p4 = vld1q_u8(second_pred + 64);
    341    avg4 = vrhaddq_u8(r4, p4);
    342    diff4 = vabdq_u8(s4, avg4);
    343    sum[4] = vpadalq_u8(sum[4], diff4);
    344 
    345    s5 = vld1q_u8(src_ptr + 80);
    346    r5 = vld1q_u8(ref_ptr + 80);
    347    p5 = vld1q_u8(second_pred + 80);
    348    avg5 = vrhaddq_u8(r5, p5);
    349    diff5 = vabdq_u8(s5, avg5);
    350    sum[5] = vpadalq_u8(sum[5], diff5);
    351 
    352    s6 = vld1q_u8(src_ptr + 96);
    353    r6 = vld1q_u8(ref_ptr + 96);
    354    p6 = vld1q_u8(second_pred + 96);
    355    avg6 = vrhaddq_u8(r6, p6);
    356    diff6 = vabdq_u8(s6, avg6);
    357    sum[6] = vpadalq_u8(sum[6], diff6);
    358 
    359    s7 = vld1q_u8(src_ptr + 112);
    360    r7 = vld1q_u8(ref_ptr + 112);
    361    p7 = vld1q_u8(second_pred + 112);
    362    avg7 = vrhaddq_u8(r7, p7);
    363    diff7 = vabdq_u8(s7, avg7);
    364    sum[7] = vpadalq_u8(sum[7], diff7);
    365 
    366    src_ptr += src_stride;
    367    ref_ptr += ref_stride;
    368    second_pred += 128;
    369  } while (--i != 0);
    370 
    371  uint32x4_t sum_u32 = vpaddlq_u16(sum[0]);
    372  sum_u32 = vpadalq_u16(sum_u32, sum[1]);
    373  sum_u32 = vpadalq_u16(sum_u32, sum[2]);
    374  sum_u32 = vpadalq_u16(sum_u32, sum[3]);
    375  sum_u32 = vpadalq_u16(sum_u32, sum[4]);
    376  sum_u32 = vpadalq_u16(sum_u32, sum[5]);
    377  sum_u32 = vpadalq_u16(sum_u32, sum[6]);
    378  sum_u32 = vpadalq_u16(sum_u32, sum[7]);
    379 
    380  return horizontal_add_u32x4(sum_u32);
    381 }
    382 
    383 static inline unsigned int sad64xh_avg_neon(const uint8_t *src_ptr,
    384                                            int src_stride,
    385                                            const uint8_t *ref_ptr,
    386                                            int ref_stride, int h,
    387                                            const uint8_t *second_pred) {
    388  uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
    389                        vdupq_n_u16(0) };
    390 
    391  int i = h;
    392  do {
    393    uint8x16_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3;
    394    uint8x16_t avg0, avg1, avg2, avg3, diff0, diff1, diff2, diff3;
    395 
    396    s0 = vld1q_u8(src_ptr);
    397    r0 = vld1q_u8(ref_ptr);
    398    p0 = vld1q_u8(second_pred);
    399    avg0 = vrhaddq_u8(r0, p0);
    400    diff0 = vabdq_u8(s0, avg0);
    401    sum[0] = vpadalq_u8(sum[0], diff0);
    402 
    403    s1 = vld1q_u8(src_ptr + 16);
    404    r1 = vld1q_u8(ref_ptr + 16);
    405    p1 = vld1q_u8(second_pred + 16);
    406    avg1 = vrhaddq_u8(r1, p1);
    407    diff1 = vabdq_u8(s1, avg1);
    408    sum[1] = vpadalq_u8(sum[1], diff1);
    409 
    410    s2 = vld1q_u8(src_ptr + 32);
    411    r2 = vld1q_u8(ref_ptr + 32);
    412    p2 = vld1q_u8(second_pred + 32);
    413    avg2 = vrhaddq_u8(r2, p2);
    414    diff2 = vabdq_u8(s2, avg2);
    415    sum[2] = vpadalq_u8(sum[2], diff2);
    416 
    417    s3 = vld1q_u8(src_ptr + 48);
    418    r3 = vld1q_u8(ref_ptr + 48);
    419    p3 = vld1q_u8(second_pred + 48);
    420    avg3 = vrhaddq_u8(r3, p3);
    421    diff3 = vabdq_u8(s3, avg3);
    422    sum[3] = vpadalq_u8(sum[3], diff3);
    423 
    424    src_ptr += src_stride;
    425    ref_ptr += ref_stride;
    426    second_pred += 64;
    427  } while (--i != 0);
    428 
    429  uint32x4_t sum_u32 = vpaddlq_u16(sum[0]);
    430  sum_u32 = vpadalq_u16(sum_u32, sum[1]);
    431  sum_u32 = vpadalq_u16(sum_u32, sum[2]);
    432  sum_u32 = vpadalq_u16(sum_u32, sum[3]);
    433 
    434  return horizontal_add_u32x4(sum_u32);
    435 }
    436 
    437 static inline unsigned int sad32xh_avg_neon(const uint8_t *src_ptr,
    438                                            int src_stride,
    439                                            const uint8_t *ref_ptr,
    440                                            int ref_stride, int h,
    441                                            const uint8_t *second_pred) {
    442  uint16x8_t sum[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
    443 
    444  int i = h;
    445  do {
    446    uint8x16_t s0 = vld1q_u8(src_ptr);
    447    uint8x16_t r0 = vld1q_u8(ref_ptr);
    448    uint8x16_t p0 = vld1q_u8(second_pred);
    449    uint8x16_t avg0 = vrhaddq_u8(r0, p0);
    450    uint8x16_t diff0 = vabdq_u8(s0, avg0);
    451    sum[0] = vpadalq_u8(sum[0], diff0);
    452 
    453    uint8x16_t s1 = vld1q_u8(src_ptr + 16);
    454    uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
    455    uint8x16_t p1 = vld1q_u8(second_pred + 16);
    456    uint8x16_t avg1 = vrhaddq_u8(r1, p1);
    457    uint8x16_t diff1 = vabdq_u8(s1, avg1);
    458    sum[1] = vpadalq_u8(sum[1], diff1);
    459 
    460    src_ptr += src_stride;
    461    ref_ptr += ref_stride;
    462    second_pred += 32;
    463  } while (--i != 0);
    464 
    465  return horizontal_add_u16x8(vaddq_u16(sum[0], sum[1]));
    466 }
    467 
    468 static inline unsigned int sad16xh_avg_neon(const uint8_t *src_ptr,
    469                                            int src_stride,
    470                                            const uint8_t *ref_ptr,
    471                                            int ref_stride, int h,
    472                                            const uint8_t *second_pred) {
    473  uint16x8_t sum = vdupq_n_u16(0);
    474 
    475  int i = h;
    476  do {
    477    uint8x16_t s = vld1q_u8(src_ptr);
    478    uint8x16_t r = vld1q_u8(ref_ptr);
    479    uint8x16_t p = vld1q_u8(second_pred);
    480 
    481    uint8x16_t avg = vrhaddq_u8(r, p);
    482    uint8x16_t diff = vabdq_u8(s, avg);
    483    sum = vpadalq_u8(sum, diff);
    484 
    485    src_ptr += src_stride;
    486    ref_ptr += ref_stride;
    487    second_pred += 16;
    488  } while (--i != 0);
    489 
    490  return horizontal_add_u16x8(sum);
    491 }
    492 
    493 static inline unsigned int sad8xh_avg_neon(const uint8_t *src_ptr,
    494                                           int src_stride,
    495                                           const uint8_t *ref_ptr,
    496                                           int ref_stride, int h,
    497                                           const uint8_t *second_pred) {
    498  uint16x8_t sum = vdupq_n_u16(0);
    499 
    500  int i = h;
    501  do {
    502    uint8x8_t s = vld1_u8(src_ptr);
    503    uint8x8_t r = vld1_u8(ref_ptr);
    504    uint8x8_t p = vld1_u8(second_pred);
    505 
    506    uint8x8_t avg = vrhadd_u8(r, p);
    507    sum = vabal_u8(sum, s, avg);
    508 
    509    src_ptr += src_stride;
    510    ref_ptr += ref_stride;
    511    second_pred += 8;
    512  } while (--i != 0);
    513 
    514  return horizontal_add_u16x8(sum);
    515 }
    516 
// Instantiates aom_sad<w>x<h>_avg_neon(): compound-prediction SAD for a
// w-by-h block, where the reference is first averaged (with rounding) against
// the packed second_pred buffer before differencing with the source.
#define SAD_WXH_AVG_NEON(w, h)                                                 \
 unsigned int aom_sad##w##x##h##_avg_neon(const uint8_t *src, int src_stride, \
                                          const uint8_t *ref, int ref_stride, \
                                          const uint8_t *second_pred) {       \
   return sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h),          \
                              second_pred);                                   \
 }

SAD_WXH_AVG_NEON(8, 8)
SAD_WXH_AVG_NEON(8, 16)

SAD_WXH_AVG_NEON(16, 8)
SAD_WXH_AVG_NEON(16, 16)
SAD_WXH_AVG_NEON(16, 32)

SAD_WXH_AVG_NEON(32, 16)
SAD_WXH_AVG_NEON(32, 32)
SAD_WXH_AVG_NEON(32, 64)

SAD_WXH_AVG_NEON(64, 32)
SAD_WXH_AVG_NEON(64, 64)
SAD_WXH_AVG_NEON(64, 128)

SAD_WXH_AVG_NEON(128, 64)
SAD_WXH_AVG_NEON(128, 128)

#if !CONFIG_REALTIME_ONLY
SAD_WXH_AVG_NEON(8, 32)
SAD_WXH_AVG_NEON(16, 64)
SAD_WXH_AVG_NEON(32, 8)
SAD_WXH_AVG_NEON(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

#undef SAD_WXH_AVG_NEON