tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

sadxd_neon.c (17679B)


      1 /*
      2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
#include <arm_neon.h>
#include <assert.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"
     20 
     21 static inline void sad16_neon(uint8x16_t src, uint8x16_t ref,
     22                              uint16x8_t *const sad_sum) {
     23  uint8x16_t abs_diff = vabdq_u8(src, ref);
     24  *sad_sum = vpadalq_u8(*sad_sum, abs_diff);
     25 }
     26 
     27 static inline void sadwxhx3d_large_neon(const uint8_t *src, int src_stride,
     28                                        const uint8_t *const ref[3],
     29                                        int ref_stride, uint32_t res[3], int w,
     30                                        int h, int h_overflow) {
     31  uint32x4_t sum[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
     32  int h_limit = h > h_overflow ? h_overflow : h;
     33 
     34  int ref_offset = 0;
     35  int i = 0;
     36  do {
     37    uint16x8_t sum_lo[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
     38    uint16x8_t sum_hi[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
     39 
     40    do {
     41      int j = 0;
     42      do {
     43        const uint8x16_t s0 = vld1q_u8(src + j);
     44        sad16_neon(s0, vld1q_u8(ref[0] + ref_offset + j), &sum_lo[0]);
     45        sad16_neon(s0, vld1q_u8(ref[1] + ref_offset + j), &sum_lo[1]);
     46        sad16_neon(s0, vld1q_u8(ref[2] + ref_offset + j), &sum_lo[2]);
     47 
     48        const uint8x16_t s1 = vld1q_u8(src + j + 16);
     49        sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + j + 16), &sum_hi[0]);
     50        sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + j + 16), &sum_hi[1]);
     51        sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + j + 16), &sum_hi[2]);
     52 
     53        j += 32;
     54      } while (j < w);
     55 
     56      src += src_stride;
     57      ref_offset += ref_stride;
     58    } while (++i < h_limit);
     59 
     60    sum[0] = vpadalq_u16(sum[0], sum_lo[0]);
     61    sum[0] = vpadalq_u16(sum[0], sum_hi[0]);
     62    sum[1] = vpadalq_u16(sum[1], sum_lo[1]);
     63    sum[1] = vpadalq_u16(sum[1], sum_hi[1]);
     64    sum[2] = vpadalq_u16(sum[2], sum_lo[2]);
     65    sum[2] = vpadalq_u16(sum[2], sum_hi[2]);
     66 
     67    h_limit += h_overflow;
     68  } while (i < h);
     69 
     70  res[0] = horizontal_add_u32x4(sum[0]);
     71  res[1] = horizontal_add_u32x4(sum[1]);
     72  res[2] = horizontal_add_u32x4(sum[2]);
     73 }
     74 
     75 static inline void sad128xhx3d_neon(const uint8_t *src, int src_stride,
     76                                    const uint8_t *const ref[3], int ref_stride,
     77                                    uint32_t res[3], int h) {
     78  sadwxhx3d_large_neon(src, src_stride, ref, ref_stride, res, 128, h, 32);
     79 }
     80 
     81 static inline void sad64xhx3d_neon(const uint8_t *src, int src_stride,
     82                                   const uint8_t *const ref[3], int ref_stride,
     83                                   uint32_t res[3], int h) {
     84  sadwxhx3d_large_neon(src, src_stride, ref, ref_stride, res, 64, h, 64);
     85 }
     86 
     87 static inline void sad32xhx3d_neon(const uint8_t *src, int src_stride,
     88                                   const uint8_t *const ref[3], int ref_stride,
     89                                   uint32_t res[3], int h) {
     90  uint16x8_t sum_lo[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
     91  uint16x8_t sum_hi[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
     92 
     93  int ref_offset = 0;
     94  int i = h;
     95  do {
     96    const uint8x16_t s0 = vld1q_u8(src);
     97    sad16_neon(s0, vld1q_u8(ref[0] + ref_offset), &sum_lo[0]);
     98    sad16_neon(s0, vld1q_u8(ref[1] + ref_offset), &sum_lo[1]);
     99    sad16_neon(s0, vld1q_u8(ref[2] + ref_offset), &sum_lo[2]);
    100 
    101    const uint8x16_t s1 = vld1q_u8(src + 16);
    102    sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + 16), &sum_hi[0]);
    103    sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + 16), &sum_hi[1]);
    104    sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + 16), &sum_hi[2]);
    105 
    106    src += src_stride;
    107    ref_offset += ref_stride;
    108  } while (--i != 0);
    109 
    110  res[0] = horizontal_long_add_u16x8(sum_lo[0], sum_hi[0]);
    111  res[1] = horizontal_long_add_u16x8(sum_lo[1], sum_hi[1]);
    112  res[2] = horizontal_long_add_u16x8(sum_lo[2], sum_hi[2]);
    113 }
    114 
    115 static inline void sad16xhx3d_neon(const uint8_t *src, int src_stride,
    116                                   const uint8_t *const ref[3], int ref_stride,
    117                                   uint32_t res[3], int h) {
    118  uint16x8_t sum[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
    119 
    120  int ref_offset = 0;
    121  int i = h;
    122  do {
    123    const uint8x16_t s = vld1q_u8(src);
    124    sad16_neon(s, vld1q_u8(ref[0] + ref_offset), &sum[0]);
    125    sad16_neon(s, vld1q_u8(ref[1] + ref_offset), &sum[1]);
    126    sad16_neon(s, vld1q_u8(ref[2] + ref_offset), &sum[2]);
    127 
    128    src += src_stride;
    129    ref_offset += ref_stride;
    130  } while (--i != 0);
    131 
    132  res[0] = horizontal_add_u16x8(sum[0]);
    133  res[1] = horizontal_add_u16x8(sum[1]);
    134  res[2] = horizontal_add_u16x8(sum[2]);
    135 }
    136 
    137 static inline void sad8xhx3d_neon(const uint8_t *src, int src_stride,
    138                                  const uint8_t *const ref[3], int ref_stride,
    139                                  uint32_t res[3], int h) {
    140  uint16x8_t sum[3];
    141 
    142  uint8x8_t s = vld1_u8(src);
    143  sum[0] = vabdl_u8(s, vld1_u8(ref[0]));
    144  sum[1] = vabdl_u8(s, vld1_u8(ref[1]));
    145  sum[2] = vabdl_u8(s, vld1_u8(ref[2]));
    146 
    147  src += src_stride;
    148  int ref_offset = ref_stride;
    149  int i = h - 1;
    150  do {
    151    s = vld1_u8(src);
    152    sum[0] = vabal_u8(sum[0], s, vld1_u8(ref[0] + ref_offset));
    153    sum[1] = vabal_u8(sum[1], s, vld1_u8(ref[1] + ref_offset));
    154    sum[2] = vabal_u8(sum[2], s, vld1_u8(ref[2] + ref_offset));
    155 
    156    src += src_stride;
    157    ref_offset += ref_stride;
    158  } while (--i != 0);
    159 
    160  res[0] = horizontal_add_u16x8(sum[0]);
    161  res[1] = horizontal_add_u16x8(sum[1]);
    162  res[2] = horizontal_add_u16x8(sum[2]);
    163 }
    164 
    165 static inline void sad4xhx3d_neon(const uint8_t *src, int src_stride,
    166                                  const uint8_t *const ref[3], int ref_stride,
    167                                  uint32_t res[3], int h) {
    168  assert(h % 2 == 0);
    169  uint16x8_t sum[3];
    170 
    171  uint8x8_t s = load_unaligned_u8(src, src_stride);
    172  uint8x8_t r0 = load_unaligned_u8(ref[0], ref_stride);
    173  uint8x8_t r1 = load_unaligned_u8(ref[1], ref_stride);
    174  uint8x8_t r2 = load_unaligned_u8(ref[2], ref_stride);
    175 
    176  sum[0] = vabdl_u8(s, r0);
    177  sum[1] = vabdl_u8(s, r1);
    178  sum[2] = vabdl_u8(s, r2);
    179 
    180  src += 2 * src_stride;
    181  int ref_offset = 2 * ref_stride;
    182  int i = (h / 2) - 1;
    183  do {
    184    s = load_unaligned_u8(src, src_stride);
    185    r0 = load_unaligned_u8(ref[0] + ref_offset, ref_stride);
    186    r1 = load_unaligned_u8(ref[1] + ref_offset, ref_stride);
    187    r2 = load_unaligned_u8(ref[2] + ref_offset, ref_stride);
    188 
    189    sum[0] = vabal_u8(sum[0], s, r0);
    190    sum[1] = vabal_u8(sum[1], s, r1);
    191    sum[2] = vabal_u8(sum[2], s, r2);
    192 
    193    src += 2 * src_stride;
    194    ref_offset += 2 * ref_stride;
    195  } while (--i != 0);
    196 
    197  res[0] = horizontal_add_u16x8(sum[0]);
    198  res[1] = horizontal_add_u16x8(sum[1]);
    199  res[2] = horizontal_add_u16x8(sum[2]);
    200 }
    201 
    202 #define SAD_WXH_3D_NEON(w, h)                                                  \
    203  void aom_sad##w##x##h##x3d_neon(const uint8_t *src, int src_stride,          \
    204                                  const uint8_t *const ref[4], int ref_stride, \
    205                                  uint32_t res[4]) {                           \
    206    sad##w##xhx3d_neon(src, src_stride, ref, ref_stride, res, (h));            \
    207  }
    208 
    209 SAD_WXH_3D_NEON(4, 4)
    210 SAD_WXH_3D_NEON(4, 8)
    211 
    212 SAD_WXH_3D_NEON(8, 4)
    213 SAD_WXH_3D_NEON(8, 8)
    214 SAD_WXH_3D_NEON(8, 16)
    215 
    216 SAD_WXH_3D_NEON(16, 8)
    217 SAD_WXH_3D_NEON(16, 16)
    218 SAD_WXH_3D_NEON(16, 32)
    219 
    220 SAD_WXH_3D_NEON(32, 16)
    221 SAD_WXH_3D_NEON(32, 32)
    222 SAD_WXH_3D_NEON(32, 64)
    223 
    224 SAD_WXH_3D_NEON(64, 32)
    225 SAD_WXH_3D_NEON(64, 64)
    226 SAD_WXH_3D_NEON(64, 128)
    227 
    228 SAD_WXH_3D_NEON(128, 64)
    229 SAD_WXH_3D_NEON(128, 128)
    230 
    231 #if !CONFIG_REALTIME_ONLY
    232 SAD_WXH_3D_NEON(4, 16)
    233 SAD_WXH_3D_NEON(8, 32)
    234 SAD_WXH_3D_NEON(16, 4)
    235 SAD_WXH_3D_NEON(16, 64)
    236 SAD_WXH_3D_NEON(32, 8)
    237 SAD_WXH_3D_NEON(64, 16)
    238 #endif  // !CONFIG_REALTIME_ONLY
    239 
    240 #undef SAD_WXH_3D_NEON
    241 
    242 static inline void sadwxhx4d_large_neon(const uint8_t *src, int src_stride,
    243                                        const uint8_t *const ref[4],
    244                                        int ref_stride, uint32_t res[4], int w,
    245                                        int h, int h_overflow) {
    246  uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
    247                        vdupq_n_u32(0) };
    248  int h_limit = h > h_overflow ? h_overflow : h;
    249 
    250  int ref_offset = 0;
    251  int i = 0;
    252  do {
    253    uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
    254                             vdupq_n_u16(0) };
    255    uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
    256                             vdupq_n_u16(0) };
    257 
    258    do {
    259      int j = 0;
    260      do {
    261        const uint8x16_t s0 = vld1q_u8(src + j);
    262        sad16_neon(s0, vld1q_u8(ref[0] + ref_offset + j), &sum_lo[0]);
    263        sad16_neon(s0, vld1q_u8(ref[1] + ref_offset + j), &sum_lo[1]);
    264        sad16_neon(s0, vld1q_u8(ref[2] + ref_offset + j), &sum_lo[2]);
    265        sad16_neon(s0, vld1q_u8(ref[3] + ref_offset + j), &sum_lo[3]);
    266 
    267        const uint8x16_t s1 = vld1q_u8(src + j + 16);
    268        sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + j + 16), &sum_hi[0]);
    269        sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + j + 16), &sum_hi[1]);
    270        sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + j + 16), &sum_hi[2]);
    271        sad16_neon(s1, vld1q_u8(ref[3] + ref_offset + j + 16), &sum_hi[3]);
    272 
    273        j += 32;
    274      } while (j < w);
    275 
    276      src += src_stride;
    277      ref_offset += ref_stride;
    278    } while (++i < h_limit);
    279 
    280    sum[0] = vpadalq_u16(sum[0], sum_lo[0]);
    281    sum[0] = vpadalq_u16(sum[0], sum_hi[0]);
    282    sum[1] = vpadalq_u16(sum[1], sum_lo[1]);
    283    sum[1] = vpadalq_u16(sum[1], sum_hi[1]);
    284    sum[2] = vpadalq_u16(sum[2], sum_lo[2]);
    285    sum[2] = vpadalq_u16(sum[2], sum_hi[2]);
    286    sum[3] = vpadalq_u16(sum[3], sum_lo[3]);
    287    sum[3] = vpadalq_u16(sum[3], sum_hi[3]);
    288 
    289    h_limit += h_overflow;
    290  } while (i < h);
    291 
    292  vst1q_u32(res, horizontal_add_4d_u32x4(sum));
    293 }
    294 
    295 static inline void sad128xhx4d_neon(const uint8_t *src, int src_stride,
    296                                    const uint8_t *const ref[4], int ref_stride,
    297                                    uint32_t res[4], int h) {
    298  sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, res, 128, h, 32);
    299 }
    300 
    301 static inline void sad64xhx4d_neon(const uint8_t *src, int src_stride,
    302                                   const uint8_t *const ref[4], int ref_stride,
    303                                   uint32_t res[4], int h) {
    304  sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, res, 64, h, 64);
    305 }
    306 
    307 static inline void sad32xhx4d_neon(const uint8_t *src, int src_stride,
    308                                   const uint8_t *const ref[4], int ref_stride,
    309                                   uint32_t res[4], int h) {
    310  uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
    311                           vdupq_n_u16(0) };
    312  uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
    313                           vdupq_n_u16(0) };
    314 
    315  int ref_offset = 0;
    316  int i = h;
    317  do {
    318    const uint8x16_t s0 = vld1q_u8(src);
    319    sad16_neon(s0, vld1q_u8(ref[0] + ref_offset), &sum_lo[0]);
    320    sad16_neon(s0, vld1q_u8(ref[1] + ref_offset), &sum_lo[1]);
    321    sad16_neon(s0, vld1q_u8(ref[2] + ref_offset), &sum_lo[2]);
    322    sad16_neon(s0, vld1q_u8(ref[3] + ref_offset), &sum_lo[3]);
    323 
    324    const uint8x16_t s1 = vld1q_u8(src + 16);
    325    sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + 16), &sum_hi[0]);
    326    sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + 16), &sum_hi[1]);
    327    sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + 16), &sum_hi[2]);
    328    sad16_neon(s1, vld1q_u8(ref[3] + ref_offset + 16), &sum_hi[3]);
    329 
    330    src += src_stride;
    331    ref_offset += ref_stride;
    332  } while (--i != 0);
    333 
    334  vst1q_u32(res, horizontal_long_add_4d_u16x8(sum_lo, sum_hi));
    335 }
    336 
    337 static inline void sad16xhx4d_neon(const uint8_t *src, int src_stride,
    338                                   const uint8_t *const ref[4], int ref_stride,
    339                                   uint32_t res[4], int h) {
    340  uint16x8_t sum_u16[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
    341                            vdupq_n_u16(0) };
    342  uint32x4_t sum_u32[4];
    343 
    344  int ref_offset = 0;
    345  int i = h;
    346  do {
    347    const uint8x16_t s = vld1q_u8(src);
    348    sad16_neon(s, vld1q_u8(ref[0] + ref_offset), &sum_u16[0]);
    349    sad16_neon(s, vld1q_u8(ref[1] + ref_offset), &sum_u16[1]);
    350    sad16_neon(s, vld1q_u8(ref[2] + ref_offset), &sum_u16[2]);
    351    sad16_neon(s, vld1q_u8(ref[3] + ref_offset), &sum_u16[3]);
    352 
    353    src += src_stride;
    354    ref_offset += ref_stride;
    355  } while (--i != 0);
    356 
    357  sum_u32[0] = vpaddlq_u16(sum_u16[0]);
    358  sum_u32[1] = vpaddlq_u16(sum_u16[1]);
    359  sum_u32[2] = vpaddlq_u16(sum_u16[2]);
    360  sum_u32[3] = vpaddlq_u16(sum_u16[3]);
    361 
    362  vst1q_u32(res, horizontal_add_4d_u32x4(sum_u32));
    363 }
    364 
    365 static inline void sad8xhx4d_neon(const uint8_t *src, int src_stride,
    366                                  const uint8_t *const ref[4], int ref_stride,
    367                                  uint32_t res[4], int h) {
    368  uint16x8_t sum[4];
    369 
    370  uint8x8_t s = vld1_u8(src);
    371  sum[0] = vabdl_u8(s, vld1_u8(ref[0]));
    372  sum[1] = vabdl_u8(s, vld1_u8(ref[1]));
    373  sum[2] = vabdl_u8(s, vld1_u8(ref[2]));
    374  sum[3] = vabdl_u8(s, vld1_u8(ref[3]));
    375 
    376  src += src_stride;
    377  int ref_offset = ref_stride;
    378  int i = h - 1;
    379  do {
    380    s = vld1_u8(src);
    381    sum[0] = vabal_u8(sum[0], s, vld1_u8(ref[0] + ref_offset));
    382    sum[1] = vabal_u8(sum[1], s, vld1_u8(ref[1] + ref_offset));
    383    sum[2] = vabal_u8(sum[2], s, vld1_u8(ref[2] + ref_offset));
    384    sum[3] = vabal_u8(sum[3], s, vld1_u8(ref[3] + ref_offset));
    385 
    386    src += src_stride;
    387    ref_offset += ref_stride;
    388  } while (--i != 0);
    389 
    390  vst1q_u32(res, horizontal_add_4d_u16x8(sum));
    391 }
    392 
    393 static inline void sad4xhx4d_neon(const uint8_t *src, int src_stride,
    394                                  const uint8_t *const ref[4], int ref_stride,
    395                                  uint32_t res[4], int h) {
    396  uint16x8_t sum[4];
    397 
    398  uint8x8_t s = load_unaligned_u8(src, src_stride);
    399  uint8x8_t r0 = load_unaligned_u8(ref[0], ref_stride);
    400  uint8x8_t r1 = load_unaligned_u8(ref[1], ref_stride);
    401  uint8x8_t r2 = load_unaligned_u8(ref[2], ref_stride);
    402  uint8x8_t r3 = load_unaligned_u8(ref[3], ref_stride);
    403 
    404  sum[0] = vabdl_u8(s, r0);
    405  sum[1] = vabdl_u8(s, r1);
    406  sum[2] = vabdl_u8(s, r2);
    407  sum[3] = vabdl_u8(s, r3);
    408 
    409  src += 2 * src_stride;
    410  int ref_offset = 2 * ref_stride;
    411  int i = h / 2;
    412  while (--i != 0) {
    413    s = load_unaligned_u8(src, src_stride);
    414    r0 = load_unaligned_u8(ref[0] + ref_offset, ref_stride);
    415    r1 = load_unaligned_u8(ref[1] + ref_offset, ref_stride);
    416    r2 = load_unaligned_u8(ref[2] + ref_offset, ref_stride);
    417    r3 = load_unaligned_u8(ref[3] + ref_offset, ref_stride);
    418 
    419    sum[0] = vabal_u8(sum[0], s, r0);
    420    sum[1] = vabal_u8(sum[1], s, r1);
    421    sum[2] = vabal_u8(sum[2], s, r2);
    422    sum[3] = vabal_u8(sum[3], s, r3);
    423 
    424    src += 2 * src_stride;
    425    ref_offset += 2 * ref_stride;
    426  }
    427 
    428  vst1q_u32(res, horizontal_add_4d_u16x8(sum));
    429 }
    430 
    431 #define SAD_WXH_4D_NEON(w, h)                                                  \
    432  void aom_sad##w##x##h##x4d_neon(const uint8_t *src, int src_stride,          \
    433                                  const uint8_t *const ref[4], int ref_stride, \
    434                                  uint32_t res[4]) {                           \
    435    sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, res, (h));            \
    436  }
    437 
    438 SAD_WXH_4D_NEON(4, 4)
    439 SAD_WXH_4D_NEON(4, 8)
    440 
    441 SAD_WXH_4D_NEON(8, 4)
    442 SAD_WXH_4D_NEON(8, 8)
    443 SAD_WXH_4D_NEON(8, 16)
    444 
    445 SAD_WXH_4D_NEON(16, 8)
    446 SAD_WXH_4D_NEON(16, 16)
    447 SAD_WXH_4D_NEON(16, 32)
    448 
    449 SAD_WXH_4D_NEON(32, 16)
    450 SAD_WXH_4D_NEON(32, 32)
    451 SAD_WXH_4D_NEON(32, 64)
    452 
    453 SAD_WXH_4D_NEON(64, 32)
    454 SAD_WXH_4D_NEON(64, 64)
    455 SAD_WXH_4D_NEON(64, 128)
    456 
    457 SAD_WXH_4D_NEON(128, 64)
    458 SAD_WXH_4D_NEON(128, 128)
    459 
    460 #if !CONFIG_REALTIME_ONLY
    461 SAD_WXH_4D_NEON(4, 16)
    462 SAD_WXH_4D_NEON(8, 32)
    463 SAD_WXH_4D_NEON(16, 4)
    464 SAD_WXH_4D_NEON(16, 64)
    465 SAD_WXH_4D_NEON(32, 8)
    466 SAD_WXH_4D_NEON(64, 16)
    467 #endif  // !CONFIG_REALTIME_ONLY
    468 
    469 #undef SAD_WXH_4D_NEON
    470 
    471 #define SAD_SKIP_WXH_4D_NEON(w, h)                                          \
    472  void aom_sad_skip_##w##x##h##x4d_neon(const uint8_t *src, int src_stride, \
    473                                        const uint8_t *const ref[4],        \
    474                                        int ref_stride, uint32_t res[4]) {  \
    475    sad##w##xhx4d_neon(src, 2 * src_stride, ref, 2 * ref_stride, res,       \
    476                       ((h) >> 1));                                         \
    477    res[0] <<= 1;                                                           \
    478    res[1] <<= 1;                                                           \
    479    res[2] <<= 1;                                                           \
    480    res[3] <<= 1;                                                           \
    481  }
    482 
    483 SAD_SKIP_WXH_4D_NEON(8, 16)
    484 
    485 SAD_SKIP_WXH_4D_NEON(16, 16)
    486 SAD_SKIP_WXH_4D_NEON(16, 32)
    487 
    488 SAD_SKIP_WXH_4D_NEON(32, 16)
    489 SAD_SKIP_WXH_4D_NEON(32, 32)
    490 SAD_SKIP_WXH_4D_NEON(32, 64)
    491 
    492 SAD_SKIP_WXH_4D_NEON(64, 32)
    493 SAD_SKIP_WXH_4D_NEON(64, 64)
    494 SAD_SKIP_WXH_4D_NEON(64, 128)
    495 
    496 SAD_SKIP_WXH_4D_NEON(128, 64)
    497 SAD_SKIP_WXH_4D_NEON(128, 128)
    498 
    499 #if !CONFIG_REALTIME_ONLY
    500 SAD_SKIP_WXH_4D_NEON(4, 16)
    501 SAD_SKIP_WXH_4D_NEON(8, 32)
    502 SAD_SKIP_WXH_4D_NEON(16, 64)
    503 SAD_SKIP_WXH_4D_NEON(64, 16)
    504 #endif  // !CONFIG_REALTIME_ONLY
    505 
    506 #undef SAD_SKIP_WXH_4D_NEON