tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

intrapred_neon.c (134752B)


      1 /*
      2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <arm_neon.h>
     13 #include <assert.h>
     14 #include <stdint.h>
     15 
     16 #include "config/aom_config.h"
     17 #include "config/aom_dsp_rtcd.h"
     18 #include "config/av1_rtcd.h"
     19 
     20 #include "aom/aom_integer.h"
     21 #include "aom_dsp/arm/mem_neon.h"
     22 #include "aom_dsp/arm/reinterpret_neon.h"
     23 #include "aom_dsp/arm/sum_neon.h"
     24 #include "aom_dsp/arm/transpose_neon.h"
     25 #include "aom_dsp/intrapred_common.h"
     26 
     27 //------------------------------------------------------------------------------
     28 // DC 4x4
     29 
     30 static inline uint16x8_t dc_load_sum_4(const uint8_t *in) {
     31  const uint8x8_t a = load_u8_4x1(in);
     32  const uint16x4_t p0 = vpaddl_u8(a);
     33  const uint16x4_t p1 = vpadd_u16(p0, p0);
     34  return vcombine_u16(p1, vdup_n_u16(0));
     35 }
     36 
     37 static inline void dc_store_4xh(uint8_t *dst, ptrdiff_t stride, int h,
     38                                uint8x8_t dc) {
     39  for (int i = 0; i < h; ++i) {
     40    store_u8_4x1(dst + i * stride, dc);
     41  }
     42 }
     43 
// 4x4 DC prediction: fill the block with the rounded average of the 4 above
// and 4 left neighbour pixels, i.e. (sum + 4) >> 3.
void aom_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint16x8_t sum_top = dc_load_sum_4(above);
  const uint16x8_t sum_left = dc_load_sum_4(left);
  const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
  // Rounding narrow shift by 3 divides the 8-pixel total by 8.
  const uint8x8_t dc0 = vrshrn_n_u16(sum, 3);
  dc_store_4xh(dst, stride, 4, vdup_lane_u8(dc0, 0));
}

// DC_LEFT: rounded average of the 4 left neighbours only ((sum + 2) >> 2).
void aom_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const uint16x8_t sum_left = dc_load_sum_4(left);
  const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 2);
  (void)above;
  dc_store_4xh(dst, stride, 4, vdup_lane_u8(dc0, 0));
}

// DC_TOP: rounded average of the 4 above neighbours only ((sum + 2) >> 2).
void aom_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const uint16x8_t sum_top = dc_load_sum_4(above);
  const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 2);
  (void)left;
  dc_store_4xh(dst, stride, 4, vdup_lane_u8(dc0, 0));
}

// DC_128: neither edge is available; fill with the mid-grey value 128.
void aom_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const uint8x8_t dc0 = vdup_n_u8(0x80);
  (void)above;
  (void)left;
  dc_store_4xh(dst, stride, 4, dc0);
}
     76 
     77 //------------------------------------------------------------------------------
     78 // DC 8x8
     79 
     80 static inline uint16x8_t dc_load_sum_8(const uint8_t *in) {
     81  // This isn't used in the case where we want to load both above and left
     82  // vectors, since we want to avoid performing the reduction twice.
     83  const uint8x8_t a = vld1_u8(in);
     84  const uint16x4_t p0 = vpaddl_u8(a);
     85  const uint16x4_t p1 = vpadd_u16(p0, p0);
     86  const uint16x4_t p2 = vpadd_u16(p1, p1);
     87  return vcombine_u16(p2, vdup_n_u16(0));
     88 }
     89 
// Sum all eight u16 lanes of `a` and return the total broadcast to every
// lane of the result.
static inline uint16x8_t horizontal_add_and_broadcast_u16x8(uint16x8_t a) {
#if AOM_ARCH_AARCH64
  // On AArch64 we could also use vdupq_n_u16(vaddvq_u16(a)) here to save an
  // instruction, however the addv instruction is usually slightly more
  // expensive than a pairwise addition, so the need for immediately
  // broadcasting the result again seems to negate any benefit.
  const uint16x8_t b = vpaddq_u16(a, a);
  const uint16x8_t c = vpaddq_u16(b, b);
  return vpaddq_u16(c, c);
#else
  // Armv7 lacks 128-bit pairwise adds: fold to 64 bits first, then reduce.
  const uint16x4_t b = vadd_u16(vget_low_u16(a), vget_high_u16(a));
  const uint16x4_t c = vpadd_u16(b, b);
  const uint16x4_t d = vpadd_u16(c, c);
  return vcombine_u16(d, d);
#endif
}
    106 
    107 static inline void dc_store_8xh(uint8_t *dst, ptrdiff_t stride, int h,
    108                                uint8x8_t dc) {
    109  for (int i = 0; i < h; ++i) {
    110    vst1_u8(dst + i * stride, dc);
    111  }
    112 }
    113 
// 8x8 DC prediction: rounded average of the 8 above + 8 left neighbours.
void aom_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x8_t sum_top = vld1_u8(above);
  const uint8x8_t sum_left = vld1_u8(left);
  uint16x8_t sum = vaddl_u8(sum_left, sum_top);
  sum = horizontal_add_and_broadcast_u16x8(sum);
  // (sum + 8) >> 4: divide the 16-pixel total by 16 with rounding.
  const uint8x8_t dc0 = vrshrn_n_u16(sum, 4);
  dc_store_8xh(dst, stride, 8, vdup_lane_u8(dc0, 0));
}

// DC_LEFT: rounded average of the 8 left neighbours only.
void aom_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const uint16x8_t sum_left = dc_load_sum_8(left);
  const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 3);
  (void)above;
  dc_store_8xh(dst, stride, 8, vdup_lane_u8(dc0, 0));
}

// DC_TOP: rounded average of the 8 above neighbours only.
void aom_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const uint16x8_t sum_top = dc_load_sum_8(above);
  const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 3);
  (void)left;
  dc_store_8xh(dst, stride, 8, vdup_lane_u8(dc0, 0));
}

// DC_128: neither edge is available; fill with mid-grey (128).
void aom_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const uint8x8_t dc0 = vdup_n_u8(0x80);
  (void)above;
  (void)left;
  dc_store_8xh(dst, stride, 8, dc0);
}
    147 
    148 //------------------------------------------------------------------------------
    149 // DC 16x16
    150 
// Pairwise-add the 16 bytes at `in` down to eight u16 partial sums.
static inline uint16x8_t dc_load_partial_sum_16(const uint8_t *in) {
  const uint8x16_t a = vld1q_u8(in);
  // delay the remainder of the reduction until
  // horizontal_add_and_broadcast_u16x8, since we want to do it once rather
  // than twice in the case we are loading both above and left.
  return vpaddlq_u8(a);
}

// Fully reduce the 16 bytes at `in`; the total is broadcast to every lane.
static inline uint16x8_t dc_load_sum_16(const uint8_t *in) {
  return horizontal_add_and_broadcast_u16x8(dc_load_partial_sum_16(in));
}

// Write the 16-byte vector `dc` to each of `h` consecutive rows of `dst`.
static inline void dc_store_16xh(uint8_t *dst, ptrdiff_t stride, int h,
                                 uint8x16_t dc) {
  for (int i = 0; i < h; ++i) {
    vst1q_u8(dst + i * stride, dc);
  }
}
    169 
// 16x16 DC prediction: rounded average of the 16 above + 16 left neighbours.
void aom_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint16x8_t sum_top = dc_load_partial_sum_16(above);
  const uint16x8_t sum_left = dc_load_partial_sum_16(left);
  uint16x8_t sum = vaddq_u16(sum_left, sum_top);
  sum = horizontal_add_and_broadcast_u16x8(sum);
  // (sum + 16) >> 5: divide the 32-pixel total by 32 with rounding.
  const uint8x8_t dc0 = vrshrn_n_u16(sum, 5);
  dc_store_16xh(dst, stride, 16, vdupq_lane_u8(dc0, 0));
}

// DC_LEFT: rounded average of the 16 left neighbours only.
void aom_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  const uint16x8_t sum_left = dc_load_sum_16(left);
  const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 4);
  (void)above;
  dc_store_16xh(dst, stride, 16, vdupq_lane_u8(dc0, 0));
}

// DC_TOP: rounded average of the 16 above neighbours only.
void aom_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const uint16x8_t sum_top = dc_load_sum_16(above);
  const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 4);
  (void)left;
  dc_store_16xh(dst, stride, 16, vdupq_lane_u8(dc0, 0));
}

// DC_128: neither edge is available; fill with mid-grey (128).
void aom_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const uint8x16_t dc0 = vdupq_n_u8(0x80);
  (void)above;
  (void)left;
  dc_store_16xh(dst, stride, 16, dc0);
}
    206 
    207 //------------------------------------------------------------------------------
    208 // DC 32x32
    209 
// Pairwise-add the 32 bytes at `in` down to eight u16 partial sums.
static inline uint16x8_t dc_load_partial_sum_32(const uint8_t *in) {
  const uint8x16_t a0 = vld1q_u8(in);
  const uint8x16_t a1 = vld1q_u8(in + 16);
  // delay the remainder of the reduction until
  // horizontal_add_and_broadcast_u16x8, since we want to do it once rather
  // than twice in the case we are loading both above and left.
  return vpadalq_u8(vpaddlq_u8(a0), a1);
}

// Fully reduce the 32 bytes at `in`; the total is broadcast to every lane.
static inline uint16x8_t dc_load_sum_32(const uint8_t *in) {
  return horizontal_add_and_broadcast_u16x8(dc_load_partial_sum_32(in));
}

// Write the 16-byte vector `dc` across each of `h` 32-pixel rows of `dst`.
static inline void dc_store_32xh(uint8_t *dst, ptrdiff_t stride, int h,
                                 uint8x16_t dc) {
  for (int i = 0; i < h; ++i) {
    vst1q_u8(dst + i * stride, dc);
    vst1q_u8(dst + i * stride + 16, dc);
  }
}
    230 
// 32x32 DC prediction: rounded average of the 32 above + 32 left neighbours.
void aom_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint16x8_t sum_top = dc_load_partial_sum_32(above);
  const uint16x8_t sum_left = dc_load_partial_sum_32(left);
  uint16x8_t sum = vaddq_u16(sum_left, sum_top);
  sum = horizontal_add_and_broadcast_u16x8(sum);
  // (sum + 32) >> 6: divide the 64-pixel total by 64 with rounding.
  const uint8x8_t dc0 = vrshrn_n_u16(sum, 6);
  dc_store_32xh(dst, stride, 32, vdupq_lane_u8(dc0, 0));
}

// DC_LEFT: rounded average of the 32 left neighbours only.
void aom_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  const uint16x8_t sum_left = dc_load_sum_32(left);
  const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 5);
  (void)above;
  dc_store_32xh(dst, stride, 32, vdupq_lane_u8(dc0, 0));
}

// DC_TOP: rounded average of the 32 above neighbours only.
void aom_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const uint16x8_t sum_top = dc_load_sum_32(above);
  const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 5);
  (void)left;
  dc_store_32xh(dst, stride, 32, vdupq_lane_u8(dc0, 0));
}

// DC_128: neither edge is available; fill with mid-grey (128).
void aom_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const uint8x16_t dc0 = vdupq_n_u8(0x80);
  (void)above;
  (void)left;
  dc_store_32xh(dst, stride, 32, dc0);
}
    267 
    268 //------------------------------------------------------------------------------
    269 // DC 64x64
    270 
// Pairwise-add the 64 bytes at `in` down to eight u16 partial sums.
static inline uint16x8_t dc_load_partial_sum_64(const uint8_t *in) {
  const uint8x16_t a0 = vld1q_u8(in);
  const uint8x16_t a1 = vld1q_u8(in + 16);
  const uint8x16_t a2 = vld1q_u8(in + 32);
  const uint8x16_t a3 = vld1q_u8(in + 48);
  const uint16x8_t p01 = vpadalq_u8(vpaddlq_u8(a0), a1);
  const uint16x8_t p23 = vpadalq_u8(vpaddlq_u8(a2), a3);
  // delay the remainder of the reduction until
  // horizontal_add_and_broadcast_u16x8, since we want to do it once rather
  // than twice in the case we are loading both above and left.
  return vaddq_u16(p01, p23);
}

// Fully reduce the 64 bytes at `in`; the total is broadcast to every lane.
static inline uint16x8_t dc_load_sum_64(const uint8_t *in) {
  return horizontal_add_and_broadcast_u16x8(dc_load_partial_sum_64(in));
}

// Write the 16-byte vector `dc` across each of `h` 64-pixel rows of `dst`.
static inline void dc_store_64xh(uint8_t *dst, ptrdiff_t stride, int h,
                                 uint8x16_t dc) {
  for (int i = 0; i < h; ++i) {
    vst1q_u8(dst + i * stride, dc);
    vst1q_u8(dst + i * stride + 16, dc);
    vst1q_u8(dst + i * stride + 32, dc);
    vst1q_u8(dst + i * stride + 48, dc);
  }
}
    297 
// 64x64 DC prediction: rounded average of the 64 above + 64 left neighbours.
void aom_dc_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint16x8_t sum_top = dc_load_partial_sum_64(above);
  const uint16x8_t sum_left = dc_load_partial_sum_64(left);
  uint16x8_t sum = vaddq_u16(sum_left, sum_top);
  sum = horizontal_add_and_broadcast_u16x8(sum);
  // (sum + 64) >> 7: divide the 128-pixel total by 128 with rounding.
  const uint8x8_t dc0 = vrshrn_n_u16(sum, 7);
  dc_store_64xh(dst, stride, 64, vdupq_lane_u8(dc0, 0));
}

// DC_LEFT: rounded average of the 64 left neighbours only.
void aom_dc_left_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  const uint16x8_t sum_left = dc_load_sum_64(left);
  const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 6);
  (void)above;
  dc_store_64xh(dst, stride, 64, vdupq_lane_u8(dc0, 0));
}

// DC_TOP: rounded average of the 64 above neighbours only.
void aom_dc_top_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const uint16x8_t sum_top = dc_load_sum_64(above);
  const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 6);
  (void)left;
  dc_store_64xh(dst, stride, 64, vdupq_lane_u8(dc0, 0));
}

// DC_128: neither edge is available; fill with mid-grey (128).
void aom_dc_128_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const uint8x16_t dc0 = vdupq_n_u8(0x80);
  (void)above;
  (void)left;
  dc_store_64xh(dst, stride, 64, dc0);
}
    334 
    335 //------------------------------------------------------------------------------
    336 // DC rectangular cases
    337 
    338 #define DC_MULTIPLIER_1X2 0x5556
    339 #define DC_MULTIPLIER_1X4 0x3334
    340 
    341 #define DC_SHIFT2 16
    342 
    343 static inline int divide_using_multiply_shift(int num, int shift1,
    344                                              int multiplier, int shift2) {
    345  const int interm = num >> shift1;
    346  return interm * multiplier >> shift2;
    347 }
    348 
    349 static inline int calculate_dc_from_sum(int bw, int bh, uint32_t sum,
    350                                        int shift1, int multiplier) {
    351  const int expected_dc = divide_using_multiply_shift(
    352      sum + ((bw + bh) >> 1), shift1, multiplier, DC_SHIFT2);
    353  assert(expected_dc < (1 << 8));
    354  return expected_dc;
    355 }
    356 
    357 #undef DC_SHIFT2
    358 
// 4x8 DC prediction: average of 4 above + 8 left pixels; the divide by 12
// is performed as a fixed-point multiply-shift in calculate_dc_from_sum.
void aom_dc_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  uint8x8_t a = load_u8_4x1(above);
  uint8x8_t l = vld1_u8(left);
  uint32_t sum = horizontal_add_u16x8(vaddl_u8(a, l));
  uint32_t dc = calculate_dc_from_sum(4, 8, sum, 2, DC_MULTIPLIER_1X2);
  dc_store_4xh(dst, stride, 8, vdup_n_u8(dc));
}

// 8x4 DC prediction: average of 8 above + 4 left pixels (divide by 12).
void aom_dc_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  uint8x8_t a = vld1_u8(above);
  uint8x8_t l = load_u8_4x1(left);
  uint32_t sum = horizontal_add_u16x8(vaddl_u8(a, l));
  uint32_t dc = calculate_dc_from_sum(8, 4, sum, 2, DC_MULTIPLIER_1X2);
  dc_store_8xh(dst, stride, 4, vdup_n_u8(dc));
}
    376 
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// 4x16 DC prediction: average of 4 above + 16 left pixels (divide by 20).
void aom_dc_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  uint8x8_t a = load_u8_4x1(above);
  uint8x16_t l = vld1q_u8(left);
  // Pairwise-reduce the 16 left pixels, then widen-add the above row.
  uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(l), a);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(4, 16, sum, 2, DC_MULTIPLIER_1X4);
  dc_store_4xh(dst, stride, 16, vdup_n_u8(dc));
}

// 16x4 DC prediction: average of 16 above + 4 left pixels (divide by 20).
void aom_dc_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  uint8x16_t a = vld1q_u8(above);
  uint8x8_t l = load_u8_4x1(left);
  uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(a), l);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(16, 4, sum, 2, DC_MULTIPLIER_1X4);
  dc_store_16xh(dst, stride, 4, vdupq_n_u8(dc));
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    398 
// 8x16 DC prediction: average of 8 above + 16 left pixels (divide by 24).
void aom_dc_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  uint8x8_t a = vld1_u8(above);
  uint8x16_t l = vld1q_u8(left);
  // Pairwise-reduce the 16 left pixels, then widen-add the above row.
  uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(l), a);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(8, 16, sum, 3, DC_MULTIPLIER_1X2);
  dc_store_8xh(dst, stride, 16, vdup_n_u8(dc));
}

// 16x8 DC prediction: average of 16 above + 8 left pixels (divide by 24).
void aom_dc_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  uint8x16_t a = vld1q_u8(above);
  uint8x8_t l = vld1_u8(left);
  uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(a), l);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(16, 8, sum, 3, DC_MULTIPLIER_1X2);
  dc_store_16xh(dst, stride, 8, vdupq_n_u8(dc));
}
    418 
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// 8x32 DC prediction: average of 8 above + 32 left pixels (divide by 40).
void aom_dc_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  uint8x8_t a = vld1_u8(above);
  uint16x8_t sum_left = dc_load_partial_sum_32(left);
  uint16x8_t sum_al = vaddw_u8(sum_left, a);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(8, 32, sum, 3, DC_MULTIPLIER_1X4);
  dc_store_8xh(dst, stride, 32, vdup_n_u8(dc));
}

// 32x8 DC prediction: average of 32 above + 8 left pixels (divide by 40).
void aom_dc_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  uint16x8_t sum_top = dc_load_partial_sum_32(above);
  uint8x8_t l = vld1_u8(left);
  uint16x8_t sum_al = vaddw_u8(sum_top, l);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(32, 8, sum, 3, DC_MULTIPLIER_1X4);
  dc_store_32xh(dst, stride, 8, vdupq_n_u8(dc));
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    440 
// 16x32 DC prediction: average of 16 above + 32 left pixels (divide by 48).
void aom_dc_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  uint16x8_t sum_above = dc_load_partial_sum_16(above);
  uint16x8_t sum_left = dc_load_partial_sum_32(left);
  uint16x8_t sum_al = vaddq_u16(sum_left, sum_above);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(16, 32, sum, 4, DC_MULTIPLIER_1X2);
  dc_store_16xh(dst, stride, 32, vdupq_n_u8(dc));
}

// 32x16 DC prediction: average of 32 above + 16 left pixels (divide by 48).
void aom_dc_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  uint16x8_t sum_above = dc_load_partial_sum_32(above);
  uint16x8_t sum_left = dc_load_partial_sum_16(left);
  uint16x8_t sum_al = vaddq_u16(sum_left, sum_above);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(32, 16, sum, 4, DC_MULTIPLIER_1X2);
  dc_store_32xh(dst, stride, 16, vdupq_n_u8(dc));
}
    460 
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// 16x64 DC prediction: average of 16 above + 64 left pixels (divide by 80).
void aom_dc_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  uint16x8_t sum_above = dc_load_partial_sum_16(above);
  uint16x8_t sum_left = dc_load_partial_sum_64(left);
  uint16x8_t sum_al = vaddq_u16(sum_left, sum_above);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(16, 64, sum, 4, DC_MULTIPLIER_1X4);
  dc_store_16xh(dst, stride, 64, vdupq_n_u8(dc));
}

// 64x16 DC prediction: average of 64 above + 16 left pixels (divide by 80).
void aom_dc_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  uint16x8_t sum_above = dc_load_partial_sum_64(above);
  uint16x8_t sum_left = dc_load_partial_sum_16(left);
  uint16x8_t sum_al = vaddq_u16(sum_above, sum_left);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(64, 16, sum, 4, DC_MULTIPLIER_1X4);
  dc_store_64xh(dst, stride, 16, vdupq_n_u8(dc));
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    482 
// 32x64 DC prediction: average of 32 above + 64 left pixels (divide by 96).
void aom_dc_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  uint16x8_t sum_above = dc_load_partial_sum_32(above);
  uint16x8_t sum_left = dc_load_partial_sum_64(left);
  uint16x8_t sum_al = vaddq_u16(sum_above, sum_left);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(32, 64, sum, 5, DC_MULTIPLIER_1X2);
  dc_store_32xh(dst, stride, 64, vdupq_n_u8(dc));
}

// 64x32 DC prediction: average of 64 above + 32 left pixels (divide by 96).
void aom_dc_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  uint16x8_t sum_above = dc_load_partial_sum_64(above);
  uint16x8_t sum_left = dc_load_partial_sum_32(left);
  uint16x8_t sum_al = vaddq_u16(sum_above, sum_left);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(64, 32, sum, 5, DC_MULTIPLIER_1X2);
  dc_store_64xh(dst, stride, 32, vdupq_n_u8(dc));
}
    502 
    503 #undef DC_MULTIPLIER_1X2
    504 #undef DC_MULTIPLIER_1X4
    505 
// Generate aom_dc_128_predictor_{w}x{h}_neon: fill the block with the
// mid-grey value 0x80 when neither edge is available. `q` selects the
// 128-bit duplicate intrinsic for store helpers that take a uint8x16_t.
#define DC_PREDICTOR_128(w, h, q)                                            \
  void aom_dc_128_predictor_##w##x##h##_neon(uint8_t *dst, ptrdiff_t stride, \
                                             const uint8_t *above,           \
                                             const uint8_t *left) {          \
    (void)above;                                                             \
    (void)left;                                                              \
    dc_store_##w##xh(dst, stride, (h), vdup##q##_n_u8(0x80));                \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
DC_PREDICTOR_128(4, 16, )
DC_PREDICTOR_128(8, 32, )
DC_PREDICTOR_128(16, 4, q)
DC_PREDICTOR_128(16, 64, q)
DC_PREDICTOR_128(32, 8, q)
DC_PREDICTOR_128(64, 16, q)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
DC_PREDICTOR_128(4, 8, )
DC_PREDICTOR_128(8, 4, )
DC_PREDICTOR_128(8, 16, )
DC_PREDICTOR_128(16, 8, q)
DC_PREDICTOR_128(16, 32, q)
DC_PREDICTOR_128(32, 16, q)
DC_PREDICTOR_128(32, 64, q)
DC_PREDICTOR_128(64, 32, q)

#undef DC_PREDICTOR_128
    533 
// Generate aom_dc_left_predictor_{w}x{h}_neon: fill the block with the
// rounded average of the h left neighbours. `shift` is log2(h) (each
// invocation below passes the matching value); `q` selects the 128-bit
// broadcast for store helpers that take a uint8x16_t.
#define DC_PREDICTOR_LEFT(w, h, shift, q)                                     \
  void aom_dc_left_predictor_##w##x##h##_neon(uint8_t *dst, ptrdiff_t stride, \
                                              const uint8_t *above,           \
                                              const uint8_t *left) {          \
    (void)above;                                                              \
    const uint16x8_t sum = dc_load_sum_##h(left);                             \
    const uint8x8_t dc0 = vrshrn_n_u16(sum, (shift));                         \
    dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u8(dc0, 0));            \
  }

DC_PREDICTOR_LEFT(4, 8, 3, )
DC_PREDICTOR_LEFT(8, 4, 2, )
DC_PREDICTOR_LEFT(8, 16, 4, )
DC_PREDICTOR_LEFT(16, 8, 3, q)
DC_PREDICTOR_LEFT(16, 32, 5, q)
DC_PREDICTOR_LEFT(32, 16, 4, q)
DC_PREDICTOR_LEFT(32, 64, 6, q)
DC_PREDICTOR_LEFT(64, 32, 5, q)
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
DC_PREDICTOR_LEFT(4, 16, 4, )
DC_PREDICTOR_LEFT(16, 4, 2, q)
DC_PREDICTOR_LEFT(8, 32, 5, )
DC_PREDICTOR_LEFT(32, 8, 3, q)
DC_PREDICTOR_LEFT(16, 64, 6, q)
DC_PREDICTOR_LEFT(64, 16, 4, q)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#undef DC_PREDICTOR_LEFT
    562 
// Generate aom_dc_top_predictor_{w}x{h}_neon: fill the block with the
// rounded average of the w above neighbours. `shift` is log2(w) (each
// invocation below passes the matching value); `q` selects the 128-bit
// broadcast for store helpers that take a uint8x16_t.
#define DC_PREDICTOR_TOP(w, h, shift, q)                                     \
  void aom_dc_top_predictor_##w##x##h##_neon(uint8_t *dst, ptrdiff_t stride, \
                                             const uint8_t *above,           \
                                             const uint8_t *left) {          \
    (void)left;                                                              \
    const uint16x8_t sum = dc_load_sum_##w(above);                           \
    const uint8x8_t dc0 = vrshrn_n_u16(sum, (shift));                        \
    dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u8(dc0, 0));           \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
DC_PREDICTOR_TOP(8, 32, 3, )
DC_PREDICTOR_TOP(4, 16, 2, )
DC_PREDICTOR_TOP(16, 4, 4, q)
DC_PREDICTOR_TOP(16, 64, 4, q)
DC_PREDICTOR_TOP(32, 8, 5, q)
DC_PREDICTOR_TOP(64, 16, 6, q)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
DC_PREDICTOR_TOP(4, 8, 2, )
DC_PREDICTOR_TOP(8, 4, 3, )
DC_PREDICTOR_TOP(8, 16, 3, )
DC_PREDICTOR_TOP(16, 8, 4, q)
DC_PREDICTOR_TOP(16, 32, 4, q)
DC_PREDICTOR_TOP(32, 16, 5, q)
DC_PREDICTOR_TOP(32, 64, 5, q)
DC_PREDICTOR_TOP(64, 32, 6, q)

#undef DC_PREDICTOR_TOP
    591 
    592 // -----------------------------------------------------------------------------
    593 
    594 static inline void v_store_4xh(uint8_t *dst, ptrdiff_t stride, int h,
    595                               uint8x8_t d0) {
    596  for (int i = 0; i < h; ++i) {
    597    store_u8_4x1(dst + i * stride, d0);
    598  }
    599 }
    600 
// Vertical-prediction store: copy the 8-byte row `d0` into `h` rows.
static inline void v_store_8xh(uint8_t *dst, ptrdiff_t stride, int h,
                               uint8x8_t d0) {
  for (int i = 0; i < h; ++i) {
    vst1_u8(dst + i * stride, d0);
  }
}

// Vertical-prediction store: copy the 16-byte row `d0` into `h` rows.
static inline void v_store_16xh(uint8_t *dst, ptrdiff_t stride, int h,
                                uint8x16_t d0) {
  for (int i = 0; i < h; ++i) {
    vst1q_u8(dst + i * stride, d0);
  }
}

// Vertical-prediction store: copy the 32-byte row (d0|d1) into `h` rows.
static inline void v_store_32xh(uint8_t *dst, ptrdiff_t stride, int h,
                                uint8x16_t d0, uint8x16_t d1) {
  for (int i = 0; i < h; ++i) {
    vst1q_u8(dst + 0, d0);
    vst1q_u8(dst + 16, d1);
    dst += stride;
  }
}

// Vertical-prediction store: copy the 64-byte row (d0..d3) into `h` rows.
static inline void v_store_64xh(uint8_t *dst, ptrdiff_t stride, int h,
                                uint8x16_t d0, uint8x16_t d1, uint8x16_t d2,
                                uint8x16_t d3) {
  for (int i = 0; i < h; ++i) {
    vst1q_u8(dst + 0, d0);
    vst1q_u8(dst + 16, d1);
    vst1q_u8(dst + 32, d2);
    vst1q_u8(dst + 48, d3);
    dst += stride;
  }
}
    635 
// Vertical (V_PRED) predictors: every row of the block is a copy of the
// `above` row; the `left` edge is unused.
void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_4xh(dst, stride, 4, load_u8_4x1(above));
}

void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_8xh(dst, stride, 8, vld1_u8(above));
}

void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_16xh(dst, stride, 16, vld1q_u8(above));
}

void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(above);
  const uint8x16_t d1 = vld1q_u8(above + 16);
  (void)left;
  v_store_32xh(dst, stride, 32, d0, d1);
}

void aom_v_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_4xh(dst, stride, 8, load_u8_4x1(above));
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_v_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_4xh(dst, stride, 16, load_u8_4x1(above));
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_v_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_8xh(dst, stride, 4, vld1_u8(above));
}

void aom_v_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_8xh(dst, stride, 16, vld1_u8(above));
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_v_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_8xh(dst, stride, 32, vld1_u8(above));
}

void aom_v_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_16xh(dst, stride, 4, vld1q_u8(above));
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_v_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_16xh(dst, stride, 8, vld1q_u8(above));
}

void aom_v_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_16xh(dst, stride, 32, vld1q_u8(above));
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_v_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_16xh(dst, stride, 64, vld1q_u8(above));
}

void aom_v_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(above);
  const uint8x16_t d1 = vld1q_u8(above + 16);
  (void)left;
  v_store_32xh(dst, stride, 8, d0, d1);
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_v_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(above);
  const uint8x16_t d1 = vld1q_u8(above + 16);
  (void)left;
  v_store_32xh(dst, stride, 16, d0, d1);
}
    737 
    738 void aom_v_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride,
    739                                const uint8_t *above, const uint8_t *left) {
    740  const uint8x16_t d0 = vld1q_u8(above);
    741  const uint8x16_t d1 = vld1q_u8(above + 16);
    742  (void)left;
    743  v_store_32xh(dst, stride, 64, d0, d1);
    744 }
    745 
    746 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    747 void aom_v_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride,
    748                                const uint8_t *above, const uint8_t *left) {
    749  const uint8x16_t d0 = vld1q_u8(above);
    750  const uint8x16_t d1 = vld1q_u8(above + 16);
    751  const uint8x16_t d2 = vld1q_u8(above + 32);
    752  const uint8x16_t d3 = vld1q_u8(above + 48);
    753  (void)left;
    754  v_store_64xh(dst, stride, 16, d0, d1, d2, d3);
    755 }
    756 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    757 
    758 void aom_v_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride,
    759                                const uint8_t *above, const uint8_t *left) {
    760  const uint8x16_t d0 = vld1q_u8(above);
    761  const uint8x16_t d1 = vld1q_u8(above + 16);
    762  const uint8x16_t d2 = vld1q_u8(above + 32);
    763  const uint8x16_t d3 = vld1q_u8(above + 48);
    764  (void)left;
    765  v_store_64xh(dst, stride, 32, d0, d1, d2, d3);
    766 }
    767 
    768 void aom_v_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
    769                                const uint8_t *above, const uint8_t *left) {
    770  const uint8x16_t d0 = vld1q_u8(above);
    771  const uint8x16_t d1 = vld1q_u8(above + 16);
    772  const uint8x16_t d2 = vld1q_u8(above + 32);
    773  const uint8x16_t d3 = vld1q_u8(above + 48);
    774  (void)left;
    775  v_store_64xh(dst, stride, 64, d0, d1, d2, d3);
    776 }
    777 
    778 // -----------------------------------------------------------------------------
    779 
    780 static inline void h_store_4x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
    781  store_u8_4x1(dst + 0 * stride, vdup_lane_u8(d0, 0));
    782  store_u8_4x1(dst + 1 * stride, vdup_lane_u8(d0, 1));
    783  store_u8_4x1(dst + 2 * stride, vdup_lane_u8(d0, 2));
    784  store_u8_4x1(dst + 3 * stride, vdup_lane_u8(d0, 3));
    785  store_u8_4x1(dst + 4 * stride, vdup_lane_u8(d0, 4));
    786  store_u8_4x1(dst + 5 * stride, vdup_lane_u8(d0, 5));
    787  store_u8_4x1(dst + 6 * stride, vdup_lane_u8(d0, 6));
    788  store_u8_4x1(dst + 7 * stride, vdup_lane_u8(d0, 7));
    789 }
    790 
    791 static inline void h_store_8x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
    792  vst1_u8(dst + 0 * stride, vdup_lane_u8(d0, 0));
    793  vst1_u8(dst + 1 * stride, vdup_lane_u8(d0, 1));
    794  vst1_u8(dst + 2 * stride, vdup_lane_u8(d0, 2));
    795  vst1_u8(dst + 3 * stride, vdup_lane_u8(d0, 3));
    796  vst1_u8(dst + 4 * stride, vdup_lane_u8(d0, 4));
    797  vst1_u8(dst + 5 * stride, vdup_lane_u8(d0, 5));
    798  vst1_u8(dst + 6 * stride, vdup_lane_u8(d0, 6));
    799  vst1_u8(dst + 7 * stride, vdup_lane_u8(d0, 7));
    800 }
    801 
    802 static inline void h_store_16x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
    803  vst1q_u8(dst + 0 * stride, vdupq_lane_u8(d0, 0));
    804  vst1q_u8(dst + 1 * stride, vdupq_lane_u8(d0, 1));
    805  vst1q_u8(dst + 2 * stride, vdupq_lane_u8(d0, 2));
    806  vst1q_u8(dst + 3 * stride, vdupq_lane_u8(d0, 3));
    807  vst1q_u8(dst + 4 * stride, vdupq_lane_u8(d0, 4));
    808  vst1q_u8(dst + 5 * stride, vdupq_lane_u8(d0, 5));
    809  vst1q_u8(dst + 6 * stride, vdupq_lane_u8(d0, 6));
    810  vst1q_u8(dst + 7 * stride, vdupq_lane_u8(d0, 7));
    811 }
    812 
    813 static inline void h_store_32x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
    814  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 0));
    815  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 0));
    816  dst += stride;
    817  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 1));
    818  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 1));
    819  dst += stride;
    820  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 2));
    821  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 2));
    822  dst += stride;
    823  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 3));
    824  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 3));
    825  dst += stride;
    826  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 4));
    827  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 4));
    828  dst += stride;
    829  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 5));
    830  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 5));
    831  dst += stride;
    832  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 6));
    833  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 6));
    834  dst += stride;
    835  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 7));
    836  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 7));
    837 }
    838 
    839 static inline void h_store_64x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
    840  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 0));
    841  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 0));
    842  vst1q_u8(dst + 32, vdupq_lane_u8(d0, 0));
    843  vst1q_u8(dst + 48, vdupq_lane_u8(d0, 0));
    844  dst += stride;
    845  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 1));
    846  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 1));
    847  vst1q_u8(dst + 32, vdupq_lane_u8(d0, 1));
    848  vst1q_u8(dst + 48, vdupq_lane_u8(d0, 1));
    849  dst += stride;
    850  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 2));
    851  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 2));
    852  vst1q_u8(dst + 32, vdupq_lane_u8(d0, 2));
    853  vst1q_u8(dst + 48, vdupq_lane_u8(d0, 2));
    854  dst += stride;
    855  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 3));
    856  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 3));
    857  vst1q_u8(dst + 32, vdupq_lane_u8(d0, 3));
    858  vst1q_u8(dst + 48, vdupq_lane_u8(d0, 3));
    859  dst += stride;
    860  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 4));
    861  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 4));
    862  vst1q_u8(dst + 32, vdupq_lane_u8(d0, 4));
    863  vst1q_u8(dst + 48, vdupq_lane_u8(d0, 4));
    864  dst += stride;
    865  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 5));
    866  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 5));
    867  vst1q_u8(dst + 32, vdupq_lane_u8(d0, 5));
    868  vst1q_u8(dst + 48, vdupq_lane_u8(d0, 5));
    869  dst += stride;
    870  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 6));
    871  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 6));
    872  vst1q_u8(dst + 32, vdupq_lane_u8(d0, 6));
    873  vst1q_u8(dst + 48, vdupq_lane_u8(d0, 6));
    874  dst += stride;
    875  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 7));
    876  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 7));
    877  vst1q_u8(dst + 32, vdupq_lane_u8(d0, 7));
    878  vst1q_u8(dst + 48, vdupq_lane_u8(d0, 7));
    879 }
    880 
    881 void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
    882                              const uint8_t *above, const uint8_t *left) {
    883  const uint8x8_t d0 = load_u8_4x1(left);
    884  (void)above;
    885  store_u8_4x1(dst + 0 * stride, vdup_lane_u8(d0, 0));
    886  store_u8_4x1(dst + 1 * stride, vdup_lane_u8(d0, 1));
    887  store_u8_4x1(dst + 2 * stride, vdup_lane_u8(d0, 2));
    888  store_u8_4x1(dst + 3 * stride, vdup_lane_u8(d0, 3));
    889 }
    890 
    891 void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
    892                              const uint8_t *above, const uint8_t *left) {
    893  const uint8x8_t d0 = vld1_u8(left);
    894  (void)above;
    895  h_store_8x8(dst, stride, d0);
    896 }
    897 
    898 void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
    899                                const uint8_t *above, const uint8_t *left) {
    900  const uint8x16_t d0 = vld1q_u8(left);
    901  (void)above;
    902  h_store_16x8(dst, stride, vget_low_u8(d0));
    903  h_store_16x8(dst + 8 * stride, stride, vget_high_u8(d0));
    904 }
    905 
    906 void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
    907                                const uint8_t *above, const uint8_t *left) {
    908  const uint8x16_t d0 = vld1q_u8(left);
    909  const uint8x16_t d1 = vld1q_u8(left + 16);
    910  (void)above;
    911  h_store_32x8(dst + 0 * stride, stride, vget_low_u8(d0));
    912  h_store_32x8(dst + 8 * stride, stride, vget_high_u8(d0));
    913  h_store_32x8(dst + 16 * stride, stride, vget_low_u8(d1));
    914  h_store_32x8(dst + 24 * stride, stride, vget_high_u8(d1));
    915 }
    916 
    917 void aom_h_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
    918                              const uint8_t *above, const uint8_t *left) {
    919  const uint8x8_t d0 = vld1_u8(left);
    920  (void)above;
    921  h_store_4x8(dst, stride, d0);
    922 }
    923 
    924 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    925 void aom_h_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
    926                               const uint8_t *above, const uint8_t *left) {
    927  const uint8x16_t d0 = vld1q_u8(left);
    928  (void)above;
    929  h_store_4x8(dst + 0 * stride, stride, vget_low_u8(d0));
    930  h_store_4x8(dst + 8 * stride, stride, vget_high_u8(d0));
    931 }
    932 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    933 
    934 void aom_h_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
    935                              const uint8_t *above, const uint8_t *left) {
    936  const uint8x8_t d0 = load_u8_4x1(left);
    937  (void)above;
    938  vst1_u8(dst + 0 * stride, vdup_lane_u8(d0, 0));
    939  vst1_u8(dst + 1 * stride, vdup_lane_u8(d0, 1));
    940  vst1_u8(dst + 2 * stride, vdup_lane_u8(d0, 2));
    941  vst1_u8(dst + 3 * stride, vdup_lane_u8(d0, 3));
    942 }
    943 
    944 void aom_h_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride,
    945                               const uint8_t *above, const uint8_t *left) {
    946  const uint8x16_t d0 = vld1q_u8(left);
    947  (void)above;
    948  h_store_8x8(dst + 0 * stride, stride, vget_low_u8(d0));
    949  h_store_8x8(dst + 8 * stride, stride, vget_high_u8(d0));
    950 }
    951 
    952 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    953 void aom_h_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
    954                               const uint8_t *above, const uint8_t *left) {
    955  const uint8x16_t d0 = vld1q_u8(left);
    956  const uint8x16_t d1 = vld1q_u8(left + 16);
    957  (void)above;
    958  h_store_8x8(dst + 0 * stride, stride, vget_low_u8(d0));
    959  h_store_8x8(dst + 8 * stride, stride, vget_high_u8(d0));
    960  h_store_8x8(dst + 16 * stride, stride, vget_low_u8(d1));
    961  h_store_8x8(dst + 24 * stride, stride, vget_high_u8(d1));
    962 }
    963 
    964 void aom_h_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
    965                               const uint8_t *above, const uint8_t *left) {
    966  const uint8x8_t d0 = load_u8_4x1(left);
    967  (void)above;
    968  vst1q_u8(dst + 0 * stride, vdupq_lane_u8(d0, 0));
    969  vst1q_u8(dst + 1 * stride, vdupq_lane_u8(d0, 1));
    970  vst1q_u8(dst + 2 * stride, vdupq_lane_u8(d0, 2));
    971  vst1q_u8(dst + 3 * stride, vdupq_lane_u8(d0, 3));
    972 }
    973 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    974 
    975 void aom_h_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride,
    976                               const uint8_t *above, const uint8_t *left) {
    977  const uint8x8_t d0 = vld1_u8(left);
    978  (void)above;
    979  h_store_16x8(dst, stride, d0);
    980 }
    981 
    982 void aom_h_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride,
    983                                const uint8_t *above, const uint8_t *left) {
    984  const uint8x16_t d0 = vld1q_u8(left);
    985  const uint8x16_t d1 = vld1q_u8(left + 16);
    986  (void)above;
    987  h_store_16x8(dst + 0 * stride, stride, vget_low_u8(d0));
    988  h_store_16x8(dst + 8 * stride, stride, vget_high_u8(d0));
    989  h_store_16x8(dst + 16 * stride, stride, vget_low_u8(d1));
    990  h_store_16x8(dst + 24 * stride, stride, vget_high_u8(d1));
    991 }
    992 
    993 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    994 void aom_h_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride,
    995                                const uint8_t *above, const uint8_t *left) {
    996  const uint8x16_t d0 = vld1q_u8(left);
    997  const uint8x16_t d1 = vld1q_u8(left + 16);
    998  const uint8x16_t d2 = vld1q_u8(left + 32);
    999  const uint8x16_t d3 = vld1q_u8(left + 48);
   1000  (void)above;
   1001  h_store_16x8(dst + 0 * stride, stride, vget_low_u8(d0));
   1002  h_store_16x8(dst + 8 * stride, stride, vget_high_u8(d0));
   1003  h_store_16x8(dst + 16 * stride, stride, vget_low_u8(d1));
   1004  h_store_16x8(dst + 24 * stride, stride, vget_high_u8(d1));
   1005  h_store_16x8(dst + 32 * stride, stride, vget_low_u8(d2));
   1006  h_store_16x8(dst + 40 * stride, stride, vget_high_u8(d2));
   1007  h_store_16x8(dst + 48 * stride, stride, vget_low_u8(d3));
   1008  h_store_16x8(dst + 56 * stride, stride, vget_high_u8(d3));
   1009 }
   1010 
   1011 void aom_h_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride,
   1012                               const uint8_t *above, const uint8_t *left) {
   1013  const uint8x8_t d0 = vld1_u8(left);
   1014  (void)above;
   1015  h_store_32x8(dst, stride, d0);
   1016 }
   1017 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   1018 
   1019 void aom_h_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride,
   1020                                const uint8_t *above, const uint8_t *left) {
   1021  const uint8x16_t d0 = vld1q_u8(left);
   1022  (void)above;
   1023  h_store_32x8(dst + 0 * stride, stride, vget_low_u8(d0));
   1024  h_store_32x8(dst + 8 * stride, stride, vget_high_u8(d0));
   1025 }
   1026 
   1027 void aom_h_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride,
   1028                                const uint8_t *above, const uint8_t *left) {
   1029  const uint8x16_t d0 = vld1q_u8(left + 0);
   1030  const uint8x16_t d1 = vld1q_u8(left + 16);
   1031  const uint8x16_t d2 = vld1q_u8(left + 32);
   1032  const uint8x16_t d3 = vld1q_u8(left + 48);
   1033  (void)above;
   1034  h_store_32x8(dst + 0 * stride, stride, vget_low_u8(d0));
   1035  h_store_32x8(dst + 8 * stride, stride, vget_high_u8(d0));
   1036  h_store_32x8(dst + 16 * stride, stride, vget_low_u8(d1));
   1037  h_store_32x8(dst + 24 * stride, stride, vget_high_u8(d1));
   1038  h_store_32x8(dst + 32 * stride, stride, vget_low_u8(d2));
   1039  h_store_32x8(dst + 40 * stride, stride, vget_high_u8(d2));
   1040  h_store_32x8(dst + 48 * stride, stride, vget_low_u8(d3));
   1041  h_store_32x8(dst + 56 * stride, stride, vget_high_u8(d3));
   1042 }
   1043 
   1044 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   1045 void aom_h_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride,
   1046                                const uint8_t *above, const uint8_t *left) {
   1047  const uint8x16_t d0 = vld1q_u8(left);
   1048  (void)above;
   1049  h_store_64x8(dst + 0 * stride, stride, vget_low_u8(d0));
   1050  h_store_64x8(dst + 8 * stride, stride, vget_high_u8(d0));
   1051 }
   1052 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   1053 
   1054 void aom_h_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride,
   1055                                const uint8_t *above, const uint8_t *left) {
   1056  (void)above;
   1057  for (int i = 0; i < 2; ++i) {
   1058    const uint8x16_t d0 = vld1q_u8(left);
   1059    h_store_64x8(dst + 0 * stride, stride, vget_low_u8(d0));
   1060    h_store_64x8(dst + 8 * stride, stride, vget_high_u8(d0));
   1061    left += 16;
   1062    dst += 16 * stride;
   1063  }
   1064 }
   1065 
   1066 void aom_h_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
   1067                                const uint8_t *above, const uint8_t *left) {
   1068  (void)above;
   1069  for (int i = 0; i < 4; ++i) {
   1070    const uint8x16_t d0 = vld1q_u8(left);
   1071    h_store_64x8(dst + 0 * stride, stride, vget_low_u8(d0));
   1072    h_store_64x8(dst + 8 * stride, stride, vget_high_u8(d0));
   1073    left += 16;
   1074    dst += 16 * stride;
   1075  }
   1076 }
   1077 
   1078 /* ---------------------P R E D I C T I O N   Z 1--------------------------- */
   1079 
   1080 // Low bit depth functions
// BaseMask[n] is a 32-byte selector whose first n bytes are 0xff and the
// remainder 0. It is used with vbsl(q)_u8 in the Z1 predictors below to keep
// the first n interpolated pixels of a row and fill the rest with the
// replicated right-most 'above' sample.
static DECLARE_ALIGNED(32, const uint8_t, BaseMask[33][32]) = {
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
    0,    0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0,
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0,
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
};
   1172 
// Z1 directional prediction core for rows held in 64-bit vectors (H <= 8).
// Produces W rows into dst[0..W-1]; each 8-byte element holds one row of H
// pixels, linearly interpolated between above[base] and above[base + 1].
// The sampling position x advances by dx per row and carries frac_bits
// fractional bits (6, or 5 when the above edge is upsampled).
static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_neon_64(
    int H, int W, uint8x8_t *dst, const uint8_t *above, int upsample_above,
    int dx) {
  const int frac_bits = 6 - upsample_above;
  const int max_base_x = ((W + H) - 1) << upsample_above;

  assert(dx > 0);
  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5

  // Right-most usable edge sample, replicated; used wherever base would run
  // past max_base_x.
  const uint8x8_t a_mbase_x = vdup_n_u8(above[max_base_x]);

  int x = dx;
  for (int r = 0; r < W; r++) {
    int base = x >> frac_bits;
    int base_max_diff = (max_base_x - base) >> upsample_above;
    if (base_max_diff <= 0) {
      // Every remaining row samples beyond the edge: fill with the
      // replicated edge pixel and stop.
      for (int i = r; i < W; ++i) {
        dst[i] = a_mbase_x;  // save 4 values
      }
      return;
    }

    if (base_max_diff > H) base_max_diff = H;

    uint8x8x2_t a01_128;
    uint16x8_t shift;
    if (upsample_above) {
      // De-interleaving load pairs each sample with its right neighbor.
      a01_128 = vld2_u8(above + base);
      shift = vdupq_n_u16(((x << upsample_above) & 0x3f) >> 1);
    } else {
      a01_128.val[0] = vld1_u8(above + base);
      a01_128.val[1] = vld1_u8(above + base + 1);
      shift = vdupq_n_u16((x & 0x3f) >> 1);
    }
    // res = above[base] * 32 + 16 + (above[base + 1] - above[base]) * shift,
    // then >> 5 below: a rounded 5-bit-precision linear interpolation.
    uint16x8_t diff = vsubl_u8(a01_128.val[1], a01_128.val[0]);
    uint16x8_t a32 = vmlal_u8(vdupq_n_u16(16), a01_128.val[0], vdup_n_u8(32));
    uint16x8_t res = vmlaq_u16(a32, diff, shift);

    // Keep the first base_max_diff interpolated pixels; pad the rest of the
    // row with the replicated edge pixel.
    uint8x8_t mask = vld1_u8(BaseMask[base_max_diff]);
    dst[r] = vbsl_u8(mask, vshrn_n_u16(res, 5), a_mbase_x);

    x += dx;
  }
}
   1222 
   1223 static void dr_prediction_z1_4xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
   1224                                      const uint8_t *above, int upsample_above,
   1225                                      int dx) {
   1226  uint8x8_t dstvec[16];
   1227 
   1228  dr_prediction_z1_HxW_internal_neon_64(4, N, dstvec, above, upsample_above,
   1229                                        dx);
   1230  for (int i = 0; i < N; i++) {
   1231    vst1_lane_u32((uint32_t *)(dst + stride * i),
   1232                  vreinterpret_u32_u8(dstvec[i]), 0);
   1233  }
   1234 }
   1235 
   1236 static void dr_prediction_z1_8xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
   1237                                      const uint8_t *above, int upsample_above,
   1238                                      int dx) {
   1239  uint8x8_t dstvec[32];
   1240 
   1241  dr_prediction_z1_HxW_internal_neon_64(8, N, dstvec, above, upsample_above,
   1242                                        dx);
   1243  for (int i = 0; i < N; i++) {
   1244    vst1_u8(dst + stride * i, dstvec[i]);
   1245  }
   1246 }
   1247 
// Z1 directional prediction core for rows held in 128-bit vectors (H <= 16).
// Produces W rows into dst[0..W-1]; each 16-byte element holds one row of H
// pixels, linearly interpolated between above[base] and above[base + 1].
// The sampling position x advances by dx per row and carries frac_bits
// fractional bits (6, or 5 when the above edge is upsampled).
static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_neon(
    int H, int W, uint8x16_t *dst, const uint8_t *above, int upsample_above,
    int dx) {
  const int frac_bits = 6 - upsample_above;
  const int max_base_x = ((W + H) - 1) << upsample_above;

  assert(dx > 0);
  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5

  // Right-most usable edge sample, replicated; used wherever base would run
  // past max_base_x.
  const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]);

  int x = dx;
  for (int r = 0; r < W; r++) {
    int base = x >> frac_bits;
    int base_max_diff = (max_base_x - base) >> upsample_above;
    if (base_max_diff <= 0) {
      // Every remaining row samples beyond the edge: fill with the
      // replicated edge pixel and stop.
      for (int i = r; i < W; ++i) {
        dst[i] = a_mbase_x;  // save 4 values
      }
      return;
    }

    if (base_max_diff > H) base_max_diff = H;

    uint16x8_t shift;
    uint8x16_t a0_128, a1_128;
    if (upsample_above) {
      // De-interleave: even samples in the low half, odd samples (the right
      // neighbors) in the high half; vextq then lines the neighbors up.
      uint8x8x2_t v_tmp_a0_128 = vld2_u8(above + base);
      a0_128 = vcombine_u8(v_tmp_a0_128.val[0], v_tmp_a0_128.val[1]);
      a1_128 = vextq_u8(a0_128, vdupq_n_u8(0), 8);
      shift = vdupq_n_u16(x & 0x1f);
    } else {
      a0_128 = vld1q_u8(above + base);
      a1_128 = vld1q_u8(above + base + 1);
      shift = vdupq_n_u16((x & 0x3f) >> 1);
    }
    // Widen to 16 bits in low/high halves and compute
    // above[base] * 32 + 16 + (above[base + 1] - above[base]) * shift,
    // then narrow with >> 5: a rounded linear interpolation.
    uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128));
    uint16x8_t diff_hi = vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128));
    uint16x8_t a32_lo =
        vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_128), vdup_n_u8(32));
    uint16x8_t a32_hi =
        vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32));
    uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift);
    uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift);
    uint8x16_t v_temp =
        vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5));

    // Keep the first base_max_diff interpolated pixels; pad the rest of the
    // row with the replicated edge pixel.
    uint8x16_t mask = vld1q_u8(BaseMask[base_max_diff]);
    dst[r] = vbslq_u8(mask, v_temp, a_mbase_x);

    x += dx;
  }
}
   1306 
   1307 static void dr_prediction_z1_16xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
   1308                                       const uint8_t *above, int upsample_above,
   1309                                       int dx) {
   1310  uint8x16_t dstvec[64];
   1311 
   1312  dr_prediction_z1_HxW_internal_neon(16, N, dstvec, above, upsample_above, dx);
   1313  for (int i = 0; i < N; i++) {
   1314    vst1q_u8(dst + stride * i, dstvec[i]);
   1315  }
   1316 }
   1317 
// Z1 directional prediction core for 32-wide rows (no upsampling path:
// frac_bits is fixed at 6). Produces N rows into dstvec[0..N-1]; each entry
// holds one 32-pixel row as two 16-byte halves, linearly interpolated
// between above[base] and above[base + 1].
static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_neon(
    int N, uint8x16x2_t *dstvec, const uint8_t *above, int dx) {
  const int frac_bits = 6;
  const int max_base_x = ((32 + N) - 1);

  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5

  // Right-most usable edge sample, replicated; used wherever base would run
  // past max_base_x.
  const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]);

  int x = dx;
  for (int r = 0; r < N; r++) {
    int base = x >> frac_bits;
    int base_max_diff = (max_base_x - base);
    if (base_max_diff <= 0) {
      // Every remaining row samples beyond the edge: fill with the
      // replicated edge pixel and stop.
      for (int i = r; i < N; ++i) {
        dstvec[i].val[0] = a_mbase_x;  // save 32 values
        dstvec[i].val[1] = a_mbase_x;
      }
      return;
    }
    if (base_max_diff > 32) base_max_diff = 32;

    uint16x8_t shift = vdupq_n_u16((x & 0x3f) >> 1);

    // Interpolate the row in two 16-pixel halves; a half entirely past the
    // edge is simply the replicated edge pixel.
    uint8x16_t res16[2];
    for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
      int mdiff = base_max_diff - j;
      if (mdiff <= 0) {
        res16[jj] = a_mbase_x;
      } else {
        // above[base] * 32 + 16 + (above[base + 1] - above[base]) * shift,
        // narrowed with >> 5: a rounded linear interpolation.
        uint8x16_t a0_128 = vld1q_u8(above + base + j);
        uint8x16_t a1_128 = vld1q_u8(above + base + j + 1);
        uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128));
        uint16x8_t diff_hi =
            vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128));
        uint16x8_t a32_lo =
            vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_128), vdup_n_u8(32));
        uint16x8_t a32_hi =
            vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32));
        uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift);
        uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift);

        res16[jj] = vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5));
      }
    }

    // Keep the first base_max_diff interpolated pixels; pad the rest of the
    // row with the replicated edge pixel.
    uint8x16_t mask_lo = vld1q_u8(BaseMask[base_max_diff]);
    uint8x16_t mask_hi = vld1q_u8(BaseMask[base_max_diff] + 16);
    dstvec[r].val[0] = vbslq_u8(mask_lo, res16[0], a_mbase_x);
    dstvec[r].val[1] = vbslq_u8(mask_hi, res16[1], a_mbase_x);
    x += dx;
  }
}
   1376 
   1377 static void dr_prediction_z1_32xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
   1378                                       const uint8_t *above, int dx) {
   1379  uint8x16x2_t dstvec[64];
   1380 
   1381  dr_prediction_z1_32xN_internal_neon(N, dstvec, above, dx);
   1382  for (int i = 0; i < N; i++) {
   1383    vst1q_u8(dst + stride * i, dstvec[i].val[0]);
   1384    vst1q_u8(dst + stride * i + 16, dstvec[i].val[1]);
   1385  }
   1386 }
   1387 
// clang-format off
// TBL shuffle indices used by z1_load_masked_neon: row i gathers the last i
// bytes of a 16-byte source in their original order and fills every
// remaining lane with index 15 (the final loaded byte). Row 0 replicates
// byte 15 into all lanes; row 15 is the identity permutation.
static const uint8_t kLoadMaxShuffles[] = {
  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
  14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
  13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
  12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
  11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
  10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
   9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
   8,  9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15,
   7,  8,  9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15,
   6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15,
   5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15,
   4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15,
   3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15, 15, 15,
   2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15, 15,
   1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15,
   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
};
// clang-format on
   1408 
// Load 16 bytes from ptr, keeping only the trailing shuffle_idx bytes in
// their original lanes and replicating the last loaded byte (ptr[15]) into
// every later lane, via the kLoadMaxShuffles permutation table. Used to read
// the tail of the above[] row without interpolating past its final pixel.
static inline uint8x16_t z1_load_masked_neon(const uint8_t *ptr,
                                             int shuffle_idx) {
  uint8x16_t shuffle = vld1q_u8(&kLoadMaxShuffles[16 * shuffle_idx]);
  uint8x16_t src = vld1q_u8(ptr);
#if AOM_ARCH_AARCH64
  return vqtbl1q_u8(src, shuffle);
#else
  // Armv7 TBL operates on 8-byte lookups only, so split the 16-byte source
  // into a two-register table and permute each half of the indices.
  uint8x8x2_t src2 = { { vget_low_u8(src), vget_high_u8(src) } };
  uint8x8_t lo = vtbl2_u8(src2, vget_low_u8(shuffle));
  uint8x8_t hi = vtbl2_u8(src2, vget_high_u8(shuffle));
  return vcombine_u8(lo, hi);
#endif
}
   1422 
   1423 static void dr_prediction_z1_64xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
   1424                                       const uint8_t *above, int dx) {
   1425  const int frac_bits = 6;
   1426  const int max_base_x = ((64 + N) - 1);
   1427 
   1428  // pre-filter above pixels
   1429  // store in temp buffers:
   1430  //   above[x] * 32 + 16
   1431  //   above[x+1] - above[x]
   1432  // final pixels will be calculated as:
   1433  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
   1434 
   1435  const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]);
   1436 
   1437  int x = dx;
   1438  for (int r = 0; r < N; r++, dst += stride) {
   1439    int base = x >> frac_bits;
   1440    if (base >= max_base_x) {
   1441      for (int i = r; i < N; ++i) {
   1442        vst1q_u8(dst, a_mbase_x);
   1443        vst1q_u8(dst + 16, a_mbase_x);
   1444        vst1q_u8(dst + 32, a_mbase_x);
   1445        vst1q_u8(dst + 48, a_mbase_x);
   1446        dst += stride;
   1447      }
   1448      return;
   1449    }
   1450 
   1451    uint16x8_t shift = vdupq_n_u16((x & 0x3f) >> 1);
   1452    uint8x16_t base_inc128 =
   1453        vaddq_u8(vdupq_n_u8(base), vcombine_u8(vcreate_u8(0x0706050403020100),
   1454                                               vcreate_u8(0x0F0E0D0C0B0A0908)));
   1455 
   1456    for (int j = 0; j < 64; j += 16) {
   1457      if (base + j >= max_base_x) {
   1458        vst1q_u8(dst + j, a_mbase_x);
   1459      } else {
   1460        uint8x16_t a0_128;
   1461        uint8x16_t a1_128;
   1462        if (base + j + 15 >= max_base_x) {
   1463          int shuffle_idx = max_base_x - base - j;
   1464          a0_128 = z1_load_masked_neon(above + (max_base_x - 15), shuffle_idx);
   1465        } else {
   1466          a0_128 = vld1q_u8(above + base + j);
   1467        }
   1468        if (base + j + 16 >= max_base_x) {
   1469          int shuffle_idx = max_base_x - base - j - 1;
   1470          a1_128 = z1_load_masked_neon(above + (max_base_x - 15), shuffle_idx);
   1471        } else {
   1472          a1_128 = vld1q_u8(above + base + j + 1);
   1473        }
   1474 
   1475        uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128));
   1476        uint16x8_t diff_hi =
   1477            vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128));
   1478        uint16x8_t a32_lo =
   1479            vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_128), vdup_n_u8(32));
   1480        uint16x8_t a32_hi =
   1481            vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32));
   1482        uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift);
   1483        uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift);
   1484        vst1q_u8(dst + j,
   1485                 vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5)));
   1486 
   1487        base_inc128 = vaddq_u8(base_inc128, vdupq_n_u8(16));
   1488      }
   1489    }
   1490    x += dx;
   1491  }
   1492 }
   1493 
   1494 // Directional prediction, zone 1: 0 < angle < 90
   1495 void av1_dr_prediction_z1_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
   1496                               const uint8_t *above, const uint8_t *left,
   1497                               int upsample_above, int dx, int dy) {
   1498  (void)left;
   1499  (void)dy;
   1500 
   1501  switch (bw) {
   1502    case 4:
   1503      dr_prediction_z1_4xN_neon(bh, dst, stride, above, upsample_above, dx);
   1504      break;
   1505    case 8:
   1506      dr_prediction_z1_8xN_neon(bh, dst, stride, above, upsample_above, dx);
   1507      break;
   1508    case 16:
   1509      dr_prediction_z1_16xN_neon(bh, dst, stride, above, upsample_above, dx);
   1510      break;
   1511    case 32: dr_prediction_z1_32xN_neon(bh, dst, stride, above, dx); break;
   1512    case 64: dr_prediction_z1_64xN_neon(bh, dst, stride, above, dx); break;
   1513    default: break;
   1514  }
   1515 }
   1516 
   1517 /* ---------------------P R E D I C T I O N   Z 2--------------------------- */
   1518 
#if !AOM_ARCH_AARCH64
// Byte masks keeping the first 4 / 8 / 12 / 16 lanes of a 16-byte vector.
// Indexed by offset_diff / 4 in dr_prediction_z2_NxW_left_neon to zero out
// lanes that were loaded beyond the span of left[] values actually needed.
static DECLARE_ALIGNED(16, const uint8_t, LoadMaskz2[4][16]) = {
  { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
    0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff }
};
#endif  // !AOM_ARCH_AARCH64
   1529 
// Load the "above"-edge sample pairs and per-column 5-bit interpolation
// weights for one 4-wide zone-2 row (row index r, with y = r + 1).
static AOM_FORCE_INLINE void dr_prediction_z2_Nx4_above_neon(
    const uint8_t *above, int upsample_above, int dx, int base_x, int y,
    uint8x8_t *a0_x, uint8x8_t *a1_x, uint16x4_t *shift0) {
  // Column offsets {0, 1, 2, 3} << 6 packed as four 16-bit lanes.
  uint16x4_t r6 = vcreate_u16(0x00C0008000400000);
  uint16x4_t ydx = vdup_n_u16(y * dx);
  if (upsample_above) {
    // Cannot use LD2 here since we only want to load eight bytes, but LD2 can
    // only load either 16 or 32.
    uint8x8_t v_tmp = vld1_u8(above + base_x);
    // De-interleave the upsampled edge: even bytes are a[x], odd a[x+1].
    *a0_x = vuzp_u8(v_tmp, vdup_n_u8(0)).val[0];
    *a1_x = vuzp_u8(v_tmp, vdup_n_u8(0)).val[1];
    *shift0 = vand_u16(vsub_u16(r6, ydx), vdup_n_u16(0x1f));
  } else {
    *a0_x = load_unaligned_u8_4x1(above + base_x);
    *a1_x = load_unaligned_u8_4x1(above + base_x + 1);
    // Halving subtract folds the >> 1 into the weight computation.
    *shift0 = vand_u16(vhsub_u16(r6, ydx), vdup_n_u16(0x1f));
  }
}
   1548 
// Gather the "left"-edge sample pairs (a0_y/a1_y, one byte per 16-bit lane)
// and per-row 5-bit interpolation weights for one 4-wide zone-2 row. On
// AArch64 the caller passes the left edge pre-packed into a two-register TBL
// table (left_vals, covering left[-2] onward); on Armv7 it passes the raw
// pointer and the bytes are gathered lane by lane.
static AOM_FORCE_INLINE void dr_prediction_z2_Nx4_left_neon(
#if AOM_ARCH_AARCH64
    uint8x16x2_t left_vals,
#else
    const uint8_t *left,
#endif
    int upsample_left, int dy, int r, int min_base_y, int frac_bits_y,
    uint16x4_t *a0_y, uint16x4_t *a1_y, uint16x4_t *shift1) {
  int16x4_t dy64 = vdup_n_s16(dy);
  // Column indices {1, 2, 3, 4} as 16-bit lanes.
  int16x4_t v_1234 = vcreate_s16(0x0004000300020001);
  int16x4_t v_frac_bits_y = vdup_n_s16(-frac_bits_y);
  int16x4_t min_base_y64 = vdup_n_s16(min_base_y);
  int16x4_t v_r6 = vdup_n_s16(r << 6);
  // y_c = (r << 6) - c * dy for each column c; base index is y_c >> frac_bits.
  int16x4_t y_c64 = vmls_s16(v_r6, v_1234, dy64);
  int16x4_t base_y_c64 = vshl_s16(y_c64, v_frac_bits_y);

  // Values in base_y_c64 range from -2 through 14 inclusive.
  base_y_c64 = vmax_s16(base_y_c64, min_base_y64);

#if AOM_ARCH_AARCH64
  // Bias indices by 2 so they address left_vals, which starts at left[-2].
  uint8x8_t left_idx0 =
      vreinterpret_u8_s16(vadd_s16(base_y_c64, vdup_n_s16(2)));  // [0, 16]
  uint8x8_t left_idx1 =
      vreinterpret_u8_s16(vadd_s16(base_y_c64, vdup_n_s16(3)));  // [1, 17]

  *a0_y = vreinterpret_u16_u8(vqtbl2_u8(left_vals, left_idx0));
  *a1_y = vreinterpret_u16_u8(vqtbl2_u8(left_vals, left_idx1));
#else   // !AOM_ARCH_AARCH64
  DECLARE_ALIGNED(32, int16_t, base_y_c[4]);

  // Scalar gather of left[base_y_c[i]] into the even byte lanes, leaving the
  // odd (high) bytes zero so each 16-bit lane holds one pixel.
  vst1_s16(base_y_c, base_y_c64);
  uint8x8_t a0_y_u8 = vdup_n_u8(0);
  a0_y_u8 = vld1_lane_u8(left + base_y_c[0], a0_y_u8, 0);
  a0_y_u8 = vld1_lane_u8(left + base_y_c[1], a0_y_u8, 2);
  a0_y_u8 = vld1_lane_u8(left + base_y_c[2], a0_y_u8, 4);
  a0_y_u8 = vld1_lane_u8(left + base_y_c[3], a0_y_u8, 6);

  base_y_c64 = vadd_s16(base_y_c64, vdup_n_s16(1));
  vst1_s16(base_y_c, base_y_c64);
  uint8x8_t a1_y_u8 = vdup_n_u8(0);
  a1_y_u8 = vld1_lane_u8(left + base_y_c[0], a1_y_u8, 0);
  a1_y_u8 = vld1_lane_u8(left + base_y_c[1], a1_y_u8, 2);
  a1_y_u8 = vld1_lane_u8(left + base_y_c[2], a1_y_u8, 4);
  a1_y_u8 = vld1_lane_u8(left + base_y_c[3], a1_y_u8, 6);

  *a0_y = vreinterpret_u16_u8(a0_y_u8);
  *a1_y = vreinterpret_u16_u8(a1_y_u8);
#endif  // AOM_ARCH_AARCH64

  if (upsample_left) {
    *shift1 = vand_u16(vreinterpret_u16_s16(y_c64), vdup_n_u16(0x1f));
  } else {
    // Without upsampling the 6-bit fraction is halved to a 5-bit weight.
    *shift1 =
        vand_u16(vshr_n_u16(vreinterpret_u16_s16(y_c64), 1), vdup_n_u16(0x1f));
  }
}
   1605 
   1606 static AOM_FORCE_INLINE uint8x8_t dr_prediction_z2_Nx8_above_neon(
   1607    const uint8_t *above, int upsample_above, int dx, int base_x, int y) {
   1608  uint16x8_t c1234 = vcombine_u16(vcreate_u16(0x0004000300020001),
   1609                                  vcreate_u16(0x0008000700060005));
   1610  uint16x8_t ydx = vdupq_n_u16(y * dx);
   1611  uint16x8_t r6 = vshlq_n_u16(vextq_u16(c1234, vdupq_n_u16(0), 2), 6);
   1612 
   1613  uint16x8_t shift0;
   1614  uint8x8_t a0_x0;
   1615  uint8x8_t a1_x0;
   1616  if (upsample_above) {
   1617    uint8x8x2_t v_tmp = vld2_u8(above + base_x);
   1618    a0_x0 = v_tmp.val[0];
   1619    a1_x0 = v_tmp.val[1];
   1620    shift0 = vandq_u16(vsubq_u16(r6, ydx), vdupq_n_u16(0x1f));
   1621  } else {
   1622    a0_x0 = vld1_u8(above + base_x);
   1623    a1_x0 = vld1_u8(above + base_x + 1);
   1624    shift0 = vandq_u16(vhsubq_u16(r6, ydx), vdupq_n_u16(0x1f));
   1625  }
   1626 
   1627  uint16x8_t diff0 = vsubl_u8(a1_x0, a0_x0);  // a[x+1] - a[x]
   1628  uint16x8_t a32 =
   1629      vmlal_u8(vdupq_n_u16(16), a0_x0, vdup_n_u8(32));  // a[x] * 32 + 16
   1630  uint16x8_t res = vmlaq_u16(a32, diff0, shift0);
   1631  return vshrn_n_u16(res, 5);
   1632 }
   1633 
// Interpolate the "left"-edge contribution of one 8-wide zone-2 row,
// returning eight predicted pixels. On AArch64 the caller passes the left
// edge pre-packed into a three-register TBL table (left_vals, covering
// left[-2] onward); on Armv7 it passes the raw pointer and a byte gather is
// used instead.
static AOM_FORCE_INLINE uint8x8_t dr_prediction_z2_Nx8_left_neon(
#if AOM_ARCH_AARCH64
    uint8x16x3_t left_vals,
#else
    const uint8_t *left,
#endif
    int upsample_left, int dy, int r, int min_base_y, int frac_bits_y) {
  int16x8_t v_r6 = vdupq_n_s16(r << 6);
  int16x8_t dy128 = vdupq_n_s16(dy);
  int16x8_t v_frac_bits_y = vdupq_n_s16(-frac_bits_y);
  int16x8_t min_base_y128 = vdupq_n_s16(min_base_y);

  // Column indices {1..8} as 16-bit lanes.
  uint16x8_t c1234 = vcombine_u16(vcreate_u16(0x0004000300020001),
                                  vcreate_u16(0x0008000700060005));
  // y_c = (r << 6) - c * dy for each column c; base index is y_c >> frac_bits.
  int16x8_t y_c128 = vmlsq_s16(v_r6, vreinterpretq_s16_u16(c1234), dy128);
  int16x8_t base_y_c128 = vshlq_s16(y_c128, v_frac_bits_y);

  // Values in base_y_c128 range from -2 through 31 inclusive.
  base_y_c128 = vmaxq_s16(base_y_c128, min_base_y128);

#if AOM_ARCH_AARCH64
  // Bias indices by 2 so they address left_vals, which starts at left[-2].
  uint8x16_t left_idx0 =
      vreinterpretq_u8_s16(vaddq_s16(base_y_c128, vdupq_n_s16(2)));  // [0, 33]
  uint8x16_t left_idx1 =
      vreinterpretq_u8_s16(vaddq_s16(base_y_c128, vdupq_n_s16(3)));  // [1, 34]
  // Pack both index sets into one vector so a single TBL fetches a[x] in the
  // low half and a[x+1] in the high half.
  uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1);

  uint8x16_t a01_x = vqtbl3q_u8(left_vals, left_idx01);
  uint8x8_t a0_x1 = vget_low_u8(a01_x);
  uint8x8_t a1_x1 = vget_high_u8(a01_x);
#else   // !AOM_ARCH_AARCH64
  uint8x8_t a0_x1 = load_u8_gather_s16_x8(left, base_y_c128);
  uint8x8_t a1_x1 = load_u8_gather_s16_x8(left + 1, base_y_c128);
#endif  // AOM_ARCH_AARCH64

  uint16x8_t shift1;
  if (upsample_left) {
    shift1 = vandq_u16(vreinterpretq_u16_s16(y_c128), vdupq_n_u16(0x1f));
  } else {
    // Without upsampling the 6-bit fraction is halved to a 5-bit weight.
    shift1 = vshrq_n_u16(
        vandq_u16(vreinterpretq_u16_s16(y_c128), vdupq_n_u16(0x3f)), 1);
  }

  // (a[x] * 32 + 16 + (a[x+1] - a[x]) * shift) >> 5, computed in 16 bits.
  uint16x8_t diff1 = vsubl_u8(a1_x1, a0_x1);
  uint16x8_t a32 = vmlal_u8(vdupq_n_u16(16), a0_x1, vdup_n_u8(32));
  uint16x8_t res = vmlaq_u16(a32, diff1, shift1);
  return vshrn_n_u16(res, 5);
}
   1682 
// Interpolate the "above"-edge contribution of a 16-column chunk (columns
// j..j+15) of one zone-2 row, returning sixteen predicted pixels. Only
// called for widths >= 16, where upsampling is never enabled.
static AOM_FORCE_INLINE uint8x16_t dr_prediction_z2_NxW_above_neon(
    const uint8_t *above, int dx, int base_x, int y, int j) {
  // Column indices {0..15} as two vectors of 16-bit lanes.
  uint16x8x2_t c0123 = { { vcombine_u16(vcreate_u16(0x0003000200010000),
                                        vcreate_u16(0x0007000600050004)),
                           vcombine_u16(vcreate_u16(0x000B000A00090008),
                                        vcreate_u16(0x000F000E000D000C)) } };
  uint16x8_t j256 = vdupq_n_u16(j);
  uint16x8_t ydx = vdupq_n_u16((uint16_t)(y * dx));

  const uint8x16_t a0_x128 = vld1q_u8(above + base_x + j);
  const uint8x16_t a1_x128 = vld1q_u8(above + base_x + j + 1);
  // Per-column 5-bit weights: (((c + j) << 6) - y*dx) & 0x3f, halved.
  uint16x8_t res6_0 = vshlq_n_u16(vaddq_u16(c0123.val[0], j256), 6);
  uint16x8_t res6_1 = vshlq_n_u16(vaddq_u16(c0123.val[1], j256), 6);
  uint16x8_t shift0 =
      vshrq_n_u16(vandq_u16(vsubq_u16(res6_0, ydx), vdupq_n_u16(0x3f)), 1);
  uint16x8_t shift1 =
      vshrq_n_u16(vandq_u16(vsubq_u16(res6_1, ydx), vdupq_n_u16(0x3f)), 1);
  // a[x+1] - a[x]
  uint16x8_t diff0 = vsubl_u8(vget_low_u8(a1_x128), vget_low_u8(a0_x128));
  uint16x8_t diff1 = vsubl_u8(vget_high_u8(a1_x128), vget_high_u8(a0_x128));
  // a[x] * 32 + 16
  uint16x8_t a32_0 =
      vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_x128), vdup_n_u8(32));
  uint16x8_t a32_1 =
      vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_x128), vdup_n_u8(32));
  uint16x8_t res0 = vmlaq_u16(a32_0, diff0, shift0);
  uint16x8_t res1 = vmlaq_u16(a32_1, diff1, shift1);
  return vcombine_u8(vshrn_n_u16(res0, 5), vshrn_n_u16(res1, 5));
}
   1712 
// Interpolate the "left"-edge contribution of a 16-column chunk (columns
// j..j+15) of one zone-2 row, returning sixteen predicted pixels. On AArch64
// the caller passes the left edge pre-packed into two four-register TBL
// tables (left_vals0 at left[-1], left_vals1 at left[0]); on Armv7 it passes
// the raw pointer and the bytes are gathered, with a TBL-based fast path
// when the needed values span fewer than 16 bytes.
static AOM_FORCE_INLINE uint8x16_t dr_prediction_z2_NxW_left_neon(
#if AOM_ARCH_AARCH64
    uint8x16x4_t left_vals0, uint8x16x4_t left_vals1,
#else
    const uint8_t *left,
#endif
    int dy, int r, int j) {
  // here upsample_above and upsample_left are 0 by design of
  // av1_use_intra_edge_upsample
  const int min_base_y = -1;

  int16x8_t min_base_y256 = vdupq_n_s16(min_base_y);
  int16x8_t half_min_base_y256 = vdupq_n_s16(min_base_y >> 1);
  int16x8_t dy256 = vdupq_n_s16(dy);
  uint16x8_t j256 = vdupq_n_u16(j);

  // Column indices {0..15} and {1..16} as pairs of 16-bit lane vectors.
  uint16x8x2_t c0123 = { { vcombine_u16(vcreate_u16(0x0003000200010000),
                                        vcreate_u16(0x0007000600050004)),
                           vcombine_u16(vcreate_u16(0x000B000A00090008),
                                        vcreate_u16(0x000F000E000D000C)) } };
  uint16x8x2_t c1234 = { { vaddq_u16(c0123.val[0], vdupq_n_u16(1)),
                           vaddq_u16(c0123.val[1], vdupq_n_u16(1)) } };

  int16x8_t v_r6 = vdupq_n_s16(r << 6);

  // y_c = (r << 6) - (c + j) * dy per column; the unsigned min clamps the
  // product so the subtraction cannot wrap.
  int16x8_t c256_0 = vreinterpretq_s16_u16(vaddq_u16(j256, c1234.val[0]));
  int16x8_t c256_1 = vreinterpretq_s16_u16(vaddq_u16(j256, c1234.val[1]));
  int16x8_t mul16_lo = vreinterpretq_s16_u16(
      vminq_u16(vreinterpretq_u16_s16(vmulq_s16(c256_0, dy256)),
                vreinterpretq_u16_s16(half_min_base_y256)));
  int16x8_t mul16_hi = vreinterpretq_s16_u16(
      vminq_u16(vreinterpretq_u16_s16(vmulq_s16(c256_1, dy256)),
                vreinterpretq_u16_s16(half_min_base_y256)));
  int16x8_t y_c256_lo = vsubq_s16(v_r6, mul16_lo);
  int16x8_t y_c256_hi = vsubq_s16(v_r6, mul16_hi);

  // Base indices into left[], clamped below at min_base_y.
  int16x8_t base_y_c256_lo = vshrq_n_s16(y_c256_lo, 6);
  int16x8_t base_y_c256_hi = vshrq_n_s16(y_c256_hi, 6);

  base_y_c256_lo = vmaxq_s16(min_base_y256, base_y_c256_lo);
  base_y_c256_hi = vmaxq_s16(min_base_y256, base_y_c256_hi);

#if !AOM_ARCH_AARCH64
  // Indices decrease monotonically across the lanes, so lane 0 of the low
  // vector is the maximum and lane 7 of the high vector the minimum.
  int16_t min_y = vgetq_lane_s16(base_y_c256_hi, 7);
  int16_t max_y = vgetq_lane_s16(base_y_c256_lo, 0);
  int16_t offset_diff = max_y - min_y;

  uint8x8_t a0_y0;
  uint8x8_t a0_y1;
  uint8x8_t a1_y0;
  uint8x8_t a1_y1;
  if (offset_diff < 16) {
    // Avoid gathers where the data we want is close together in memory.
    // We don't need this for AArch64 since we can already use TBL to cover the
    // full range of possible values.
    assert(offset_diff >= 0);
    int16x8_t min_y256 = vdupq_lane_s16(vget_high_s16(base_y_c256_hi), 3);

    int16x8x2_t base_y_offset;
    base_y_offset.val[0] = vsubq_s16(base_y_c256_lo, min_y256);
    base_y_offset.val[1] = vsubq_s16(base_y_c256_hi, min_y256);

    int8x16_t base_y_offset128 = vcombine_s8(vqmovn_s16(base_y_offset.val[0]),
                                             vqmovn_s16(base_y_offset.val[1]));

    // Load the whole needed span once, mask off bytes past the span, then
    // permute the span into per-lane order with TBL.
    uint8x16_t v_loadmaskz2 = vld1q_u8(LoadMaskz2[offset_diff / 4]);
    uint8x16_t a0_y128 = vld1q_u8(left + min_y);
    uint8x16_t a1_y128 = vld1q_u8(left + min_y + 1);
    a0_y128 = vandq_u8(a0_y128, v_loadmaskz2);
    a1_y128 = vandq_u8(a1_y128, v_loadmaskz2);

    uint8x8_t v_index_low = vget_low_u8(vreinterpretq_u8_s8(base_y_offset128));
    uint8x8_t v_index_high =
        vget_high_u8(vreinterpretq_u8_s8(base_y_offset128));
    uint8x8x2_t v_tmp, v_res;
    v_tmp.val[0] = vget_low_u8(a0_y128);
    v_tmp.val[1] = vget_high_u8(a0_y128);
    v_res.val[0] = vtbl2_u8(v_tmp, v_index_low);
    v_res.val[1] = vtbl2_u8(v_tmp, v_index_high);
    a0_y128 = vcombine_u8(v_res.val[0], v_res.val[1]);
    v_tmp.val[0] = vget_low_u8(a1_y128);
    v_tmp.val[1] = vget_high_u8(a1_y128);
    v_res.val[0] = vtbl2_u8(v_tmp, v_index_low);
    v_res.val[1] = vtbl2_u8(v_tmp, v_index_high);
    a1_y128 = vcombine_u8(v_res.val[0], v_res.val[1]);

    a0_y0 = vget_low_u8(a0_y128);
    a0_y1 = vget_high_u8(a0_y128);
    a1_y0 = vget_low_u8(a1_y128);
    a1_y1 = vget_high_u8(a1_y128);
  } else {
    // Span too wide for one load: fall back to lane-by-lane gathers.
    a0_y0 = load_u8_gather_s16_x8(left, base_y_c256_lo);
    a0_y1 = load_u8_gather_s16_x8(left, base_y_c256_hi);
    a1_y0 = load_u8_gather_s16_x8(left + 1, base_y_c256_lo);
    a1_y1 = load_u8_gather_s16_x8(left + 1, base_y_c256_hi);
  }
#else
  // Values in left_idx{0,1} range from 0 through 63 inclusive.
  uint8x16_t left_idx0 =
      vreinterpretq_u8_s16(vaddq_s16(base_y_c256_lo, vdupq_n_s16(1)));
  uint8x16_t left_idx1 =
      vreinterpretq_u8_s16(vaddq_s16(base_y_c256_hi, vdupq_n_s16(1)));
  // Pack both halves' indices so one TBL per table fetches all 16 lanes.
  uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1);

  uint8x16_t a0_y01 = vqtbl4q_u8(left_vals0, left_idx01);
  uint8x16_t a1_y01 = vqtbl4q_u8(left_vals1, left_idx01);

  uint8x8_t a0_y0 = vget_low_u8(a0_y01);
  uint8x8_t a0_y1 = vget_high_u8(a0_y01);
  uint8x8_t a1_y0 = vget_low_u8(a1_y01);
  uint8x8_t a1_y1 = vget_high_u8(a1_y01);
#endif  // !AOM_ARCH_AARCH64

  // Per-lane 5-bit weights: (y_c & 0x3f) >> 1.
  uint16x8_t shifty_lo = vshrq_n_u16(
      vandq_u16(vreinterpretq_u16_s16(y_c256_lo), vdupq_n_u16(0x3f)), 1);
  uint16x8_t shifty_hi = vshrq_n_u16(
      vandq_u16(vreinterpretq_u16_s16(y_c256_hi), vdupq_n_u16(0x3f)), 1);

  // a[x+1] - a[x]
  uint16x8_t diff_lo = vsubl_u8(a1_y0, a0_y0);
  uint16x8_t diff_hi = vsubl_u8(a1_y1, a0_y1);
  // a[x] * 32 + 16
  uint16x8_t a32_lo = vmlal_u8(vdupq_n_u16(16), a0_y0, vdup_n_u8(32));
  uint16x8_t a32_hi = vmlal_u8(vdupq_n_u16(16), a0_y1, vdup_n_u8(32));

  uint16x8_t res0 = vmlaq_u16(a32_lo, diff_lo, shifty_lo);
  uint16x8_t res1 = vmlaq_u16(a32_hi, diff_hi, shifty_hi);

  return vcombine_u8(vshrn_n_u16(res0, 5), vshrn_n_u16(res1, 5));
}
   1843 
// Zone-2 directional prediction (90 < angle < 180) for 4-wide blocks. Each
// row mixes pixels interpolated from the above edge (columns right of the
// block boundary) with pixels interpolated from the left edge, selected per
// column by base_min_diff.
static void dr_prediction_z2_Nx4_neon(int N, uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above, const uint8_t *left,
                                      int upsample_above, int upsample_left,
                                      int dx, int dy) {
  const int min_base_x = -(1 << upsample_above);
  const int min_base_y = -(1 << upsample_left);
  const int frac_bits_x = 6 - upsample_above;
  const int frac_bits_y = 6 - upsample_left;

  assert(dx > 0);
  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5

#if AOM_ARCH_AARCH64
  // Pre-pack the left edge (starting at left[-2]) into a two-register TBL
  // table for dr_prediction_z2_Nx4_left_neon.
  // Use ext rather than loading left + 14 directly to avoid over-read.
  const uint8x16_t left_m2 = vld1q_u8(left - 2);
  const uint8x16_t left_0 = vld1q_u8(left);
  const uint8x16_t left_14 = vextq_u8(left_0, left_0, 14);
  const uint8x16x2_t left_vals = { { left_m2, left_14 } };
#define LEFT left_vals
#else  // !AOM_ARCH_AARCH64
#define LEFT left
#endif  // AOM_ARCH_AARCH64

  for (int r = 0; r < N; r++) {
    int y = r + 1;
    int base_x = (-y * dx) >> frac_bits_x;
    // Number of leading columns (0..4+) that must come from the left edge.
    const int base_min_diff =
        (min_base_x - ((-y * dx) >> frac_bits_x) + upsample_above) >>
        upsample_above;

    if (base_min_diff <= 0) {
      // Whole row comes from the above edge.
      uint8x8_t a0_x_u8, a1_x_u8;
      uint16x4_t shift0;
      dr_prediction_z2_Nx4_above_neon(above, upsample_above, dx, base_x, y,
                                      &a0_x_u8, &a1_x_u8, &shift0);
      uint8x8_t a0_x = a0_x_u8;
      uint8x8_t a1_x = a1_x_u8;

      uint16x8_t diff = vsubl_u8(a1_x, a0_x);  // a[x+1] - a[x]
      uint16x8_t a32 =
          vmlal_u8(vdupq_n_u16(16), a0_x, vdup_n_u8(32));  // a[x] * 32 + 16
      uint16x8_t res =
          vmlaq_u16(a32, diff, vcombine_u16(shift0, vdup_n_u16(0)));
      uint8x8_t resx = vshrn_n_u16(res, 5);
      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(resx), 0);
    } else if (base_min_diff < 4) {
      // Mixed row: compute both edge contributions in one 8-lane pipeline
      // (above samples in the low half, left samples in the high half) and
      // blend per column with BaseMask.
      uint8x8_t a0_x_u8, a1_x_u8;
      uint16x4_t shift0;
      dr_prediction_z2_Nx4_above_neon(above, upsample_above, dx, base_x, y,
                                      &a0_x_u8, &a1_x_u8, &shift0);
      uint16x8_t a0_x = vmovl_u8(a0_x_u8);
      uint16x8_t a1_x = vmovl_u8(a1_x_u8);

      uint16x4_t a0_y;
      uint16x4_t a1_y;
      uint16x4_t shift1;
      dr_prediction_z2_Nx4_left_neon(LEFT, upsample_left, dy, r, min_base_y,
                                     frac_bits_y, &a0_y, &a1_y, &shift1);
      a0_x = vcombine_u16(vget_low_u16(a0_x), a0_y);
      a1_x = vcombine_u16(vget_low_u16(a1_x), a1_y);

      uint16x8_t shift = vcombine_u16(shift0, shift1);
      uint16x8_t diff = vsubq_u16(a1_x, a0_x);  // a[x+1] - a[x]
      uint16x8_t a32 =
          vmlaq_n_u16(vdupq_n_u16(16), a0_x, 32);  // a[x] * 32 + 16
      uint16x8_t res = vmlaq_u16(a32, diff, shift);
      uint8x8_t resx = vshrn_n_u16(res, 5);
      uint8x8_t resy = vext_u8(resx, vdup_n_u8(0), 4);

      uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]);
      uint8x8_t v_resxy = vbsl_u8(mask, resy, resx);
      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(v_resxy), 0);
    } else {
      // Whole row comes from the left edge.
      uint16x4_t a0_y, a1_y;
      uint16x4_t shift1;
      dr_prediction_z2_Nx4_left_neon(LEFT, upsample_left, dy, r, min_base_y,
                                     frac_bits_y, &a0_y, &a1_y, &shift1);
      uint16x4_t diff = vsub_u16(a1_y, a0_y);                 // a[x+1] - a[x]
      uint16x4_t a32 = vmla_n_u16(vdup_n_u16(16), a0_y, 32);  // a[x] * 32 + 16
      uint16x4_t res = vmla_u16(a32, diff, shift1);
      uint8x8_t resy = vshrn_n_u16(vcombine_u16(res, vdup_n_u16(0)), 5);

      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(resy), 0);
    }

    dst += stride;
  }
#undef LEFT
}
   1938 
// Zone-2 directional prediction (90 < angle < 180) for 8-wide blocks. Each
// row is built from the above edge, the left edge, or a per-column blend of
// both, selected by base_min_diff.
static void dr_prediction_z2_Nx8_neon(int N, uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above, const uint8_t *left,
                                      int upsample_above, int upsample_left,
                                      int dx, int dy) {
  const int min_base_x = -(1 << upsample_above);
  const int min_base_y = -(1 << upsample_left);
  const int frac_bits_x = 6 - upsample_above;
  const int frac_bits_y = 6 - upsample_left;

  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5

#if AOM_ARCH_AARCH64
  // Pre-pack the left edge (starting at left[-2]) into a three-register TBL
  // table for dr_prediction_z2_Nx8_left_neon.
  // Use ext rather than loading left + 30 directly to avoid over-read.
  const uint8x16_t left_m2 = vld1q_u8(left - 2);
  const uint8x16_t left_0 = vld1q_u8(left + 0);
  const uint8x16_t left_16 = vld1q_u8(left + 16);
  const uint8x16_t left_14 = vextq_u8(left_0, left_16, 14);
  const uint8x16_t left_30 = vextq_u8(left_16, left_16, 14);
  const uint8x16x3_t left_vals = { { left_m2, left_14, left_30 } };
#define LEFT left_vals
#else  // !AOM_ARCH_AARCH64
#define LEFT left
#endif  // AOM_ARCH_AARCH64

  for (int r = 0; r < N; r++) {
    int y = r + 1;
    int base_x = (-y * dx) >> frac_bits_x;
    // Number of leading columns (0..8+) that must come from the left edge.
    int base_min_diff =
        (min_base_x - base_x + upsample_above) >> upsample_above;

    if (base_min_diff <= 0) {
      // Whole row comes from the above edge.
      uint8x8_t resx =
          dr_prediction_z2_Nx8_above_neon(above, upsample_above, dx, base_x, y);
      vst1_u8(dst, resx);
    } else if (base_min_diff < 8) {
      // Mixed row: blend above- and left-edge results per column.
      uint8x8_t resx =
          dr_prediction_z2_Nx8_above_neon(above, upsample_above, dx, base_x, y);
      uint8x8_t resy = dr_prediction_z2_Nx8_left_neon(
          LEFT, upsample_left, dy, r, min_base_y, frac_bits_y);
      uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]);
      uint8x8_t resxy = vbsl_u8(mask, resy, resx);
      vst1_u8(dst, resxy);
    } else {
      // Whole row comes from the left edge.
      uint8x8_t resy = dr_prediction_z2_Nx8_left_neon(
          LEFT, upsample_left, dy, r, min_base_y, frac_bits_y);
      vst1_u8(dst, resy);
    }

    dst += stride;
  }
#undef LEFT
}
   1996 
// Zone-2 directional prediction (90 < angle < 180) for blocks of width >= 16.
// Each row is processed in 16-column chunks, each chunk built from the above
// edge, the left edge, or a per-column blend of both.
static void dr_prediction_z2_HxW_neon(int H, int W, uint8_t *dst,
                                      ptrdiff_t stride, const uint8_t *above,
                                      const uint8_t *left, int dx, int dy) {
  // here upsample_above and upsample_left are 0 by design of
  // av1_use_intra_edge_upsample
  const int min_base_x = -1;

#if AOM_ARCH_AARCH64
  // Pre-pack the left edge into two four-register TBL tables: left_vals0
  // starts at left[-1] (for a[x]) and left_vals1 at left[0] (for a[x+1]).
  const uint8x16_t left_m1 = vld1q_u8(left - 1);
  const uint8x16_t left_0 = vld1q_u8(left + 0);
  const uint8x16_t left_16 = vld1q_u8(left + 16);
  const uint8x16_t left_32 = vld1q_u8(left + 32);
  const uint8x16_t left_48 = vld1q_u8(left + 48);
  const uint8x16_t left_15 = vextq_u8(left_0, left_16, 15);
  const uint8x16_t left_31 = vextq_u8(left_16, left_32, 15);
  const uint8x16_t left_47 = vextq_u8(left_32, left_48, 15);
  const uint8x16x4_t left_vals0 = { { left_m1, left_15, left_31, left_47 } };
  const uint8x16x4_t left_vals1 = { { left_0, left_16, left_32, left_48 } };
#define LEFT left_vals0, left_vals1
#else  // !AOM_ARCH_AARCH64
#define LEFT left
#endif  // AOM_ARCH_AARCH64

  for (int r = 0; r < H; r++) {
    int y = r + 1;
    int base_x = (-y * dx) >> 6;
    for (int j = 0; j < W; j += 16) {
      // Number of leading columns of this chunk sourced from the left edge.
      const int base_min_diff = min_base_x - base_x - j;

      if (base_min_diff <= 0) {
        // Whole chunk comes from the above edge.
        uint8x16_t resx =
            dr_prediction_z2_NxW_above_neon(above, dx, base_x, y, j);
        vst1q_u8(dst + j, resx);
      } else if (base_min_diff < 16) {
        // Mixed chunk: blend above- and left-edge results per column.
        uint8x16_t resx =
            dr_prediction_z2_NxW_above_neon(above, dx, base_x, y, j);
        uint8x16_t resy = dr_prediction_z2_NxW_left_neon(LEFT, dy, r, j);
        uint8x16_t mask = vld1q_u8(BaseMask[base_min_diff]);
        uint8x16_t resxy = vbslq_u8(mask, resy, resx);
        vst1q_u8(dst + j, resxy);
      } else {
        // Whole chunk comes from the left edge.
        uint8x16_t resy = dr_prediction_z2_NxW_left_neon(LEFT, dy, r, j);
        vst1q_u8(dst + j, resy);
      }
    }  // for j
    dst += stride;
  }
#undef LEFT
}
   2046 
   2047 // Directional prediction, zone 2: 90 < angle < 180
   2048 void av1_dr_prediction_z2_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
   2049                               const uint8_t *above, const uint8_t *left,
   2050                               int upsample_above, int upsample_left, int dx,
   2051                               int dy) {
   2052  assert(dx > 0);
   2053  assert(dy > 0);
   2054 
   2055  switch (bw) {
   2056    case 4:
   2057      dr_prediction_z2_Nx4_neon(bh, dst, stride, above, left, upsample_above,
   2058                                upsample_left, dx, dy);
   2059      break;
   2060    case 8:
   2061      dr_prediction_z2_Nx8_neon(bh, dst, stride, above, left, upsample_above,
   2062                                upsample_left, dx, dy);
   2063      break;
   2064    default:
   2065      dr_prediction_z2_HxW_neon(bh, bw, dst, stride, above, left, dx, dy);
   2066      break;
   2067  }
   2068 }
   2069 
   2070 /* ---------------------P R E D I C T I O N   Z 3--------------------------- */
   2071 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// Transpose a 4x16 byte tile (4 vectors of 16 bytes) into 16 columns of 4
// bytes, packed into the two uint8x16x2_t outputs.
static AOM_FORCE_INLINE void z3_transpose_arrays_u8_16x4(const uint8x16_t *x,
                                                         uint8x16x2_t *d) {
  // Interleave rows 0/1 and 2/3 at byte granularity, then interleave the
  // results at 16-bit granularity so each 32-bit group is one column.
  uint8x16x2_t w0 = vzipq_u8(x[0], x[1]);
  uint8x16x2_t w1 = vzipq_u8(x[2], x[3]);

  d[0] = aom_reinterpretq_u8_u16_x2(vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
                                              vreinterpretq_u16_u8(w1.val[0])));
  d[1] = aom_reinterpretq_u8_u16_x2(vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
                                              vreinterpretq_u16_u8(w1.val[1])));
}
   2082 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   2083 
// Transpose a 4x4 byte tile held in the low halves of 4 vectors; the four
// transposed 4-byte columns are packed into a single uint8x8x2_t.
static AOM_FORCE_INLINE void z3_transpose_arrays_u8_4x4(const uint8x8_t *x,
                                                        uint8x8x2_t *d) {
  // Byte-interleave row pairs, then 16-bit-interleave the low results.
  uint8x8x2_t w0 = vzip_u8(x[0], x[1]);
  uint8x8x2_t w1 = vzip_u8(x[2], x[3]);

  *d = aom_reinterpret_u8_u16_x2(
      vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0])));
}
   2092 
// Transpose a 4x8 byte tile (4 vectors of 8 bytes) into 8 columns of 4
// bytes, packed into the two uint8x8x2_t outputs.
static AOM_FORCE_INLINE void z3_transpose_arrays_u8_8x4(const uint8x8_t *x,
                                                        uint8x8x2_t *d) {
  // Byte-interleave row pairs, then 16-bit-interleave low and high halves.
  uint8x8x2_t w0 = vzip_u8(x[0], x[1]);
  uint8x8x2_t w1 = vzip_u8(x[2], x[3]);

  d[0] = aom_reinterpret_u8_u16_x2(
      vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0])));
  d[1] = aom_reinterpret_u8_u16_x2(
      vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1])));
}
   2103 
   2104 static void z3_transpose_arrays_u8_16x16(const uint8_t *src, ptrdiff_t pitchSrc,
   2105                                         uint8_t *dst, ptrdiff_t pitchDst) {
   2106  // The same as the normal transposes in transpose_neon.h, but with a stride
   2107  // between consecutive vectors of elements.
   2108  uint8x16_t r[16];
   2109  uint8x16_t d[16];
   2110  for (int i = 0; i < 16; i++) {
   2111    r[i] = vld1q_u8(src + i * pitchSrc);
   2112  }
   2113  transpose_arrays_u8_16x16(r, d);
   2114  for (int i = 0; i < 16; i++) {
   2115    vst1q_u8(dst + i * pitchDst, d[i]);
   2116  }
   2117 }
   2118 
   2119 static void z3_transpose_arrays_u8_16nx16n(const uint8_t *src,
   2120                                           ptrdiff_t pitchSrc, uint8_t *dst,
   2121                                           ptrdiff_t pitchDst, int width,
   2122                                           int height) {
   2123  for (int j = 0; j < height; j += 16) {
   2124    for (int i = 0; i < width; i += 16) {
   2125      z3_transpose_arrays_u8_16x16(src + i * pitchSrc + j, pitchSrc,
   2126                                   dst + j * pitchDst + i, pitchDst);
   2127    }
   2128  }
   2129 }
   2130 
// Zone-3 4x4: run the zone-1 kernel over the left edge (producing the
// transposed block), then transpose the result into dst.
static void dr_prediction_z3_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *left, int upsample_left,
                                      int dy) {
  uint8x8_t dstvec[4];  // transposed block, one 4-byte row per vector
  uint8x8x2_t dest;

  dr_prediction_z1_HxW_internal_neon_64(4, 4, dstvec, left, upsample_left, dy);
  z3_transpose_arrays_u8_4x4(dstvec, &dest);
  // Each uint8x8_t holds two consecutive 4-byte output rows.
  store_u8x4_strided_x2(dst + stride * 0, stride, dest.val[0]);
  store_u8x4_strided_x2(dst + stride * 2, stride, dest.val[1]);
}
   2142 
// Zone-3 8x8: zone-1 kernel on the left edge, then an 8x8 transpose.
static void dr_prediction_z3_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *left, int upsample_left,
                                      int dy) {
  uint8x8_t dstvec[8];  // transposed block
  uint8x8_t d[8];

  dr_prediction_z1_HxW_internal_neon_64(8, 8, dstvec, left, upsample_left, dy);
  transpose_arrays_u8_8x8(dstvec, d);
  store_u8_8x8(dst, stride, d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7]);
}
   2153 
// Zone-3 4x8: compute the transposed 8x4 block with the zone-1 kernel, then
// transpose and store as 8 rows of 4 bytes.
static void dr_prediction_z3_4x8_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *left, int upsample_left,
                                      int dy) {
  uint8x8_t dstvec[4];
  uint8x8x2_t d[2];

  dr_prediction_z1_HxW_internal_neon_64(8, 4, dstvec, left, upsample_left, dy);
  z3_transpose_arrays_u8_8x4(dstvec, d);
  // Each uint8x8_t holds two consecutive 4-byte output rows.
  store_u8x4_strided_x2(dst + stride * 0, stride, d[0].val[0]);
  store_u8x4_strided_x2(dst + stride * 2, stride, d[0].val[1]);
  store_u8x4_strided_x2(dst + stride * 4, stride, d[1].val[0]);
  store_u8x4_strided_x2(dst + stride * 6, stride, d[1].val[1]);
}
   2167 
// Zone-3 8x4: compute the transposed 4x8 block, transpose 8x8 (the unused
// rows are ignored), and store the first 4 output rows.
static void dr_prediction_z3_8x4_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *left, int upsample_left,
                                      int dy) {
  uint8x8_t dstvec[8];
  uint8x8_t d[8];

  dr_prediction_z1_HxW_internal_neon_64(4, 8, dstvec, left, upsample_left, dy);
  transpose_arrays_u8_8x8(dstvec, d);
  store_u8_8x4(dst, stride, d[0], d[1], d[2], d[3]);
}
   2178 
// Zone-3 8x16: zone-1 kernel produces the transposed 16x8 block; transpose
// and store 16 rows of 8 bytes.
static void dr_prediction_z3_8x16_neon(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  uint8x16_t dstvec[8];
  uint8x8_t d[16];

  dr_prediction_z1_HxW_internal_neon(16, 8, dstvec, left, upsample_left, dy);
  transpose_arrays_u8_16x8(dstvec, d);
  for (int i = 0; i < 16; i++) {
    vst1_u8(dst + i * stride, d[i]);
  }
}
   2191 
// Zone-3 16x8: zone-1 kernel produces the transposed 8x16 block; transpose
// and store 8 rows of 16 bytes.
static void dr_prediction_z3_16x8_neon(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  uint8x8_t dstvec[16];
  uint8x16_t d[8];

  dr_prediction_z1_HxW_internal_neon_64(8, 16, dstvec, left, upsample_left, dy);
  transpose_arrays_u8_8x16(dstvec, d);
  for (int i = 0; i < 8; i++) {
    vst1q_u8(dst + i * stride, d[i]);
  }
}
   2204 
   2205 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// Zone-3 4x16: zone-1 kernel produces the transposed 16x4 block; transpose
// and store 16 rows of 4 bytes.
static void dr_prediction_z3_4x16_neon(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  uint8x16_t dstvec[4];
  uint8x16x2_t d[2];

  dr_prediction_z1_HxW_internal_neon(16, 4, dstvec, left, upsample_left, dy);
  z3_transpose_arrays_u8_16x4(dstvec, d);
  // Each uint8x16_t holds four consecutive 4-byte output rows.
  store_u8x4_strided_x4(dst + stride * 0, stride, d[0].val[0]);
  store_u8x4_strided_x4(dst + stride * 4, stride, d[0].val[1]);
  store_u8x4_strided_x4(dst + stride * 8, stride, d[1].val[0]);
  store_u8x4_strided_x4(dst + stride * 12, stride, d[1].val[1]);
}
   2219 
// Zone-3 16x4: zone-1 kernel produces the transposed 4x16 block; transpose
// 8x16 (extra rows unused) and store the first 4 rows of 16 bytes.
static void dr_prediction_z3_16x4_neon(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  uint8x8_t dstvec[16];
  uint8x16_t d[8];

  dr_prediction_z1_HxW_internal_neon_64(4, 16, dstvec, left, upsample_left, dy);
  transpose_arrays_u8_8x16(dstvec, d);
  for (int i = 0; i < 4; i++) {
    vst1q_u8(dst + i * stride, d[i]);
  }
}
   2232 
// Zone-3 8x32: zone-1 32-wide kernel produces the transposed 32x8 block.
static void dr_prediction_z3_8x32_neon(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  // Unused: the 32-wide zone-1 kernel takes no upsample flag.
  (void)upsample_left;
  uint8x16x2_t dstvec[16];
  uint8x16_t d[32];
  uint8x16_t v_zero = vdupq_n_u8(0);

  dr_prediction_z1_32xN_internal_neon(8, dstvec, left, dy);
  // Only 8 of the 16 input rows carry real data; zero the rest so the full
  // 32x16 transpose helper can be reused.
  for (int i = 8; i < 16; i++) {
    dstvec[i].val[0] = v_zero;
    dstvec[i].val[1] = v_zero;
  }
  transpose_arrays_u8_32x16(dstvec, d);
  // Only the low 8 bytes of each transposed row belong to the 8x32 block.
  for (int i = 0; i < 32; i++) {
    vst1_u8(dst + i * stride, vget_low_u8(d[i]));
  }
}
   2251 
// Zone-3 32x8: zone-1 kernel produces the transposed 8x32 block; transpose
// in two 8x16 halves and store each output row as two 16-byte writes.
static void dr_prediction_z3_32x8_neon(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  uint8x8_t dstvec[32];
  uint8x16_t d[16];

  dr_prediction_z1_HxW_internal_neon_64(8, 32, dstvec, left, upsample_left, dy);
  transpose_arrays_u8_8x16(dstvec, d);
  transpose_arrays_u8_8x16(dstvec + 16, d + 8);
  for (int i = 0; i < 8; i++) {
    vst1q_u8(dst + i * stride, d[i]);
    vst1q_u8(dst + i * stride + 16, d[i + 8]);
  }
}
   2266 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   2267 
// Zone-3 16x16: zone-1 kernel on the left edge, then a 16x16 transpose.
static void dr_prediction_z3_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  uint8x16_t dstvec[16];
  uint8x16_t d[16];

  dr_prediction_z1_HxW_internal_neon(16, 16, dstvec, left, upsample_left, dy);
  transpose_arrays_u8_16x16(dstvec, d);
  for (int i = 0; i < 16; i++) {
    vst1q_u8(dst + i * stride, d[i]);
  }
}
   2280 
// Zone-3 32x32: zone-1 32-wide kernel, then transpose in two 32x16 halves;
// each output row is stored as two 16-byte writes.
static void dr_prediction_z3_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  // Unused: the 32-wide zone-1 kernel takes no upsample flag.
  (void)upsample_left;
  uint8x16x2_t dstvec[32];
  uint8x16_t d[64];

  dr_prediction_z1_32xN_internal_neon(32, dstvec, left, dy);
  transpose_arrays_u8_32x16(dstvec, d);
  transpose_arrays_u8_32x16(dstvec + 16, d + 32);
  for (int i = 0; i < 32; i++) {
    vst1q_u8(dst + i * stride, d[i]);
    vst1q_u8(dst + i * stride + 16, d[i + 32]);
  }
}
   2296 
// Zone-3 64x64: too large to transpose in registers, so run the zone-1
// kernel into a temporary buffer and transpose tile-by-tile through memory.
static void dr_prediction_z3_64x64_neon(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  // Unused: the 64-wide zone-1 kernel takes no upsample flag.
  (void)upsample_left;
  DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]);

  dr_prediction_z1_64xN_neon(64, dstT, 64, left, dy);
  z3_transpose_arrays_u8_16nx16n(dstT, 64, dst, stride, 64, 64);
}
   2306 
// Zone-3 16x32: zone-1 32-wide kernel produces the transposed 32x16 block;
// transpose and store 32 rows of 16 bytes.
static void dr_prediction_z3_16x32_neon(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  // Unused: the 32-wide zone-1 kernel takes no upsample flag.
  (void)upsample_left;
  uint8x16x2_t dstvec[16];
  uint8x16_t d[32];

  dr_prediction_z1_32xN_internal_neon(16, dstvec, left, dy);
  transpose_arrays_u8_32x16(dstvec, d);
  for (int i = 0; i < 16; i++) {
    vst1q_u8(dst + 2 * i * stride, d[2 * i + 0]);
    vst1q_u8(dst + (2 * i + 1) * stride, d[2 * i + 1]);
  }
}
   2321 
// Zone-3 32x16: zone-1 kernel produces the transposed 16x32 block; process
// as two 16x16 tiles, each transposed and stored at its x offset.
static void dr_prediction_z3_32x16_neon(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  uint8x16_t dstvec[32];

  dr_prediction_z1_HxW_internal_neon(16, 32, dstvec, left, upsample_left, dy);
  for (int i = 0; i < 32; i += 16) {
    uint8x16_t d[16];
    transpose_arrays_u8_16x16(dstvec + i, d);
    for (int j = 0; j < 16; j++) {
      vst1q_u8(dst + j * stride + i, d[j]);
    }
  }
}
   2336 
// Zone-3 32x64: zone-1 kernel into a 64x32 temporary, then a tiled
// transpose through memory into dst.
static void dr_prediction_z3_32x64_neon(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  // Unused: the 64-wide zone-1 kernel takes no upsample flag.
  (void)upsample_left;
  uint8_t dstT[64 * 32];

  dr_prediction_z1_64xN_neon(32, dstT, 64, left, dy);
  z3_transpose_arrays_u8_16nx16n(dstT, 64, dst, stride, 32, 64);
}
   2346 
// Zone-3 64x32: zone-1 kernel into a 32x64 temporary, then a tiled
// transpose through memory into dst.
static void dr_prediction_z3_64x32_neon(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  // Unused: the 32-wide zone-1 kernel takes no upsample flag.
  (void)upsample_left;
  uint8_t dstT[32 * 64];

  dr_prediction_z1_32xN_neon(64, dstT, 32, left, dy);
  z3_transpose_arrays_u8_16nx16n(dstT, 32, dst, stride, 64, 32);
}
   2356 
   2357 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// Zone-3 16x64: zone-1 kernel into a 64x16 temporary, then a tiled
// transpose through memory into dst.
static void dr_prediction_z3_16x64_neon(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  // Unused: the 64-wide zone-1 kernel takes no upsample flag.
  (void)upsample_left;
  uint8_t dstT[64 * 16];

  dr_prediction_z1_64xN_neon(16, dstT, 64, left, dy);
  z3_transpose_arrays_u8_16nx16n(dstT, 64, dst, stride, 16, 64);
}
   2367 
// Zone-3 64x16: zone-1 kernel produces the transposed 16x64 block; process
// as four 16x16 tiles, each transposed and stored at its x offset.
static void dr_prediction_z3_64x16_neon(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  uint8x16_t dstvec[64];

  dr_prediction_z1_HxW_internal_neon(16, 64, dstvec, left, upsample_left, dy);
  for (int i = 0; i < 64; i += 16) {
    uint8x16_t d[16];
    transpose_arrays_u8_16x16(dstvec + i, d);
    for (int j = 0; j < 16; ++j) {
      vst1q_u8(dst + j * stride + i, d[j]);
    }
  }
}
   2382 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   2383 
// Per-block-size zone-3 kernel: all kernels read only the left edge.
typedef void (*dr_prediction_z3_fn)(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *left, int upsample_left,
                                    int dy);

#if CONFIG_REALTIME_ONLY && !CONFIG_AV1_DECODER
// Kernel table indexed by [log2(bw)][log2(bh)] (see av1_dr_prediction_z3_neon).
// Realtime-only encoder builds omit the extended-partition block shapes, so
// those entries are NULL here.
static const dr_prediction_z3_fn dr_prediction_z3_arr[7][7] = {
  { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
  { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
  { NULL, NULL, dr_prediction_z3_4x4_neon, dr_prediction_z3_4x8_neon, NULL,
    NULL, NULL },
  { NULL, NULL, dr_prediction_z3_8x4_neon, dr_prediction_z3_8x8_neon,
    dr_prediction_z3_8x16_neon, NULL, NULL },
  { NULL, NULL, NULL, dr_prediction_z3_16x8_neon, dr_prediction_z3_16x16_neon,
    dr_prediction_z3_16x32_neon, NULL },
  { NULL, NULL, NULL, NULL, dr_prediction_z3_32x16_neon,
    dr_prediction_z3_32x32_neon, dr_prediction_z3_32x64_neon },
  { NULL, NULL, NULL, NULL, NULL, dr_prediction_z3_64x32_neon,
    dr_prediction_z3_64x64_neon },
};
#else
// Kernel table indexed by [log2(bw)][log2(bh)], covering all block shapes.
static const dr_prediction_z3_fn dr_prediction_z3_arr[7][7] = {
  { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
  { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
  { NULL, NULL, dr_prediction_z3_4x4_neon, dr_prediction_z3_4x8_neon,
    dr_prediction_z3_4x16_neon, NULL, NULL },
  { NULL, NULL, dr_prediction_z3_8x4_neon, dr_prediction_z3_8x8_neon,
    dr_prediction_z3_8x16_neon, dr_prediction_z3_8x32_neon, NULL },
  { NULL, NULL, dr_prediction_z3_16x4_neon, dr_prediction_z3_16x8_neon,
    dr_prediction_z3_16x16_neon, dr_prediction_z3_16x32_neon,
    dr_prediction_z3_16x64_neon },
  { NULL, NULL, NULL, dr_prediction_z3_32x8_neon, dr_prediction_z3_32x16_neon,
    dr_prediction_z3_32x32_neon, dr_prediction_z3_32x64_neon },
  { NULL, NULL, NULL, NULL, dr_prediction_z3_64x16_neon,
    dr_prediction_z3_64x32_neon, dr_prediction_z3_64x64_neon },
};
#endif  // CONFIG_REALTIME_ONLY && !CONFIG_AV1_DECODER
   2420 
   2421 void av1_dr_prediction_z3_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
   2422                               const uint8_t *above, const uint8_t *left,
   2423                               int upsample_left, int dx, int dy) {
   2424  (void)above;
   2425  (void)dx;
   2426  assert(dx == 1);
   2427  assert(dy > 0);
   2428 
   2429  dr_prediction_z3_fn f = dr_prediction_z3_arr[get_msb(bw)][get_msb(bh)];
   2430  assert(f != NULL);
   2431  f(dst, stride, left, upsample_left, dy);
   2432 }
   2433 
   2434 // -----------------------------------------------------------------------------
   2435 // SMOOTH_PRED
   2436 
   2437 // 256 - v = vneg_s8(v)
   2438 static inline uint8x8_t negate_s8(const uint8x8_t v) {
   2439  return vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(v)));
   2440 }
   2441 
   2442 static void smooth_4xh_neon(uint8_t *dst, ptrdiff_t stride,
   2443                            const uint8_t *const top_row,
   2444                            const uint8_t *const left_column,
   2445                            const int height) {
   2446  const uint8_t top_right = top_row[3];
   2447  const uint8_t bottom_left = left_column[height - 1];
   2448  const uint8_t *const weights_y = smooth_weights + height - 4;
   2449 
   2450  uint8x8_t top_v = load_u8_4x1(top_row);
   2451  const uint8x8_t top_right_v = vdup_n_u8(top_right);
   2452  const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
   2453  uint8x8_t weights_x_v = load_u8_4x1(smooth_weights);
   2454  const uint8x8_t scaled_weights_x = negate_s8(weights_x_v);
   2455  const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
   2456 
   2457  assert(height > 0);
   2458  int y = 0;
   2459  do {
   2460    const uint8x8_t left_v = vdup_n_u8(left_column[y]);
   2461    const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
   2462    const uint8x8_t scaled_weights_y = negate_s8(weights_y_v);
   2463    const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
   2464    const uint16x8_t weighted_top_bl =
   2465        vmlal_u8(weighted_bl, weights_y_v, top_v);
   2466    const uint16x8_t weighted_left_tr =
   2467        vmlal_u8(weighted_tr, weights_x_v, left_v);
   2468    // Maximum value of each parameter: 0xFF00
   2469    const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr);
   2470    const uint8x8_t result = vrshrn_n_u16(avg, SMOOTH_WEIGHT_LOG2_SCALE);
   2471 
   2472    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(result), 0);
   2473    dst += stride;
   2474  } while (++y != height);
   2475 }
   2476 
   2477 static inline uint8x8_t calculate_pred(const uint16x8_t weighted_top_bl,
   2478                                       const uint16x8_t weighted_left_tr) {
   2479  // Maximum value of each parameter: 0xFF00
   2480  const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr);
   2481  return vrshrn_n_u16(avg, SMOOTH_WEIGHT_LOG2_SCALE);
   2482 }
   2483 
// Compute one 8-pixel row of the SMOOTH predictor.  `weighted_tr` already
// holds (256 - wx) * top_right and `scaled_weights_y` is 256 - wy, so this
// only adds the per-row terms before combining in calculate_pred.
static inline uint8x8_t calculate_weights_and_pred(
    const uint8x8_t top, const uint8x8_t left, const uint16x8_t weighted_tr,
    const uint8x8_t bottom_left, const uint8x8_t weights_x,
    const uint8x8_t scaled_weights_y, const uint8x8_t weights_y) {
  const uint16x8_t weighted_top = vmull_u8(weights_y, top);
  const uint16x8_t weighted_top_bl =
      vmlal_u8(weighted_top, scaled_weights_y, bottom_left);
  const uint16x8_t weighted_left_tr = vmlal_u8(weighted_tr, weights_x, left);
  return calculate_pred(weighted_top_bl, weighted_left_tr);
}
   2494 
// SMOOTH prediction for 8-wide blocks of the given height.  Each output
// pixel blends the top row and left column with the top-right and
// bottom-left corner pixels, weighted by the smooth_weights table.
static void smooth_8xh_neon(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *const top_row,
                            const uint8_t *const left_column,
                            const int height) {
  const uint8_t top_right = top_row[7];
  const uint8_t bottom_left = left_column[height - 1];
  // y-axis weights for this block height (table indexed by size - 4).
  const uint8_t *const weights_y = smooth_weights + height - 4;

  const uint8x8_t top_v = vld1_u8(top_row);
  const uint8x8_t top_right_v = vdup_n_u8(top_right);
  const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
  const uint8x8_t weights_x_v = vld1_u8(smooth_weights + 4);
  const uint8x8_t scaled_weights_x = negate_s8(weights_x_v);
  // (256 - wx) * top_right is invariant across rows; hoist it.
  const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);

  assert(height > 0);
  int y = 0;
  do {
    const uint8x8_t left_v = vdup_n_u8(left_column[y]);
    const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
    const uint8x8_t scaled_weights_y = negate_s8(weights_y_v);
    const uint8x8_t result =
        calculate_weights_and_pred(top_v, left_v, weighted_tr, bottom_left_v,
                                   weights_x_v, scaled_weights_y, weights_y_v);

    vst1_u8(dst, result);
    dst += stride;
  } while (++y != height);
}
   2524 
// Public entry points for the 4- and 8-wide SMOOTH predictors: each simply
// forwards to the height-parameterized helper above.
#define SMOOTH_NXM(W, H)                                                       \
  void aom_smooth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t y_stride, \
                                             const uint8_t *above,             \
                                             const uint8_t *left) {            \
    smooth_##W##xh_neon(dst, y_stride, above, left, H);                        \
  }

SMOOTH_NXM(4, 4)
SMOOTH_NXM(4, 8)
SMOOTH_NXM(8, 4)
SMOOTH_NXM(8, 8)
SMOOTH_NXM(8, 16)
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
SMOOTH_NXM(4, 16)
SMOOTH_NXM(8, 32)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#undef SMOOTH_NXM
   2543 
// Compute one 16-pixel row of the SMOOTH predictor, processed as two
// independent 8-pixel halves.  `weighted_bl` holds (256 - wy) * bottom_left
// and is shared by both halves since it does not depend on x.
static inline uint8x16_t calculate_weights_and_predq(
    const uint8x16_t top, const uint8x8_t left, const uint8x8_t top_right,
    const uint8x8_t weights_y, const uint8x16_t weights_x,
    const uint8x16_t scaled_weights_x, const uint16x8_t weighted_bl) {
  const uint16x8_t weighted_top_bl_low =
      vmlal_u8(weighted_bl, weights_y, vget_low_u8(top));
  const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left);
  const uint16x8_t weighted_left_tr_low =
      vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right);
  const uint8x8_t result_low =
      calculate_pred(weighted_top_bl_low, weighted_left_tr_low);

  const uint16x8_t weighted_top_bl_high =
      vmlal_u8(weighted_bl, weights_y, vget_high_u8(top));
  const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left);
  const uint16x8_t weighted_left_tr_high =
      vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right);
  const uint8x8_t result_high =
      calculate_pred(weighted_top_bl_high, weighted_left_tr_high);

  return vcombine_u8(result_low, result_high);
}
   2566 
   2567 // 256 - v = vneg_s8(v)
   2568 static inline uint8x16_t negate_s8q(const uint8x16_t v) {
   2569  return vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(v)));
   2570 }
   2571 
// For width 16 and above.  Generates smooth_{16,32,64}xh_neon: the top row
// and x-axis weights (and their 256-w complements) are loaded once in
// 16-byte vectors, then each row computes the shared (256 - wy) *
// bottom_left term and emits the row 16 pixels at a time.
#define SMOOTH_PREDICTOR(W)                                                 \
  static void smooth_##W##xh_neon(                                          \
      uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row,         \
      const uint8_t *const left_column, const int height) {                 \
    const uint8_t top_right = top_row[(W) - 1];                             \
    const uint8_t bottom_left = left_column[height - 1];                    \
    const uint8_t *const weights_y = smooth_weights + height - 4;           \
                                                                            \
    uint8x16_t top_v[4];                                                    \
    top_v[0] = vld1q_u8(top_row);                                           \
    if ((W) > 16) {                                                         \
      top_v[1] = vld1q_u8(top_row + 16);                                    \
      if ((W) == 64) {                                                      \
        top_v[2] = vld1q_u8(top_row + 32);                                  \
        top_v[3] = vld1q_u8(top_row + 48);                                  \
      }                                                                     \
    }                                                                       \
                                                                            \
    const uint8x8_t top_right_v = vdup_n_u8(top_right);                     \
    const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);                 \
                                                                            \
    uint8x16_t weights_x_v[4];                                              \
    weights_x_v[0] = vld1q_u8(smooth_weights + (W) - 4);                    \
    if ((W) > 16) {                                                         \
      weights_x_v[1] = vld1q_u8(smooth_weights + (W) + 16 - 4);             \
      if ((W) == 64) {                                                      \
        weights_x_v[2] = vld1q_u8(smooth_weights + (W) + 32 - 4);           \
        weights_x_v[3] = vld1q_u8(smooth_weights + (W) + 48 - 4);           \
      }                                                                     \
    }                                                                       \
                                                                            \
    uint8x16_t scaled_weights_x[4];                                         \
    scaled_weights_x[0] = negate_s8q(weights_x_v[0]);                       \
    if ((W) > 16) {                                                         \
      scaled_weights_x[1] = negate_s8q(weights_x_v[1]);                     \
      if ((W) == 64) {                                                      \
        scaled_weights_x[2] = negate_s8q(weights_x_v[2]);                   \
        scaled_weights_x[3] = negate_s8q(weights_x_v[3]);                   \
      }                                                                     \
    }                                                                       \
                                                                            \
    for (int y = 0; y < height; ++y) {                                      \
      const uint8x8_t left_v = vdup_n_u8(left_column[y]);                   \
      const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);                \
      const uint8x8_t scaled_weights_y = negate_s8(weights_y_v);            \
      const uint16x8_t weighted_bl =                                        \
          vmull_u8(scaled_weights_y, bottom_left_v);                        \
                                                                            \
      vst1q_u8(dst, calculate_weights_and_predq(                            \
                        top_v[0], left_v, top_right_v, weights_y_v,         \
                        weights_x_v[0], scaled_weights_x[0], weighted_bl)); \
                                                                            \
      if ((W) > 16) {                                                       \
        vst1q_u8(dst + 16,                                                  \
                 calculate_weights_and_predq(                               \
                     top_v[1], left_v, top_right_v, weights_y_v,            \
                     weights_x_v[1], scaled_weights_x[1], weighted_bl));    \
        if ((W) == 64) {                                                    \
          vst1q_u8(dst + 32,                                                \
                   calculate_weights_and_predq(                             \
                       top_v[2], left_v, top_right_v, weights_y_v,          \
                       weights_x_v[2], scaled_weights_x[2], weighted_bl));  \
          vst1q_u8(dst + 48,                                                \
                   calculate_weights_and_predq(                             \
                       top_v[3], left_v, top_right_v, weights_y_v,          \
                       weights_x_v[3], scaled_weights_x[3], weighted_bl));  \
        }                                                                   \
      }                                                                     \
                                                                            \
      dst += stride;                                                        \
    }                                                                       \
  }

SMOOTH_PREDICTOR(16)
SMOOTH_PREDICTOR(32)
SMOOTH_PREDICTOR(64)

#undef SMOOTH_PREDICTOR
   2651 
// Public entry points for the 16-and-wider SMOOTH predictors: each simply
// forwards to the height-parameterized helper generated above.
#define SMOOTH_NXM_WIDE(W, H)                                                  \
  void aom_smooth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t y_stride, \
                                             const uint8_t *above,             \
                                             const uint8_t *left) {            \
    smooth_##W##xh_neon(dst, y_stride, above, left, H);                        \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
SMOOTH_NXM_WIDE(16, 4)
SMOOTH_NXM_WIDE(16, 64)
SMOOTH_NXM_WIDE(32, 8)
SMOOTH_NXM_WIDE(64, 16)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
SMOOTH_NXM_WIDE(16, 8)
SMOOTH_NXM_WIDE(16, 16)
SMOOTH_NXM_WIDE(16, 32)
SMOOTH_NXM_WIDE(32, 16)
SMOOTH_NXM_WIDE(32, 32)
SMOOTH_NXM_WIDE(32, 64)
SMOOTH_NXM_WIDE(64, 32)
SMOOTH_NXM_WIDE(64, 64)

#undef SMOOTH_NXM_WIDE
   2675 
   2676 // -----------------------------------------------------------------------------
   2677 // SMOOTH_V_PRED
   2678 
// For widths 4 and 8.
// Vertical smooth predictor: every output row is a weighted blend of the
// top row and the block's bottom-left pixel. The per-row weight comes from
// smooth_weights[]; negate_s8() presumably yields the complementary weight
// so the two contributions sum to a full scale (NOTE(review): confirm
// against the helper's definition).
#define SMOOTH_V_PREDICTOR(W)                                         \
  static void smooth_v_##W##xh_neon(                                  \
      uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row,   \
      const uint8_t *const left_column, const int height) {           \
    const uint8_t bottom_left = left_column[height - 1];              \
    const uint8_t *const weights_y = smooth_weights + height - 4;     \
                                                                      \
    uint8x8_t top_v;                                                  \
    if ((W) == 4) {                                                   \
      top_v = load_u8_4x1(top_row);                                   \
    } else { /* width == 8 */                                         \
      top_v = vld1_u8(top_row);                                       \
    }                                                                 \
                                                                      \
    const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);           \
                                                                      \
    assert(height > 0);                                               \
    int y = 0;                                                        \
    do {                                                              \
      const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);          \
      const uint8x8_t scaled_weights_y = negate_s8(weights_y_v);      \
                                                                      \
      const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v);   \
      const uint16x8_t weighted_top_bl =                              \
          vmlal_u8(weighted_top, scaled_weights_y, bottom_left_v);    \
      const uint8x8_t pred =                                          \
          vrshrn_n_u16(weighted_top_bl, SMOOTH_WEIGHT_LOG2_SCALE);    \
                                                                      \
      if ((W) == 4) {                                                 \
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(pred), 0); \
      } else { /* width == 8 */                                       \
        vst1_u8(dst, pred);                                           \
      }                                                               \
      dst += stride;                                                  \
    } while (++y != height);                                          \
  }

SMOOTH_V_PREDICTOR(4)
SMOOTH_V_PREDICTOR(8)

#undef SMOOTH_V_PREDICTOR
   2721 
// Defines the public aom_smooth_v_predictor_WxH_neon() entry points for
// W = 4 and 8.
#define SMOOTH_V_NXM(W, H)                                    \
  void aom_smooth_v_predictor_##W##x##H##_neon(               \
      uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
      const uint8_t *left) {                                  \
    smooth_v_##W##xh_neon(dst, y_stride, above, left, H);     \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
SMOOTH_V_NXM(4, 16)
SMOOTH_V_NXM(8, 32)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
SMOOTH_V_NXM(4, 4)
SMOOTH_V_NXM(4, 8)
SMOOTH_V_NXM(8, 4)
SMOOTH_V_NXM(8, 8)
SMOOTH_V_NXM(8, 16)

#undef SMOOTH_V_NXM
   2740 
   2741 static inline uint8x16_t calculate_vertical_weights_and_pred(
   2742    const uint8x16_t top, const uint8x8_t weights_y,
   2743    const uint16x8_t weighted_bl) {
   2744  const uint16x8_t pred_low =
   2745      vmlal_u8(weighted_bl, weights_y, vget_low_u8(top));
   2746  const uint16x8_t pred_high =
   2747      vmlal_u8(weighted_bl, weights_y, vget_high_u8(top));
   2748  const uint8x8_t pred_scaled_low =
   2749      vrshrn_n_u16(pred_low, SMOOTH_WEIGHT_LOG2_SCALE);
   2750  const uint8x8_t pred_scaled_high =
   2751      vrshrn_n_u16(pred_high, SMOOTH_WEIGHT_LOG2_SCALE);
   2752  return vcombine_u8(pred_scaled_low, pred_scaled_high);
   2753 }
   2754 
// For width 16 and above.
// Loads the whole top row once before the loop; each output row then blends
// the cached top vectors with the bottom-left pixel in 16-pixel chunks via
// calculate_vertical_weights_and_pred().
#define SMOOTH_V_PREDICTOR(W)                                            \
  static void smooth_v_##W##xh_neon(                                     \
      uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row,      \
      const uint8_t *const left_column, const int height) {              \
    const uint8_t bottom_left = left_column[height - 1];                 \
    const uint8_t *const weights_y = smooth_weights + height - 4;        \
                                                                         \
    uint8x16_t top_v[4];                                                 \
    top_v[0] = vld1q_u8(top_row);                                        \
    if ((W) > 16) {                                                      \
      top_v[1] = vld1q_u8(top_row + 16);                                 \
      if ((W) == 64) {                                                   \
        top_v[2] = vld1q_u8(top_row + 32);                               \
        top_v[3] = vld1q_u8(top_row + 48);                               \
      }                                                                  \
    }                                                                    \
                                                                         \
    const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);              \
                                                                         \
    assert(height > 0);                                                  \
    int y = 0;                                                           \
    do {                                                                 \
      const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);             \
      const uint8x8_t scaled_weights_y = negate_s8(weights_y_v);         \
      const uint16x8_t weighted_bl =                                     \
          vmull_u8(scaled_weights_y, bottom_left_v);                     \
                                                                         \
      const uint8x16_t pred_0 = calculate_vertical_weights_and_pred(     \
          top_v[0], weights_y_v, weighted_bl);                           \
      vst1q_u8(dst, pred_0);                                             \
                                                                         \
      if ((W) > 16) {                                                    \
        const uint8x16_t pred_1 = calculate_vertical_weights_and_pred(   \
            top_v[1], weights_y_v, weighted_bl);                         \
        vst1q_u8(dst + 16, pred_1);                                      \
                                                                         \
        if ((W) == 64) {                                                 \
          const uint8x16_t pred_2 = calculate_vertical_weights_and_pred( \
              top_v[2], weights_y_v, weighted_bl);                       \
          vst1q_u8(dst + 32, pred_2);                                    \
                                                                         \
          const uint8x16_t pred_3 = calculate_vertical_weights_and_pred( \
              top_v[3], weights_y_v, weighted_bl);                       \
          vst1q_u8(dst + 48, pred_3);                                    \
        }                                                                \
      }                                                                  \
                                                                         \
      dst += stride;                                                     \
    } while (++y != height);                                             \
  }

SMOOTH_V_PREDICTOR(16)
SMOOTH_V_PREDICTOR(32)
SMOOTH_V_PREDICTOR(64)

#undef SMOOTH_V_PREDICTOR
   2812 
// Defines the public aom_smooth_v_predictor_WxH_neon() entry points for
// W >= 16.
#define SMOOTH_V_NXM_WIDE(W, H)                               \
  void aom_smooth_v_predictor_##W##x##H##_neon(               \
      uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
      const uint8_t *left) {                                  \
    smooth_v_##W##xh_neon(dst, y_stride, above, left, H);     \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
SMOOTH_V_NXM_WIDE(16, 4)
SMOOTH_V_NXM_WIDE(32, 8)
SMOOTH_V_NXM_WIDE(64, 16)
SMOOTH_V_NXM_WIDE(16, 64)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
SMOOTH_V_NXM_WIDE(16, 8)
SMOOTH_V_NXM_WIDE(16, 16)
SMOOTH_V_NXM_WIDE(16, 32)
SMOOTH_V_NXM_WIDE(32, 16)
SMOOTH_V_NXM_WIDE(32, 32)
SMOOTH_V_NXM_WIDE(32, 64)
SMOOTH_V_NXM_WIDE(64, 32)
SMOOTH_V_NXM_WIDE(64, 64)

#undef SMOOTH_V_NXM_WIDE
   2836 
   2837 // -----------------------------------------------------------------------------
   2838 // SMOOTH_H_PRED
   2839 
// For widths 4 and 8.
// Horizontal smooth predictor: every output pixel is a weighted blend of
// the row's left-column pixel and the block's top-right pixel, with the
// per-column weight taken from smooth_weights[]. The top-right term is
// constant across rows and is hoisted out of the loop.
#define SMOOTH_H_PREDICTOR(W)                                               \
  static void smooth_h_##W##xh_neon(                                        \
      uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row,         \
      const uint8_t *const left_column, const int height) {                 \
    const uint8_t top_right = top_row[(W) - 1];                             \
                                                                            \
    const uint8x8_t top_right_v = vdup_n_u8(top_right);                     \
    /* Over-reads for 4xN but still within the array. */                    \
    const uint8x8_t weights_x = vld1_u8(smooth_weights + (W) - 4);          \
    const uint8x8_t scaled_weights_x = negate_s8(weights_x);                \
    const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v); \
                                                                            \
    assert(height > 0);                                                     \
    int y = 0;                                                              \
    do {                                                                    \
      const uint8x8_t left_v = vdup_n_u8(left_column[y]);                   \
      const uint16x8_t weighted_left_tr =                                   \
          vmlal_u8(weighted_tr, weights_x, left_v);                         \
      const uint8x8_t pred =                                                \
          vrshrn_n_u16(weighted_left_tr, SMOOTH_WEIGHT_LOG2_SCALE);         \
                                                                            \
      if ((W) == 4) {                                                       \
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(pred), 0);       \
      } else { /* width == 8 */                                             \
        vst1_u8(dst, pred);                                                 \
      }                                                                     \
      dst += stride;                                                        \
    } while (++y != height);                                                \
  }

SMOOTH_H_PREDICTOR(4)
SMOOTH_H_PREDICTOR(8)

#undef SMOOTH_H_PREDICTOR
   2875 
// Defines the public aom_smooth_h_predictor_WxH_neon() entry points for
// W = 4 and 8.
#define SMOOTH_H_NXM(W, H)                                    \
  void aom_smooth_h_predictor_##W##x##H##_neon(               \
      uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
      const uint8_t *left) {                                  \
    smooth_h_##W##xh_neon(dst, y_stride, above, left, H);     \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
SMOOTH_H_NXM(4, 16)
SMOOTH_H_NXM(8, 32)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
SMOOTH_H_NXM(4, 4)
SMOOTH_H_NXM(4, 8)
SMOOTH_H_NXM(8, 4)
SMOOTH_H_NXM(8, 8)
SMOOTH_H_NXM(8, 16)

#undef SMOOTH_H_NXM
   2894 
   2895 static inline uint8x16_t calculate_horizontal_weights_and_pred(
   2896    const uint8x8_t left, const uint8x8_t top_right, const uint8x16_t weights_x,
   2897    const uint8x16_t scaled_weights_x) {
   2898  const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left);
   2899  const uint16x8_t weighted_left_tr_low =
   2900      vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right);
   2901  const uint8x8_t pred_scaled_low =
   2902      vrshrn_n_u16(weighted_left_tr_low, SMOOTH_WEIGHT_LOG2_SCALE);
   2903 
   2904  const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left);
   2905  const uint16x8_t weighted_left_tr_high =
   2906      vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right);
   2907  const uint8x8_t pred_scaled_high =
   2908      vrshrn_n_u16(weighted_left_tr_high, SMOOTH_WEIGHT_LOG2_SCALE);
   2909 
   2910  return vcombine_u8(pred_scaled_low, pred_scaled_high);
   2911 }
   2912 
// For width 16 and above.
// The per-column weight vectors and their negated complements depend only
// on W, so both are loaded/derived once before the row loop.
#define SMOOTH_H_PREDICTOR(W)                                              \
  static void smooth_h_##W##xh_neon(                                       \
      uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row,        \
      const uint8_t *const left_column, const int height) {                \
    const uint8_t top_right = top_row[(W) - 1];                            \
                                                                           \
    const uint8x8_t top_right_v = vdup_n_u8(top_right);                    \
                                                                           \
    uint8x16_t weights_x[4];                                               \
    weights_x[0] = vld1q_u8(smooth_weights + (W) - 4);                     \
    if ((W) > 16) {                                                        \
      weights_x[1] = vld1q_u8(smooth_weights + (W) + 16 - 4);              \
      if ((W) == 64) {                                                     \
        weights_x[2] = vld1q_u8(smooth_weights + (W) + 32 - 4);            \
        weights_x[3] = vld1q_u8(smooth_weights + (W) + 48 - 4);            \
      }                                                                    \
    }                                                                      \
                                                                           \
    uint8x16_t scaled_weights_x[4];                                        \
    scaled_weights_x[0] = negate_s8q(weights_x[0]);                        \
    if ((W) > 16) {                                                        \
      scaled_weights_x[1] = negate_s8q(weights_x[1]);                      \
      if ((W) == 64) {                                                     \
        scaled_weights_x[2] = negate_s8q(weights_x[2]);                    \
        scaled_weights_x[3] = negate_s8q(weights_x[3]);                    \
      }                                                                    \
    }                                                                      \
                                                                           \
    assert(height > 0);                                                    \
    int y = 0;                                                             \
    do {                                                                   \
      const uint8x8_t left_v = vdup_n_u8(left_column[y]);                  \
                                                                           \
      const uint8x16_t pred_0 = calculate_horizontal_weights_and_pred(     \
          left_v, top_right_v, weights_x[0], scaled_weights_x[0]);         \
      vst1q_u8(dst, pred_0);                                               \
                                                                           \
      if ((W) > 16) {                                                      \
        const uint8x16_t pred_1 = calculate_horizontal_weights_and_pred(   \
            left_v, top_right_v, weights_x[1], scaled_weights_x[1]);       \
        vst1q_u8(dst + 16, pred_1);                                        \
                                                                           \
        if ((W) == 64) {                                                   \
          const uint8x16_t pred_2 = calculate_horizontal_weights_and_pred( \
              left_v, top_right_v, weights_x[2], scaled_weights_x[2]);     \
          vst1q_u8(dst + 32, pred_2);                                      \
                                                                           \
          const uint8x16_t pred_3 = calculate_horizontal_weights_and_pred( \
              left_v, top_right_v, weights_x[3], scaled_weights_x[3]);     \
          vst1q_u8(dst + 48, pred_3);                                      \
        }                                                                  \
      }                                                                    \
      dst += stride;                                                       \
    } while (++y != height);                                               \
  }

SMOOTH_H_PREDICTOR(16)
SMOOTH_H_PREDICTOR(32)
SMOOTH_H_PREDICTOR(64)

#undef SMOOTH_H_PREDICTOR
   2975 
// Defines the public aom_smooth_h_predictor_WxH_neon() entry points for
// W >= 16.
#define SMOOTH_H_NXM_WIDE(W, H)                               \
  void aom_smooth_h_predictor_##W##x##H##_neon(               \
      uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
      const uint8_t *left) {                                  \
    smooth_h_##W##xh_neon(dst, y_stride, above, left, H);     \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
SMOOTH_H_NXM_WIDE(16, 4)
SMOOTH_H_NXM_WIDE(16, 64)
SMOOTH_H_NXM_WIDE(32, 8)
SMOOTH_H_NXM_WIDE(64, 16)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
SMOOTH_H_NXM_WIDE(16, 8)
SMOOTH_H_NXM_WIDE(16, 16)
SMOOTH_H_NXM_WIDE(16, 32)
SMOOTH_H_NXM_WIDE(32, 16)
SMOOTH_H_NXM_WIDE(32, 32)
SMOOTH_H_NXM_WIDE(32, 64)
SMOOTH_H_NXM_WIDE(64, 32)
SMOOTH_H_NXM_WIDE(64, 64)

#undef SMOOTH_H_NXM_WIDE
   2999 
   3000 // -----------------------------------------------------------------------------
   3001 // PAETH
   3002 
   3003 static inline void paeth_4or8_x_h_neon(uint8_t *dest, ptrdiff_t stride,
   3004                                       const uint8_t *const top_row,
   3005                                       const uint8_t *const left_column,
   3006                                       int width, int height) {
   3007  const uint8x8_t top_left = vdup_n_u8(top_row[-1]);
   3008  const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
   3009  uint8x8_t top;
   3010  if (width == 4) {
   3011    top = load_u8_4x1(top_row);
   3012  } else {  // width == 8
   3013    top = vld1_u8(top_row);
   3014  }
   3015 
   3016  assert(height > 0);
   3017  int y = 0;
   3018  do {
   3019    const uint8x8_t left = vdup_n_u8(left_column[y]);
   3020 
   3021    const uint8x8_t left_dist = vabd_u8(top, top_left);
   3022    const uint8x8_t top_dist = vabd_u8(left, top_left);
   3023    const uint16x8_t top_left_dist =
   3024        vabdq_u16(vaddl_u8(top, left), top_left_x2);
   3025 
   3026    const uint8x8_t left_le_top = vcle_u8(left_dist, top_dist);
   3027    const uint8x8_t left_le_top_left =
   3028        vmovn_u16(vcleq_u16(vmovl_u8(left_dist), top_left_dist));
   3029    const uint8x8_t top_le_top_left =
   3030        vmovn_u16(vcleq_u16(vmovl_u8(top_dist), top_left_dist));
   3031 
   3032    // if (left_dist <= top_dist && left_dist <= top_left_dist)
   3033    const uint8x8_t left_mask = vand_u8(left_le_top, left_le_top_left);
   3034    //   dest[x] = left_column[y];
   3035    // Fill all the unused spaces with 'top'. They will be overwritten when
   3036    // the positions for top_left are known.
   3037    uint8x8_t result = vbsl_u8(left_mask, left, top);
   3038    // else if (top_dist <= top_left_dist)
   3039    //   dest[x] = top_row[x];
   3040    // Add these values to the mask. They were already set.
   3041    const uint8x8_t left_or_top_mask = vorr_u8(left_mask, top_le_top_left);
   3042    // else
   3043    //   dest[x] = top_left;
   3044    result = vbsl_u8(left_or_top_mask, result, top_left);
   3045 
   3046    if (width == 4) {
   3047      store_u8_4x1(dest, result);
   3048    } else {  // width == 8
   3049      vst1_u8(dest, result);
   3050    }
   3051    dest += stride;
   3052  } while (++y != height);
   3053 }
   3054 
// Defines the public aom_paeth_predictor_WxH_neon() entry points for the
// 4- and 8-wide block sizes.
#define PAETH_NXM(W, H)                                                     \
  void aom_paeth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t stride, \
                                            const uint8_t *above,           \
                                            const uint8_t *left) {          \
    paeth_4or8_x_h_neon(dst, stride, above, left, W, H);                    \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
PAETH_NXM(4, 16)
PAETH_NXM(8, 32)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
PAETH_NXM(4, 4)
PAETH_NXM(4, 8)
PAETH_NXM(8, 4)
PAETH_NXM(8, 8)
PAETH_NXM(8, 16)
   3071 
// Calculate X distance <= TopLeft distance and pack the resulting mask into
// uint8x16_t. The 16-bit top-left distances are saturated down to 8 bits
// (vqmovn) before comparing; clamping to 255 is safe because x_dist is an
// 8-bit value, so x_dist <= 255 gives the same answer as comparing against
// the unclamped distance.
static inline uint8x16_t x_le_top_left(const uint8x16_t x_dist,
                                       const uint16x8_t top_left_dist_low,
                                       const uint16x8_t top_left_dist_high) {
  const uint8x16_t top_left_dist = vcombine_u8(vqmovn_u16(top_left_dist_low),
                                               vqmovn_u16(top_left_dist_high));
  return vcleq_u8(x_dist, top_left_dist);
}
   3081 
   3082 // Select the closest values and collect them.
   3083 static inline uint8x16_t select_paeth(const uint8x16_t top,
   3084                                      const uint8x16_t left,
   3085                                      const uint8x16_t top_left,
   3086                                      const uint8x16_t left_le_top,
   3087                                      const uint8x16_t left_le_top_left,
   3088                                      const uint8x16_t top_le_top_left) {
   3089  // if (left_dist <= top_dist && left_dist <= top_left_dist)
   3090  const uint8x16_t left_mask = vandq_u8(left_le_top, left_le_top_left);
   3091  //   dest[x] = left_column[y];
   3092  // Fill all the unused spaces with 'top'. They will be overwritten when
   3093  // the positions for top_left are known.
   3094  uint8x16_t result = vbslq_u8(left_mask, left, top);
   3095  // else if (top_dist <= top_left_dist)
   3096  //   dest[x] = top_row[x];
   3097  // Add these values to the mask. They were already set.
   3098  const uint8x16_t left_or_top_mask = vorrq_u8(left_mask, top_le_top_left);
   3099  // else
   3100  //   dest[x] = top_left;
   3101  return vbslq_u8(left_or_top_mask, result, top_left);
   3102 }
   3103 
// Generate numbered and high/low versions of top_left_dist.
// top_left_NUM_dist_{low,high} hold |top[NUM] + left - 2 * top_left| as
// 16-bit lanes — the top-left candidate's distance from the Paeth gradient.
// Note these are statement macros: the invocation site supplies the ';'.
#define TOP_LEFT_DIST(num)                                              \
  const uint16x8_t top_left_##num##_dist_low = vabdq_u16(               \
      vaddl_u8(vget_low_u8(top[num]), vget_low_u8(left)), top_left_x2); \
  const uint16x8_t top_left_##num##_dist_high = vabdq_u16(              \
      vaddl_u8(vget_high_u8(top[num]), vget_low_u8(left)), top_left_x2)

// Generate numbered versions of XLeTopLeft with x = left.
#define LEFT_LE_TOP_LEFT(num)                                     \
  const uint8x16_t left_le_top_left_##num =                       \
      x_le_top_left(left_##num##_dist, top_left_##num##_dist_low, \
                    top_left_##num##_dist_high)

// Generate numbered versions of XLeTopLeft with x = top.
#define TOP_LE_TOP_LEFT(num)                              \
  const uint8x16_t top_le_top_left_##num = x_le_top_left( \
      top_dist, top_left_##num##_dist_low, top_left_##num##_dist_high)
   3121 
// Paeth prediction for blocks 16 pixels wide and above. The top row is
// loaded once into up to four 16-byte vectors; each output row recomputes
// the candidate distances per 16-pixel chunk (the *_DIST / *_LE_TOP_LEFT
// macros above declare the chunk-numbered locals) and selects per lane via
// select_paeth().
static inline void paeth16_plus_x_h_neon(uint8_t *dest, ptrdiff_t stride,
                                         const uint8_t *const top_row,
                                         const uint8_t *const left_column,
                                         int width, int height) {
  const uint8x16_t top_left = vdupq_n_u8(top_row[-1]);
  const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
  uint8x16_t top[4];
  top[0] = vld1q_u8(top_row);
  if (width > 16) {
    top[1] = vld1q_u8(top_row + 16);
    if (width == 64) {
      top[2] = vld1q_u8(top_row + 32);
      top[3] = vld1q_u8(top_row + 48);
    }
  }

  assert(height > 0);
  int y = 0;
  do {
    const uint8x16_t left = vdupq_n_u8(left_column[y]);

    // Distance of the 'top' candidate: |left - top_left|; constant across
    // all chunks of the row.
    const uint8x16_t top_dist = vabdq_u8(left, top_left);

    // Distance of the 'left' candidate for lanes 0-15: |top[0] - top_left|.
    const uint8x16_t left_0_dist = vabdq_u8(top[0], top_left);
    TOP_LEFT_DIST(0);
    const uint8x16_t left_0_le_top = vcleq_u8(left_0_dist, top_dist);
    LEFT_LE_TOP_LEFT(0);
    TOP_LE_TOP_LEFT(0);

    const uint8x16_t result_0 =
        select_paeth(top[0], left, top_left, left_0_le_top, left_le_top_left_0,
                     top_le_top_left_0);
    vst1q_u8(dest, result_0);

    if (width > 16) {
      const uint8x16_t left_1_dist = vabdq_u8(top[1], top_left);
      TOP_LEFT_DIST(1);
      const uint8x16_t left_1_le_top = vcleq_u8(left_1_dist, top_dist);
      LEFT_LE_TOP_LEFT(1);
      TOP_LE_TOP_LEFT(1);

      const uint8x16_t result_1 =
          select_paeth(top[1], left, top_left, left_1_le_top,
                       left_le_top_left_1, top_le_top_left_1);
      vst1q_u8(dest + 16, result_1);

      if (width == 64) {
        const uint8x16_t left_2_dist = vabdq_u8(top[2], top_left);
        TOP_LEFT_DIST(2);
        const uint8x16_t left_2_le_top = vcleq_u8(left_2_dist, top_dist);
        LEFT_LE_TOP_LEFT(2);
        TOP_LE_TOP_LEFT(2);

        const uint8x16_t result_2 =
            select_paeth(top[2], left, top_left, left_2_le_top,
                         left_le_top_left_2, top_le_top_left_2);
        vst1q_u8(dest + 32, result_2);

        const uint8x16_t left_3_dist = vabdq_u8(top[3], top_left);
        TOP_LEFT_DIST(3);
        const uint8x16_t left_3_le_top = vcleq_u8(left_3_dist, top_dist);
        LEFT_LE_TOP_LEFT(3);
        TOP_LE_TOP_LEFT(3);

        const uint8x16_t result_3 =
            select_paeth(top[3], left, top_left, left_3_le_top,
                         left_le_top_left_3, top_le_top_left_3);
        vst1q_u8(dest + 48, result_3);
      }
    }

    dest += stride;
  } while (++y != height);
}
   3196 
// Defines the public aom_paeth_predictor_WxH_neon() entry points for
// W >= 16.
#define PAETH_NXM_WIDE(W, H)                                                \
  void aom_paeth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t stride, \
                                            const uint8_t *above,           \
                                            const uint8_t *left) {          \
    paeth16_plus_x_h_neon(dst, stride, above, left, W, H);                  \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
PAETH_NXM_WIDE(16, 4)
PAETH_NXM_WIDE(16, 64)
PAETH_NXM_WIDE(32, 8)
PAETH_NXM_WIDE(64, 16)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
PAETH_NXM_WIDE(16, 8)
PAETH_NXM_WIDE(16, 16)
PAETH_NXM_WIDE(16, 32)
PAETH_NXM_WIDE(32, 16)
PAETH_NXM_WIDE(32, 32)
PAETH_NXM_WIDE(32, 64)
PAETH_NXM_WIDE(64, 32)
PAETH_NXM_WIDE(64, 64)