tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

highbd_intrapred_neon.c (123001B)


      1 /*
      2 * Copyright (c) 2022, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <arm_neon.h>
     13 
     14 #include "config/aom_config.h"
     15 #include "config/aom_dsp_rtcd.h"
     16 #include "config/av1_rtcd.h"
     17 
     18 #include "aom/aom_integer.h"
     19 #include "aom_dsp/arm/mem_neon.h"
     20 #include "aom_dsp/arm/sum_neon.h"
     21 #include "aom_dsp/arm/transpose_neon.h"
     22 #include "aom_dsp/intrapred_common.h"
     23 
     24 // -----------------------------------------------------------------------------
     25 // DC
     26 
     27 static inline void highbd_dc_store_4xh(uint16_t *dst, ptrdiff_t stride, int h,
     28                                       uint16x4_t dc) {
     29  for (int i = 0; i < h; ++i) {
     30    vst1_u16(dst + i * stride, dc);
     31  }
     32 }
     33 
     34 static inline void highbd_dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int h,
     35                                       uint16x8_t dc) {
     36  for (int i = 0; i < h; ++i) {
     37    vst1q_u16(dst + i * stride, dc);
     38  }
     39 }
     40 
     41 static inline void highbd_dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int h,
     42                                        uint16x8_t dc) {
     43  for (int i = 0; i < h; ++i) {
     44    vst1q_u16(dst + i * stride, dc);
     45    vst1q_u16(dst + i * stride + 8, dc);
     46  }
     47 }
     48 
     49 static inline void highbd_dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int h,
     50                                        uint16x8_t dc) {
     51  for (int i = 0; i < h; ++i) {
     52    vst1q_u16(dst + i * stride, dc);
     53    vst1q_u16(dst + i * stride + 8, dc);
     54    vst1q_u16(dst + i * stride + 16, dc);
     55    vst1q_u16(dst + i * stride + 24, dc);
     56  }
     57 }
     58 
     59 static inline void highbd_dc_store_64xh(uint16_t *dst, ptrdiff_t stride, int h,
     60                                        uint16x8_t dc) {
     61  for (int i = 0; i < h; ++i) {
     62    vst1q_u16(dst + i * stride, dc);
     63    vst1q_u16(dst + i * stride + 8, dc);
     64    vst1q_u16(dst + i * stride + 16, dc);
     65    vst1q_u16(dst + i * stride + 24, dc);
     66    vst1q_u16(dst + i * stride + 32, dc);
     67    vst1q_u16(dst + i * stride + 40, dc);
     68    vst1q_u16(dst + i * stride + 48, dc);
     69    vst1q_u16(dst + i * stride + 56, dc);
     70  }
     71 }
     72 
// Sum all eight u16 lanes of |a| into one 32-bit total and return that total
// broadcast to every lane of the result vector.
static inline uint32x4_t horizontal_add_and_broadcast_long_u16x8(uint16x8_t a) {
  // Need to assume input is up to 16 bits wide from dc 64x64 partial sum, so
  // promote first.
  const uint32x4_t b = vpaddlq_u16(a);
#if AOM_ARCH_AARCH64
  const uint32x4_t c = vpaddq_u32(b, b);
  return vpaddq_u32(c, c);
#else
  // Armv7 has no 128-bit vpaddq; reduce through the 64-bit halves instead.
  const uint32x2_t c = vadd_u32(vget_low_u32(b), vget_high_u32(b));
  const uint32x2_t d = vpadd_u32(c, c);
  return vcombine_u32(d, d);
#endif
}
     86 
     87 static inline uint16x8_t highbd_dc_load_partial_sum_4(const uint16_t *left) {
     88  // Nothing to do since sum is already one vector, but saves needing to
     89  // special case w=4 or h=4 cases. The combine will be zero cost for a sane
     90  // compiler since vld1 already sets the top half of a vector to zero as part
     91  // of the operation.
     92  return vcombine_u16(vld1_u16(left), vdup_n_u16(0));
     93 }
     94 
     95 static inline uint16x8_t highbd_dc_load_partial_sum_8(const uint16_t *left) {
     96  // Nothing to do since sum is already one vector, but saves needing to
     97  // special case w=8 or h=8 cases.
     98  return vld1q_u16(left);
     99 }
    100 
    101 static inline uint16x8_t highbd_dc_load_partial_sum_16(const uint16_t *left) {
    102  const uint16x8_t a0 = vld1q_u16(left + 0);  // up to 12 bits
    103  const uint16x8_t a1 = vld1q_u16(left + 8);
    104  return vaddq_u16(a0, a1);  // up to 13 bits
    105 }
    106 
    107 static inline uint16x8_t highbd_dc_load_partial_sum_32(const uint16_t *left) {
    108  const uint16x8_t a0 = vld1q_u16(left + 0);  // up to 12 bits
    109  const uint16x8_t a1 = vld1q_u16(left + 8);
    110  const uint16x8_t a2 = vld1q_u16(left + 16);
    111  const uint16x8_t a3 = vld1q_u16(left + 24);
    112  const uint16x8_t b0 = vaddq_u16(a0, a1);  // up to 13 bits
    113  const uint16x8_t b1 = vaddq_u16(a2, a3);
    114  return vaddq_u16(b0, b1);  // up to 14 bits
    115 }
    116 
    117 static inline uint16x8_t highbd_dc_load_partial_sum_64(const uint16_t *left) {
    118  const uint16x8_t a0 = vld1q_u16(left + 0);  // up to 12 bits
    119  const uint16x8_t a1 = vld1q_u16(left + 8);
    120  const uint16x8_t a2 = vld1q_u16(left + 16);
    121  const uint16x8_t a3 = vld1q_u16(left + 24);
    122  const uint16x8_t a4 = vld1q_u16(left + 32);
    123  const uint16x8_t a5 = vld1q_u16(left + 40);
    124  const uint16x8_t a6 = vld1q_u16(left + 48);
    125  const uint16x8_t a7 = vld1q_u16(left + 56);
    126  const uint16x8_t b0 = vaddq_u16(a0, a1);  // up to 13 bits
    127  const uint16x8_t b1 = vaddq_u16(a2, a3);
    128  const uint16x8_t b2 = vaddq_u16(a4, a5);
    129  const uint16x8_t b3 = vaddq_u16(a6, a7);
    130  const uint16x8_t c0 = vaddq_u16(b0, b1);  // up to 14 bits
    131  const uint16x8_t c1 = vaddq_u16(b2, b3);
    132  return vaddq_u16(c0, c1);  // up to 15 bits
    133 }
    134 
// Defines aom_highbd_dc_predictor_<w>x<h>_neon for square blocks: sums the w
// above and h left neighbors, rounds and divides by w + h (shift must equal
// log2(w + h)), then floods the block with the result.
#define HIGHBD_DC_PREDICTOR(w, h, shift)                               \
  void aom_highbd_dc_predictor_##w##x##h##_neon(                       \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,          \
      const uint16_t *left, int bd) {                                  \
    (void)bd;                                                          \
    const uint16x8_t a = highbd_dc_load_partial_sum_##w(above);        \
    const uint16x8_t l = highbd_dc_load_partial_sum_##h(left);         \
    const uint32x4_t sum =                                             \
        horizontal_add_and_broadcast_long_u16x8(vaddq_u16(a, l));      \
    const uint16x4_t dc0 = vrshrn_n_u32(sum, shift);                   \
    highbd_dc_store_##w##xh(dst, stride, (h), vdupq_lane_u16(dc0, 0)); \
  }
    147 
    148 void aom_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
    149                                      const uint16_t *above,
    150                                      const uint16_t *left, int bd) {
    151  // In the rectangular cases we simply extend the shorter vector to uint16x8
    152  // in order to accumulate, however in the 4x4 case there is no shorter vector
    153  // to extend so it is beneficial to do the whole calculation in uint16x4
    154  // instead.
    155  (void)bd;
    156  const uint16x4_t a = vld1_u16(above);  // up to 12 bits
    157  const uint16x4_t l = vld1_u16(left);
    158  uint16x4_t sum = vpadd_u16(a, l);  // up to 13 bits
    159  sum = vpadd_u16(sum, sum);         // up to 14 bits
    160  sum = vpadd_u16(sum, sum);
    161  const uint16x4_t dc = vrshr_n_u16(sum, 3);
    162  highbd_dc_store_4xh(dst, stride, 4, dc);
    163 }
    164 
// Instantiate the square-block DC predictors; shift = log2(w + h).
HIGHBD_DC_PREDICTOR(8, 8, 4)
HIGHBD_DC_PREDICTOR(16, 16, 5)
HIGHBD_DC_PREDICTOR(32, 32, 6)
HIGHBD_DC_PREDICTOR(64, 64, 7)

#undef HIGHBD_DC_PREDICTOR
    171 
    172 static inline int divide_using_multiply_shift(int num, int shift1,
    173                                              int multiplier, int shift2) {
    174  const int interm = num >> shift1;
    175  return interm * multiplier >> shift2;
    176 }
    177 
    178 #define HIGHBD_DC_MULTIPLIER_1X2 0xAAAB
    179 #define HIGHBD_DC_MULTIPLIER_1X4 0x6667
    180 #define HIGHBD_DC_SHIFT2 17
    181 
    182 static inline int highbd_dc_predictor_rect(int bw, int bh, int sum, int shift1,
    183                                           uint32_t multiplier) {
    184  return divide_using_multiply_shift(sum + ((bw + bh) >> 1), shift1, multiplier,
    185                                     HIGHBD_DC_SHIFT2);
    186 }
    187 
    188 #undef HIGHBD_DC_SHIFT2
    189 
// Defines aom_highbd_dc_predictor_<w>x<h>_neon for rectangular blocks, where
// the divisor (w + h) is not a power of two and needs a reciprocal multiply.
// |q| selects vdup_n_u16 vs vdupq_n_u16 to match the store helper's width.
#define HIGHBD_DC_PREDICTOR_RECT(w, h, q, shift, mult)                  \
  void aom_highbd_dc_predictor_##w##x##h##_neon(                        \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,           \
      const uint16_t *left, int bd) {                                   \
    (void)bd;                                                           \
    uint16x8_t sum_above = highbd_dc_load_partial_sum_##w(above);       \
    uint16x8_t sum_left = highbd_dc_load_partial_sum_##h(left);         \
    uint16x8_t sum_vec = vaddq_u16(sum_left, sum_above);                \
    int sum = horizontal_add_u16x8(sum_vec);                            \
    int dc0 = highbd_dc_predictor_rect((w), (h), sum, (shift), (mult)); \
    highbd_dc_store_##w##xh(dst, stride, (h), vdup##q##_n_u16(dc0));    \
  }
    202 
// Rectangular DC predictors. The reduced list under CONFIG_REALTIME_ONLY
// covers only the block sizes the realtime path uses.
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
HIGHBD_DC_PREDICTOR_RECT(4, 8, , 2, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(4, 16, , 2, HIGHBD_DC_MULTIPLIER_1X4)
HIGHBD_DC_PREDICTOR_RECT(8, 4, q, 2, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(8, 16, q, 3, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(8, 32, q, 3, HIGHBD_DC_MULTIPLIER_1X4)
HIGHBD_DC_PREDICTOR_RECT(16, 4, q, 2, HIGHBD_DC_MULTIPLIER_1X4)
HIGHBD_DC_PREDICTOR_RECT(16, 8, q, 3, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(16, 32, q, 4, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(16, 64, q, 4, HIGHBD_DC_MULTIPLIER_1X4)
HIGHBD_DC_PREDICTOR_RECT(32, 8, q, 3, HIGHBD_DC_MULTIPLIER_1X4)
HIGHBD_DC_PREDICTOR_RECT(32, 16, q, 4, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(32, 64, q, 5, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(64, 16, q, 4, HIGHBD_DC_MULTIPLIER_1X4)
HIGHBD_DC_PREDICTOR_RECT(64, 32, q, 5, HIGHBD_DC_MULTIPLIER_1X2)
#else
HIGHBD_DC_PREDICTOR_RECT(4, 8, , 2, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(8, 4, q, 2, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(8, 16, q, 3, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(16, 8, q, 3, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(16, 32, q, 4, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(32, 16, q, 4, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(32, 64, q, 5, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(64, 32, q, 5, HIGHBD_DC_MULTIPLIER_1X2)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#undef HIGHBD_DC_PREDICTOR_RECT
#undef HIGHBD_DC_MULTIPLIER_1X2
#undef HIGHBD_DC_MULTIPLIER_1X4
    232 
    233 // -----------------------------------------------------------------------------
    234 // DC_128
    235 
    236 #define HIGHBD_DC_PREDICTOR_128(w, h, q)                        \
    237  void aom_highbd_dc_128_predictor_##w##x##h##_neon(            \
    238      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,   \
    239      const uint16_t *left, int bd) {                           \
    240    (void)above;                                                \
    241    (void)bd;                                                   \
    242    (void)left;                                                 \
    243    highbd_dc_store_##w##xh(dst, stride, (h),                   \
    244                            vdup##q##_n_u16(0x80 << (bd - 8))); \
    245  }
    246 
// DC_128 predictors for every supported block size.
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
HIGHBD_DC_PREDICTOR_128(4, 4, )
HIGHBD_DC_PREDICTOR_128(4, 8, )
HIGHBD_DC_PREDICTOR_128(4, 16, )
HIGHBD_DC_PREDICTOR_128(8, 4, q)
HIGHBD_DC_PREDICTOR_128(8, 8, q)
HIGHBD_DC_PREDICTOR_128(8, 16, q)
HIGHBD_DC_PREDICTOR_128(8, 32, q)
HIGHBD_DC_PREDICTOR_128(16, 4, q)
HIGHBD_DC_PREDICTOR_128(16, 8, q)
HIGHBD_DC_PREDICTOR_128(16, 16, q)
HIGHBD_DC_PREDICTOR_128(16, 32, q)
HIGHBD_DC_PREDICTOR_128(16, 64, q)
HIGHBD_DC_PREDICTOR_128(32, 8, q)
HIGHBD_DC_PREDICTOR_128(32, 16, q)
HIGHBD_DC_PREDICTOR_128(32, 32, q)
HIGHBD_DC_PREDICTOR_128(32, 64, q)
HIGHBD_DC_PREDICTOR_128(64, 16, q)
HIGHBD_DC_PREDICTOR_128(64, 32, q)
HIGHBD_DC_PREDICTOR_128(64, 64, q)
#else
HIGHBD_DC_PREDICTOR_128(4, 4, )
HIGHBD_DC_PREDICTOR_128(4, 8, )
HIGHBD_DC_PREDICTOR_128(8, 4, q)
HIGHBD_DC_PREDICTOR_128(8, 8, q)
HIGHBD_DC_PREDICTOR_128(8, 16, q)
HIGHBD_DC_PREDICTOR_128(16, 8, q)
HIGHBD_DC_PREDICTOR_128(16, 16, q)
HIGHBD_DC_PREDICTOR_128(16, 32, q)
HIGHBD_DC_PREDICTOR_128(32, 16, q)
HIGHBD_DC_PREDICTOR_128(32, 32, q)
HIGHBD_DC_PREDICTOR_128(32, 64, q)
HIGHBD_DC_PREDICTOR_128(64, 32, q)
HIGHBD_DC_PREDICTOR_128(64, 64, q)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#undef HIGHBD_DC_PREDICTOR_128
    284 
    285 // -----------------------------------------------------------------------------
    286 // DC_LEFT
    287 
    288 static inline uint32x4_t highbd_dc_load_sum_4(const uint16_t *left) {
    289  const uint16x4_t a = vld1_u16(left);   // up to 12 bits
    290  const uint16x4_t b = vpadd_u16(a, a);  // up to 13 bits
    291  return vcombine_u32(vpaddl_u16(b), vdup_n_u32(0));
    292 }
    293 
    294 static inline uint32x4_t highbd_dc_load_sum_8(const uint16_t *left) {
    295  return horizontal_add_and_broadcast_long_u16x8(vld1q_u16(left));
    296 }
    297 
    298 static inline uint32x4_t highbd_dc_load_sum_16(const uint16_t *left) {
    299  return horizontal_add_and_broadcast_long_u16x8(
    300      highbd_dc_load_partial_sum_16(left));
    301 }
    302 
    303 static inline uint32x4_t highbd_dc_load_sum_32(const uint16_t *left) {
    304  return horizontal_add_and_broadcast_long_u16x8(
    305      highbd_dc_load_partial_sum_32(left));
    306 }
    307 
    308 static inline uint32x4_t highbd_dc_load_sum_64(const uint16_t *left) {
    309  return horizontal_add_and_broadcast_long_u16x8(
    310      highbd_dc_load_partial_sum_64(left));
    311 }
    312 
// Defines aom_highbd_dc_left_predictor_<w>x<h>_neon: averages only the h
// left-column neighbors (shift must equal log2(h)) and floods the block.
#define DC_PREDICTOR_LEFT(w, h, shift, q)                                  \
  void aom_highbd_dc_left_predictor_##w##x##h##_neon(                      \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,              \
      const uint16_t *left, int bd) {                                      \
    (void)above;                                                           \
    (void)bd;                                                              \
    const uint32x4_t sum = highbd_dc_load_sum_##h(left);                   \
    const uint16x4_t dc0 = vrshrn_n_u32(sum, (shift));                     \
    highbd_dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u16(dc0, 0)); \
  }
    323 
// DC_LEFT predictors; shift = log2(h) since only the left column is averaged.
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
DC_PREDICTOR_LEFT(4, 4, 2, )
DC_PREDICTOR_LEFT(4, 8, 3, )
DC_PREDICTOR_LEFT(4, 16, 4, )
DC_PREDICTOR_LEFT(8, 4, 2, q)
DC_PREDICTOR_LEFT(8, 8, 3, q)
DC_PREDICTOR_LEFT(8, 16, 4, q)
DC_PREDICTOR_LEFT(8, 32, 5, q)
DC_PREDICTOR_LEFT(16, 4, 2, q)
DC_PREDICTOR_LEFT(16, 8, 3, q)
DC_PREDICTOR_LEFT(16, 16, 4, q)
DC_PREDICTOR_LEFT(16, 32, 5, q)
DC_PREDICTOR_LEFT(16, 64, 6, q)
DC_PREDICTOR_LEFT(32, 8, 3, q)
DC_PREDICTOR_LEFT(32, 16, 4, q)
DC_PREDICTOR_LEFT(32, 32, 5, q)
DC_PREDICTOR_LEFT(32, 64, 6, q)
DC_PREDICTOR_LEFT(64, 16, 4, q)
DC_PREDICTOR_LEFT(64, 32, 5, q)
DC_PREDICTOR_LEFT(64, 64, 6, q)
#else
DC_PREDICTOR_LEFT(4, 4, 2, )
DC_PREDICTOR_LEFT(4, 8, 3, )
DC_PREDICTOR_LEFT(8, 4, 2, q)
DC_PREDICTOR_LEFT(8, 8, 3, q)
DC_PREDICTOR_LEFT(8, 16, 4, q)
DC_PREDICTOR_LEFT(16, 8, 3, q)
DC_PREDICTOR_LEFT(16, 16, 4, q)
DC_PREDICTOR_LEFT(16, 32, 5, q)
DC_PREDICTOR_LEFT(32, 16, 4, q)
DC_PREDICTOR_LEFT(32, 32, 5, q)
DC_PREDICTOR_LEFT(32, 64, 6, q)
DC_PREDICTOR_LEFT(64, 32, 5, q)
DC_PREDICTOR_LEFT(64, 64, 6, q)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#undef DC_PREDICTOR_LEFT
    361 
    362 // -----------------------------------------------------------------------------
    363 // DC_TOP
    364 
// Defines aom_highbd_dc_top_predictor_<w>x<h>_neon: averages only the w
// above-row neighbors (shift must equal log2(w)) and floods the block.
#define DC_PREDICTOR_TOP(w, h, shift, q)                                   \
  void aom_highbd_dc_top_predictor_##w##x##h##_neon(                       \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,              \
      const uint16_t *left, int bd) {                                      \
    (void)bd;                                                              \
    (void)left;                                                            \
    const uint32x4_t sum = highbd_dc_load_sum_##w(above);                  \
    const uint16x4_t dc0 = vrshrn_n_u32(sum, (shift));                     \
    highbd_dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u16(dc0, 0)); \
  }
    375 
// DC_TOP predictors; shift = log2(w) since only the above row is averaged.
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
DC_PREDICTOR_TOP(4, 4, 2, )
DC_PREDICTOR_TOP(4, 8, 2, )
DC_PREDICTOR_TOP(4, 16, 2, )
DC_PREDICTOR_TOP(8, 4, 3, q)
DC_PREDICTOR_TOP(8, 8, 3, q)
DC_PREDICTOR_TOP(8, 16, 3, q)
DC_PREDICTOR_TOP(8, 32, 3, q)
DC_PREDICTOR_TOP(16, 4, 4, q)
DC_PREDICTOR_TOP(16, 8, 4, q)
DC_PREDICTOR_TOP(16, 16, 4, q)
DC_PREDICTOR_TOP(16, 32, 4, q)
DC_PREDICTOR_TOP(16, 64, 4, q)
DC_PREDICTOR_TOP(32, 8, 5, q)
DC_PREDICTOR_TOP(32, 16, 5, q)
DC_PREDICTOR_TOP(32, 32, 5, q)
DC_PREDICTOR_TOP(32, 64, 5, q)
DC_PREDICTOR_TOP(64, 16, 6, q)
DC_PREDICTOR_TOP(64, 32, 6, q)
DC_PREDICTOR_TOP(64, 64, 6, q)
#else
DC_PREDICTOR_TOP(4, 4, 2, )
DC_PREDICTOR_TOP(4, 8, 2, )
DC_PREDICTOR_TOP(8, 4, 3, q)
DC_PREDICTOR_TOP(8, 8, 3, q)
DC_PREDICTOR_TOP(8, 16, 3, q)
DC_PREDICTOR_TOP(16, 8, 4, q)
DC_PREDICTOR_TOP(16, 16, 4, q)
DC_PREDICTOR_TOP(16, 32, 4, q)
DC_PREDICTOR_TOP(32, 16, 5, q)
DC_PREDICTOR_TOP(32, 32, 5, q)
DC_PREDICTOR_TOP(32, 64, 5, q)
DC_PREDICTOR_TOP(64, 32, 6, q)
DC_PREDICTOR_TOP(64, 64, 6, q)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#undef DC_PREDICTOR_TOP
    413 
    414 // -----------------------------------------------------------------------------
    415 // V_PRED
    416 
// Defines aom_highbd_v_predictor_<W>x<H>_neon: V_PRED copies the above row
// into every row of the block; the left column and bit depth are unused.
#define HIGHBD_V_NXM(W, H)                                    \
  void aom_highbd_v_predictor_##W##x##H##_neon(               \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
      const uint16_t *left, int bd) {                         \
    (void)left;                                               \
    (void)bd;                                                 \
    vertical##W##xh_neon(dst, stride, above, H);              \
  }
    425 
    426 static inline uint16x8x2_t load_uint16x8x2(uint16_t const *ptr) {
    427  uint16x8x2_t x;
    428  // Clang/gcc uses ldp here.
    429  x.val[0] = vld1q_u16(ptr);
    430  x.val[1] = vld1q_u16(ptr + 8);
    431  return x;
    432 }
    433 
    434 static inline void store_uint16x8x2(uint16_t *ptr, uint16x8x2_t x) {
    435  vst1q_u16(ptr, x.val[0]);
    436  vst1q_u16(ptr + 8, x.val[1]);
    437 }
    438 
    439 static inline void vertical4xh_neon(uint16_t *dst, ptrdiff_t stride,
    440                                    const uint16_t *const above, int height) {
    441  const uint16x4_t row = vld1_u16(above);
    442  int y = height;
    443  do {
    444    vst1_u16(dst, row);
    445    vst1_u16(dst + stride, row);
    446    dst += stride << 1;
    447    y -= 2;
    448  } while (y != 0);
    449 }
    450 
    451 static inline void vertical8xh_neon(uint16_t *dst, ptrdiff_t stride,
    452                                    const uint16_t *const above, int height) {
    453  const uint16x8_t row = vld1q_u16(above);
    454  int y = height;
    455  do {
    456    vst1q_u16(dst, row);
    457    vst1q_u16(dst + stride, row);
    458    dst += stride << 1;
    459    y -= 2;
    460  } while (y != 0);
    461 }
    462 
    463 static inline void vertical16xh_neon(uint16_t *dst, ptrdiff_t stride,
    464                                     const uint16_t *const above, int height) {
    465  const uint16x8x2_t row = load_uint16x8x2(above);
    466  int y = height;
    467  do {
    468    store_uint16x8x2(dst, row);
    469    store_uint16x8x2(dst + stride, row);
    470    dst += stride << 1;
    471    y -= 2;
    472  } while (y != 0);
    473 }
    474 
    475 static inline uint16x8x4_t load_uint16x8x4(uint16_t const *ptr) {
    476  uint16x8x4_t x;
    477  // Clang/gcc uses ldp here.
    478  x.val[0] = vld1q_u16(ptr);
    479  x.val[1] = vld1q_u16(ptr + 8);
    480  x.val[2] = vld1q_u16(ptr + 16);
    481  x.val[3] = vld1q_u16(ptr + 24);
    482  return x;
    483 }
    484 
    485 static inline void store_uint16x8x4(uint16_t *ptr, uint16x8x4_t x) {
    486  vst1q_u16(ptr, x.val[0]);
    487  vst1q_u16(ptr + 8, x.val[1]);
    488  vst1q_u16(ptr + 16, x.val[2]);
    489  vst1q_u16(ptr + 24, x.val[3]);
    490 }
    491 
    492 static inline void vertical32xh_neon(uint16_t *dst, ptrdiff_t stride,
    493                                     const uint16_t *const above, int height) {
    494  const uint16x8x4_t row = load_uint16x8x4(above);
    495  int y = height;
    496  do {
    497    store_uint16x8x4(dst, row);
    498    store_uint16x8x4(dst + stride, row);
    499    dst += stride << 1;
    500    y -= 2;
    501  } while (y != 0);
    502 }
    503 
    504 static inline void vertical64xh_neon(uint16_t *dst, ptrdiff_t stride,
    505                                     const uint16_t *const above, int height) {
    506  uint16_t *dst32 = dst + 32;
    507  const uint16x8x4_t row = load_uint16x8x4(above);
    508  const uint16x8x4_t row32 = load_uint16x8x4(above + 32);
    509  int y = height;
    510  do {
    511    store_uint16x8x4(dst, row);
    512    store_uint16x8x4(dst32, row32);
    513    store_uint16x8x4(dst + stride, row);
    514    store_uint16x8x4(dst32 + stride, row32);
    515    dst += stride << 1;
    516    dst32 += stride << 1;
    517    y -= 2;
    518  } while (y != 0);
    519 }
    520 
// V_PRED predictors for every supported block size.
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
HIGHBD_V_NXM(4, 4)
HIGHBD_V_NXM(4, 8)
HIGHBD_V_NXM(4, 16)

HIGHBD_V_NXM(8, 4)
HIGHBD_V_NXM(8, 8)
HIGHBD_V_NXM(8, 16)
HIGHBD_V_NXM(8, 32)

HIGHBD_V_NXM(16, 4)
HIGHBD_V_NXM(16, 8)
HIGHBD_V_NXM(16, 16)
HIGHBD_V_NXM(16, 32)
HIGHBD_V_NXM(16, 64)

HIGHBD_V_NXM(32, 8)
HIGHBD_V_NXM(32, 16)
HIGHBD_V_NXM(32, 32)
HIGHBD_V_NXM(32, 64)

HIGHBD_V_NXM(64, 16)
HIGHBD_V_NXM(64, 32)
HIGHBD_V_NXM(64, 64)
#else
HIGHBD_V_NXM(4, 4)
HIGHBD_V_NXM(4, 8)

HIGHBD_V_NXM(8, 4)
HIGHBD_V_NXM(8, 8)
HIGHBD_V_NXM(8, 16)

HIGHBD_V_NXM(16, 8)
HIGHBD_V_NXM(16, 16)
HIGHBD_V_NXM(16, 32)

HIGHBD_V_NXM(32, 16)
HIGHBD_V_NXM(32, 32)
HIGHBD_V_NXM(32, 64)

HIGHBD_V_NXM(64, 32)
HIGHBD_V_NXM(64, 64)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    564 
    565 // -----------------------------------------------------------------------------
    566 // H_PRED
    567 
    568 static inline void highbd_h_store_4x4(uint16_t *dst, ptrdiff_t stride,
    569                                      uint16x4_t left) {
    570  vst1_u16(dst + 0 * stride, vdup_lane_u16(left, 0));
    571  vst1_u16(dst + 1 * stride, vdup_lane_u16(left, 1));
    572  vst1_u16(dst + 2 * stride, vdup_lane_u16(left, 2));
    573  vst1_u16(dst + 3 * stride, vdup_lane_u16(left, 3));
    574 }
    575 
    576 static inline void highbd_h_store_8x4(uint16_t *dst, ptrdiff_t stride,
    577                                      uint16x4_t left) {
    578  vst1q_u16(dst + 0 * stride, vdupq_lane_u16(left, 0));
    579  vst1q_u16(dst + 1 * stride, vdupq_lane_u16(left, 1));
    580  vst1q_u16(dst + 2 * stride, vdupq_lane_u16(left, 2));
    581  vst1q_u16(dst + 3 * stride, vdupq_lane_u16(left, 3));
    582 }
    583 
    584 static inline void highbd_h_store_16x1(uint16_t *dst, uint16x8_t left) {
    585  vst1q_u16(dst + 0, left);
    586  vst1q_u16(dst + 8, left);
    587 }
    588 
    589 static inline void highbd_h_store_16x4(uint16_t *dst, ptrdiff_t stride,
    590                                       uint16x4_t left) {
    591  highbd_h_store_16x1(dst + 0 * stride, vdupq_lane_u16(left, 0));
    592  highbd_h_store_16x1(dst + 1 * stride, vdupq_lane_u16(left, 1));
    593  highbd_h_store_16x1(dst + 2 * stride, vdupq_lane_u16(left, 2));
    594  highbd_h_store_16x1(dst + 3 * stride, vdupq_lane_u16(left, 3));
    595 }
    596 
    597 static inline void highbd_h_store_32x1(uint16_t *dst, uint16x8_t left) {
    598  vst1q_u16(dst + 0, left);
    599  vst1q_u16(dst + 8, left);
    600  vst1q_u16(dst + 16, left);
    601  vst1q_u16(dst + 24, left);
    602 }
    603 
    604 static inline void highbd_h_store_32x4(uint16_t *dst, ptrdiff_t stride,
    605                                       uint16x4_t left) {
    606  highbd_h_store_32x1(dst + 0 * stride, vdupq_lane_u16(left, 0));
    607  highbd_h_store_32x1(dst + 1 * stride, vdupq_lane_u16(left, 1));
    608  highbd_h_store_32x1(dst + 2 * stride, vdupq_lane_u16(left, 2));
    609  highbd_h_store_32x1(dst + 3 * stride, vdupq_lane_u16(left, 3));
    610 }
    611 
    612 static inline void highbd_h_store_64x1(uint16_t *dst, uint16x8_t left) {
    613  vst1q_u16(dst + 0, left);
    614  vst1q_u16(dst + 8, left);
    615  vst1q_u16(dst + 16, left);
    616  vst1q_u16(dst + 24, left);
    617  vst1q_u16(dst + 32, left);
    618  vst1q_u16(dst + 40, left);
    619  vst1q_u16(dst + 48, left);
    620  vst1q_u16(dst + 56, left);
    621 }
    622 
    623 static inline void highbd_h_store_64x4(uint16_t *dst, ptrdiff_t stride,
    624                                       uint16x4_t left) {
    625  highbd_h_store_64x1(dst + 0 * stride, vdupq_lane_u16(left, 0));
    626  highbd_h_store_64x1(dst + 1 * stride, vdupq_lane_u16(left, 1));
    627  highbd_h_store_64x1(dst + 2 * stride, vdupq_lane_u16(left, 2));
    628  highbd_h_store_64x1(dst + 3 * stride, vdupq_lane_u16(left, 3));
    629 }
    630 
    631 void aom_highbd_h_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
    632                                     const uint16_t *above,
    633                                     const uint16_t *left, int bd) {
    634  (void)above;
    635  (void)bd;
    636  highbd_h_store_4x4(dst, stride, vld1_u16(left));
    637 }
    638 
    639 void aom_highbd_h_predictor_4x8_neon(uint16_t *dst, ptrdiff_t stride,
    640                                     const uint16_t *above,
    641                                     const uint16_t *left, int bd) {
    642  (void)above;
    643  (void)bd;
    644  uint16x8_t l = vld1q_u16(left);
    645  highbd_h_store_4x4(dst + 0 * stride, stride, vget_low_u16(l));
    646  highbd_h_store_4x4(dst + 4 * stride, stride, vget_high_u16(l));
    647 }
    648 
    649 void aom_highbd_h_predictor_8x4_neon(uint16_t *dst, ptrdiff_t stride,
    650                                     const uint16_t *above,
    651                                     const uint16_t *left, int bd) {
    652  (void)above;
    653  (void)bd;
    654  highbd_h_store_8x4(dst, stride, vld1_u16(left));
    655 }
    656 
    657 void aom_highbd_h_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
    658                                     const uint16_t *above,
    659                                     const uint16_t *left, int bd) {
    660  (void)above;
    661  (void)bd;
    662  uint16x8_t l = vld1q_u16(left);
    663  highbd_h_store_8x4(dst + 0 * stride, stride, vget_low_u16(l));
    664  highbd_h_store_8x4(dst + 4 * stride, stride, vget_high_u16(l));
    665 }
    666 
    667 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    668 void aom_highbd_h_predictor_16x4_neon(uint16_t *dst, ptrdiff_t stride,
    669                                      const uint16_t *above,
    670                                      const uint16_t *left, int bd) {
    671  (void)above;
    672  (void)bd;
    673  highbd_h_store_16x4(dst, stride, vld1_u16(left));
    674 }
    675 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    676 
    677 void aom_highbd_h_predictor_16x8_neon(uint16_t *dst, ptrdiff_t stride,
    678                                      const uint16_t *above,
    679                                      const uint16_t *left, int bd) {
    680  (void)above;
    681  (void)bd;
    682  uint16x8_t l = vld1q_u16(left);
    683  highbd_h_store_16x4(dst + 0 * stride, stride, vget_low_u16(l));
    684  highbd_h_store_16x4(dst + 4 * stride, stride, vget_high_u16(l));
    685 }
    686 
    687 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    688 void aom_highbd_h_predictor_32x8_neon(uint16_t *dst, ptrdiff_t stride,
    689                                      const uint16_t *above,
    690                                      const uint16_t *left, int bd) {
    691  (void)above;
    692  (void)bd;
    693  uint16x8_t l = vld1q_u16(left);
    694  highbd_h_store_32x4(dst + 0 * stride, stride, vget_low_u16(l));
    695  highbd_h_store_32x4(dst + 4 * stride, stride, vget_high_u16(l));
    696 }
    697 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    698 
    699 // For cases where height >= 16 we use pairs of loads to get LDP instructions.
// Defines aom_highbd_h_predictor_<w>x<h>_neon for h >= 16: consumes the left
// column 16 rows at a time using paired loads so the compiler emits LDP.
#define HIGHBD_H_WXH_LARGE(w, h)                                            \
  void aom_highbd_h_predictor_##w##x##h##_neon(                             \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,               \
      const uint16_t *left, int bd) {                                       \
    (void)above;                                                            \
    (void)bd;                                                               \
    for (int i = 0; i < (h) / 16; ++i) {                                    \
      uint16x8_t l0 = vld1q_u16(left + 0);                                  \
      uint16x8_t l1 = vld1q_u16(left + 8);                                  \
      highbd_h_store_##w##x4(dst + 0 * stride, stride, vget_low_u16(l0));   \
      highbd_h_store_##w##x4(dst + 4 * stride, stride, vget_high_u16(l0));  \
      highbd_h_store_##w##x4(dst + 8 * stride, stride, vget_low_u16(l1));   \
      highbd_h_store_##w##x4(dst + 12 * stride, stride, vget_high_u16(l1)); \
      left += 16;                                                           \
      dst += 16 * stride;                                                   \
    }                                                                       \
  }
    717 
// Instantiate the tall horizontal predictors. The #else branch provides the
// reduced set of block shapes built when CONFIG_REALTIME_ONLY is set and the
// decoder is not built.
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
HIGHBD_H_WXH_LARGE(4, 16)
HIGHBD_H_WXH_LARGE(8, 16)
HIGHBD_H_WXH_LARGE(8, 32)
HIGHBD_H_WXH_LARGE(16, 16)
HIGHBD_H_WXH_LARGE(16, 32)
HIGHBD_H_WXH_LARGE(16, 64)
HIGHBD_H_WXH_LARGE(32, 16)
HIGHBD_H_WXH_LARGE(32, 32)
HIGHBD_H_WXH_LARGE(32, 64)
HIGHBD_H_WXH_LARGE(64, 16)
HIGHBD_H_WXH_LARGE(64, 32)
HIGHBD_H_WXH_LARGE(64, 64)
#else
HIGHBD_H_WXH_LARGE(8, 16)
HIGHBD_H_WXH_LARGE(16, 16)
HIGHBD_H_WXH_LARGE(16, 32)
HIGHBD_H_WXH_LARGE(32, 16)
HIGHBD_H_WXH_LARGE(32, 32)
HIGHBD_H_WXH_LARGE(32, 64)
HIGHBD_H_WXH_LARGE(64, 32)
HIGHBD_H_WXH_LARGE(64, 64)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#undef HIGHBD_H_WXH_LARGE
    743 
    744 // -----------------------------------------------------------------------------
    745 // PAETH
    746 
    747 static inline void highbd_paeth_4or8_x_h_neon(uint16_t *dest, ptrdiff_t stride,
    748                                              const uint16_t *const top_row,
    749                                              const uint16_t *const left_column,
    750                                              int width, int height) {
    751  const uint16x8_t top_left = vdupq_n_u16(top_row[-1]);
    752  const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
    753  uint16x8_t top;
    754  if (width == 4) {
    755    top = vcombine_u16(vld1_u16(top_row), vdup_n_u16(0));
    756  } else {  // width == 8
    757    top = vld1q_u16(top_row);
    758  }
    759 
    760  for (int y = 0; y < height; ++y) {
    761    const uint16x8_t left = vdupq_n_u16(left_column[y]);
    762 
    763    const uint16x8_t left_dist = vabdq_u16(top, top_left);
    764    const uint16x8_t top_dist = vabdq_u16(left, top_left);
    765    const uint16x8_t top_left_dist =
    766        vabdq_u16(vaddq_u16(top, left), top_left_x2);
    767 
    768    const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist);
    769    const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist);
    770    const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist);
    771 
    772    // if (left_dist <= top_dist && left_dist <= top_left_dist)
    773    const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left);
    774    //   dest[x] = left_column[y];
    775    // Fill all the unused spaces with 'top'. They will be overwritten when
    776    // the positions for top_left are known.
    777    uint16x8_t result = vbslq_u16(left_mask, left, top);
    778    // else if (top_dist <= top_left_dist)
    779    //   dest[x] = top_row[x];
    780    // Add these values to the mask. They were already set.
    781    const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left);
    782    // else
    783    //   dest[x] = top_left;
    784    result = vbslq_u16(left_or_top_mask, result, top_left);
    785 
    786    if (width == 4) {
    787      vst1_u16(dest, vget_low_u16(result));
    788    } else {  // width == 8
    789      vst1q_u16(dest, result);
    790    }
    791    dest += stride;
    792  }
    793 }
    794 
// Generates aom_highbd_paeth_predictor_<W>x<H>_neon for 4- and 8-wide
// blocks by delegating to highbd_paeth_4or8_x_h_neon. `bd` is unused.
#define HIGHBD_PAETH_NXM(W, H)                                  \
  void aom_highbd_paeth_predictor_##W##x##H##_neon(             \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,   \
      const uint16_t *left, int bd) {                           \
    (void)bd;                                                   \
    highbd_paeth_4or8_x_h_neon(dst, stride, above, left, W, H); \
  }

// The #else branch provides the reduced set of block shapes for
// realtime-only builds without the decoder.
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
HIGHBD_PAETH_NXM(4, 4)
HIGHBD_PAETH_NXM(4, 8)
HIGHBD_PAETH_NXM(4, 16)
HIGHBD_PAETH_NXM(8, 4)
HIGHBD_PAETH_NXM(8, 8)
HIGHBD_PAETH_NXM(8, 16)
HIGHBD_PAETH_NXM(8, 32)
#else
HIGHBD_PAETH_NXM(4, 4)
HIGHBD_PAETH_NXM(4, 8)
HIGHBD_PAETH_NXM(8, 4)
HIGHBD_PAETH_NXM(8, 8)
HIGHBD_PAETH_NXM(8, 16)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    818 
    819 // Select the closest values and collect them.
    820 static inline uint16x8_t select_paeth(const uint16x8_t top,
    821                                      const uint16x8_t left,
    822                                      const uint16x8_t top_left,
    823                                      const uint16x8_t left_le_top,
    824                                      const uint16x8_t left_le_top_left,
    825                                      const uint16x8_t top_le_top_left) {
    826  // if (left_dist <= top_dist && left_dist <= top_left_dist)
    827  const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left);
    828  //   dest[x] = left_column[y];
    829  // Fill all the unused spaces with 'top'. They will be overwritten when
    830  // the positions for top_left are known.
    831  const uint16x8_t result = vbslq_u16(left_mask, left, top);
    832  // else if (top_dist <= top_left_dist)
    833  //   dest[x] = top_row[x];
    834  // Add these values to the mask. They were already set.
    835  const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left);
    836  // else
    837  //   dest[x] = top_left;
    838  return vbslq_u16(left_or_top_mask, result, top_left);
    839 }
    840 
// Computes the Paeth selection for the `num`-th group of 8 pixels in the
// current row and stores it. Expects `dest`, `left`, `top`, `top_left`,
// `top_left_x2` and `top_dist` to be in scope at the expansion site (see
// highbd_paeth16_plus_x_h_neon).
#define PAETH_PREDICTOR(num)                                                  \
  do {                                                                        \
    const uint16x8_t left_dist = vabdq_u16(top[num], top_left);               \
    const uint16x8_t top_left_dist =                                          \
        vabdq_u16(vaddq_u16(top[num], left), top_left_x2);                    \
    const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist);            \
    const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist);  \
    const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist);    \
    const uint16x8_t result =                                                 \
        select_paeth(top[num], left, top_left, left_le_top, left_le_top_left, \
                     top_le_top_left);                                        \
    vst1q_u16(dest + (num * 8), result);                                      \
  } while (0)

// Loads the `num`-th group of 8 top-row pixels; `top_row` must be in scope.
#define LOAD_TOP_ROW(num) vld1q_u16(top_row + (num * 8))
    856 
// Paeth prediction for blocks 16 or more pixels wide. The top row is
// preloaded into up to eight q-registers; each row then evaluates the Paeth
// select for every 8-pixel group via the PAETH_PREDICTOR macro, which reads
// `dest`, `left`, `top`, `top_left`, `top_left_x2` and `top_dist` from this
// scope.
static inline void highbd_paeth16_plus_x_h_neon(
    uint16_t *dest, ptrdiff_t stride, const uint16_t *const top_row,
    const uint16_t *const left_column, int width, int height) {
  const uint16x8_t top_left = vdupq_n_u16(top_row[-1]);
  const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
  // Load only the registers the block width actually needs (2, 4 or 8).
  uint16x8_t top[8];
  top[0] = LOAD_TOP_ROW(0);
  top[1] = LOAD_TOP_ROW(1);
  if (width > 16) {
    top[2] = LOAD_TOP_ROW(2);
    top[3] = LOAD_TOP_ROW(3);
    if (width == 64) {
      top[4] = LOAD_TOP_ROW(4);
      top[5] = LOAD_TOP_ROW(5);
      top[6] = LOAD_TOP_ROW(6);
      top[7] = LOAD_TOP_ROW(7);
    }
  }

  for (int y = 0; y < height; ++y) {
    const uint16x8_t left = vdupq_n_u16(left_column[y]);
    // |top - top_left| is the same for every column group in the row.
    const uint16x8_t top_dist = vabdq_u16(left, top_left);
    PAETH_PREDICTOR(0);
    PAETH_PREDICTOR(1);
    if (width > 16) {
      PAETH_PREDICTOR(2);
      PAETH_PREDICTOR(3);
      if (width == 64) {
        PAETH_PREDICTOR(4);
        PAETH_PREDICTOR(5);
        PAETH_PREDICTOR(6);
        PAETH_PREDICTOR(7);
      }
    }
    dest += stride;
  }
}
    894 
// Generates aom_highbd_paeth_predictor_<W>x<H>_neon for widths >= 16 by
// delegating to highbd_paeth16_plus_x_h_neon. `bd` is unused.
#define HIGHBD_PAETH_NXM_WIDE(W, H)                               \
  void aom_highbd_paeth_predictor_##W##x##H##_neon(               \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,     \
      const uint16_t *left, int bd) {                             \
    (void)bd;                                                     \
    highbd_paeth16_plus_x_h_neon(dst, stride, above, left, W, H); \
  }

// The #else branch provides the reduced set of block shapes for
// realtime-only builds without the decoder.
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
HIGHBD_PAETH_NXM_WIDE(16, 4)
HIGHBD_PAETH_NXM_WIDE(16, 8)
HIGHBD_PAETH_NXM_WIDE(16, 16)
HIGHBD_PAETH_NXM_WIDE(16, 32)
HIGHBD_PAETH_NXM_WIDE(16, 64)
HIGHBD_PAETH_NXM_WIDE(32, 8)
HIGHBD_PAETH_NXM_WIDE(32, 16)
HIGHBD_PAETH_NXM_WIDE(32, 32)
HIGHBD_PAETH_NXM_WIDE(32, 64)
HIGHBD_PAETH_NXM_WIDE(64, 16)
HIGHBD_PAETH_NXM_WIDE(64, 32)
HIGHBD_PAETH_NXM_WIDE(64, 64)
#else
HIGHBD_PAETH_NXM_WIDE(16, 8)
HIGHBD_PAETH_NXM_WIDE(16, 16)
HIGHBD_PAETH_NXM_WIDE(16, 32)
HIGHBD_PAETH_NXM_WIDE(32, 16)
HIGHBD_PAETH_NXM_WIDE(32, 32)
HIGHBD_PAETH_NXM_WIDE(32, 64)
HIGHBD_PAETH_NXM_WIDE(64, 32)
HIGHBD_PAETH_NXM_WIDE(64, 64)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    926 
    927 // -----------------------------------------------------------------------------
    928 // SMOOTH
    929 
// 256 - v = vneg_s8(v)
// Computes (256 - v) per u16 lane, assuming each lane is in (0, 256) — as
// the smooth weights are. In that range 256 - v and a bytewise two's
// complement negation produce the same bit pattern (the high byte stays
// zero), so a single s8 negate suffices. NOTE(review): lanes of 0 or >= 256
// would give a different result — callers only pass weight-table values.
static inline uint16x4_t negate_s8(const uint16x4_t v) {
  return vreinterpret_u16_s8(vneg_s8(vreinterpret_s8_u16(v)));
}
    934 
    935 static inline void highbd_smooth_4xh_neon(uint16_t *dst, ptrdiff_t stride,
    936                                          const uint16_t *const top_row,
    937                                          const uint16_t *const left_column,
    938                                          const int height) {
    939  const uint16_t top_right = top_row[3];
    940  const uint16_t bottom_left = left_column[height - 1];
    941  const uint16_t *const weights_y = smooth_weights_u16 + height - 4;
    942 
    943  const uint16x4_t top_v = vld1_u16(top_row);
    944  const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
    945  const uint16x4_t weights_x_v = vld1_u16(smooth_weights_u16);
    946  const uint16x4_t scaled_weights_x = negate_s8(weights_x_v);
    947  const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right);
    948 
    949  for (int y = 0; y < height; ++y) {
    950    // Each variable in the running summation is named for the last item to be
    951    // accumulated.
    952    const uint32x4_t weighted_top =
    953        vmlal_n_u16(weighted_tr, top_v, weights_y[y]);
    954    const uint32x4_t weighted_left =
    955        vmlal_n_u16(weighted_top, weights_x_v, left_column[y]);
    956    const uint32x4_t weighted_bl =
    957        vmlal_n_u16(weighted_left, bottom_left_v, 256 - weights_y[y]);
    958 
    959    const uint16x4_t pred =
    960        vrshrn_n_u32(weighted_bl, SMOOTH_WEIGHT_LOG2_SCALE + 1);
    961    vst1_u16(dst, pred);
    962    dst += stride;
    963  }
    964 }
    965 
    966 // Common code between 8xH and [16|32|64]xH.
    967 static inline void highbd_calculate_pred8(
    968    uint16_t *dst, const uint32x4_t weighted_corners_low,
    969    const uint32x4_t weighted_corners_high, const uint16x4x2_t top_vals,
    970    const uint16x4x2_t weights_x, const uint16_t left_y,
    971    const uint16_t weight_y) {
    972  // Each variable in the running summation is named for the last item to be
    973  // accumulated.
    974  const uint32x4_t weighted_top_low =
    975      vmlal_n_u16(weighted_corners_low, top_vals.val[0], weight_y);
    976  const uint32x4_t weighted_edges_low =
    977      vmlal_n_u16(weighted_top_low, weights_x.val[0], left_y);
    978 
    979  const uint16x4_t pred_low =
    980      vrshrn_n_u32(weighted_edges_low, SMOOTH_WEIGHT_LOG2_SCALE + 1);
    981  vst1_u16(dst, pred_low);
    982 
    983  const uint32x4_t weighted_top_high =
    984      vmlal_n_u16(weighted_corners_high, top_vals.val[1], weight_y);
    985  const uint32x4_t weighted_edges_high =
    986      vmlal_n_u16(weighted_top_high, weights_x.val[1], left_y);
    987 
    988  const uint16x4_t pred_high =
    989      vrshrn_n_u32(weighted_edges_high, SMOOTH_WEIGHT_LOG2_SCALE + 1);
    990  vst1_u16(dst + 4, pred_high);
    991 }
    992 
    993 static void highbd_smooth_8xh_neon(uint16_t *dst, ptrdiff_t stride,
    994                                   const uint16_t *const top_row,
    995                                   const uint16_t *const left_column,
    996                                   const int height) {
    997  const uint16_t top_right = top_row[7];
    998  const uint16_t bottom_left = left_column[height - 1];
    999  const uint16_t *const weights_y = smooth_weights_u16 + height - 4;
   1000 
   1001  const uint16x4x2_t top_vals = { { vld1_u16(top_row),
   1002                                    vld1_u16(top_row + 4) } };
   1003  const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
   1004  const uint16x4x2_t weights_x = { { vld1_u16(smooth_weights_u16 + 4),
   1005                                     vld1_u16(smooth_weights_u16 + 8) } };
   1006  const uint32x4_t weighted_tr_low =
   1007      vmull_n_u16(negate_s8(weights_x.val[0]), top_right);
   1008  const uint32x4_t weighted_tr_high =
   1009      vmull_n_u16(negate_s8(weights_x.val[1]), top_right);
   1010 
   1011  for (int y = 0; y < height; ++y) {
   1012    const uint32x4_t weighted_bl =
   1013        vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
   1014    const uint32x4_t weighted_corners_low =
   1015        vaddq_u32(weighted_bl, weighted_tr_low);
   1016    const uint32x4_t weighted_corners_high =
   1017        vaddq_u32(weighted_bl, weighted_tr_high);
   1018    highbd_calculate_pred8(dst, weighted_corners_low, weighted_corners_high,
   1019                           top_vals, weights_x, left_column[y], weights_y[y]);
   1020    dst += stride;
   1021  }
   1022 }
   1023 
// Generates aom_highbd_smooth_predictor_<W>x<H>_neon for 4- and 8-wide
// blocks by delegating to the matching highbd_smooth_<W>xh_neon helper.
// `bd` is unused.
#define HIGHBD_SMOOTH_NXM(W, H)                                 \
  void aom_highbd_smooth_predictor_##W##x##H##_neon(            \
      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \
      const uint16_t *left, int bd) {                           \
    (void)bd;                                                   \
    highbd_smooth_##W##xh_neon(dst, y_stride, above, left, H);  \
  }

// The #else branch provides the reduced set of block shapes for
// realtime-only builds without the decoder.
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
HIGHBD_SMOOTH_NXM(4, 4)
HIGHBD_SMOOTH_NXM(4, 8)
HIGHBD_SMOOTH_NXM(8, 4)
HIGHBD_SMOOTH_NXM(8, 8)
HIGHBD_SMOOTH_NXM(4, 16)
HIGHBD_SMOOTH_NXM(8, 16)
HIGHBD_SMOOTH_NXM(8, 32)
#else
HIGHBD_SMOOTH_NXM(4, 4)
HIGHBD_SMOOTH_NXM(4, 8)
HIGHBD_SMOOTH_NXM(8, 4)
HIGHBD_SMOOTH_NXM(8, 8)
HIGHBD_SMOOTH_NXM(8, 16)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#undef HIGHBD_SMOOTH_NXM
   1049 
// For width 16 and above.
// Generates highbd_smooth_<W>xh_neon: smooth prediction for wide blocks.
// The (256 - weight_x) * top_right products for each 8-pixel group are
// precomputed outside the row loop; every row then adds the weighted
// bottom-left, top-row and left-pixel terms and stores via
// highbd_calculate_pred8.
#define HIGHBD_SMOOTH_PREDICTOR(W)                                             \
  static void highbd_smooth_##W##xh_neon(                                      \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row,          \
      const uint16_t *const left_column, const int height) {                   \
    const uint16_t top_right = top_row[(W) - 1];                               \
    const uint16_t bottom_left = left_column[height - 1];                      \
    const uint16_t *const weights_y = smooth_weights_u16 + height - 4;         \
                                                                               \
    /* Precompute weighted values that don't vary with |y|. */                 \
    uint32x4_t weighted_tr_low[(W) >> 3];                                      \
    uint32x4_t weighted_tr_high[(W) >> 3];                                     \
    for (int i = 0; i < (W) >> 3; ++i) {                                       \
      const int x = i << 3;                                                    \
      const uint16x4_t weights_x_low =                                         \
          vld1_u16(smooth_weights_u16 + (W) - 4 + x);                          \
      weighted_tr_low[i] = vmull_n_u16(negate_s8(weights_x_low), top_right);   \
      const uint16x4_t weights_x_high =                                        \
          vld1_u16(smooth_weights_u16 + (W) + x);                              \
      weighted_tr_high[i] = vmull_n_u16(negate_s8(weights_x_high), top_right); \
    }                                                                          \
                                                                               \
    const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);                  \
    for (int y = 0; y < height; ++y) {                                         \
      const uint32x4_t weighted_bl =                                           \
          vmull_n_u16(bottom_left_v, 256 - weights_y[y]);                      \
      uint16_t *dst_x = dst;                                                   \
      for (int i = 0; i < (W) >> 3; ++i) {                                     \
        const int x = i << 3;                                                  \
        const uint16x4x2_t top_vals = { { vld1_u16(top_row + x),               \
                                          vld1_u16(top_row + x + 4) } };       \
        const uint32x4_t weighted_corners_low =                                \
            vaddq_u32(weighted_bl, weighted_tr_low[i]);                        \
        const uint32x4_t weighted_corners_high =                               \
            vaddq_u32(weighted_bl, weighted_tr_high[i]);                       \
        /* Accumulate weighted edge values and store. */                       \
        const uint16x4x2_t weights_x = {                                       \
          { vld1_u16(smooth_weights_u16 + (W) - 4 + x),                        \
            vld1_u16(smooth_weights_u16 + (W) + x) }                           \
        };                                                                     \
        highbd_calculate_pred8(dst_x, weighted_corners_low,                    \
                               weighted_corners_high, top_vals, weights_x,     \
                               left_column[y], weights_y[y]);                  \
        dst_x += 8;                                                            \
      }                                                                        \
      dst += stride;                                                           \
    }                                                                          \
  }

HIGHBD_SMOOTH_PREDICTOR(16)
HIGHBD_SMOOTH_PREDICTOR(32)
HIGHBD_SMOOTH_PREDICTOR(64)

#undef HIGHBD_SMOOTH_PREDICTOR
   1104 
// Generates aom_highbd_smooth_predictor_<W>x<H>_neon for widths >= 16 by
// delegating to the matching highbd_smooth_<W>xh_neon helper. `bd` is
// unused.
#define HIGHBD_SMOOTH_NXM_WIDE(W, H)                            \
  void aom_highbd_smooth_predictor_##W##x##H##_neon(            \
      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \
      const uint16_t *left, int bd) {                           \
    (void)bd;                                                   \
    highbd_smooth_##W##xh_neon(dst, y_stride, above, left, H);  \
  }

// The #else branch provides the reduced set of block shapes for
// realtime-only builds without the decoder.
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
HIGHBD_SMOOTH_NXM_WIDE(16, 4)
HIGHBD_SMOOTH_NXM_WIDE(16, 8)
HIGHBD_SMOOTH_NXM_WIDE(16, 16)
HIGHBD_SMOOTH_NXM_WIDE(16, 32)
HIGHBD_SMOOTH_NXM_WIDE(16, 64)
HIGHBD_SMOOTH_NXM_WIDE(32, 8)
HIGHBD_SMOOTH_NXM_WIDE(32, 16)
HIGHBD_SMOOTH_NXM_WIDE(32, 32)
HIGHBD_SMOOTH_NXM_WIDE(32, 64)
HIGHBD_SMOOTH_NXM_WIDE(64, 16)
HIGHBD_SMOOTH_NXM_WIDE(64, 32)
HIGHBD_SMOOTH_NXM_WIDE(64, 64)
#else
HIGHBD_SMOOTH_NXM_WIDE(16, 8)
HIGHBD_SMOOTH_NXM_WIDE(16, 16)
HIGHBD_SMOOTH_NXM_WIDE(16, 32)
HIGHBD_SMOOTH_NXM_WIDE(32, 16)
HIGHBD_SMOOTH_NXM_WIDE(32, 32)
HIGHBD_SMOOTH_NXM_WIDE(32, 64)
HIGHBD_SMOOTH_NXM_WIDE(64, 32)
HIGHBD_SMOOTH_NXM_WIDE(64, 64)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#undef HIGHBD_SMOOTH_NXM_WIDE
   1138 
   1139 static void highbd_smooth_v_4xh_neon(uint16_t *dst, ptrdiff_t stride,
   1140                                     const uint16_t *const top_row,
   1141                                     const uint16_t *const left_column,
   1142                                     const int height) {
   1143  const uint16_t bottom_left = left_column[height - 1];
   1144  const uint16_t *const weights_y = smooth_weights_u16 + height - 4;
   1145 
   1146  const uint16x4_t top_v = vld1_u16(top_row);
   1147  const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
   1148 
   1149  for (int y = 0; y < height; ++y) {
   1150    const uint32x4_t weighted_bl =
   1151        vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
   1152    const uint32x4_t weighted_top =
   1153        vmlal_n_u16(weighted_bl, top_v, weights_y[y]);
   1154    vst1_u16(dst, vrshrn_n_u32(weighted_top, SMOOTH_WEIGHT_LOG2_SCALE));
   1155 
   1156    dst += stride;
   1157  }
   1158 }
   1159 
   1160 static void highbd_smooth_v_8xh_neon(uint16_t *dst, const ptrdiff_t stride,
   1161                                     const uint16_t *const top_row,
   1162                                     const uint16_t *const left_column,
   1163                                     const int height) {
   1164  const uint16_t bottom_left = left_column[height - 1];
   1165  const uint16_t *const weights_y = smooth_weights_u16 + height - 4;
   1166 
   1167  const uint16x4_t top_low = vld1_u16(top_row);
   1168  const uint16x4_t top_high = vld1_u16(top_row + 4);
   1169  const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
   1170 
   1171  for (int y = 0; y < height; ++y) {
   1172    const uint32x4_t weighted_bl =
   1173        vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
   1174 
   1175    const uint32x4_t weighted_top_low =
   1176        vmlal_n_u16(weighted_bl, top_low, weights_y[y]);
   1177    vst1_u16(dst, vrshrn_n_u32(weighted_top_low, SMOOTH_WEIGHT_LOG2_SCALE));
   1178 
   1179    const uint32x4_t weighted_top_high =
   1180        vmlal_n_u16(weighted_bl, top_high, weights_y[y]);
   1181    vst1_u16(dst + 4,
   1182             vrshrn_n_u32(weighted_top_high, SMOOTH_WEIGHT_LOG2_SCALE));
   1183    dst += stride;
   1184  }
   1185 }
   1186 
// Generates aom_highbd_smooth_v_predictor_<W>x<H>_neon for 4- and 8-wide
// blocks by delegating to the matching highbd_smooth_v_<W>xh_neon helper.
// `bd` is unused.
#define HIGHBD_SMOOTH_V_NXM(W, H)                                \
  void aom_highbd_smooth_v_predictor_##W##x##H##_neon(           \
      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above,  \
      const uint16_t *left, int bd) {                            \
    (void)bd;                                                    \
    highbd_smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \
  }

// The #else branch provides the reduced set of block shapes for
// realtime-only builds without the decoder.
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
HIGHBD_SMOOTH_V_NXM(4, 4)
HIGHBD_SMOOTH_V_NXM(4, 8)
HIGHBD_SMOOTH_V_NXM(4, 16)
HIGHBD_SMOOTH_V_NXM(8, 4)
HIGHBD_SMOOTH_V_NXM(8, 8)
HIGHBD_SMOOTH_V_NXM(8, 16)
HIGHBD_SMOOTH_V_NXM(8, 32)
#else
HIGHBD_SMOOTH_V_NXM(4, 4)
HIGHBD_SMOOTH_V_NXM(4, 8)
HIGHBD_SMOOTH_V_NXM(8, 4)
HIGHBD_SMOOTH_V_NXM(8, 8)
HIGHBD_SMOOTH_V_NXM(8, 16)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#undef HIGHBD_SMOOTH_V_NXM
   1212 
// For width 16 and above.
// Generates highbd_smooth_v_<W>xh_neon: vertical smooth prediction for wide
// blocks. The top row is preloaded into d-register pairs; each row blends it
// with the weighted bottom-left corner and round-shifts to the final pixels.
#define HIGHBD_SMOOTH_V_PREDICTOR(W)                                         \
  static void highbd_smooth_v_##W##xh_neon(                                  \
      uint16_t *dst, const ptrdiff_t stride, const uint16_t *const top_row,  \
      const uint16_t *const left_column, const int height) {                 \
    const uint16_t bottom_left = left_column[height - 1];                    \
    const uint16_t *const weights_y = smooth_weights_u16 + height - 4;       \
                                                                             \
    uint16x4x2_t top_vals[(W) >> 3];                                         \
    for (int i = 0; i < (W) >> 3; ++i) {                                     \
      const int x = i << 3;                                                  \
      top_vals[i].val[0] = vld1_u16(top_row + x);                            \
      top_vals[i].val[1] = vld1_u16(top_row + x + 4);                        \
    }                                                                        \
                                                                             \
    const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);                \
    for (int y = 0; y < height; ++y) {                                       \
      const uint32x4_t weighted_bl =                                         \
          vmull_n_u16(bottom_left_v, 256 - weights_y[y]);                    \
                                                                             \
      uint16_t *dst_x = dst;                                                 \
      for (int i = 0; i < (W) >> 3; ++i) {                                   \
        const uint32x4_t weighted_top_low =                                  \
            vmlal_n_u16(weighted_bl, top_vals[i].val[0], weights_y[y]);      \
        vst1_u16(dst_x,                                                      \
                 vrshrn_n_u32(weighted_top_low, SMOOTH_WEIGHT_LOG2_SCALE));  \
                                                                             \
        const uint32x4_t weighted_top_high =                                 \
            vmlal_n_u16(weighted_bl, top_vals[i].val[1], weights_y[y]);      \
        vst1_u16(dst_x + 4,                                                  \
                 vrshrn_n_u32(weighted_top_high, SMOOTH_WEIGHT_LOG2_SCALE)); \
        dst_x += 8;                                                          \
      }                                                                      \
      dst += stride;                                                         \
    }                                                                        \
  }

HIGHBD_SMOOTH_V_PREDICTOR(16)
HIGHBD_SMOOTH_V_PREDICTOR(32)
HIGHBD_SMOOTH_V_PREDICTOR(64)

#undef HIGHBD_SMOOTH_V_PREDICTOR
   1255 
// Generates aom_highbd_smooth_v_predictor_<W>x<H>_neon for widths >= 16 by
// delegating to the matching highbd_smooth_v_<W>xh_neon helper. `bd` is
// unused.
#define HIGHBD_SMOOTH_V_NXM_WIDE(W, H)                           \
  void aom_highbd_smooth_v_predictor_##W##x##H##_neon(           \
      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above,  \
      const uint16_t *left, int bd) {                            \
    (void)bd;                                                    \
    highbd_smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \
  }

// The #else branch provides the reduced set of block shapes for
// realtime-only builds without the decoder.
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
HIGHBD_SMOOTH_V_NXM_WIDE(16, 4)
HIGHBD_SMOOTH_V_NXM_WIDE(16, 8)
HIGHBD_SMOOTH_V_NXM_WIDE(16, 16)
HIGHBD_SMOOTH_V_NXM_WIDE(16, 32)
HIGHBD_SMOOTH_V_NXM_WIDE(16, 64)
HIGHBD_SMOOTH_V_NXM_WIDE(32, 8)
HIGHBD_SMOOTH_V_NXM_WIDE(32, 16)
HIGHBD_SMOOTH_V_NXM_WIDE(32, 32)
HIGHBD_SMOOTH_V_NXM_WIDE(32, 64)
HIGHBD_SMOOTH_V_NXM_WIDE(64, 16)
HIGHBD_SMOOTH_V_NXM_WIDE(64, 32)
HIGHBD_SMOOTH_V_NXM_WIDE(64, 64)
#else
HIGHBD_SMOOTH_V_NXM_WIDE(16, 8)
HIGHBD_SMOOTH_V_NXM_WIDE(16, 16)
HIGHBD_SMOOTH_V_NXM_WIDE(16, 32)
HIGHBD_SMOOTH_V_NXM_WIDE(32, 16)
HIGHBD_SMOOTH_V_NXM_WIDE(32, 32)
HIGHBD_SMOOTH_V_NXM_WIDE(32, 64)
HIGHBD_SMOOTH_V_NXM_WIDE(64, 32)
HIGHBD_SMOOTH_V_NXM_WIDE(64, 64)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#undef HIGHBD_SMOOTH_V_NXM_WIDE
   1289 
   1290 static inline void highbd_smooth_h_4xh_neon(uint16_t *dst, ptrdiff_t stride,
   1291                                            const uint16_t *const top_row,
   1292                                            const uint16_t *const left_column,
   1293                                            const int height) {
   1294  const uint16_t top_right = top_row[3];
   1295 
   1296  const uint16x4_t weights_x = vld1_u16(smooth_weights_u16);
   1297  const uint16x4_t scaled_weights_x = negate_s8(weights_x);
   1298 
   1299  const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right);
   1300  for (int y = 0; y < height; ++y) {
   1301    const uint32x4_t weighted_left =
   1302        vmlal_n_u16(weighted_tr, weights_x, left_column[y]);
   1303    vst1_u16(dst, vrshrn_n_u32(weighted_left, SMOOTH_WEIGHT_LOG2_SCALE));
   1304    dst += stride;
   1305  }
   1306 }
   1307 
   1308 static inline void highbd_smooth_h_8xh_neon(uint16_t *dst, ptrdiff_t stride,
   1309                                            const uint16_t *const top_row,
   1310                                            const uint16_t *const left_column,
   1311                                            const int height) {
   1312  const uint16_t top_right = top_row[7];
   1313 
   1314  const uint16x4x2_t weights_x = { { vld1_u16(smooth_weights_u16 + 4),
   1315                                     vld1_u16(smooth_weights_u16 + 8) } };
   1316 
   1317  const uint32x4_t weighted_tr_low =
   1318      vmull_n_u16(negate_s8(weights_x.val[0]), top_right);
   1319  const uint32x4_t weighted_tr_high =
   1320      vmull_n_u16(negate_s8(weights_x.val[1]), top_right);
   1321 
   1322  for (int y = 0; y < height; ++y) {
   1323    const uint16_t left_y = left_column[y];
   1324    const uint32x4_t weighted_left_low =
   1325        vmlal_n_u16(weighted_tr_low, weights_x.val[0], left_y);
   1326    vst1_u16(dst, vrshrn_n_u32(weighted_left_low, SMOOTH_WEIGHT_LOG2_SCALE));
   1327 
   1328    const uint32x4_t weighted_left_high =
   1329        vmlal_n_u16(weighted_tr_high, weights_x.val[1], left_y);
   1330    vst1_u16(dst + 4,
   1331             vrshrn_n_u32(weighted_left_high, SMOOTH_WEIGHT_LOG2_SCALE));
   1332    dst += stride;
   1333  }
   1334 }
   1335 
// Generates aom_highbd_smooth_h_predictor_<W>x<H>_neon for 4- and 8-wide
// blocks by delegating to the matching highbd_smooth_h_<W>xh_neon helper.
// `bd` is unused.
#define HIGHBD_SMOOTH_H_NXM(W, H)                                \
  void aom_highbd_smooth_h_predictor_##W##x##H##_neon(           \
      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above,  \
      const uint16_t *left, int bd) {                            \
    (void)bd;                                                    \
    highbd_smooth_h_##W##xh_neon(dst, y_stride, above, left, H); \
  }

// The #else branch provides the reduced set of block shapes for
// realtime-only builds without the decoder.
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
HIGHBD_SMOOTH_H_NXM(4, 4)
HIGHBD_SMOOTH_H_NXM(4, 8)
HIGHBD_SMOOTH_H_NXM(4, 16)
HIGHBD_SMOOTH_H_NXM(8, 4)
HIGHBD_SMOOTH_H_NXM(8, 8)
HIGHBD_SMOOTH_H_NXM(8, 16)
HIGHBD_SMOOTH_H_NXM(8, 32)
#else
HIGHBD_SMOOTH_H_NXM(4, 4)
HIGHBD_SMOOTH_H_NXM(4, 8)
HIGHBD_SMOOTH_H_NXM(8, 4)
HIGHBD_SMOOTH_H_NXM(8, 8)
HIGHBD_SMOOTH_H_NXM(8, 16)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#undef HIGHBD_SMOOTH_H_NXM
   1361 
   1362 // For width 16 and above.
// Generates highbd_smooth_h_Wxh_neon() for W in {16, 32, 64}. The row of
// per-column weights is processed 8 lanes at a time (two uint16x4 halves per
// iteration); the top-right contribution for each 8-lane group is hoisted out
// of the row loop since it does not depend on y.
#define HIGHBD_SMOOTH_H_PREDICTOR(W)                                          \
 static void highbd_smooth_h_##W##xh_neon(                                   \
     uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row,         \
     const uint16_t *const left_column, const int height) {                  \
   const uint16_t top_right = top_row[(W) - 1];                              \
                                                                             \
   uint16x4_t weights_x_low[(W) >> 3];                                       \
   uint16x4_t weights_x_high[(W) >> 3];                                      \
   uint32x4_t weighted_tr_low[(W) >> 3];                                     \
   uint32x4_t weighted_tr_high[(W) >> 3];                                    \
   for (int i = 0; i < (W) >> 3; ++i) {                                      \
     const int x = i << 3;                                                   \
     weights_x_low[i] = vld1_u16(smooth_weights_u16 + (W) - 4 + x);          \
     weighted_tr_low[i] =                                                    \
         vmull_n_u16(negate_s8(weights_x_low[i]), top_right);                \
     weights_x_high[i] = vld1_u16(smooth_weights_u16 + (W) + x);             \
     weighted_tr_high[i] =                                                   \
         vmull_n_u16(negate_s8(weights_x_high[i]), top_right);               \
   }                                                                         \
                                                                             \
   for (int y = 0; y < height; ++y) {                                        \
     uint16_t *dst_x = dst;                                                  \
     const uint16_t left_y = left_column[y];                                 \
     for (int i = 0; i < (W) >> 3; ++i) {                                    \
       const uint32x4_t weighted_left_low =                                  \
           vmlal_n_u16(weighted_tr_low[i], weights_x_low[i], left_y);        \
       vst1_u16(dst_x,                                                       \
                vrshrn_n_u32(weighted_left_low, SMOOTH_WEIGHT_LOG2_SCALE));  \
                                                                             \
       const uint32x4_t weighted_left_high =                                 \
           vmlal_n_u16(weighted_tr_high[i], weights_x_high[i], left_y);      \
       vst1_u16(dst_x + 4,                                                   \
                vrshrn_n_u32(weighted_left_high, SMOOTH_WEIGHT_LOG2_SCALE)); \
       dst_x += 8;                                                           \
     }                                                                       \
     dst += stride;                                                          \
   }                                                                         \
 }

HIGHBD_SMOOTH_H_PREDICTOR(16)
HIGHBD_SMOOTH_H_PREDICTOR(32)
HIGHBD_SMOOTH_H_PREDICTOR(64)

#undef HIGHBD_SMOOTH_H_PREDICTOR
   1407 
// Same wrapper pattern as HIGHBD_SMOOTH_H_NXM, emitting the public entry
// points for wide blocks (W >= 16).
#define HIGHBD_SMOOTH_H_NXM_WIDE(W, H)                           \
 void aom_highbd_smooth_h_predictor_##W##x##H##_neon(           \
     uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above,  \
     const uint16_t *left, int bd) {                            \
   (void)bd;                                                    \
   highbd_smooth_h_##W##xh_neon(dst, y_stride, above, left, H); \
 }

// Extreme aspect-ratio sizes are only needed outside realtime-only encoder
// builds (or when the decoder is present).
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
HIGHBD_SMOOTH_H_NXM_WIDE(16, 4)
HIGHBD_SMOOTH_H_NXM_WIDE(16, 8)
HIGHBD_SMOOTH_H_NXM_WIDE(16, 16)
HIGHBD_SMOOTH_H_NXM_WIDE(16, 32)
HIGHBD_SMOOTH_H_NXM_WIDE(16, 64)
HIGHBD_SMOOTH_H_NXM_WIDE(32, 8)
HIGHBD_SMOOTH_H_NXM_WIDE(32, 16)
HIGHBD_SMOOTH_H_NXM_WIDE(32, 32)
HIGHBD_SMOOTH_H_NXM_WIDE(32, 64)
HIGHBD_SMOOTH_H_NXM_WIDE(64, 16)
HIGHBD_SMOOTH_H_NXM_WIDE(64, 32)
HIGHBD_SMOOTH_H_NXM_WIDE(64, 64)
#else
HIGHBD_SMOOTH_H_NXM_WIDE(16, 8)
HIGHBD_SMOOTH_H_NXM_WIDE(16, 16)
HIGHBD_SMOOTH_H_NXM_WIDE(16, 32)
HIGHBD_SMOOTH_H_NXM_WIDE(32, 16)
HIGHBD_SMOOTH_H_NXM_WIDE(32, 32)
HIGHBD_SMOOTH_H_NXM_WIDE(32, 64)
HIGHBD_SMOOTH_H_NXM_WIDE(64, 32)
HIGHBD_SMOOTH_H_NXM_WIDE(64, 64)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#undef HIGHBD_SMOOTH_H_NXM_WIDE
   1441 
   1442 // -----------------------------------------------------------------------------
   1443 // Z1
   1444 
// Lane index sequences (step 1 and step 2) used by the directional predictors
// to compute per-lane sample positions and edge comparisons.
static const int16_t iota1_s16[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8 };
static const int16_t iota2_s16[] = { 0, 2, 4, 6, 8, 10, 12, 14 };
   1447 
   1448 static AOM_FORCE_INLINE uint16x4_t highbd_dr_z1_apply_shift_x4(uint16x4_t a0,
   1449                                                               uint16x4_t a1,
   1450                                                               int shift) {
   1451  // The C implementation of the z1 predictor uses (32 - shift) and a right
   1452  // shift by 5, however we instead double shift to avoid an unnecessary right
   1453  // shift by 1.
   1454  uint32x4_t res = vmull_n_u16(a1, shift);
   1455  res = vmlal_n_u16(res, a0, 64 - shift);
   1456  return vrshrn_n_u32(res, 6);
   1457 }
   1458 
   1459 static AOM_FORCE_INLINE uint16x8_t highbd_dr_z1_apply_shift_x8(uint16x8_t a0,
   1460                                                               uint16x8_t a1,
   1461                                                               int shift) {
   1462  return vcombine_u16(
   1463      highbd_dr_z1_apply_shift_x4(vget_low_u16(a0), vget_low_u16(a1), shift),
   1464      highbd_dr_z1_apply_shift_x4(vget_high_u16(a0), vget_high_u16(a1), shift));
   1465 }
   1466 
// Byte-shuffle rows for clamped loads near the top of the `above` array.
// Row k (k = number of still-valid u16 lanes when loading from
// `above[max_base_x - 7]`) keeps the last k valid 16-bit lanes in order and
// fills the remaining lanes with bytes 14,15 — i.e. the final lane, which
// holds above[max_base_x] — so out-of-range lanes replicate the max sample.
// clang-format off
static const uint8_t kLoadMaxShuffles[] = {
 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
  8,  9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15,
  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15,
  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 14, 15,
  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
};
// clang-format on
   1479 
// Loads 8 u16 samples from `ptr` and permutes them with row `shuffle_idx` of
// kLoadMaxShuffles, so lanes beyond the valid range replicate the last lane.
// On AArch64 this is a single tbl; on AArch32 it falls back to two vtbl2
// lookups over the vector halves.
static inline uint16x8_t zn_load_masked_neon(const uint16_t *ptr,
                                            int shuffle_idx) {
 uint8x16_t shuffle = vld1q_u8(&kLoadMaxShuffles[16 * shuffle_idx]);
 uint8x16_t src = vreinterpretq_u8_u16(vld1q_u16(ptr));
#if AOM_ARCH_AARCH64
 return vreinterpretq_u16_u8(vqtbl1q_u8(src, shuffle));
#else
 uint8x8x2_t src2 = { { vget_low_u8(src), vget_high_u8(src) } };
 uint8x8_t lo = vtbl2_u8(src2, vget_low_u8(shuffle));
 uint8x8_t hi = vtbl2_u8(src2, vget_high_u8(shuffle));
 return vreinterpretq_u16_u8(vcombine_u8(lo, hi));
#endif
}
   1493 
// Z1 (0 < angle < 90) prediction without edge upsampling. Each row advances
// the sample position along `above` by dx (in 1/64-pel units) and linearly
// interpolates between the two adjacent samples; positions at or beyond
// max_base_x are clamped to above[max_base_x].
static void highbd_dr_prediction_z1_upsample0_neon(uint16_t *dst,
                                                  ptrdiff_t stride, int bw,
                                                  int bh,
                                                  const uint16_t *above,
                                                  int dx) {
 assert(bw % 4 == 0);
 assert(bh % 4 == 0);
 assert(dx > 0);

 const int max_base_x = (bw + bh) - 1;
 const int above_max = above[max_base_x];

 const int16x8_t iota1x8 = vld1q_s16(iota1_s16);
 const int16x4_t iota1x4 = vget_low_s16(iota1x8);

 int x = dx;
 int r = 0;
 do {
   const int base = x >> 6;
   if (base >= max_base_x) {
     // All remaining rows sample past the edge: fill them with the clamped
     // maximum value and finish early.
     for (int i = r; i < bh; ++i) {
       aom_memset16(dst, above_max, bw);
       dst += stride;
     }
     return;
   }

   // The C implementation of the z1 predictor when not upsampling uses:
   // ((x & 0x3f) >> 1)
   // The right shift is unnecessary here since we instead shift by +1 later,
   // so adjust the mask to 0x3e to ensure we don't consider the extra bit.
   const int shift = x & 0x3e;

   if (bw == 4) {
     const uint16x4_t a0 = vld1_u16(&above[base]);
     const uint16x4_t a1 = vld1_u16(&above[base + 1]);
     const uint16x4_t val = highbd_dr_z1_apply_shift_x4(a0, a1, shift);
     // Lanes whose position reaches max_base_x are replaced by above_max.
     const uint16x4_t cmp = vcgt_s16(vdup_n_s16(max_base_x - base), iota1x4);
     const uint16x4_t res = vbsl_u16(cmp, val, vdup_n_u16(above_max));
     vst1_u16(dst, res);
   } else {
     int c = 0;
     do {
       uint16x8_t a0;
       uint16x8_t a1;
       if (base + c >= max_base_x) {
         // Entirely past the edge: both sources are the clamped max.
         a0 = a1 = vdupq_n_u16(above_max);
       } else {
         // Near the edge, use a shuffled load so lanes past max_base_x
         // replicate above[max_base_x] instead of reading out of range.
         if (base + c + 7 >= max_base_x) {
           int shuffle_idx = max_base_x - base - c;
           a0 = zn_load_masked_neon(above + (max_base_x - 7), shuffle_idx);
         } else {
           a0 = vld1q_u16(above + base + c);
         }
         if (base + c + 8 >= max_base_x) {
           int shuffle_idx = max_base_x - base - c - 1;
           a1 = zn_load_masked_neon(above + (max_base_x - 7), shuffle_idx);
         } else {
           a1 = vld1q_u16(above + base + c + 1);
         }
       }

       vst1q_u16(dst + c, highbd_dr_z1_apply_shift_x8(a0, a1, shift));
       c += 8;
     } while (c < bw);
   }

   dst += stride;
   x += dx;
 } while (++r < bh);
}
   1565 
// Z1 prediction with the `above` edge upsampled 2x: sample positions are in
// 1/32-pel units (base = x >> 5) and adjacent sample pairs are de-interleaved
// with vld2, since consecutive prediction lanes are two elements apart in the
// upsampled array.
static void highbd_dr_prediction_z1_upsample1_neon(uint16_t *dst,
                                                  ptrdiff_t stride, int bw,
                                                  int bh,
                                                  const uint16_t *above,
                                                  int dx) {
 assert(bw % 4 == 0);
 assert(bh % 4 == 0);
 assert(dx > 0);

 const int max_base_x = ((bw + bh) - 1) << 1;
 const int above_max = above[max_base_x];

 const int16x8_t iota2x8 = vld1q_s16(iota2_s16);
 const int16x4_t iota2x4 = vget_low_s16(iota2x8);

 int x = dx;
 int r = 0;
 do {
   const int base = x >> 5;
   if (base >= max_base_x) {
     // All remaining rows sample past the edge: fill with the clamped
     // maximum value and finish early.
     for (int i = r; i < bh; ++i) {
       aom_memset16(dst, above_max, bw);
       dst += stride;
     }
     return;
   }

   // The C implementation of the z1 predictor when upsampling uses:
   // (((x << 1) & 0x3f) >> 1)
   // The right shift is unnecessary here since we instead shift by +1 later,
   // so adjust the mask to 0x3e to ensure we don't consider the extra bit.
   const int shift = (x << 1) & 0x3e;

   if (bw == 4) {
     // De-interleave even/odd elements: val[0] holds samples, val[1] their
     // right neighbours.
     const uint16x4x2_t a01 = vld2_u16(&above[base]);
     const uint16x4_t val =
         highbd_dr_z1_apply_shift_x4(a01.val[0], a01.val[1], shift);
     const uint16x4_t cmp = vcgt_s16(vdup_n_s16(max_base_x - base), iota2x4);
     const uint16x4_t res = vbsl_u16(cmp, val, vdup_n_u16(above_max));
     vst1_u16(dst, res);
   } else {
     int c = 0;
     do {
       const uint16x8x2_t a01 = vld2q_u16(&above[base + 2 * c]);
       const uint16x8_t val =
           highbd_dr_z1_apply_shift_x8(a01.val[0], a01.val[1], shift);
       // Clamp lanes whose (doubled) position reaches max_base_x.
       const uint16x8_t cmp =
           vcgtq_s16(vdupq_n_s16(max_base_x - base - 2 * c), iota2x8);
       const uint16x8_t res = vbslq_u16(cmp, val, vdupq_n_u16(above_max));
       vst1q_u16(dst + c, res);
       c += 8;
     } while (c < bw);
   }

   dst += stride;
   x += dx;
 } while (++r < bh);
}
   1624 
   1625 // Directional prediction, zone 1: 0 < angle < 90
   1626 void av1_highbd_dr_prediction_z1_neon(uint16_t *dst, ptrdiff_t stride, int bw,
   1627                                      int bh, const uint16_t *above,
   1628                                      const uint16_t *left, int upsample_above,
   1629                                      int dx, int dy, int bd) {
   1630  (void)left;
   1631  (void)dy;
   1632  (void)bd;
   1633  assert(dy == 1);
   1634 
   1635  if (upsample_above) {
   1636    highbd_dr_prediction_z1_upsample1_neon(dst, stride, bw, bh, above, dx);
   1637  } else {
   1638    highbd_dr_prediction_z1_upsample0_neon(dst, stride, bw, bh, above, dx);
   1639  }
   1640 }
   1641 
   1642 // -----------------------------------------------------------------------------
   1643 // Z2
   1644 
#if AOM_ARCH_AARCH64
// Incrementally shift more elements from `above` into the result, merging with
// existing `left` elements.
// X0, X1, X2, X3
// Y0, X0, X1, X2
// Y0, Y1, X0, X1
// Y0, Y1, Y2, X0
// Y0, Y1, Y2, Y3
// Byte indices for a vtbl2 over { out_y, out_x }: bytes 0-7 address out_y,
// bytes 8-15 address out_x. Row r keeps the first r lanes of out_y.
// clang-format off
static const uint8_t z2_merge_shuffles_u16x4[5][8] = {
 {  8,  9, 10, 11, 12, 13, 14, 15 },
 {  0,  1,  8,  9, 10, 11, 12, 13 },
 {  0,  1,  2,  3,  8,  9, 10, 11 },
 {  0,  1,  2,  3,  4,  5,  8,  9 },
 {  0,  1,  2,  3,  4,  5,  6,  7 },
};
// clang-format on

// Incrementally shift more elements from `above` into the result, merging with
// existing `left` elements.
// X0, X1, X2, X3, X4, X5, X6, X7
// Y0, X0, X1, X2, X3, X4, X5, X6
// Y0, Y1, X0, X1, X2, X3, X4, X5
// Y0, Y1, Y2, X0, X1, X2, X3, X4
// Y0, Y1, Y2, Y3, X0, X1, X2, X3
// Y0, Y1, Y2, Y3, Y4, X0, X1, X2
// Y0, Y1, Y2, Y3, Y4, Y5, X0, X1
// Y0, Y1, Y2, Y3, Y4, Y5, Y6, X0
// Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7
// 128-bit variant: bytes 0-15 address out_y, bytes 16-31 address out_x.
// clang-format off
static const uint8_t z2_merge_shuffles_u16x8[9][16] = {
 { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 },
 {  0,  1, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 },
 {  0,  1,  2,  3, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 },
 {  0,  1,  2,  3,  4,  5, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25 },
 {  0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23 },
 {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 16, 17, 18, 19, 20, 21 },
 {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 16, 17, 18, 19 },
 {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 16, 17 },
 {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
};
// clang-format on

// z2_y_iter_masks_u16x4[n] keeps the first n lanes and zeroes the rest.
// clang-format off
static const uint16_t z2_y_iter_masks_u16x4[5][4] = {
 {      0U,      0U,      0U,      0U },
 { 0xffffU,      0U,      0U,      0U },
 { 0xffffU, 0xffffU,      0U,      0U },
 { 0xffffU, 0xffffU, 0xffffU,      0U },
 { 0xffffU, 0xffffU, 0xffffU, 0xffffU },
};
// clang-format on

// z2_y_iter_masks_u16x8[n] keeps the first n lanes and zeroes the rest.
// clang-format off
static const uint16_t z2_y_iter_masks_u16x8[9][8] = {
 {      0U,      0U,      0U,      0U,      0U,      0U,      0U,      0U },
 { 0xffffU,      0U,      0U,      0U,      0U,      0U,      0U,      0U },
 { 0xffffU, 0xffffU,      0U,      0U,      0U,      0U,      0U,      0U },
 { 0xffffU, 0xffffU, 0xffffU,      0U,      0U,      0U,      0U,      0U },
 { 0xffffU, 0xffffU, 0xffffU, 0xffffU,      0U,      0U,      0U,      0U },
 { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU,      0U,      0U,      0U },
 { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU,      0U,      0U },
 { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU,      0U },
 { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU },
};
// clang-format on
   1711 
// Gathers 4 u16 `left` samples at positions `indices` from the pre-loaded
// 8-lane vector `left_data` (which starts at left[base]); lanes at or beyond
// `n` are zeroed via the iter mask.
static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_tbl_left_x4_from_x8(
   const uint16x8_t left_data, const int16x4_t indices, int base, int n) {
 // Need to adjust indices to operate on 0-based indices rather than
 // `base`-based indices and then adjust from uint16x4 indices to uint8x8
 // indices so we can use a tbl instruction (which only operates on bytes).
 uint8x8_t left_indices =
     vreinterpret_u8_s16(vsub_s16(indices, vdup_n_s16(base)));
 left_indices = vtrn1_u8(left_indices, left_indices);
 left_indices = vadd_u8(left_indices, left_indices);
 left_indices = vadd_u8(left_indices, vreinterpret_u8_u16(vdup_n_u16(0x0100)));
 const uint16x4_t ret = vreinterpret_u16_u8(
     vqtbl1_u8(vreinterpretq_u8_u16(left_data), left_indices));
 return vand_u16(ret, vld1_u16(z2_y_iter_masks_u16x4[n]));
}
   1726 
// As highbd_dr_prediction_z2_tbl_left_x4_from_x8, but sources from a 16-lane
// (two-vector) table, using a two-register tbl lookup.
static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_tbl_left_x4_from_x16(
   const uint16x8x2_t left_data, const int16x4_t indices, int base, int n) {
 // Need to adjust indices to operate on 0-based indices rather than
 // `base`-based indices and then adjust from uint16x4 indices to uint8x8
 // indices so we can use a tbl instruction (which only operates on bytes).
 uint8x8_t left_indices =
     vreinterpret_u8_s16(vsub_s16(indices, vdup_n_s16(base)));
 left_indices = vtrn1_u8(left_indices, left_indices);
 left_indices = vadd_u8(left_indices, left_indices);
 left_indices = vadd_u8(left_indices, vreinterpret_u8_u16(vdup_n_u16(0x0100)));
 uint8x16x2_t data_u8 = { { vreinterpretq_u8_u16(left_data.val[0]),
                            vreinterpretq_u8_u16(left_data.val[1]) } };
 const uint16x4_t ret = vreinterpret_u16_u8(vqtbl2_u8(data_u8, left_indices));
 return vand_u16(ret, vld1_u16(z2_y_iter_masks_u16x4[n]));
}
   1742 
// Gathers 8 u16 `left` samples at positions `indices` from the pre-loaded
// 8-lane vector `left_data` (starting at left[base]); lanes at or beyond `n`
// are zeroed via the iter mask.
static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_tbl_left_x8_from_x8(
   const uint16x8_t left_data, const int16x8_t indices, int base, int n) {
 // Need to adjust indices to operate on 0-based indices rather than
 // `base`-based indices and then adjust from uint16x8 indices to uint8x16
 // indices so we can use a tbl instruction (which only operates on bytes).
 uint8x16_t left_indices =
     vreinterpretq_u8_s16(vsubq_s16(indices, vdupq_n_s16(base)));
 left_indices = vtrn1q_u8(left_indices, left_indices);
 left_indices = vaddq_u8(left_indices, left_indices);
 left_indices =
     vaddq_u8(left_indices, vreinterpretq_u8_u16(vdupq_n_u16(0x0100)));
 const uint16x8_t ret = vreinterpretq_u16_u8(
     vqtbl1q_u8(vreinterpretq_u8_u16(left_data), left_indices));
 return vandq_u16(ret, vld1q_u16(z2_y_iter_masks_u16x8[n]));
}
   1758 
// As highbd_dr_prediction_z2_tbl_left_x8_from_x8, but sources from a 16-lane
// (two-vector) table, using a two-register tbl lookup.
static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_tbl_left_x8_from_x16(
   const uint16x8x2_t left_data, const int16x8_t indices, int base, int n) {
 // Need to adjust indices to operate on 0-based indices rather than
 // `base`-based indices and then adjust from uint16x8 indices to uint8x16
 // indices so we can use a tbl instruction (which only operates on bytes).
 uint8x16_t left_indices =
     vreinterpretq_u8_s16(vsubq_s16(indices, vdupq_n_s16(base)));
 left_indices = vtrn1q_u8(left_indices, left_indices);
 left_indices = vaddq_u8(left_indices, left_indices);
 left_indices =
     vaddq_u8(left_indices, vreinterpretq_u8_u16(vdupq_n_u16(0x0100)));
 uint8x16x2_t data_u8 = { { vreinterpretq_u8_u16(left_data.val[0]),
                            vreinterpretq_u8_u16(left_data.val[1]) } };
 const uint16x8_t ret =
     vreinterpretq_u16_u8(vqtbl2q_u8(data_u8, left_indices));
 return vandq_u16(ret, vld1q_u16(z2_y_iter_masks_u16x8[n]));
}
#endif  // AOM_ARCH_AARCH64
   1777 
// Gathers `n` pairs (left[idx], left[idx + 1]) for the first n lanes of
// `indices`, returning them de-interleaved: val[0] = left[idx],
// val[1] = left[idx + 1]. Lanes >= n are left zero.
static AOM_FORCE_INLINE uint16x4x2_t highbd_dr_prediction_z2_gather_left_x4(
   const uint16_t *left, const int16x4_t indices, int n) {
 assert(n > 0);
 assert(n <= 4);
 // Load two elements at a time and then uzp them into separate vectors, to
 // reduce the number of memory accesses.
 uint32x2_t ret0_u32 = vdup_n_u32(0);
 uint32x2_t ret1_u32 = vdup_n_u32(0);

 // Use a single vget_lane_u64 to minimize vector to general purpose register
 // transfers and then mask off the bits we actually want.
 const uint64_t indices0123 = vget_lane_u64(vreinterpret_u64_s16(indices), 0);
 const int idx0 = (int16_t)((indices0123 >> 0) & 0xffffU);
 const int idx1 = (int16_t)((indices0123 >> 16) & 0xffffU);
 const int idx2 = (int16_t)((indices0123 >> 32) & 0xffffU);
 const int idx3 = (int16_t)((indices0123 >> 48) & 0xffffU);

 // At time of writing both Clang and GCC produced better code with these
 // nested if-statements compared to a switch statement with fallthrough.
 load_unaligned_u32_2x1_lane(ret0_u32, left + idx0, 0);
 if (n > 1) {
   load_unaligned_u32_2x1_lane(ret0_u32, left + idx1, 1);
   if (n > 2) {
     load_unaligned_u32_2x1_lane(ret1_u32, left + idx2, 0);
     if (n > 3) {
       load_unaligned_u32_2x1_lane(ret1_u32, left + idx3, 1);
     }
   }
 }
 // uzp separates the interleaved (left[idx], left[idx+1]) pairs into the
 // two result vectors.
 return vuzp_u16(vreinterpret_u16_u32(ret0_u32),
                 vreinterpret_u16_u32(ret1_u32));
}
   1810 
// 8-lane variant of highbd_dr_prediction_z2_gather_left_x4: gathers `n` pairs
// (left[idx], left[idx + 1]) and returns them de-interleaved; lanes >= n are
// left zero.
static AOM_FORCE_INLINE uint16x8x2_t highbd_dr_prediction_z2_gather_left_x8(
   const uint16_t *left, const int16x8_t indices, int n) {
 assert(n > 0);
 assert(n <= 8);
 // Load two elements at a time and then uzp them into separate vectors, to
 // reduce the number of memory accesses.
 uint32x4_t ret0_u32 = vdupq_n_u32(0);
 uint32x4_t ret1_u32 = vdupq_n_u32(0);

 // Use a pair of vget_lane_u64 to minimize vector to general purpose register
 // transfers and then mask off the bits we actually want.
 const uint64_t indices0123 =
     vgetq_lane_u64(vreinterpretq_u64_s16(indices), 0);
 const uint64_t indices4567 =
     vgetq_lane_u64(vreinterpretq_u64_s16(indices), 1);
 const int idx0 = (int16_t)((indices0123 >> 0) & 0xffffU);
 const int idx1 = (int16_t)((indices0123 >> 16) & 0xffffU);
 const int idx2 = (int16_t)((indices0123 >> 32) & 0xffffU);
 const int idx3 = (int16_t)((indices0123 >> 48) & 0xffffU);
 const int idx4 = (int16_t)((indices4567 >> 0) & 0xffffU);
 const int idx5 = (int16_t)((indices4567 >> 16) & 0xffffU);
 const int idx6 = (int16_t)((indices4567 >> 32) & 0xffffU);
 const int idx7 = (int16_t)((indices4567 >> 48) & 0xffffU);

 // At time of writing both Clang and GCC produced better code with these
 // nested if-statements compared to a switch statement with fallthrough.
 load_unaligned_u32_4x1_lane(ret0_u32, left + idx0, 0);
 if (n > 1) {
   load_unaligned_u32_4x1_lane(ret0_u32, left + idx1, 1);
   if (n > 2) {
     load_unaligned_u32_4x1_lane(ret0_u32, left + idx2, 2);
     if (n > 3) {
       load_unaligned_u32_4x1_lane(ret0_u32, left + idx3, 3);
       if (n > 4) {
         load_unaligned_u32_4x1_lane(ret1_u32, left + idx4, 0);
         if (n > 5) {
           load_unaligned_u32_4x1_lane(ret1_u32, left + idx5, 1);
           if (n > 6) {
             load_unaligned_u32_4x1_lane(ret1_u32, left + idx6, 2);
             if (n > 7) {
               load_unaligned_u32_4x1_lane(ret1_u32, left + idx7, 3);
             }
           }
         }
       }
     }
   }
 }
 // uzp separates the interleaved (left[idx], left[idx+1]) pairs into the
 // two result vectors.
 return vuzpq_u16(vreinterpretq_u16_u32(ret0_u32),
                  vreinterpretq_u16_u32(ret1_u32));
}
   1862 
// Combines the first `base_shift` lanes of out_y (from `left`) with the
// remaining lanes taken from the start of out_x (from `above`).
static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_merge_x4(
   uint16x4_t out_x, uint16x4_t out_y, int base_shift) {
 assert(base_shift >= 0);
 assert(base_shift <= 4);
 // On AArch64 we can permute the data from the `above` and `left` vectors
 // into a single vector in a single load (of the permute vector) + tbl.
#if AOM_ARCH_AARCH64
 const uint8x8x2_t out_yx = { { vreinterpret_u8_u16(out_y),
                                vreinterpret_u8_u16(out_x) } };
 return vreinterpret_u16_u8(
     vtbl2_u8(out_yx, vld1_u8(z2_merge_shuffles_u16x4[base_shift])));
#else
 // Fallback: per-lane copy using GCC/Clang vector element access.
 uint16x4_t out = out_y;
 for (int c2 = base_shift, x_idx = 0; c2 < 4; ++c2, ++x_idx) {
   out[c2] = out_x[x_idx];
 }
 return out;
#endif
}
   1882 
// 8-lane variant of highbd_dr_prediction_z2_merge_x4: first `base_shift`
// lanes come from out_y, the rest from the start of out_x.
static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_merge_x8(
   uint16x8_t out_x, uint16x8_t out_y, int base_shift) {
 assert(base_shift >= 0);
 assert(base_shift <= 8);
 // On AArch64 we can permute the data from the `above` and `left` vectors
 // into a single vector in a single load (of the permute vector) + tbl.
#if AOM_ARCH_AARCH64
 const uint8x16x2_t out_yx = { { vreinterpretq_u8_u16(out_y),
                                 vreinterpretq_u8_u16(out_x) } };
 return vreinterpretq_u16_u8(
     vqtbl2q_u8(out_yx, vld1q_u8(z2_merge_shuffles_u16x8[base_shift])));
#else
 // Fallback: per-lane copy using GCC/Clang vector element access.
 uint16x8_t out = out_y;
 for (int c2 = base_shift, x_idx = 0; c2 < 8; ++c2, ++x_idx) {
   out[c2] = out_x[x_idx];
 }
 return out;
#endif
}
   1902 
   1903 static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_apply_shift_x4(
   1904    uint16x4_t a0, uint16x4_t a1, int16x4_t shift) {
   1905  uint32x4_t res = vmull_u16(a1, vreinterpret_u16_s16(shift));
   1906  res =
   1907      vmlal_u16(res, a0, vsub_u16(vdup_n_u16(32), vreinterpret_u16_s16(shift)));
   1908  return vrshrn_n_u32(res, 5);
   1909 }
   1910 
   1911 static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_apply_shift_x8(
   1912    uint16x8_t a0, uint16x8_t a1, int16x8_t shift) {
   1913  return vcombine_u16(
   1914      highbd_dr_prediction_z2_apply_shift_x4(vget_low_u16(a0), vget_low_u16(a1),
   1915                                             vget_low_s16(shift)),
   1916      highbd_dr_prediction_z2_apply_shift_x4(
   1917          vget_high_u16(a0), vget_high_u16(a1), vget_high_s16(shift)));
   1918 }
   1919 
// Computes one 4-lane group of a z2 prediction row at block position (r, c),
// blending samples from `above` and/or `left` depending on how far the row's
// projection crosses the block's left edge.
static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_step_x4(
   const uint16_t *above, const uint16x4_t above0, const uint16x4_t above1,
   const uint16_t *left, int dx, int dy, int r, int c) {
 const int16x4_t iota = vld1_s16(iota1_s16);

 // Projected positions (1/64-pel) along the above row and left column.
 const int x0 = (c << 6) - (r + 1) * dx;
 const int y0 = (r << 6) - (c + 1) * dy;

 const int16x4_t x0123 = vadd_s16(vdup_n_s16(x0), vshl_n_s16(iota, 6));
 const int16x4_t y0123 = vsub_s16(vdup_n_s16(y0), vmul_n_s16(iota, dy));
 const int16x4_t shift_x0123 =
     vshr_n_s16(vand_s16(x0123, vdup_n_s16(0x3F)), 1);
 const int16x4_t shift_y0123 =
     vshr_n_s16(vand_s16(y0123, vdup_n_s16(0x3F)), 1);
 const int16x4_t base_y0123 = vshr_n_s16(y0123, 6);

 // Number of leading lanes that must be predicted from `left`.
 const int base_shift = ((((r + 1) * dx) - 1) >> 6) - c;

 // Based on the value of `base_shift` there are three possible cases to
 // compute the result:
 // 1) base_shift <= 0: We can load and operate entirely on data from the
 //                     `above` input vector.
 // 2) base_shift < vl: We can load from `above[-1]` and shift
 //                     `vl - base_shift` elements across to the end of the
 //                     vector, then compute the remainder from `left`.
 // 3) base_shift >= vl: We can load and operate entirely on data from the
 //                      `left` input vector.

 if (base_shift <= 0) {
   const int base_x = x0 >> 6;
   const uint16x4_t a0 = vld1_u16(above + base_x);
   const uint16x4_t a1 = vld1_u16(above + base_x + 1);
   return highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123);
 } else if (base_shift < 4) {
   const uint16x4x2_t l01 = highbd_dr_prediction_z2_gather_left_x4(
       left + 1, base_y0123, base_shift);
   const uint16x4_t out16_y = highbd_dr_prediction_z2_apply_shift_x4(
       l01.val[0], l01.val[1], shift_y0123);

   // No need to reload from above in the loop, just use pre-loaded constants.
   const uint16x4_t out16_x =
       highbd_dr_prediction_z2_apply_shift_x4(above0, above1, shift_x0123);

   return highbd_dr_prediction_z2_merge_x4(out16_x, out16_y, base_shift);
 } else {
   const uint16x4x2_t l01 =
       highbd_dr_prediction_z2_gather_left_x4(left + 1, base_y0123, 4);
   return highbd_dr_prediction_z2_apply_shift_x4(l01.val[0], l01.val[1],
                                                 shift_y0123);
 }
}
   1971 
// Compute one 8-wide output vector of the z2 directional predictor at block
// position (r, c), for the no-upsampling case. Each lane is linearly
// interpolated either along the `above` row (X direction) or down the `left`
// column (Y direction), depending on where its projected position falls
// relative to the top-left corner.
//
// `above0`/`above1` are pre-loaded copies of above[-1..] and above[0..] so the
// mixed X/Y case does not have to reload from memory on every step.
static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_step_x8(
    const uint16_t *above, const uint16x8_t above0, const uint16x8_t above1,
    const uint16_t *left, int dx, int dy, int r, int c) {
  const int16x8_t iota = vld1q_s16(iota1_s16);

  // Projected positions in Q6 fixed point: x0 walks left along `above` as r
  // grows, y0 walks up along `left` as c grows.
  const int x0 = (c << 6) - (r + 1) * dx;
  const int y0 = (r << 6) - (c + 1) * dy;

  const int16x8_t x01234567 = vaddq_s16(vdupq_n_s16(x0), vshlq_n_s16(iota, 6));
  const int16x8_t y01234567 = vsubq_s16(vdupq_n_s16(y0), vmulq_n_s16(iota, dy));
  // Interpolation weights: the 6 fractional bits, halved into the 0..31 range
  // expected by the apply_shift helpers.
  const int16x8_t shift_x01234567 =
      vshrq_n_s16(vandq_s16(x01234567, vdupq_n_s16(0x3F)), 1);
  const int16x8_t shift_y01234567 =
      vshrq_n_s16(vandq_s16(y01234567, vdupq_n_s16(0x3F)), 1);
  const int16x8_t base_y01234567 = vshrq_n_s16(y01234567, 6);

  // Number of leading lanes whose projected position falls before the start
  // of `above` and must therefore be predicted from `left` instead.
  const int base_shift = ((((r + 1) * dx) - 1) >> 6) - c;

  // Based on the value of `base_shift` there are three possible cases to
  // compute the result:
  // 1) base_shift <= 0: We can load and operate entirely on data from the
  //                     `above` input vector.
  // 2) base_shift < vl: We can load from `above[-1]` and shift
  //                     `vl - base_shift` elements across to the end of the
  //                     vector, then compute the remainder from `left`.
  // 3) base_shift >= vl: We can load and operate entirely on data from the
  //                      `left` input vector.

  if (base_shift <= 0) {
    const int base_x = x0 >> 6;
    const uint16x8_t a0 = vld1q_u16(above + base_x);
    const uint16x8_t a1 = vld1q_u16(above + base_x + 1);
    return highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567);
  } else if (base_shift < 8) {
    const uint16x8x2_t l01 = highbd_dr_prediction_z2_gather_left_x8(
        left + 1, base_y01234567, base_shift);
    const uint16x8_t out16_y = highbd_dr_prediction_z2_apply_shift_x8(
        l01.val[0], l01.val[1], shift_y01234567);

    // No need to reload from above in the loop, just use pre-loaded constants.
    const uint16x8_t out16_x =
        highbd_dr_prediction_z2_apply_shift_x8(above0, above1, shift_x01234567);

    // Take the first `base_shift` lanes from the Y result, the rest from X.
    return highbd_dr_prediction_z2_merge_x8(out16_x, out16_y, base_shift);
  } else {
    const uint16x8x2_t l01 =
        highbd_dr_prediction_z2_gather_left_x8(left + 1, base_y01234567, 8);
    return highbd_dr_prediction_z2_apply_shift_x8(l01.val[0], l01.val[1],
                                                  shift_y01234567);
  }
}
   2023 
// Left array is accessed from -1 through `bh - 1` inclusive.
// Above array is accessed from -1 through `bw - 1` inclusive.
//
// Generates a z2 predictor for a bw x bh block in the no-upsampling case
// (upsampling only applies to small blocks, which have dedicated functions
// below). `left[-1 .. bh-1]` is copied into a local buffer so the step
// helpers can index it with a uniform +1 offset; `above[-1]` and `above[0]`
// are pre-loaded once and reused by every row's mixed X/Y case.
#define HIGHBD_DR_PREDICTOR_Z2_WXH(bw, bh)                                 \
  static void highbd_dr_prediction_z2_##bw##x##bh##_neon(                  \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,              \
      const uint16_t *left, int upsample_above, int upsample_left, int dx, \
      int dy, int bd) {                                                    \
    (void)bd;                                                              \
    (void)upsample_above;                                                  \
    (void)upsample_left;                                                   \
    assert(!upsample_above);                                               \
    assert(!upsample_left);                                                \
    assert(bw % 4 == 0);                                                   \
    assert(bh % 4 == 0);                                                   \
    assert(dx > 0);                                                        \
    assert(dy > 0);                                                        \
                                                                           \
    uint16_t left_data[bh + 1];                                            \
    memcpy(left_data, left - 1, (bh + 1) * sizeof(uint16_t));              \
                                                                           \
    uint16x8_t a0, a1;                                                     \
    if (bw == 4) {                                                         \
      a0 = vcombine_u16(vld1_u16(above - 1), vdup_n_u16(0));               \
      a1 = vcombine_u16(vld1_u16(above + 0), vdup_n_u16(0));               \
    } else {                                                               \
      a0 = vld1q_u16(above - 1);                                           \
      a1 = vld1q_u16(above + 0);                                           \
    }                                                                      \
                                                                           \
    int r = 0;                                                             \
    do {                                                                   \
      if (bw == 4) {                                                       \
        vst1_u16(dst, highbd_dr_prediction_z2_step_x4(                     \
                          above, vget_low_u16(a0), vget_low_u16(a1),       \
                          left_data, dx, dy, r, 0));                       \
      } else {                                                             \
        int c = 0;                                                         \
        do {                                                               \
          vst1q_u16(dst + c, highbd_dr_prediction_z2_step_x8(              \
                                 above, a0, a1, left_data, dx, dy, r, c)); \
          c += 8;                                                          \
        } while (c < bw);                                                  \
      }                                                                    \
      dst += stride;                                                       \
    } while (++r < bh);                                                    \
  }
   2070 
// Instantiate a dedicated kernel per block size. The realtime-only encoder
// configuration supports a smaller set of partition shapes, so it builds
// fewer instances.
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
HIGHBD_DR_PREDICTOR_Z2_WXH(4, 16)
HIGHBD_DR_PREDICTOR_Z2_WXH(8, 16)
HIGHBD_DR_PREDICTOR_Z2_WXH(8, 32)
HIGHBD_DR_PREDICTOR_Z2_WXH(16, 4)
HIGHBD_DR_PREDICTOR_Z2_WXH(16, 8)
HIGHBD_DR_PREDICTOR_Z2_WXH(16, 16)
HIGHBD_DR_PREDICTOR_Z2_WXH(16, 32)
HIGHBD_DR_PREDICTOR_Z2_WXH(16, 64)
HIGHBD_DR_PREDICTOR_Z2_WXH(32, 8)
HIGHBD_DR_PREDICTOR_Z2_WXH(32, 16)
HIGHBD_DR_PREDICTOR_Z2_WXH(32, 32)
HIGHBD_DR_PREDICTOR_Z2_WXH(32, 64)
HIGHBD_DR_PREDICTOR_Z2_WXH(64, 16)
HIGHBD_DR_PREDICTOR_Z2_WXH(64, 32)
HIGHBD_DR_PREDICTOR_Z2_WXH(64, 64)
#else
HIGHBD_DR_PREDICTOR_Z2_WXH(8, 16)
HIGHBD_DR_PREDICTOR_Z2_WXH(16, 8)
HIGHBD_DR_PREDICTOR_Z2_WXH(16, 16)
HIGHBD_DR_PREDICTOR_Z2_WXH(16, 32)
HIGHBD_DR_PREDICTOR_Z2_WXH(32, 32)
HIGHBD_DR_PREDICTOR_Z2_WXH(32, 64)
HIGHBD_DR_PREDICTOR_Z2_WXH(64, 32)
HIGHBD_DR_PREDICTOR_Z2_WXH(64, 64)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#undef HIGHBD_DR_PREDICTOR_Z2_WXH
   2099 
// Signature shared by all per-block-size z2 predictor kernels; used by the
// size-indexed dispatch tables below.
typedef void (*highbd_dr_prediction_z2_ptr)(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left,
                                            int upsample_above,
                                            int upsample_left, int dx, int dy,
                                            int bd);
   2106 
// z2 predictor for 4x4 blocks. Unlike the generic macro above, small blocks
// must handle the `upsample_above`/`upsample_left` modes, where the edge
// arrays hold 2x interpolated samples (de-interleaved here with vld2).
static void highbd_dr_prediction_z2_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *above,
                                             const uint16_t *left,
                                             int upsample_above,
                                             int upsample_left, int dx, int dy,
                                             int bd) {
  (void)bd;
  assert(dx > 0);
  assert(dy > 0);

  const int frac_bits_x = 6 - upsample_above;
  const int frac_bits_y = 6 - upsample_left;
  const int min_base_x = -(1 << (upsample_above + frac_bits_x));

  // if `upsample_left` then we need -2 through 6 inclusive from `left`.
  // else we only need -1 through 3 inclusive.

#if AOM_ARCH_AARCH64
  // Pre-load `left` into registers so the per-row gathers can be done with
  // TBL instructions instead of scalar loads (AArch64 only).
  uint16x8_t left_data0, left_data1;
  if (upsample_left) {
    left_data0 = vld1q_u16(left - 2);
    left_data1 = vld1q_u16(left - 1);
  } else {
    left_data0 = vcombine_u16(vld1_u16(left - 1), vdup_n_u16(0));
    left_data1 = vcombine_u16(vld1_u16(left + 0), vdup_n_u16(0));
  }
#endif

  const int16x4_t iota0123 = vld1_s16(iota1_s16);
  const int16x4_t iota1234 = vld1_s16(iota1_s16 + 1);

  for (int r = 0; r < 4; ++r) {
    // Number of leading lanes in this row that must come from `left`.
    const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6;
    const int x0 = (r + 1) * dx;
    const int16x4_t x0123 = vsub_s16(vshl_n_s16(iota0123, 6), vdup_n_s16(x0));
    const int base_x0 = (-x0) >> frac_bits_x;
    if (base_shift <= 0) {
      // Whole row interpolates along `above`.
      uint16x4_t a0, a1;
      int16x4_t shift_x0123;
      if (upsample_above) {
        // Upsampled edge: even/odd samples de-interleave into a0/a1.
        const uint16x4x2_t a01 = vld2_u16(above + base_x0);
        a0 = a01.val[0];
        a1 = a01.val[1];
        shift_x0123 = vand_s16(x0123, vdup_n_s16(0x1F));
      } else {
        a0 = vld1_u16(above + base_x0);
        a1 = vld1_u16(above + base_x0 + 1);
        shift_x0123 = vshr_n_s16(vand_s16(x0123, vdup_n_s16(0x3F)), 1);
      }
      vst1_u16(dst,
               highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123));
    } else if (base_shift < 4) {
      // Mixed row: first `base_shift` lanes from `left`, the rest from
      // `above`.
      // Calculate Y component from `left`.
      const int y_iters = base_shift;
      const int16x4_t y0123 =
          vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy));
      const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y));
      const int16x4_t shift_y0123 = vshr_n_s16(
          vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1);
      uint16x4_t l0, l1;
#if AOM_ARCH_AARCH64
      const int left_data_base = upsample_left ? -2 : -1;
      l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data0, base_y0123,
                                                       left_data_base, y_iters);
      l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data1, base_y0123,
                                                       left_data_base, y_iters);
#else
      const uint16x4x2_t l01 =
          highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, y_iters);
      l0 = l01.val[0];
      l1 = l01.val[1];
#endif

      const uint16x4_t out_y =
          highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123);

      // Calculate X component from `above`.
      const int16x4_t shift_x0123 = vshr_n_s16(
          vand_s16(vmul_n_s16(x0123, 1 << upsample_above), vdup_n_s16(0x3F)),
          1);
      uint16x4_t a0, a1;
      if (upsample_above) {
        // Align the de-interleaving load to an even element of `above`.
        const uint16x4x2_t a01 = vld2_u16(above + (base_x0 % 2 == 0 ? -2 : -1));
        a0 = a01.val[0];
        a1 = a01.val[1];
      } else {
        a0 = vld1_u16(above - 1);
        a1 = vld1_u16(above + 0);
      }
      const uint16x4_t out_x =
          highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123);

      // Combine X and Y vectors.
      const uint16x4_t out =
          highbd_dr_prediction_z2_merge_x4(out_x, out_y, base_shift);
      vst1_u16(dst, out);
    } else {
      // Whole row interpolates along `left`.
      const int16x4_t y0123 =
          vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy));
      const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y));
      const int16x4_t shift_y0123 = vshr_n_s16(
          vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1);
      uint16x4_t l0, l1;
#if AOM_ARCH_AARCH64
      const int left_data_base = upsample_left ? -2 : -1;
      l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data0, base_y0123,
                                                       left_data_base, 4);
      l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data1, base_y0123,
                                                       left_data_base, 4);
#else
      const uint16x4x2_t l01 =
          highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, 4);
      l0 = l01.val[0];
      l1 = l01.val[1];
#endif
      vst1_u16(dst,
               highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123));
    }
    dst += stride;
  }
}
   2228 
// z2 predictor for 4x8 blocks with upsampling support. Same structure as the
// 4x4 variant, but the taller block needs up to 16 `left` samples, hence the
// x16 register pre-load and the _from_x16 TBL helpers.
static void highbd_dr_prediction_z2_4x8_neon(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *above,
                                             const uint16_t *left,
                                             int upsample_above,
                                             int upsample_left, int dx, int dy,
                                             int bd) {
  (void)bd;
  assert(dx > 0);
  assert(dy > 0);

  const int frac_bits_x = 6 - upsample_above;
  const int frac_bits_y = 6 - upsample_left;
  const int min_base_x = -(1 << (upsample_above + frac_bits_x));

  // if `upsample_left` then we need -2 through 14 inclusive from `left`.
  // else we only need -1 through 6 inclusive.

#if AOM_ARCH_AARCH64
  // Pre-load `left` so per-row gathers can use TBL instructions.
  uint16x8x2_t left_data0, left_data1;
  if (upsample_left) {
    left_data0 = vld1q_u16_x2(left - 2);
    left_data1 = vld1q_u16_x2(left - 1);
  } else {
    left_data0 = (uint16x8x2_t){ { vld1q_u16(left - 1), vdupq_n_u16(0) } };
    left_data1 = (uint16x8x2_t){ { vld1q_u16(left + 0), vdupq_n_u16(0) } };
  }
#endif

  const int16x4_t iota0123 = vld1_s16(iota1_s16);
  const int16x4_t iota1234 = vld1_s16(iota1_s16 + 1);

  for (int r = 0; r < 8; ++r) {
    // Number of leading lanes in this row that must come from `left`.
    const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6;
    const int x0 = (r + 1) * dx;
    const int16x4_t x0123 = vsub_s16(vshl_n_s16(iota0123, 6), vdup_n_s16(x0));
    const int base_x0 = (-x0) >> frac_bits_x;
    if (base_shift <= 0) {
      // Whole row interpolates along `above`.
      uint16x4_t a0, a1;
      int16x4_t shift_x0123;
      if (upsample_above) {
        const uint16x4x2_t a01 = vld2_u16(above + base_x0);
        a0 = a01.val[0];
        a1 = a01.val[1];
        shift_x0123 = vand_s16(x0123, vdup_n_s16(0x1F));
      } else {
        a0 = vld1_u16(above + base_x0);
        a1 = vld1_u16(above + base_x0 + 1);
        shift_x0123 = vand_s16(vshr_n_s16(x0123, 1), vdup_n_s16(0x1F));
      }
      vst1_u16(dst,
               highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123));
    } else if (base_shift < 4) {
      // Mixed row: first `base_shift` lanes from `left`, the rest from
      // `above`.
      // Calculate Y component from `left`.
      const int y_iters = base_shift;
      const int16x4_t y0123 =
          vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy));
      const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y));
      const int16x4_t shift_y0123 = vshr_n_s16(
          vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1);

      uint16x4_t l0, l1;
#if AOM_ARCH_AARCH64
      const int left_data_base = upsample_left ? -2 : -1;
      l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x16(
          left_data0, base_y0123, left_data_base, y_iters);
      l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x16(
          left_data1, base_y0123, left_data_base, y_iters);
#else
      const uint16x4x2_t l01 =
          highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, y_iters);
      l0 = l01.val[0];
      l1 = l01.val[1];
#endif

      const uint16x4_t out_y =
          highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123);

      // Calculate X component from `above`.
      uint16x4_t a0, a1;
      int16x4_t shift_x0123;
      if (upsample_above) {
        // Align the de-interleaving load to an even element of `above`.
        const uint16x4x2_t a01 = vld2_u16(above + (base_x0 % 2 == 0 ? -2 : -1));
        a0 = a01.val[0];
        a1 = a01.val[1];
        shift_x0123 = vand_s16(x0123, vdup_n_s16(0x1F));
      } else {
        a0 = vld1_u16(above - 1);
        a1 = vld1_u16(above + 0);
        shift_x0123 = vand_s16(vshr_n_s16(x0123, 1), vdup_n_s16(0x1F));
      }
      const uint16x4_t out_x =
          highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123);

      // Combine X and Y vectors.
      const uint16x4_t out =
          highbd_dr_prediction_z2_merge_x4(out_x, out_y, base_shift);
      vst1_u16(dst, out);
    } else {
      // Whole row interpolates along `left`.
      const int16x4_t y0123 =
          vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy));
      const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y));
      const int16x4_t shift_y0123 = vshr_n_s16(
          vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1);

      uint16x4_t l0, l1;
#if AOM_ARCH_AARCH64
      const int left_data_base = upsample_left ? -2 : -1;
      l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x16(left_data0, base_y0123,
                                                        left_data_base, 4);
      l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x16(left_data1, base_y0123,
                                                        left_data_base, 4);
#else
      const uint16x4x2_t l01 =
          highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, 4);
      l0 = l01.val[0];
      l1 = l01.val[1];
#endif

      vst1_u16(dst,
               highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123));
    }
    dst += stride;
  }
}
   2353 
// z2 predictor for 8x4 blocks with upsampling support. Same structure as the
// 4x4 variant but operating on full 128-bit (8-lane) vectors per row.
static void highbd_dr_prediction_z2_8x4_neon(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *above,
                                             const uint16_t *left,
                                             int upsample_above,
                                             int upsample_left, int dx, int dy,
                                             int bd) {
  (void)bd;
  assert(dx > 0);
  assert(dy > 0);

  const int frac_bits_x = 6 - upsample_above;
  const int frac_bits_y = 6 - upsample_left;
  const int min_base_x = -(1 << (upsample_above + frac_bits_x));

  // if `upsample_left` then we need -2 through 6 inclusive from `left`.
  // else we only need -1 through 3 inclusive.

#if AOM_ARCH_AARCH64
  // Pre-load `left` so per-row gathers can use TBL instructions.
  uint16x8_t left_data0, left_data1;
  if (upsample_left) {
    left_data0 = vld1q_u16(left - 2);
    left_data1 = vld1q_u16(left - 1);
  } else {
    left_data0 = vcombine_u16(vld1_u16(left - 1), vdup_n_u16(0));
    left_data1 = vcombine_u16(vld1_u16(left + 0), vdup_n_u16(0));
  }
#endif

  const int16x8_t iota01234567 = vld1q_s16(iota1_s16);
  const int16x8_t iota12345678 = vld1q_s16(iota1_s16 + 1);

  for (int r = 0; r < 4; ++r) {
    // Number of leading lanes in this row that must come from `left`.
    const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6;
    const int x0 = (r + 1) * dx;
    const int16x8_t x01234567 =
        vsubq_s16(vshlq_n_s16(iota01234567, 6), vdupq_n_s16(x0));
    const int base_x0 = (-x0) >> frac_bits_x;
    if (base_shift <= 0) {
      // Whole row interpolates along `above`.
      uint16x8_t a0, a1;
      int16x8_t shift_x01234567;
      if (upsample_above) {
        const uint16x8x2_t a01 = vld2q_u16(above + base_x0);
        a0 = a01.val[0];
        a1 = a01.val[1];
        shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F));
      } else {
        a0 = vld1q_u16(above + base_x0);
        a1 = vld1q_u16(above + base_x0 + 1);
        shift_x01234567 =
            vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F));
      }
      vst1q_u16(
          dst, highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567));
    } else if (base_shift < 8) {
      // Mixed row: first `base_shift` lanes from `left`, the rest from
      // `above`.
      // Calculate Y component from `left`.
      const int y_iters = base_shift;
      const int16x8_t y01234567 =
          vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy));
      const int16x8_t base_y01234567 =
          vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y));
      const int16x8_t shift_y01234567 =
          vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left),
                                vdupq_n_s16(0x3F)),
                      1);

      uint16x8_t l0, l1;
#if AOM_ARCH_AARCH64
      const int left_data_base = upsample_left ? -2 : -1;
      l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x8(
          left_data0, base_y01234567, left_data_base, y_iters);
      l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x8(
          left_data1, base_y01234567, left_data_base, y_iters);
#else
      const uint16x8x2_t l01 =
          highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, y_iters);
      l0 = l01.val[0];
      l1 = l01.val[1];
#endif

      const uint16x8_t out_y =
          highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567);

      // Calculate X component from `above`.
      uint16x8_t a0, a1;
      int16x8_t shift_x01234567;
      if (upsample_above) {
        // Align the de-interleaving load to an even element of `above`.
        const uint16x8x2_t a01 =
            vld2q_u16(above + (base_x0 % 2 == 0 ? -2 : -1));
        a0 = a01.val[0];
        a1 = a01.val[1];
        shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F));
      } else {
        a0 = vld1q_u16(above - 1);
        a1 = vld1q_u16(above + 0);
        shift_x01234567 =
            vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F));
      }
      const uint16x8_t out_x =
          highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567);

      // Combine X and Y vectors.
      const uint16x8_t out =
          highbd_dr_prediction_z2_merge_x8(out_x, out_y, base_shift);
      vst1q_u16(dst, out);
    } else {
      // Whole row interpolates along `left`.
      const int16x8_t y01234567 =
          vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy));
      const int16x8_t base_y01234567 =
          vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y));
      const int16x8_t shift_y01234567 =
          vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left),
                                vdupq_n_s16(0x3F)),
                      1);

      uint16x8_t l0, l1;
#if AOM_ARCH_AARCH64
      const int left_data_base = upsample_left ? -2 : -1;
      l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x8(
          left_data0, base_y01234567, left_data_base, 8);
      l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x8(
          left_data1, base_y01234567, left_data_base, 8);
#else
      const uint16x8x2_t l01 =
          highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, 8);
      l0 = l01.val[0];
      l1 = l01.val[1];
#endif

      vst1q_u16(
          dst, highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567));
    }
    dst += stride;
  }
}
   2488 
// z2 predictor for 8x8 blocks with upsampling support. Same structure as the
// 8x4 variant, but the taller block needs up to 16 `left` samples, hence the
// x16 register pre-load and the _from_x16 TBL helpers.
static void highbd_dr_prediction_z2_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *above,
                                             const uint16_t *left,
                                             int upsample_above,
                                             int upsample_left, int dx, int dy,
                                             int bd) {
  (void)bd;
  assert(dx > 0);
  assert(dy > 0);

  const int frac_bits_x = 6 - upsample_above;
  const int frac_bits_y = 6 - upsample_left;
  const int min_base_x = -(1 << (upsample_above + frac_bits_x));

  // if `upsample_left` then we need -2 through 14 inclusive from `left`.
  // else we only need -1 through 6 inclusive.

#if AOM_ARCH_AARCH64
  // Pre-load `left` so per-row gathers can use TBL instructions.
  uint16x8x2_t left_data0, left_data1;
  if (upsample_left) {
    left_data0 = vld1q_u16_x2(left - 2);
    left_data1 = vld1q_u16_x2(left - 1);
  } else {
    left_data0 = (uint16x8x2_t){ { vld1q_u16(left - 1), vdupq_n_u16(0) } };
    left_data1 = (uint16x8x2_t){ { vld1q_u16(left + 0), vdupq_n_u16(0) } };
  }
#endif

  const int16x8_t iota01234567 = vld1q_s16(iota1_s16);
  const int16x8_t iota12345678 = vld1q_s16(iota1_s16 + 1);

  for (int r = 0; r < 8; ++r) {
    // Number of leading lanes in this row that must come from `left`.
    const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6;
    const int x0 = (r + 1) * dx;
    const int16x8_t x01234567 =
        vsubq_s16(vshlq_n_s16(iota01234567, 6), vdupq_n_s16(x0));
    const int base_x0 = (-x0) >> frac_bits_x;
    if (base_shift <= 0) {
      // Whole row interpolates along `above`.
      uint16x8_t a0, a1;
      int16x8_t shift_x01234567;
      if (upsample_above) {
        const uint16x8x2_t a01 = vld2q_u16(above + base_x0);
        a0 = a01.val[0];
        a1 = a01.val[1];
        shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F));
      } else {
        a0 = vld1q_u16(above + base_x0);
        a1 = vld1q_u16(above + base_x0 + 1);
        shift_x01234567 =
            vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F));
      }
      vst1q_u16(
          dst, highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567));
    } else if (base_shift < 8) {
      // Mixed row: first `base_shift` lanes from `left`, the rest from
      // `above`.
      // Calculate Y component from `left`.
      const int y_iters = base_shift;
      const int16x8_t y01234567 =
          vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy));
      const int16x8_t base_y01234567 =
          vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y));
      const int16x8_t shift_y01234567 =
          vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left),
                                vdupq_n_s16(0x3F)),
                      1);

      uint16x8_t l0, l1;
#if AOM_ARCH_AARCH64
      const int left_data_base = upsample_left ? -2 : -1;
      l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x16(
          left_data0, base_y01234567, left_data_base, y_iters);
      l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x16(
          left_data1, base_y01234567, left_data_base, y_iters);
#else
      const uint16x8x2_t l01 =
          highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, y_iters);
      l0 = l01.val[0];
      l1 = l01.val[1];
#endif

      const uint16x8_t out_y =
          highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567);

      // Calculate X component from `above`.
      uint16x8_t a0, a1;
      int16x8_t shift_x01234567;
      if (upsample_above) {
        // Align the de-interleaving load to an even element of `above`.
        const uint16x8x2_t a01 =
            vld2q_u16(above + (base_x0 % 2 == 0 ? -2 : -1));
        a0 = a01.val[0];
        a1 = a01.val[1];
        shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F));
      } else {
        a0 = vld1q_u16(above - 1);
        a1 = vld1q_u16(above + 0);
        shift_x01234567 =
            vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F));
      }
      const uint16x8_t out_x =
          highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567);

      // Combine X and Y vectors.
      const uint16x8_t out =
          highbd_dr_prediction_z2_merge_x8(out_x, out_y, base_shift);
      vst1q_u16(dst, out);
    } else {
      // Whole row interpolates along `left`.
      const int16x8_t y01234567 =
          vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy));
      const int16x8_t base_y01234567 =
          vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y));
      const int16x8_t shift_y01234567 =
          vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left),
                                vdupq_n_s16(0x3F)),
                      1);

      uint16x8_t l0, l1;
#if AOM_ARCH_AARCH64
      const int left_data_base = upsample_left ? -2 : -1;
      l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x16(
          left_data0, base_y01234567, left_data_base, 8);
      l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x16(
          left_data1, base_y01234567, left_data_base, 8);
#else
      const uint16x8x2_t l01 =
          highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, 8);
      l0 = l01.val[0];
      l1 = l01.val[1];
#endif

      vst1q_u16(
          dst, highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567));
    }
    dst += stride;
  }
}
   2623 
// Dispatch tables indexed by [log2(bw)][log2(bh)] (via get_msb). NULL entries
// are block sizes that AV1 does not use (or that the realtime-only build
// excludes); the dispatcher asserts a non-NULL entry before calling.
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
static highbd_dr_prediction_z2_ptr dr_predictor_z2_arr_neon[7][7] = {
  { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
  { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
  { NULL, NULL, &highbd_dr_prediction_z2_4x4_neon,
    &highbd_dr_prediction_z2_4x8_neon, &highbd_dr_prediction_z2_4x16_neon, NULL,
    NULL },
  { NULL, NULL, &highbd_dr_prediction_z2_8x4_neon,
    &highbd_dr_prediction_z2_8x8_neon, &highbd_dr_prediction_z2_8x16_neon,
    &highbd_dr_prediction_z2_8x32_neon, NULL },
  { NULL, NULL, &highbd_dr_prediction_z2_16x4_neon,
    &highbd_dr_prediction_z2_16x8_neon, &highbd_dr_prediction_z2_16x16_neon,
    &highbd_dr_prediction_z2_16x32_neon, &highbd_dr_prediction_z2_16x64_neon },
  { NULL, NULL, NULL, &highbd_dr_prediction_z2_32x8_neon,
    &highbd_dr_prediction_z2_32x16_neon, &highbd_dr_prediction_z2_32x32_neon,
    &highbd_dr_prediction_z2_32x64_neon },
  { NULL, NULL, NULL, NULL, &highbd_dr_prediction_z2_64x16_neon,
    &highbd_dr_prediction_z2_64x32_neon, &highbd_dr_prediction_z2_64x64_neon },
};
#else
static highbd_dr_prediction_z2_ptr dr_predictor_z2_arr_neon[7][7] = {
  { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
  { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
  { NULL, NULL, &highbd_dr_prediction_z2_4x4_neon,
    &highbd_dr_prediction_z2_4x8_neon, NULL, NULL, NULL },
  { NULL, NULL, &highbd_dr_prediction_z2_8x4_neon,
    &highbd_dr_prediction_z2_8x8_neon, &highbd_dr_prediction_z2_8x16_neon, NULL,
    NULL },
  { NULL, NULL, NULL, &highbd_dr_prediction_z2_16x8_neon,
    &highbd_dr_prediction_z2_16x16_neon, &highbd_dr_prediction_z2_16x32_neon,
    NULL },
  { NULL, NULL, NULL, NULL, NULL, &highbd_dr_prediction_z2_32x32_neon,
    &highbd_dr_prediction_z2_32x64_neon },
  { NULL, NULL, NULL, NULL, NULL, &highbd_dr_prediction_z2_64x32_neon,
    &highbd_dr_prediction_z2_64x64_neon },
};
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   2661 
   2662 // Directional prediction, zone 2: 90 < angle < 180
   2663 void av1_highbd_dr_prediction_z2_neon(uint16_t *dst, ptrdiff_t stride, int bw,
   2664                                      int bh, const uint16_t *above,
   2665                                      const uint16_t *left, int upsample_above,
   2666                                      int upsample_left, int dx, int dy,
   2667                                      int bd) {
   2668  highbd_dr_prediction_z2_ptr f =
   2669      dr_predictor_z2_arr_neon[get_msb(bw)][get_msb(bh)];
   2670  assert(f != NULL);
   2671  f(dst, stride, above, left, upsample_above, upsample_left, dx, dy, bd);
   2672 }
   2673 
   2674 // -----------------------------------------------------------------------------
   2675 // Z3
   2676 
// Both the lane to use and the shift amount must be immediates.
// Interpolate one 4-lane output vector:
//   *(out) = round((in0 * s0[lane] + in1 * s1[lane]) >> shift)
// and then replace every lane whose source position (base + iota) has
// reached max_base_y with the constant left_max. Note that max_base_y and
// left_max are free variables resolved at the expansion site.
#define HIGHBD_DR_PREDICTOR_Z3_STEP_X4(out, iota, base, in0, in1, s0, s1, \
                                      lane, shift)                       \
 do {                                                                    \
   uint32x4_t val = vmull_lane_u16((in0), (s0), (lane));                 \
   val = vmlal_lane_u16(val, (in1), (s1), (lane));                       \
   const uint16x4_t cmp = vadd_u16((iota), vdup_n_u16(base));            \
   const uint16x4_t res = vrshrn_n_u32(val, (shift));                    \
   *(out) = vbsl_u16(vclt_u16(cmp, vdup_n_u16(max_base_y)), res,         \
                     vdup_n_u16(left_max));                              \
 } while (0)
   2688 
// 8-lane variant of HIGHBD_DR_PREDICTOR_Z3_STEP_X4. Unlike the 4-lane
// version this performs no max_base_y clamping, so callers must deal with
// out-of-range positions themselves (e.g. via masked loads). The iota and
// base arguments are accepted for signature parity with the 4-lane macro
// but are unused here.
#define HIGHBD_DR_PREDICTOR_Z3_STEP_X8(out, iota, base, in0, in1, s0, s1, \
                                      lane, shift)                       \
 do {                                                                    \
   uint32x4_t val_lo = vmull_lane_u16(vget_low_u16(in0), (s0), (lane));  \
   val_lo = vmlal_lane_u16(val_lo, vget_low_u16(in1), (s1), (lane));     \
   uint32x4_t val_hi = vmull_lane_u16(vget_high_u16(in0), (s0), (lane)); \
   val_hi = vmlal_lane_u16(val_hi, vget_high_u16(in1), (s1), (lane));    \
   *(out) = vcombine_u16(vrshrn_n_u32(val_lo, (shift)),                  \
                         vrshrn_n_u32(val_hi, (shift)));                 \
 } while (0)
   2699 
   2700 static inline uint16x8x2_t z3_load_left_neon(const uint16_t *left0, int ofs,
   2701                                             int max_ofs) {
   2702  uint16x8_t r0;
   2703  uint16x8_t r1;
   2704  if (ofs + 7 >= max_ofs) {
   2705    int shuffle_idx = max_ofs - ofs;
   2706    r0 = zn_load_masked_neon(left0 + (max_ofs - 7), shuffle_idx);
   2707  } else {
   2708    r0 = vld1q_u16(left0 + ofs);
   2709  }
   2710  if (ofs + 8 >= max_ofs) {
   2711    int shuffle_idx = max_ofs - ofs - 1;
   2712    r1 = zn_load_masked_neon(left0 + (max_ofs - 7), shuffle_idx);
   2713  } else {
   2714    r1 = vld1q_u16(left0 + ofs + 1);
   2715  }
   2716  return (uint16x8x2_t){ { r0, r1 } };
   2717 }
   2718 
// Zone 3 directional prediction for a bw x bh block when the left column has
// NOT been upsampled. The fetch position advances by dy per output column in
// Q6 fixed point; each output column interpolates between adjacent left[]
// samples. 4-wide groups of columns are computed in registers and transposed
// before being stored as rows.
static void highbd_dr_prediction_z3_upsample0_neon(uint16_t *dst,
                                                  ptrdiff_t stride, int bw,
                                                  int bh, const uint16_t *left,
                                                  int dy) {
 assert(bw % 4 == 0);
 assert(bh % 4 == 0);
 assert(dy > 0);

 // Factor out left + 1 to give the compiler a better chance of recognising
 // that the offsets used for the loads from left and left + 1 are otherwise
 // identical.
 const uint16_t *left1 = left + 1;

 // left[max_base_y] is the furthest sample ever used; positions at or past
 // it saturate to left_max.
 const int max_base_y = (bw + bh - 1);
 const int left_max = left[max_base_y];
 const int frac_bits = 6;

 const uint16x8_t iota1x8 = vreinterpretq_u16_s16(vld1q_s16(iota1_s16));
 const uint16x4_t iota1x4 = vget_low_u16(iota1x8);

 // The C implementation of the z3 predictor when not upsampling uses:
 // ((y & 0x3f) >> 1)
 // The right shift is unnecessary here since we instead shift by +1 later,
 // so adjust the mask to 0x3e to ensure we don't consider the extra bit.
 const uint16x4_t shift_mask = vdup_n_u16(0x3e);

 if (bh == 4) {
   int y = dy;
   int c = 0;
   do {
     // Fully unroll the 4x4 block to allow us to use immediate lane-indexed
     // multiply instructions.
     const uint16x4_t shifts1 =
         vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask);
     const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(64), shifts1);
     const int base0 = (y + 0 * dy) >> frac_bits;
     const int base1 = (y + 1 * dy) >> frac_bits;
     const int base2 = (y + 2 * dy) >> frac_bits;
     const int base3 = (y + 3 * dy) >> frac_bits;
     uint16x4_t out[4];
     // A base index at or past max_base_y means the whole column reads
     // beyond the left array, so emit the saturated value directly.
     if (base0 >= max_base_y) {
       out[0] = vdup_n_u16(left_max);
     } else {
       const uint16x4_t l00 = vld1_u16(left + base0);
       const uint16x4_t l01 = vld1_u16(left1 + base0);
       HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[0], iota1x4, base0, l00, l01,
                                      shifts0, shifts1, 0, 6);
     }
     if (base1 >= max_base_y) {
       out[1] = vdup_n_u16(left_max);
     } else {
       const uint16x4_t l10 = vld1_u16(left + base1);
       const uint16x4_t l11 = vld1_u16(left1 + base1);
       HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[1], iota1x4, base1, l10, l11,
                                      shifts0, shifts1, 1, 6);
     }
     if (base2 >= max_base_y) {
       out[2] = vdup_n_u16(left_max);
     } else {
       const uint16x4_t l20 = vld1_u16(left + base2);
       const uint16x4_t l21 = vld1_u16(left1 + base2);
       HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[2], iota1x4, base2, l20, l21,
                                      shifts0, shifts1, 2, 6);
     }
     if (base3 >= max_base_y) {
       out[3] = vdup_n_u16(left_max);
     } else {
       const uint16x4_t l30 = vld1_u16(left + base3);
       const uint16x4_t l31 = vld1_u16(left1 + base3);
       HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[3], iota1x4, base3, l30, l31,
                                      shifts0, shifts1, 3, 6);
     }
     // The tile was computed as 4 columns; transpose so it can be stored as
     // rows.
     transpose_array_inplace_u16_4x4(out);
     for (int r2 = 0; r2 < 4; ++r2) {
       vst1_u16(dst + r2 * stride + c, out[r2]);
     }
     y += 4 * dy;
     c += 4;
   } while (c < bw);
 } else {
   int y = dy;
   int c = 0;
   do {
     int r = 0;
     do {
       // Fully unroll the 4x4 block to allow us to use immediate lane-indexed
       // multiply instructions.
       const uint16x4_t shifts1 =
           vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask);
       const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(64), shifts1);
       const int base0 = ((y + 0 * dy) >> frac_bits) + r;
       const int base1 = ((y + 1 * dy) >> frac_bits) + r;
       const int base2 = ((y + 2 * dy) >> frac_bits) + r;
       const int base3 = ((y + 3 * dy) >> frac_bits) + r;
       uint16x8_t out[4];
       // z3_load_left_neon performs edge-aware (masked) loads near
       // max_base_y, since the 8-wide step macro does no clamping itself.
       if (base0 >= max_base_y) {
         out[0] = vdupq_n_u16(left_max);
       } else {
         const uint16x8x2_t l0 = z3_load_left_neon(left, base0, max_base_y);
         HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[0], iota1x8, base0, l0.val[0],
                                        l0.val[1], shifts0, shifts1, 0, 6);
       }
       if (base1 >= max_base_y) {
         out[1] = vdupq_n_u16(left_max);
       } else {
         const uint16x8x2_t l1 = z3_load_left_neon(left, base1, max_base_y);
         HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[1], iota1x8, base1, l1.val[0],
                                        l1.val[1], shifts0, shifts1, 1, 6);
       }
       if (base2 >= max_base_y) {
         out[2] = vdupq_n_u16(left_max);
       } else {
         const uint16x8x2_t l2 = z3_load_left_neon(left, base2, max_base_y);
         HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[2], iota1x8, base2, l2.val[0],
                                        l2.val[1], shifts0, shifts1, 2, 6);
       }
       if (base3 >= max_base_y) {
         out[3] = vdupq_n_u16(left_max);
       } else {
         const uint16x8x2_t l3 = z3_load_left_neon(left, base3, max_base_y);
         HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[3], iota1x8, base3, l3.val[0],
                                        l3.val[1], shifts0, shifts1, 3, 6);
       }
       // Transpose the 4x8 tile of columns, then store the top and bottom
       // 4x4 halves as rows.
       transpose_array_inplace_u16_4x8(out);
       for (int r2 = 0; r2 < 4; ++r2) {
         vst1_u16(dst + (r + r2) * stride + c, vget_low_u16(out[r2]));
       }
       for (int r2 = 0; r2 < 4; ++r2) {
         vst1_u16(dst + (r + r2 + 4) * stride + c, vget_high_u16(out[r2]));
       }
       r += 8;
     } while (r < bh);
     y += 4 * dy;
     c += 4;
   } while (c < bw);
 }
}
   2856 
// Zone 3 directional prediction for a bw x bh block when the left column HAS
// been 2x upsampled. Positions advance in Q5 fixed point, and vld2 loads
// split the samples into even/odd streams so each lane interpolates between
// an adjacent pair.
static void highbd_dr_prediction_z3_upsample1_neon(uint16_t *dst,
                                                  ptrdiff_t stride, int bw,
                                                  int bh, const uint16_t *left,
                                                  int dy) {
 assert(bw % 4 == 0);
 assert(bh % 4 == 0);
 assert(dy > 0);

 // The left array is 2x upsampled, so the furthest index is doubled too.
 const int max_base_y = (bw + bh - 1) << 1;
 const int left_max = left[max_base_y];
 const int frac_bits = 5;

 const uint16x4_t iota1x4 = vreinterpret_u16_s16(vld1_s16(iota1_s16));
 const uint16x8_t iota2x8 = vreinterpretq_u16_s16(vld1q_s16(iota2_s16));
 const uint16x4_t iota2x4 = vget_low_u16(iota2x8);

 // The C implementation of the z3 predictor when upsampling uses:
 // (((x << 1) & 0x3f) >> 1)
 // The two shifts are unnecessary here since the lowest bit is guaranteed to
 // be zero when the mask is applied, so adjust the mask to 0x1f to avoid
 // needing the shifts at all.
 const uint16x4_t shift_mask = vdup_n_u16(0x1F);

 if (bh == 4) {
   int y = dy;
   int c = 0;
   do {
     // Fully unroll the 4x4 block to allow us to use immediate lane-indexed
     // multiply instructions.
     const uint16x4_t shifts1 =
         vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask);
     const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(32), shifts1);
     const int base0 = (y + 0 * dy) >> frac_bits;
     const int base1 = (y + 1 * dy) >> frac_bits;
     const int base2 = (y + 2 * dy) >> frac_bits;
     const int base3 = (y + 3 * dy) >> frac_bits;
     // vld2 deinterleaves into even (val[0]) and odd (val[1]) elements.
     const uint16x4x2_t l0 = vld2_u16(left + base0);
     const uint16x4x2_t l1 = vld2_u16(left + base1);
     const uint16x4x2_t l2 = vld2_u16(left + base2);
     const uint16x4x2_t l3 = vld2_u16(left + base3);
     uint16x4_t out[4];
     // Out-of-range lanes are clamped to left_max inside the X4 step macro
     // via the base + iota2 comparison against max_base_y.
     HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[0], iota2x4, base0, l0.val[0],
                                    l0.val[1], shifts0, shifts1, 0, 5);
     HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[1], iota2x4, base1, l1.val[0],
                                    l1.val[1], shifts0, shifts1, 1, 5);
     HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[2], iota2x4, base2, l2.val[0],
                                    l2.val[1], shifts0, shifts1, 2, 5);
     HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[3], iota2x4, base3, l3.val[0],
                                    l3.val[1], shifts0, shifts1, 3, 5);
     // Columns -> rows before storing.
     transpose_array_inplace_u16_4x4(out);
     for (int r2 = 0; r2 < 4; ++r2) {
       vst1_u16(dst + r2 * stride + c, out[r2]);
     }
     y += 4 * dy;
     c += 4;
   } while (c < bw);
 } else {
   assert(bh % 8 == 0);

   int y = dy;
   int c = 0;
   do {
     int r = 0;
     do {
       // Fully unroll the 4x8 block to allow us to use immediate lane-indexed
       // multiply instructions.
       const uint16x4_t shifts1 =
           vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask);
       const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(32), shifts1);
       // r is a row offset in output pixels; double it for the upsampled
       // left array.
       const int base0 = ((y + 0 * dy) >> frac_bits) + (r * 2);
       const int base1 = ((y + 1 * dy) >> frac_bits) + (r * 2);
       const int base2 = ((y + 2 * dy) >> frac_bits) + (r * 2);
       const int base3 = ((y + 3 * dy) >> frac_bits) + (r * 2);
       // NOTE(review): the 8-wide step macro performs no max_base_y clamp
       // and these loads are unguarded — presumably the left buffer is
       // padded/readable past max_base_y here; confirm upstream guarantees.
       const uint16x8x2_t l0 = vld2q_u16(left + base0);
       const uint16x8x2_t l1 = vld2q_u16(left + base1);
       const uint16x8x2_t l2 = vld2q_u16(left + base2);
       const uint16x8x2_t l3 = vld2q_u16(left + base3);
       uint16x8_t out[4];
       HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[0], iota2x8, base0, l0.val[0],
                                      l0.val[1], shifts0, shifts1, 0, 5);
       HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[1], iota2x8, base1, l1.val[0],
                                      l1.val[1], shifts0, shifts1, 1, 5);
       HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[2], iota2x8, base2, l2.val[0],
                                      l2.val[1], shifts0, shifts1, 2, 5);
       HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[3], iota2x8, base3, l3.val[0],
                                      l3.val[1], shifts0, shifts1, 3, 5);
       // Transpose the 4x8 tile of columns, then store the two 4x4 halves
       // as rows.
       transpose_array_inplace_u16_4x8(out);
       for (int r2 = 0; r2 < 4; ++r2) {
         vst1_u16(dst + (r + r2) * stride + c, vget_low_u16(out[r2]));
       }
       for (int r2 = 0; r2 < 4; ++r2) {
         vst1_u16(dst + (r + r2 + 4) * stride + c, vget_high_u16(out[r2]));
       }
       r += 8;
     } while (r < bh);
     y += 4 * dy;
     c += 4;
   } while (c < bw);
 }
}
   2957 
   2958 // Directional prediction, zone 3: 180 < angle < 270
   2959 void av1_highbd_dr_prediction_z3_neon(uint16_t *dst, ptrdiff_t stride, int bw,
   2960                                      int bh, const uint16_t *above,
   2961                                      const uint16_t *left, int upsample_left,
   2962                                      int dx, int dy, int bd) {
   2963  (void)above;
   2964  (void)dx;
   2965  (void)bd;
   2966  assert(bw % 4 == 0);
   2967  assert(bh % 4 == 0);
   2968  assert(dx == 1);
   2969  assert(dy > 0);
   2970 
   2971  if (upsample_left) {
   2972    highbd_dr_prediction_z3_upsample1_neon(dst, stride, bw, bh, left, dy);
   2973  } else {
   2974    highbd_dr_prediction_z3_upsample0_neon(dst, stride, bw, bh, left, dy);
   2975  }
   2976 }
   2977 
   2978 #undef HIGHBD_DR_PREDICTOR_Z3_STEP_X4
   2979 #undef HIGHBD_DR_PREDICTOR_Z3_STEP_X8