tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

highbd_subpel_variance_neon.c (57374B)


      1 /*
      2 * Copyright (c) 2023 The WebM project authors. All rights reserved.
      3 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
      4 *
      5 * This source code is subject to the terms of the BSD 2 Clause License and
      6 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      7 * was not distributed with this source code in the LICENSE file, you can
      8 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      9 * Media Patent License 1.0 was not distributed with this source code in the
     10 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     11 */
     12 
     13 #include <arm_neon.h>
     14 
     15 #include "config/aom_config.h"
     16 #include "config/aom_dsp_rtcd.h"
     17 
     18 #include "aom_dsp/aom_filter.h"
     19 #include "aom_dsp/arm/mem_neon.h"
     20 #include "aom_dsp/arm/sum_neon.h"
     21 #include "aom_dsp/variance.h"
     22 
     23 // The bilinear filters look like this:
     24 //
     25 // {{ 128,  0 }, { 112, 16 }, { 96, 32 }, { 80,  48 },
     26 //  {  64, 64 }, {  48, 80 }, { 32, 96 }, { 16, 112 }}
     27 //
     28 // We can factor out the highest common factor (16), such that the sum of
     29 // both weights will be 8 instead of 128. The benefits of this are two-fold:
     30 //
     31 // 1) We can infer the filter values from the filter_offset parameter in the
     32 // bilinear filter functions below - we don't have to actually load the values
     33 // from memory:
     34 // f0 = 8 - filter_offset
     35 // f1 = filter_offset
     36 //
     37 // 2) Scaling the pixel values by 8, instead of 128 enables us to operate on
     38 // 16-bit data types at all times, rather than widening out to 32-bit and
     39 // requiring double the number of data processing instructions. (12-bit * 8 =
     40 // 15-bit.)
     41 
     42 // Process a block exactly 4 wide and any height.
     43 static void highbd_var_filter_block2d_bil_w4(const uint16_t *src_ptr,
     44                                             uint16_t *dst_ptr, int src_stride,
     45                                             int pixel_step, int dst_height,
     46                                             int filter_offset) {
     47  const uint16x4_t f0 = vdup_n_u16(8 - filter_offset);
     48  const uint16x4_t f1 = vdup_n_u16(filter_offset);
     49 
     50  int i = dst_height;
     51  do {
     52    uint16x4_t s0 = load_unaligned_u16_4x1(src_ptr);
     53    uint16x4_t s1 = load_unaligned_u16_4x1(src_ptr + pixel_step);
     54 
     55    uint16x4_t blend = vmul_u16(s0, f0);
     56    blend = vmla_u16(blend, s1, f1);
     57    blend = vrshr_n_u16(blend, 3);
     58 
     59    vst1_u16(dst_ptr, blend);
     60 
     61    src_ptr += src_stride;
     62    dst_ptr += 4;
     63  } while (--i != 0);
     64 }
     65 
     66 // Process a block which is a multiple of 8 and any height.
     67 static void highbd_var_filter_block2d_bil_large(const uint16_t *src_ptr,
     68                                                uint16_t *dst_ptr,
     69                                                int src_stride, int pixel_step,
     70                                                int dst_width, int dst_height,
     71                                                int filter_offset) {
     72  const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
     73  const uint16x8_t f1 = vdupq_n_u16(filter_offset);
     74 
     75  int i = dst_height;
     76  do {
     77    int j = 0;
     78    do {
     79      uint16x8_t s0 = vld1q_u16(src_ptr + j);
     80      uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
     81 
     82      uint16x8_t blend = vmulq_u16(s0, f0);
     83      blend = vmlaq_u16(blend, s1, f1);
     84      blend = vrshrq_n_u16(blend, 3);
     85 
     86      vst1q_u16(dst_ptr + j, blend);
     87 
     88      j += 8;
     89    } while (j < dst_width);
     90 
     91    src_ptr += src_stride;
     92    dst_ptr += dst_width;
     93  } while (--i != 0);
     94 }
     95 
     96 static void highbd_var_filter_block2d_bil_w8(const uint16_t *src_ptr,
     97                                             uint16_t *dst_ptr, int src_stride,
     98                                             int pixel_step, int dst_height,
     99                                             int filter_offset) {
    100  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
    101                                      8, dst_height, filter_offset);
    102 }
    103 
    104 static void highbd_var_filter_block2d_bil_w16(const uint16_t *src_ptr,
    105                                              uint16_t *dst_ptr, int src_stride,
    106                                              int pixel_step, int dst_height,
    107                                              int filter_offset) {
    108  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
    109                                      16, dst_height, filter_offset);
    110 }
    111 
    112 static void highbd_var_filter_block2d_bil_w32(const uint16_t *src_ptr,
    113                                              uint16_t *dst_ptr, int src_stride,
    114                                              int pixel_step, int dst_height,
    115                                              int filter_offset) {
    116  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
    117                                      32, dst_height, filter_offset);
    118 }
    119 
    120 static void highbd_var_filter_block2d_bil_w64(const uint16_t *src_ptr,
    121                                              uint16_t *dst_ptr, int src_stride,
    122                                              int pixel_step, int dst_height,
    123                                              int filter_offset) {
    124  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
    125                                      64, dst_height, filter_offset);
    126 }
    127 
    128 static void highbd_var_filter_block2d_bil_w128(const uint16_t *src_ptr,
    129                                               uint16_t *dst_ptr,
    130                                               int src_stride, int pixel_step,
    131                                               int dst_height,
    132                                               int filter_offset) {
    133  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
    134                                      128, dst_height, filter_offset);
    135 }
    136 
    137 static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr,
    138                                          uint16_t *dst_ptr, int src_stride,
    139                                          int pixel_step, int dst_width,
    140                                          int dst_height) {
    141  int i = dst_height;
    142 
    143  // We only specialize on the filter values for large block sizes (>= 16x16.)
    144  assert(dst_width >= 16 && dst_width % 16 == 0);
    145 
    146  do {
    147    int j = 0;
    148    do {
    149      uint16x8_t s0 = vld1q_u16(src_ptr + j);
    150      uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
    151      uint16x8_t avg = vrhaddq_u16(s0, s1);
    152      vst1q_u16(dst_ptr + j, avg);
    153 
    154      j += 8;
    155    } while (j < dst_width);
    156 
    157    src_ptr += src_stride;
    158    dst_ptr += dst_width;
    159  } while (--i != 0);
    160 }
    161 
    162 #define HBD_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h)                           \
    163  unsigned int aom_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon(     \
    164      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
    165      const uint8_t *ref, int ref_stride, uint32_t *sse) {                     \
    166    uint16_t tmp0[w * (h + 1)];                                                \
    167    uint16_t tmp1[w * h];                                                      \
    168    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                              \
    169                                                                               \
    170    highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1),  \
    171                                       xoffset);                               \
    172    highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);          \
    173                                                                               \
    174    return aom_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \
    175                                                     w, ref, ref_stride, sse); \
    176  }
    177 
    178 #define HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h)               \
    179  unsigned int aom_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon(     \
    180      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
    181      const uint8_t *ref, int ref_stride, unsigned int *sse) {                 \
    182    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                              \
    183                                                                               \
    184    if (xoffset == 0) {                                                        \
    185      if (yoffset == 0) {                                                      \
    186        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
    187            CONVERT_TO_BYTEPTR(src_ptr), src_stride, ref, ref_stride, sse);    \
    188      } else if (yoffset == 4) {                                               \
    189        uint16_t tmp[w * h];                                                   \
    190        highbd_var_filter_block2d_avg(src_ptr, tmp, src_stride, src_stride, w, \
    191                                      h);                                      \
    192        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
    193            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
    194      } else {                                                                 \
    195        uint16_t tmp[w * h];                                                   \
    196        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp, src_stride,           \
    197                                           src_stride, h, yoffset);            \
    198        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
    199            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
    200      }                                                                        \
    201    } else if (xoffset == 4) {                                                 \
    202      uint16_t tmp0[w * (h + 1)];                                              \
    203      if (yoffset == 0) {                                                      \
    204        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h);     \
    205        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
    206            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
    207      } else if (yoffset == 4) {                                               \
    208        uint16_t tmp1[w * (h + 1)];                                            \
    209        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w,         \
    210                                      (h + 1));                                \
    211        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                 \
    212        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
    213            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
    214      } else {                                                                 \
    215        uint16_t tmp1[w * (h + 1)];                                            \
    216        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w,         \
    217                                      (h + 1));                                \
    218        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);      \
    219        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
    220            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
    221      }                                                                        \
    222    } else {                                                                   \
    223      uint16_t tmp0[w * (h + 1)];                                              \
    224      if (yoffset == 0) {                                                      \
    225        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h,    \
    226                                           xoffset);                           \
    227        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
    228            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
    229      } else if (yoffset == 4) {                                               \
    230        uint16_t tmp1[w * h];                                                  \
    231        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1,       \
    232                                           (h + 1), xoffset);                  \
    233        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                 \
    234        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
    235            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
    236      } else {                                                                 \
    237        uint16_t tmp1[w * h];                                                  \
    238        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1,       \
    239                                           (h + 1), xoffset);                  \
    240        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);      \
    241        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
    242            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
    243      }                                                                        \
    244    }                                                                          \
    245  }
    246 
    247 // 8-bit
    248 HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4)
    249 HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8)
    250 
    251 HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4)
    252 HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8)
    253 HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16)
    254 
    255 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8)
    256 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16)
    257 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32)
    258 
    259 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16)
    260 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32)
    261 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64)
    262 
    263 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32)
    264 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64)
    265 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128)
    266 
    267 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64)
    268 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128)
    269 
    270 #if !CONFIG_REALTIME_ONLY
    271 HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16)
    272 
    273 HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32)
    274 
    275 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4)
    276 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64)
    277 
    278 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8)
    279 
    280 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16)
    281 #endif  // !CONFIG_REALTIME_ONLY
    282 
    283 // 10-bit
    284 HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4)
    285 HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8)
    286 
    287 HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4)
    288 HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8)
    289 HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16)
    290 
    291 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8)
    292 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16)
    293 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32)
    294 
    295 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16)
    296 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32)
    297 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64)
    298 
    299 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32)
    300 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64)
    301 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128)
    302 
    303 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64)
    304 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128)
    305 
    306 #if !CONFIG_REALTIME_ONLY
    307 HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16)
    308 
    309 HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32)
    310 
    311 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4)
    312 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64)
    313 
    314 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8)
    315 
    316 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16)
    317 #endif  // !CONFIG_REALTIME_ONLY
    318 
    319 // 12-bit
    320 HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4)
    321 HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8)
    322 
    323 HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4)
    324 HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8)
    325 HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16)
    326 
    327 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8)
    328 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16)
    329 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32)
    330 
    331 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16)
    332 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32)
    333 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64)
    334 
    335 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32)
    336 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64)
    337 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128)
    338 
    339 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64)
    340 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128)
    341 
    342 #if !CONFIG_REALTIME_ONLY
    343 HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16)
    344 
    345 HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32)
    346 
    347 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4)
    348 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64)
    349 
    350 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8)
    351 
    352 HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16)
    353 #endif  // !CONFIG_REALTIME_ONLY
    354 
    355 // Combine bilinear filter with aom_highbd_comp_avg_pred for blocks having
    356 // width 4.
    357 static void highbd_avg_pred_var_filter_block2d_bil_w4(
    358    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    359    int dst_height, int filter_offset, const uint16_t *second_pred) {
    360  const uint16x4_t f0 = vdup_n_u16(8 - filter_offset);
    361  const uint16x4_t f1 = vdup_n_u16(filter_offset);
    362 
    363  int i = dst_height;
    364  do {
    365    uint16x4_t s0 = load_unaligned_u16_4x1(src_ptr);
    366    uint16x4_t s1 = load_unaligned_u16_4x1(src_ptr + pixel_step);
    367    uint16x4_t p = vld1_u16(second_pred);
    368 
    369    uint16x4_t blend = vmul_u16(s0, f0);
    370    blend = vmla_u16(blend, s1, f1);
    371    blend = vrshr_n_u16(blend, 3);
    372 
    373    vst1_u16(dst_ptr, vrhadd_u16(blend, p));
    374 
    375    src_ptr += src_stride;
    376    dst_ptr += 4;
    377    second_pred += 4;
    378  } while (--i != 0);
    379 }
    380 
    381 // Combine bilinear filter with aom_highbd_comp_avg_pred for large blocks.
    382 static void highbd_avg_pred_var_filter_block2d_bil_large(
    383    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    384    int dst_width, int dst_height, int filter_offset,
    385    const uint16_t *second_pred) {
    386  const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
    387  const uint16x8_t f1 = vdupq_n_u16(filter_offset);
    388 
    389  int i = dst_height;
    390  do {
    391    int j = 0;
    392    do {
    393      uint16x8_t s0 = vld1q_u16(src_ptr + j);
    394      uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
    395      uint16x8_t p = vld1q_u16(second_pred);
    396 
    397      uint16x8_t blend = vmulq_u16(s0, f0);
    398      blend = vmlaq_u16(blend, s1, f1);
    399      blend = vrshrq_n_u16(blend, 3);
    400 
    401      vst1q_u16(dst_ptr + j, vrhaddq_u16(blend, p));
    402 
    403      j += 8;
    404      second_pred += 8;
    405    } while (j < dst_width);
    406 
    407    src_ptr += src_stride;
    408    dst_ptr += dst_width;
    409  } while (--i != 0);
    410 }
    411 
    412 static void highbd_avg_pred_var_filter_block2d_bil_w8(
    413    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    414    int dst_height, int filter_offset, const uint16_t *second_pred) {
    415  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
    416                                               pixel_step, 8, dst_height,
    417                                               filter_offset, second_pred);
    418 }
    419 
    420 static void highbd_avg_pred_var_filter_block2d_bil_w16(
    421    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    422    int dst_height, int filter_offset, const uint16_t *second_pred) {
    423  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
    424                                               pixel_step, 16, dst_height,
    425                                               filter_offset, second_pred);
    426 }
    427 
    428 static void highbd_avg_pred_var_filter_block2d_bil_w32(
    429    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    430    int dst_height, int filter_offset, const uint16_t *second_pred) {
    431  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
    432                                               pixel_step, 32, dst_height,
    433                                               filter_offset, second_pred);
    434 }
    435 
    436 static void highbd_avg_pred_var_filter_block2d_bil_w64(
    437    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    438    int dst_height, int filter_offset, const uint16_t *second_pred) {
    439  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
    440                                               pixel_step, 64, dst_height,
    441                                               filter_offset, second_pred);
    442 }
    443 
    444 static void highbd_avg_pred_var_filter_block2d_bil_w128(
    445    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    446    int dst_height, int filter_offset, const uint16_t *second_pred) {
    447  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
    448                                               pixel_step, 128, dst_height,
    449                                               filter_offset, second_pred);
    450 }
    451 
    452 // Combine averaging subpel filter with aom_highbd_comp_avg_pred.
    453 static void highbd_avg_pred_var_filter_block2d_avg(
    454    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    455    int dst_width, int dst_height, const uint16_t *second_pred) {
    456  int i = dst_height;
    457 
    458  // We only specialize on the filter values for large block sizes (>= 16x16.)
    459  assert(dst_width >= 16 && dst_width % 16 == 0);
    460 
    461  do {
    462    int j = 0;
    463    do {
    464      uint16x8_t s0 = vld1q_u16(src_ptr + j);
    465      uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
    466      uint16x8_t avg = vrhaddq_u16(s0, s1);
    467 
    468      uint16x8_t p = vld1q_u16(second_pred);
    469      avg = vrhaddq_u16(avg, p);
    470 
    471      vst1q_u16(dst_ptr + j, avg);
    472 
    473      j += 8;
    474      second_pred += 8;
    475    } while (j < dst_width);
    476 
    477    src_ptr += src_stride;
    478    dst_ptr += dst_width;
    479  } while (--i != 0);
    480 }
    481 
    482 // Implementation of aom_highbd_comp_avg_pred for blocks having width >= 16.
    483 static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
    484                            int src_stride, int dst_width, int dst_height,
    485                            const uint16_t *second_pred) {
    486  int i = dst_height;
    487 
    488  // We only specialize on the filter values for large block sizes (>= 16x16.)
    489  assert(dst_width >= 16 && dst_width % 16 == 0);
    490 
    491  do {
    492    int j = 0;
    493    do {
    494      uint16x8_t s = vld1q_u16(src_ptr + j);
    495      uint16x8_t p = vld1q_u16(second_pred);
    496 
    497      uint16x8_t avg = vrhaddq_u16(s, p);
    498 
    499      vst1q_u16(dst_ptr + j, avg);
    500 
    501      j += 8;
    502      second_pred += 8;
    503    } while (j < dst_width);
    504 
    505    src_ptr += src_stride;
    506    dst_ptr += dst_width;
    507  } while (--i != 0);
    508 }
    509 
    510 #define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h)                       \
    511  uint32_t aom_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon(     \
    512      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
    513      const uint8_t *ref, int ref_stride, uint32_t *sse,                       \
    514      const uint8_t *second_pred) {                                            \
    515    uint16_t tmp0[w * (h + 1)];                                                \
    516    uint16_t tmp1[w * h];                                                      \
    517    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                              \
    518                                                                               \
    519    highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1),  \
    520                                       xoffset);                               \
    521    highbd_avg_pred_var_filter_block2d_bil_w##w(                               \
    522        tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred));       \
    523                                                                               \
    524    return aom_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \
    525                                                     w, ref, ref_stride, sse); \
    526  }
    527 
    528 #define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h)           \
    529  unsigned int aom_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \
    530      const uint8_t *src, int source_stride, int xoffset, int yoffset,         \
    531      const uint8_t *ref, int ref_stride, uint32_t *sse,                       \
    532      const uint8_t *second_pred) {                                            \
    533    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                              \
    534                                                                               \
    535    if (xoffset == 0) {                                                        \
    536      uint16_t tmp[w * h];                                                     \
    537      if (yoffset == 0) {                                                      \
    538        highbd_avg_pred(src_ptr, tmp, source_stride, w, h,                     \
    539                        CONVERT_TO_SHORTPTR(second_pred));                     \
    540        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
    541            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
    542      } else if (yoffset == 4) {                                               \
    543        highbd_avg_pred_var_filter_block2d_avg(                                \
    544            src_ptr, tmp, source_stride, source_stride, w, h,                  \
    545            CONVERT_TO_SHORTPTR(second_pred));                                 \
    546        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
    547            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
    548      } else {                                                                 \
    549        highbd_avg_pred_var_filter_block2d_bil_w##w(                           \
    550            src_ptr, tmp, source_stride, source_stride, h, yoffset,            \
    551            CONVERT_TO_SHORTPTR(second_pred));                                 \
    552        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
    553            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
    554      }                                                                        \
    555    } else if (xoffset == 4) {                                                 \
    556      uint16_t tmp0[w * (h + 1)];                                              \
    557      if (yoffset == 0) {                                                      \
    558        highbd_avg_pred_var_filter_block2d_avg(                                \
    559            src_ptr, tmp0, source_stride, 1, w, h,                             \
    560            CONVERT_TO_SHORTPTR(second_pred));                                 \
    561        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
    562            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
    563      } else if (yoffset == 4) {                                               \
    564        uint16_t tmp1[w * (h + 1)];                                            \
    565        highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w,      \
    566                                      (h + 1));                                \
    567        highbd_avg_pred_var_filter_block2d_avg(                                \
    568            tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred));         \
    569        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
    570            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
    571      } else {                                                                 \
    572        uint16_t tmp1[w * (h + 1)];                                            \
    573        highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w,      \
    574                                      (h + 1));                                \
    575        highbd_avg_pred_var_filter_block2d_bil_w##w(                           \
    576            tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred));   \
    577        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
    578            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
    579      }                                                                        \
    580    } else {                                                                   \
    581      uint16_t tmp0[w * (h + 1)];                                              \
    582      if (yoffset == 0) {                                                      \
    583        highbd_avg_pred_var_filter_block2d_bil_w##w(                           \
    584            src_ptr, tmp0, source_stride, 1, h, xoffset,                       \
    585            CONVERT_TO_SHORTPTR(second_pred));                                 \
    586        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
    587            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
    588      } else if (yoffset == 4) {                                               \
    589        uint16_t tmp1[w * h];                                                  \
    590        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1,    \
    591                                           (h + 1), xoffset);                  \
    592        highbd_avg_pred_var_filter_block2d_avg(                                \
    593            tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred));         \
    594        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
    595            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
    596      } else {                                                                 \
    597        uint16_t tmp1[w * h];                                                  \
    598        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1,    \
    599                                           (h + 1), xoffset);                  \
    600        highbd_avg_pred_var_filter_block2d_bil_w##w(                           \
    601            tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred));   \
    602        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
    603            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
    604      }                                                                        \
    605    }                                                                          \
    606  }
    607 
    // Instantiate the sub-pixel average-variance kernels for each bit depth
    // (8, 10 and 12) and each block size. Blocks of width 4 and 8 are generated
    // with HBD_SUBPEL_AVG_VARIANCE_WXH_NEON; blocks of width 16 and above are
    // generated with HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON. The 4x16,
    // 8x32, 16x4, 16x64, 32x8 and 64x16 sizes are only compiled when
    // CONFIG_REALTIME_ONLY is 0.
    608 // 8-bit
    609 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4)
    610 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8)
    611 
    612 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4)
    613 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8)
    614 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16)
    615 
    616 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8)
    617 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16)
    618 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32)
    619 
    620 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16)
    621 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32)
    622 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64)
    623 
    624 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32)
    625 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64)
    626 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 128)
    627 
    628 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 64)
    629 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 128)
    630 
    631 #if !CONFIG_REALTIME_ONLY
    632 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 16)
    633 
    634 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 32)
    635 
    636 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 4)
    637 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 64)
    638 
    639 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 8)
    640 
    641 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 16)
    642 #endif  // !CONFIG_REALTIME_ONLY
    643 
    644 // 10-bit
    645 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4)
    646 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8)
    647 
    648 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4)
    649 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8)
    650 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16)
    651 
    652 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8)
    653 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16)
    654 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32)
    655 
    656 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16)
    657 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32)
    658 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64)
    659 
    660 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32)
    661 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64)
    662 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 128)
    663 
    664 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 64)
    665 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 128)
    666 
    667 #if !CONFIG_REALTIME_ONLY
    668 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 16)
    669 
    670 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 32)
    671 
    672 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 4)
    673 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 64)
    674 
    675 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 8)
    676 
    677 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 16)
    678 #endif  // !CONFIG_REALTIME_ONLY
    679 
    680 // 12-bit
    681 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4)
    682 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8)
    683 
    684 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4)
    685 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8)
    686 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16)
    687 
    688 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8)
    689 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16)
    690 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32)
    691 
    692 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16)
    693 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32)
    694 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64)
    695 
    696 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32)
    697 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64)
    698 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 128)
    699 
    700 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 64)
    701 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 128)
    702 
    703 #if !CONFIG_REALTIME_ONLY
    704 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 16)
    705 
    706 HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 32)
    707 
    708 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 4)
    709 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 64)
    710 
    711 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 8)
    712 
    713 HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 16)
    714 #endif  // !CONFIG_REALTIME_ONLY
    715 
    // Generic generator for the masked sub-pixel variance kernels.
    //
    // Expands to aom_highbd_<bitdepth>_masked_sub_pixel_variance<w>x<h>_neon().
    // The generated function makes no special case of any offset value and
    // always performs the same four steps:
    //   1. bilinear pass driven by xoffset, pixel step 1, producing h + 1 rows;
    //   2. bilinear pass driven by yoffset, pixel step w, producing h rows;
    //   3. aom_highbd_comp_mask_pred_neon() combines that result with
    //      second_pred using msk / msk_stride / invert_mask;
    //   4. aom_highbd_<bitdepth>_variance<w>x<h>() is evaluated on the combined
    //      block against ref, and its result is returned (sse is forwarded).
    #define HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
      unsigned int \
          aom_highbd_##bitdepth##_masked_sub_pixel_variance##w##x##h##_neon( \
              const uint8_t *src, int src_stride, int xoffset, int yoffset, \
              const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
              const uint8_t *msk, int msk_stride, int invert_mask, \
              unsigned int *sse) { \
        uint16_t *src16 = CONVERT_TO_SHORTPTR(src); \
        /* One extra row in the first-pass output feeds the second pass. */ \
        uint16_t row_filtered[w * (h + 1)]; \
        uint16_t interpolated[w * (h + 1)]; \
        uint16_t blended[w * h]; \
        /* Pass 1: xoffset, neighbouring taps are 1 sample apart. */ \
        highbd_var_filter_block2d_bil_w##w(src16, row_filtered, src_stride, 1, \
                                           (h + 1), xoffset); \
        /* Pass 2: yoffset, neighbouring taps are w samples apart. */ \
        highbd_var_filter_block2d_bil_w##w(row_filtered, interpolated, w, w, h, \
                                           yoffset); \
        /* Masked compound prediction of the interpolated block. */ \
        aom_highbd_comp_mask_pred_neon( \
            CONVERT_TO_BYTEPTR(blended), second_pred, w, h, \
            CONVERT_TO_BYTEPTR(interpolated), w, msk, msk_stride, invert_mask); \
        return aom_highbd_##bitdepth##_variance##w##x##h( \
            CONVERT_TO_BYTEPTR(blended), w, ref, ref_stride, sse); \
      }
    736 
    // Generator for the masked sub-pixel variance kernels with shortcuts for
    // offsets that do not need a full bilinear pass.
    //
    // Expands to aom_highbd_<bitdepth>_masked_sub_pixel_variance<w>x<h>_neon().
    // Per axis, the offset selects the work that is done:
    //   offset == 0 -> that pass is skipped;
    //   offset == 4 -> highbd_var_filter_block2d_avg() is used;
    //   otherwise   -> highbd_var_filter_block2d_bil_w<w>() is used.
    // When both axes need filtering, the xoffset pass (pixel step 1) produces
    // h + 1 rows and the yoffset pass (pixel step w) reduces them to h rows.
    // When only one axis needs filtering, a single pass over h rows reads
    // straight from src. With both offsets at 0, src itself is used.
    // The chosen prediction is then combined with second_pred by
    // aom_highbd_comp_mask_pred_neon(), and the combined block is passed to
    // aom_highbd_<bitdepth>_variance<w>x<h>() together with ref and sse.
    #define HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
      unsigned int \
          aom_highbd_##bitdepth##_masked_sub_pixel_variance##w##x##h##_neon( \
              const uint8_t *src, int src_stride, int xoffset, int yoffset, \
              const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
              const uint8_t *msk, int msk_stride, int invert_mask, \
              unsigned int *sse) { \
        uint16_t *src16 = CONVERT_TO_SHORTPTR(src); \
        uint16_t pass1[w * (h + 1)]; \
        uint16_t pass2[w * h]; \
        uint16_t blended[w * h]; \
        /* Prediction handed to the masked blend. The defaults are what the */ \
        /* xoffset == 0 && yoffset == 0 case needs: the unfiltered source. */ \
        const uint8_t *pred = src; \
        int pred_stride = src_stride; \
        if (xoffset == 0 && yoffset != 0) { \
          /* yoffset pass only: taps are one source row apart. */ \
          if (yoffset == 4) { \
            highbd_var_filter_block2d_avg(src16, pass1, src_stride, src_stride, \
                                          w, h); \
          } else { \
            highbd_var_filter_block2d_bil_w##w(src16, pass1, src_stride, \
                                               src_stride, h, yoffset); \
          } \
          pred = CONVERT_TO_BYTEPTR(pass1); \
          pred_stride = w; \
        } else if (xoffset != 0 && yoffset == 0) { \
          /* xoffset pass only: h rows are enough. */ \
          if (xoffset == 4) { \
            highbd_var_filter_block2d_avg(src16, pass1, src_stride, 1, w, h); \
          } else { \
            highbd_var_filter_block2d_bil_w##w(src16, pass1, src_stride, 1, h, \
                                               xoffset); \
          } \
          pred = CONVERT_TO_BYTEPTR(pass1); \
          pred_stride = w; \
        } else if (xoffset != 0) { \
          /* Both passes: the first one emits the extra row the second needs. */ \
          if (xoffset == 4) { \
            highbd_var_filter_block2d_avg(src16, pass1, src_stride, 1, w, \
                                          (h + 1)); \
          } else { \
            highbd_var_filter_block2d_bil_w##w(src16, pass1, src_stride, 1, \
                                               (h + 1), xoffset); \
          } \
          if (yoffset == 4) { \
            highbd_var_filter_block2d_avg(pass1, pass2, w, w, w, h); \
          } else { \
            highbd_var_filter_block2d_bil_w##w(pass1, pass2, w, w, h, yoffset); \
          } \
          pred = CONVERT_TO_BYTEPTR(pass2); \
          pred_stride = w; \
        } \
        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(blended), second_pred, \
                                       w, h, pred, pred_stride, msk, msk_stride, \
                                       invert_mask); \
        return aom_highbd_##bitdepth##_variance##w##x##h( \
            CONVERT_TO_BYTEPTR(blended), w, ref, ref_stride, sse); \
      }
    843 
    // Instantiate the masked sub-pixel variance kernels for each bit depth
    // (8, 10 and 12) and each block size. Blocks of width 4 and 8 are generated
    // with HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON, which always runs both bilinear
    // passes. Blocks of width 16 and above are generated with
    // HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON, which special-cases
    // offsets 0 and 4. The 4x16, 8x32, 16x4, 16x64, 32x8 and 64x16 sizes are
    // only compiled when CONFIG_REALTIME_ONLY is 0.
    844 // 8-bit
    845 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4)
    846 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8)
    847 
    848 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4)
    849 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8)
    850 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16)
    851 
    852 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8)
    853 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16)
    854 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32)
    855 
    856 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16)
    857 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32)
    858 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64)
    859 
    860 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32)
    861 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64)
    862 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128)
    863 
    864 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64)
    865 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128)
    866 
    867 #if !CONFIG_REALTIME_ONLY
    868 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16)
    869 
    870 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32)
    871 
    872 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4)
    873 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64)
    874 
    875 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8)
    876 
    877 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16)
    878 #endif  // !CONFIG_REALTIME_ONLY
    879 
    880 // 10-bit
    881 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4)
    882 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8)
    883 
    884 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4)
    885 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8)
    886 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16)
    887 
    888 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8)
    889 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16)
    890 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32)
    891 
    892 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16)
    893 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32)
    894 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64)
    895 
    896 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32)
    897 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64)
    898 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128)
    899 
    900 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64)
    901 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128)
    902 
    903 #if !CONFIG_REALTIME_ONLY
    904 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16)
    905 
    906 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32)
    907 
    908 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4)
    909 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64)
    910 
    911 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8)
    912 
    913 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16)
    914 #endif  // !CONFIG_REALTIME_ONLY
    915 
    916 // 12-bit
    917 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4)
    918 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8)
    919 
    920 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4)
    921 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8)
    922 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16)
    923 
    924 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8)
    925 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16)
    926 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32)
    927 
    928 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16)
    929 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32)
    930 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64)
    931 
    932 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32)
    933 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64)
    934 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128)
    935 
    936 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64)
    937 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128)
    938 
    939 #if !CONFIG_REALTIME_ONLY
    940 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16)
    941 
    942 HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32)
    943 
    944 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4)
    945 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64)
    946 
    947 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8)
    948 
    949 HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16)
    950 #endif  // !CONFIG_REALTIME_ONLY
    951 
    952 #if !CONFIG_REALTIME_ONLY
    // Generic generator for the OBMC sub-pixel variance kernels.
    //
    // Expands to aom_highbd_<bitdepth>_obmc_sub_pixel_variance<w>x<h>_neon().
    // The generated function always runs two bilinear passes over pre: the
    // xoffset pass (pixel step 1) produces h + 1 rows, the yoffset pass (pixel
    // step w) reduces them to h rows. The result, with stride w, is passed to
    // aom_highbd_<bitdepth>_obmc_variance<w>x<h>_neon() along with wsrc, mask
    // and sse, and that function's return value is returned.
    #define HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
      unsigned int \
          aom_highbd_##bitdepth##_obmc_sub_pixel_variance##w##x##h##_neon( \
              const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
              const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
        uint16_t *pre16 = CONVERT_TO_SHORTPTR(pre); \
        /* The extra row of the first pass is consumed by the second pass. */ \
        uint16_t row_filtered[w * (h + 1)]; \
        uint16_t interpolated[w * h]; \
        highbd_var_filter_block2d_bil_w##w(pre16, row_filtered, pre_stride, 1, \
                                           h + 1, xoffset); \
        highbd_var_filter_block2d_bil_w##w(row_filtered, interpolated, w, w, h, \
                                           yoffset); \
        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
            CONVERT_TO_BYTEPTR(interpolated), w, wsrc, mask, sse); \
      }
    967 
    // Generator for the OBMC sub-pixel variance kernels with shortcuts for
    // offsets that do not need a full bilinear pass.
    //
    // Expands to aom_highbd_<bitdepth>_obmc_sub_pixel_variance<w>x<h>_neon().
    // Per axis, the offset selects the work that is done:
    //   offset == 0 -> that pass is skipped;
    //   offset == 4 -> highbd_var_filter_block2d_avg() is used;
    //   otherwise   -> highbd_var_filter_block2d_bil_w<w>() is used.
    // When both axes need filtering, the xoffset pass (pixel step 1) produces
    // h + 1 rows and the yoffset pass (pixel step w) reduces them to h rows.
    // When only one axis needs filtering, a single pass over h rows reads
    // straight from pre. With both offsets at 0, pre and pre_stride are passed
    // through unchanged. The chosen prediction is handed to
    // aom_highbd_<bitdepth>_obmc_variance<w>x<h>_neon() with wsrc, mask and sse.
    #define SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
      unsigned int \
          aom_highbd_##bitdepth##_obmc_sub_pixel_variance##w##x##h##_neon( \
              const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
              const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
        uint16_t *pre16 = CONVERT_TO_SHORTPTR(pre); \
        uint16_t pass1[w * (h + 1)]; \
        uint16_t pass2[w * h]; \
        /* Block handed to the OBMC variance. The defaults are what the */ \
        /* xoffset == 0 && yoffset == 0 case needs: pre as it came in. */ \
        const uint8_t *pred = pre; \
        int pred_stride = pre_stride; \
        if (xoffset == 0 && yoffset != 0) { \
          /* yoffset pass only: taps are one row of pre apart. */ \
          if (yoffset == 4) { \
            highbd_var_filter_block2d_avg(pre16, pass1, pre_stride, pre_stride, \
                                          w, h); \
          } else { \
            highbd_var_filter_block2d_bil_w##w(pre16, pass1, pre_stride, \
                                               pre_stride, h, yoffset); \
          } \
          pred = CONVERT_TO_BYTEPTR(pass1); \
          pred_stride = w; \
        } else if (xoffset != 0 && yoffset == 0) { \
          /* xoffset pass only: h rows are enough. */ \
          if (xoffset == 4) { \
            highbd_var_filter_block2d_avg(pre16, pass1, pre_stride, 1, w, h); \
          } else { \
            highbd_var_filter_block2d_bil_w##w(pre16, pass1, pre_stride, 1, h, \
                                               xoffset); \
          } \
          pred = CONVERT_TO_BYTEPTR(pass1); \
          pred_stride = w; \
        } else if (xoffset != 0) { \
          /* Both passes: the first one emits the extra row the second needs. */ \
          if (xoffset == 4) { \
            highbd_var_filter_block2d_avg(pre16, pass1, pre_stride, 1, w, h + 1); \
          } else { \
            highbd_var_filter_block2d_bil_w##w(pre16, pass1, pre_stride, 1, \
                                               h + 1, xoffset); \
          } \
          if (yoffset == 4) { \
            highbd_var_filter_block2d_avg(pass1, pass2, w, w, w, h); \
          } else { \
            highbd_var_filter_block2d_bil_w##w(pass1, pass2, w, w, h, yoffset); \
          } \
          pred = CONVERT_TO_BYTEPTR(pass2); \
          pred_stride = w; \
        } \
        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
            pred, pred_stride, wsrc, mask, sse); \
      }
   1034 
   // Instantiate the OBMC sub-pixel variance kernels for each bit depth
   // (8, 10 and 12) and each block size. Blocks of width 4 and 8, plus 16x4
   // and 16x8, are generated with HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON, which
   // always runs both bilinear passes. The remaining sizes, from 16x16 up to
   // 128x128, are generated with
   // SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON, which special-cases
   // offsets 0 and 4. The whole list sits inside the enclosing
   // #if !CONFIG_REALTIME_ONLY region.
   1035 // 8-bit
   1036 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4)
   1037 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8)
   1038 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16)
   1039 
   1040 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4)
   1041 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8)
   1042 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16)
   1043 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32)
   1044 
   1045 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4)
   1046 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8)
   1047 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16)
   1048 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32)
   1049 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64)
   1050 
   1051 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8)
   1052 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16)
   1053 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32)
   1054 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64)
   1055 
   1056 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16)
   1057 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32)
   1058 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64)
   1059 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128)
   1060 
   1061 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64)
   1062 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128)
   1063 
   1064 // 10-bit
   1065 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4)
   1066 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8)
   1067 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16)
   1068 
   1069 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4)
   1070 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8)
   1071 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16)
   1072 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32)
   1073 
   1074 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4)
   1075 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8)
   1076 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16)
   1077 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32)
   1078 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64)
   1079 
   1080 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8)
   1081 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16)
   1082 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32)
   1083 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64)
   1084 
   1085 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16)
   1086 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32)
   1087 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64)
   1088 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128)
   1089 
   1090 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64)
   1091 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128)
   1092 
   1093 // 12-bit
   1094 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4)
   1095 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8)
   1096 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16)
   1097 
   1098 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4)
   1099 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8)
   1100 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16)
   1101 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32)
   1102 
   1103 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4)
   1104 HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8)
   1105 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16)
   1106 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32)
   1107 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64)
   1108 
   1109 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8)
   1110 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16)
   1111 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32)
   1112 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64)
   1113 
   1114 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16)
   1115 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32)
   1116 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64)
   1117 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128)
   1118 
   1119 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64)
   1120 SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128)
   1121 #endif  // !CONFIG_REALTIME_ONLY