tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

subpel_variance_neon.c (40425B)


      1 /*
      2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
#include <arm_neon.h>
#include <assert.h>

#include "config/aom_dsp_rtcd.h"
#include "config/aom_config.h"

#include "aom_ports/mem.h"
#include "aom/aom_integer.h"

#include "aom_dsp/variance.h"
#include "aom_dsp/arm/mem_neon.h"
     22 
     23 static void var_filter_block2d_bil_w4(const uint8_t *src_ptr, uint8_t *dst_ptr,
     24                                      int src_stride, int pixel_step,
     25                                      int dst_height, int filter_offset) {
     26  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
     27  const uint8x8_t f1 = vdup_n_u8(filter_offset);
     28 
     29  int i = dst_height;
     30  do {
     31    uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
     32    uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride);
     33    uint16x8_t blend = vmull_u8(s0, f0);
     34    blend = vmlal_u8(blend, s1, f1);
     35    uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
     36    vst1_u8(dst_ptr, blend_u8);
     37 
     38    src_ptr += 2 * src_stride;
     39    dst_ptr += 2 * 4;
     40    i -= 2;
     41  } while (i != 0);
     42 }
     43 
     44 static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *dst_ptr,
     45                                      int src_stride, int pixel_step,
     46                                      int dst_height, int filter_offset) {
     47  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
     48  const uint8x8_t f1 = vdup_n_u8(filter_offset);
     49 
     50  int i = dst_height;
     51  do {
     52    uint8x8_t s0 = vld1_u8(src_ptr);
     53    uint8x8_t s1 = vld1_u8(src_ptr + pixel_step);
     54    uint16x8_t blend = vmull_u8(s0, f0);
     55    blend = vmlal_u8(blend, s1, f1);
     56    uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
     57    vst1_u8(dst_ptr, blend_u8);
     58 
     59    src_ptr += src_stride;
     60    dst_ptr += 8;
     61  } while (--i != 0);
     62 }
     63 
     64 static void var_filter_block2d_bil_large(const uint8_t *src_ptr,
     65                                         uint8_t *dst_ptr, int src_stride,
     66                                         int pixel_step, int dst_width,
     67                                         int dst_height, int filter_offset) {
     68  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
     69  const uint8x8_t f1 = vdup_n_u8(filter_offset);
     70 
     71  int i = dst_height;
     72  do {
     73    int j = 0;
     74    do {
     75      uint8x16_t s0 = vld1q_u8(src_ptr + j);
     76      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
     77      uint16x8_t blend_l = vmull_u8(vget_low_u8(s0), f0);
     78      blend_l = vmlal_u8(blend_l, vget_low_u8(s1), f1);
     79      uint16x8_t blend_h = vmull_u8(vget_high_u8(s0), f0);
     80      blend_h = vmlal_u8(blend_h, vget_high_u8(s1), f1);
     81      uint8x16_t blend_u8 =
     82          vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3));
     83      vst1q_u8(dst_ptr + j, blend_u8);
     84 
     85      j += 16;
     86    } while (j < dst_width);
     87 
     88    src_ptr += src_stride;
     89    dst_ptr += dst_width;
     90  } while (--i != 0);
     91 }
     92 
     93 static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, uint8_t *dst_ptr,
     94                                       int src_stride, int pixel_step,
     95                                       int dst_height, int filter_offset) {
     96  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 16,
     97                               dst_height, filter_offset);
     98 }
     99 
    100 static void var_filter_block2d_bil_w32(const uint8_t *src_ptr, uint8_t *dst_ptr,
    101                                       int src_stride, int pixel_step,
    102                                       int dst_height, int filter_offset) {
    103  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 32,
    104                               dst_height, filter_offset);
    105 }
    106 
    107 static void var_filter_block2d_bil_w64(const uint8_t *src_ptr, uint8_t *dst_ptr,
    108                                       int src_stride, int pixel_step,
    109                                       int dst_height, int filter_offset) {
    110  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 64,
    111                               dst_height, filter_offset);
    112 }
    113 
    114 static void var_filter_block2d_bil_w128(const uint8_t *src_ptr,
    115                                        uint8_t *dst_ptr, int src_stride,
    116                                        int pixel_step, int dst_height,
    117                                        int filter_offset) {
    118  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 128,
    119                               dst_height, filter_offset);
    120 }
    121 
    122 static void var_filter_block2d_avg(const uint8_t *src_ptr, uint8_t *dst_ptr,
    123                                   int src_stride, int pixel_step,
    124                                   int dst_width, int dst_height) {
    125  // We only specialise on the filter values for large block sizes (>= 16x16.)
    126  assert(dst_width >= 16 && dst_width % 16 == 0);
    127 
    128  int i = dst_height;
    129  do {
    130    int j = 0;
    131    do {
    132      uint8x16_t s0 = vld1q_u8(src_ptr + j);
    133      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
    134      uint8x16_t avg = vrhaddq_u8(s0, s1);
    135      vst1q_u8(dst_ptr + j, avg);
    136 
    137      j += 16;
    138    } while (j < dst_width);
    139 
    140    src_ptr += src_stride;
    141    dst_ptr += dst_width;
    142  } while (--i != 0);
    143 }
    144 
// Generic two-pass sub-pixel variance: filter horizontally into tmp0, then
// vertically into tmp1, then run the full-pel variance kernel on tmp1.
// `padding` is the number of extra rows the first pass must produce so the
// second (vertical) pass has source data for its bottom tap.
#define SUBPEL_VARIANCE_WXH_NEON(w, h, padding)                          \
 unsigned int aom_sub_pixel_variance##w##x##h##_neon(                   \
     const uint8_t *src, int src_stride, int xoffset, int yoffset,      \
     const uint8_t *ref, int ref_stride, uint32_t *sse) {               \
   uint8_t tmp0[w * (h + padding)];                                     \
   uint8_t tmp1[w * h];                                                 \
   var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
                               xoffset);                                \
   var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);           \
   return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
 }
    156 
// Specialized sub-pixel variance for wide blocks: dispatches on the offsets
// so that full-pel positions (offset == 0) skip filtering entirely and
// half-pel positions (offset == 4) use the cheaper averaging kernel instead
// of the general bilinear filter.
#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding)                  \
 unsigned int aom_sub_pixel_variance##w##x##h##_neon(                       \
     const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
     const uint8_t *ref, int ref_stride, unsigned int *sse) {               \
   if (xoffset == 0) {                                                      \
     if (yoffset == 0) {                                                    \
       return aom_variance##w##x##h(src, src_stride, ref, ref_stride, sse); \
     } else if (yoffset == 4) {                                             \
       uint8_t tmp[w * h];                                                  \
       var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h);      \
       return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse);          \
     } else {                                                               \
       uint8_t tmp[w * h];                                                  \
       var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h,     \
                                   yoffset);                                \
       return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse);          \
     }                                                                      \
   } else if (xoffset == 4) {                                               \
     uint8_t tmp0[w * (h + padding)];                                       \
     if (yoffset == 0) {                                                    \
       var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h);              \
       return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse);         \
     } else if (yoffset == 4) {                                             \
       uint8_t tmp1[w * (h + padding)];                                     \
       var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding));  \
       var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                      \
       return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
     } else {                                                               \
       uint8_t tmp1[w * (h + padding)];                                     \
       var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding));  \
       var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);           \
       return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
     }                                                                      \
   } else {                                                                 \
     uint8_t tmp0[w * (h + padding)];                                       \
     if (yoffset == 0) {                                                    \
       var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset);   \
       return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse);         \
     } else if (yoffset == 4) {                                             \
       uint8_t tmp1[w * h];                                                 \
       var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
                                   xoffset);                                \
       var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                      \
       return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
     } else {                                                               \
       uint8_t tmp1[w * h];                                                 \
       var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
                                   xoffset);                                \
       var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);           \
       return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
     }                                                                      \
   }                                                                        \
 }
    210 
// Narrow widths (4, 8) and 16x8 always take the generic two-pass path.
SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
SUBPEL_VARIANCE_WXH_NEON(4, 8, 2)

SUBPEL_VARIANCE_WXH_NEON(8, 4, 1)
SUBPEL_VARIANCE_WXH_NEON(8, 8, 1)
SUBPEL_VARIANCE_WXH_NEON(8, 16, 1)

SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
// Wider blocks use the specialized variant that dispatches on full-pel and
// half-pel offsets.
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)

SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)

SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 128, 1)

SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(128, 64, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1)

// Realtime mode doesn't use 4x rectangular blocks.
#if !CONFIG_REALTIME_ONLY

SUBPEL_VARIANCE_WXH_NEON(4, 16, 2)

SUBPEL_VARIANCE_WXH_NEON(8, 32, 1)

SUBPEL_VARIANCE_WXH_NEON(16, 4, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 64, 1)

SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 8, 1)

SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 16, 1)

#endif  // !CONFIG_REALTIME_ONLY

#undef SUBPEL_VARIANCE_WXH_NEON
#undef SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON
    251 
    252 // Combine bilinear filter with aom_comp_avg_pred for blocks having width 4.
    253 static void avg_pred_var_filter_block2d_bil_w4(const uint8_t *src_ptr,
    254                                               uint8_t *dst_ptr, int src_stride,
    255                                               int pixel_step, int dst_height,
    256                                               int filter_offset,
    257                                               const uint8_t *second_pred) {
    258  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
    259  const uint8x8_t f1 = vdup_n_u8(filter_offset);
    260 
    261  int i = dst_height;
    262  do {
    263    uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
    264    uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride);
    265    uint16x8_t blend = vmull_u8(s0, f0);
    266    blend = vmlal_u8(blend, s1, f1);
    267    uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
    268 
    269    uint8x8_t p = vld1_u8(second_pred);
    270    uint8x8_t avg = vrhadd_u8(blend_u8, p);
    271 
    272    vst1_u8(dst_ptr, avg);
    273 
    274    src_ptr += 2 * src_stride;
    275    dst_ptr += 2 * 4;
    276    second_pred += 2 * 4;
    277    i -= 2;
    278  } while (i != 0);
    279 }
    280 
    281 // Combine bilinear filter with aom_comp_avg_pred for blocks having width 8.
    282 static void avg_pred_var_filter_block2d_bil_w8(const uint8_t *src_ptr,
    283                                               uint8_t *dst_ptr, int src_stride,
    284                                               int pixel_step, int dst_height,
    285                                               int filter_offset,
    286                                               const uint8_t *second_pred) {
    287  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
    288  const uint8x8_t f1 = vdup_n_u8(filter_offset);
    289 
    290  int i = dst_height;
    291  do {
    292    uint8x8_t s0 = vld1_u8(src_ptr);
    293    uint8x8_t s1 = vld1_u8(src_ptr + pixel_step);
    294    uint16x8_t blend = vmull_u8(s0, f0);
    295    blend = vmlal_u8(blend, s1, f1);
    296    uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
    297 
    298    uint8x8_t p = vld1_u8(second_pred);
    299    uint8x8_t avg = vrhadd_u8(blend_u8, p);
    300 
    301    vst1_u8(dst_ptr, avg);
    302 
    303    src_ptr += src_stride;
    304    dst_ptr += 8;
    305    second_pred += 8;
    306  } while (--i > 0);
    307 }
    308 
    309 // Combine bilinear filter with aom_comp_avg_pred for large blocks.
    310 static void avg_pred_var_filter_block2d_bil_large(
    311    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    312    int dst_width, int dst_height, int filter_offset,
    313    const uint8_t *second_pred) {
    314  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
    315  const uint8x8_t f1 = vdup_n_u8(filter_offset);
    316 
    317  int i = dst_height;
    318  do {
    319    int j = 0;
    320    do {
    321      uint8x16_t s0 = vld1q_u8(src_ptr + j);
    322      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
    323      uint16x8_t blend_l = vmull_u8(vget_low_u8(s0), f0);
    324      blend_l = vmlal_u8(blend_l, vget_low_u8(s1), f1);
    325      uint16x8_t blend_h = vmull_u8(vget_high_u8(s0), f0);
    326      blend_h = vmlal_u8(blend_h, vget_high_u8(s1), f1);
    327      uint8x16_t blend_u8 =
    328          vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3));
    329 
    330      uint8x16_t p = vld1q_u8(second_pred);
    331      uint8x16_t avg = vrhaddq_u8(blend_u8, p);
    332 
    333      vst1q_u8(dst_ptr + j, avg);
    334 
    335      j += 16;
    336      second_pred += 16;
    337    } while (j < dst_width);
    338 
    339    src_ptr += src_stride;
    340    dst_ptr += dst_width;
    341  } while (--i != 0);
    342 }
    343 
    344 // Combine bilinear filter with aom_comp_avg_pred for blocks having width 16.
    345 static void avg_pred_var_filter_block2d_bil_w16(
    346    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    347    int dst_height, int filter_offset, const uint8_t *second_pred) {
    348  avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
    349                                        pixel_step, 16, dst_height,
    350                                        filter_offset, second_pred);
    351 }
    352 
    353 // Combine bilinear filter with aom_comp_avg_pred for blocks having width 32.
    354 static void avg_pred_var_filter_block2d_bil_w32(
    355    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    356    int dst_height, int filter_offset, const uint8_t *second_pred) {
    357  avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
    358                                        pixel_step, 32, dst_height,
    359                                        filter_offset, second_pred);
    360 }
    361 
    362 // Combine bilinear filter with aom_comp_avg_pred for blocks having width 64.
    363 static void avg_pred_var_filter_block2d_bil_w64(
    364    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    365    int dst_height, int filter_offset, const uint8_t *second_pred) {
    366  avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
    367                                        pixel_step, 64, dst_height,
    368                                        filter_offset, second_pred);
    369 }
    370 
    371 // Combine bilinear filter with aom_comp_avg_pred for blocks having width 128.
    372 static void avg_pred_var_filter_block2d_bil_w128(
    373    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    374    int dst_height, int filter_offset, const uint8_t *second_pred) {
    375  avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
    376                                        pixel_step, 128, dst_height,
    377                                        filter_offset, second_pred);
    378 }
    379 
    380 // Combine averaging subpel filter with aom_comp_avg_pred.
    381 static void avg_pred_var_filter_block2d_avg(const uint8_t *src_ptr,
    382                                            uint8_t *dst_ptr, int src_stride,
    383                                            int pixel_step, int dst_width,
    384                                            int dst_height,
    385                                            const uint8_t *second_pred) {
    386  // We only specialise on the filter values for large block sizes (>= 16x16.)
    387  assert(dst_width >= 16 && dst_width % 16 == 0);
    388 
    389  int i = dst_height;
    390  do {
    391    int j = 0;
    392    do {
    393      uint8x16_t s0 = vld1q_u8(src_ptr + j);
    394      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
    395      uint8x16_t avg = vrhaddq_u8(s0, s1);
    396 
    397      uint8x16_t p = vld1q_u8(second_pred);
    398      avg = vrhaddq_u8(avg, p);
    399 
    400      vst1q_u8(dst_ptr + j, avg);
    401 
    402      j += 16;
    403      second_pred += 16;
    404    } while (j < dst_width);
    405 
    406    src_ptr += src_stride;
    407    dst_ptr += dst_width;
    408  } while (--i != 0);
    409 }
    410 
    411 // Implementation of aom_comp_avg_pred for blocks having width >= 16.
    412 static void avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride,
    413                     int dst_width, int dst_height,
    414                     const uint8_t *second_pred) {
    415  // We only specialise on the filter values for large block sizes (>= 16x16.)
    416  assert(dst_width >= 16 && dst_width % 16 == 0);
    417 
    418  int i = dst_height;
    419  do {
    420    int j = 0;
    421    do {
    422      uint8x16_t s = vld1q_u8(src_ptr + j);
    423      uint8x16_t p = vld1q_u8(second_pred);
    424 
    425      uint8x16_t avg = vrhaddq_u8(s, p);
    426 
    427      vst1q_u8(dst_ptr + j, avg);
    428 
    429      j += 16;
    430      second_pred += 16;
    431    } while (j < dst_width);
    432 
    433    src_ptr += src_stride;
    434    dst_ptr += dst_width;
    435  } while (--i != 0);
    436 }
    437 
// Generic two-pass sub-pixel avg-variance: filter horizontally into tmp0,
// then vertically (fused with the second-prediction average) into tmp1, and
// run the full-pel variance kernel on tmp1. `padding` is the number of extra
// rows the first pass must produce for the vertical pass's bottom tap.
#define SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding)                         \
 unsigned int aom_sub_pixel_avg_variance##w##x##h##_neon(                  \
     const uint8_t *src, int source_stride, int xoffset, int yoffset,      \
     const uint8_t *ref, int ref_stride, uint32_t *sse,                    \
     const uint8_t *second_pred) {                                         \
   uint8_t tmp0[w * (h + padding)];                                        \
   uint8_t tmp1[w * h];                                                    \
   var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, (h + padding), \
                               xoffset);                                   \
   avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset,      \
                                        second_pred);                      \
   return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);            \
 }
    451 
// Specialized sub-pixel avg-variance for wide blocks: dispatches on the
// offsets so full-pel positions (offset == 0) skip filtering and half-pel
// positions (offset == 4) use the cheaper averaging kernels; the final pass
// always folds in second_pred before the variance computation.
#define SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding)                \
 unsigned int aom_sub_pixel_avg_variance##w##x##h##_neon(                     \
     const uint8_t *src, int source_stride, int xoffset, int yoffset,         \
     const uint8_t *ref, int ref_stride, unsigned int *sse,                   \
     const uint8_t *second_pred) {                                            \
   if (xoffset == 0) {                                                        \
     uint8_t tmp[w * h];                                                      \
     if (yoffset == 0) {                                                      \
       avg_pred(src, tmp, source_stride, w, h, second_pred);                  \
       return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse);            \
     } else if (yoffset == 4) {                                               \
       avg_pred_var_filter_block2d_avg(src, tmp, source_stride,               \
                                       source_stride, w, h, second_pred);     \
       return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse);            \
     } else {                                                                 \
       avg_pred_var_filter_block2d_bil_w##w(                                  \
           src, tmp, source_stride, source_stride, h, yoffset, second_pred);  \
       return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse);            \
     }                                                                        \
   } else if (xoffset == 4) {                                                 \
     uint8_t tmp0[w * (h + padding)];                                         \
     if (yoffset == 0) {                                                      \
       avg_pred_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h,     \
                                       second_pred);                          \
       return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse);           \
     } else if (yoffset == 4) {                                               \
       uint8_t tmp1[w * (h + padding)];                                       \
       var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
       avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred);  \
       return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);           \
     } else {                                                                 \
       uint8_t tmp1[w * (h + padding)];                                       \
       var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
       avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset,     \
                                            second_pred);                     \
       return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);           \
     }                                                                        \
   } else {                                                                   \
     uint8_t tmp0[w * (h + padding)];                                         \
     if (yoffset == 0) {                                                      \
       avg_pred_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h,   \
                                            xoffset, second_pred);            \
       return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse);           \
     } else if (yoffset == 4) {                                               \
       uint8_t tmp1[w * h];                                                   \
       var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1,               \
                                   (h + padding), xoffset);                   \
       avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred);  \
       return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);           \
     } else {                                                                 \
       uint8_t tmp1[w * h];                                                   \
       var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1,               \
                                   (h + padding), xoffset);                   \
       avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset,     \
                                            second_pred);                     \
       return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);           \
     }                                                                        \
   }                                                                          \
 }
    511 
// Narrow widths (4, 8) and 16x8 always take the generic two-pass path.
SUBPEL_AVG_VARIANCE_WXH_NEON(4, 4, 2)
SUBPEL_AVG_VARIANCE_WXH_NEON(4, 8, 2)

SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 1)
SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 1)
SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 1)

SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1)
// Wider blocks use the specialized variant that dispatches on full-pel and
// half-pel offsets.
SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1)
SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1)

SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1)
SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1)
SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1)

SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1)
SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1)
SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 128, 1)

SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(128, 64, 1)
SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(128, 128, 1)

// Realtime mode doesn't use 4x rectangular blocks.
#if !CONFIG_REALTIME_ONLY

SUBPEL_AVG_VARIANCE_WXH_NEON(4, 16, 2)

SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 1)

SUBPEL_AVG_VARIANCE_WXH_NEON(16, 4, 1)
SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 64, 1)

SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 8, 1)

SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 16, 1)

#endif  // !CONFIG_REALTIME_ONLY

#undef SUBPEL_AVG_VARIANCE_WXH_NEON
#undef SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON
    551 
    552 #if !CONFIG_REALTIME_ONLY
    553 
// Generic two-pass OBMC sub-pixel variance: filter horizontally into tmp0,
// then vertically into tmp1, and run the OBMC variance kernel (weighted
// source `wsrc` and `mask`) on tmp1. `padding` is the number of extra rows
// the first pass must produce for the vertical pass's bottom tap.
#define OBMC_SUBPEL_VARIANCE_WXH_NEON(w, h, padding)                   \
 unsigned int aom_obmc_sub_pixel_variance##w##x##h##_neon(            \
     const uint8_t *pre, int pre_stride, int xoffset, int yoffset,    \
     const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {   \
   uint8_t tmp0[w * (h + padding)];                                   \
   uint8_t tmp1[w * h];                                               \
   var_filter_block2d_bil_w##w(pre, tmp0, pre_stride, 1, h + padding, \
                               xoffset);                              \
   var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);         \
   return aom_obmc_variance##w##x##h(tmp1, w, wsrc, mask, sse);       \
 }
    565 
// Offset-specialized OBMC sub-pixel variance for a w x h block.
// Instead of always running two bilinear passes, dispatch on the filter
// offsets: an offset of 0 means that direction needs no filtering at all,
// and an offset of 4 (the half-pel position) is handled with the cheaper
// 2-tap averaging kernel var_filter_block2d_avg; any other offset uses the
// general bilinear kernel var_filter_block2d_bil_w##w.
// `padding` is the number of extra rows the first (horizontal) pass must
// produce so the second (vertical) pass has enough input rows; the
// instantiations below pass 2 for width-4 blocks (the w4 kernel processes
// rows in pairs) and 1 otherwise.
#define SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(w, h, padding)              \
  unsigned int aom_obmc_sub_pixel_variance##w##x##h##_neon(                   \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,           \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {          \
    if (xoffset == 0) {                                                       \
      if (yoffset == 0) {                                                     \
        /* No filtering at all: measure the prediction directly. */           \
        return aom_obmc_variance##w##x##h##_neon(pre, pre_stride, wsrc, mask, \
                                                 sse);                        \
      } else if (yoffset == 4) {                                              \
        uint8_t tmp[w * h];                                                   \
        /* Half-pel vertical only: pixel_step == pre_stride averages each */  \
        /* pixel with the one directly below it. */                           \
        var_filter_block2d_avg(pre, tmp, pre_stride, pre_stride, w, h);       \
        return aom_obmc_variance##w##x##h##_neon(tmp, w, wsrc, mask, sse);    \
      } else {                                                                \
        uint8_t tmp[w * h];                                                   \
        var_filter_block2d_bil_w##w(pre, tmp, pre_stride, pre_stride, h,      \
                                    yoffset);                                 \
        return aom_obmc_variance##w##x##h##_neon(tmp, w, wsrc, mask, sse);    \
      }                                                                       \
    } else if (xoffset == 4) {                                                \
      uint8_t tmp0[w * (h + padding)];                                        \
      if (yoffset == 0) {                                                     \
        /* Half-pel horizontal only: pixel_step 1 = adjacent columns. */      \
        var_filter_block2d_avg(pre, tmp0, pre_stride, 1, w, h);               \
        return aom_obmc_variance##w##x##h##_neon(tmp0, w, wsrc, mask, sse);   \
      } else if (yoffset == 4) {                                              \
        uint8_t tmp1[w * (h + padding)];                                      \
        /* Horizontal pass emits `padding` extra rows for the vertical. */    \
        var_filter_block2d_avg(pre, tmp0, pre_stride, 1, w, h + padding);     \
        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                       \
        return aom_obmc_variance##w##x##h##_neon(tmp1, w, wsrc, mask, sse);   \
      } else {                                                                \
        uint8_t tmp1[w * (h + padding)];                                      \
        var_filter_block2d_avg(pre, tmp0, pre_stride, 1, w, h + padding);     \
        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);            \
        return aom_obmc_variance##w##x##h##_neon(tmp1, w, wsrc, mask, sse);   \
      }                                                                       \
    } else {                                                                  \
      uint8_t tmp0[w * (h + padding)];                                        \
      if (yoffset == 0) {                                                     \
        var_filter_block2d_bil_w##w(pre, tmp0, pre_stride, 1, h, xoffset);    \
        return aom_obmc_variance##w##x##h##_neon(tmp0, w, wsrc, mask, sse);   \
      } else if (yoffset == 4) {                                              \
        uint8_t tmp1[w * h];                                                  \
        var_filter_block2d_bil_w##w(pre, tmp0, pre_stride, 1, h + padding,    \
                                    xoffset);                                 \
        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                       \
        return aom_obmc_variance##w##x##h##_neon(tmp1, w, wsrc, mask, sse);   \
      } else {                                                                \
        uint8_t tmp1[w * h];                                                  \
        var_filter_block2d_bil_w##w(pre, tmp0, pre_stride, 1, h + padding,    \
                                    xoffset);                                 \
        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);            \
        return aom_obmc_variance##w##x##h##_neon(tmp1, w, wsrc, mask, sse);   \
      }                                                                       \
    }                                                                         \
  }
    620 
// Instantiate the OBMC sub-pixel variance functions. Blocks up to 16x8 use
// the generic always-filter macro; 16x16 and larger use the
// offset-specialized dispatch.
// NOTE(review): the exact generic/specialized split presumably follows
// measured speedups on these block sizes — confirm against upstream
// benchmarks before changing it.
OBMC_SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
OBMC_SUBPEL_VARIANCE_WXH_NEON(4, 8, 2)
OBMC_SUBPEL_VARIANCE_WXH_NEON(4, 16, 2)

OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 1)
OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 1)
OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 1)
OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 1)

OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 4, 1)
OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)
SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 64, 1)

SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 8, 1)
SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)

SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 16, 1)
SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 128, 1)

SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(128, 64, 1)
SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1)

#undef OBMC_SUBPEL_VARIANCE_WXH_NEON
#undef SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON
#endif  // !CONFIG_REALTIME_ONLY
    652 
// Generic masked sub-pixel variance for a w x h block: run the horizontal
// bilinear pass (producing `padding` extra rows so the second pass has
// enough input), then the vertical bilinear pass, then blend the filtered
// prediction with second_pred under msk (aom_comp_mask_pred_neon) before
// measuring the variance against ref.
#define MASKED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding)                         \
  unsigned int aom_masked_sub_pixel_variance##w##x##h##_neon(                  \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,          \
      const uint8_t *msk, int msk_stride, int invert_mask,                     \
      unsigned int *sse) {                                                     \
    uint8_t tmp0[w * (h + padding)];                                           \
    uint8_t tmp1[w * h];                                                       \
    uint8_t tmp2[w * h];                                                       \
    var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding),       \
                                xoffset);                                      \
    var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);                 \
    aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk, msk_stride, \
                            invert_mask);                                      \
    return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse);               \
  }
    669 
// Offset-specialized masked sub-pixel variance for a w x h block.
// Same dispatch scheme as the specialized OBMC macro above: offset 0 skips
// that direction's filter, offset 4 (half-pel) uses the cheaper 2-tap
// averaging kernel var_filter_block2d_avg, and any other offset uses the
// general bilinear kernel var_filter_block2d_bil_w##w. Every branch ends by
// blending the (possibly filtered) prediction with second_pred under msk
// and measuring the variance against ref.
#define SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding)             \
  unsigned int aom_masked_sub_pixel_variance##w##x##h##_neon(                  \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,          \
      const uint8_t *msk, int msk_stride, int invert_mask,                     \
      unsigned int *sse) {                                                     \
    if (xoffset == 0) {                                                        \
      uint8_t tmp0[w * h];                                                     \
      if (yoffset == 0) {                                                      \
        /* No filtering: blend the source block directly. */                   \
        aom_comp_mask_pred_neon(tmp0, second_pred, w, h, src, src_stride, msk, \
                                msk_stride, invert_mask);                      \
        return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse);           \
      } else if (yoffset == 4) {                                               \
        uint8_t tmp1[w * h];                                                   \
        /* Half-pel vertical only: pixel_step == src_stride. */                \
        var_filter_block2d_avg(src, tmp0, src_stride, src_stride, w, h);       \
        aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk,         \
                                msk_stride, invert_mask);                      \
        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);           \
      } else {                                                                 \
        uint8_t tmp1[w * h];                                                   \
        var_filter_block2d_bil_w##w(src, tmp0, src_stride, src_stride, h,      \
                                    yoffset);                                  \
        aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk,         \
                                msk_stride, invert_mask);                      \
        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);           \
      }                                                                        \
    } else if (xoffset == 4) {                                                 \
      uint8_t tmp0[w * (h + padding)];                                         \
      if (yoffset == 0) {                                                      \
        uint8_t tmp1[w * h];                                                   \
        /* Half-pel horizontal only: pixel_step 1 = adjacent columns. */       \
        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h);                \
        aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk,         \
                                msk_stride, invert_mask);                      \
        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);           \
      } else if (yoffset == 4) {                                               \
        uint8_t tmp1[w * h];                                                   \
        uint8_t tmp2[w * h];                                                   \
        /* Horizontal pass emits `padding` extra rows for the vertical. */     \
        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding));    \
        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                        \
        aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk,         \
                                msk_stride, invert_mask);                      \
        return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse);           \
      } else {                                                                 \
        uint8_t tmp1[w * h];                                                   \
        uint8_t tmp2[w * h];                                                   \
        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding));    \
        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);             \
        aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk,         \
                                msk_stride, invert_mask);                      \
        return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse);           \
      }                                                                        \
    } else {                                                                   \
      if (yoffset == 0) {                                                      \
        uint8_t tmp0[w * h];                                                   \
        uint8_t tmp1[w * h];                                                   \
        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset);     \
        aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk,         \
                                msk_stride, invert_mask);                      \
        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);           \
      } else if (yoffset == 4) {                                               \
        uint8_t tmp0[w * (h + padding)];                                       \
        uint8_t tmp1[w * h];                                                   \
        uint8_t tmp2[w * h];                                                   \
        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding),   \
                                    xoffset);                                  \
        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                        \
        aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk,         \
                                msk_stride, invert_mask);                      \
        return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse);           \
      } else {                                                                 \
        uint8_t tmp0[w * (h + padding)];                                       \
        uint8_t tmp1[w * (h + padding)];                                       \
        uint8_t tmp2[w * h];                                                   \
        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding),   \
                                    xoffset);                                  \
        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);             \
        aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk,         \
                                msk_stride, invert_mask);                      \
        return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse);           \
      }                                                                        \
    }                                                                          \
  }
    752 
// Instantiate the masked sub-pixel variance functions. Blocks up to 16x8 use
// the generic always-filter macro; 16x16 and larger use the
// offset-specialized dispatch. `padding` is 2 for width-4 blocks (the w4
// filter kernel processes rows in pairs), 1 otherwise.
MASKED_SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
MASKED_SUBPEL_VARIANCE_WXH_NEON(4, 8, 2)

MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 1)
MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 1)
MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 1)

MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)

SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)

SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 128, 1)

SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(128, 64, 1)
SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1)

// Realtime mode doesn't use 4x rectangular blocks.
#if !CONFIG_REALTIME_ONLY
MASKED_SUBPEL_VARIANCE_WXH_NEON(4, 16, 2)
MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 1)
MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 4, 1)
SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 64, 1)
SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 8, 1)
SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 16, 1)
#endif  // !CONFIG_REALTIME_ONLY

#undef MASKED_SUBPEL_VARIANCE_WXH_NEON
#undef SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON