tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

variance.c (50164B)


      1 /*
      2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 #include <assert.h>
     12 #include <stdlib.h>
     13 
     14 #include "config/aom_config.h"
     15 #include "config/aom_dsp_rtcd.h"
     16 
     17 #include "aom/aom_integer.h"
     18 #include "aom_ports/mem.h"
     19 
     20 #include "aom_dsp/aom_filter.h"
     21 #include "aom_dsp/blend.h"
     22 #include "aom_dsp/variance.h"
     23 
     24 #include "av1/common/filter.h"
     25 #include "av1/common/reconinter.h"
     26 
     27 #if !CONFIG_REALTIME_ONLY
     28 uint32_t aom_get_mb_ss_c(const int16_t *a) {
     29  unsigned int i, sum = 0;
     30 
     31  for (i = 0; i < 256; ++i) {
     32    sum += a[i] * a[i];
     33  }
     34 
     35  return sum;
     36 }
     37 #endif  // !CONFIG_REALTIME_ONLY
     38 
     39 static void variance(const uint8_t *a, int a_stride, const uint8_t *b,
     40                     int b_stride, int w, int h, uint32_t *sse, int *sum) {
     41  int i, j;
     42  int tsum = 0;
     43  uint32_t tsse = 0;
     44 
     45  for (i = 0; i < h; ++i) {
     46    for (j = 0; j < w; ++j) {
     47      const int diff = a[j] - b[j];
     48      tsum += diff;
     49      tsse += diff * diff;
     50    }
     51 
     52    a += a_stride;
     53    b += b_stride;
     54  }
     55  *sum = tsum;
     56  *sse = tsse;
     57 }
     58 
     59 uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
     60                          int b_stride, int w, int h) {
     61  uint32_t sse;
     62  int sum;
     63  variance(a, a_stride, b, b_stride, w, h, &sse, &sum);
     64  return sse;
     65 }
     66 
     67 // Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
     68 // or vertical direction to produce the filtered output block. Used to implement
     69 // the first-pass of 2-D separable filter.
     70 //
     71 // Produces int16_t output to retain precision for the next pass. Two filter
     72 // taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is
     73 // applied horizontally (pixel_step = 1) or vertically (pixel_step = stride).
     74 // It defines the offset required to move from one input to the next.
     75 static void var_filter_block2d_bil_first_pass_c(
     76    const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
     77    unsigned int pixel_step, unsigned int output_height,
     78    unsigned int output_width, const uint8_t *filter) {
     79  unsigned int i, j;
     80 
     81  for (i = 0; i < output_height; ++i) {
     82    for (j = 0; j < output_width; ++j) {
     83      b[j] = ROUND_POWER_OF_TWO(
     84          (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
     85 
     86      ++a;
     87    }
     88 
     89    a += src_pixels_per_line - output_width;
     90    b += output_width;
     91  }
     92 }
     93 
     94 // Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
     95 // or vertical direction to produce the filtered output block. Used to implement
     96 // the second-pass of 2-D separable filter.
     97 //
     98 // Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two
     99 // filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
    100 // filter is applied horizontally (pixel_step = 1) or vertically
    101 // (pixel_step = stride). It defines the offset required to move from one input
    102 // to the next. Output is 8-bit.
    103 static void var_filter_block2d_bil_second_pass_c(
    104    const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
    105    unsigned int pixel_step, unsigned int output_height,
    106    unsigned int output_width, const uint8_t *filter) {
    107  unsigned int i, j;
    108 
    109  for (i = 0; i < output_height; ++i) {
    110    for (j = 0; j < output_width; ++j) {
    111      b[j] = ROUND_POWER_OF_TWO(
    112          (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
    113      ++a;
    114    }
    115 
    116    a += src_pixels_per_line - output_width;
    117    b += output_width;
    118  }
    119 }
    120 
    121 #define VAR(W, H)                                                    \
    122  uint32_t aom_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
    123                                     const uint8_t *b, int b_stride, \
    124                                     uint32_t *sse) {                \
    125    int sum;                                                         \
    126    variance(a, a_stride, b, b_stride, W, H, sse, &sum);             \
    127    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));        \
    128  }
    129 
    130 #define SUBPIX_VAR(W, H)                                                  \
    131  uint32_t aom_sub_pixel_variance##W##x##H##_c(                           \
    132      const uint8_t *a, int a_stride, int xoffset, int yoffset,           \
    133      const uint8_t *b, int b_stride, uint32_t *sse) {                    \
    134    uint16_t fdata3[(H + 1) * W];                                         \
    135    uint8_t temp2[H * W];                                                 \
    136                                                                          \
    137    var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
    138                                        bilinear_filters_2t[xoffset]);    \
    139    var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
    140                                         bilinear_filters_2t[yoffset]);   \
    141                                                                          \
    142    return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse);         \
    143  }
    144 
    145 #define SUBPIX_AVG_VAR(W, H)                                              \
    146  uint32_t aom_sub_pixel_avg_variance##W##x##H##_c(                       \
    147      const uint8_t *a, int a_stride, int xoffset, int yoffset,           \
    148      const uint8_t *b, int b_stride, uint32_t *sse,                      \
    149      const uint8_t *second_pred) {                                       \
    150    uint16_t fdata3[(H + 1) * W];                                         \
    151    uint8_t temp2[H * W];                                                 \
    152    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                           \
    153                                                                          \
    154    var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
    155                                        bilinear_filters_2t[xoffset]);    \
    156    var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
    157                                         bilinear_filters_2t[yoffset]);   \
    158                                                                          \
    159    aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W);                \
    160                                                                          \
    161    return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse);         \
    162  }
    163 
    164 void aom_get_var_sse_sum_8x8_quad_c(const uint8_t *a, int a_stride,
    165                                    const uint8_t *b, int b_stride,
    166                                    uint32_t *sse8x8, int *sum8x8,
    167                                    unsigned int *tot_sse, int *tot_sum,
    168                                    uint32_t *var8x8) {
    169  // Loop over 4 8x8 blocks. Process one 8x32 block.
    170  for (int k = 0; k < 4; k++) {
    171    variance(a + (k * 8), a_stride, b + (k * 8), b_stride, 8, 8, &sse8x8[k],
    172             &sum8x8[k]);
    173  }
    174 
    175  // Calculate variance at 8x8 level and total sse, sum of 8x32 block.
    176  *tot_sse += sse8x8[0] + sse8x8[1] + sse8x8[2] + sse8x8[3];
    177  *tot_sum += sum8x8[0] + sum8x8[1] + sum8x8[2] + sum8x8[3];
    178  for (int i = 0; i < 4; i++)
    179    var8x8[i] = sse8x8[i] - (uint32_t)(((int64_t)sum8x8[i] * sum8x8[i]) >> 6);
    180 }
    181 
    182 void aom_get_var_sse_sum_16x16_dual_c(const uint8_t *src_ptr, int source_stride,
    183                                      const uint8_t *ref_ptr, int ref_stride,
    184                                      uint32_t *sse16x16, unsigned int *tot_sse,
    185                                      int *tot_sum, uint32_t *var16x16) {
    186  int sum16x16[2] = { 0 };
    187  // Loop over two consecutive 16x16 blocks and process as one 16x32 block.
    188  for (int k = 0; k < 2; k++) {
    189    variance(src_ptr + (k * 16), source_stride, ref_ptr + (k * 16), ref_stride,
    190             16, 16, &sse16x16[k], &sum16x16[k]);
    191  }
    192 
    193  // Calculate variance at 16x16 level and total sse, sum of 16x32 block.
    194  *tot_sse += sse16x16[0] + sse16x16[1];
    195  *tot_sum += sum16x16[0] + sum16x16[1];
    196  for (int i = 0; i < 2; i++)
    197    var16x16[i] =
    198        sse16x16[i] - (uint32_t)(((int64_t)sum16x16[i] * sum16x16[i]) >> 8);
    199 }
    200 
    201 /* Identical to the variance call except it does not calculate the
    202 * sse - sum^2 / w*h and returns sse in addtion to modifying the passed in
    203 * variable.
    204 */
    205 #define MSE(W, H)                                               \
    206  uint32_t aom_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
    207                                const uint8_t *b, int b_stride, \
    208                                uint32_t *sse) {                \
    209    int sum;                                                    \
    210    variance(a, a_stride, b, b_stride, W, H, sse, &sum);        \
    211    return *sse;                                                \
    212  }
    213 
    214 /* All three forms of the variance are available in the same sizes. */
    215 #define VARIANCES(W, H) \
    216  VAR(W, H)             \
    217  SUBPIX_VAR(W, H)      \
    218  SUBPIX_AVG_VAR(W, H)
    219 
    220 VARIANCES(128, 128)
    221 VARIANCES(128, 64)
    222 VARIANCES(64, 128)
    223 VARIANCES(64, 64)
    224 VARIANCES(64, 32)
    225 VARIANCES(32, 64)
    226 VARIANCES(32, 32)
    227 VARIANCES(32, 16)
    228 VARIANCES(16, 32)
    229 VARIANCES(16, 16)
    230 VARIANCES(16, 8)
    231 VARIANCES(8, 16)
    232 VARIANCES(8, 8)
    233 VARIANCES(8, 4)
    234 VARIANCES(4, 8)
    235 VARIANCES(4, 4)
    236 
    237 // Realtime mode doesn't use rectangular blocks.
    238 #if !CONFIG_REALTIME_ONLY
    239 VARIANCES(4, 16)
    240 VARIANCES(16, 4)
    241 VARIANCES(8, 32)
    242 VARIANCES(32, 8)
    243 VARIANCES(16, 64)
    244 VARIANCES(64, 16)
    245 #endif
    246 
    247 MSE(16, 16)
    248 MSE(16, 8)
    249 MSE(8, 16)
    250 MSE(8, 8)
    251 
    252 void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
    253                         int height, const uint8_t *ref, int ref_stride) {
    254  int i, j;
    255 
    256  for (i = 0; i < height; ++i) {
    257    for (j = 0; j < width; ++j) {
    258      const int tmp = pred[j] + ref[j];
    259      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
    260    }
    261    comp_pred += width;
    262    pred += width;
    263    ref += ref_stride;
    264  }
    265 }
    266 
    267 #if CONFIG_AV1_HIGHBITDEPTH
    268 static void highbd_variance64(const uint8_t *a8, int a_stride,
    269                              const uint8_t *b8, int b_stride, int w, int h,
    270                              uint64_t *sse, int64_t *sum) {
    271  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
    272  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
    273  int64_t tsum = 0;
    274  uint64_t tsse = 0;
    275  for (int i = 0; i < h; ++i) {
    276    int32_t lsum = 0;
    277    for (int j = 0; j < w; ++j) {
    278      const int diff = a[j] - b[j];
    279      lsum += diff;
    280      tsse += (uint32_t)(diff * diff);
    281    }
    282    tsum += lsum;
    283    a += a_stride;
    284    b += b_stride;
    285  }
    286  *sum = tsum;
    287  *sse = tsse;
    288 }
    289 
    290 uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride,
    291                                 const uint8_t *b, int b_stride, int w, int h) {
    292  uint64_t sse;
    293  int64_t sum;
    294  highbd_variance64(a, a_stride, b, b_stride, w, h, &sse, &sum);
    295  return sse;
    296 }
    297 
    298 static void highbd_8_variance(const uint8_t *a8, int a_stride,
    299                              const uint8_t *b8, int b_stride, int w, int h,
    300                              uint32_t *sse, int *sum) {
    301  uint64_t sse_long = 0;
    302  int64_t sum_long = 0;
    303  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
    304  *sse = (uint32_t)sse_long;
    305  *sum = (int)sum_long;
    306 }
    307 
    308 static void highbd_10_variance(const uint8_t *a8, int a_stride,
    309                               const uint8_t *b8, int b_stride, int w, int h,
    310                               uint32_t *sse, int *sum) {
    311  uint64_t sse_long = 0;
    312  int64_t sum_long = 0;
    313  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
    314  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
    315  *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
    316 }
    317 
    318 static void highbd_12_variance(const uint8_t *a8, int a_stride,
    319                               const uint8_t *b8, int b_stride, int w, int h,
    320                               uint32_t *sse, int *sum) {
    321  uint64_t sse_long = 0;
    322  int64_t sum_long = 0;
    323  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
    324  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
    325  *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
    326 }
    327 
    328 #define HIGHBD_VAR(W, H)                                                       \
    329  uint32_t aom_highbd_8_variance##W##x##H##_c(const uint8_t *a, int a_stride,  \
    330                                              const uint8_t *b, int b_stride,  \
    331                                              uint32_t *sse) {                 \
    332    int sum;                                                                   \
    333    highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum);              \
    334    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));                  \
    335  }                                                                            \
    336                                                                               \
    337  uint32_t aom_highbd_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
    338                                               const uint8_t *b, int b_stride, \
    339                                               uint32_t *sse) {                \
    340    int sum;                                                                   \
    341    int64_t var;                                                               \
    342    highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum);             \
    343    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));                  \
    344    return (var >= 0) ? (uint32_t)var : 0;                                     \
    345  }                                                                            \
    346                                                                               \
    347  uint32_t aom_highbd_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
    348                                               const uint8_t *b, int b_stride, \
    349                                               uint32_t *sse) {                \
    350    int sum;                                                                   \
    351    int64_t var;                                                               \
    352    highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum);             \
    353    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));                  \
    354    return (var >= 0) ? (uint32_t)var : 0;                                     \
    355  }
    356 
    357 #define HIGHBD_MSE(W, H)                                                      \
    358  uint32_t aom_highbd_8_mse##W##x##H##_c(const uint8_t *src, int src_stride,  \
    359                                         const uint8_t *ref, int ref_stride,  \
    360                                         uint32_t *sse) {                     \
    361    int sum;                                                                  \
    362    highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);     \
    363    return *sse;                                                              \
    364  }                                                                           \
    365                                                                              \
    366  uint32_t aom_highbd_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
    367                                          const uint8_t *ref, int ref_stride, \
    368                                          uint32_t *sse) {                    \
    369    int sum;                                                                  \
    370    highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);    \
    371    return *sse;                                                              \
    372  }                                                                           \
    373                                                                              \
    374  uint32_t aom_highbd_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
    375                                          const uint8_t *ref, int ref_stride, \
    376                                          uint32_t *sse) {                    \
    377    int sum;                                                                  \
    378    highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);    \
    379    return *sse;                                                              \
    380  }
    381 
    382 void aom_highbd_var_filter_block2d_bil_first_pass(
    383    const uint8_t *src_ptr8, uint16_t *output_ptr,
    384    unsigned int src_pixels_per_line, int pixel_step,
    385    unsigned int output_height, unsigned int output_width,
    386    const uint8_t *filter) {
    387  unsigned int i, j;
    388  uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
    389  for (i = 0; i < output_height; ++i) {
    390    for (j = 0; j < output_width; ++j) {
    391      output_ptr[j] = ROUND_POWER_OF_TWO(
    392          (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
    393          FILTER_BITS);
    394 
    395      ++src_ptr;
    396    }
    397 
    398    // Next row...
    399    src_ptr += src_pixels_per_line - output_width;
    400    output_ptr += output_width;
    401  }
    402 }
    403 
    404 void aom_highbd_var_filter_block2d_bil_second_pass(
    405    const uint16_t *src_ptr, uint16_t *output_ptr,
    406    unsigned int src_pixels_per_line, unsigned int pixel_step,
    407    unsigned int output_height, unsigned int output_width,
    408    const uint8_t *filter) {
    409  unsigned int i, j;
    410 
    411  for (i = 0; i < output_height; ++i) {
    412    for (j = 0; j < output_width; ++j) {
    413      output_ptr[j] = ROUND_POWER_OF_TWO(
    414          (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
    415          FILTER_BITS);
    416      ++src_ptr;
    417    }
    418 
    419    src_ptr += src_pixels_per_line - output_width;
    420    output_ptr += output_width;
    421  }
    422 }
    423 
    424 #define HIGHBD_SUBPIX_VAR(W, H)                                              \
    425  uint32_t aom_highbd_8_sub_pixel_variance##W##x##H##_c(                     \
    426      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
    427      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    428    uint16_t fdata3[(H + 1) * W];                                            \
    429    uint16_t temp2[H * W];                                                   \
    430                                                                             \
    431    aom_highbd_var_filter_block2d_bil_first_pass(                            \
    432        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    433    aom_highbd_var_filter_block2d_bil_second_pass(                           \
    434        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
    435                                                                             \
    436    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W,  \
    437                                              dst, dst_stride, sse);         \
    438  }                                                                          \
    439                                                                             \
    440  uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_c(                    \
    441      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
    442      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    443    uint16_t fdata3[(H + 1) * W];                                            \
    444    uint16_t temp2[H * W];                                                   \
    445                                                                             \
    446    aom_highbd_var_filter_block2d_bil_first_pass(                            \
    447        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    448    aom_highbd_var_filter_block2d_bil_second_pass(                           \
    449        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
    450                                                                             \
    451    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
    452                                               dst, dst_stride, sse);        \
    453  }                                                                          \
    454                                                                             \
    455  uint32_t aom_highbd_12_sub_pixel_variance##W##x##H##_c(                    \
    456      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
    457      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    458    uint16_t fdata3[(H + 1) * W];                                            \
    459    uint16_t temp2[H * W];                                                   \
    460                                                                             \
    461    aom_highbd_var_filter_block2d_bil_first_pass(                            \
    462        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    463    aom_highbd_var_filter_block2d_bil_second_pass(                           \
    464        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
    465                                                                             \
    466    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
    467                                               dst, dst_stride, sse);        \
    468  }
    469 
    470 #define HIGHBD_SUBPIX_AVG_VAR(W, H)                                          \
    471  uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c(                 \
    472      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
    473      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
    474      const uint8_t *second_pred) {                                          \
    475    uint16_t fdata3[(H + 1) * W];                                            \
    476    uint16_t temp2[H * W];                                                   \
    477    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
    478                                                                             \
    479    aom_highbd_var_filter_block2d_bil_first_pass(                            \
    480        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    481    aom_highbd_var_filter_block2d_bil_second_pass(                           \
    482        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
    483                                                                             \
    484    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
    485                               CONVERT_TO_BYTEPTR(temp2), W);                \
    486                                                                             \
    487    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
    488                                              dst, dst_stride, sse);         \
    489  }                                                                          \
    490                                                                             \
    491  uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c(                \
    492      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
    493      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
    494      const uint8_t *second_pred) {                                          \
    495    uint16_t fdata3[(H + 1) * W];                                            \
    496    uint16_t temp2[H * W];                                                   \
    497    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
    498                                                                             \
    499    aom_highbd_var_filter_block2d_bil_first_pass(                            \
    500        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    501    aom_highbd_var_filter_block2d_bil_second_pass(                           \
    502        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
    503                                                                             \
    504    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
    505                               CONVERT_TO_BYTEPTR(temp2), W);                \
    506                                                                             \
    507    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
    508                                               dst, dst_stride, sse);        \
    509  }                                                                          \
    510                                                                             \
    511  uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c(                \
    512      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
    513      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
    514      const uint8_t *second_pred) {                                          \
    515    uint16_t fdata3[(H + 1) * W];                                            \
    516    uint16_t temp2[H * W];                                                   \
    517    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
    518                                                                             \
    519    aom_highbd_var_filter_block2d_bil_first_pass(                            \
    520        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    521    aom_highbd_var_filter_block2d_bil_second_pass(                           \
    522        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
    523                                                                             \
    524    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
    525                               CONVERT_TO_BYTEPTR(temp2), W);                \
    526                                                                             \
    527    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
    528                                               dst, dst_stride, sse);        \
    529  }                                                                          \
    530                                                                             \
    531 /* All three forms of the variance are available in the same sizes. */
    532 #define HIGHBD_VARIANCES(W, H) \
    533  HIGHBD_VAR(W, H)             \
    534  HIGHBD_SUBPIX_VAR(W, H)      \
    535  HIGHBD_SUBPIX_AVG_VAR(W, H)
    536 
    537 HIGHBD_VARIANCES(128, 128)
    538 HIGHBD_VARIANCES(128, 64)
    539 HIGHBD_VARIANCES(64, 128)
    540 HIGHBD_VARIANCES(64, 64)
    541 HIGHBD_VARIANCES(64, 32)
    542 HIGHBD_VARIANCES(32, 64)
    543 HIGHBD_VARIANCES(32, 32)
    544 HIGHBD_VARIANCES(32, 16)
    545 HIGHBD_VARIANCES(16, 32)
    546 HIGHBD_VARIANCES(16, 16)
    547 HIGHBD_VARIANCES(16, 8)
    548 HIGHBD_VARIANCES(8, 16)
    549 HIGHBD_VARIANCES(8, 8)
    550 HIGHBD_VARIANCES(8, 4)
    551 HIGHBD_VARIANCES(4, 8)
    552 HIGHBD_VARIANCES(4, 4)
    553 
    554 // Realtime mode doesn't use 4x rectangular blocks.
    555 #if !CONFIG_REALTIME_ONLY
    556 HIGHBD_VARIANCES(4, 16)
    557 HIGHBD_VARIANCES(16, 4)
    558 HIGHBD_VARIANCES(8, 32)
    559 HIGHBD_VARIANCES(32, 8)
    560 HIGHBD_VARIANCES(16, 64)
    561 HIGHBD_VARIANCES(64, 16)
    562 #endif
    563 
    564 HIGHBD_MSE(16, 16)
    565 HIGHBD_MSE(16, 8)
    566 HIGHBD_MSE(8, 16)
    567 HIGHBD_MSE(8, 8)
    568 
    569 void aom_highbd_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
    570                                int width, int height, const uint8_t *ref8,
    571                                int ref_stride) {
    572  int i, j;
    573  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
    574  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
    575  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
    576  for (i = 0; i < height; ++i) {
    577    for (j = 0; j < width; ++j) {
    578      const int tmp = pred[j] + ref[j];
    579      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
    580    }
    581    comp_pred += width;
    582    pred += width;
    583    ref += ref_stride;
    584  }
    585 }
    586 #endif  // CONFIG_AV1_HIGHBITDEPTH
    587 
    588 void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
    589                          int height, const uint8_t *ref, int ref_stride,
    590                          const uint8_t *mask, int mask_stride,
    591                          int invert_mask) {
    592  int i, j;
    593  const uint8_t *src0 = invert_mask ? pred : ref;
    594  const uint8_t *src1 = invert_mask ? ref : pred;
    595  const int stride0 = invert_mask ? width : ref_stride;
    596  const int stride1 = invert_mask ? ref_stride : width;
    597  for (i = 0; i < height; ++i) {
    598    for (j = 0; j < width; ++j) {
    599      comp_pred[j] = AOM_BLEND_A64(mask[j], src0[j], src1[j]);
    600    }
    601    comp_pred += width;
    602    src0 += stride0;
    603    src1 += stride1;
    604    mask += mask_stride;
    605  }
    606 }
    607 
    608 #define MASK_SUBPIX_VAR(W, H)                                                 \
    609  unsigned int aom_masked_sub_pixel_variance##W##x##H##_c(                    \
    610      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
    611      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
    612      const uint8_t *msk, int msk_stride, int invert_mask,                    \
    613      unsigned int *sse) {                                                    \
    614    uint16_t fdata3[(H + 1) * W];                                             \
    615    uint8_t temp2[H * W];                                                     \
    616    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                               \
    617                                                                              \
    618    var_filter_block2d_bil_first_pass_c(src, fdata3, src_stride, 1, H + 1, W, \
    619                                        bilinear_filters_2t[xoffset]);        \
    620    var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,           \
    621                                         bilinear_filters_2t[yoffset]);       \
    622                                                                              \
    623    aom_comp_mask_pred_c(temp3, second_pred, W, H, temp2, W, msk, msk_stride, \
    624                         invert_mask);                                        \
    625    return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse);         \
    626  }
    627 
    // Instantiate aom_masked_sub_pixel_variance<W>x<H>_c() for each block size.
    628 MASK_SUBPIX_VAR(4, 4)
    629 MASK_SUBPIX_VAR(4, 8)
    630 MASK_SUBPIX_VAR(8, 4)
    631 MASK_SUBPIX_VAR(8, 8)
    632 MASK_SUBPIX_VAR(8, 16)
    633 MASK_SUBPIX_VAR(16, 8)
    634 MASK_SUBPIX_VAR(16, 16)
    635 MASK_SUBPIX_VAR(16, 32)
    636 MASK_SUBPIX_VAR(32, 16)
    637 MASK_SUBPIX_VAR(32, 32)
    638 MASK_SUBPIX_VAR(32, 64)
    639 MASK_SUBPIX_VAR(64, 32)
    640 MASK_SUBPIX_VAR(64, 64)
    641 MASK_SUBPIX_VAR(64, 128)
    642 MASK_SUBPIX_VAR(128, 64)
    643 MASK_SUBPIX_VAR(128, 128)
    644 
    645 // Realtime-only builds don't use the 4:1 / 1:4 aspect-ratio blocks below.
    646 #if !CONFIG_REALTIME_ONLY
    647 MASK_SUBPIX_VAR(4, 16)
    648 MASK_SUBPIX_VAR(16, 4)
    649 MASK_SUBPIX_VAR(8, 32)
    650 MASK_SUBPIX_VAR(32, 8)
    651 MASK_SUBPIX_VAR(16, 64)
    652 MASK_SUBPIX_VAR(64, 16)
    653 #endif
    654 
    655 #if CONFIG_AV1_HIGHBITDEPTH
    656 void aom_highbd_comp_mask_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
    657                                 int width, int height, const uint8_t *ref8,
    658                                 int ref_stride, const uint8_t *mask,
    659                                 int mask_stride, int invert_mask) {
    660  int i, j;
    661  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
    662  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
    663  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
    664  for (i = 0; i < height; ++i) {
    665    for (j = 0; j < width; ++j) {
    666      if (!invert_mask)
    667        comp_pred[j] = AOM_BLEND_A64(mask[j], ref[j], pred[j]);
    668      else
    669        comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], ref[j]);
    670    }
    671    comp_pred += width;
    672    pred += width;
    673    ref += ref_stride;
    674    mask += mask_stride;
    675  }
    676 }
    677 
    678 #define HIGHBD_MASK_SUBPIX_VAR(W, H)                                           \
    679  unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_c(            \
    680      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
    681      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,          \
    682      const uint8_t *msk, int msk_stride, int invert_mask,                     \
    683      unsigned int *sse) {                                                     \
    684    uint16_t fdata3[(H + 1) * W];                                              \
    685    uint16_t temp2[H * W];                                                     \
    686    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                               \
    687                                                                               \
    688    aom_highbd_var_filter_block2d_bil_first_pass(                              \
    689        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
    690    aom_highbd_var_filter_block2d_bil_second_pass(                             \
    691        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
    692                                                                               \
    693    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,  \
    694                                CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
    695                                invert_mask);                                  \
    696                                                                               \
    697    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,    \
    698                                              ref, ref_stride, sse);           \
    699  }                                                                            \
    700                                                                               \
    701  unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_c(           \
    702      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
    703      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,          \
    704      const uint8_t *msk, int msk_stride, int invert_mask,                     \
    705      unsigned int *sse) {                                                     \
    706    uint16_t fdata3[(H + 1) * W];                                              \
    707    uint16_t temp2[H * W];                                                     \
    708    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                               \
    709                                                                               \
    710    aom_highbd_var_filter_block2d_bil_first_pass(                              \
    711        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
    712    aom_highbd_var_filter_block2d_bil_second_pass(                             \
    713        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
    714                                                                               \
    715    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,  \
    716                                CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
    717                                invert_mask);                                  \
    718                                                                               \
    719    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,   \
    720                                               ref, ref_stride, sse);          \
    721  }                                                                            \
    722                                                                               \
    723  unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_c(           \
    724      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
    725      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,          \
    726      const uint8_t *msk, int msk_stride, int invert_mask,                     \
    727      unsigned int *sse) {                                                     \
    728    uint16_t fdata3[(H + 1) * W];                                              \
    729    uint16_t temp2[H * W];                                                     \
    730    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                               \
    731                                                                               \
    732    aom_highbd_var_filter_block2d_bil_first_pass(                              \
    733        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
    734    aom_highbd_var_filter_block2d_bil_second_pass(                             \
    735        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
    736                                                                               \
    737    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,  \
    738                                CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
    739                                invert_mask);                                  \
    740                                                                               \
    741    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,   \
    742                                               ref, ref_stride, sse);          \
    743  }
    744 
    // Instantiate the 8/10/12-bit masked sub-pixel variance functions for each
    // block size.
    745 HIGHBD_MASK_SUBPIX_VAR(4, 4)
    746 HIGHBD_MASK_SUBPIX_VAR(4, 8)
    747 HIGHBD_MASK_SUBPIX_VAR(8, 4)
    748 HIGHBD_MASK_SUBPIX_VAR(8, 8)
    749 HIGHBD_MASK_SUBPIX_VAR(8, 16)
    750 HIGHBD_MASK_SUBPIX_VAR(16, 8)
    751 HIGHBD_MASK_SUBPIX_VAR(16, 16)
    752 HIGHBD_MASK_SUBPIX_VAR(16, 32)
    753 HIGHBD_MASK_SUBPIX_VAR(32, 16)
    754 HIGHBD_MASK_SUBPIX_VAR(32, 32)
    755 HIGHBD_MASK_SUBPIX_VAR(32, 64)
    756 HIGHBD_MASK_SUBPIX_VAR(64, 32)
    757 HIGHBD_MASK_SUBPIX_VAR(64, 64)
    758 HIGHBD_MASK_SUBPIX_VAR(64, 128)
    759 HIGHBD_MASK_SUBPIX_VAR(128, 64)
    760 HIGHBD_MASK_SUBPIX_VAR(128, 128)
    // The 4:1 / 1:4 aspect-ratio block sizes are compiled out of realtime-only
    // builds.
    761 #if !CONFIG_REALTIME_ONLY
    762 HIGHBD_MASK_SUBPIX_VAR(4, 16)
    763 HIGHBD_MASK_SUBPIX_VAR(16, 4)
    764 HIGHBD_MASK_SUBPIX_VAR(8, 32)
    765 HIGHBD_MASK_SUBPIX_VAR(32, 8)
    766 HIGHBD_MASK_SUBPIX_VAR(16, 64)
    767 HIGHBD_MASK_SUBPIX_VAR(64, 16)
    768 #endif
    769 #endif  // CONFIG_AV1_HIGHBITDEPTH
    770 
    771 #if !CONFIG_REALTIME_ONLY
    772 static inline void obmc_variance(const uint8_t *pre, int pre_stride,
    773                                 const int32_t *wsrc, const int32_t *mask,
    774                                 int w, int h, unsigned int *sse, int *sum) {
    775  int i, j;
    776  unsigned int tsse = 0;
    777  int tsum = 0;
    778 
    779  for (i = 0; i < h; i++) {
    780    for (j = 0; j < w; j++) {
    781      int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
    782      tsum += diff;
    783      tsse += diff * diff;
    784    }
    785 
    786    pre += pre_stride;
    787    wsrc += w;
    788    mask += w;
    789  }
    790  *sse = tsse;
    791  *sum = tsum;
    792 }
    793 
    794 #define OBMC_VAR(W, H)                                            \
    795  unsigned int aom_obmc_variance##W##x##H##_c(                    \
    796      const uint8_t *pre, int pre_stride, const int32_t *wsrc,    \
    797      const int32_t *mask, unsigned int *sse) {                   \
    798    int sum;                                                      \
    799    obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);  \
    800    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
    801  }
    802 
    803 #define OBMC_SUBPIX_VAR(W, H)                                                 \
    804  unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c(                      \
    805      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,           \
    806      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {          \
    807    uint16_t fdata3[(H + 1) * W];                                             \
    808    uint8_t temp2[H * W];                                                     \
    809                                                                              \
    810    var_filter_block2d_bil_first_pass_c(pre, fdata3, pre_stride, 1, H + 1, W, \
    811                                        bilinear_filters_2t[xoffset]);        \
    812    var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,           \
    813                                         bilinear_filters_2t[yoffset]);       \
    814                                                                              \
    815    return aom_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse);         \
    816  }
    817 
    // Instantiate aom_obmc_variance<W>x<H>_c() and
    // aom_obmc_sub_pixel_variance<W>x<H>_c() for each block size. OBMC_VAR
    // comes first in every pair because the sub-pixel function calls the
    // variance function of the same size.
    818 OBMC_VAR(4, 4)
    819 OBMC_SUBPIX_VAR(4, 4)
    820 
    821 OBMC_VAR(4, 8)
    822 OBMC_SUBPIX_VAR(4, 8)
    823 
    824 OBMC_VAR(8, 4)
    825 OBMC_SUBPIX_VAR(8, 4)
    826 
    827 OBMC_VAR(8, 8)
    828 OBMC_SUBPIX_VAR(8, 8)
    829 
    830 OBMC_VAR(8, 16)
    831 OBMC_SUBPIX_VAR(8, 16)
    832 
    833 OBMC_VAR(16, 8)
    834 OBMC_SUBPIX_VAR(16, 8)
    835 
    836 OBMC_VAR(16, 16)
    837 OBMC_SUBPIX_VAR(16, 16)
    838 
    839 OBMC_VAR(16, 32)
    840 OBMC_SUBPIX_VAR(16, 32)
    841 
    842 OBMC_VAR(32, 16)
    843 OBMC_SUBPIX_VAR(32, 16)
    844 
    845 OBMC_VAR(32, 32)
    846 OBMC_SUBPIX_VAR(32, 32)
    847 
    848 OBMC_VAR(32, 64)
    849 OBMC_SUBPIX_VAR(32, 64)
    850 
    851 OBMC_VAR(64, 32)
    852 OBMC_SUBPIX_VAR(64, 32)
    853 
    854 OBMC_VAR(64, 64)
    855 OBMC_SUBPIX_VAR(64, 64)
    856 
    857 OBMC_VAR(64, 128)
    858 OBMC_SUBPIX_VAR(64, 128)
    859 
    860 OBMC_VAR(128, 64)
    861 OBMC_SUBPIX_VAR(128, 64)
    862 
    863 OBMC_VAR(128, 128)
    864 OBMC_SUBPIX_VAR(128, 128)
    865 
    // 4:1 / 1:4 aspect-ratio block sizes. They carry no guard of their own
    // because the surrounding section is already under #if !CONFIG_REALTIME_ONLY.
    866 OBMC_VAR(4, 16)
    867 OBMC_SUBPIX_VAR(4, 16)
    868 OBMC_VAR(16, 4)
    869 OBMC_SUBPIX_VAR(16, 4)
    870 OBMC_VAR(8, 32)
    871 OBMC_SUBPIX_VAR(8, 32)
    872 OBMC_VAR(32, 8)
    873 OBMC_SUBPIX_VAR(32, 8)
    874 OBMC_VAR(16, 64)
    875 OBMC_SUBPIX_VAR(16, 64)
    876 OBMC_VAR(64, 16)
    877 OBMC_SUBPIX_VAR(64, 16)
    878 
    879 #if CONFIG_AV1_HIGHBITDEPTH
    880 static inline void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride,
    881                                          const int32_t *wsrc,
    882                                          const int32_t *mask, int w, int h,
    883                                          uint64_t *sse, int64_t *sum) {
    884  int i, j;
    885  uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
    886  uint64_t tsse = 0;
    887  int64_t tsum = 0;
    888 
    889  for (i = 0; i < h; i++) {
    890    for (j = 0; j < w; j++) {
    891      int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
    892      tsum += diff;
    893      tsse += diff * diff;
    894    }
    895 
    896    pre += pre_stride;
    897    wsrc += w;
    898    mask += w;
    899  }
    900  *sse = tsse;
    901  *sum = tsum;
    902 }
    903 
    904 static inline void highbd_obmc_variance(const uint8_t *pre8, int pre_stride,
    905                                        const int32_t *wsrc,
    906                                        const int32_t *mask, int w, int h,
    907                                        unsigned int *sse, int *sum) {
    908  int64_t sum64;
    909  uint64_t sse64;
    910  highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
    911  *sum = (int)sum64;
    912  *sse = (unsigned int)sse64;
    913 }
    914 
    915 static inline void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
    916                                           const int32_t *wsrc,
    917                                           const int32_t *mask, int w, int h,
    918                                           unsigned int *sse, int *sum) {
    919  int64_t sum64;
    920  uint64_t sse64;
    921  highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
    922  *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
    923  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
    924 }
    925 
    926 static inline void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
    927                                           const int32_t *wsrc,
    928                                           const int32_t *mask, int w, int h,
    929                                           unsigned int *sse, int *sum) {
    930  int64_t sum64;
    931  uint64_t sse64;
    932  highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
    933  *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
    934  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
    935 }
    936 
    937 #define HIGHBD_OBMC_VAR(W, H)                                              \
    938  unsigned int aom_highbd_8_obmc_variance##W##x##H##_c(                    \
    939      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
    940      const int32_t *mask, unsigned int *sse) {                            \
    941    int sum;                                                               \
    942    highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);    \
    943    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));          \
    944  }                                                                        \
    945                                                                           \
    946  unsigned int aom_highbd_10_obmc_variance##W##x##H##_c(                   \
    947      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
    948      const int32_t *mask, unsigned int *sse) {                            \
    949    int sum;                                                               \
    950    int64_t var;                                                           \
    951    highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
    952    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
    953    return (var >= 0) ? (uint32_t)var : 0;                                 \
    954  }                                                                        \
    955                                                                           \
    956  unsigned int aom_highbd_12_obmc_variance##W##x##H##_c(                   \
    957      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
    958      const int32_t *mask, unsigned int *sse) {                            \
    959    int sum;                                                               \
    960    int64_t var;                                                           \
    961    highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
    962    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
    963    return (var >= 0) ? (uint32_t)var : 0;                                 \
    964  }
    965 
    966 #define HIGHBD_OBMC_SUBPIX_VAR(W, H)                                           \
    967  unsigned int aom_highbd_8_obmc_sub_pixel_variance##W##x##H##_c(              \
    968      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,            \
    969      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {           \
    970    uint16_t fdata3[(H + 1) * W];                                              \
    971    uint16_t temp2[H * W];                                                     \
    972                                                                               \
    973    aom_highbd_var_filter_block2d_bil_first_pass(                              \
    974        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
    975    aom_highbd_var_filter_block2d_bil_second_pass(                             \
    976        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
    977                                                                               \
    978    return aom_highbd_8_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2),  \
    979                                                   W, wsrc, mask, sse);        \
    980  }                                                                            \
    981                                                                               \
    982  unsigned int aom_highbd_10_obmc_sub_pixel_variance##W##x##H##_c(             \
    983      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,            \
    984      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {           \
    985    uint16_t fdata3[(H + 1) * W];                                              \
    986    uint16_t temp2[H * W];                                                     \
    987                                                                               \
    988    aom_highbd_var_filter_block2d_bil_first_pass(                              \
    989        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
    990    aom_highbd_var_filter_block2d_bil_second_pass(                             \
    991        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
    992                                                                               \
    993    return aom_highbd_10_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
    994                                                    W, wsrc, mask, sse);       \
    995  }                                                                            \
    996                                                                               \
    997  unsigned int aom_highbd_12_obmc_sub_pixel_variance##W##x##H##_c(             \
    998      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,            \
    999      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {           \
   1000    uint16_t fdata3[(H + 1) * W];                                              \
   1001    uint16_t temp2[H * W];                                                     \
   1002                                                                               \
   1003    aom_highbd_var_filter_block2d_bil_first_pass(                              \
   1004        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
   1005    aom_highbd_var_filter_block2d_bil_second_pass(                             \
   1006        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
   1007                                                                               \
   1008    return aom_highbd_12_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
   1009                                                    W, wsrc, mask, sse);       \
   1010  }
   1011 
   // Instantiate the 8/10/12-bit OBMC variance and OBMC sub-pixel variance
   // functions for each block size. HIGHBD_OBMC_VAR comes first in every pair
   // because the sub-pixel functions call the variance functions of the same
   // size.
   1012 HIGHBD_OBMC_VAR(4, 4)
   1013 HIGHBD_OBMC_SUBPIX_VAR(4, 4)
   1014 
   1015 HIGHBD_OBMC_VAR(4, 8)
   1016 HIGHBD_OBMC_SUBPIX_VAR(4, 8)
   1017 
   1018 HIGHBD_OBMC_VAR(8, 4)
   1019 HIGHBD_OBMC_SUBPIX_VAR(8, 4)
   1020 
   1021 HIGHBD_OBMC_VAR(8, 8)
   1022 HIGHBD_OBMC_SUBPIX_VAR(8, 8)
   1023 
   1024 HIGHBD_OBMC_VAR(8, 16)
   1025 HIGHBD_OBMC_SUBPIX_VAR(8, 16)
   1026 
   1027 HIGHBD_OBMC_VAR(16, 8)
   1028 HIGHBD_OBMC_SUBPIX_VAR(16, 8)
   1029 
   1030 HIGHBD_OBMC_VAR(16, 16)
   1031 HIGHBD_OBMC_SUBPIX_VAR(16, 16)
   1032 
   1033 HIGHBD_OBMC_VAR(16, 32)
   1034 HIGHBD_OBMC_SUBPIX_VAR(16, 32)
   1035 
   1036 HIGHBD_OBMC_VAR(32, 16)
   1037 HIGHBD_OBMC_SUBPIX_VAR(32, 16)
   1038 
   1039 HIGHBD_OBMC_VAR(32, 32)
   1040 HIGHBD_OBMC_SUBPIX_VAR(32, 32)
   1041 
   1042 HIGHBD_OBMC_VAR(32, 64)
   1043 HIGHBD_OBMC_SUBPIX_VAR(32, 64)
   1044 
   1045 HIGHBD_OBMC_VAR(64, 32)
   1046 HIGHBD_OBMC_SUBPIX_VAR(64, 32)
   1047 
   1048 HIGHBD_OBMC_VAR(64, 64)
   1049 HIGHBD_OBMC_SUBPIX_VAR(64, 64)
   1050 
   1051 HIGHBD_OBMC_VAR(64, 128)
   1052 HIGHBD_OBMC_SUBPIX_VAR(64, 128)
   1053 
   1054 HIGHBD_OBMC_VAR(128, 64)
   1055 HIGHBD_OBMC_SUBPIX_VAR(128, 64)
   1056 
   1057 HIGHBD_OBMC_VAR(128, 128)
   1058 HIGHBD_OBMC_SUBPIX_VAR(128, 128)
   1059 
   // 4:1 / 1:4 aspect-ratio block sizes. They carry no guard of their own
   // because the surrounding section is already under #if !CONFIG_REALTIME_ONLY.
   1060 HIGHBD_OBMC_VAR(4, 16)
   1061 HIGHBD_OBMC_SUBPIX_VAR(4, 16)
   1062 HIGHBD_OBMC_VAR(16, 4)
   1063 HIGHBD_OBMC_SUBPIX_VAR(16, 4)
   1064 HIGHBD_OBMC_VAR(8, 32)
   1065 HIGHBD_OBMC_SUBPIX_VAR(8, 32)
   1066 HIGHBD_OBMC_VAR(32, 8)
   1067 HIGHBD_OBMC_SUBPIX_VAR(32, 8)
   1068 HIGHBD_OBMC_VAR(16, 64)
   1069 HIGHBD_OBMC_SUBPIX_VAR(16, 64)
   1070 HIGHBD_OBMC_VAR(64, 16)
   1071 HIGHBD_OBMC_SUBPIX_VAR(64, 16)
   1072 #endif  // CONFIG_AV1_HIGHBITDEPTH
   1073 #endif  // !CONFIG_REALTIME_ONLY
   1074 
   1075 uint64_t aom_mse_wxh_16bit_c(uint8_t *dst, int dstride, uint16_t *src,
   1076                             int sstride, int w, int h) {
   1077  uint64_t sum = 0;
   1078  for (int i = 0; i < h; i++) {
   1079    for (int j = 0; j < w; j++) {
   1080      int e = (uint16_t)dst[i * dstride + j] - src[i * sstride + j];
   1081      sum += e * e;
   1082    }
   1083  }
   1084  return sum;
   1085 }
   1086 
   1087 uint64_t aom_mse_16xh_16bit_c(uint8_t *dst, int dstride, uint16_t *src, int w,
   1088                              int h) {
   1089  uint16_t *src_temp = src;
   1090  uint8_t *dst_temp = dst;
   1091  const int num_blks = 16 / w;
   1092  int64_t sum = 0;
   1093  for (int i = 0; i < num_blks; i++) {
   1094    sum += aom_mse_wxh_16bit_c(dst_temp, dstride, src_temp, w, w, h);
   1095    dst_temp += w;
   1096    src_temp += (w * h);
   1097  }
   1098  return sum;
   1099 }
   1100 
   1101 #if CONFIG_AV1_HIGHBITDEPTH
   1102 uint64_t aom_mse_wxh_16bit_highbd_c(uint16_t *dst, int dstride, uint16_t *src,
   1103                                    int sstride, int w, int h) {
   1104  uint64_t sum = 0;
   1105  for (int i = 0; i < h; i++) {
   1106    for (int j = 0; j < w; j++) {
   1107      int e = dst[i * dstride + j] - src[i * sstride + j];
   1108      sum += e * e;
   1109    }
   1110  }
   1111  return sum;
   1112 }
   1113 #endif  // CONFIG_AV1_HIGHBITDEPTH