tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

highbd_variance_sse2.c (38855B)


      1 /*
      2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <assert.h>
     13 #include <emmintrin.h>  // SSE2
     14 
     15 #include "config/aom_config.h"
     16 #include "config/aom_dsp_rtcd.h"
     17 
     18 #include "aom_dsp/x86/synonyms.h"
     19 #include "aom_ports/mem.h"
     20 
     21 #include "av1/common/filter.h"
     22 #include "av1/common/reconinter.h"
     23 
// Signature shared by the per-block SSE2 variance kernels: each kernel
// computes, for one block, the sum of squared differences (written through
// *sse) and the signed sum of differences (written through *sum) between
// src and ref.
typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride,
                                       const uint16_t *ref, int ref_stride,
                                       uint32_t *sse, int *sum);

// 8x8 block kernel; declared here, defined in a separate SSE2 translation
// unit (not visible in this file).
uint32_t aom_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
                                    const uint16_t *ref, int ref_stride,
                                    uint32_t *sse, int *sum);

// 16x16 block kernel; declared here, defined in a separate SSE2 translation
// unit (not visible in this file).
uint32_t aom_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
                                      const uint16_t *ref, int ref_stride,
                                      uint32_t *sse, int *sum);
     35 
     36 static void highbd_8_variance_sse2(const uint16_t *src, int src_stride,
     37                                   const uint16_t *ref, int ref_stride, int w,
     38                                   int h, uint32_t *sse, int *sum,
     39                                   high_variance_fn_t var_fn, int block_size) {
     40  int i, j;
     41 
     42  *sse = 0;
     43  *sum = 0;
     44 
     45  for (i = 0; i < h; i += block_size) {
     46    for (j = 0; j < w; j += block_size) {
     47      unsigned int sse0;
     48      int sum0;
     49      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
     50             ref_stride, &sse0, &sum0);
     51      *sse += sse0;
     52      *sum += sum0;
     53    }
     54  }
     55 }
     56 
     57 static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
     58                                    const uint16_t *ref, int ref_stride, int w,
     59                                    int h, uint32_t *sse, int *sum,
     60                                    high_variance_fn_t var_fn, int block_size) {
     61  int i, j;
     62  uint64_t sse_long = 0;
     63  int32_t sum_long = 0;
     64 
     65  for (i = 0; i < h; i += block_size) {
     66    for (j = 0; j < w; j += block_size) {
     67      unsigned int sse0;
     68      int sum0;
     69      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
     70             ref_stride, &sse0, &sum0);
     71      sse_long += sse0;
     72      sum_long += sum0;
     73    }
     74  }
     75  *sum = ROUND_POWER_OF_TWO(sum_long, 2);
     76  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
     77 }
     78 
     79 static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
     80                                    const uint16_t *ref, int ref_stride, int w,
     81                                    int h, uint32_t *sse, int *sum,
     82                                    high_variance_fn_t var_fn, int block_size) {
     83  int i, j;
     84  uint64_t sse_long = 0;
     85  int32_t sum_long = 0;
     86 
     87  for (i = 0; i < h; i += block_size) {
     88    for (j = 0; j < w; j += block_size) {
     89      unsigned int sse0;
     90      int sum0;
     91      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
     92             ref_stride, &sse0, &sum0);
     93      sse_long += sse0;
     94      sum_long += sum0;
     95    }
     96  }
     97  *sum = ROUND_POWER_OF_TWO(sum_long, 4);
     98  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
     99 }
    100 
// Emits the 8-, 10- and 12-bit variance functions for a w x h block.
// block_size selects the 8x8 or 16x16 SSE2 kernel; shift is log2(w * h),
// so (sum * sum) >> shift is the mean-squared term and the returned value
// is variance = sse - sum^2 / (w * h). The 10/12-bit variants clamp a
// (theoretically possible after rounding) negative result to 0.
#define VAR_FN(w, h, block_size, shift)                                    \
  uint32_t aom_highbd_8_variance##w##x##h##_sse2(                          \
      const uint8_t *src8, int src_stride, const uint8_t *ref8,            \
      int ref_stride, uint32_t *sse) {                                     \
    int sum;                                                               \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                             \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                             \
    highbd_8_variance_sse2(                                                \
        src, src_stride, ref, ref_stride, w, h, sse, &sum,                 \
        aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
    return *sse - (uint32_t)(((int64_t)sum * sum) >> shift);               \
  }                                                                        \
                                                                           \
  uint32_t aom_highbd_10_variance##w##x##h##_sse2(                         \
      const uint8_t *src8, int src_stride, const uint8_t *ref8,            \
      int ref_stride, uint32_t *sse) {                                     \
    int sum;                                                               \
    int64_t var;                                                           \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                             \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                             \
    highbd_10_variance_sse2(                                               \
        src, src_stride, ref, ref_stride, w, h, sse, &sum,                 \
        aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift);               \
    return (var >= 0) ? (uint32_t)var : 0;                                 \
  }                                                                        \
                                                                           \
  uint32_t aom_highbd_12_variance##w##x##h##_sse2(                         \
      const uint8_t *src8, int src_stride, const uint8_t *ref8,            \
      int ref_stride, uint32_t *sse) {                                     \
    int sum;                                                               \
    int64_t var;                                                           \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                             \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                             \
    highbd_12_variance_sse2(                                               \
        src, src_stride, ref, ref_stride, w, h, sse, &sum,                 \
        aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift);               \
    return (var >= 0) ? (uint32_t)var : 0;                                 \
  }
    141 
// Instantiate the variance functions for each supported block size.
// The last argument (shift) is log2(w * h).
VAR_FN(128, 128, 16, 14)
VAR_FN(128, 64, 16, 13)
VAR_FN(64, 128, 16, 13)
VAR_FN(64, 64, 16, 12)
VAR_FN(64, 32, 16, 11)
VAR_FN(32, 64, 16, 11)
VAR_FN(32, 32, 16, 10)
VAR_FN(32, 16, 16, 9)
VAR_FN(16, 32, 16, 9)
VAR_FN(16, 16, 16, 8)
VAR_FN(16, 8, 8, 7)
VAR_FN(8, 16, 8, 7)
VAR_FN(8, 8, 8, 6)

// These additional sizes are compiled out of real-time-only builds.
#if !CONFIG_REALTIME_ONLY
VAR_FN(8, 32, 8, 8)
VAR_FN(32, 8, 8, 8)
VAR_FN(16, 64, 16, 10)
VAR_FN(64, 16, 16, 10)
#endif  // !CONFIG_REALTIME_ONLY

#undef VAR_FN
    164 
    165 unsigned int aom_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride,
    166                                        const uint8_t *ref8, int ref_stride,
    167                                        unsigned int *sse) {
    168  int sum;
    169  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
    170  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
    171  highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
    172                         aom_highbd_calc16x16var_sse2, 16);
    173  return *sse;
    174 }
    175 
    176 unsigned int aom_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
    177                                         const uint8_t *ref8, int ref_stride,
    178                                         unsigned int *sse) {
    179  int sum;
    180  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
    181  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
    182  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
    183                          aom_highbd_calc16x16var_sse2, 16);
    184  return *sse;
    185 }
    186 
    187 unsigned int aom_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
    188                                         const uint8_t *ref8, int ref_stride,
    189                                         unsigned int *sse) {
    190  int sum;
    191  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
    192  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
    193  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
    194                          aom_highbd_calc16x16var_sse2, 16);
    195  return *sse;
    196 }
    197 
    198 unsigned int aom_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride,
    199                                      const uint8_t *ref8, int ref_stride,
    200                                      unsigned int *sse) {
    201  int sum;
    202  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
    203  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
    204  highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
    205                         aom_highbd_calc8x8var_sse2, 8);
    206  return *sse;
    207 }
    208 
    209 unsigned int aom_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
    210                                       const uint8_t *ref8, int ref_stride,
    211                                       unsigned int *sse) {
    212  int sum;
    213  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
    214  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
    215  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
    216                          aom_highbd_calc8x8var_sse2, 8);
    217  return *sse;
    218 }
    219 
    220 unsigned int aom_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
    221                                       const uint8_t *ref8, int ref_stride,
    222                                       unsigned int *sse) {
    223  int sum;
    224  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
    225  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
    226  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
    227                          aom_highbd_calc8x8var_sse2, 8);
    228  return *sse;
    229 }
    230 
// The 2 unused parameters are place holders for PIC enabled build.
// These definitions are for functions defined in
// highbd_subpel_variance_impl_sse2.asm
//
// DECL declares the asm helper that processes a strip of width w and the
// given height. Judging by how the callers below use it: the return value
// is the signed sum of differences, and the sum of squared differences is
// written through *sse.
#define DECL(w, opt)                                                         \
  int aom_highbd_sub_pixel_variance##w##xh_##opt(                            \
      const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
      const uint16_t *dst, ptrdiff_t dst_stride, int height,                 \
      unsigned int *sse, void *unused0, void *unused);
#define DECLS(opt) \
  DECL(8, opt)     \
  DECL(16, opt)

DECLS(sse2)

#undef DECLS
#undef DECL
    247 
// FN emits the 8-, 10- and 12-bit sub-pixel variance functions for a w x h
// block on top of the wf-wide asm strip helpers declared above.
// wlog2/hlog2 are log2(w)/log2(h); cast widens se * se before the shift.
// Blocks wider than wf are covered by up to four wf-wide strips, and blocks
// wider than 64 are handled in two 64-column passes (row_rep).
// The 10-bit variant rescales totals by 2 (sum) / 4 (sse) bits; the 12-bit
// variant rescales by 4 / 8 bits and additionally walks the rows in chunks
// of at most 16 — NOTE(review): presumably so the helper's 32-bit SSE
// accumulator cannot overflow at 12-bit depth; confirm against the asm.
#define FN(w, h, wf, wlog2, hlog2, opt, cast)                                  \
  uint32_t aom_highbd_8_sub_pixel_variance##w##x##h##_##opt(                   \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) {                \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
    int se = 0;                                                                \
    unsigned int sse = 0;                                                      \
    unsigned int sse2;                                                         \
    int row_rep = (w > 64) ? 2 : 1;                                            \
    for (int wd_64 = 0; wd_64 < row_rep; wd_64++) {                            \
      src += wd_64 * 64;                                                       \
      dst += wd_64 * 64;                                                       \
      int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
          src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse2,      \
          NULL, NULL);                                                         \
      se += se2;                                                               \
      sse += sse2;                                                             \
      if (w > wf) {                                                            \
        se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                     \
            src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, h, \
            &sse2, NULL, NULL);                                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
        if (w > wf * 2) {                                                      \
          se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
              src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf,      \
              dst_stride, h, &sse2, NULL, NULL);                               \
          se += se2;                                                           \
          sse += sse2;                                                         \
          se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
              src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf,      \
              dst_stride, h, &sse2, NULL, NULL);                               \
          se += se2;                                                           \
          sse += sse2;                                                         \
        }                                                                      \
      }                                                                        \
    }                                                                          \
    *sse_ptr = sse;                                                            \
    return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2));                \
  }                                                                            \
                                                                               \
  uint32_t aom_highbd_10_sub_pixel_variance##w##x##h##_##opt(                  \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) {                \
    int64_t var;                                                               \
    uint32_t sse;                                                              \
    uint64_t long_sse = 0;                                                     \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
    int se = 0;                                                                \
    int row_rep = (w > 64) ? 2 : 1;                                            \
    for (int wd_64 = 0; wd_64 < row_rep; wd_64++) {                            \
      src += wd_64 * 64;                                                       \
      dst += wd_64 * 64;                                                       \
      int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
          src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \
          NULL);                                                               \
      se += se2;                                                               \
      long_sse += sse;                                                         \
      if (w > wf) {                                                            \
        uint32_t sse2;                                                         \
        se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                     \
            src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, h, \
            &sse2, NULL, NULL);                                                \
        se += se2;                                                             \
        long_sse += sse2;                                                      \
        if (w > wf * 2) {                                                      \
          se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
              src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf,      \
              dst_stride, h, &sse2, NULL, NULL);                               \
          se += se2;                                                           \
          long_sse += sse2;                                                    \
          se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
              src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf,      \
              dst_stride, h, &sse2, NULL, NULL);                               \
          se += se2;                                                           \
          long_sse += sse2;                                                    \
        }                                                                      \
      }                                                                        \
    }                                                                          \
    se = ROUND_POWER_OF_TWO(se, 2);                                            \
    sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 4);                           \
    *sse_ptr = sse;                                                            \
    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
    return (var >= 0) ? (uint32_t)var : 0;                                     \
  }                                                                            \
                                                                               \
  uint32_t aom_highbd_12_sub_pixel_variance##w##x##h##_##opt(                  \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) {                \
    int start_row;                                                             \
    uint32_t sse;                                                              \
    int se = 0;                                                                \
    int64_t var;                                                               \
    uint64_t long_sse = 0;                                                     \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
    int row_rep = (w > 64) ? 2 : 1;                                            \
    for (start_row = 0; start_row < h; start_row += 16) {                      \
      uint32_t sse2;                                                           \
      int height = h - start_row < 16 ? h - start_row : 16;                    \
      uint16_t *src_tmp = src + (start_row * src_stride);                      \
      uint16_t *dst_tmp = dst + (start_row * dst_stride);                      \
      for (int wd_64 = 0; wd_64 < row_rep; wd_64++) {                          \
        src_tmp += wd_64 * 64;                                                 \
        dst_tmp += wd_64 * 64;                                                 \
        int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                 \
            src_tmp, src_stride, x_offset, y_offset, dst_tmp, dst_stride,      \
            height, &sse2, NULL, NULL);                                        \
        se += se2;                                                             \
        long_sse += sse2;                                                      \
        if (w > wf) {                                                          \
          se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
              src_tmp + wf, src_stride, x_offset, y_offset, dst_tmp + wf,      \
              dst_stride, height, &sse2, NULL, NULL);                          \
          se += se2;                                                           \
          long_sse += sse2;                                                    \
          if (w > wf * 2) {                                                    \
            se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                 \
                src_tmp + 2 * wf, src_stride, x_offset, y_offset,              \
                dst_tmp + 2 * wf, dst_stride, height, &sse2, NULL, NULL);      \
            se += se2;                                                         \
            long_sse += sse2;                                                  \
            se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                 \
                src_tmp + 3 * wf, src_stride, x_offset, y_offset,              \
                dst_tmp + 3 * wf, dst_stride, height, &sse2, NULL, NULL);      \
            se += se2;                                                         \
            long_sse += sse2;                                                  \
          }                                                                    \
        }                                                                      \
      }                                                                        \
    }                                                                          \
    se = ROUND_POWER_OF_TWO(se, 4);                                            \
    sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8);                           \
    *sse_ptr = sse;                                                            \
    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
    return (var >= 0) ? (uint32_t)var : 0;                                     \
  }
    387 
// Instantiate the sub-pixel variance functions for every block size.
// FN arguments: w, h, wf (asm strip width), log2(w), log2(h), ISA suffix,
// and the cast used to widen se * se. Real-time-only builds omit the
// rectangular 4:1 / 1:4 extension sizes.
#if CONFIG_REALTIME_ONLY
#define FNS(opt)                         \
  FN(128, 128, 16, 7, 7, opt, (int64_t)) \
  FN(128, 64, 16, 7, 6, opt, (int64_t))  \
  FN(64, 128, 16, 6, 7, opt, (int64_t))  \
  FN(64, 64, 16, 6, 6, opt, (int64_t))   \
  FN(64, 32, 16, 6, 5, opt, (int64_t))   \
  FN(32, 64, 16, 5, 6, opt, (int64_t))   \
  FN(32, 32, 16, 5, 5, opt, (int64_t))   \
  FN(32, 16, 16, 5, 4, opt, (int64_t))   \
  FN(16, 32, 16, 4, 5, opt, (int64_t))   \
  FN(16, 16, 16, 4, 4, opt, (int64_t))   \
  FN(16, 8, 16, 4, 3, opt, (int64_t))    \
  FN(8, 16, 8, 3, 4, opt, (int64_t))     \
  FN(8, 8, 8, 3, 3, opt, (int64_t))      \
  FN(8, 4, 8, 3, 2, opt, (int64_t))
#else  // !CONFIG_REALTIME_ONLY
#define FNS(opt)                         \
  FN(128, 128, 16, 7, 7, opt, (int64_t)) \
  FN(128, 64, 16, 7, 6, opt, (int64_t))  \
  FN(64, 128, 16, 6, 7, opt, (int64_t))  \
  FN(64, 64, 16, 6, 6, opt, (int64_t))   \
  FN(64, 32, 16, 6, 5, opt, (int64_t))   \
  FN(32, 64, 16, 5, 6, opt, (int64_t))   \
  FN(32, 32, 16, 5, 5, opt, (int64_t))   \
  FN(32, 16, 16, 5, 4, opt, (int64_t))   \
  FN(16, 32, 16, 4, 5, opt, (int64_t))   \
  FN(16, 16, 16, 4, 4, opt, (int64_t))   \
  FN(16, 8, 16, 4, 3, opt, (int64_t))    \
  FN(8, 16, 8, 3, 4, opt, (int64_t))     \
  FN(8, 8, 8, 3, 3, opt, (int64_t))      \
  FN(8, 4, 8, 3, 2, opt, (int64_t))      \
  FN(16, 4, 16, 4, 2, opt, (int64_t))    \
  FN(8, 32, 8, 3, 5, opt, (int64_t))     \
  FN(32, 8, 16, 5, 3, opt, (int64_t))    \
  FN(16, 64, 16, 4, 6, opt, (int64_t))   \
  FN(64, 16, 16, 6, 4, opt, (int64_t))
#endif  // CONFIG_REALTIME_ONLY

FNS(sse2)

#undef FNS
#undef FN
    431 
// The 2 unused parameters are place holders for PIC enabled build.
// Declares the asm sub-pixel *avg* variance helper for a strip of width w.
// Same return/out-parameter convention as the plain sub-pixel helpers
// above; `sec`/`sec_stride` carry a second predictor — presumably averaged
// with the filtered prediction inside the asm; confirm against
// the .asm implementation.
#define DECL(w, opt)                                                         \
  int aom_highbd_sub_pixel_avg_variance##w##xh_##opt(                        \
      const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
      const uint16_t *dst, ptrdiff_t dst_stride, const uint16_t *sec,        \
      ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0,    \
      void *unused);
#define DECLS(opt) \
  DECL(16, opt)    \
  DECL(8, opt)

DECLS(sse2)
#undef DECL
#undef DECLS
    446 
    447 #define FN(w, h, wf, wlog2, hlog2, opt, cast)                                  \
    448  uint32_t aom_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt(               \
    449      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
    450      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr,                  \
    451      const uint8_t *sec8) {                                                   \
    452    uint32_t sse;                                                              \
    453    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    454    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
    455    uint16_t *sec = CONVERT_TO_SHORTPTR(sec8);                                 \
    456    int se = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                  \
    457        src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
    458        NULL, NULL);                                                           \
    459    if (w > wf) {                                                              \
    460      uint32_t sse2;                                                           \
    461      int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
    462          src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride,      \
    463          sec + wf, w, h, &sse2, NULL, NULL);                                  \
    464      se += se2;                                                               \
    465      sse += sse2;                                                             \
    466      if (w > wf * 2) {                                                        \
    467        se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
    468            src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf,        \
    469            dst_stride, sec + 2 * wf, w, h, &sse2, NULL, NULL);                \
    470        se += se2;                                                             \
    471        sse += sse2;                                                           \
    472        se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
    473            src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf,        \
    474            dst_stride, sec + 3 * wf, w, h, &sse2, NULL, NULL);                \
    475        se += se2;                                                             \
    476        sse += sse2;                                                           \
    477      }                                                                        \
    478    }                                                                          \
    479    *sse_ptr = sse;                                                            \
    480    return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2));                \
    481  }                                                                            \
    482                                                                               \
    483  uint32_t aom_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt(              \
    484      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
    485      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr,                  \
    486      const uint8_t *sec8) {                                                   \
    487    int64_t var;                                                               \
    488    uint32_t sse;                                                              \
    489    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    490    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
    491    uint16_t *sec = CONVERT_TO_SHORTPTR(sec8);                                 \
    492    int se = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                  \
    493        src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
    494        NULL, NULL);                                                           \
    495    if (w > wf) {                                                              \
    496      uint32_t sse2;                                                           \
    497      int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
    498          src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride,      \
    499          sec + wf, w, h, &sse2, NULL, NULL);                                  \
    500      se += se2;                                                               \
    501      sse += sse2;                                                             \
    502      if (w > wf * 2) {                                                        \
    503        se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
    504            src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf,        \
    505            dst_stride, sec + 2 * wf, w, h, &sse2, NULL, NULL);                \
    506        se += se2;                                                             \
    507        sse += sse2;                                                           \
    508        se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
    509            src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf,        \
    510            dst_stride, sec + 3 * wf, w, h, &sse2, NULL, NULL);                \
    511        se += se2;                                                             \
    512        sse += sse2;                                                           \
    513      }                                                                        \
    514    }                                                                          \
    515    se = ROUND_POWER_OF_TWO(se, 2);                                            \
    516    sse = ROUND_POWER_OF_TWO(sse, 4);                                          \
    517    *sse_ptr = sse;                                                            \
    518    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
    519    return (var >= 0) ? (uint32_t)var : 0;                                     \
    520  }                                                                            \
    521                                                                               \
    522  uint32_t aom_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt(              \
    523      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
    524      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr,                  \
    525      const uint8_t *sec8) {                                                   \
    526    int start_row;                                                             \
    527    int64_t var;                                                               \
    528    uint32_t sse;                                                              \
    529    int se = 0;                                                                \
    530    uint64_t long_sse = 0;                                                     \
    531    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    532    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
    533    uint16_t *sec = CONVERT_TO_SHORTPTR(sec8);                                 \
    534    for (start_row = 0; start_row < h; start_row += 16) {                      \
    535      uint32_t sse2;                                                           \
    536      int height = h - start_row < 16 ? h - start_row : 16;                    \
    537      int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
    538          src + (start_row * src_stride), src_stride, x_offset, y_offset,      \
    539          dst + (start_row * dst_stride), dst_stride, sec + (start_row * w),   \
    540          w, height, &sse2, NULL, NULL);                                       \
    541      se += se2;                                                               \
    542      long_sse += sse2;                                                        \
    543      if (w > wf) {                                                            \
    544        se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
    545            src + wf + (start_row * src_stride), src_stride, x_offset,         \
    546            y_offset, dst + wf + (start_row * dst_stride), dst_stride,         \
    547            sec + wf + (start_row * w), w, height, &sse2, NULL, NULL);         \
    548        se += se2;                                                             \
    549        long_sse += sse2;                                                      \
    550        if (w > wf * 2) {                                                      \
    551          se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
    552              src + 2 * wf + (start_row * src_stride), src_stride, x_offset,   \
    553              y_offset, dst + 2 * wf + (start_row * dst_stride), dst_stride,   \
    554              sec + 2 * wf + (start_row * w), w, height, &sse2, NULL, NULL);   \
    555          se += se2;                                                           \
    556          long_sse += sse2;                                                    \
    557          se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
    558              src + 3 * wf + (start_row * src_stride), src_stride, x_offset,   \
    559              y_offset, dst + 3 * wf + (start_row * dst_stride), dst_stride,   \
    560              sec + 3 * wf + (start_row * w), w, height, &sse2, NULL, NULL);   \
    561          se += se2;                                                           \
    562          long_sse += sse2;                                                    \
    563        }                                                                      \
    564      }                                                                        \
    565    }                                                                          \
    566    se = ROUND_POWER_OF_TWO(se, 4);                                            \
    567    sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8);                           \
    568    *sse_ptr = sse;                                                            \
    569    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
    570    return (var >= 0) ? (uint32_t)var : 0;                                     \
    571  }
    572 
// Instantiate the FN()-generated sub-pixel average variance functions for
// every supported block size. FN argument order:
//   (width, height, wf = column width handled per inner kernel call,
//    log2(width), log2(height), ISA suffix, cast applied to the se * se
//    product before the mean-square correction).
#if CONFIG_REALTIME_ONLY
// Realtime-only builds need only the block sizes from 8x4 up to 64x64.
#define FNS(opt)                       \
  FN(64, 64, 16, 6, 6, opt, (int64_t)) \
  FN(64, 32, 16, 6, 5, opt, (int64_t)) \
  FN(32, 64, 16, 5, 6, opt, (int64_t)) \
  FN(32, 32, 16, 5, 5, opt, (int64_t)) \
  FN(32, 16, 16, 5, 4, opt, (int64_t)) \
  FN(16, 32, 16, 4, 5, opt, (int64_t)) \
  FN(16, 16, 16, 4, 4, opt, (int64_t)) \
  FN(16, 8, 16, 4, 3, opt, (int64_t))  \
  FN(8, 16, 8, 3, 4, opt, (int64_t))   \
  FN(8, 8, 8, 3, 3, opt, (int64_t))    \
  FN(8, 4, 8, 3, 2, opt, (int64_t))
#else  // !CONFIG_REALTIME_ONLY
// Full builds additionally require the extended (4:1 / 1:4) partition
// sizes: 16x4, 8x32, 32x8, 16x64 and 64x16.
#define FNS(opt)                       \
  FN(64, 64, 16, 6, 6, opt, (int64_t)) \
  FN(64, 32, 16, 6, 5, opt, (int64_t)) \
  FN(32, 64, 16, 5, 6, opt, (int64_t)) \
  FN(32, 32, 16, 5, 5, opt, (int64_t)) \
  FN(32, 16, 16, 5, 4, opt, (int64_t)) \
  FN(16, 32, 16, 4, 5, opt, (int64_t)) \
  FN(16, 16, 16, 4, 4, opt, (int64_t)) \
  FN(16, 8, 16, 4, 3, opt, (int64_t))  \
  FN(8, 16, 8, 3, 4, opt, (int64_t))   \
  FN(8, 8, 8, 3, 3, opt, (int64_t))    \
  FN(8, 4, 8, 3, 2, opt, (int64_t))    \
  FN(16, 4, 16, 4, 2, opt, (int64_t))  \
  FN(8, 32, 8, 3, 5, opt, (int64_t))   \
  FN(32, 8, 16, 5, 3, opt, (int64_t))  \
  FN(16, 64, 16, 4, 6, opt, (int64_t)) \
  FN(64, 16, 16, 6, 4, opt, (int64_t))
#endif  // CONFIG_REALTIME_ONLY

// Emit the SSE2 flavor of every function declared by FNS.
FNS(sse2)

// The generator macros are not needed past this point.
#undef FNS
#undef FN
    610 
    611 static uint64_t mse_4xh_16bit_highbd_sse2(uint16_t *dst, int dstride,
    612                                          uint16_t *src, int sstride, int h) {
    613  uint64_t sum = 0;
    614  __m128i reg0_4x16, reg1_4x16;
    615  __m128i src_8x16;
    616  __m128i dst_8x16;
    617  __m128i res0_4x32, res1_4x32, res0_4x64, res1_4x64, res2_4x64, res3_4x64;
    618  __m128i sub_result_8x16;
    619  const __m128i zeros = _mm_setzero_si128();
    620  __m128i square_result = _mm_setzero_si128();
    621  for (int i = 0; i < h; i += 2) {
    622    reg0_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 0) * dstride]));
    623    reg1_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 1) * dstride]));
    624    dst_8x16 = _mm_unpacklo_epi64(reg0_4x16, reg1_4x16);
    625 
    626    reg0_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 0) * sstride]));
    627    reg1_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 1) * sstride]));
    628    src_8x16 = _mm_unpacklo_epi64(reg0_4x16, reg1_4x16);
    629 
    630    sub_result_8x16 = _mm_sub_epi16(src_8x16, dst_8x16);
    631 
    632    res0_4x32 = _mm_unpacklo_epi16(sub_result_8x16, zeros);
    633    res1_4x32 = _mm_unpackhi_epi16(sub_result_8x16, zeros);
    634 
    635    res0_4x32 = _mm_madd_epi16(res0_4x32, res0_4x32);
    636    res1_4x32 = _mm_madd_epi16(res1_4x32, res1_4x32);
    637 
    638    res0_4x64 = _mm_unpacklo_epi32(res0_4x32, zeros);
    639    res1_4x64 = _mm_unpackhi_epi32(res0_4x32, zeros);
    640    res2_4x64 = _mm_unpacklo_epi32(res1_4x32, zeros);
    641    res3_4x64 = _mm_unpackhi_epi32(res1_4x32, zeros);
    642 
    643    square_result = _mm_add_epi64(
    644        square_result,
    645        _mm_add_epi64(
    646            _mm_add_epi64(_mm_add_epi64(res0_4x64, res1_4x64), res2_4x64),
    647            res3_4x64));
    648  }
    649 
    650  const __m128i sum_1x64 =
    651      _mm_add_epi64(square_result, _mm_srli_si128(square_result, 8));
    652  xx_storel_64(&sum, sum_1x64);
    653  return sum;
    654 }
    655 
    656 static uint64_t mse_8xh_16bit_highbd_sse2(uint16_t *dst, int dstride,
    657                                          uint16_t *src, int sstride, int h) {
    658  uint64_t sum = 0;
    659  __m128i src_8x16;
    660  __m128i dst_8x16;
    661  __m128i res0_4x32, res1_4x32, res0_4x64, res1_4x64, res2_4x64, res3_4x64;
    662  __m128i sub_result_8x16;
    663  const __m128i zeros = _mm_setzero_si128();
    664  __m128i square_result = _mm_setzero_si128();
    665 
    666  for (int i = 0; i < h; i++) {
    667    dst_8x16 = _mm_loadu_si128((__m128i *)&dst[i * dstride]);
    668    src_8x16 = _mm_loadu_si128((__m128i *)&src[i * sstride]);
    669 
    670    sub_result_8x16 = _mm_sub_epi16(src_8x16, dst_8x16);
    671 
    672    res0_4x32 = _mm_unpacklo_epi16(sub_result_8x16, zeros);
    673    res1_4x32 = _mm_unpackhi_epi16(sub_result_8x16, zeros);
    674 
    675    res0_4x32 = _mm_madd_epi16(res0_4x32, res0_4x32);
    676    res1_4x32 = _mm_madd_epi16(res1_4x32, res1_4x32);
    677 
    678    res0_4x64 = _mm_unpacklo_epi32(res0_4x32, zeros);
    679    res1_4x64 = _mm_unpackhi_epi32(res0_4x32, zeros);
    680    res2_4x64 = _mm_unpacklo_epi32(res1_4x32, zeros);
    681    res3_4x64 = _mm_unpackhi_epi32(res1_4x32, zeros);
    682 
    683    square_result = _mm_add_epi64(
    684        square_result,
    685        _mm_add_epi64(
    686            _mm_add_epi64(_mm_add_epi64(res0_4x64, res1_4x64), res2_4x64),
    687            res3_4x64));
    688  }
    689 
    690  const __m128i sum_1x64 =
    691      _mm_add_epi64(square_result, _mm_srli_si128(square_result, 8));
    692  xx_storel_64(&sum, sum_1x64);
    693  return sum;
    694 }
    695 
    696 uint64_t aom_mse_wxh_16bit_highbd_sse2(uint16_t *dst, int dstride,
    697                                       uint16_t *src, int sstride, int w,
    698                                       int h) {
    699  assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
    700         "w=8/4 and h=8/4 must satisfy");
    701  switch (w) {
    702    case 4: return mse_4xh_16bit_highbd_sse2(dst, dstride, src, sstride, h);
    703    case 8: return mse_8xh_16bit_highbd_sse2(dst, dstride, src, sstride, h);
    704    default: assert(0 && "unsupported width"); return -1;
    705  }
    706 }