tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

obmc_variance_sse4.c (14510B)


      1 /*
      2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <assert.h>
     13 #include <immintrin.h>
     14 
     15 #include "config/aom_config.h"
     16 #include "config/aom_dsp_rtcd.h"
     17 
     18 #include "aom_ports/mem.h"
     19 #include "aom/aom_integer.h"
     20 
     21 #include "aom_dsp/aom_dsp_common.h"
     22 #include "aom_dsp/aom_filter.h"
     23 #include "aom_dsp/x86/obmc_intrinsic_sse4.h"
     24 #include "aom_dsp/x86/synonyms.h"
     25 #include "aom_dsp/x86/variance_impl_ssse3.h"
     26 
     27 ////////////////////////////////////////////////////////////////////////////////
     28 // 8 bit
     29 ////////////////////////////////////////////////////////////////////////////////
     30 
     31 static inline void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
     32                                     const int32_t *wsrc, const int32_t *mask,
     33                                     unsigned int *const sse, int *const sum,
     34                                     const int w, const int h) {
     35  const int pre_step = pre_stride - w;
     36  int n = 0;
     37  __m128i v_sum_d = _mm_setzero_si128();
     38  __m128i v_sse_d = _mm_setzero_si128();
     39 
     40  assert(w >= 8);
     41  assert(IS_POWER_OF_TWO(w));
     42  assert(IS_POWER_OF_TWO(h));
     43 
     44  do {
     45    const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
     46    const __m128i v_m1_d = xx_load_128(mask + n + 4);
     47    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
     48    const __m128i v_p0_b = xx_loadl_32(pre + n);
     49    const __m128i v_m0_d = xx_load_128(mask + n);
     50    const __m128i v_w0_d = xx_load_128(wsrc + n);
     51 
     52    const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
     53    const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);
     54 
     55    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
     56    // boundaries. We use pmaddwd, as it has lower latency on Haswell
     57    // than pmulld but produces the same result with these inputs.
     58    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
     59    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
     60 
     61    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
     62    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
     63 
     64    const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12);
     65    const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12);
     66    const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d);
     67    const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);
     68 
     69    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d);
     70    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
     71    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
     72 
     73    n += 8;
     74 
     75    if (n % w == 0) pre += pre_step;
     76  } while (n < w * h);
     77 
     78  *sum = xx_hsum_epi32_si32(v_sum_d);
     79  *sse = xx_hsum_epi32_si32(v_sse_d);
     80 }
     81 
// Emits aom_obmc_variance<W>x<H>_sse4_1(): computes the OBMC sum/SSE of a
// WxH prediction block against the weighted source and mask, stores the SSE
// through *sse and returns the variance sse - sum^2 / (W*H). W == 4 uses the
// dedicated 4-wide kernel; wider blocks use the 8n kernel.
#define OBMCVARWXH(W, H)                                               \
 unsigned int aom_obmc_variance##W##x##H##_sse4_1(                    \
     const uint8_t *pre, int pre_stride, const int32_t *wsrc,         \
     const int32_t *mask, unsigned int *sse) {                        \
   int sum;                                                           \
   if (W == 4) {                                                      \
     obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H);     \
   } else {                                                           \
     obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \
   }                                                                  \
   return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));      \
 }
     94 
// Instantiate the 8-bit OBMC variance functions for every supported block
// size.
OBMCVARWXH(128, 128)
OBMCVARWXH(128, 64)
OBMCVARWXH(64, 128)
OBMCVARWXH(64, 64)
OBMCVARWXH(64, 32)
OBMCVARWXH(32, 64)
OBMCVARWXH(32, 32)
OBMCVARWXH(32, 16)
OBMCVARWXH(16, 32)
OBMCVARWXH(16, 16)
OBMCVARWXH(16, 8)
OBMCVARWXH(8, 16)
OBMCVARWXH(8, 8)
OBMCVARWXH(8, 4)
OBMCVARWXH(4, 8)
OBMCVARWXH(4, 4)
OBMCVARWXH(4, 16)
OBMCVARWXH(16, 4)
OBMCVARWXH(8, 32)
OBMCVARWXH(32, 8)
OBMCVARWXH(16, 64)
OBMCVARWXH(64, 16)
    117 
    118 #include "config/aom_dsp_rtcd.h"
    119 
// Emits aom_obmc_sub_pixel_variance<W>x<H>_sse4_1(): bilinearly interpolates
// the prediction by (xoffset, yoffset) in two passes — horizontal into the
// stack buffer fdata3 (H+1 rows, one extra for the vertical filter taps),
// then vertical into temp2 — and returns the OBMC variance of the filtered
// block. NOTE(review): fdata3/temp2 are stack arrays; for 128x128 this is
// roughly 48 KiB of stack — presumably acceptable in this codebase, verify.
#define OBMC_SUBPIX_VAR(W, H)                                                \
 uint32_t aom_obmc_sub_pixel_variance##W##x##H##_sse4_1(                    \
     const uint8_t *pre, int pre_stride, int xoffset, int yoffset,          \
     const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {         \
   uint16_t fdata3[(H + 1) * W];                                            \
   uint8_t temp2[H * W];                                                    \
                                                                            \
   aom_var_filter_block2d_bil_first_pass_ssse3(                             \
       pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
   aom_var_filter_block2d_bil_second_pass_ssse3(                            \
       fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                            \
   return aom_obmc_variance##W##x##H##_sse4_1(temp2, W, wsrc, mask, sse);   \
 }
    134 
// Instantiate the 8-bit sub-pixel OBMC variance functions for every
// supported block size.
OBMC_SUBPIX_VAR(128, 128)
OBMC_SUBPIX_VAR(128, 64)
OBMC_SUBPIX_VAR(64, 128)
OBMC_SUBPIX_VAR(64, 64)
OBMC_SUBPIX_VAR(64, 32)
OBMC_SUBPIX_VAR(32, 64)
OBMC_SUBPIX_VAR(32, 32)
OBMC_SUBPIX_VAR(32, 16)
OBMC_SUBPIX_VAR(16, 32)
OBMC_SUBPIX_VAR(16, 16)
OBMC_SUBPIX_VAR(16, 8)
OBMC_SUBPIX_VAR(8, 16)
OBMC_SUBPIX_VAR(8, 8)
OBMC_SUBPIX_VAR(8, 4)
OBMC_SUBPIX_VAR(4, 8)
OBMC_SUBPIX_VAR(4, 4)
OBMC_SUBPIX_VAR(4, 16)
OBMC_SUBPIX_VAR(16, 4)
OBMC_SUBPIX_VAR(8, 32)
OBMC_SUBPIX_VAR(32, 8)
OBMC_SUBPIX_VAR(16, 64)
OBMC_SUBPIX_VAR(64, 16)
    157 
    158 ////////////////////////////////////////////////////////////////////////////////
    159 // High bit-depth
    160 ////////////////////////////////////////////////////////////////////////////////
    161 #if CONFIG_AV1_HIGHBITDEPTH
    162 static inline void hbd_obmc_variance_w4(
    163    const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
    164    const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int h) {
    165  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
    166  const int pre_step = pre_stride - 4;
    167  int n = 0;
    168  __m128i v_sum_d = _mm_setzero_si128();
    169  __m128i v_sse_d = _mm_setzero_si128();
    170 
    171  assert(IS_POWER_OF_TWO(h));
    172 
    173  do {
    174    const __m128i v_p_w = xx_loadl_64(pre + n);
    175    const __m128i v_m_d = xx_load_128(mask + n);
    176    const __m128i v_w_d = xx_load_128(wsrc + n);
    177 
    178    const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w);
    179 
    180    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    181    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    182    // than pmulld but produces the same result with these inputs.
    183    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
    184 
    185    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
    186    const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12);
    187    const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d);
    188 
    189    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
    190    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
    191 
    192    n += 4;
    193 
    194    if (n % 4 == 0) pre += pre_step;
    195  } while (n < 4 * h);
    196 
    197  *sum = xx_hsum_epi32_si32(v_sum_d);
    198  *sse = xx_hsum_epi32_si32(v_sse_d);
    199 }
    200 
    201 static inline void hbd_obmc_variance_w8n(
    202    const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
    203    const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int w,
    204    const int h) {
    205  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
    206  const int pre_step = pre_stride - w;
    207  int n = 0;
    208  __m128i v_sum_d = _mm_setzero_si128();
    209  __m128i v_sse_d = _mm_setzero_si128();
    210 
    211  assert(w >= 8);
    212  assert(IS_POWER_OF_TWO(w));
    213  assert(IS_POWER_OF_TWO(h));
    214 
    215  do {
    216    const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
    217    const __m128i v_m1_d = xx_load_128(mask + n + 4);
    218    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
    219    const __m128i v_p0_w = xx_loadl_64(pre + n);
    220    const __m128i v_m0_d = xx_load_128(mask + n);
    221    const __m128i v_w0_d = xx_load_128(wsrc + n);
    222 
    223    const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
    224    const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);
    225 
    226    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    227    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    228    // than pmulld but produces the same result with these inputs.
    229    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
    230    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
    231 
    232    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
    233    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
    234 
    235    const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12);
    236    const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12);
    237    const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d);
    238    const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);
    239 
    240    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d);
    241    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
    242    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
    243 
    244    n += 8;
    245 
    246    if (n % w == 0) pre += pre_step;
    247  } while (n < w * h);
    248 
    249  *sum += xx_hsum_epi32_si64(v_sum_d);
    250  *sse += xx_hsum_epi32_si64(v_sse_d);
    251 }
    252 
    253 static inline void highbd_8_obmc_variance(const uint8_t *pre8, int pre_stride,
    254                                          const int32_t *wsrc,
    255                                          const int32_t *mask, int w, int h,
    256                                          unsigned int *sse, int *sum) {
    257  int64_t sum64 = 0;
    258  uint64_t sse64 = 0;
    259  if (w == 4) {
    260    hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
    261  } else {
    262    hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
    263  }
    264  *sum = (int)sum64;
    265  *sse = (unsigned int)sse64;
    266 }
    267 
    268 static inline void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
    269                                           const int32_t *wsrc,
    270                                           const int32_t *mask, int w, int h,
    271                                           unsigned int *sse, int *sum) {
    272  int64_t sum64 = 0;
    273  uint64_t sse64 = 0;
    274  if (w == 4) {
    275    hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
    276  } else if (w < 128 || h < 128) {
    277    hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
    278  } else {
    279    assert(w == 128 && h == 128);
    280 
    281    do {
    282      hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w,
    283                            64);
    284      pre8 += 64 * pre_stride;
    285      wsrc += 64 * w;
    286      mask += 64 * w;
    287      h -= 64;
    288    } while (h > 0);
    289  }
    290  *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
    291  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
    292 }
    293 
    294 static inline void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
    295                                           const int32_t *wsrc,
    296                                           const int32_t *mask, int w, int h,
    297                                           unsigned int *sse, int *sum) {
    298  int64_t sum64 = 0;
    299  uint64_t sse64 = 0;
    300  int max_pel_allowed_per_ovf = 512;
    301  if (w == 4) {
    302    hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
    303  } else if (w * h <= max_pel_allowed_per_ovf) {
    304    hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
    305  } else {
    306    int h_per_ovf = max_pel_allowed_per_ovf / w;
    307 
    308    assert(max_pel_allowed_per_ovf % w == 0);
    309    do {
    310      hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w,
    311                            h_per_ovf);
    312      pre8 += h_per_ovf * pre_stride;
    313      wsrc += h_per_ovf * w;
    314      mask += h_per_ovf * w;
    315      h -= h_per_ovf;
    316    } while (h > 0);
    317  }
    318  *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
    319  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
    320 }
    321 
// Emits the 8-, 10- and 12-bit aom_highbd_*_obmc_variance<W>x<H>_sse4_1()
// wrappers. Each computes sse - sum^2 / (W*H) from the corresponding
// highbd_*_obmc_variance() helper; the 10- and 12-bit variants additionally
// clamp a negative result to 0 (possible because those helpers round the
// sum/sse — see highbd_{10,12}_obmc_variance). The 8-bit variant performs
// no clamping, matching the 8-bit OBMCVARWXH path.
#define HBD_OBMCVARWXH(W, H)                                               \
 unsigned int aom_highbd_8_obmc_variance##W##x##H##_sse4_1(               \
     const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
     const int32_t *mask, unsigned int *sse) {                            \
   int sum;                                                               \
   highbd_8_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);  \
   return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));          \
 }                                                                        \
                                                                          \
 unsigned int aom_highbd_10_obmc_variance##W##x##H##_sse4_1(              \
     const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
     const int32_t *mask, unsigned int *sse) {                            \
   int sum;                                                               \
   int64_t var;                                                           \
   highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
   var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
   return (var >= 0) ? (uint32_t)var : 0;                                 \
 }                                                                        \
                                                                          \
 unsigned int aom_highbd_12_obmc_variance##W##x##H##_sse4_1(              \
     const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
     const int32_t *mask, unsigned int *sse) {                            \
   int sum;                                                               \
   int64_t var;                                                           \
   highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
   var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
   return (var >= 0) ? (uint32_t)var : 0;                                 \
 }
    350 
// Instantiate the high bit-depth OBMC variance functions for every supported
// block size.
HBD_OBMCVARWXH(128, 128)
HBD_OBMCVARWXH(128, 64)
HBD_OBMCVARWXH(64, 128)
HBD_OBMCVARWXH(64, 64)
HBD_OBMCVARWXH(64, 32)
HBD_OBMCVARWXH(32, 64)
HBD_OBMCVARWXH(32, 32)
HBD_OBMCVARWXH(32, 16)
HBD_OBMCVARWXH(16, 32)
HBD_OBMCVARWXH(16, 16)
HBD_OBMCVARWXH(16, 8)
HBD_OBMCVARWXH(8, 16)
HBD_OBMCVARWXH(8, 8)
HBD_OBMCVARWXH(8, 4)
HBD_OBMCVARWXH(4, 8)
HBD_OBMCVARWXH(4, 4)
HBD_OBMCVARWXH(4, 16)
HBD_OBMCVARWXH(16, 4)
HBD_OBMCVARWXH(8, 32)
HBD_OBMCVARWXH(32, 8)
HBD_OBMCVARWXH(16, 64)
HBD_OBMCVARWXH(64, 16)
    373 #endif  // CONFIG_AV1_HIGHBITDEPTH