tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

obmc_sad_avx2.c (10135B)


      1 /*
      2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <assert.h>
     13 #include <immintrin.h>
     14 
     15 #include "config/aom_config.h"
     16 #include "config/aom_dsp_rtcd.h"
     17 
     18 #include "aom_ports/mem.h"
     19 #include "aom/aom_integer.h"
     20 
     21 #include "aom_dsp/aom_dsp_common.h"
     22 #include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
     23 #include "aom_dsp/x86/synonyms.h"
     24 
     25 ////////////////////////////////////////////////////////////////////////////////
     26 // 8 bit
     27 ////////////////////////////////////////////////////////////////////////////////
     28 
     29 static inline unsigned int obmc_sad_w4_avx2(const uint8_t *pre,
     30                                            const int pre_stride,
     31                                            const int32_t *wsrc,
     32                                            const int32_t *mask,
     33                                            const int height) {
     34  int n = 0;
     35  __m256i v_sad_d = _mm256_setzero_si256();
     36  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
     37 
     38  do {
     39    const __m128i v_p_b_0 = xx_loadl_32(pre);
     40    const __m128i v_p_b_1 = xx_loadl_32(pre + pre_stride);
     41    const __m128i v_p_b = _mm_unpacklo_epi32(v_p_b_0, v_p_b_1);
     42    const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n));
     43    const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
     44 
     45    const __m256i v_p_d = _mm256_cvtepu8_epi32(v_p_b);
     46 
     47    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
     48    // boundaries. We use pmaddwd, as it has lower latency on Haswell
     49    // than pmulld but produces the same result with these inputs.
     50    const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d);
     51 
     52    const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d);
     53    const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d);
     54 
     55    // Rounded absolute difference
     56    const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d);
     57    const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12);
     58 
     59    v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d);
     60 
     61    n += 8;
     62    pre += pre_stride << 1;
     63  } while (n < 8 * (height >> 1));
     64 
     65  __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
     66  __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
     67  v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
     68  return xx_hsum_epi32_si32(v_sad_d_0);
     69 }
     70 
     71 static inline unsigned int obmc_sad_w8n_avx2(
     72    const uint8_t *pre, const int pre_stride, const int32_t *wsrc,
     73    const int32_t *mask, const int width, const int height) {
     74  const int pre_step = pre_stride - width;
     75  int n = 0;
     76  __m256i v_sad_d = _mm256_setzero_si256();
     77  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
     78  assert(width >= 8);
     79  assert(IS_POWER_OF_TWO(width));
     80 
     81  do {
     82    const __m128i v_p0_b = xx_loadl_64(pre + n);
     83    const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n));
     84    const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
     85 
     86    const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p0_b);
     87 
     88    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
     89    // boundaries. We use pmaddwd, as it has lower latency on Haswell
     90    // than pmulld but produces the same result with these inputs.
     91    const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);
     92 
     93    const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
     94    const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d);
     95 
     96    // Rounded absolute difference
     97    const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d);
     98    const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12);
     99 
    100    v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d);
    101 
    102    n += 8;
    103 
    104    if ((n & (width - 1)) == 0) pre += pre_step;
    105  } while (n < width * height);
    106 
    107  __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
    108  __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
    109  v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
    110  return xx_hsum_epi32_si32(v_sad_d_0);
    111 }
    112 
// Emits aom_obmc_sad<w>x<h>_avx2(): the OBMC SAD of a w x h block.
// pre/pre_stride describe the 8-bit prediction; wsrc and msk are 32-bit
// weighted-source and mask buffers (each absolute difference is rounded
// and shifted right by 12 in the kernels). w is a compile-time constant,
// so the dispatch branch below folds away: 4-wide sizes use the
// specialized two-rows-per-iteration kernel, all others the generic one.
#define OBMCSADWXH(w, h)                                          \
  unsigned int aom_obmc_sad##w##x##h##_avx2(                      \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,    \
      const int32_t *msk) {                                       \
    if (w == 4) {                                                 \
      return obmc_sad_w4_avx2(pre, pre_stride, wsrc, msk, h);     \
    } else {                                                      \
      return obmc_sad_w8n_avx2(pre, pre_stride, wsrc, msk, w, h); \
    }                                                             \
  }
    123 
// One aom_obmc_sad<w>x<h>_avx2() definition per supported block size.
OBMCSADWXH(128, 128)
OBMCSADWXH(128, 64)
OBMCSADWXH(64, 128)
OBMCSADWXH(64, 64)
OBMCSADWXH(64, 32)
OBMCSADWXH(32, 64)
OBMCSADWXH(32, 32)
OBMCSADWXH(32, 16)
OBMCSADWXH(16, 32)
OBMCSADWXH(16, 16)
OBMCSADWXH(16, 8)
OBMCSADWXH(8, 16)
OBMCSADWXH(8, 8)
OBMCSADWXH(8, 4)
OBMCSADWXH(4, 8)
OBMCSADWXH(4, 4)
OBMCSADWXH(4, 16)
OBMCSADWXH(16, 4)
OBMCSADWXH(8, 32)
OBMCSADWXH(32, 8)
OBMCSADWXH(16, 64)
OBMCSADWXH(64, 16)
    146 
    147 ////////////////////////////////////////////////////////////////////////////////
    148 // High bit-depth
    149 ////////////////////////////////////////////////////////////////////////////////
    150 
    151 #if CONFIG_AV1_HIGHBITDEPTH
    152 static inline unsigned int hbd_obmc_sad_w4_avx2(const uint8_t *pre8,
    153                                                const int pre_stride,
    154                                                const int32_t *wsrc,
    155                                                const int32_t *mask,
    156                                                const int height) {
    157  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
    158  int n = 0;
    159  __m256i v_sad_d = _mm256_setzero_si256();
    160  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
    161  do {
    162    const __m128i v_p_w_0 = xx_loadl_64(pre);
    163    const __m128i v_p_w_1 = xx_loadl_64(pre + pre_stride);
    164    const __m128i v_p_w = _mm_unpacklo_epi64(v_p_w_0, v_p_w_1);
    165    const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n));
    166    const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
    167 
    168    const __m256i v_p_d = _mm256_cvtepu16_epi32(v_p_w);
    169 
    170    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    171    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    172    // than pmulld but produces the same result with these inputs.
    173    const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d);
    174 
    175    const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d);
    176    const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d);
    177 
    178    // Rounded absolute difference
    179 
    180    const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d);
    181    const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12);
    182 
    183    v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d);
    184 
    185    n += 8;
    186 
    187    pre += pre_stride << 1;
    188  } while (n < 8 * (height >> 1));
    189 
    190  __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
    191  __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
    192  v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
    193  return xx_hsum_epi32_si32(v_sad_d_0);
    194 }
    195 
    196 static inline unsigned int hbd_obmc_sad_w8n_avx2(
    197    const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
    198    const int32_t *mask, const int width, const int height) {
    199  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
    200  const int pre_step = pre_stride - width;
    201  int n = 0;
    202  __m256i v_sad_d = _mm256_setzero_si256();
    203  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
    204 
    205  assert(width >= 8);
    206  assert(IS_POWER_OF_TWO(width));
    207 
    208  do {
    209    const __m128i v_p0_w = _mm_lddqu_si128((__m128i *)(pre + n));
    210    const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n));
    211    const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
    212 
    213    const __m256i v_p0_d = _mm256_cvtepu16_epi32(v_p0_w);
    214 
    215    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    216    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    217    // than pmulld but produces the same result with these inputs.
    218    const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);
    219 
    220    const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
    221    const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d);
    222 
    223    // Rounded absolute difference
    224    const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d);
    225    const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12);
    226 
    227    v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d);
    228 
    229    n += 8;
    230 
    231    if (n % width == 0) pre += pre_step;
    232  } while (n < width * height);
    233 
    234  __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
    235  __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
    236  v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
    237  return xx_hsum_epi32_si32(v_sad_d_0);
    238 }
    239 
// Emits aom_highbd_obmc_sad<w>x<h>_avx2(): the OBMC SAD of a w x h block
// of 16-bit samples (pre is a wrapped pointer, unwrapped with
// CONVERT_TO_SHORTPTR inside the kernels). w is a compile-time constant,
// so the dispatch branch below folds away: 4-wide sizes use the
// specialized two-rows-per-iteration kernel, all others the generic one.
#define HBD_OBMCSADWXH(w, h)                                           \
  unsigned int aom_highbd_obmc_sad##w##x##h##_avx2(                    \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,         \
      const int32_t *mask) {                                           \
    if (w == 4) {                                                      \
      return hbd_obmc_sad_w4_avx2(pre, pre_stride, wsrc, mask, h);     \
    } else {                                                           \
      return hbd_obmc_sad_w8n_avx2(pre, pre_stride, wsrc, mask, w, h); \
    }                                                                  \
  }
    250 
// One aom_highbd_obmc_sad<w>x<h>_avx2() definition per supported block size.
HBD_OBMCSADWXH(128, 128)
HBD_OBMCSADWXH(128, 64)
HBD_OBMCSADWXH(64, 128)
HBD_OBMCSADWXH(64, 64)
HBD_OBMCSADWXH(64, 32)
HBD_OBMCSADWXH(32, 64)
HBD_OBMCSADWXH(32, 32)
HBD_OBMCSADWXH(32, 16)
HBD_OBMCSADWXH(16, 32)
HBD_OBMCSADWXH(16, 16)
HBD_OBMCSADWXH(16, 8)
HBD_OBMCSADWXH(8, 16)
HBD_OBMCSADWXH(8, 8)
HBD_OBMCSADWXH(8, 4)
HBD_OBMCSADWXH(4, 8)
HBD_OBMCSADWXH(4, 4)
HBD_OBMCSADWXH(4, 16)
HBD_OBMCSADWXH(16, 4)
HBD_OBMCSADWXH(8, 32)
HBD_OBMCSADWXH(32, 8)
HBD_OBMCSADWXH(16, 64)
HBD_OBMCSADWXH(64, 16)
    273 #endif  // CONFIG_AV1_HIGHBITDEPTH