[ tor-browser ].git.dasho

obmc_sad_sse4.c (9537B)
      1 /*
      2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <assert.h>
     13 #include <immintrin.h>
     14 
     15 #include "config/aom_config.h"
     16 #include "config/aom_dsp_rtcd.h"
     17 
     18 #include "aom_ports/mem.h"
     19 #include "aom/aom_integer.h"
     20 
     21 #include "aom_dsp/aom_dsp_common.h"
     22 #include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
     23 #include "aom_dsp/x86/synonyms.h"
     24 
     25 ////////////////////////////////////////////////////////////////////////////////
     26 // 8 bit
     27 ////////////////////////////////////////////////////////////////////////////////
     28 
     29 static AOM_FORCE_INLINE unsigned int obmc_sad_w4(const uint8_t *pre,
     30                                                 const int pre_stride,
     31                                                 const int32_t *wsrc,
     32                                                 const int32_t *mask,
     33                                                 const int height) {
     34  const int pre_step = pre_stride - 4;
     35  int n = 0;
     36  __m128i v_sad_d = _mm_setzero_si128();
     37 
     38  do {
     39    const __m128i v_p_b = xx_loadl_32(pre + n);
     40    const __m128i v_m_d = xx_load_128(mask + n);
     41    const __m128i v_w_d = xx_load_128(wsrc + n);
     42 
     43    const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);
     44 
     45    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
     46    // boundaries. We use pmaddwd, as it has lower latency on Haswell
     47    // than pmulld but produces the same result with these inputs.
     48    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
     49 
     50    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
     51    const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);
     52 
     53    // Rounded absolute difference
     54    const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12);
     55 
     56    v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d);
     57 
     58    n += 4;
     59 
     60    if (n % 4 == 0) pre += pre_step;
     61  } while (n < 4 * height);
     62 
     63  return xx_hsum_epi32_si32(v_sad_d);
     64 }
     65 
     66 static AOM_FORCE_INLINE unsigned int obmc_sad_w8n(
     67    const uint8_t *pre, const int pre_stride, const int32_t *wsrc,
     68    const int32_t *mask, const int width, const int height) {
     69  const int pre_step = pre_stride - width;
     70  int n = 0;
     71  __m128i v_sad_d = _mm_setzero_si128();
     72 
     73  assert(width >= 8);
     74  assert(IS_POWER_OF_TWO(width));
     75 
     76  do {
     77    const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
     78    const __m128i v_m1_d = xx_load_128(mask + n + 4);
     79    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
     80    const __m128i v_p0_b = xx_loadl_32(pre + n);
     81    const __m128i v_m0_d = xx_load_128(mask + n);
     82    const __m128i v_w0_d = xx_load_128(wsrc + n);
     83 
     84    const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
     85    const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);
     86 
     87    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
     88    // boundaries. We use pmaddwd, as it has lower latency on Haswell
     89    // than pmulld but produces the same result with these inputs.
     90    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
     91    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
     92 
     93    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
     94    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
     95    const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
     96    const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);
     97 
     98    // Rounded absolute difference
     99    const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);
    100    const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);
    101 
    102    v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);
    103    v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);
    104 
    105    n += 8;
    106 
    107    if (n % width == 0) pre += pre_step;
    108  } while (n < width * height);
    109 
    110  return xx_hsum_epi32_si32(v_sad_d);
    111 }
    112 
    // Defines aom_obmc_sad<w>x<h>_sse4_1() for one block size. Width 4 is
    // handled by obmc_sad_w4(); every other width goes through obmc_sad_w8n().
    #define OBMCSADWXH(w, h)                                             \
      unsigned int aom_obmc_sad##w##x##h##_sse4_1(                       \
          const uint8_t *pre, int pre_stride, const int32_t *wsrc,       \
          const int32_t *msk) {                                          \
        return (w) == 4                                                  \
                   ? obmc_sad_w4(pre, pre_stride, wsrc, msk, (h))        \
                   : obmc_sad_w8n(pre, pre_stride, wsrc, msk, (w), (h)); \
      }

    // Instantiate the 8-bit kernel for each block size listed below.
    OBMCSADWXH(128, 128)
    OBMCSADWXH(128, 64)
    OBMCSADWXH(64, 128)
    OBMCSADWXH(64, 64)
    OBMCSADWXH(64, 32)
    OBMCSADWXH(32, 64)
    OBMCSADWXH(32, 32)
    OBMCSADWXH(32, 16)
    OBMCSADWXH(16, 32)
    OBMCSADWXH(16, 16)
    OBMCSADWXH(16, 8)
    OBMCSADWXH(8, 16)
    OBMCSADWXH(8, 8)
    OBMCSADWXH(8, 4)
    OBMCSADWXH(4, 8)
    OBMCSADWXH(4, 4)
    OBMCSADWXH(4, 16)
    OBMCSADWXH(16, 4)
    OBMCSADWXH(8, 32)
    OBMCSADWXH(32, 8)
    OBMCSADWXH(16, 64)
    OBMCSADWXH(64, 16)
    146 
    147 ////////////////////////////////////////////////////////////////////////////////
    148 // High bit-depth
    149 ////////////////////////////////////////////////////////////////////////////////
    150 
    151 #if CONFIG_AV1_HIGHBITDEPTH
    152 static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8,
    153                                                     const int pre_stride,
    154                                                     const int32_t *wsrc,
    155                                                     const int32_t *mask,
    156                                                     const int height) {
    157  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
    158  const int pre_step = pre_stride - 4;
    159  int n = 0;
    160  __m128i v_sad_d = _mm_setzero_si128();
    161 
    162  do {
    163    const __m128i v_p_w = xx_loadl_64(pre + n);
    164    const __m128i v_m_d = xx_load_128(mask + n);
    165    const __m128i v_w_d = xx_load_128(wsrc + n);
    166 
    167    const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w);
    168 
    169    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    170    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    171    // than pmulld but produces the same result with these inputs.
    172    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
    173 
    174    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
    175    const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);
    176 
    177    // Rounded absolute difference
    178    const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12);
    179 
    180    v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d);
    181 
    182    n += 4;
    183 
    184    if (n % 4 == 0) pre += pre_step;
    185  } while (n < 4 * height);
    186 
    187  return xx_hsum_epi32_si32(v_sad_d);
    188 }
    189 
    190 static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w8n(
    191    const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
    192    const int32_t *mask, const int width, const int height) {
    193  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
    194  const int pre_step = pre_stride - width;
    195  int n = 0;
    196  __m128i v_sad_d = _mm_setzero_si128();
    197 
    198  assert(width >= 8);
    199  assert(IS_POWER_OF_TWO(width));
    200 
    201  do {
    202    const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
    203    const __m128i v_m1_d = xx_load_128(mask + n + 4);
    204    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
    205    const __m128i v_p0_w = xx_loadl_64(pre + n);
    206    const __m128i v_m0_d = xx_load_128(mask + n);
    207    const __m128i v_w0_d = xx_load_128(wsrc + n);
    208 
    209    const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
    210    const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);
    211 
    212    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    213    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    214    // than pmulld but produces the same result with these inputs.
    215    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
    216    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
    217 
    218    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
    219    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
    220    const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
    221    const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);
    222 
    223    // Rounded absolute difference
    224    const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);
    225    const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);
    226 
    227    v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);
    228    v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);
    229 
    230    n += 8;
    231 
    232    if (n % width == 0) pre += pre_step;
    233  } while (n < width * height);
    234 
    235  return xx_hsum_epi32_si32(v_sad_d);
    236 }
    237 
    // Defines aom_highbd_obmc_sad<w>x<h>_sse4_1() for one block size. Width 4 is
    // handled by hbd_obmc_sad_w4(); every other width goes through
    // hbd_obmc_sad_w8n().
    #define HBD_OBMCSADWXH(w, h)                                              \
      unsigned int aom_highbd_obmc_sad##w##x##h##_sse4_1(                     \
          const uint8_t *pre, int pre_stride, const int32_t *wsrc,            \
          const int32_t *mask) {                                              \
        return (w) == 4                                                       \
                   ? hbd_obmc_sad_w4(pre, pre_stride, wsrc, mask, (h))        \
                   : hbd_obmc_sad_w8n(pre, pre_stride, wsrc, mask, (w), (h)); \
      }

    // Instantiate the high bit-depth kernel for each block size listed below.
    HBD_OBMCSADWXH(128, 128)
    HBD_OBMCSADWXH(128, 64)
    HBD_OBMCSADWXH(64, 128)
    HBD_OBMCSADWXH(64, 64)
    HBD_OBMCSADWXH(64, 32)
    HBD_OBMCSADWXH(32, 64)
    HBD_OBMCSADWXH(32, 32)
    HBD_OBMCSADWXH(32, 16)
    HBD_OBMCSADWXH(16, 32)
    HBD_OBMCSADWXH(16, 16)
    HBD_OBMCSADWXH(16, 8)
    HBD_OBMCSADWXH(8, 16)
    HBD_OBMCSADWXH(8, 8)
    HBD_OBMCSADWXH(8, 4)
    HBD_OBMCSADWXH(4, 8)
    HBD_OBMCSADWXH(4, 4)
    HBD_OBMCSADWXH(4, 16)
    HBD_OBMCSADWXH(16, 4)
    HBD_OBMCSADWXH(8, 32)
    HBD_OBMCSADWXH(32, 8)
    HBD_OBMCSADWXH(16, 64)
    HBD_OBMCSADWXH(64, 16)
    271 #endif  // CONFIG_AV1_HIGHBITDEPTH
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE