tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

sad_avx2.c (11176B)


      1 /*
      2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 #include <immintrin.h>
     12 #include <stdint.h>
     13 
     14 #include "config/aom_dsp_rtcd.h"
     15 
     16 #include "aom_ports/mem.h"
     17 
     18 // SAD, SAD_SKIP and SAD_AVG for 64xh blocks
     19 #if !CONFIG_HIGHWAY
     // Returns the sum of absolute differences between two 64-byte-wide blocks of
     // h rows. Every row is covered by two unaligned 32-byte loads per buffer;
     // src_ptr and ref_ptr then advance by src_stride and ref_stride bytes.
     static inline unsigned int sad64xh_avx2(const uint8_t *src_ptr, int src_stride,
                                             const uint8_t *ref_ptr, int ref_stride,
                                             int h) {
       // Four 64-bit lanes, each carrying a running partial sum in its low bits.
       __m256i acc = _mm256_setzero_si256();
       for (int row = 0; row < h; ++row) {
         const __m256i ref_lo = _mm256_loadu_si256((const __m256i *)ref_ptr);
         const __m256i ref_hi = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32));
         const __m256i src_lo = _mm256_loadu_si256((const __m256i *)src_ptr);
         const __m256i src_hi = _mm256_loadu_si256((const __m256i *)(src_ptr + 32));
         const __m256i row_sad = _mm256_add_epi32(_mm256_sad_epu8(ref_lo, src_lo),
                                                  _mm256_sad_epu8(ref_hi, src_hi));
         acc = _mm256_add_epi32(acc, row_sad);
         src_ptr += src_stride;
         ref_ptr += ref_stride;
       }

       // Fold the four lane totals into one: combine the two 128-bit halves, then
       // add the upper 64-bit lane onto the lower one.
       __m128i total = _mm_add_epi32(_mm256_castsi256_si128(acc),
                                     _mm256_extracti128_si256(acc, 1));
       total = _mm_add_epi32(total, _mm_srli_si128(total, 8));
       const unsigned int sad = (unsigned int)_mm_cvtsi128_si32(total);
       _mm256_zeroupper();
       return sad;
     }
     47 
     // Emits aom_sad64x<h>_avx2(): the SAD of a 64x<h> block, computed by
     // sad64xh_avx2() with the height fixed at expansion time.
     #define FSAD64_H(h)                                                    \
       unsigned int aom_sad64x##h##_avx2(                                   \
           const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,  \
           int ref_stride) {                                                \
         const unsigned int sad =                                           \
             sad64xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, (h));   \
         return sad;                                                        \
       }
     53 
     // Emits aom_sad_skip_64x<h>_avx2(): a SAD estimate that measures only every
     // other row (both strides doubled, height halved) and doubles the partial
     // sum to stand in for the rows that were skipped.
     #define FSADS64_H(h)                                                    \
       unsigned int aom_sad_skip_64x##h##_avx2(                              \
           const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,   \
           int ref_stride) {                                                 \
         const unsigned int half_rows_sad = sad64xh_avx2(                    \
             src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, (h) / 2);     \
         return 2 * half_rows_sad;                                           \
       }
     61 
     62 #define FSAD64  \
     63  FSAD64_H(64)  \
     64  FSAD64_H(32)  \
     65  FSADS64_H(64) \
     66  FSADS64_H(32)
     67 
     68 /* clang-format off */
     69 FSAD64
     70 /* clang-format on */
     71 
     72 #undef FSAD64
     73 #undef FSAD64_H
     74 
     // Averaging SAD kernel for 64-byte-wide blocks of h rows. Each reference row
     // is combined with 64 bytes of second_pred using _mm256_avg_epu8 (rounded
     // byte average) and the result is compared against the source row.
     //   src_ptr / src_stride: source block and its row pitch in bytes.
     //   ref_ptr / ref_stride: reference block and its row pitch in bytes.
     //   second_pred: packed second predictor, consumed 64 bytes per row.
     // Returns the sum of absolute differences over all 64 * h bytes.
     //
     // Kept as a function (mirroring sad64xh_avx2) rather than inside the generator
     // macro so the kernel is type-checked and exists in one debuggable place.
     static inline unsigned int sad64xh_avg_avx2(
         const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
         int ref_stride, const uint8_t *second_pred, int h) {
       __m256i sum_sad = _mm256_setzero_si256();
       for (int i = 0; i < h; i++) {
         __m256i ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
         __m256i ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));
         ref1_reg = _mm256_avg_epu8(
             ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred));
         ref2_reg = _mm256_avg_epu8(
             ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32)));
         const __m256i sad1_reg = _mm256_sad_epu8(
             ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
         const __m256i sad2_reg = _mm256_sad_epu8(
             ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));
         sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
         ref_ptr += ref_stride;
         src_ptr += src_stride;
         second_pred += 64;  // second_pred rows are contiguous, 64 bytes each.
       }
       // Horizontal reduction: add the odd 64-bit lanes onto the even ones, then
       // add the upper 128-bit half onto the lower half.
       const __m256i sum_sad_h = _mm256_srli_si256(sum_sad, 8);
       sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
       __m128i sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
       sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
       const unsigned int res = (unsigned int)_mm_cvtsi128_si32(sum_sad128);
       _mm256_zeroupper();
       return res;
     }

     // Emits aom_sad64x<h>_avg_avx2() as a thin wrapper over sad64xh_avg_avx2().
     #define FSADAVG64_H(h)                                                   \
       unsigned int aom_sad64x##h##_avg_avx2(                                 \
           const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,    \
           int ref_stride, const uint8_t *second_pred) {                      \
         return sad64xh_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride,    \
                                 second_pred, h);                             \
       }
    109 
    110 #define FSADAVG64 \
    111  FSADAVG64_H(64) \
    112  FSADAVG64_H(32)
    113 
    114 /* clang-format off */
    115 FSADAVG64
    116 /* clang-format on */
    117 
    118 #undef FSADAVG64
    119 #undef FSADAVG64_H
    120 #endif  // !CONFIG_HIGHWAY
    121 
    122 // SAD, SAD_SKIP and SAD_AVG for 32xh blocks
    // Returns the sum of absolute differences between two 32-byte-wide blocks.
    // Rows are consumed two at a time (one 32-byte load per row per buffer), so
    // only h / 2 row pairs are visited; an odd trailing row is not measured.
    static inline unsigned int sad32xh_avx2(const uint8_t *src_ptr, int src_stride,
                                            const uint8_t *ref_ptr, int ref_stride,
                                            int h) {
      const int row_pairs = h >> 1;
      const int src_pair_step = src_stride << 1;
      const int ref_pair_step = ref_stride << 1;

      // Four 64-bit lanes, each carrying a running partial sum in its low bits.
      __m256i acc = _mm256_setzero_si256();
      for (int pair = 0; pair < row_pairs; ++pair) {
        const __m256i ref_row0 = _mm256_loadu_si256((const __m256i *)ref_ptr);
        const __m256i ref_row1 =
            _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
        const __m256i src_row0 = _mm256_loadu_si256((const __m256i *)src_ptr);
        const __m256i src_row1 =
            _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
        const __m256i pair_sad =
            _mm256_add_epi32(_mm256_sad_epu8(ref_row0, src_row0),
                             _mm256_sad_epu8(ref_row1, src_row1));
        acc = _mm256_add_epi32(acc, pair_sad);
        src_ptr += src_pair_step;
        ref_ptr += ref_pair_step;
      }

      // Fold the four lane totals into one: combine the two 128-bit halves, then
      // add the upper 64-bit lane onto the lower one.
      __m128i total = _mm_add_epi32(_mm256_castsi256_si128(acc),
                                    _mm256_extracti128_si256(acc, 1));
      total = _mm_add_epi32(total, _mm_srli_si128(total, 8));
      const unsigned int sad = (unsigned int)_mm_cvtsi128_si32(total);
      _mm256_zeroupper();
      return sad;
    }
    153 
    // Emits aom_sad32x<h>_avx2(): the SAD of a 32x<h> block, computed by
    // sad32xh_avx2() with the height fixed at expansion time.
    #define FSAD32_H(h)                                                    \
      unsigned int aom_sad32x##h##_avx2(                                   \
          const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,  \
          int ref_stride) {                                                \
        const unsigned int sad =                                           \
            sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, (h));   \
        return sad;                                                        \
      }
    159 
    // Emits aom_sad_skip_32x<h>_avx2(): a SAD estimate that measures only every
    // other row (both strides doubled, height halved) and doubles the partial
    // sum to stand in for the rows that were skipped.
    #define FSADS32_H(h)                                                    \
      unsigned int aom_sad_skip_32x##h##_avx2(                              \
          const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,   \
          int ref_stride) {                                                 \
        const unsigned int half_rows_sad = sad32xh_avx2(                    \
            src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, (h) / 2);     \
        return 2 * half_rows_sad;                                           \
      }
    167 
    168 #define FSAD32  \
    169  FSAD32_H(64)  \
    170  FSAD32_H(32)  \
    171  FSAD32_H(16)  \
    172  FSADS32_H(64) \
    173  FSADS32_H(32) \
    174  FSADS32_H(16)
    175 
    176 /* clang-format off */
    177 FSAD32
    178 /* clang-format on */
    179 
    180 #undef FSAD32
    181 #undef FSAD32_H
    182 
    // Averaging SAD kernel for 32-byte-wide blocks. Rows are consumed two at a
    // time: each reference row is combined with the matching 32 bytes of
    // second_pred using _mm256_avg_epu8 (rounded byte average) and the result is
    // compared against the source row.
    //   src_ptr / src_stride: source block and its row pitch in bytes.
    //   ref_ptr / ref_stride: reference block and its row pitch in bytes.
    //   second_pred: packed second predictor, consumed 32 bytes per row.
    //   h: block height; only h / 2 complete row pairs are measured.
    // Returns the sum of absolute differences over the rows visited.
    //
    // Kept as a function (mirroring sad32xh_avx2) rather than inside the generator
    // macro so the kernel is type-checked and exists in one debuggable place.
    static inline unsigned int sad32xh_avg_avx2(
        const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
        int ref_stride, const uint8_t *second_pred, int h) {
      __m256i sum_sad = _mm256_setzero_si256();
      const int ref2_stride = ref_stride * 2;
      const int src2_stride = src_stride * 2;
      const int max = h >> 1;
      for (int i = 0; i < max; i++) {
        __m256i ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
        __m256i ref2_reg =
            _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride));
        ref1_reg = _mm256_avg_epu8(
            ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred));
        ref2_reg = _mm256_avg_epu8(
            ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32)));
        const __m256i sad1_reg = _mm256_sad_epu8(
            ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
        const __m256i sad2_reg = _mm256_sad_epu8(
            ref2_reg,
            _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));
        sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
        ref_ptr += ref2_stride;
        src_ptr += src2_stride;
        second_pred += 64;  // Two packed 32-byte rows were consumed.
      }
      // Horizontal reduction: add the odd 64-bit lanes onto the even ones, then
      // add the upper 128-bit half onto the lower half.
      const __m256i sum_sad_h = _mm256_srli_si256(sum_sad, 8);
      sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
      __m128i sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
      sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
      const unsigned int res = (unsigned int)_mm_cvtsi128_si32(sum_sad128);
      _mm256_zeroupper();
      return res;
    }

    // Emits aom_sad32x<h>_avg_avx2() as a thin wrapper over sad32xh_avg_avx2().
    #define FSADAVG32_H(h)                                                   \
      unsigned int aom_sad32x##h##_avg_avx2(                                 \
          const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,    \
          int ref_stride, const uint8_t *second_pred) {                      \
        return sad32xh_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride,    \
                                second_pred, h);                             \
      }
    221 
    222 #define FSADAVG32 \
    223  FSADAVG32_H(64) \
    224  FSADAVG32_H(32) \
    225  FSADAVG32_H(16)
    226 
    227 /* clang-format off */
    228 FSADAVG32
    229 /* clang-format on */
    230 
    231 #undef FSADAVG32
    232 #undef FSADAVG32_H