tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

fwd_txfm_sse2.h (6963B)


      1 /*
      2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #ifndef AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_
     13 #define AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_
     14 
     15 #ifdef __cplusplus
     16 extern "C" {
     17 #endif
     18 
     19 static inline __m128i k_madd_epi32(__m128i a, __m128i b) {
     20  __m128i buf0, buf1;
     21  buf0 = _mm_mul_epu32(a, b);
     22  a = _mm_srli_epi64(a, 32);
     23  b = _mm_srli_epi64(b, 32);
     24  buf1 = _mm_mul_epu32(a, b);
     25  return _mm_add_epi64(buf0, buf1);
     26 }
     27 
     28 static inline __m128i k_packs_epi64(__m128i a, __m128i b) {
     29  __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
     30  __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
     31  return _mm_unpacklo_epi64(buf0, buf1);
     32 }
     33 
     34 static inline int check_epi16_overflow_x2(const __m128i *preg0,
     35                                          const __m128i *preg1) {
     36  const __m128i max_overflow = _mm_set1_epi16(0x7fff);
     37  const __m128i min_overflow = _mm_set1_epi16((short)0x8000);
     38  __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
     39                              _mm_cmpeq_epi16(*preg0, min_overflow));
     40  __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
     41                              _mm_cmpeq_epi16(*preg1, min_overflow));
     42  cmp0 = _mm_or_si128(cmp0, cmp1);
     43  return _mm_movemask_epi8(cmp0);
     44 }
     45 
     46 static inline int check_epi16_overflow_x4(const __m128i *preg0,
     47                                          const __m128i *preg1,
     48                                          const __m128i *preg2,
     49                                          const __m128i *preg3) {
     50  const __m128i max_overflow = _mm_set1_epi16(0x7fff);
     51  const __m128i min_overflow = _mm_set1_epi16((short)0x8000);
     52  __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
     53                              _mm_cmpeq_epi16(*preg0, min_overflow));
     54  __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
     55                              _mm_cmpeq_epi16(*preg1, min_overflow));
     56  __m128i cmp2 = _mm_or_si128(_mm_cmpeq_epi16(*preg2, max_overflow),
     57                              _mm_cmpeq_epi16(*preg2, min_overflow));
     58  __m128i cmp3 = _mm_or_si128(_mm_cmpeq_epi16(*preg3, max_overflow),
     59                              _mm_cmpeq_epi16(*preg3, min_overflow));
     60  cmp0 = _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3));
     61  return _mm_movemask_epi8(cmp0);
     62 }
     63 
     64 static inline int check_epi16_overflow_x8(
     65    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
     66    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
     67    const __m128i *preg6, const __m128i *preg7) {
     68  int res0, res1;
     69  res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
     70  res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
     71  return res0 + res1;
     72 }
     73 
     74 static inline int check_epi16_overflow_x12(
     75    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
     76    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
     77    const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
     78    const __m128i *preg9, const __m128i *preg10, const __m128i *preg11) {
     79  int res0, res1;
     80  res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
     81  res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
     82  if (!res0) res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
     83  return res0 + res1;
     84 }
     85 
     86 static inline int check_epi16_overflow_x16(
     87    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
     88    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
     89    const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
     90    const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
     91    const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
     92    const __m128i *preg15) {
     93  int res0, res1;
     94  res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
     95  res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
     96  if (!res0) {
     97    res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
     98    if (!res1) res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
     99  }
    100  return res0 + res1;
    101 }
    102 
    103 static inline int check_epi16_overflow_x32(
    104    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
    105    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
    106    const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
    107    const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
    108    const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
    109    const __m128i *preg15, const __m128i *preg16, const __m128i *preg17,
    110    const __m128i *preg18, const __m128i *preg19, const __m128i *preg20,
    111    const __m128i *preg21, const __m128i *preg22, const __m128i *preg23,
    112    const __m128i *preg24, const __m128i *preg25, const __m128i *preg26,
    113    const __m128i *preg27, const __m128i *preg28, const __m128i *preg29,
    114    const __m128i *preg30, const __m128i *preg31) {
    115  int res0, res1;
    116  res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
    117  res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
    118  if (!res0) {
    119    res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
    120    if (!res1) {
    121      res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
    122      if (!res0) {
    123        res0 = check_epi16_overflow_x4(preg16, preg17, preg18, preg19);
    124        if (!res1) {
    125          res1 = check_epi16_overflow_x4(preg20, preg21, preg22, preg23);
    126          if (!res0) {
    127            res0 = check_epi16_overflow_x4(preg24, preg25, preg26, preg27);
    128            if (!res1)
    129              res1 = check_epi16_overflow_x4(preg28, preg29, preg30, preg31);
    130          }
    131        }
    132      }
    133    }
    134  }
    135  return res0 + res1;
    136 }
    137 
    138 static inline void store_output(const __m128i *poutput, tran_low_t *dst_ptr) {
    139  const __m128i zero = _mm_setzero_si128();
    140  const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
    141  __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
    142  __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
    143  _mm_store_si128((__m128i *)(dst_ptr), out0);
    144  _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
    145 }
    146 
    147 static inline void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) {
    148  const __m128i zero = _mm_setzero_si128();
    149  const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
    150  __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
    151  __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
    152  _mm_storeu_si128((__m128i *)(dst_ptr), out0);
    153  _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1);
    154 }
    155 
    156 #ifdef __cplusplus
    157 }  // extern "C"
    158 #endif
    159 
    160 #endif  // AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_