tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

av1_txfm1d_sse4.h (4913B)


      1 /*
      2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #ifndef AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
     13 #define AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
     14 
     15 #include <smmintrin.h>
     16 #include "av1/common/av1_txfm.h"
     17 #include "av1/common/x86/av1_txfm_sse4.h"
     18 
     19 #ifdef __cplusplus
     20 extern "C" {
     21 #endif
     22 
     // 1-D transform kernels that read from `input` and write to `output`, both
     // arrays of __m128i. Only the declarations are visible in this header; the
     // definitions live elsewhere.
     //
     // NOTE(review): cos_bit is `int` for av1_fdct32_sse4_1/av1_idtx32_sse4_1 but
     // `int8_t` for av1_fdct64_sse4_1 -- confirm against the definitions before
     // unifying the type.
     //
     // Top-level `const` on by-value parameters is intentionally omitted: it is
     // not part of the function type, so these prototypes remain compatible with
     // definitions that keep the qualifier on their own parameters.
     void av1_fdct32_sse4_1(__m128i *input, __m128i *output, int cos_bit,
                            int stride);
     void av1_fdct64_sse4_1(__m128i *input, __m128i *output, int8_t cos_bit,
                            int instride, int outstride);

     void av1_idtx32_sse4_1(__m128i *input, __m128i *output, int cos_bit,
                            int col_num);
     30 
     // Transposes one 4x4 tile of 32-bit values.
     // The tile's four rows are input[0], input[stride], input[2 * stride] and
     // input[3 * stride]; the transposed rows are stored to the same offsets of
     // `output`. Every source row is loaded before the first store is issued.
     static inline void transpose_32_4x4(int stride, const __m128i *input,
                                         __m128i *output) {
       const __m128i row0 = input[0 * stride];
       const __m128i row1 = input[1 * stride];
       const __m128i row2 = input[2 * stride];
       const __m128i row3 = input[3 * stride];

       // First interleave pairs rows two apart: {r0[0], r2[0], r0[1], r2[1]}, ...
       const __m128i even_lo = _mm_unpacklo_epi32(row0, row2);
       const __m128i even_hi = _mm_unpackhi_epi32(row0, row2);
       const __m128i odd_lo = _mm_unpacklo_epi32(row1, row3);
       const __m128i odd_hi = _mm_unpackhi_epi32(row1, row3);

       // Second interleave yields one full column of the source per vector.
       output[0 * stride] = _mm_unpacklo_epi32(even_lo, odd_lo);
       output[1 * stride] = _mm_unpackhi_epi32(even_lo, odd_lo);
       output[2 * stride] = _mm_unpacklo_epi32(even_hi, odd_hi);
       output[3 * stride] = _mm_unpackhi_epi32(even_hi, odd_hi);
     }
     43 
     // Transposes a txfm_size x txfm_size block of 32-bit values.
     // Rows are stored back to back, four values per __m128i, so one row spans
     // txfm_size / 4 vectors. The block is treated as a grid of 4x4 tiles: in a
     // single pass, the tile at (tile_row, tile_col) is transposed internally
     // and written to the mirrored grid position (tile_col, tile_row) of
     // `output`.
     static inline void transpose_32(int txfm_size, const __m128i *input,
                                     __m128i *output) {
       const int lanes_per_vec = 4;
       const int vecs_per_row = txfm_size / lanes_per_vec;

       for (int tile_row = 0; tile_row * 4 < txfm_size; ++tile_row) {
         for (int tile_col = 0; tile_col < vecs_per_row; ++tile_col) {
           const __m128i *src = &input[tile_row * 4 * vecs_per_row + tile_col];
           __m128i *dst = &output[tile_col * 4 * vecs_per_row + tile_row];
           transpose_32_4x4(vecs_per_row, src, dst);
         }
       }
     }
     63 
     // Butterfly on the four 32-bit lanes of in0/in1 (both __m128i):
     //   out0 = av1_round_shift_32_sse4_1( in0 * w0 + in1 * w1, bit)
     //   out1 = av1_round_shift_32_sse4_1(-in1 * w0 + in0 * w1, bit)
     // w0 and w1 are scalar ints broadcast to every lane. Products come from
     // _mm_mullo_epi32, which keeps only the low 32 bits of each product.
     //
     // in0 and in1 are copied into locals and both results are computed before
     // out0/out1 are assigned, so an output may name the same variable as an
     // input, and each input expression is evaluated exactly once. `bit` is
     // still expanded twice, so it should be free of side effects.
     #define btf_32_sse4_1_type0(w0, w1, in0, in1, out0, out1, bit)            \
       do {                                                                    \
         const __m128i btf_w0_ = _mm_set1_epi32(w0);                           \
         const __m128i btf_w1_ = _mm_set1_epi32(w1);                           \
         const __m128i btf_in0_ = (in0);                                       \
         const __m128i btf_in1_ = (in1);                                       \
         const __m128i btf_sum_ =                                              \
             _mm_add_epi32(_mm_mullo_epi32(btf_in0_, btf_w0_),                 \
                           _mm_mullo_epi32(btf_in1_, btf_w1_));                \
         const __m128i btf_dif_ =                                              \
             _mm_sub_epi32(_mm_mullo_epi32(btf_in0_, btf_w1_),                 \
                           _mm_mullo_epi32(btf_in1_, btf_w0_));                \
         out0 = av1_round_shift_32_sse4_1(btf_sum_, bit);                      \
         out1 = av1_round_shift_32_sse4_1(btf_dif_, bit);                      \
       } while (0)
     79 
     // Variant of btf_32_sse4_1_type0 with the sign of the second output flipped:
     //   out0 = in0 * w0 + in1 * w1
     //   out1 = in1 * w0 - in0 * w1
     // Implemented by forwarding to btf_32_sse4_1_type0 with the weight pair and
     // the input pair both swapped; rounding by `bit` is done there.
     #define btf_32_sse4_1_type1(w0, w1, in0, in1, out0, out1, bit) \
       btf_32_sse4_1_type0(w1, w0, in1, in0, out0, out1, bit)
     86 
     // Butterfly on the four 32-bit lanes of in0/in1, with the weights and the
     // rounding term supplied as ready-made vectors:
     //   out0 = ( in0 * ww0 + in1 * ww1 + r) >> bit
     //   out1 = (-in1 * ww0 + in0 * ww1 + r) >> bit
     // ww0, ww1, in0, in1 and r are __m128i; the shift is arithmetic
     // (_mm_srai_epi32). Products come from _mm_mullo_epi32, which keeps only
     // the low 32 bits of each product.
     // NOTE(review): r is presumably 1 << (bit - 1) in every lane -- confirm at
     // the call sites.
     //
     // All vector arguments are copied into locals and both results are computed
     // before out0/out1 are assigned, so an output may name the same variable as
     // an input, and each vector argument is evaluated exactly once. `bit` is
     // still expanded twice, so it should be free of side effects.
     #define btf_32_type0_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit)   \
       do {                                                                    \
         const __m128i btf_w0_ = (ww0);                                        \
         const __m128i btf_w1_ = (ww1);                                        \
         const __m128i btf_in0_ = (in0);                                       \
         const __m128i btf_in1_ = (in1);                                       \
         const __m128i btf_rnd_ = (r);                                         \
         const __m128i btf_sum_ =                                              \
             _mm_add_epi32(_mm_mullo_epi32(btf_in0_, btf_w0_),                 \
                           _mm_mullo_epi32(btf_in1_, btf_w1_));                \
         const __m128i btf_dif_ =                                              \
             _mm_sub_epi32(_mm_mullo_epi32(btf_in0_, btf_w1_),                 \
                           _mm_mullo_epi32(btf_in1_, btf_w0_));                \
         out0 = _mm_srai_epi32(_mm_add_epi32(btf_sum_, btf_rnd_), bit);        \
         out1 = _mm_srai_epi32(_mm_add_epi32(btf_dif_, btf_rnd_), bit);        \
       } while (0)
    102 
    // Variant of btf_32_type0_sse4_1_new with the sign of the second output
    // flipped:
    //   out0 = in0 * ww0 + in1 * ww1
    //   out1 = in1 * ww0 - in0 * ww1
    // Implemented by forwarding to btf_32_type0_sse4_1_new with the weight pair
    // and the input pair both swapped; the rounding add of `r` and the shift by
    // `bit` are done there.
    #define btf_32_type1_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) \
      btf_32_type0_sse4_1_new(ww1, ww0, in1, in0, out0, out1, r, bit)
    109 
    110 #ifdef __cplusplus
    111 }
    112 #endif
    113 
    114 #endif  // AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_