tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

cfl_simd.h (14837B)


      1 /*
      2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #ifndef AOM_AV1_COMMON_X86_CFL_SIMD_H_
     13 #define AOM_AV1_COMMON_X86_CFL_SIMD_H_
     14 
     15 #include "av1/common/blockd.h"
     16 
      17 // SSSE3 version is optimal for width == 4, we reuse them in AVX2
     18 void cfl_subsample_lbd_420_4x4_ssse3(const uint8_t *cfl_type, int input_stride,
     19                                     uint16_t *output_q3);
     20 void cfl_subsample_lbd_420_4x8_ssse3(const uint8_t *cfl_type, int input_stride,
     21                                     uint16_t *output_q3);
     22 void cfl_subsample_lbd_420_4x16_ssse3(const uint8_t *cfl_type, int input_stride,
     23                                      uint16_t *output_q3);
     24 
      25 // SSSE3 version is optimal for width == 8, we reuse it in AVX2
     26 void cfl_subsample_lbd_420_8x4_ssse3(const uint8_t *cfl_type, int input_stride,
     27                                     uint16_t *output_q3);
     28 void cfl_subsample_lbd_420_8x8_ssse3(const uint8_t *cfl_type, int input_stride,
     29                                     uint16_t *output_q3);
     30 void cfl_subsample_lbd_420_8x16_ssse3(const uint8_t *cfl_type, int input_stride,
     31                                      uint16_t *output_q3);
     32 void cfl_subsample_lbd_420_8x32_ssse3(const uint8_t *cfl_type, int input_stride,
     33                                      uint16_t *output_q3);
     34 
      35 // SSSE3 version is optimal for width == 16, we reuse it in AVX2
     36 void cfl_subsample_lbd_420_16x4_ssse3(const uint8_t *cfl_type, int input_stride,
     37                                      uint16_t *output_q3);
     38 void cfl_subsample_lbd_420_16x8_ssse3(const uint8_t *cfl_type, int input_stride,
     39                                      uint16_t *output_q3);
     40 void cfl_subsample_lbd_420_16x16_ssse3(const uint8_t *cfl_type,
     41                                       int input_stride, uint16_t *output_q3);
     42 void cfl_subsample_lbd_420_16x32_ssse3(const uint8_t *cfl_type,
     43                                       int input_stride, uint16_t *output_q3);
     44 
      45 // SSSE3 version is optimal for width == 4, we reuse them in AVX2
     46 void cfl_subsample_lbd_422_4x4_ssse3(const uint8_t *cfl_type, int input_stride,
     47                                     uint16_t *output_q3);
     48 void cfl_subsample_lbd_422_4x8_ssse3(const uint8_t *cfl_type, int input_stride,
     49                                     uint16_t *output_q3);
     50 void cfl_subsample_lbd_422_4x16_ssse3(const uint8_t *cfl_type, int input_stride,
     51                                      uint16_t *output_q3);
     52 
      53 // SSSE3 version is optimal for width == 8, we reuse it in AVX2
     54 void cfl_subsample_lbd_422_8x4_ssse3(const uint8_t *cfl_type, int input_stride,
     55                                     uint16_t *output_q3);
     56 void cfl_subsample_lbd_422_8x8_ssse3(const uint8_t *cfl_type, int input_stride,
     57                                     uint16_t *output_q3);
     58 void cfl_subsample_lbd_422_8x16_ssse3(const uint8_t *cfl_type, int input_stride,
     59                                      uint16_t *output_q3);
     60 void cfl_subsample_lbd_422_8x32_ssse3(const uint8_t *cfl_type, int input_stride,
     61                                      uint16_t *output_q3);
     62 
      63 // SSSE3 version is optimal for width == 16, we reuse it in AVX2
     64 void cfl_subsample_lbd_422_16x4_ssse3(const uint8_t *cfl_type, int input_stride,
     65                                      uint16_t *output_q3);
     66 void cfl_subsample_lbd_422_16x8_ssse3(const uint8_t *cfl_type, int input_stride,
     67                                      uint16_t *output_q3);
     68 void cfl_subsample_lbd_422_16x16_ssse3(const uint8_t *cfl_type,
     69                                       int input_stride, uint16_t *output_q3);
     70 void cfl_subsample_lbd_422_16x32_ssse3(const uint8_t *cfl_type,
     71                                       int input_stride, uint16_t *output_q3);
     72 
      73 // SSSE3 version is optimal for width == 4, we reuse them in AVX2
     74 void cfl_subsample_lbd_444_4x4_ssse3(const uint8_t *cfl_type, int input_stride,
     75                                     uint16_t *output_q3);
     76 void cfl_subsample_lbd_444_4x8_ssse3(const uint8_t *cfl_type, int input_stride,
     77                                     uint16_t *output_q3);
     78 void cfl_subsample_lbd_444_4x16_ssse3(const uint8_t *cfl_type, int input_stride,
     79                                      uint16_t *output_q3);
     80 
      81 // SSSE3 version is optimal for width == 8, we reuse it in AVX2
     82 void cfl_subsample_lbd_444_8x4_ssse3(const uint8_t *cfl_type, int input_stride,
     83                                     uint16_t *output_q3);
     84 void cfl_subsample_lbd_444_8x8_ssse3(const uint8_t *cfl_type, int input_stride,
     85                                     uint16_t *output_q3);
     86 void cfl_subsample_lbd_444_8x16_ssse3(const uint8_t *cfl_type, int input_stride,
     87                                      uint16_t *output_q3);
     88 void cfl_subsample_lbd_444_8x32_ssse3(const uint8_t *cfl_type, int input_stride,
     89                                      uint16_t *output_q3);
     90 
      91 // SSSE3 version is optimal for width == 16, we reuse it in AVX2
     92 void cfl_subsample_lbd_444_16x4_ssse3(const uint8_t *cfl_type, int input_stride,
     93                                      uint16_t *output_q3);
     94 void cfl_subsample_lbd_444_16x8_ssse3(const uint8_t *cfl_type, int input_stride,
     95                                      uint16_t *output_q3);
     96 void cfl_subsample_lbd_444_16x16_ssse3(const uint8_t *cfl_type,
     97                                       int input_stride, uint16_t *output_q3);
     98 void cfl_subsample_lbd_444_16x32_ssse3(const uint8_t *cfl_type,
     99                                       int input_stride, uint16_t *output_q3);
    100 
    101 #if CONFIG_AV1_HIGHBITDEPTH
    102 void cfl_subsample_hbd_420_4x4_ssse3(const uint16_t *cfl_type, int input_stride,
    103                                     uint16_t *output_q3);
    104 void cfl_subsample_hbd_420_4x8_ssse3(const uint16_t *cfl_type, int input_stride,
    105                                     uint16_t *output_q3);
    106 void cfl_subsample_hbd_420_4x16_ssse3(const uint16_t *cfl_type,
    107                                      int input_stride, uint16_t *output_q3);
    108 
     109 // SSSE3 version is optimal for width == 8, we reuse it in AVX2
    110 void cfl_subsample_hbd_420_8x4_ssse3(const uint16_t *cfl_type, int input_stride,
    111                                     uint16_t *output_q3);
    112 void cfl_subsample_hbd_420_8x8_ssse3(const uint16_t *cfl_type, int input_stride,
    113                                     uint16_t *output_q3);
    114 void cfl_subsample_hbd_420_8x16_ssse3(const uint16_t *cfl_type,
    115                                      int input_stride, uint16_t *output_q3);
    116 void cfl_subsample_hbd_420_8x32_ssse3(const uint16_t *cfl_type,
    117                                      int input_stride, uint16_t *output_q3);
    118 
     119 // SSSE3 version is faster for width == 16, we reuse it in AVX2
    120 void cfl_subsample_hbd_420_16x4_ssse3(const uint16_t *cfl_type,
    121                                      int input_stride, uint16_t *output_q3);
    122 void cfl_subsample_hbd_420_16x8_ssse3(const uint16_t *cfl_type,
    123                                      int input_stride, uint16_t *output_q3);
    124 void cfl_subsample_hbd_420_16x16_ssse3(const uint16_t *cfl_type,
    125                                       int input_stride, uint16_t *output_q3);
    126 void cfl_subsample_hbd_420_16x32_ssse3(const uint16_t *cfl_type,
    127                                       int input_stride, uint16_t *output_q3);
    128 
    129 void cfl_subsample_hbd_422_4x4_ssse3(const uint16_t *cfl_type, int input_stride,
    130                                     uint16_t *output_q3);
    131 void cfl_subsample_hbd_422_4x8_ssse3(const uint16_t *cfl_type, int input_stride,
    132                                     uint16_t *output_q3);
    133 void cfl_subsample_hbd_422_4x16_ssse3(const uint16_t *cfl_type,
    134                                      int input_stride, uint16_t *output_q3);
    135 
     136 // SSSE3 version is optimal for width == 8, we reuse it in AVX2
    137 void cfl_subsample_hbd_422_8x4_ssse3(const uint16_t *cfl_type, int input_stride,
    138                                     uint16_t *output_q3);
    139 void cfl_subsample_hbd_422_8x8_ssse3(const uint16_t *cfl_type, int input_stride,
    140                                     uint16_t *output_q3);
    141 void cfl_subsample_hbd_422_8x16_ssse3(const uint16_t *cfl_type,
    142                                      int input_stride, uint16_t *output_q3);
    143 void cfl_subsample_hbd_422_8x32_ssse3(const uint16_t *cfl_type,
    144                                      int input_stride, uint16_t *output_q3);
    145 
     146 // SSSE3 version is faster for width == 16, we reuse it in AVX2
    147 void cfl_subsample_hbd_422_16x4_ssse3(const uint16_t *cfl_type,
    148                                      int input_stride, uint16_t *output_q3);
    149 void cfl_subsample_hbd_422_16x8_ssse3(const uint16_t *cfl_type,
    150                                      int input_stride, uint16_t *output_q3);
    151 void cfl_subsample_hbd_422_16x16_ssse3(const uint16_t *cfl_type,
    152                                       int input_stride, uint16_t *output_q3);
    153 void cfl_subsample_hbd_422_16x32_ssse3(const uint16_t *cfl_type,
    154                                       int input_stride, uint16_t *output_q3);
    155 
    156 void cfl_subsample_hbd_444_4x4_ssse3(const uint16_t *cfl_type, int input_stride,
    157                                     uint16_t *output_q3);
    158 void cfl_subsample_hbd_444_4x8_ssse3(const uint16_t *cfl_type, int input_stride,
    159                                     uint16_t *output_q3);
    160 void cfl_subsample_hbd_444_4x16_ssse3(const uint16_t *cfl_type,
    161                                      int input_stride, uint16_t *output_q3);
    162 
     163 // SSSE3 version is optimal for width == 8, we reuse it in AVX2
    164 void cfl_subsample_hbd_444_8x4_ssse3(const uint16_t *cfl_type, int input_stride,
    165                                     uint16_t *output_q3);
    166 void cfl_subsample_hbd_444_8x8_ssse3(const uint16_t *cfl_type, int input_stride,
    167                                     uint16_t *output_q3);
    168 void cfl_subsample_hbd_444_8x16_ssse3(const uint16_t *cfl_type,
    169                                      int input_stride, uint16_t *output_q3);
    170 void cfl_subsample_hbd_444_8x32_ssse3(const uint16_t *cfl_type,
    171                                      int input_stride, uint16_t *output_q3);
    172 
     173 // SSSE3 version is faster for width == 16, we reuse it in AVX2
    174 void cfl_subsample_hbd_444_16x4_ssse3(const uint16_t *cfl_type,
    175                                      int input_stride, uint16_t *output_q3);
    176 void cfl_subsample_hbd_444_16x8_ssse3(const uint16_t *cfl_type,
    177                                      int input_stride, uint16_t *output_q3);
    178 void cfl_subsample_hbd_444_16x16_ssse3(const uint16_t *cfl_type,
    179                                       int input_stride, uint16_t *output_q3);
    180 void cfl_subsample_hbd_444_16x32_ssse3(const uint16_t *cfl_type,
    181                                       int input_stride, uint16_t *output_q3);
    182 #endif  // CONFIG_AV1_HIGHBITDEPTH
    183 
     184 // SSE2 version is optimal for width == 4, we reuse them in AVX2
    185 void cfl_subtract_average_4x4_sse2(const uint16_t *src, int16_t *dst);
    186 void cfl_subtract_average_4x8_sse2(const uint16_t *src, int16_t *dst);
    187 void cfl_subtract_average_4x16_sse2(const uint16_t *src, int16_t *dst);
    188 
     189 // SSE2 version is optimal for width == 8, we reuse them in AVX2
    190 void cfl_subtract_average_8x4_sse2(const uint16_t *src, int16_t *dst);
    191 void cfl_subtract_average_8x8_sse2(const uint16_t *src, int16_t *dst);
    192 void cfl_subtract_average_8x16_sse2(const uint16_t *src, int16_t *dst);
    193 void cfl_subtract_average_8x32_sse2(const uint16_t *src, int16_t *dst);
    194 
    195 void cfl_predict_lbd_4x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
    196                               int dst_stride, int alpha_q3);
    197 void cfl_predict_lbd_4x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
    198                               int dst_stride, int alpha_q3);
    199 void cfl_predict_lbd_4x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
    200                                int dst_stride, int alpha_q3);
    201 
    202 void cfl_predict_lbd_8x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
    203                               int dst_stride, int alpha_q3);
    204 void cfl_predict_lbd_8x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
    205                               int dst_stride, int alpha_q3);
    206 void cfl_predict_lbd_8x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
    207                                int dst_stride, int alpha_q3);
    208 void cfl_predict_lbd_8x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
    209                                int dst_stride, int alpha_q3);
    210 
    211 void cfl_predict_lbd_16x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
    212                                int dst_stride, int alpha_q3);
    213 void cfl_predict_lbd_16x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
    214                                int dst_stride, int alpha_q3);
    215 void cfl_predict_lbd_16x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
    216                                 int dst_stride, int alpha_q3);
    217 void cfl_predict_lbd_16x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
    218                                 int dst_stride, int alpha_q3);
    219 
    220 #if CONFIG_AV1_HIGHBITDEPTH
    221 void cfl_predict_hbd_4x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
    222                               int dst_stride, int alpha_q3, int bd);
    223 void cfl_predict_hbd_4x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
    224                               int dst_stride, int alpha_q3, int bd);
    225 void cfl_predict_hbd_4x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
    226                                int dst_stride, int alpha_q3, int bd);
    227 
    228 void cfl_predict_hbd_8x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
    229                               int dst_stride, int alpha_q3, int bd);
    230 void cfl_predict_hbd_8x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
    231                               int dst_stride, int alpha_q3, int bd);
    232 void cfl_predict_hbd_8x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
    233                                int dst_stride, int alpha_q3, int bd);
    234 void cfl_predict_hbd_8x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
    235                                int dst_stride, int alpha_q3, int bd);
    236 
    237 void cfl_predict_hbd_16x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
    238                                int dst_stride, int alpha_q3, int bd);
    239 void cfl_predict_hbd_16x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
    240                                int dst_stride, int alpha_q3, int bd);
    241 void cfl_predict_hbd_16x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
    242                                 int dst_stride, int alpha_q3, int bd);
    243 void cfl_predict_hbd_16x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
    244                                 int dst_stride, int alpha_q3, int bd);
    245 #endif  // CONFIG_AV1_HIGHBITDEPTH
    246 #endif  // AOM_AV1_COMMON_X86_CFL_SIMD_H_