tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

vp9dsp_init_16bpp_template.c (8746B)


      1 /*
      2 * VP9 SIMD optimizations
      3 *
      4 * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
      5 *
      6 * This file is part of FFmpeg.
      7 *
      8 * FFmpeg is free software; you can redistribute it and/or
      9 * modify it under the terms of the GNU Lesser General Public
     10 * License as published by the Free Software Foundation; either
     11 * version 2.1 of the License, or (at your option) any later version.
     12 *
     13 * FFmpeg is distributed in the hope that it will be useful,
     14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     16 * Lesser General Public License for more details.
     17 *
     18 * You should have received a copy of the GNU Lesser General Public
     19 * License along with FFmpeg; if not, write to the Free Software
     20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     21 */
     22 
     23 #include "libavutil/attributes.h"
     24 #include "libavutil/cpu.h"
     25 #include "libavutil/x86/cpu.h"
     26 #include "libavcodec/vp9dsp.h"
     27 #include "libavcodec/x86/vp9dsp_init.h"
     28 
     29 #if HAVE_X86ASM
     30 
     31 extern const int16_t ff_filters_16bpp[3][15][4][16];
     32 
     33 decl_mc_funcs(4, sse2, int16_t, 16, BPC);
     34 decl_mc_funcs(8, sse2, int16_t, 16, BPC);
     35 decl_mc_funcs(16, avx2, int16_t, 16, BPC);
     36 
     37 mc_rep_funcs(16,  8, 16, sse2, int16_t, 16, BPC)
     38 mc_rep_funcs(32, 16, 32, sse2, int16_t, 16, BPC)
     39 mc_rep_funcs(64, 32, 64, sse2, int16_t, 16, BPC)
     40 #if HAVE_AVX2_EXTERNAL
     41 mc_rep_funcs(32, 16, 32, avx2, int16_t, 16, BPC)
     42 mc_rep_funcs(64, 32, 64, avx2, int16_t, 16, BPC)
     43 #endif
     44 
     45 filters_8tap_2d_fn2(put, 16, BPC, 2, sse2, sse2, 16bpp)
     46 filters_8tap_2d_fn2(avg, 16, BPC, 2, sse2, sse2, 16bpp)
     47 #if HAVE_AVX2_EXTERNAL
     48 filters_8tap_2d_fn(put, 64, 32, BPC, 2, avx2, 16bpp)
     49 filters_8tap_2d_fn(avg, 64, 32, BPC, 2, avx2, 16bpp)
     50 filters_8tap_2d_fn(put, 32, 32, BPC, 2, avx2, 16bpp)
     51 filters_8tap_2d_fn(avg, 32, 32, BPC, 2, avx2, 16bpp)
     52 filters_8tap_2d_fn(put, 16, 32, BPC, 2, avx2, 16bpp)
     53 filters_8tap_2d_fn(avg, 16, 32, BPC, 2, avx2, 16bpp)
     54 #endif
     55 
     56 filters_8tap_1d_fn3(put, BPC, sse2, sse2, 16bpp)
     57 filters_8tap_1d_fn3(avg, BPC, sse2, sse2, 16bpp)
     58 #if HAVE_AVX2_EXTERNAL
     59 filters_8tap_1d_fn2(put, 64, BPC, avx2, 16bpp)
     60 filters_8tap_1d_fn2(avg, 64, BPC, avx2, 16bpp)
     61 filters_8tap_1d_fn2(put, 32, BPC, avx2, 16bpp)
     62 filters_8tap_1d_fn2(avg, 32, BPC, avx2, 16bpp)
     63 filters_8tap_1d_fn2(put, 16, BPC, avx2, 16bpp)
     64 filters_8tap_1d_fn2(avg, 16, BPC, avx2, 16bpp)
     65 #endif
     66 
     67 #define decl_lpf_func(dir, wd, bpp, opt) \
     68 void ff_vp9_loop_filter_##dir##_##wd##_##bpp##_##opt(uint8_t *dst, ptrdiff_t stride, \
     69                                                     int E, int I, int H)
     70 
     71 #define decl_lpf_funcs(dir, wd, bpp) \
     72 decl_lpf_func(dir, wd, bpp, sse2); \
     73 decl_lpf_func(dir, wd, bpp, ssse3); \
     74 decl_lpf_func(dir, wd, bpp, avx)
     75 
     76 #define decl_lpf_funcs_wd(dir) \
     77 decl_lpf_funcs(dir,  4, BPC); \
     78 decl_lpf_funcs(dir,  8, BPC); \
     79 decl_lpf_funcs(dir, 16, BPC)
     80 
     81 decl_lpf_funcs_wd(h);
     82 decl_lpf_funcs_wd(v);
     83 
     84 #define lpf_16_wrapper(dir, off, bpp, opt) \
     85 static void loop_filter_##dir##_16_##bpp##_##opt(uint8_t *dst, ptrdiff_t stride, \
     86                                                 int E, int I, int H) \
     87 { \
     88    ff_vp9_loop_filter_##dir##_16_##bpp##_##opt(dst,       stride, E, I, H); \
     89    ff_vp9_loop_filter_##dir##_16_##bpp##_##opt(dst + off, stride, E, I, H); \
     90 }
     91 
     92 #define lpf_16_wrappers(bpp, opt) \
     93 lpf_16_wrapper(h, 8 * stride, bpp, opt) \
     94 lpf_16_wrapper(v, 16,         bpp, opt)
     95 
     96 lpf_16_wrappers(BPC, sse2)
     97 lpf_16_wrappers(BPC, ssse3)
     98 lpf_16_wrappers(BPC, avx)
     99 
    100 #define lpf_mix2_wrapper(dir, off, wd1, wd2, bpp, opt) \
    101 static void loop_filter_##dir##_##wd1##wd2##_##bpp##_##opt(uint8_t *dst, ptrdiff_t stride, \
    102                                                           int E, int I, int H) \
    103 { \
    104    ff_vp9_loop_filter_##dir##_##wd1##_##bpp##_##opt(dst,       stride, \
    105                                                     E & 0xff, I & 0xff, H & 0xff); \
    106    ff_vp9_loop_filter_##dir##_##wd2##_##bpp##_##opt(dst + off, stride, \
    107                                                     E >> 8,   I >> 8,   H >> 8); \
    108 }
    109 
    110 #define lpf_mix2_wrappers(wd1, wd2, bpp, opt) \
    111 lpf_mix2_wrapper(h, 8 * stride, wd1, wd2, bpp, opt) \
    112 lpf_mix2_wrapper(v, 16,         wd1, wd2, bpp, opt)
    113 
    114 #define lpf_mix2_wrappers_set(bpp, opt) \
    115 lpf_mix2_wrappers(4, 4, bpp, opt) \
    116 lpf_mix2_wrappers(4, 8, bpp, opt) \
    117 lpf_mix2_wrappers(8, 4, bpp, opt) \
    118 lpf_mix2_wrappers(8, 8, bpp, opt) \
    119 
    120 lpf_mix2_wrappers_set(BPC, sse2)
    121 lpf_mix2_wrappers_set(BPC, ssse3)
    122 lpf_mix2_wrappers_set(BPC, avx)
    123 
    124 decl_ipred_fns(tm, BPC, mmxext, sse2);
    125 
    126 decl_itxfm_func(iwht, iwht, 4, BPC, mmxext);
    127 #if BPC == 10
    128 decl_itxfm_func(idct,  idct,  4, BPC, mmxext);
    129 decl_itxfm_funcs(4, BPC, ssse3);
    130 #else
    131 decl_itxfm_func(idct,  idct,  4, BPC, sse2);
    132 #endif
    133 decl_itxfm_func(idct,  iadst, 4, BPC, sse2);
    134 decl_itxfm_func(iadst, idct,  4, BPC, sse2);
    135 decl_itxfm_func(iadst, iadst, 4, BPC, sse2);
    136 decl_itxfm_funcs(8, BPC, sse2);
    137 decl_itxfm_funcs(16, BPC, sse2);
    138 decl_itxfm_func(idct,  idct, 32, BPC, sse2);
    139 #endif /* HAVE_X86ASM */
    140 
    141 av_cold void INIT_FUNC(VP9DSPContext *dsp, int bitexact)
    142 {
    143 #if HAVE_X86ASM
    144    int cpu_flags = av_get_cpu_flags();
    145 
    146 #define init_lpf_8_func(idx1, idx2, dir, wd, bpp, opt) \
    147    dsp->loop_filter_8[idx1][idx2] = ff_vp9_loop_filter_##dir##_##wd##_##bpp##_##opt
    148 #define init_lpf_16_func(idx, dir, bpp, opt) \
    149    dsp->loop_filter_16[idx] = loop_filter_##dir##_16_##bpp##_##opt
    150 #define init_lpf_mix2_func(idx1, idx2, idx3, dir, wd1, wd2, bpp, opt) \
    151    dsp->loop_filter_mix2[idx1][idx2][idx3] = loop_filter_##dir##_##wd1##wd2##_##bpp##_##opt
    152 
    153 #define init_lpf_funcs(bpp, opt) \
    154    init_lpf_8_func(0, 0, h,  4, bpp, opt); \
    155    init_lpf_8_func(0, 1, v,  4, bpp, opt); \
    156    init_lpf_8_func(1, 0, h,  8, bpp, opt); \
    157    init_lpf_8_func(1, 1, v,  8, bpp, opt); \
    158    init_lpf_8_func(2, 0, h, 16, bpp, opt); \
    159    init_lpf_8_func(2, 1, v, 16, bpp, opt); \
    160    init_lpf_16_func(0, h, bpp, opt); \
    161    init_lpf_16_func(1, v, bpp, opt); \
    162    init_lpf_mix2_func(0, 0, 0, h, 4, 4, bpp, opt); \
    163    init_lpf_mix2_func(0, 1, 0, h, 4, 8, bpp, opt); \
    164    init_lpf_mix2_func(1, 0, 0, h, 8, 4, bpp, opt); \
    165    init_lpf_mix2_func(1, 1, 0, h, 8, 8, bpp, opt); \
    166    init_lpf_mix2_func(0, 0, 1, v, 4, 4, bpp, opt); \
    167    init_lpf_mix2_func(0, 1, 1, v, 4, 8, bpp, opt); \
    168    init_lpf_mix2_func(1, 0, 1, v, 8, 4, bpp, opt); \
    169    init_lpf_mix2_func(1, 1, 1, v, 8, 8, bpp, opt)
    170 
    171 #define init_itx_func(idxa, idxb, typea, typeb, size, bpp, opt) \
    172    dsp->itxfm_add[idxa][idxb] = \
    173        cat(ff_vp9_##typea##_##typeb##_##size##x##size##_add_, bpp, _##opt);
    174 #define init_itx_func_one(idx, typea, typeb, size, bpp, opt) \
    175    init_itx_func(idx, DCT_DCT,   typea, typeb, size, bpp, opt); \
    176    init_itx_func(idx, ADST_DCT,  typea, typeb, size, bpp, opt); \
    177    init_itx_func(idx, DCT_ADST,  typea, typeb, size, bpp, opt); \
    178    init_itx_func(idx, ADST_ADST, typea, typeb, size, bpp, opt)
    179 #define init_itx_funcs(idx, size, bpp, opt) \
    180    init_itx_func(idx, DCT_DCT,   idct,  idct,  size, bpp, opt); \
    181    init_itx_func(idx, ADST_DCT,  idct,  iadst, size, bpp, opt); \
    182    init_itx_func(idx, DCT_ADST,  iadst, idct,  size, bpp, opt); \
    183    init_itx_func(idx, ADST_ADST, iadst, iadst, size, bpp, opt); \
    184 
    185    if (EXTERNAL_MMXEXT(cpu_flags)) {
    186        init_ipred_func(tm, TM_VP8, 4, BPC, mmxext);
    187        if (!bitexact) {
    188            init_itx_func_one(4 /* lossless */, iwht, iwht, 4, BPC, mmxext);
    189 #if BPC == 10
    190            init_itx_func(TX_4X4, DCT_DCT, idct, idct, 4, 10, mmxext);
    191 #endif
    192        }
    193    }
    194 
    195    if (EXTERNAL_SSE2(cpu_flags)) {
    196        init_subpel3(0, put, BPC, sse2);
    197        init_subpel3(1, avg, BPC, sse2);
    198        init_lpf_funcs(BPC, sse2);
    199        init_8_16_32_ipred_funcs(tm, TM_VP8, BPC, sse2);
    200 #if BPC == 10
    201        if (!bitexact) {
    202            init_itx_func(TX_4X4, ADST_DCT,  idct,  iadst, 4, 10, sse2);
    203            init_itx_func(TX_4X4, DCT_ADST,  iadst, idct,  4, 10, sse2);
    204            init_itx_func(TX_4X4, ADST_ADST, iadst, iadst, 4, 10, sse2);
    205        }
    206 #else
    207        init_itx_funcs(TX_4X4, 4, 12, sse2);
    208 #endif
    209        init_itx_funcs(TX_8X8, 8, BPC, sse2);
    210        init_itx_funcs(TX_16X16, 16, BPC, sse2);
    211        init_itx_func_one(TX_32X32, idct, idct, 32, BPC, sse2);
    212    }
    213 
    214    if (EXTERNAL_SSSE3(cpu_flags)) {
    215        init_lpf_funcs(BPC, ssse3);
    216 #if BPC == 10
    217        if (!bitexact) {
    218            init_itx_funcs(TX_4X4, 4, BPC, ssse3);
    219        }
    220 #endif
    221    }
    222 
    223    if (EXTERNAL_AVX(cpu_flags)) {
    224        init_lpf_funcs(BPC, avx);
    225    }
    226 
    227    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
    228 #if HAVE_AVX2_EXTERNAL
    229        init_subpel3_32_64(0,  put, BPC, avx2);
    230        init_subpel3_32_64(1,  avg, BPC, avx2);
    231        init_subpel2(2, 0, 16, put, BPC, avx2);
    232        init_subpel2(2, 1, 16, avg, BPC, avx2);
    233 #endif
    234    }
    235 
    236 #endif /* HAVE_X86ASM */
    237 
    238    ff_vp9dsp_init_16bpp_x86(dsp);
    239 }