tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

itx.h (13230B)


      1 /*
      2 * Copyright © 2018-2023, VideoLAN and dav1d authors
      3 * Copyright © 2018-2023, Two Orioles, LLC
      4 * All rights reserved.
      5 *
      6 * Redistribution and use in source and binary forms, with or without
      7 * modification, are permitted provided that the following conditions are met:
      8 *
      9 * 1. Redistributions of source code must retain the above copyright notice, this
     10 *    list of conditions and the following disclaimer.
     11 *
     12 * 2. Redistributions in binary form must reproduce the above copyright notice,
     13 *    this list of conditions and the following disclaimer in the documentation
     14 *    and/or other materials provided with the distribution.
     15 *
     16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 */
     27 
     28 #include "src/cpu.h"
     29 #include "src/itx.h"
     30 
     /* Token-pasting helper: BF_BPC(x, bits, suffix) expands to the single
      * identifier x_<bits>bpc_<suffix>, e.g. BF_BPC(foo, 10, avx2) becomes
      * foo_10bpc_avx2. It is used below wherever a function name has to carry
      * an explicit bit depth. */
     31 #define BF_BPC(x, bits, suffix) x##_##bits##bpc_##suffix
     32 
     /* decl_itx_fns(ext): declares the dav1d_inv_txfm_add_* functions that
      * carry the suffix `ext`, one group per block size listed below.
      * The decl_itxN_fns helpers, decl_itx_fn and BF() are not defined in this
      * file (presumably they come from the included headers); the N in each
      * helper name appears to be the number of transform-type combinations
      * declared for that size - TODO confirm against their definitions.
      * Sizes with a 64 dimension only get a dct_dct declaration.
      * The expansion has no trailing ';' - the caller supplies it. */
     33 #define decl_itx_fns(ext) \
     34 decl_itx17_fns( 4,  4, ext); \
     35 decl_itx16_fns( 4,  8, ext); \
     36 decl_itx16_fns( 4, 16, ext); \
     37 decl_itx16_fns( 8,  4, ext); \
     38 decl_itx16_fns( 8,  8, ext); \
     39 decl_itx16_fns( 8, 16, ext); \
     40 decl_itx2_fns ( 8, 32, ext); \
     41 decl_itx16_fns(16,  4, ext); \
     42 decl_itx16_fns(16,  8, ext); \
     43 decl_itx12_fns(16, 16, ext); \
     44 decl_itx2_fns (16, 32, ext); \
     45 decl_itx2_fns (32,  8, ext); \
     46 decl_itx2_fns (32, 16, ext); \
     47 decl_itx2_fns (32, 32, ext); \
     48 decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x64, ext)); \
     49 decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x64, ext)); \
     50 decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x16, ext)); \
     51 decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, ext)); \
     52 decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, ext))
     53 
     54 
     /* decl_itx2_bpc_fns(w, h, bpc, opt): declares the two w x h functions
      * dct_dct and identity_identity, with the bit depth `bpc` and the suffix
      * `opt` pasted into the name through BF_BPC(). decl_itx_fn itself is not
      * defined in this file. No trailing ';' - the caller supplies it. */
     55 #define decl_itx2_bpc_fns(w, h, bpc, opt) \
     56 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_##w##x##h, bpc, opt)); \
     57 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_identity_##w##x##h, bpc, opt))
     58 
     /* decl_itx12_bpc_fns(w, h, bpc, opt): declares twelve w x h functions -
      * the two from decl_itx2_bpc_fns() plus the ten listed here (every
      * pairing of dct/adst/flipadst with each other, dct_identity and
      * identity_dct). No trailing ';' - the caller supplies it. */
     59 #define decl_itx12_bpc_fns(w, h, bpc, opt) \
     60 decl_itx2_bpc_fns(w, h, bpc, opt); \
     61 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_adst_##w##x##h, bpc, opt)); \
     62 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, bpc, opt)); \
     63 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_identity_##w##x##h, bpc, opt)); \
     64 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_dct_##w##x##h, bpc, opt)); \
     65 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_adst_##w##x##h, bpc, opt)); \
     66 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, bpc, opt)); \
     67 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, bpc, opt)); \
     68 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, bpc, opt)); \
     69 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, bpc, opt)); \
     70 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_dct_##w##x##h, bpc, opt))
     71 
     /* decl_itx16_bpc_fns(w, h, bpc, opt): declares sixteen w x h functions -
      * the twelve from decl_itx12_bpc_fns() plus the four remaining pairings
      * of adst/flipadst with identity. No trailing ';' - the caller supplies
      * it. */
     72 #define decl_itx16_bpc_fns(w, h, bpc, opt) \
     73 decl_itx12_bpc_fns(w, h, bpc, opt); \
     74 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_identity_##w##x##h, bpc, opt)); \
     75 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, bpc, opt)); \
     76 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_adst_##w##x##h, bpc, opt)); \
     77 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, bpc, opt))
     78 
     /* decl_itx_bpc_fns(bpc, ext): the explicit-bit-depth counterpart of
      * decl_itx_fns(). It declares the same per-size groups, but every name
      * goes through BF_BPC() so it contains `bpc`.
      * Unlike decl_itx_fns(), 4x4 uses the 16-function group here rather than
      * a 17-function one; the only 4x4 wht_wht declaration with an explicit
      * bit depth in this file is written out by hand further down.
      * Sizes with a 64 dimension only get a dct_dct declaration.
      * No trailing ';' - the caller supplies it. */
     79 #define decl_itx_bpc_fns(bpc, ext) \
     80 decl_itx16_bpc_fns( 4,  4, bpc, ext); \
     81 decl_itx16_bpc_fns( 4,  8, bpc, ext); \
     82 decl_itx16_bpc_fns( 4, 16, bpc, ext); \
     83 decl_itx16_bpc_fns( 8,  4, bpc, ext); \
     84 decl_itx16_bpc_fns( 8,  8, bpc, ext); \
     85 decl_itx16_bpc_fns( 8, 16, bpc, ext); \
     86 decl_itx2_bpc_fns ( 8, 32, bpc, ext); \
     87 decl_itx16_bpc_fns(16,  4, bpc, ext); \
     88 decl_itx16_bpc_fns(16,  8, bpc, ext); \
     89 decl_itx12_bpc_fns(16, 16, bpc, ext); \
     90 decl_itx2_bpc_fns (16, 32, bpc, ext); \
     91 decl_itx2_bpc_fns (32,  8, bpc, ext); \
     92 decl_itx2_bpc_fns (32, 16, bpc, ext); \
     93 decl_itx2_bpc_fns (32, 32, bpc, ext); \
     94 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_16x64, bpc, ext)); \
     95 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_32x64, bpc, ext)); \
     96 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_64x16, bpc, ext)); \
     97 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_64x32, bpc, ext)); \
     98 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_64x64, bpc, ext))
     99 
    /* Instantiate the declarations for each instruction-set suffix.
     * decl_itx_fns() produces the BF()-named set; decl_itx_bpc_fns() produces
     * a set with the given bit depth spelled out in the name:
     *   avx512icl: BF() set + 10 bpc
     *   avx2:      BF() set + 10 bpc + 12 bpc
     *   sse4:      BF() set only
     *   ssse3:     BF() set only
     * The last two lines declare 4x4 wht_wht functions individually: a
     * 16bpc AVX2 one by its full name, and an SSE2 one through BF(). */
    100 decl_itx_fns(avx512icl);
    101 decl_itx_bpc_fns(10, avx512icl);
    102 decl_itx_fns(avx2);
    103 decl_itx_bpc_fns(10, avx2);
    104 decl_itx_bpc_fns(12, avx2);
    105 decl_itx_fns(sse4);
    106 decl_itx_fns(ssse3);
    107 decl_itx_fn(dav1d_inv_txfm_add_wht_wht_4x4_16bpc_avx2);
    108 decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_4x4, sse2));
    109 
    /*
     * Fills the c->itxfm_add[] function-pointer table with x86 SIMD
     * implementations chosen from the CPU flags returned by
     * dav1d_get_cpu_flags().
     *
     * c        - DSP context whose itxfm_add[tx size][tx type] entries are set.
     * bpc      - bit depth; only read in the non-8-bit branches, where it
     *            selects between the 10 bpc tables and the other ones.
     * all_simd - written with 1 after the SSSE3 table (BITDEPTH == 8) or the
     *            SSE4.1 10 bpc table (BITDEPTH == 16) has been installed, and
     *            not touched otherwise. NOTE(review): presumably tells the
     *            caller that every entry now has a SIMD version - confirm
     *            against the caller.
     *
     * The feature checks run SSE2 -> SSSE3 -> SSE4.1 -> AVX2 -> AVX-512 ICL
     * and the function returns at the first missing one, so an entry assigned
     * by a later block replaces the one assigned by an earlier block
     * (assuming the assign_itx*_fn helpers, which are defined elsewhere,
     * write to the same table as the local assign_itx*_bpc_fn macros).
     */
    110 static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c,
    111                                           const int bpc, int *const all_simd)
    112 {
        /* Stores the `ext` implementation with explicit bit depth `bpc` of
         * transform `type` for a w x h block into the table entry
         * [<pfx>TX_<w>X<h>][type_enum]. At the call sites below pfx is either
         * empty or R. */
    113 #define assign_itx_bpc_fn(pfx, w, h, type, type_enum, bpc, ext) \
    114    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
    115        BF_BPC(dav1d_inv_txfm_add_##type##_##w##x##h, bpc, ext)
    116 
        /* dct_dct only. */
    117 #define assign_itx1_bpc_fn(pfx, w, h, bpc, ext) \
    118    assign_itx_bpc_fn(pfx, w, h, dct_dct,           DCT_DCT,           bpc, ext)
    119 
        /* dct_dct + identity_identity. */
    120 #define assign_itx2_bpc_fn(pfx, w, h, bpc, ext) \
    121    assign_itx1_bpc_fn(pfx, w, h, bpc, ext); \
    122    assign_itx_bpc_fn(pfx, w, h, identity_identity, IDTX,              bpc, ext)
    123 
        /* The two above plus ten more, twelve in total. Note that the two
         * transform names appear in opposite order in the function name and
         * in the enum constant (dct_adst -> ADST_DCT, adst_dct -> DCT_ADST),
         * and that dct_identity / identity_dct map to H_DCT / V_DCT. */
    124 #define assign_itx12_bpc_fn(pfx, w, h, bpc, ext) \
    125    assign_itx2_bpc_fn(pfx, w, h, bpc, ext); \
    126    assign_itx_bpc_fn(pfx, w, h, dct_adst,          ADST_DCT,          bpc, ext); \
    127    assign_itx_bpc_fn(pfx, w, h, dct_flipadst,      FLIPADST_DCT,      bpc, ext); \
    128    assign_itx_bpc_fn(pfx, w, h, dct_identity,      H_DCT,             bpc, ext); \
    129    assign_itx_bpc_fn(pfx, w, h, adst_dct,          DCT_ADST,          bpc, ext); \
    130    assign_itx_bpc_fn(pfx, w, h, adst_adst,         ADST_ADST,         bpc, ext); \
    131    assign_itx_bpc_fn(pfx, w, h, adst_flipadst,     FLIPADST_ADST,     bpc, ext); \
    132    assign_itx_bpc_fn(pfx, w, h, flipadst_dct,      DCT_FLIPADST,      bpc, ext); \
    133    assign_itx_bpc_fn(pfx, w, h, flipadst_adst,     ADST_FLIPADST,     bpc, ext); \
    134    assign_itx_bpc_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, bpc, ext); \
    135    assign_itx_bpc_fn(pfx, w, h, identity_dct,      V_DCT,             bpc, ext)
    136 
        /* The twelve above plus the four adst/flipadst-with-identity
         * pairings, sixteen in total. */
    137 #define assign_itx16_bpc_fn(pfx, w, h, bpc, ext) \
    138    assign_itx12_bpc_fn(pfx, w, h, bpc, ext); \
    139    assign_itx_bpc_fn(pfx, w, h, adst_identity,     H_ADST,            bpc, ext); \
    140    assign_itx_bpc_fn(pfx, w, h, flipadst_identity, H_FLIPADST,        bpc, ext); \
    141    assign_itx_bpc_fn(pfx, w, h, identity_adst,     V_ADST,            bpc, ext); \
    142    assign_itx_bpc_fn(pfx, w, h, identity_flipadst, V_FLIPADST,        bpc, ext)
    143 
    144    const unsigned flags = dav1d_get_cpu_flags();
    145 
           /* Without SSE2 nothing is installed at all. */
    146    if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
    147 
           /* SSE2 provides only the 4x4 wht_wht entry. */
    148    assign_itx_fn(, 4, 4, wht_wht, WHT_WHT, sse2);
    149 
    150    if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
    151 
           /* SSSE3 functions are installed for 8-bit builds only. */
    152 #if BITDEPTH == 8
    153    assign_itx16_fn(,   4,  4, ssse3);
    154    assign_itx16_fn(R,  4,  8, ssse3);
    155    assign_itx16_fn(R,  8,  4, ssse3);
    156    assign_itx16_fn(,   8,  8, ssse3);
    157    assign_itx16_fn(R,  4, 16, ssse3);
    158    assign_itx16_fn(R, 16,  4, ssse3);
    159    assign_itx16_fn(R,  8, 16, ssse3);
    160    assign_itx16_fn(R, 16,  8, ssse3);
    161    assign_itx12_fn(,  16, 16, ssse3);
    162    assign_itx2_fn (R,  8, 32, ssse3);
    163    assign_itx2_fn (R, 32,  8, ssse3);
    164    assign_itx2_fn (R, 16, 32, ssse3);
    165    assign_itx2_fn (R, 32, 16, ssse3);
    166    assign_itx2_fn (,  32, 32, ssse3);
    167    assign_itx1_fn (R, 16, 64, ssse3);
    168    assign_itx1_fn (R, 32, 64, ssse3);
    169    assign_itx1_fn (R, 64, 16, ssse3);
    170    assign_itx1_fn (R, 64, 32, ssse3);
    171    assign_itx1_fn ( , 64, 64, ssse3);
    172    *all_simd = 1;
    173 #endif
    174 
    175    if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
    176 
           /* SSE4.1 functions are installed for 16-bit builds only, and only
            * when bpc is 10; other bit depths get nothing at this level. */
    177 #if BITDEPTH == 16
    178    if (bpc == 10) {
    179        assign_itx16_fn(,   4,  4, sse4);
    180        assign_itx16_fn(R,  4,  8, sse4);
    181        assign_itx16_fn(R,  4, 16, sse4);
    182        assign_itx16_fn(R,  8,  4, sse4);
    183        assign_itx16_fn(,   8,  8, sse4);
    184        assign_itx16_fn(R,  8, 16, sse4);
    185        assign_itx16_fn(R, 16,  4, sse4);
    186        assign_itx16_fn(R, 16,  8, sse4);
    187        assign_itx12_fn(,  16, 16, sse4);
    188        assign_itx2_fn (R,  8, 32, sse4);
    189        assign_itx2_fn (R, 32,  8, sse4);
    190        assign_itx2_fn (R, 16, 32, sse4);
    191        assign_itx2_fn (R, 32, 16, sse4);
    192        assign_itx2_fn (,  32, 32, sse4);
    193        assign_itx1_fn (R, 16, 64, sse4);
    194        assign_itx1_fn (R, 32, 64, sse4);
    195        assign_itx1_fn (R, 64, 16, sse4);
    196        assign_itx1_fn (R, 64, 32, sse4);
    197        assign_itx1_fn (,  64, 64, sse4);
    198        *all_simd = 1;
    199    }
    200 #endif
    201 
           /* Everything from here to the end of the function (AVX2 and
            * AVX-512 ICL) is compiled for x86-64 only. */
    202 #if ARCH_X86_64
    203    if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
    204 
           /* Replaces the SSE2 4x4 wht_wht entry in both bit-depth builds. */
    205    assign_itx_fn(, 4, 4, wht_wht, WHT_WHT, avx2);
    206 
    207 #if BITDEPTH == 8
    208    assign_itx16_fn( ,  4,  4, avx2);
    209    assign_itx16_fn(R,  4,  8, avx2);
    210    assign_itx16_fn(R,  4, 16, avx2);
    211    assign_itx16_fn(R,  8,  4, avx2);
    212    assign_itx16_fn( ,  8,  8, avx2);
    213    assign_itx16_fn(R,  8, 16, avx2);
    214    assign_itx2_fn (R,  8, 32, avx2);
    215    assign_itx16_fn(R, 16,  4, avx2);
    216    assign_itx16_fn(R, 16,  8, avx2);
    217    assign_itx12_fn( , 16, 16, avx2);
    218    assign_itx2_fn (R, 16, 32, avx2);
    219    assign_itx1_fn (R, 16, 64, avx2);
    220    assign_itx2_fn (R, 32,  8, avx2);
    221    assign_itx2_fn (R, 32, 16, avx2);
    222    assign_itx2_fn ( , 32, 32, avx2);
    223    assign_itx1_fn (R, 32, 64, avx2);
    224    assign_itx1_fn (R, 64, 16, avx2);
    225    assign_itx1_fn (R, 64, 32, avx2);
    226    assign_itx1_fn ( , 64, 64, avx2);
    227 #else
           /* Non-8-bit build: bpc == 10 gets the 10 bpc AVX2 functions, any
            * other bpc value gets the 12 bpc ones. */
    228    if (bpc == 10) {
    229        assign_itx16_bpc_fn( ,  4,  4, 10, avx2);
    230        assign_itx16_bpc_fn(R,  4,  8, 10, avx2);
    231        assign_itx16_bpc_fn(R,  4, 16, 10, avx2);
    232        assign_itx16_bpc_fn(R,  8,  4, 10, avx2);
    233        assign_itx16_bpc_fn( ,  8,  8, 10, avx2);
    234        assign_itx16_bpc_fn(R,  8, 16, 10, avx2);
    235        assign_itx2_bpc_fn (R,  8, 32, 10, avx2);
    236        assign_itx16_bpc_fn(R, 16,  4, 10, avx2);
    237        assign_itx16_bpc_fn(R, 16,  8, 10, avx2);
    238        assign_itx12_bpc_fn( , 16, 16, 10, avx2);
    239        assign_itx2_bpc_fn (R, 16, 32, 10, avx2);
    240        assign_itx1_bpc_fn (R, 16, 64, 10, avx2);
    241        assign_itx2_bpc_fn (R, 32,  8, 10, avx2);
    242        assign_itx2_bpc_fn (R, 32, 16, 10, avx2);
    243        assign_itx2_bpc_fn ( , 32, 32, 10, avx2);
    244        assign_itx1_bpc_fn (R, 32, 64, 10, avx2);
    245        assign_itx1_bpc_fn (R, 64, 16, 10, avx2);
    246        assign_itx1_bpc_fn (R, 64, 32, 10, avx2);
    247        assign_itx1_bpc_fn ( , 64, 64, 10, avx2);
    248    } else {
    249        assign_itx16_bpc_fn( ,  4,  4, 12, avx2);
    250        assign_itx16_bpc_fn(R,  4,  8, 12, avx2);
    251        assign_itx16_bpc_fn(R,  4, 16, 12, avx2);
    252        assign_itx16_bpc_fn(R,  8,  4, 12, avx2);
    253        assign_itx16_bpc_fn( ,  8,  8, 12, avx2);
    254        assign_itx16_bpc_fn(R,  8, 16, 12, avx2);
    255        assign_itx2_bpc_fn (R,  8, 32, 12, avx2);
    256        assign_itx16_bpc_fn(R, 16,  4, 12, avx2);
    257        assign_itx16_bpc_fn(R, 16,  8, 12, avx2);
    258        assign_itx12_bpc_fn( , 16, 16, 12, avx2);
    259        assign_itx2_bpc_fn (R, 32,  8, 12, avx2);
               /* For 16x32, 32x16 and 32x32 only identity_identity is
                * assigned at 12 bpc (no dct_dct), and no size with a 64
                * dimension is assigned in this branch. */
    260        assign_itx_bpc_fn(R, 16, 32, identity_identity, IDTX, 12, avx2);
    261        assign_itx_bpc_fn(R, 32, 16, identity_identity, IDTX, 12, avx2);
    262        assign_itx_bpc_fn( , 32, 32, identity_identity, IDTX, 12, avx2);
    263    }
    264 #endif
    265 
    266    if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
    267 
    268 #if BITDEPTH == 8
    269    assign_itx16_fn( ,  4,  4, avx512icl); // no wht
    270    assign_itx16_fn(R,  4,  8, avx512icl);
    271    assign_itx16_fn(R,  4, 16, avx512icl);
    272    assign_itx16_fn(R,  8,  4, avx512icl);
    273    assign_itx16_fn( ,  8,  8, avx512icl);
    274    assign_itx16_fn(R,  8, 16, avx512icl);
    275    assign_itx2_fn (R,  8, 32, avx512icl);
    276    assign_itx16_fn(R, 16,  4, avx512icl);
    277    assign_itx16_fn(R, 16,  8, avx512icl);
    278    assign_itx12_fn( , 16, 16, avx512icl);
    279    assign_itx2_fn (R, 16, 32, avx512icl);
    280    assign_itx1_fn (R, 16, 64, avx512icl);
    281    assign_itx2_fn (R, 32,  8, avx512icl);
    282    assign_itx2_fn (R, 32, 16, avx512icl);
    283    assign_itx2_fn ( , 32, 32, avx512icl);
    284    assign_itx1_fn (R, 32, 64, avx512icl);
    285    assign_itx1_fn (R, 64, 16, avx512icl);
    286    assign_itx1_fn (R, 64, 32, avx512icl);
    287    assign_itx1_fn ( , 64, 64, avx512icl);
    288 #else
           /* Non-8-bit build: AVX-512 functions are installed for bpc == 10
            * only, and only for sizes whose width and height are both at
            * least 8. The 4x4, 4x8, 4x16, 8x4 and 16x4 entries, and every
            * entry for other bit depths, keep what the AVX2 block assigned. */
    289    if (bpc == 10) {
    290        assign_itx16_bpc_fn( ,  8,  8, 10, avx512icl);
    291        assign_itx16_bpc_fn(R,  8, 16, 10, avx512icl);
    292        assign_itx2_bpc_fn (R,  8, 32, 10, avx512icl);
    293        assign_itx16_bpc_fn(R, 16,  8, 10, avx512icl);
    294        assign_itx12_bpc_fn( , 16, 16, 10, avx512icl);
    295        assign_itx2_bpc_fn (R, 16, 32, 10, avx512icl);
    296        assign_itx2_bpc_fn (R, 32,  8, 10, avx512icl);
    297        assign_itx2_bpc_fn (R, 32, 16, 10, avx512icl);
    298        assign_itx2_bpc_fn ( , 32, 32, 10, avx512icl);
    299        assign_itx1_bpc_fn (R, 16, 64, 10, avx512icl);
    300        assign_itx1_bpc_fn (R, 32, 64, 10, avx512icl);
    301        assign_itx1_bpc_fn (R, 64, 16, 10, avx512icl);
    302        assign_itx1_bpc_fn (R, 64, 32, 10, avx512icl);
    303        assign_itx1_bpc_fn ( , 64, 64, 10, avx512icl);
    304    }
    305 #endif
    306 #endif
    307 }