itx.h (13230B)
1 /* 2 * Copyright © 2018-2023, VideoLAN and dav1d authors 3 * Copyright © 2018-2023, Two Orioles, LLC 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright notice, this 10 * list of conditions and the following disclaimer. 11 * 12 * 2. Redistributions in binary form must reproduce the above copyright notice, 13 * this list of conditions and the following disclaimer in the documentation 14 * and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28 #include "src/cpu.h" 29 #include "src/itx.h" 30 31 #define BF_BPC(x, bits, suffix) x##_##bits##bpc_##suffix 32 33 #define decl_itx_fns(ext) \ 34 decl_itx17_fns( 4, 4, ext); \ 35 decl_itx16_fns( 4, 8, ext); \ 36 decl_itx16_fns( 4, 16, ext); \ 37 decl_itx16_fns( 8, 4, ext); \ 38 decl_itx16_fns( 8, 8, ext); \ 39 decl_itx16_fns( 8, 16, ext); \ 40 decl_itx2_fns ( 8, 32, ext); \ 41 decl_itx16_fns(16, 4, ext); \ 42 decl_itx16_fns(16, 8, ext); \ 43 decl_itx12_fns(16, 16, ext); \ 44 decl_itx2_fns (16, 32, ext); \ 45 decl_itx2_fns (32, 8, ext); \ 46 decl_itx2_fns (32, 16, ext); \ 47 decl_itx2_fns (32, 32, ext); \ 48 decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x64, ext)); \ 49 decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x64, ext)); \ 50 decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x16, ext)); \ 51 decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, ext)); \ 52 decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, ext)) 53 54 55 #define decl_itx2_bpc_fns(w, h, bpc, opt) \ 56 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_##w##x##h, bpc, opt)); \ 57 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_identity_##w##x##h, bpc, opt)) 58 59 #define decl_itx12_bpc_fns(w, h, bpc, opt) \ 60 decl_itx2_bpc_fns(w, h, bpc, opt); \ 61 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_adst_##w##x##h, bpc, opt)); \ 62 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, bpc, opt)); \ 63 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_identity_##w##x##h, bpc, opt)); \ 64 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_dct_##w##x##h, bpc, opt)); \ 65 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_adst_##w##x##h, bpc, opt)); \ 66 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, bpc, opt)); \ 67 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, bpc, opt)); \ 68 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, bpc, opt)); \ 69 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, bpc, opt)); \ 70 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_dct_##w##x##h, bpc, opt)) 71 72 #define decl_itx16_bpc_fns(w, h, bpc, opt) \ 73 decl_itx12_bpc_fns(w, h, bpc, opt); \ 74 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_identity_##w##x##h, bpc, opt)); \ 75 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, bpc, opt)); \ 76 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_adst_##w##x##h, bpc, opt)); \ 77 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, bpc, opt)) 78 79 #define decl_itx_bpc_fns(bpc, ext) \ 80 decl_itx16_bpc_fns( 4, 4, bpc, ext); \ 81 decl_itx16_bpc_fns( 4, 8, bpc, ext); \ 82 decl_itx16_bpc_fns( 4, 16, bpc, ext); \ 83 decl_itx16_bpc_fns( 8, 4, bpc, ext); \ 84 decl_itx16_bpc_fns( 8, 8, bpc, ext); \ 85 decl_itx16_bpc_fns( 8, 16, bpc, ext); \ 86 decl_itx2_bpc_fns ( 8, 32, bpc, ext); \ 87 decl_itx16_bpc_fns(16, 4, bpc, ext); \ 88 decl_itx16_bpc_fns(16, 8, bpc, ext); \ 89 decl_itx12_bpc_fns(16, 16, bpc, ext); \ 90 decl_itx2_bpc_fns (16, 32, bpc, ext); \ 91 decl_itx2_bpc_fns (32, 8, bpc, ext); \ 92 decl_itx2_bpc_fns (32, 16, bpc, ext); \ 93 decl_itx2_bpc_fns (32, 32, bpc, ext); \ 94 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_16x64, bpc, ext)); \ 95 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_32x64, bpc, ext)); \ 96 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_64x16, bpc, ext)); \ 97 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_64x32, bpc, ext)); \ 98 decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_64x64, bpc, ext)) 99 100 decl_itx_fns(avx512icl); 101 decl_itx_bpc_fns(10, avx512icl); 102 decl_itx_fns(avx2); 103 decl_itx_bpc_fns(10, avx2); 104 decl_itx_bpc_fns(12, avx2); 105 decl_itx_fns(sse4); 106 decl_itx_fns(ssse3); 107 decl_itx_fn(dav1d_inv_txfm_add_wht_wht_4x4_16bpc_avx2); 108 decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_4x4, sse2)); 109 110 static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, 111 const int bpc, int *const all_simd) 112 { 113 #define assign_itx_bpc_fn(pfx, w, h, type, type_enum, bpc, ext) \ 114 c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \ 115 BF_BPC(dav1d_inv_txfm_add_##type##_##w##x##h, bpc, ext) 116 117 #define assign_itx1_bpc_fn(pfx, w, h, bpc, ext) \ 118 assign_itx_bpc_fn(pfx, w, h, dct_dct, DCT_DCT, bpc, ext) 119 120 #define assign_itx2_bpc_fn(pfx, w, h, bpc, ext) \ 121 assign_itx1_bpc_fn(pfx, w, h, bpc, ext); \ 122 assign_itx_bpc_fn(pfx, w, h, identity_identity, IDTX, bpc, ext) 123 124 #define assign_itx12_bpc_fn(pfx, w, h, bpc, ext) \ 125 assign_itx2_bpc_fn(pfx, w, h, bpc, ext); \ 126 assign_itx_bpc_fn(pfx, w, h, dct_adst, ADST_DCT, bpc, ext); \ 127 assign_itx_bpc_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, bpc, ext); \ 128 assign_itx_bpc_fn(pfx, w, h, dct_identity, H_DCT, bpc, ext); \ 129 assign_itx_bpc_fn(pfx, w, h, adst_dct, DCT_ADST, bpc, ext); \ 130 assign_itx_bpc_fn(pfx, w, h, adst_adst, ADST_ADST, bpc, ext); \ 131 assign_itx_bpc_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, bpc, ext); \ 132 assign_itx_bpc_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, bpc, ext); \ 133 assign_itx_bpc_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, bpc, ext); \ 134 assign_itx_bpc_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, bpc, ext); \ 135 assign_itx_bpc_fn(pfx, w, h, identity_dct, V_DCT, bpc, ext) 136 137 #define assign_itx16_bpc_fn(pfx, w, h, bpc, ext) \ 138 assign_itx12_bpc_fn(pfx, w, h, bpc, ext); \ 139 assign_itx_bpc_fn(pfx, w, h, adst_identity, H_ADST, bpc, ext); \ 140 assign_itx_bpc_fn(pfx, w, h, flipadst_identity, H_FLIPADST, bpc, ext); \ 141 assign_itx_bpc_fn(pfx, w, h, identity_adst, V_ADST, bpc, ext); \ 142 assign_itx_bpc_fn(pfx, w, h, identity_flipadst, V_FLIPADST, bpc, ext) 143 144 const unsigned flags = dav1d_get_cpu_flags(); 145 146 if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return; 147 148 assign_itx_fn(, 4, 4, wht_wht, WHT_WHT, sse2); 149 150 if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; 151 152 #if BITDEPTH == 8 153 assign_itx16_fn(, 4, 4, ssse3); 154 assign_itx16_fn(R, 4, 8, ssse3); 155 assign_itx16_fn(R, 8, 4, ssse3); 156 assign_itx16_fn(, 8, 8, ssse3); 157 assign_itx16_fn(R, 4, 16, ssse3); 158 assign_itx16_fn(R, 16, 4, ssse3); 159 assign_itx16_fn(R, 8, 16, ssse3); 160 assign_itx16_fn(R, 16, 8, ssse3); 161 assign_itx12_fn(, 16, 16, ssse3); 162 assign_itx2_fn (R, 8, 32, ssse3); 163 assign_itx2_fn (R, 32, 8, ssse3); 164 assign_itx2_fn (R, 16, 32, ssse3); 165 assign_itx2_fn (R, 32, 16, ssse3); 166 assign_itx2_fn (, 32, 32, ssse3); 167 assign_itx1_fn (R, 16, 64, ssse3); 168 assign_itx1_fn (R, 32, 64, ssse3); 169 assign_itx1_fn (R, 64, 16, ssse3); 170 assign_itx1_fn (R, 64, 32, ssse3); 171 assign_itx1_fn ( , 64, 64, ssse3); 172 *all_simd = 1; 173 #endif 174 175 if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return; 176 177 #if BITDEPTH == 16 178 if (bpc == 10) { 179 assign_itx16_fn(, 4, 4, sse4); 180 assign_itx16_fn(R, 4, 8, sse4); 181 assign_itx16_fn(R, 4, 16, sse4); 182 assign_itx16_fn(R, 8, 4, sse4); 183 assign_itx16_fn(, 8, 8, sse4); 184 assign_itx16_fn(R, 8, 16, sse4); 185 assign_itx16_fn(R, 16, 4, sse4); 186 assign_itx16_fn(R, 16, 8, sse4); 187 assign_itx12_fn(, 16, 16, sse4); 188 assign_itx2_fn (R, 8, 32, sse4); 189 assign_itx2_fn (R, 32, 8, sse4); 190 assign_itx2_fn (R, 16, 32, sse4); 191 assign_itx2_fn (R, 32, 16, sse4); 192 assign_itx2_fn (, 32, 32, sse4); 193 assign_itx1_fn (R, 16, 64, sse4); 194 assign_itx1_fn (R, 32, 64, sse4); 195 assign_itx1_fn (R, 64, 16, sse4); 196 assign_itx1_fn (R, 64, 32, sse4); 197 assign_itx1_fn (, 64, 64, sse4); 198 *all_simd = 1; 199 } 200 #endif 201 202 #if ARCH_X86_64 203 if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; 204 205 assign_itx_fn(, 4, 4, wht_wht, WHT_WHT, avx2); 206 207 #if BITDEPTH == 8 208 assign_itx16_fn( , 4, 4, avx2); 209 assign_itx16_fn(R, 4, 8, avx2); 210 assign_itx16_fn(R, 4, 16, avx2); 211 assign_itx16_fn(R, 8, 4, avx2); 212 assign_itx16_fn( , 8, 8, avx2); 213 assign_itx16_fn(R, 8, 16, avx2); 214 assign_itx2_fn (R, 8, 32, avx2); 215 assign_itx16_fn(R, 16, 4, avx2); 216 assign_itx16_fn(R, 16, 8, avx2); 217 assign_itx12_fn( , 16, 16, avx2); 218 assign_itx2_fn (R, 16, 32, avx2); 219 assign_itx1_fn (R, 16, 64, avx2); 220 assign_itx2_fn (R, 32, 8, avx2); 221 assign_itx2_fn (R, 32, 16, avx2); 222 assign_itx2_fn ( , 32, 32, avx2); 223 assign_itx1_fn (R, 32, 64, avx2); 224 assign_itx1_fn (R, 64, 16, avx2); 225 assign_itx1_fn (R, 64, 32, avx2); 226 assign_itx1_fn ( , 64, 64, avx2); 227 #else 228 if (bpc == 10) { 229 assign_itx16_bpc_fn( , 4, 4, 10, avx2); 230 assign_itx16_bpc_fn(R, 4, 8, 10, avx2); 231 assign_itx16_bpc_fn(R, 4, 16, 10, avx2); 232 assign_itx16_bpc_fn(R, 8, 4, 10, avx2); 233 assign_itx16_bpc_fn( , 8, 8, 10, avx2); 234 assign_itx16_bpc_fn(R, 8, 16, 10, avx2); 235 assign_itx2_bpc_fn (R, 8, 32, 10, avx2); 236 assign_itx16_bpc_fn(R, 16, 4, 10, avx2); 237 assign_itx16_bpc_fn(R, 16, 8, 10, avx2); 238 assign_itx12_bpc_fn( , 16, 16, 10, avx2); 239 assign_itx2_bpc_fn (R, 16, 32, 10, avx2); 240 assign_itx1_bpc_fn (R, 16, 64, 10, avx2); 241 assign_itx2_bpc_fn (R, 32, 8, 10, avx2); 242 assign_itx2_bpc_fn (R, 32, 16, 10, avx2); 243 assign_itx2_bpc_fn ( , 32, 32, 10, avx2); 244 assign_itx1_bpc_fn (R, 32, 64, 10, avx2); 245 assign_itx1_bpc_fn (R, 64, 16, 10, avx2); 246 assign_itx1_bpc_fn (R, 64, 32, 10, avx2); 247 assign_itx1_bpc_fn ( , 64, 64, 10, avx2); 248 } else { 249 assign_itx16_bpc_fn( , 4, 4, 12, avx2); 250 assign_itx16_bpc_fn(R, 4, 8, 12, avx2); 251 assign_itx16_bpc_fn(R, 4, 16, 12, avx2); 252 assign_itx16_bpc_fn(R, 8, 4, 12, avx2); 253 assign_itx16_bpc_fn( , 8, 8, 12, avx2); 254 assign_itx16_bpc_fn(R, 8, 16, 12, avx2); 255 assign_itx2_bpc_fn (R, 8, 32, 12, avx2); 256 assign_itx16_bpc_fn(R, 16, 4, 12, avx2); 257 assign_itx16_bpc_fn(R, 16, 8, 12, avx2); 258 assign_itx12_bpc_fn( , 16, 16, 12, avx2); 259 assign_itx2_bpc_fn (R, 32, 8, 12, avx2); 260 assign_itx_bpc_fn(R, 16, 32, identity_identity, IDTX, 12, avx2); 261 assign_itx_bpc_fn(R, 32, 16, identity_identity, IDTX, 12, avx2); 262 assign_itx_bpc_fn( , 32, 32, identity_identity, IDTX, 12, avx2); 263 } 264 #endif 265 266 if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return; 267 268 #if BITDEPTH == 8 269 assign_itx16_fn( , 4, 4, avx512icl); // no wht 270 assign_itx16_fn(R, 4, 8, avx512icl); 271 assign_itx16_fn(R, 4, 16, avx512icl); 272 assign_itx16_fn(R, 8, 4, avx512icl); 273 assign_itx16_fn( , 8, 8, avx512icl); 274 assign_itx16_fn(R, 8, 16, avx512icl); 275 assign_itx2_fn (R, 8, 32, avx512icl); 276 assign_itx16_fn(R, 16, 4, avx512icl); 277 assign_itx16_fn(R, 16, 8, avx512icl); 278 assign_itx12_fn( , 16, 16, avx512icl); 279 assign_itx2_fn (R, 16, 32, avx512icl); 280 assign_itx1_fn (R, 16, 64, avx512icl); 281 assign_itx2_fn (R, 32, 8, avx512icl); 282 assign_itx2_fn (R, 32, 16, avx512icl); 283 assign_itx2_fn ( , 32, 32, avx512icl); 284 assign_itx1_fn (R, 32, 64, avx512icl); 285 assign_itx1_fn (R, 64, 16, avx512icl); 286 assign_itx1_fn (R, 64, 32, avx512icl); 287 assign_itx1_fn ( , 64, 64, avx512icl); 288 #else 289 if (bpc == 10) { 290 assign_itx16_bpc_fn( , 8, 8, 10, avx512icl); 291 assign_itx16_bpc_fn(R, 8, 16, 10, avx512icl); 292 assign_itx2_bpc_fn (R, 8, 32, 10, avx512icl); 293 assign_itx16_bpc_fn(R, 16, 8, 10, avx512icl); 294 assign_itx12_bpc_fn( , 16, 16, 10, avx512icl); 295 assign_itx2_bpc_fn (R, 16, 32, 10, avx512icl); 296 assign_itx2_bpc_fn (R, 32, 8, 10, avx512icl); 297 assign_itx2_bpc_fn (R, 32, 16, 10, avx512icl); 298 assign_itx2_bpc_fn ( , 32, 32, 10, avx512icl); 299 assign_itx1_bpc_fn (R, 16, 64, 10, avx512icl); 300 assign_itx1_bpc_fn (R, 32, 64, 10, avx512icl); 301 assign_itx1_bpc_fn (R, 64, 16, 10, avx512icl); 302 assign_itx1_bpc_fn (R, 64, 32, 10, avx512icl); 303 assign_itx1_bpc_fn ( , 64, 64, 10, avx512icl); 304 } 305 #endif 306 #endif 307 }