vp9dsp_init.c (15364B)
1 /* 2 * VP9 SIMD optimizations 3 * 4 * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com> 5 * 6 * This file is part of FFmpeg. 7 * 8 * FFmpeg is free software; you can redistribute it and/or 9 * modify it under the terms of the GNU Lesser General Public 10 * License as published by the Free Software Foundation; either 11 * version 2.1 of the License, or (at your option) any later version. 12 * 13 * FFmpeg is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 * Lesser General Public License for more details. 17 * 18 * You should have received a copy of the GNU Lesser General Public 19 * License along with FFmpeg; if not, write to the Free Software 20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 21 */ 22 23 #include "libavutil/attributes.h" 24 #include "libavutil/cpu.h" 25 #include "libavutil/x86/cpu.h" 26 #include "libavcodec/vp9dsp.h" 27 #include "libavcodec/x86/vp9dsp_init.h" 28 29 #if HAVE_X86ASM 30 31 decl_fpel_func(put, 4, , mmx); 32 decl_fpel_func(put, 8, , mmx); 33 decl_fpel_func(put, 16, , sse); 34 decl_fpel_func(put, 32, , sse); 35 decl_fpel_func(put, 64, , sse); 36 decl_fpel_func(avg, 4, _8, mmxext); 37 decl_fpel_func(avg, 8, _8, mmxext); 38 decl_fpel_func(avg, 16, _8, sse2); 39 decl_fpel_func(avg, 32, _8, sse2); 40 decl_fpel_func(avg, 64, _8, sse2); 41 decl_fpel_func(put, 32, , avx); 42 decl_fpel_func(put, 64, , avx); 43 decl_fpel_func(avg, 32, _8, avx2); 44 decl_fpel_func(avg, 64, _8, avx2); 45 46 decl_mc_funcs(4, mmxext, int16_t, 8, 8); 47 decl_mc_funcs(8, sse2, int16_t, 8, 8); 48 decl_mc_funcs(4, ssse3, int8_t, 32, 8); 49 decl_mc_funcs(8, ssse3, int8_t, 32, 8); 50 #if ARCH_X86_64 51 decl_mc_funcs(16, ssse3, int8_t, 32, 8); 52 decl_mc_funcs(32, avx2, int8_t, 32, 8); 53 #endif 54 55 mc_rep_funcs(16, 8, 8, sse2, int16_t, 8, 8) 56 #if ARCH_X86_32 57 mc_rep_funcs(16, 8, 8, ssse3, int8_t, 32, 8) 58 #endif 59 mc_rep_funcs(32, 16, 16, sse2, int16_t, 8, 8) 60 mc_rep_funcs(32, 16, 16, ssse3, int8_t, 32, 8) 61 mc_rep_funcs(64, 32, 32, sse2, int16_t, 8, 8) 62 mc_rep_funcs(64, 32, 32, ssse3, int8_t, 32, 8) 63 #if ARCH_X86_64 && HAVE_AVX2_EXTERNAL 64 mc_rep_funcs(64, 32, 32, avx2, int8_t, 32, 8) 65 #endif 66 67 extern const int8_t ff_filters_ssse3[3][15][4][32]; 68 extern const int16_t ff_filters_sse2[3][15][8][8]; 69 70 filters_8tap_2d_fn2(put, 16, 8, 1, mmxext, sse2, sse2) 71 filters_8tap_2d_fn2(avg, 16, 8, 1, mmxext, sse2, sse2) 72 filters_8tap_2d_fn2(put, 16, 8, 1, ssse3, ssse3, ssse3) 73 filters_8tap_2d_fn2(avg, 16, 8, 1, ssse3, ssse3, ssse3) 74 #if ARCH_X86_64 && HAVE_AVX2_EXTERNAL 75 filters_8tap_2d_fn(put, 64, 32, 8, 1, avx2, ssse3) 76 filters_8tap_2d_fn(put, 32, 32, 8, 1, avx2, ssse3) 77 filters_8tap_2d_fn(avg, 64, 32, 8, 1, avx2, ssse3) 78 filters_8tap_2d_fn(avg, 32, 32, 8, 1, avx2, ssse3) 79 #endif 80 81 filters_8tap_1d_fn3(put, 8, mmxext, sse2, sse2) 82 filters_8tap_1d_fn3(avg, 8, mmxext, sse2, sse2) 83 filters_8tap_1d_fn3(put, 8, ssse3, ssse3, ssse3) 84 filters_8tap_1d_fn3(avg, 8, ssse3, ssse3, ssse3) 85 #if ARCH_X86_64 && HAVE_AVX2_EXTERNAL 86 filters_8tap_1d_fn2(put, 64, 8, avx2, ssse3) 87 filters_8tap_1d_fn2(put, 32, 8, avx2, ssse3) 88 filters_8tap_1d_fn2(avg, 64, 8, avx2, ssse3) 89 filters_8tap_1d_fn2(avg, 32, 8, avx2, ssse3) 90 #endif 91 92 #define itxfm_func(typea, typeb, size, opt) \ 93 void ff_vp9_##typea##_##typeb##_##size##x##size##_add_##opt(uint8_t *dst, ptrdiff_t stride, \ 94 int16_t *block, int eob) 95 #define itxfm_funcs(size, opt) \ 96 itxfm_func(idct, idct, size, opt); \ 97 itxfm_func(iadst, idct, size, opt); \ 98 itxfm_func(idct, iadst, size, opt); \ 99 itxfm_func(iadst, iadst, size, opt) 100 101 itxfm_func(idct, idct, 4, mmxext); 102 itxfm_func(idct, iadst, 4, sse2); 103 itxfm_func(iadst, idct, 4, sse2); 104 itxfm_func(iadst, iadst, 4, sse2); 105 itxfm_funcs(4, ssse3); 106 itxfm_funcs(8, sse2); 107 itxfm_funcs(8, ssse3); 108 itxfm_funcs(8, avx); 109 itxfm_funcs(16, sse2); 110 itxfm_funcs(16, ssse3); 111 itxfm_funcs(16, avx); 112 itxfm_func(idct, idct, 32, sse2); 113 itxfm_func(idct, idct, 32, ssse3); 114 itxfm_func(idct, idct, 32, avx); 115 itxfm_func(iwht, iwht, 4, mmx); 116 itxfm_funcs(16, avx2); 117 itxfm_func(idct, idct, 32, avx2); 118 119 #undef itxfm_func 120 #undef itxfm_funcs 121 122 #define lpf_funcs(size1, size2, opt) \ 123 void ff_vp9_loop_filter_v_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \ 124 int E, int I, int H); \ 125 void ff_vp9_loop_filter_h_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \ 126 int E, int I, int H) 127 128 lpf_funcs(4, 8, mmxext); 129 lpf_funcs(8, 8, mmxext); 130 lpf_funcs(16, 16, sse2); 131 lpf_funcs(16, 16, ssse3); 132 lpf_funcs(16, 16, avx); 133 lpf_funcs(44, 16, sse2); 134 lpf_funcs(44, 16, ssse3); 135 lpf_funcs(44, 16, avx); 136 lpf_funcs(84, 16, sse2); 137 lpf_funcs(84, 16, ssse3); 138 lpf_funcs(84, 16, avx); 139 lpf_funcs(48, 16, sse2); 140 lpf_funcs(48, 16, ssse3); 141 lpf_funcs(48, 16, avx); 142 lpf_funcs(88, 16, sse2); 143 lpf_funcs(88, 16, ssse3); 144 lpf_funcs(88, 16, avx); 145 146 #undef lpf_funcs 147 148 #define ipred_func(size, type, opt) \ 149 void ff_vp9_ipred_##type##_##size##x##size##_##opt(uint8_t *dst, ptrdiff_t stride, \ 150 const uint8_t *l, const uint8_t *a) 151 152 ipred_func(8, v, mmx); 153 154 #define ipred_dc_funcs(size, opt) \ 155 ipred_func(size, dc, opt); \ 156 ipred_func(size, dc_left, opt); \ 157 ipred_func(size, dc_top, opt) 158 159 ipred_dc_funcs(4, mmxext); 160 ipred_dc_funcs(8, mmxext); 161 162 #define ipred_dir_tm_funcs(size, opt) \ 163 ipred_func(size, tm, opt); \ 164 ipred_func(size, dl, opt); \ 165 ipred_func(size, dr, opt); \ 166 ipred_func(size, hd, opt); \ 167 ipred_func(size, hu, opt); \ 168 ipred_func(size, vl, opt); \ 169 ipred_func(size, vr, opt) 170 171 ipred_dir_tm_funcs(4, mmxext); 172 173 ipred_func(16, v, sse); 174 ipred_func(32, v, sse); 175 176 ipred_dc_funcs(16, sse2); 177 ipred_dc_funcs(32, sse2); 178 179 #define ipred_dir_tm_h_funcs(size, opt) \ 180 ipred_dir_tm_funcs(size, opt); \ 181 ipred_func(size, h, opt) 182 183 ipred_dir_tm_h_funcs(8, sse2); 184 ipred_dir_tm_h_funcs(16, sse2); 185 ipred_dir_tm_h_funcs(32, sse2); 186 187 ipred_func(4, h, sse2); 188 189 #define ipred_all_funcs(size, opt) \ 190 ipred_dc_funcs(size, opt); \ 191 ipred_dir_tm_h_funcs(size, opt) 192 193 // FIXME hd/vl_4x4_ssse3 does not exist 194 ipred_all_funcs(4, ssse3); 195 ipred_all_funcs(8, ssse3); 196 ipred_all_funcs(16, ssse3); 197 ipred_all_funcs(32, ssse3); 198 199 ipred_dir_tm_h_funcs(8, avx); 200 ipred_dir_tm_h_funcs(16, avx); 201 ipred_dir_tm_h_funcs(32, avx); 202 203 ipred_func(32, v, avx); 204 205 ipred_dc_funcs(32, avx2); 206 ipred_func(32, h, avx2); 207 ipred_func(32, tm, avx2); 208 209 #undef ipred_func 210 #undef ipred_dir_tm_h_funcs 211 #undef ipred_dir_tm_funcs 212 #undef ipred_dc_funcs 213 214 #endif /* HAVE_X86ASM */ 215 216 av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact) 217 { 218 #if HAVE_X86ASM 219 int cpu_flags; 220 221 if (bpp == 10) { 222 ff_vp9dsp_init_10bpp_x86(dsp, bitexact); 223 return; 224 } else if (bpp == 12) { 225 ff_vp9dsp_init_12bpp_x86(dsp, bitexact); 226 return; 227 } 228 229 cpu_flags = av_get_cpu_flags(); 230 231 #define init_lpf(opt) do { \ 232 dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_##opt; \ 233 dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_##opt; \ 234 dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_##opt; \ 235 dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_##opt; \ 236 dsp->loop_filter_mix2[0][1][0] = ff_vp9_loop_filter_h_48_16_##opt; \ 237 dsp->loop_filter_mix2[0][1][1] = ff_vp9_loop_filter_v_48_16_##opt; \ 238 dsp->loop_filter_mix2[1][0][0] = ff_vp9_loop_filter_h_84_16_##opt; \ 239 dsp->loop_filter_mix2[1][0][1] = ff_vp9_loop_filter_v_84_16_##opt; \ 240 dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_##opt; \ 241 dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_##opt; \ 242 } while (0) 243 244 #define init_ipred(sz, opt, t, e) \ 245 dsp->intra_pred[TX_##sz##X##sz][e##_PRED] = ff_vp9_ipred_##t##_##sz##x##sz##_##opt 246 247 #define ff_vp9_ipred_hd_4x4_ssse3 ff_vp9_ipred_hd_4x4_mmxext 248 #define ff_vp9_ipred_vl_4x4_ssse3 ff_vp9_ipred_vl_4x4_mmxext 249 #define init_dir_tm_ipred(sz, opt) do { \ 250 init_ipred(sz, opt, dl, DIAG_DOWN_LEFT); \ 251 init_ipred(sz, opt, dr, DIAG_DOWN_RIGHT); \ 252 init_ipred(sz, opt, hd, HOR_DOWN); \ 253 init_ipred(sz, opt, vl, VERT_LEFT); \ 254 init_ipred(sz, opt, hu, HOR_UP); \ 255 init_ipred(sz, opt, tm, TM_VP8); \ 256 init_ipred(sz, opt, vr, VERT_RIGHT); \ 257 } while (0) 258 #define init_dir_tm_h_ipred(sz, opt) do { \ 259 init_dir_tm_ipred(sz, opt); \ 260 init_ipred(sz, opt, h, HOR); \ 261 } while (0) 262 #define init_dc_ipred(sz, opt) do { \ 263 init_ipred(sz, opt, dc, DC); \ 264 init_ipred(sz, opt, dc_left, LEFT_DC); \ 265 init_ipred(sz, opt, dc_top, TOP_DC); \ 266 } while (0) 267 #define init_all_ipred(sz, opt) do { \ 268 init_dc_ipred(sz, opt); \ 269 init_dir_tm_h_ipred(sz, opt); \ 270 } while (0) 271 272 if (EXTERNAL_MMX(cpu_flags)) { 273 init_fpel_func(4, 0, 4, put, , mmx); 274 init_fpel_func(3, 0, 8, put, , mmx); 275 if (!bitexact) { 276 dsp->itxfm_add[4 /* lossless */][DCT_DCT] = 277 dsp->itxfm_add[4 /* lossless */][ADST_DCT] = 278 dsp->itxfm_add[4 /* lossless */][DCT_ADST] = 279 dsp->itxfm_add[4 /* lossless */][ADST_ADST] = ff_vp9_iwht_iwht_4x4_add_mmx; 280 } 281 init_ipred(8, mmx, v, VERT); 282 } 283 284 if (EXTERNAL_MMXEXT(cpu_flags)) { 285 dsp->loop_filter_8[0][0] = ff_vp9_loop_filter_h_4_8_mmxext; 286 dsp->loop_filter_8[0][1] = ff_vp9_loop_filter_v_4_8_mmxext; 287 dsp->loop_filter_8[1][0] = ff_vp9_loop_filter_h_8_8_mmxext; 288 dsp->loop_filter_8[1][1] = ff_vp9_loop_filter_v_8_8_mmxext; 289 init_subpel2(4, 0, 4, put, 8, mmxext); 290 init_subpel2(4, 1, 4, avg, 8, mmxext); 291 init_fpel_func(4, 1, 4, avg, _8, mmxext); 292 init_fpel_func(3, 1, 8, avg, _8, mmxext); 293 dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_mmxext; 294 init_dc_ipred(4, mmxext); 295 init_dc_ipred(8, mmxext); 296 init_dir_tm_ipred(4, mmxext); 297 } 298 299 if (EXTERNAL_SSE(cpu_flags)) { 300 init_fpel_func(2, 0, 16, put, , sse); 301 init_fpel_func(1, 0, 32, put, , sse); 302 init_fpel_func(0, 0, 64, put, , sse); 303 init_ipred(16, sse, v, VERT); 304 init_ipred(32, sse, v, VERT); 305 } 306 307 if (EXTERNAL_SSE2(cpu_flags)) { 308 init_subpel3_8to64(0, put, 8, sse2); 309 init_subpel3_8to64(1, avg, 8, sse2); 310 init_fpel_func(2, 1, 16, avg, _8, sse2); 311 init_fpel_func(1, 1, 32, avg, _8, sse2); 312 init_fpel_func(0, 1, 64, avg, _8, sse2); 313 init_lpf(sse2); 314 dsp->itxfm_add[TX_4X4][ADST_DCT] = ff_vp9_idct_iadst_4x4_add_sse2; 315 dsp->itxfm_add[TX_4X4][DCT_ADST] = ff_vp9_iadst_idct_4x4_add_sse2; 316 dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_sse2; 317 dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_sse2; 318 dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_sse2; 319 dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_sse2; 320 dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_sse2; 321 dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_sse2; 322 dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_sse2; 323 dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_sse2; 324 dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_sse2; 325 dsp->itxfm_add[TX_32X32][ADST_ADST] = 326 dsp->itxfm_add[TX_32X32][ADST_DCT] = 327 dsp->itxfm_add[TX_32X32][DCT_ADST] = 328 dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_sse2; 329 init_dc_ipred(16, sse2); 330 init_dc_ipred(32, sse2); 331 init_dir_tm_h_ipred(8, sse2); 332 init_dir_tm_h_ipred(16, sse2); 333 init_dir_tm_h_ipred(32, sse2); 334 init_ipred(4, sse2, h, HOR); 335 } 336 337 if (EXTERNAL_SSSE3(cpu_flags)) { 338 init_subpel3(0, put, 8, ssse3); 339 init_subpel3(1, avg, 8, ssse3); 340 dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_ssse3; 341 dsp->itxfm_add[TX_4X4][ADST_DCT] = ff_vp9_idct_iadst_4x4_add_ssse3; 342 dsp->itxfm_add[TX_4X4][DCT_ADST] = ff_vp9_iadst_idct_4x4_add_ssse3; 343 dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_ssse3; 344 dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_ssse3; 345 dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_ssse3; 346 dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_ssse3; 347 dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_ssse3; 348 dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_ssse3; 349 dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_ssse3; 350 dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_ssse3; 351 dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_ssse3; 352 dsp->itxfm_add[TX_32X32][ADST_ADST] = 353 dsp->itxfm_add[TX_32X32][ADST_DCT] = 354 dsp->itxfm_add[TX_32X32][DCT_ADST] = 355 dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_ssse3; 356 init_lpf(ssse3); 357 init_all_ipred(4, ssse3); 358 init_all_ipred(8, ssse3); 359 init_all_ipred(16, ssse3); 360 init_all_ipred(32, ssse3); 361 } 362 363 if (EXTERNAL_AVX(cpu_flags)) { 364 dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_avx; 365 dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_avx; 366 dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_avx; 367 dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_avx; 368 dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx; 369 dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_avx; 370 dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_avx; 371 dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx; 372 dsp->itxfm_add[TX_32X32][ADST_ADST] = 373 dsp->itxfm_add[TX_32X32][ADST_DCT] = 374 dsp->itxfm_add[TX_32X32][DCT_ADST] = 375 dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx; 376 init_lpf(avx); 377 init_dir_tm_h_ipred(8, avx); 378 init_dir_tm_h_ipred(16, avx); 379 init_dir_tm_h_ipred(32, avx); 380 } 381 if (EXTERNAL_AVX_FAST(cpu_flags)) { 382 init_fpel_func(1, 0, 32, put, , avx); 383 init_fpel_func(0, 0, 64, put, , avx); 384 init_ipred(32, avx, v, VERT); 385 } 386 387 if (EXTERNAL_AVX2_FAST(cpu_flags)) { 388 init_fpel_func(1, 1, 32, avg, _8, avx2); 389 init_fpel_func(0, 1, 64, avg, _8, avx2); 390 if (ARCH_X86_64) { 391 #if ARCH_X86_64 && HAVE_AVX2_EXTERNAL 392 dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx2; 393 dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_avx2; 394 dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_avx2; 395 dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx2; 396 dsp->itxfm_add[TX_32X32][ADST_ADST] = 397 dsp->itxfm_add[TX_32X32][ADST_DCT] = 398 dsp->itxfm_add[TX_32X32][DCT_ADST] = 399 dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx2; 400 init_subpel3_32_64(0, put, 8, avx2); 401 init_subpel3_32_64(1, avg, 8, avx2); 402 #endif 403 } 404 init_dc_ipred(32, avx2); 405 init_ipred(32, avx2, h, HOR); 406 init_ipred(32, avx2, tm, TM_VP8); 407 } 408 409 #undef init_fpel 410 #undef init_subpel1 411 #undef init_subpel2 412 #undef init_subpel3 413 414 #endif /* HAVE_X86ASM */ 415 }