;******************************************************************************
;* VP9 MC SIMD optimizations
;*
;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

; rounding offset for the final >> 7 of the 7-bit-normalized 8-tap filters
pd_64: times 8 dd 64

; pixel maxima for 10- and 12-bit content, used as clip values
cextern pw_1023
cextern pw_4095

SECTION .text

; 4-pixel-wide horizontal 8-tap filter; %1 = put/avg, %2 = number of XMM
; registers (default 12). The 12-bit entry point below only changes the
; clip maximum in m5 and jumps into the shared .body.
%macro filter_h4_fn 1-2 12
cglobal vp9_%1_8tap_1d_h_4_10, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    mova        m5, [pw_1023]
.body:
%if notcpuflag(sse4) && ARCH_X86_64
    pxor        m11, m11
%endif
    mova        m6, [pd_64]
    mova        m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+32]
    mova        m9, [filteryq+64]
    mova        m10, [filteryq+96]
%endif
.loop:
    movh        m0, [srcq-6]
    movh        m1, [srcq-4]
    movh        m2, [srcq-2]
    movh        m3, [srcq+0]
    movh        m4, [srcq+2]
    punpcklwd   m0, m1
    punpcklwd   m2, m3
    pmaddwd     m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m2, m8
%else
    pmaddwd     m2, [filteryq+32]
%endif
    movu        m1, [srcq+4]
    movu        m3, [srcq+6]
    paddd       m0, m2
    movu        m2, [srcq+8]
    add         srcq, sstrideq
    punpcklwd   m4, m1
    punpcklwd   m3, m2
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m4, m9
    pmaddwd     m3, m10
%else
    pmaddwd     m4, [filteryq+64]
    pmaddwd     m3, [filteryq+96]
%endif
    paddd       m0, m4
    paddd       m0, m3
    paddd       m0, m6
    psrad       m0, 7
%if cpuflag(sse4)
    packusdw    m0, m0          ; unsigned pack also clips negatives to 0
%else
    packssdw    m0, m0          ; signed pack; negatives are clipped below
%endif
%ifidn %1, avg
    movh        m1, [dstq]
%endif
    pminsw      m0, m5
%if notcpuflag(sse4)
%if ARCH_X86_64
    pmaxsw      m0, m11
%else
    pxor        m2, m2
    pmaxsw      m0, m2
%endif
%endif
%ifidn %1, avg
    pavgw       m0, m1
%endif
    movh        [dstq], m0
    add         dstq, dstrideq
    dec         hd
    jg .loop
    RET

cglobal vp9_%1_8tap_1d_h_4_12, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    mova        m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_4_10 %+ SUFFIX).body
%endmacro

INIT_XMM sse2
filter_h4_fn put
filter_h4_fn avg

; full-width (mmsize/2 pixels per row) horizontal 8-tap filter; same
; structure as filter_h4_fn, but keeps two dword accumulators per row
%macro filter_h_fn 1-2 12
%assign %%px mmsize/2
cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _10, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    mova        m5, [pw_1023]
.body:
%if notcpuflag(sse4) && ARCH_X86_64
    pxor        m11, m11
%endif
    mova        m6, [pd_64]
    mova        m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+32]
    mova        m9, [filteryq+64]
    mova        m10, [filteryq+96]
%endif
.loop:
    movu        m0, [srcq-6]
    movu        m1, [srcq-4]
    movu        m2, [srcq-2]
    movu        m3, [srcq+0]
    movu        m4, [srcq+2]
    pmaddwd     m0, m7
    pmaddwd     m1, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m2, m8
    pmaddwd     m3, m8
    pmaddwd     m4, m9
%else
    pmaddwd     m2, [filteryq+32]
    pmaddwd     m3, [filteryq+32]
    pmaddwd     m4, [filteryq+64]
%endif
    paddd       m0, m2
    paddd       m1, m3
    paddd       m0, m4
    movu        m2, [srcq+4]
    movu        m3, [srcq+6]
    movu        m4, [srcq+8]
    add         srcq, sstrideq
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m2, m9
    pmaddwd     m3, m10
    pmaddwd     m4, m10
%else
    pmaddwd     m2, [filteryq+64]
    pmaddwd     m3, [filteryq+96]
    pmaddwd     m4, [filteryq+96]
%endif
    paddd       m1, m2
    paddd       m0, m3
    paddd       m1, m4
    paddd       m0, m6
    paddd       m1, m6
    psrad       m0, 7
    psrad       m1, 7
%if cpuflag(sse4)
    packusdw    m0, m0
    packusdw    m1, m1
%else
    packssdw    m0, m0
    packssdw    m1, m1
%endif
    punpcklwd   m0, m1
    pminsw      m0, m5
%if notcpuflag(sse4)
%if ARCH_X86_64
    pmaxsw      m0, m11
%else
    pxor        m2, m2
    pmaxsw      m0, m2
%endif
%endif
%ifidn %1, avg
    pavgw       m0, [dstq]
%endif
    mova        [dstq], m0
    add         dstq, dstrideq
    dec         hd
    jg .loop
    RET

cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _12, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    mova        m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_ %+ %%px %+ _10 %+ SUFFIX).body
%endmacro

INIT_XMM sse2
filter_h_fn put
filter_h_fn avg
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
filter_h_fn put
filter_h_fn avg
%endif

; 4-pixel-wide vertical 8-tap filter; on x86-32 the h argument is kept on
; the stack (r4mp) to free up a register for the second source pointer
%macro filter_v4_fn 1-2 12
%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_4_10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_4_10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
    mov         filteryq, r5mp
%define hd r4mp
%endif
    mova        m5, [pw_1023]
.body:
%if notcpuflag(sse4) && ARCH_X86_64
    pxor        m11, m11
%endif
    mova        m6, [pd_64]
    lea         sstride3q, [sstrideq*3]
    lea         src4q, [srcq+sstrideq]
    sub         srcq, sstride3q
    mova        m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+ 32]
    mova        m9, [filteryq+ 64]
    mova        m10, [filteryq+ 96]
%endif
.loop:
    ; FIXME maybe reuse loads from previous rows, or just
    ; more generally unroll this to prevent multiple loads of
    ; the same data?
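    ; Each iteration produces one output row: eight input rows are loaded,
    ; interleaved in pairs with punpcklwd, and each pmaddwd applies one pair
    ; of filter taps; the four partial sums are then accumulated, rounded
    ; with pd_64 and shifted right by 7 before packing and clipping.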
    movh        m0, [srcq]
    movh        m1, [srcq+sstrideq]
    movh        m2, [srcq+sstrideq*2]
    movh        m3, [srcq+sstride3q]
    add         srcq, sstrideq
    movh        m4, [src4q]
    punpcklwd   m0, m1
    punpcklwd   m2, m3
    pmaddwd     m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m2, m8
%else
    pmaddwd     m2, [filteryq+ 32]
%endif
    movh        m1, [src4q+sstrideq]
    movh        m3, [src4q+sstrideq*2]
    paddd       m0, m2
    movh        m2, [src4q+sstride3q]
    add         src4q, sstrideq
    punpcklwd   m4, m1
    punpcklwd   m3, m2
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m4, m9
    pmaddwd     m3, m10
%else
    pmaddwd     m4, [filteryq+ 64]
    pmaddwd     m3, [filteryq+ 96]
%endif
    paddd       m0, m4
    paddd       m0, m3
    paddd       m0, m6
    psrad       m0, 7
%if cpuflag(sse4)
    packusdw    m0, m0
%else
    packssdw    m0, m0
%endif
%ifidn %1, avg
    movh        m1, [dstq]
%endif
    pminsw      m0, m5
%if notcpuflag(sse4)
%if ARCH_X86_64
    pmaxsw      m0, m11
%else
    pxor        m2, m2
    pmaxsw      m0, m2
%endif
%endif
%ifidn %1, avg
    pavgw       m0, m1
%endif
    movh        [dstq], m0
    add         dstq, dstrideq
    dec         hd
    jg .loop
    RET

%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_4_12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_4_12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
    mov         filteryq, r5mp
%endif
    mova        m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_4_10 %+ SUFFIX).body
%endmacro

INIT_XMM sse2
filter_v4_fn put
filter_v4_fn avg

; full-width vertical 8-tap filter; needs one extra register (default 13)
; because pd_64 is kept in m11 on x86-64
%macro filter_v_fn 1-2 13
%assign %%px mmsize/2
%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
    mov         filteryq, r5mp
%define hd r4mp
%endif
    mova        m5, [pw_1023]
.body:
%if notcpuflag(sse4) && ARCH_X86_64
    pxor        m12, m12
%endif
%if ARCH_X86_64
    mova        m11, [pd_64]
%endif
    lea         sstride3q, [sstrideq*3]
    lea         src4q, [srcq+sstrideq]
    sub         srcq, sstride3q
    mova        m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+ 32]
    mova        m9, [filteryq+ 64]
    mova        m10, [filteryq+ 96]
%endif
.loop:
    ; FIXME maybe reuse loads from previous rows, or just
    ; more generally unroll this to prevent multiple loads of
    ; the same data?
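    ; Same scheme as filter_v4_fn, but at full register width: SBUTTERFLY
    ; interleaves each row pair in place, yielding two dword accumulators
    ; (m0/m1) that are packed back into a single row of output words.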
    movu        m0, [srcq]
    movu        m1, [srcq+sstrideq]
    movu        m2, [srcq+sstrideq*2]
    movu        m3, [srcq+sstride3q]
    add         srcq, sstrideq
    movu        m4, [src4q]
    SBUTTERFLY  wd, 0, 1, 6
    SBUTTERFLY  wd, 2, 3, 6
    pmaddwd     m0, m7
    pmaddwd     m1, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m2, m8
    pmaddwd     m3, m8
%else
    pmaddwd     m2, [filteryq+ 32]
    pmaddwd     m3, [filteryq+ 32]
%endif
    paddd       m0, m2
    paddd       m1, m3
    movu        m2, [src4q+sstrideq]
    movu        m3, [src4q+sstrideq*2]
    SBUTTERFLY  wd, 4, 2, 6
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m4, m9
    pmaddwd     m2, m9
%else
    pmaddwd     m4, [filteryq+ 64]
    pmaddwd     m2, [filteryq+ 64]
%endif
    paddd       m0, m4
    paddd       m1, m2
    movu        m4, [src4q+sstride3q]
    add         src4q, sstrideq
    SBUTTERFLY  wd, 3, 4, 6
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m3, m10
    pmaddwd     m4, m10
%else
    pmaddwd     m3, [filteryq+ 96]
    pmaddwd     m4, [filteryq+ 96]
%endif
    paddd       m0, m3
    paddd       m1, m4
%if ARCH_X86_64
    paddd       m0, m11
    paddd       m1, m11
%else
    paddd       m0, [pd_64]
    paddd       m1, [pd_64]
%endif
    psrad       m0, 7
    psrad       m1, 7
%if cpuflag(sse4)
    packusdw    m0, m1
%else
    packssdw    m0, m1
%endif
    pminsw      m0, m5
%if notcpuflag(sse4)
%if ARCH_X86_64
    pmaxsw      m0, m12
%else
    pxor        m2, m2
    pmaxsw      m0, m2
%endif
%endif
%ifidn %1, avg
    pavgw       m0, [dstq]
%endif
    mova        [dstq], m0
    add         dstq, dstrideq
    dec         hd
    jg .loop
    RET

%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
    mov         filteryq, r5mp
%endif
    mova        m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_ %+ %%px %+ _10 %+ SUFFIX).body
%endmacro

INIT_XMM sse2
filter_v_fn put
filter_v_fn avg
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
filter_v_fn put
filter_v_fn avg
%endif