refmvs.asm
; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA 64

%macro JMP_TABLE 2-*
    %xdefine %%prefix mangle(private_prefix %+ _%1)
    %1_table:
    %xdefine %%base %1_table
    %rep %0 - 1
        dd %%prefix %+ .w%2 - %%base
        %rotate 1
    %endrep
%endmacro

%macro SAVE_TMVS_TABLE 3 ; num_entries, w, suffix
    %rep %1
        db %2*3
        db mangle(private_prefix %+ _save_tmvs_%3).write%2 - \
           mangle(private_prefix %+ _save_tmvs_%3).write1
    %endrep
%endmacro

%if ARCH_X86_64
mv_proj:       dw 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340
               dw 2048, 1820, 1638, 1489, 1365, 1260, 1170, 1092
               dw 1024, 963, 910, 862, 819, 780, 744, 712
               dw 682, 655, 630, 606, 585, 564, 546, 528
splat_mv_shuf: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3
               db 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7
               db 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
               db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3
%endif
save_pack0:    db 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0
               db 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1
save_pack1:    db 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2
               db 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3
save_ref_shuf: db 0, -1, -1, -1, 1, -1, -1, -1, 8, -1, -1, -1, 9, -1, -1, -1
cond_shuf512:  db 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 7, 3, 3, 3, 3
save_cond0:    db 0x80, 0x81, 0x82, 0x83, 0x89, 0x84, 0x00, 0x00
save_cond1:    db 0x84, 0x85, 0x86, 0x87, 0x88, 0x80, 0x00, 0x00
pb_128:        times 16 db 128
pq_8192:       dq 8192

save_tmvs_ssse3_table:     SAVE_TMVS_TABLE 2, 16, ssse3
                           SAVE_TMVS_TABLE 4, 8, ssse3
                           SAVE_TMVS_TABLE 4, 4, ssse3
                           SAVE_TMVS_TABLE 5, 2, ssse3
                           SAVE_TMVS_TABLE 7, 1, ssse3

%if ARCH_X86_64
save_tmvs_avx2_table:      SAVE_TMVS_TABLE 2, 16, avx2
                           SAVE_TMVS_TABLE 4, 8, avx2
                           SAVE_TMVS_TABLE 4, 4, avx2
                           SAVE_TMVS_TABLE 5, 2, avx2
                           SAVE_TMVS_TABLE 7, 1, avx2

save_tmvs_avx512icl_table: SAVE_TMVS_TABLE 2, 16, avx512icl
                           SAVE_TMVS_TABLE 4, 8, avx512icl
                           SAVE_TMVS_TABLE 4, 4, avx512icl
                           SAVE_TMVS_TABLE 5, 2, avx512icl
                           SAVE_TMVS_TABLE 7, 1, avx512icl

JMP_TABLE splat_mv_avx512icl, 1, 2, 4, 8, 16, 32
JMP_TABLE splat_mv_avx2,      1, 2, 4, 8, 16, 32
%endif

JMP_TABLE splat_mv_sse2,      1, 2, 4, 8, 16, 32

struc rf
    .frm_hdr: resq 1
    .iw4: resd 1
    .ih4: resd 1
    .iw8: resd 1
    .ih8: resd 1
    .sbsz: resd 1
    .use_rf_mvs: resd 1
    .sign_bias: resb 7
    .mfmv_sign: resb 7
    .pocdiff: resb 7
    .mfmv_ref: resb 3
    .mfmv_ref2cur: resb 3
    .mfmv_ref2ref: resb 3*7
    .n_mfmvs: resd 1
    .n_blocks: resd 1
    .rp: resq 1
    .rp_ref: resq 1
    .rp_proj: resq 1
    .rp_stride: resq 1
    .r: resq 1
    .n_tile_threads: resd 1
    .n_frame_threads: resd 1
endstruc

SECTION .text

%macro movif32 2
%if ARCH_X86_32
    mov %1, %2
%endif
%endmacro

INIT_XMM ssse3
; refmvs_temporal_block *rp, ptrdiff_t stride,
; refmvs_block **rr, uint8_t *ref_sign,
; int col_end8, int row_end8, int col_start8, int row_start8
%if ARCH_X86_64
cglobal save_tmvs, 4, 13, 11, rp, stride, rr, ref_sign, \
                              xend, yend, xstart, ystart
%define base_reg r12
%else
cglobal save_tmvs, 6, 7, 8, rp, stride, rr, ref_sign, \
                            xend, yend, xstart, ystart
    movq m5, [ref_signq]
    lea strided, [strided*5]
    mov stridem, strided
    mov r3, xstartm
    mov r1, ystartm
    DEFINE_ARGS b, ystart, rr, cand, xend, x
%define stridemp r1m
%define m8  [base+pb_128]
%define m9  [base+save_pack0+ 0]
%define m10 [base+save_pack0+16]
%define base_reg r6
%endif
%define base base_reg-.write1
    LEA base_reg, .write1
%if ARCH_X86_64
    movifnidn xendd, xendm
    movifnidn yendd, yendm
    mov xstartd, xstartm
    mov ystartd, ystartm
    movq m5, [ref_signq]
%endif
    movu m4, [base+save_ref_shuf]
    movddup m6, [base+save_cond0]
    movddup m7, [base+save_cond1]
%if ARCH_X86_64
    mova m8, [base+pb_128]
    mova m9, [base+save_pack0+ 0]
    mova m10, [base+save_pack0+16]
%endif
    psllq m5, 8
%if ARCH_X86_64
    lea r9d, [xendq*5]
    lea xstartd, [xstartq*5]
    sub yendd, ystartd
    add ystartd, ystartd
    lea strideq, [strideq*5]
    sub xstartq, r9
    add xendd, r9d
    add rpq, r9
    DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand
%else
    lea r0, [xendd*5] ; xend5
    lea r3, [r3*5]    ; xstart5
    sub r3, r0        ; -w5
    mov r6m, r3
%define xstartq r6m
    add xendd, r0     ; xend6
    add r0m, r0       ; rp+xend5
    mov xendm, xendd
    sub r5, r1        ; h
    add r1, r1
    mov r7m, r1
    mov r5m, r5
%define hd r5mp
    jmp .loop_y_noload
%endif
.loop_y:
    movif32 ystartd, r7m
    movif32 xendd, xendm
.loop_y_noload:
    and ystartd, 30
    mov xq, xstartq
    mov bq, [rrq+ystartq*gprsize]
    add ystartd, 2
    movif32 r7m, ystartd
    lea bq, [bq+xendq*4]
.loop_x:
%if ARCH_X86_32
%define rpq  r3
%define r10  r1
%define r10d r1
%define r11  r4
%define r11d r4
%endif
    imul candq, xq, 0x9999 ; x / 5 * 3
    sar candq, 16
    movzx r10d, byte [bq+candq*8+22] ; cand_b->bs
    movu m0, [bq+candq*8+12]         ; cand_b
    movzx r11d, byte [base+save_tmvs_ssse3_table+r10*2+0]
    movzx r10d, byte [base+save_tmvs_ssse3_table+r10*2+1]
    add r10, base_reg
    add candq, r11
    jge .calc
    movu m1, [bq+candq*8+12]
    movzx r11d, byte [bq+candq*8+22]
    movzx r11d, byte [base+save_tmvs_ssse3_table+r11*2+1]
    add r11, base_reg
.calc:
    movif32 rpq, r0m
    ; ref check
    punpckhqdq m2, m0, m1
    pshufb m2, m4 ; b0.ref0 b0.ref1 b1.ref0 b1.ref1 | ...
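    ; note: m5 holds ref_sign shifted left by one byte (psllq m5, 8 above), so
    ; using each ref value as a pshufb index yields ref_sign[ref - 1] for
    ; ref > 0 and 0 for ref == 0, i.e. the "ref > 0 && ref_sign[ref - 1]" test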
    pshufb m3, m5, m2 ; ref > 0 && res_sign[ref - 1]
    ; mv check
    punpcklqdq m2, m0, m1 ; b0.mv0 b0.mv1 b1.mv0 b1.mv1 | ...
    pabsw m2, m2
    psrlw m2, 12 ; (abs(mv.x) | abs(mv.y)) < 4096
    ; res
    pcmpgtd m3, m2
    pshufd m2, m3, q2301
    pand m3, m6 ; b0c0 b0c1 b1c0 b1c1 | ...
    pand m2, m7 ; b0c1 b0c0 b1c1 b1c0 | ...
    por m3, m2  ; b0.shuf b1.shuf | ...
    pxor m3, m8 ; if cond0|cond1 == 0 => zero out
    pshufb m0, m3
    pshufb m1, m3
    call r10
    jge .next_line
    pshufd m0, m1, q3232
    call r11
    jl .loop_x
.next_line:
    add rpq, stridemp
    movif32 r0m, rpq
    dec hd
    jg .loop_y
    RET
.write1:
    movd [rpq+xq+0], m0
    psrlq m0, 8
    movd [rpq+xq+1], m0
    add xq, 5*1
    ret
.write2:
    movq [rpq+xq+0], m0
    psrlq m0, 8
    movd [rpq+xq+6], m0
    add xq, 5*2
    ret
.write4:
    pshufb m0, m9
    movu [rpq+xq+ 0], m0
    psrlq m0, 8
    movd [rpq+xq+16], m0
    add xq, 5*4
    ret
.write8:
    pshufb m2, m0, m9
    movu [rpq+xq+ 0], m2
    pshufb m0, m10
    movu [rpq+xq+16], m0
    psrldq m2, 2
    movq [rpq+xq+32], m2
    add xq, 5*8
    ret
.write16:
    pshufb m2, m0, m9
    movu [rpq+xq+ 0], m2
    pshufb m0, m10
    movu [rpq+xq+16], m0
    shufps m2, m0, q1032
    movu [rpq+xq+48], m2
    shufps m2, m0, q2121
    movu [rpq+xq+32], m2
    shufps m0, m2, q1032
    movu [rpq+xq+64], m0
    add xq, 5*16
    ret

INIT_XMM sse2
; refmvs_block **rr, refmvs_block *a, int bx4, int bw4, int bh4
cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
    add bx4d, bw4d
    tzcnt bw4d, bw4d
    mova m2, [aq]
    LEA aq, splat_mv_sse2_table
    lea bx4q, [bx4q*3-32]
    movsxd bw4q, [aq+bw4q*4]
    movifnidn bh4d, bh4m
    pshufd m0, m2, q0210
    pshufd m1, m2, q1021
    pshufd m2, m2, q2102
    add bw4q, aq
.loop:
    mov aq, [rrq]
    add rrq, gprsize
    lea aq, [aq+bx4q*4]
    jmp bw4q
.w32:
    mova [aq-16*16], m0
    mova [aq-16*15], m1
    mova [aq-16*14], m2
    mova [aq-16*13], m0
    mova [aq-16*12], m1
    mova [aq-16*11], m2
    mova [aq-16*10], m0
    mova [aq-16* 9], m1
    mova [aq-16* 8], m2
    mova [aq-16* 7], m0
    mova [aq-16* 6], m1
    mova [aq-16* 5], m2
.w16:
    mova [aq-16* 4], m0
    mova [aq-16* 3], m1
    mova [aq-16* 2], m2
    mova [aq-16* 1], m0
    mova [aq+16* 0], m1
    mova [aq+16* 1], m2
.w8:
    mova [aq+16* 2], m0
    mova [aq+16* 3], m1
    mova [aq+16* 4], m2
.w4:
    mova [aq+16* 5], m0
    mova [aq+16* 6], m1
    mova [aq+16* 7], m2
    dec bh4d
    jg .loop
    RET
.w2:
    movu [aq+104], m0
    movq [aq+120], m1
    dec bh4d
    jg .loop
    RET
.w1:
    movq [aq+116], m0
    movd [aq+124], m2
    dec bh4d
    jg .loop
    RET

%if ARCH_X86_64
INIT_XMM sse4
; refmvs_frame *rf, int tile_row_idx,
; int col_start8, int col_end8, int row_start8, int row_end8
cglobal load_tmvs, 6, 15, 4, -0x50, rf, tridx, xstart, xend, ystart, yend, \
                                    stride, rp_proj, roff, troff, \
                                    xendi, xstarti, iw8, ih8, dst
    xor r14d, r14d
    cmp dword [rfq+rf.n_tile_threads], 1
    mov ih8d, [rfq+rf.ih8]
    mov iw8d, [rfq+rf.iw8]
    mov xstartd, xstartd
    mov xendd, xendd
    cmove tridxd, r14d
    lea xstartid, [xstartq-8]
    lea xendid, [xendq+8]
    mov strideq, [rfq+rf.rp_stride]
    mov rp_projq, [rfq+rf.rp_proj]
    cmp ih8d, yendd
    mov [rsp+0x30], strideq
    cmovs yendd, ih8d
    test xstartid, xstartid
    cmovs xstartid, r14d
    cmp iw8d, xendid
    cmovs xendid, iw8d
    mov troffq, strideq
    shl troffq, 4
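    ; note: each tile row projects into its own 16-row slice of rp_proj; the
    ; imul below completes troff = 16 * tridx * stride, the offset of that slice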
    imul troffq, tridxq
    mov dstd, ystartd
    and dstd, 15
    imul dstq, strideq
    add dstq, troffq ; (16 * tridx + (ystart & 15)) * stride
    lea dstq, [dstq*5]
    add dstq, rp_projq
    lea troffq, [troffq*5] ; 16 * tridx * stride * 5
    lea r13d, [xendq*5]
    lea r12, [strideq*5]
    DEFINE_ARGS rf, w5, xstart, xend, ystart, yend, h, x5, \
                _, troff, xendi, xstarti, stride5, _, dst
    lea w5d, [xstartq*5]
    add r7, troffq ; rp_proj + tile_row_offset
    mov hd, yendd
    mov [rsp+0x28], r7
    add dstq, r13
    sub w5q, r13
    sub hd, ystartd
.init_xloop_start:
    mov x5q, w5q
    test w5b, 1
    jz .init_2blk
    mov dword [dstq+x5q], 0x80008000
    add x5q, 5
    jz .init_next_row
.init_2blk:
    mov dword [dstq+x5q+0], 0x80008000
    mov dword [dstq+x5q+5], 0x80008000
    add x5q, 10
    jl .init_2blk
.init_next_row:
    add dstq, stride5q
    dec hd
    jg .init_xloop_start
    DEFINE_ARGS rf, _, xstart, xend, ystart, yend, n7, stride, \
                _, _, xendi, xstarti, stride5, _, n
    mov r13d, [rfq+rf.n_mfmvs]
    test r13d, r13d
    jz .ret
    mov [rsp+0x0c], r13d
    mov strideq, [rsp+0x30]
    movddup m3, [pq_8192]
    mov r9d, ystartd
    mov [rsp+0x38], yendd
    mov [rsp+0x20], xstartid
    xor nd, nd
    lea n7q, [rfq+rf.mfmv_ref2ref-1]
    imul r9, strideq ; ystart * stride
    mov [rsp+0x48], rfq
    mov [rsp+0x18], stride5q
    lea r7, [r9*5]
    mov [rsp+0x24], ystartd
    mov [rsp+0x00], r7
.nloop:
    DEFINE_ARGS y, off, xstart, xend, ystart, rf, n7, refsign, \
                ref, rp_ref, xendi, xstarti, _, _, n
    mov rfq, [rsp+0x48]
    movsx refd, byte [rfq+rf.mfmv_ref2cur+nq]
    cmp refd, -32 ; INVALID_REF2CUR
    je .next_n
    mov [rsp+0x40], refd
    mov offq, [rsp+0x00] ; ystart * stride * 5
    movzx refd, byte [rfq+rf.mfmv_ref+nq]
    lea refsignq, [refq-4]
    mov rp_refq, [rfq+rf.rp_ref]
    movq m2, refsignq
    add offq, [rp_refq+refq*8] ; r = rp_ref[ref] + row_offset
    mov [rsp+0x14], nd
    mov yd, ystartd
.yloop:
    mov r11d, [rsp+0x24] ; ystart
    mov r12d, [rsp+0x38] ; yend
    mov r14d, yd
    and r14d, ~7 ; y_sb_align
    cmp r11d, r14d
    cmovs r11d, r14d     ; imax(y_sb_align, ystart)
    mov [rsp+0x44], r11d ; y_proj_start
    add r14d, 8
    cmp r12d, r14d
    cmovs r14d, r12d     ; imin(y_sb_align + 8, yend)
    mov [rsp+0x3c], r14d ; y_proj_end
    DEFINE_ARGS y, src, xstart, xend, frac, rf, n7, mv, \
                ref, x, xendi, mvx, mvy, rb, ref2ref
    mov xd, [rsp+0x20] ; xstarti
.xloop:
    lea rbd, [xq*5]
    add rbq, srcq
    movzx refd, byte [rbq+4]
    test refd, refd
    jz .next_x_bad_ref
    movzx ref2refd, byte [n7q+refq] ; rf->mfmv_ref2ref[n][b_ref-1]
    test ref2refd, ref2refd
    jz .next_x_bad_ref
    lea fracq, [mv_proj]
    movzx fracd, word [fracq+ref2refq*2]
    mov mvd, [rbq]
    imul fracd, [rsp+0x40] ; ref2cur
    pmovsxwq m0, [rbq]
    movd m1, fracd
    punpcklqdq m1, m1
    pmuldq m0, m1 ; mv * frac
    pshufd m1, m0, q3311
    paddd m0, m3
    paddd m0, m1
    psrad m0, 14 ; offset = (xy + (xy >> 31) + 8192) >> 14
    pabsd m1, m0
    packssdw m0, m0
    psrld m1, 6
    packuswb m1, m1
    pxor m0, m2   ; offset ^ ref_sign
    psignd m1, m0 ; apply_sign(abs(offset) >> 6, offset ^ refsign)
    movq mvxq, m1
    lea mvyd, [mvxq+yq] ; ypos
    sar mvxq, 32
    DEFINE_ARGS y, src, xstart, xend, _, _, n7, mv, \
                ref, x, xendi, mvx, ypos, rb, ref2ref
    cmp yposd, [rsp+0x44] ; y_proj_start
    jl .next_x_bad_pos_y
    cmp yposd, [rsp+0x3c] ; y_proj_end
    jge .next_x_bad_pos_y
    and yposd, 15
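    ; note: ypos & 15 selects the row within this tile row's 16-row rp_proj slice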
    add mvxq, xq ; xpos
    imul yposq, [rsp+0x30] ; pos = (ypos & 15) * stride
    DEFINE_ARGS y, src, xstart, xend, dst, _, n7, mv, \
                ref, x, xendi, xpos, pos, rb, ref2ref
    mov dstq, [rsp+0x28] ; dst = rp_proj + tile_row_offset
    add posq, xposq      ; pos += xpos
    lea posq, [posq*5]
    add dstq, posq       ; dst += pos5
    jmp .write_loop_entry
.write_loop:
    add rbq, 5
    cmp refb, byte [rbq+4]
    jne .xloop
    cmp mvd, [rbq]
    jne .xloop
    add dstq, 5
    inc xposd
.write_loop_entry:
    mov r12d, xd
    and r12d, ~7
    lea r5d, [r12-8]
    cmp r5d, xstartd
    cmovs r5d, xstartd ; x_proj_start
    cmp xposd, r5d
    jl .next_xpos
    add r12d, 16
    cmp xendd, r12d
    cmovs r12d, xendd  ; x_proj_end
    cmp xposd, r12d
    jge .next_xpos
    mov [dstq+0], mvd
    mov byte [dstq+4], ref2refb
.next_xpos:
    inc xd
    cmp xd, xendid
    jl .write_loop
.next_y:
    DEFINE_ARGS y, src, xstart, xend, ystart, _, n7, _, _, x, xendi, _, _, _, n
    add srcq, [rsp+0x18] ; stride5
    inc yd
    cmp yd, [rsp+0x38]   ; yend
    jne .yloop
    mov nd, [rsp+0x14]
    mov ystartd, [rsp+0x24]
.next_n:
    add n7q, 7
    inc nd
    cmp nd, [rsp+0x0c] ; n_mfmvs
    jne .nloop
.ret:
    RET
.next_x:
    DEFINE_ARGS y, src, xstart, xend, _, _, n7, mv, ref, x, xendi, _, _, rb, _
    add rbq, 5
    cmp refb, byte [rbq+4]
    jne .xloop
    cmp mvd, [rbq]
    jne .xloop
.next_x_bad_pos_y:
    inc xd
    cmp xd, xendid
    jl .next_x
    jmp .next_y
.next_x_bad_ref:
    inc xd
    cmp xd, xendid
    jl .xloop
    jmp .next_y

INIT_YMM avx2
; refmvs_temporal_block *rp, ptrdiff_t stride,
; refmvs_block **rr, uint8_t *ref_sign,
; int col_end8, int row_end8, int col_start8, int row_start8
cglobal save_tmvs, 4, 13, 10, rp, stride, rr, ref_sign, \
                              xend, yend, xstart, ystart
%define base r12-.write1
    lea r12, [.write1]
    movifnidn xendd, xendm
    movifnidn yendd, yendm
    mov xstartd, xstartm
    mov ystartd, ystartm
    vpbroadcastq m4, [ref_signq]
    vpbroadcastq m3, [base+save_ref_shuf+8]
    vpbroadcastq m5, [base+save_cond0]
    vpbroadcastq m6, [base+save_cond1]
    vpbroadcastd m7, [base+pb_128]
    mova m8, [base+save_pack0]
    mova m9, [base+save_pack1]
    psllq m4, 8
    lea r9d, [xendq*5]
    lea xstartd, [xstartq*5]
    sub yendd, ystartd
    add ystartd, ystartd
    lea strideq, [strideq*5]
    sub xstartq, r9
    add xendd, r9d
    add rpq, r9
    DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand
.loop_y:
    and ystartd, 30
    mov xq, xstartq
    mov bq, [rrq+ystartq*8]
    add ystartd, 2
    lea bq, [bq+xendq*4]
.loop_x:
    imul candq, xq, 0x9999
    sar candq, 16 ; x / 5 * 3
    movzx r10d, byte [bq+candq*8+22] ; cand_b->bs
    movu xm0, [bq+candq*8+12]        ; cand_b
    movzx r11d, byte [base+save_tmvs_avx2_table+r10*2+0]
    movzx r10d, byte [base+save_tmvs_avx2_table+r10*2+1]
    add r10, r12
    add candq, r11
    jge .calc
    vinserti128 m0, [bq+candq*8+12], 1
    movzx r11d, byte [bq+candq*8+22]
    movzx r11d, byte [base+save_tmvs_avx2_table+r11*2+1]
    add r11, r12
.calc:
    pshufb m1, m0, m3
    pabsw m2, m0
    pshufb m1, m4, m1 ; ref > 0 && res_sign[ref - 1]
    psrlw m2, 12 ; (abs(mv.x) | abs(mv.y)) < 4096
    pcmpgtd m1, m2
    pshufd m2, m1, q2301
    pand m1, m5 ; b0.cond0 b1.cond0
    pand m2, m6 ; b0.cond1 b1.cond1
    por m1, m2  ; b0.shuf b1.shuf
    pxor m1, m7 ; if cond0|cond1 == 0 => zero out
    pshufb m0, m1
    call r10
    jge .next_line
    vextracti128 xm0, m0, 1
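    ; note: the second candidate was packed into the upper 128-bit lane of m0;
    ; it is written out by its own .writeN routine via r11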
    call r11
    jl .loop_x
.next_line:
    add rpq, strideq
    dec hd
    jg .loop_y
    RET
.write1:
    movd [rpq+xq+ 0], xm0
    pextrb [rpq+xq+ 4], xm0, 4
    add xq, 5*1
    ret
.write2:
    movq [rpq+xq+0], xm0
    psrlq xm1, xm0, 8
    movd [rpq+xq+6], xm1
    add xq, 5*2
    ret
.write4:
    pshufb xm1, xm0, xm8
    movu [rpq+xq+ 0], xm1
    psrlq xm1, 8
    movd [rpq+xq+16], xm1
    add xq, 5*4
    ret
.write8:
    vinserti128 m1, m0, xm0, 1
    pshufb m1, m8
    movu [rpq+xq+ 0], m1
    psrldq xm1, 2
    movq [rpq+xq+32], xm1
    add xq, 5*8
    ret
.write16:
    vinserti128 m1, m0, xm0, 1
    pshufb m2, m1, m8
    movu [rpq+xq+ 0], m2
    pshufb m1, m9
    movu [rpq+xq+32], m1
    shufps xm2, xm1, q1021
    movu [rpq+xq+64], xm2
    add xq, 5*16
    ret

cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
    add bx4d, bw4d
    tzcnt bw4d, bw4d
    vbroadcasti128 m0, [aq]
    lea aq, [splat_mv_avx2_table]
    lea bx4q, [bx4q*3-32]
    movsxd bw4q, [aq+bw4q*4]
    pshufb m0, [splat_mv_shuf]
    movifnidn bh4d, bh4m
    pshufd m1, m0, q2102
    pshufd m2, m0, q1021
    add bw4q, aq
.loop:
    mov aq, [rrq]
    add rrq, gprsize
    lea aq, [aq+bx4q*4]
    jmp bw4q
.w32:
    mova [aq-32*8], m0
    mova [aq-32*7], m1
    mova [aq-32*6], m2
    mova [aq-32*5], m0
    mova [aq-32*4], m1
    mova [aq-32*3], m2
.w16:
    mova [aq-32*2], m0
    mova [aq-32*1], m1
    mova [aq+32*0], m2
.w8:
    mova [aq+32*1], m0
    mova [aq+32*2], m1
    mova [aq+32*3], m2
    dec bh4d
    jg .loop
    RET
.w4:
    movu [aq+ 80], m0
    mova [aq+112], xm1
    dec bh4d
    jg .loop
    RET
.w2:
    movu [aq+104], xm0
    movq [aq+120], xm2
    dec bh4d
    jg .loop
    RET
.w1:
    movq [aq+116], xm0
    movd [aq+124], xm1
    dec bh4d
    jg .loop
    RET

INIT_ZMM avx512icl
; refmvs_temporal_block *rp, ptrdiff_t stride,
; refmvs_block **rr, uint8_t *ref_sign,
; int col_end8, int row_end8, int col_start8, int row_start8
cglobal save_tmvs, 4, 15, 10, rp, stride, rr, ref_sign, \
                              xend, yend, xstart, ystart
%define base r14-.write1
    lea r14, [.write1]
    movifnidn xendd, xendm
    movifnidn yendd, yendm
    mov xstartd, xstartm
    mov ystartd, ystartm
    psllq m4, [ref_signq]{bcstq}, 8
    vpbroadcastq m3, [base+save_ref_shuf+8]
    vbroadcasti32x4 m5, [base+cond_shuf512]
    vbroadcasti32x4 m6, [base+save_cond0]
    vpbroadcastd m7, [base+pb_128]
    mova m8, [base+save_pack0]
    movu xm9, [base+save_pack0+4]
    lea r9d, [xendq*5]
    lea xstartd, [xstartq*5]
    sub yendd, ystartd
    add ystartd, ystartd
    lea strideq, [strideq*5]
    sub xstartq, r9
    add xendd, r9d
    add rpq, r9
    mov r10d, 0x1f
    kmovb k2, r10d
    DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand
.loop_y:
    and ystartd, 30
    mov xq, xstartq
    mov bq, [rrq+ystartq*8]
    add ystartd, 2
    lea bq, [bq+xendq*4]
.loop_x:
    imul candq, xq, 0x9999
    sar candq, 16 ; x / 5 * 3
    movzx r10d, byte [bq+candq*8+22] ; cand_b->bs
    movu xm0, [bq+candq*8+12]        ; cand_b
    movzx r11d, byte [base+save_tmvs_avx512icl_table+r10*2+0]
    movzx r10d, byte [base+save_tmvs_avx512icl_table+r10*2+1]
    add r10, r14
    add candq, r11
    jge .calc
    movzx r11d, byte [bq+candq*8+22]
    vinserti32x4 ym0, [bq+candq*8+12], 1
    movzx r12d, byte [base+save_tmvs_avx512icl_table+r11*2+0]
    movzx r11d, byte [base+save_tmvs_avx512icl_table+r11*2+1]
    add r11, r14
    add candq, r12
    jge .calc
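    ; note: candidates 3 and 4, if any, go into the upper two 128-bit lanes of
    ; m0; each lane's .write routine is dispatched via r10-r13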
    movzx r12d, byte [bq+candq*8+22]
    vinserti32x4 m0, [bq+candq*8+12], 2
    movzx r13d, byte [base+save_tmvs_avx512icl_table+r12*2+0]
    movzx r12d, byte [base+save_tmvs_avx512icl_table+r12*2+1]
    add r12, r14
    add candq, r13
    jge .calc
    vinserti32x4 m0, [bq+candq*8+12], 3
    movzx r13d, byte [bq+candq*8+22]
    movzx r13d, byte [base+save_tmvs_avx512icl_table+r13*2+1]
    add r13, r14
.calc:
    pshufb m1, m0, m3
    pabsw m2, m0
    pshufb m1, m4, m1 ; ref > 0 && res_sign[ref - 1]
    psrlw m2, 12 ; (abs(mv.x) | abs(mv.y)) < 4096
    psubd m2, m1
    pshufb m2, m5 ; c0 c1 c1 c0
    pand m2, m6
    punpckhqdq m1, m2, m2
    vpternlogd m1, m2, m7, 0x56 ; (c0shuf | c1shuf) ^ 0x80
    pshufb m2, m0, m1
    mova xm0, xm2
    call r10
    jge .next_line
    vextracti32x4 xm0, m2, 1
    call r11
    jge .next_line
    vextracti32x4 xm0, m2, 2
    call r12
    jge .next_line
    vextracti32x4 xm0, m2, 3
    call r13
    jl .loop_x
.next_line:
    add rpq, strideq
    dec hd
    jg .loop_y
    RET
.write1:
    vmovdqu8 [rpq+xq]{k2}, xm0
    add xq, 5*1
    ret
.write2:
    pshufb xm0, xm8
    vmovdqu16 [rpq+xq]{k2}, xm0
    add xq, 5*2
    ret
.write4:
    vpermb ym0, ym8, ym0
    vmovdqu32 [rpq+xq]{k2}, ym0
    add xq, 5*4
    ret
.write8:
    vpermb m0, m8, m0
    vmovdqu64 [rpq+xq]{k2}, m0
    add xq, 5*8
    ret
.write16:
    vpermb m1, m8, m0
    movu [rpq+xq+ 0], m1
    pshufb xm0, xm9
    movu [rpq+xq+64], xm0
    add xq, 5*16
    ret

INIT_ZMM avx512icl
cglobal splat_mv, 4, 7, 3, rr, a, bx4, bw4, bh4
    vbroadcasti32x4 m0, [aq]
    lea r1, [splat_mv_avx512icl_table]
    tzcnt bw4d, bw4d
    lea bx4d, [bx4q*3]
    pshufb m0, [splat_mv_shuf]
    movsxd bw4q, [r1+bw4q*4]
    mov r6d, bh4m
    add bw4q, r1
    lea rrq, [rrq+r6*8]
    mov r1d, 0x3f
    neg r6
    kmovb k1, r1d
    jmp bw4q
.w1:
    mov r1, [rrq+r6*8]
    vmovdqu16 [r1+bx4q*4]{k1}, xm0
    inc r6
    jl .w1
    RET
.w2:
    mov r1, [rrq+r6*8]
    vmovdqu32 [r1+bx4q*4]{k1}, ym0
    inc r6
    jl .w2
    RET
.w4:
    mov r1, [rrq+r6*8]
    vmovdqu64 [r1+bx4q*4]{k1}, m0
    inc r6
    jl .w4
    RET
.w8:
    pshufd ym1, ym0, q1021
.w8_loop:
    mov r1, [rrq+r6*8+0]
    mov r3, [rrq+r6*8+8]
    movu [r1+bx4q*4+ 0], m0
    mova [r1+bx4q*4+64], ym1
    movu [r3+bx4q*4+ 0], m0
    mova [r3+bx4q*4+64], ym1
    add r6, 2
    jl .w8_loop
    RET
.w16:
    pshufd m1, m0, q1021
    pshufd m2, m0, q2102
.w16_loop:
    mov r1, [rrq+r6*8+0]
    mov r3, [rrq+r6*8+8]
    mova [r1+bx4q*4+64*0], m0
    mova [r1+bx4q*4+64*1], m1
    mova [r1+bx4q*4+64*2], m2
    mova [r3+bx4q*4+64*0], m0
    mova [r3+bx4q*4+64*1], m1
    mova [r3+bx4q*4+64*2], m2
    add r6, 2
    jl .w16_loop
    RET
.w32:
    pshufd m1, m0, q1021
    pshufd m2, m0, q2102
.w32_loop:
    mov r1, [rrq+r6*8]
    lea r1, [r1+bx4q*4]
    mova [r1+64*0], m0
    mova [r1+64*1], m1
    mova [r1+64*2], m2
    mova [r1+64*3], m0
    mova [r1+64*4], m1
    mova [r1+64*5], m2
    inc r6
    jl .w32_loop
    RET
%endif ; ARCH_X86_64