tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

refmvs.asm (29605B)


; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA 64

%macro JMP_TABLE 2-*
    %xdefine %%prefix mangle(private_prefix %+ _%1)
    %1_table:
    %xdefine %%base %1_table
    %rep %0 - 1
        dd %%prefix %+ .w%2 - %%base
        %rotate 1
    %endrep
%endmacro
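; JMP_TABLE stores each .w<N> entry point as a 32-bit offset relative to the
; table itself, which keeps the data position-independent; the splat_mv
; functions below rebase an entry at runtime before jumping through it:
;     movsxd        bw4q, [aq+bw4q*4]  ; signed offset of .w<1<<bw4>
;     add           bw4q, aq           ; table base -> absolute address
;     jmp           bw4q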

%macro SAVE_TMVS_TABLE 3 ; num_entries, w, suffix
    %rep %1
        db %2*3
        db mangle(private_prefix %+ _save_tmvs_%3).write%2 - \
           mangle(private_prefix %+ _save_tmvs_%3).write1
    %endrep
%endmacro
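; Each SAVE_TMVS_TABLE entry is two bytes per block size: the step added to
; the candidate index for a block of width w (w*3, matching the x/5*3 scaling
; used in save_tmvs), and the distance of the matching .write<w> routine from
; .write1, so the store path is picked by simple address arithmetic instead
; of a jump table.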

%if ARCH_X86_64
mv_proj:       dw    0, 16384, 8192, 5461, 4096, 3276, 2730, 2340
               dw 2048,  1820, 1638, 1489, 1365, 1260, 1170, 1092
               dw 1024,   963,  910,  862,  819,  780,  744,  712
               dw  682,   655,  630,  606,  585,  564,  546,  528
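; mv_proj[d] is the Q14 reciprocal 16384/d (truncated), used by load_tmvs to
; rescale a motion vector by a ratio of frame distances: mv * mv_proj[d_ref]
; * d_cur followed by a >>14 approximates mv * d_cur / d_ref. For example,
; mv_proj[3] = 5461, so a vector spanning 3 frames of distance contributes
; roughly 1/3 of itself per frame of distance to the current picture.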
splat_mv_shuf: db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3
               db  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3,  4,  5,  6,  7
               db  8,  9, 10, 11,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
               db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3
%endif
save_pack0:    db  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0
               db  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1
save_pack1:    db  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2
               db  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3
save_ref_shuf: db  0, -1, -1, -1,  1, -1, -1, -1,  8, -1, -1, -1,  9, -1, -1, -1
cond_shuf512:  db  3,  3,  3,  3,  7,  7,  7,  7,  7,  7,  7,  7,  3,  3,  3,  3
save_cond0:    db  0x80, 0x81, 0x82, 0x83, 0x89, 0x84, 0x00, 0x00
save_cond1:    db  0x84, 0x85, 0x86, 0x87, 0x88, 0x80, 0x00, 0x00
pb_128:        times 16 db 128
pq_8192:       dq 8192

save_tmvs_ssse3_table: SAVE_TMVS_TABLE 2, 16, ssse3
                       SAVE_TMVS_TABLE 4,  8, ssse3
                       SAVE_TMVS_TABLE 4,  4, ssse3
                       SAVE_TMVS_TABLE 5,  2, ssse3
                       SAVE_TMVS_TABLE 7,  1, ssse3

%if ARCH_X86_64
save_tmvs_avx2_table: SAVE_TMVS_TABLE 2, 16, avx2
                      SAVE_TMVS_TABLE 4,  8, avx2
                      SAVE_TMVS_TABLE 4,  4, avx2
                      SAVE_TMVS_TABLE 5,  2, avx2
                      SAVE_TMVS_TABLE 7,  1, avx2

save_tmvs_avx512icl_table: SAVE_TMVS_TABLE 2, 16, avx512icl
                           SAVE_TMVS_TABLE 4,  8, avx512icl
                           SAVE_TMVS_TABLE 4,  4, avx512icl
                           SAVE_TMVS_TABLE 5,  2, avx512icl
                           SAVE_TMVS_TABLE 7,  1, avx512icl

JMP_TABLE splat_mv_avx512icl, 1, 2, 4, 8, 16, 32
JMP_TABLE splat_mv_avx2,      1, 2, 4, 8, 16, 32
%endif

JMP_TABLE splat_mv_sse2,      1, 2, 4, 8, 16, 32

struc rf
    .frm_hdr:         resq 1
    .iw4:             resd 1
    .ih4:             resd 1
    .iw8:             resd 1
    .ih8:             resd 1
    .sbsz:            resd 1
    .use_rf_mvs:      resd 1
    .sign_bias:       resb 7
    .mfmv_sign:       resb 7
    .pocdiff:         resb 7
    .mfmv_ref:        resb 3
    .mfmv_ref2cur:    resb 3
    .mfmv_ref2ref:    resb 3*7
    .n_mfmvs:         resd 1
    .n_blocks:        resd 1
    .rp:              resq 1
    .rp_ref:          resq 1
    .rp_proj:         resq 1
    .rp_stride:       resq 1
    .r:               resq 1
    .n_tile_threads:  resd 1
    .n_frame_threads: resd 1
endstruc
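; The struc above must stay in sync with dav1d's refmvs_frame struct. As a
; rough C mirror (types inferred from the reservation sizes here, not taken
; from refmvs.h, so treat it as a sketch):
;     typedef struct {
;         const void *frm_hdr;
;         int iw4, ih4, iw8, ih8, sbsz, use_rf_mvs;
;         uint8_t sign_bias[7], mfmv_sign[7], pocdiff[7];
;         uint8_t mfmv_ref[3];
;         int8_t mfmv_ref2cur[3], mfmv_ref2ref[3][7];
;         int n_mfmvs, n_blocks;
;         void *rp, *rp_ref, *rp_proj;
;         ptrdiff_t rp_stride;
;         void *r;
;         int n_tile_threads, n_frame_threads;
;     } rf;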

SECTION .text

%macro movif32 2
%if ARCH_X86_32
    mov             %1, %2
%endif
%endmacro

INIT_XMM ssse3
; refmvs_temporal_block *rp, ptrdiff_t stride,
; refmvs_block **rr, uint8_t *ref_sign,
; int col_end8, int row_end8, int col_start8, int row_start8
%if ARCH_X86_64
cglobal save_tmvs, 4, 13, 11, rp, stride, rr, ref_sign, \
                              xend, yend, xstart, ystart
%define base_reg r12
%else
cglobal save_tmvs, 6, 7, 8, rp, stride, rr, ref_sign, \
                            xend, yend, xstart, ystart
    movq            m5, [ref_signq]
    lea        strided, [strided*5]
    mov        stridem, strided
    mov             r3, xstartm
    mov             r1, ystartm
DEFINE_ARGS b, ystart, rr, cand, xend, x
%define stridemp r1m
%define m8  [base+pb_128]
%define m9  [base+save_pack0+ 0]
%define m10 [base+save_pack0+16]
%define base_reg r6
%endif
%define base base_reg-.write1
    LEA       base_reg, .write1
%if ARCH_X86_64
    movifnidn    xendd, xendm
    movifnidn    yendd, yendm
    mov        xstartd, xstartm
    mov        ystartd, ystartm
    movq            m5, [ref_signq]
%endif
    movu            m4, [base+save_ref_shuf]
    movddup         m6, [base+save_cond0]
    movddup         m7, [base+save_cond1]
%if ARCH_X86_64
    mova            m8, [base+pb_128]
    mova            m9, [base+save_pack0+ 0]
    mova           m10, [base+save_pack0+16]
%endif
    psllq           m5, 8
%if ARCH_X86_64
    lea            r9d, [xendq*5]
    lea        xstartd, [xstartq*5]
    sub          yendd, ystartd
    add        ystartd, ystartd
    lea        strideq, [strideq*5]
    sub        xstartq, r9
    add          xendd, r9d
    add            rpq, r9
DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand
%else
    lea             r0, [xendd*5]   ; xend5
    lea             r3, [r3*5]      ; xstart5
    sub             r3, r0          ; -w5
    mov            r6m, r3
%define xstartq r6m
    add          xendd, r0          ; xend6
    add            r0m, r0          ; rp+xend5
    mov          xendm, xendd
    sub             r5, r1          ; h
    add             r1, r1
    mov            r7m, r1
    mov            r5m, r5
%define hd r5mp
    jmp .loop_y_noload
%endif
.loop_y:
    movif32    ystartd, r7m
    movif32      xendd, xendm
.loop_y_noload:
    and        ystartd, 30
    mov             xq, xstartq
    mov             bq, [rrq+ystartq*gprsize]
    add        ystartd, 2
    movif32        r7m, ystartd
    lea             bq, [bq+xendq*4]
.loop_x:
%if ARCH_X86_32
%define rpq  r3
%define r10  r1
%define r10d r1
%define r11  r4
%define r11d r4
%endif
    imul         candq, xq, 0x9999  ; x / 5 * 3
    sar          candq, 16
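; 0x9999/65536 is just under 3/5, and x is always a negative multiple of 5
; here (it counts up from -xend5 toward 0), so the arithmetic shift, which
; rounds toward -inf, yields exactly x/5*3.
; Worked example: x = -25 -> -25*0x9999 = -983025 -> -983025 >> 16 = -15.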
    movzx         r10d, byte [bq+candq*8+22] ; cand_b->bs
    movu            m0, [bq+candq*8+12]      ; cand_b
    movzx         r11d, byte [base+save_tmvs_ssse3_table+r10*2+0]
    movzx         r10d, byte [base+save_tmvs_ssse3_table+r10*2+1]
    add            r10, base_reg
    add          candq, r11
    jge .calc
    movu            m1, [bq+candq*8+12]
    movzx         r11d, byte [bq+candq*8+22]
    movzx         r11d, byte [base+save_tmvs_ssse3_table+r11*2+1]
    add            r11, base_reg
.calc:
    movif32        rpq, r0m
    ; ref check
    punpckhqdq      m2, m0, m1
    pshufb          m2, m4      ; b0.ref0 b0.ref1 b1.ref0 b1.ref1 | ...
    pshufb          m3, m5, m2  ; ref > 0 && ref_sign[ref - 1]
    ; mv check
    punpcklqdq      m2, m0, m1  ; b0.mv0 b0.mv1 b1.mv0 b1.mv1 | ...
    pabsw           m2, m2
    psrlw           m2, 12      ; (abs(mv.x) | abs(mv.y)) < 4096
    ; res
    pcmpgtd         m3, m2
    pshufd          m2, m3, q2301
    pand            m3, m6      ; b0c0 b0c1 b1c0 b1c1 | ...
    pand            m2, m7      ; b0c1 b0c0 b1c1 b1c0 | ...
    por             m3, m2      ; b0.shuf b1.shuf | ...
    pxor            m3, m8      ; if cond0|cond1 == 0 => zero out
    pshufb          m0, m3
    pshufb          m1, m3
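; The shuffle mask built above picks, per candidate block, which of its two
; mvs gets stored. In scalar terms (a loose sketch of the logic, paraphrasing
; dav1d's C save_tmvs rather than quoting it):
;     c1 = b->ref[1] > 0 && ref_sign[b->ref[1]-1] && mv_ok(b->mv[1]);
;     c0 = b->ref[0] > 0 && ref_sign[b->ref[0]-1] && mv_ok(b->mv[0]);
;     if      (c1) store(b->mv[1], b->ref[1]);
;     else if (c0) store(b->mv[0], b->ref[0]);
;     else         store zeros (no usable mv);
; where mv_ok(mv) means |mv.x| and |mv.y| are both < 4096 (the psrlw test).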
    call           r10
    jge .next_line
    pshufd          m0, m1, q3232
    call           r11
    jl .loop_x
.next_line:
    add            rpq, stridemp
    movif32        r0m, rpq
    dec             hd
    jg .loop_y
    RET
.write1:
    movd    [rpq+xq+0], m0
    psrlq           m0, 8
    movd    [rpq+xq+1], m0
    add             xq, 5*1
    ret
.write2:
    movq    [rpq+xq+0], m0
    psrlq           m0, 8
    movd    [rpq+xq+6], m0
    add             xq, 5*2
    ret
.write4:
    pshufb          m0, m9
    movu   [rpq+xq+ 0], m0
    psrlq           m0, 8
    movd   [rpq+xq+16], m0
    add             xq, 5*4
    ret
.write8:
    pshufb          m2, m0, m9
    movu   [rpq+xq+ 0], m2
    pshufb          m0, m10
    movu   [rpq+xq+16], m0
    psrldq          m2, 2
    movq   [rpq+xq+32], m2
    add             xq, 5*8
    ret
.write16:
    pshufb          m2, m0, m9
    movu   [rpq+xq+ 0], m2
    pshufb          m0, m10
    movu   [rpq+xq+16], m0
    shufps          m2, m0, q1032
    movu   [rpq+xq+48], m2
    shufps          m2, m0, q2121
    movu   [rpq+xq+32], m2
    shufps          m0, m2, q1032
    movu   [rpq+xq+64], m0
    add             xq, 5*16
    ret

INIT_XMM sse2
; refmvs_block **rr, refmvs_block *a, int bx4, int bw4, int bh4
cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
    add           bx4d, bw4d
    tzcnt         bw4d, bw4d
    mova            m2, [aq]
    LEA             aq, splat_mv_sse2_table
    lea           bx4q, [bx4q*3-32]
    movsxd        bw4q, [aq+bw4q*4]
    movifnidn     bh4d, bh4m
    pshufd          m0, m2, q0210
    pshufd          m1, m2, q1021
    pshufd          m2, m2, q2102
    add           bw4q, aq
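; A refmvs_block is 12 bytes, so a run of identical blocks repeats with
; period lcm(12, 16) = 48 bytes. m0/m1/m2 hold the same 12-byte record at
; its three possible dword phases (ABCA, BCAB, CABC), so storing them in
; rotation writes a seamless sequence of copies.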
.loop:
    mov             aq, [rrq]
    add            rrq, gprsize
    lea             aq, [aq+bx4q*4]
    jmp           bw4q
.w32:
    mova    [aq-16*16], m0
    mova    [aq-16*15], m1
    mova    [aq-16*14], m2
    mova    [aq-16*13], m0
    mova    [aq-16*12], m1
    mova    [aq-16*11], m2
    mova    [aq-16*10], m0
    mova    [aq-16* 9], m1
    mova    [aq-16* 8], m2
    mova    [aq-16* 7], m0
    mova    [aq-16* 6], m1
    mova    [aq-16* 5], m2
.w16:
    mova    [aq-16* 4], m0
    mova    [aq-16* 3], m1
    mova    [aq-16* 2], m2
    mova    [aq-16* 1], m0
    mova    [aq+16* 0], m1
    mova    [aq+16* 1], m2
.w8:
    mova    [aq+16* 2], m0
    mova    [aq+16* 3], m1
    mova    [aq+16* 4], m2
.w4:
    mova    [aq+16* 5], m0
    mova    [aq+16* 6], m1
    mova    [aq+16* 7], m2
    dec           bh4d
    jg .loop
    RET
.w2:
    movu      [aq+104], m0
    movq      [aq+120], m1
    dec           bh4d
    jg .loop
    RET
.w1:
    movq      [aq+116], m0
    movd      [aq+124], m2
    dec           bh4d
    jg .loop
    RET

%if ARCH_X86_64
INIT_XMM sse4
; refmvs_frame *rf, int tile_row_idx,
; int col_start8, int col_end8, int row_start8, int row_end8
cglobal load_tmvs, 6, 15, 4, -0x50, rf, tridx, xstart, xend, ystart, yend, \
                                    stride, rp_proj, roff, troff, \
                                    xendi, xstarti, iw8, ih8, dst
    xor           r14d, r14d
    cmp dword [rfq+rf.n_tile_threads], 1
    mov           ih8d, [rfq+rf.ih8]
    mov           iw8d, [rfq+rf.iw8]
    mov        xstartd, xstartd
    mov          xendd, xendd
    cmove       tridxd, r14d
    lea       xstartid, [xstartq-8]
    lea         xendid, [xendq+8]
    mov        strideq, [rfq+rf.rp_stride]
    mov       rp_projq, [rfq+rf.rp_proj]
    cmp           ih8d, yendd
    mov     [rsp+0x30], strideq
    cmovs        yendd, ih8d
    test      xstartid, xstartid
    cmovs     xstartid, r14d
    cmp           iw8d, xendid
    cmovs       xendid, iw8d
    mov         troffq, strideq
    shl         troffq, 4
    imul        troffq, tridxq
    mov           dstd, ystartd
    and           dstd, 15
    imul          dstq, strideq
    add           dstq, troffq      ; (16 * tridx + (ystart & 15)) * stride
    lea           dstq, [dstq*5]
    add           dstq, rp_projq
    lea         troffq, [troffq*5]  ; 16 * tridx * stride * 5
    lea           r13d, [xendq*5]
    lea            r12, [strideq*5]
DEFINE_ARGS rf, w5, xstart, xend, ystart, yend, h, x5, \
            _, troff, xendi, xstarti, stride5, _, dst
    lea            w5d, [xstartq*5]
    add             r7, troffq      ; rp_proj + tile_row_offset
    mov             hd, yendd
    mov     [rsp+0x28], r7
    add           dstq, r13
    sub            w5q, r13
    sub             hd, ystartd
.init_xloop_start:
    mov            x5q, w5q
    test           w5b, 1
    jz .init_2blk
    mov dword [dstq+x5q], 0x80008000
    add            x5q, 5
    jz .init_next_row
.init_2blk:
    mov dword [dstq+x5q+0], 0x80008000
    mov dword [dstq+x5q+5], 0x80008000
    add            x5q, 10
    jl .init_2blk
.init_next_row:
    add           dstq, stride5q
    dec             hd
    jg .init_xloop_start
DEFINE_ARGS rf, _, xstart, xend, ystart, yend, n7, stride, \
            _, _, xendi, xstarti, stride5, _, n
    mov           r13d, [rfq+rf.n_mfmvs]
    test          r13d, r13d
    jz .ret
    mov     [rsp+0x0c], r13d
    mov        strideq, [rsp+0x30]
    movddup         m3, [pq_8192]
    mov            r9d, ystartd
    mov     [rsp+0x38], yendd
    mov     [rsp+0x20], xstartid
    xor             nd, nd
    lea            n7q, [rfq+rf.mfmv_ref2ref-1]
    imul            r9, strideq     ; ystart * stride
    mov     [rsp+0x48], rfq
    mov     [rsp+0x18], stride5q
    lea             r7, [r9*5]
    mov     [rsp+0x24], ystartd
    mov     [rsp+0x00], r7
.nloop:
DEFINE_ARGS y, off, xstart, xend, ystart, rf, n7, refsign, \
            ref, rp_ref, xendi, xstarti, _, _, n
    mov            rfq, [rsp+0x48]
    movsx         refd, byte [rfq+rf.mfmv_ref2cur+nq]
    cmp           refd, -32                 ; INVALID_REF2CUR
    je .next_n
    mov     [rsp+0x40], refd
    mov           offq, [rsp+0x00]          ; ystart * stride * 5
    movzx         refd, byte [rfq+rf.mfmv_ref+nq]
    lea       refsignq, [refq-4]
    mov        rp_refq, [rfq+rf.rp_ref]
    movq            m2, refsignq
    add           offq, [rp_refq+refq*8]    ; r = rp_ref[ref] + row_offset
    mov     [rsp+0x14], nd
    mov             yd, ystartd
.yloop:
    mov           r11d, [rsp+0x24]          ; ystart
    mov           r12d, [rsp+0x38]          ; yend
    mov           r14d, yd
    and           r14d, ~7                  ; y_sb_align
    cmp           r11d, r14d
    cmovs         r11d, r14d                ; imax(y_sb_align, ystart)
    mov     [rsp+0x44], r11d                ; y_proj_start
    add           r14d, 8
    cmp           r12d, r14d
    cmovs         r14d, r12d                ; imin(y_sb_align + 8, yend)
    mov     [rsp+0x3c], r14d                ; y_proj_end
DEFINE_ARGS y, src, xstart, xend, frac, rf, n7, mv, \
            ref, x, xendi, mvx, mvy, rb, ref2ref
    mov             xd, [rsp+0x20] ; xstarti
.xloop:
    lea            rbd, [xq*5]
    add            rbq, srcq
    movzx         refd, byte [rbq+4]
    test          refd, refd
    jz .next_x_bad_ref
    movzx     ref2refd, byte [n7q+refq]     ; rf->mfmv_ref2ref[n][b_ref-1]
    test      ref2refd, ref2refd
    jz .next_x_bad_ref
    lea          fracq, [mv_proj]
    movzx        fracd, word [fracq+ref2refq*2]
    mov            mvd, [rbq]
    imul         fracd, [rsp+0x40] ; ref2cur
    pmovsxwq        m0, [rbq]
    movd            m1, fracd
    punpcklqdq      m1, m1
    pmuldq          m0, m1          ; mv * frac
    pshufd          m1, m0, q3311
    paddd           m0, m3
    paddd           m0, m1
    psrad           m0, 14          ; offset = (xy + (xy >> 31) + 8192) >> 14
    pabsd           m1, m0
    packssdw        m0, m0
    psrld           m1, 6
    packuswb        m1, m1
    pxor            m0, m2          ; offset ^ ref_sign
    psignd          m1, m0          ; apply_sign(abs(offset) >> 6, offset ^ refsign)
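; Net effect of the SIMD block above, per vector component (a C-style sketch
; following the inline comments, not a literal transcription):
;     xy  = mv * mv_proj[ref2ref] * ref2cur;        // Q14 product
;     ofs = (xy + (xy >> 31) + 8192) >> 14;         // round to nearest,
;                                                   // ties away from zero
;     pos = apply_sign(abs(ofs) >> 6, ofs ^ ref_sign);
; e.g. xy = 8192 gives ofs = 1 while xy = -8192 gives ofs = -1, keeping the
; projection symmetric for opposite motion directions.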
    movq          mvxq, m1
    lea           mvyd, [mvxq+yq]   ; ypos
    sar           mvxq, 32
DEFINE_ARGS y, src, xstart, xend, _, _, n7, mv, \
            ref, x, xendi, mvx, ypos, rb, ref2ref
    cmp          yposd, [rsp+0x44] ; y_proj_start
    jl .next_x_bad_pos_y
    cmp          yposd, [rsp+0x3c] ; y_proj_end
    jge .next_x_bad_pos_y
    and          yposd, 15
    add           mvxq, xq          ; xpos
    imul         yposq, [rsp+0x30]  ; pos = (ypos & 15) * stride
DEFINE_ARGS y, src, xstart, xend, dst, _, n7, mv, \
            ref, x, xendi, xpos, pos, rb, ref2ref
    mov           dstq, [rsp+0x28]  ; dst = rp_proj + tile_row_offset
    add           posq, xposq       ; pos += xpos
    lea           posq, [posq*5]
    add           dstq, posq        ; dst += pos5
    jmp .write_loop_entry
.write_loop:
    add            rbq, 5
    cmp           refb, byte [rbq+4]
    jne .xloop
    cmp            mvd, [rbq]
    jne .xloop
    add           dstq, 5
    inc          xposd
.write_loop_entry:
    mov           r12d, xd
    and           r12d, ~7
    lea            r5d, [r12-8]
    cmp            r5d, xstartd
    cmovs          r5d, xstartd     ; x_proj_start
    cmp          xposd, r5d
    jl .next_xpos
    add           r12d, 16
    cmp          xendd, r12d
    cmovs         r12d, xendd       ; x_proj_end
    cmp          xposd, r12d
    jge .next_xpos
    mov       [dstq+0], mvd
    mov  byte [dstq+4], ref2refb
.next_xpos:
    inc             xd
    cmp             xd, xendid
    jl .write_loop
.next_y:
DEFINE_ARGS y, src, xstart, xend, ystart, _, n7, _, _, x, xendi, _, _, _, n
    add           srcq, [rsp+0x18] ; stride5
    inc             yd
    cmp             yd, [rsp+0x38] ; yend
    jne .yloop
    mov             nd, [rsp+0x14]
    mov        ystartd, [rsp+0x24]
.next_n:
    add            n7q, 7
    inc             nd
    cmp             nd, [rsp+0x0c] ; n_mfmvs
    jne .nloop
.ret:
    RET
.next_x:
DEFINE_ARGS y, src, xstart, xend, _, _, n7, mv, ref, x, xendi, _, _, rb, _
    add            rbq, 5
    cmp           refb, byte [rbq+4]
    jne .xloop
    cmp            mvd, [rbq]
    jne .xloop
.next_x_bad_pos_y:
    inc             xd
    cmp             xd, xendid
    jl .next_x
    jmp .next_y
.next_x_bad_ref:
    inc             xd
    cmp             xd, xendid
    jl .xloop
    jmp .next_y

INIT_YMM avx2
; refmvs_temporal_block *rp, ptrdiff_t stride,
; refmvs_block **rr, uint8_t *ref_sign,
; int col_end8, int row_end8, int col_start8, int row_start8
cglobal save_tmvs, 4, 13, 10, rp, stride, rr, ref_sign, \
                              xend, yend, xstart, ystart
%define base r12-.write1
    lea            r12, [.write1]
    movifnidn    xendd, xendm
    movifnidn    yendd, yendm
    mov        xstartd, xstartm
    mov        ystartd, ystartm
    vpbroadcastq    m4, [ref_signq]
    vpbroadcastq    m3, [base+save_ref_shuf+8]
    vpbroadcastq    m5, [base+save_cond0]
    vpbroadcastq    m6, [base+save_cond1]
    vpbroadcastd    m7, [base+pb_128]
    mova            m8, [base+save_pack0]
    mova            m9, [base+save_pack1]
    psllq           m4, 8
    lea            r9d, [xendq*5]
    lea        xstartd, [xstartq*5]
    sub          yendd, ystartd
    add        ystartd, ystartd
    lea        strideq, [strideq*5]
    sub        xstartq, r9
    add          xendd, r9d
    add            rpq, r9
DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand
.loop_y:
    and        ystartd, 30
    mov             xq, xstartq
    mov             bq, [rrq+ystartq*8]
    add        ystartd, 2
    lea             bq, [bq+xendq*4]
.loop_x:
    imul         candq, xq, 0x9999
    sar          candq, 16                   ; x / 5 * 3
    movzx         r10d, byte [bq+candq*8+22] ; cand_b->bs
    movu           xm0, [bq+candq*8+12]      ; cand_b
    movzx         r11d, byte [base+save_tmvs_avx2_table+r10*2+0]
    movzx         r10d, byte [base+save_tmvs_avx2_table+r10*2+1]
    add            r10, r12
    add          candq, r11
    jge .calc
    vinserti128     m0, [bq+candq*8+12], 1
    movzx         r11d, byte [bq+candq*8+22]
    movzx         r11d, byte [base+save_tmvs_avx2_table+r11*2+1]
    add            r11, r12
.calc:
    pshufb          m1, m0, m3
    pabsw           m2, m0
    pshufb          m1, m4, m1  ; ref > 0 && ref_sign[ref - 1]
    psrlw           m2, 12      ; (abs(mv.x) | abs(mv.y)) < 4096
    pcmpgtd         m1, m2
    pshufd          m2, m1, q2301
    pand            m1, m5      ; b0.cond0 b1.cond0
    pand            m2, m6      ; b0.cond1 b1.cond1
    por             m1, m2      ; b0.shuf b1.shuf
    pxor            m1, m7      ; if cond0|cond1 == 0 => zero out
    pshufb          m0, m1
    call           r10
    jge .next_line
    vextracti128   xm0, m0, 1
    call           r11
    jl .loop_x
.next_line:
    add            rpq, strideq
    dec             hd
    jg .loop_y
    RET
.write1:
    movd   [rpq+xq+ 0], xm0
    pextrb [rpq+xq+ 4], xm0, 4
    add             xq, 5*1
    ret
.write2:
    movq    [rpq+xq+0], xm0
    psrlq          xm1, xm0, 8
    movd    [rpq+xq+6], xm1
    add             xq, 5*2
    ret
.write4:
    pshufb         xm1, xm0, xm8
    movu   [rpq+xq+ 0], xm1
    psrlq          xm1, 8
    movd   [rpq+xq+16], xm1
    add             xq, 5*4
    ret
.write8:
    vinserti128     m1, m0, xm0, 1
    pshufb          m1, m8
    movu   [rpq+xq+ 0], m1
    psrldq         xm1, 2
    movq   [rpq+xq+32], xm1
    add             xq, 5*8
    ret
.write16:
    vinserti128     m1, m0, xm0, 1
    pshufb          m2, m1, m8
    movu   [rpq+xq+ 0], m2
    pshufb          m1, m9
    movu   [rpq+xq+32], m1
    shufps         xm2, xm1, q1021
    movu   [rpq+xq+64], xm2
    add             xq, 5*16
    ret

cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
    add           bx4d, bw4d
    tzcnt         bw4d, bw4d
    vbroadcasti128  m0, [aq]
    lea             aq, [splat_mv_avx2_table]
    lea           bx4q, [bx4q*3-32]
    movsxd        bw4q, [aq+bw4q*4]
    pshufb          m0, [splat_mv_shuf]
    movifnidn     bh4d, bh4m
    pshufd          m1, m0, q2102
    pshufd          m2, m0, q1021
    add           bw4q, aq
.loop:
    mov             aq, [rrq]
    add            rrq, gprsize
    lea             aq, [aq+bx4q*4]
    jmp           bw4q
.w32:
    mova     [aq-32*8], m0
    mova     [aq-32*7], m1
    mova     [aq-32*6], m2
    mova     [aq-32*5], m0
    mova     [aq-32*4], m1
    mova     [aq-32*3], m2
.w16:
    mova     [aq-32*2], m0
    mova     [aq-32*1], m1
    mova     [aq+32*0], m2
.w8:
    mova     [aq+32*1], m0
    mova     [aq+32*2], m1
    mova     [aq+32*3], m2
    dec           bh4d
    jg .loop
    RET
.w4:
    movu      [aq+ 80], m0
    mova      [aq+112], xm1
    dec           bh4d
    jg .loop
    RET
.w2:
    movu      [aq+104], xm0
    movq      [aq+120], xm2
    dec           bh4d
    jg .loop
    RET
.w1:
    movq      [aq+116], xm0
    movd      [aq+124], xm1
    dec           bh4d
    jg .loop
    RET

INIT_ZMM avx512icl
; refmvs_temporal_block *rp, ptrdiff_t stride,
; refmvs_block **rr, uint8_t *ref_sign,
; int col_end8, int row_end8, int col_start8, int row_start8
cglobal save_tmvs, 4, 15, 10, rp, stride, rr, ref_sign, \
                              xend, yend, xstart, ystart
%define base r14-.write1
    lea            r14, [.write1]
    movifnidn    xendd, xendm
    movifnidn    yendd, yendm
    mov        xstartd, xstartm
    mov        ystartd, ystartm
    psllq           m4, [ref_signq]{bcstq}, 8
    vpbroadcastq    m3, [base+save_ref_shuf+8]
    vbroadcasti32x4 m5, [base+cond_shuf512]
    vbroadcasti32x4 m6, [base+save_cond0]
    vpbroadcastd    m7, [base+pb_128]
    mova            m8, [base+save_pack0]
    movu           xm9, [base+save_pack0+4]
    lea            r9d, [xendq*5]
    lea        xstartd, [xstartq*5]
    sub          yendd, ystartd
    add        ystartd, ystartd
    lea        strideq, [strideq*5]
    sub        xstartq, r9
    add          xendd, r9d
    add            rpq, r9
    mov           r10d, 0x1f
    kmovb           k2, r10d
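; Mask 0x1f enables 5 elements per masked store; since the element width
; grows with the write size (bytes in .write1, words in .write2, dwords in
; .write4, qwords in .write8), the same k2 covers exactly 5, 10, 20 or 40
; bytes, i.e. 1, 2, 4 or 8 five-byte refmvs_temporal_block entries.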
DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand
.loop_y:
    and        ystartd, 30
    mov             xq, xstartq
    mov             bq, [rrq+ystartq*8]
    add        ystartd, 2
    lea             bq, [bq+xendq*4]
.loop_x:
    imul         candq, xq, 0x9999
    sar          candq, 16                   ; x / 5 * 3
    movzx         r10d, byte [bq+candq*8+22] ; cand_b->bs
    movu           xm0, [bq+candq*8+12]      ; cand_b
    movzx         r11d, byte [base+save_tmvs_avx512icl_table+r10*2+0]
    movzx         r10d, byte [base+save_tmvs_avx512icl_table+r10*2+1]
    add            r10, r14
    add          candq, r11
    jge .calc
    movzx         r11d, byte [bq+candq*8+22]
    vinserti32x4   ym0, [bq+candq*8+12], 1
    movzx         r12d, byte [base+save_tmvs_avx512icl_table+r11*2+0]
    movzx         r11d, byte [base+save_tmvs_avx512icl_table+r11*2+1]
    add            r11, r14
    add          candq, r12
    jge .calc
    movzx         r12d, byte [bq+candq*8+22]
    vinserti32x4    m0, [bq+candq*8+12], 2
    movzx         r13d, byte [base+save_tmvs_avx512icl_table+r12*2+0]
    movzx         r12d, byte [base+save_tmvs_avx512icl_table+r12*2+1]
    add            r12, r14
    add          candq, r13
    jge .calc
    vinserti32x4    m0, [bq+candq*8+12], 3
    movzx         r13d, byte [bq+candq*8+22]
    movzx         r13d, byte [base+save_tmvs_avx512icl_table+r13*2+1]
    add            r13, r14
.calc:
    pshufb          m1, m0, m3
    pabsw           m2, m0
    pshufb          m1, m4, m1      ; ref > 0 && ref_sign[ref - 1]
    psrlw           m2, 12          ; (abs(mv.x) | abs(mv.y)) < 4096
    psubd           m2, m1
    pshufb          m2, m5           ; c0 c1 c1 c0
    pand            m2, m6
    punpckhqdq      m1, m2, m2
    vpternlogd      m1, m2, m7, 0x56 ; (c0shuf | c1shuf) ^ 0x80
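; vpternlogd with imm8 0x56 computes (A | B) ^ C per bit (truth table
; 0101_0110), fusing the separate por + pxor used by the SSSE3/AVX2 paths:
; shuffle-index bytes for which neither condition holds end up with bit 7
; set, which pshufb below turns into zeroed (invalid) entries.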
    pshufb          m2, m0, m1
    mova           xm0, xm2
    call           r10
    jge .next_line
    vextracti32x4  xm0, m2, 1
    call           r11
    jge .next_line
    vextracti32x4  xm0, m2, 2
    call           r12
    jge .next_line
    vextracti32x4  xm0, m2, 3
    call           r13
    jl .loop_x
.next_line:
    add            rpq, strideq
    dec             hd
    jg .loop_y
    RET
.write1:
    vmovdqu8 [rpq+xq]{k2}, xm0
    add             xq, 5*1
    ret
.write2:
    pshufb         xm0, xm8
    vmovdqu16 [rpq+xq]{k2}, xm0
    add             xq, 5*2
    ret
.write4:
    vpermb         ym0, ym8, ym0
    vmovdqu32 [rpq+xq]{k2}, ym0
    add             xq, 5*4
    ret
.write8:
    vpermb          m0, m8, m0
    vmovdqu64 [rpq+xq]{k2}, m0
    add             xq, 5*8
    ret
.write16:
    vpermb          m1, m8, m0
    movu   [rpq+xq+ 0], m1
    pshufb         xm0, xm9
    movu   [rpq+xq+64], xm0
    add             xq, 5*16
    ret

INIT_ZMM avx512icl
cglobal splat_mv, 4, 7, 3, rr, a, bx4, bw4, bh4
    vbroadcasti32x4    m0, [aq]
    lea                r1, [splat_mv_avx512icl_table]
    tzcnt            bw4d, bw4d
    lea              bx4d, [bx4q*3]
    pshufb             m0, [splat_mv_shuf]
    movsxd           bw4q, [r1+bw4q*4]
    mov               r6d, bh4m
    add              bw4q, r1
    lea               rrq, [rrq+r6*8]
    mov               r1d, 0x3f
    neg                r6
    kmovb              k1, r1d
    jmp              bw4q
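; Mask 0x3f enables 6 elements, and the element width again scales with the
; block size: 6 words (.w1) span one 12-byte refmvs_block, 6 dwords (.w2)
; two, and 6 qwords (.w4) four, so narrow columns need no scalar tail
; stores.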
.w1:
    mov                r1, [rrq+r6*8]
    vmovdqu16 [r1+bx4q*4]{k1}, xm0
    inc                r6
    jl .w1
    RET
.w2:
    mov                r1, [rrq+r6*8]
    vmovdqu32 [r1+bx4q*4]{k1}, ym0
    inc                r6
    jl .w2
    RET
.w4:
    mov                r1, [rrq+r6*8]
    vmovdqu64 [r1+bx4q*4]{k1}, m0
    inc                r6
    jl .w4
    RET
.w8:
    pshufd            ym1, ym0, q1021
.w8_loop:
    mov                r1, [rrq+r6*8+0]
    mov                r3, [rrq+r6*8+8]
    movu   [r1+bx4q*4+ 0], m0
    mova   [r1+bx4q*4+64], ym1
    movu   [r3+bx4q*4+ 0], m0
    mova   [r3+bx4q*4+64], ym1
    add                r6, 2
    jl .w8_loop
    RET
.w16:
    pshufd             m1, m0, q1021
    pshufd             m2, m0, q2102
.w16_loop:
    mov                r1, [rrq+r6*8+0]
    mov                r3, [rrq+r6*8+8]
    mova [r1+bx4q*4+64*0], m0
    mova [r1+bx4q*4+64*1], m1
    mova [r1+bx4q*4+64*2], m2
    mova [r3+bx4q*4+64*0], m0
    mova [r3+bx4q*4+64*1], m1
    mova [r3+bx4q*4+64*2], m2
    add                r6, 2
    jl .w16_loop
    RET
.w32:
    pshufd             m1, m0, q1021
    pshufd             m2, m0, q2102
.w32_loop:
    mov                r1, [rrq+r6*8]
    lea                r1, [r1+bx4q*4]
    mova        [r1+64*0], m0
    mova        [r1+64*1], m1
    mova        [r1+64*2], m2
    mova        [r1+64*3], m0
    mova        [r1+64*4], m1
    mova        [r1+64*5], m2
    inc                r6
    jl .w32_loop
    RET
%endif ; ARCH_X86_64