tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

filmgrain16_avx512.asm (31953B)


      1 ; Copyright © 2022, VideoLAN and dav1d authors
      2 ; Copyright © 2022, Two Orioles, LLC
      3 ; All rights reserved.
      4 ;
      5 ; Redistribution and use in source and binary forms, with or without
      6 ; modification, are permitted provided that the following conditions are met:
      7 ;
      8 ; 1. Redistributions of source code must retain the above copyright notice, this
      9 ;    list of conditions and the following disclaimer.
     10 ;
     11 ; 2. Redistributions in binary form must reproduce the above copyright notice,
     12 ;    this list of conditions and the following disclaimer in the documentation
     13 ;    and/or other materials provided with the distribution.
     14 ;
     15 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     16 ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     17 ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     18 ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     19 ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     20 ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     21 ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     22 ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     24 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25 
     26 %include "config.asm"
     27 %include "ext/x86/x86inc.asm"
     28 %include "x86/filmgrain_common.asm"
     29 
     30 %if ARCH_X86_64
     31 
     32 SECTION_RODATA 16
        ; Constant tables for the film-grain kernels below. The code addresses them
        ; relative to fg_min (see the `base` defines) and selects entries with small
        ; run-time indices; "is_12bpc" below is the code's name for bdmax >> 11.
        ;
        ; scale_mask: vpshufb control. Per dword it copies byte 0 into byte 2 and
        ; zeroes bytes 1 and 3. fgy's .add_noise applies it under write mask k3
        ; (0xee..ee), so byte 0 of the destination is left untouched.
     33 scale_mask:    db -1, -1,  0, -1, -1, -1,  4, -1, -1, -1,  8, -1, -1, -1, 12, -1
        ; scale_shift: word pairs, each read as one dword.
        ;   [scale_shift + scaling_shift*4 - 32] -> vpsllvw count for scaling values
        ;   [scale_shift + is_12bpc*8 + 4]       -> vpsravw count after overlap blend
        ; NOTE(review): the -32 bias presumes FGData.scaling_shift is in 8..11 --
        ; confirm against the FGData definition.
     34 scale_shift:           dw   7,   7,   6,   6,   5,   5,   4,   4
        ; Overlap blend weights, indexed by is_12bpc*8. In both tables the first
        ; qword holds the second qword's values multiplied by 4.
     35 pw_27_17_17_27:        dw 108,  68,  68, 108,  27,  17,  17,  27
     36 pw_23_22:              dw  92,  88,   0, 128,  23,  22,   0,  32
        ; fg_min: lower clamp (word pairs); index = clip_to_restricted_range << is_12bpc.
     37 fg_min:        times 2 dw 0
     38               times 2 dw 64
     39               times 2 dw 256
        ; fg_max: upper clamp (word pairs). fgy indexes it with is_12bpc + clip*4,
        ; FGUV_FN with is_12bpc + (clip << is_id)*2.
     40 fg_max:        times 2 dw 1023
     41               times 2 dw 4095
     42               times 2 dw 960
     43               times 2 dw 3840
     44               times 2 dw 940
     45               times 2 dw 3760
        ; scale_rnd: dword bias preloaded into the vpdpwssd accumulators of the
        ; overlap blends; index = is_12bpc.
     46 scale_rnd:             dd 64
     47                       dd 16
        ; uv_offset_mul: pmaddwd multiplier applied to the broadcast FGData.uv_offset
        ; in FGUV_FN's not-csfl setup; index = is_12bpc.
     48 uv_offset_mul:         dd 256
     49                       dd 1024
        ; pb_8_9_0_1: pshufb pattern applied to the FGData.uv_mult broadcast in the
        ; same setup; the code there labels the result { uv_luma_mult, uv_mult }.
     50 pb_8_9_0_1:            db 8, 9, 0, 1
     51 
        ; Defined in another object file; FGUV_FN loads it when ss_hor is set.
     52 cextern pb_0to63
     53 
     54 SECTION .text
     55 
     56 INIT_ZMM avx512icl
        ;-----------------------------------------------------------------------
        ; fgy_32x32xn_16bpc: add film grain to 16-bit luma pixels. Each column
        ; pass covers 32 pixels and each .add_noise call covers two rows.
        ;
        ; cglobal (x86inc): 6 arguments loaded into registers (dst, src, stride,
        ; fg_data, w, scaling), 15 GPRs and 21 vector registers used.
        ; Stack arguments read below: grain_lutmp, hm, sbym and r9m, which the
        ; code labels bdmax.
        ;
        ; Values set up once and kept live for the whole function:
        ;   m6   = bdmax in every dword         m7   = scale_mask
        ;   m8   = lower clamp (fg_min)         m9   = upper clamp (fg_max)
        ;   m10  = vpsllvw count for scaling    m11  = vpsravw count after blending
        ;   m12/m13 = vertical blend weights for row 0 / row 1
        ;   m19  = rounding bias (scale_rnd)    xm20 = horizontal blend weights
        ;   k1 = 0xffff, k2 = 0x0f, k3 = 0xeeeeeeeeeeeeeeee
        ;
        ; Paths:
        ;   .loop_x            no overlap
        ;   .loop_x_h_overlap  blend the two leftmost grain values of each row
        ;                      with grain from the previous column's block
        ;   .v_overlap         first column when sby != 0: blend the first two
        ;                      rows with grain addressed via top_offxy
        ;   .hv_overlap        later columns when sby != 0: both blends
        ; Vertical blending is applied to the first two rows of a column only;
        ; the remaining rows continue in .loop_y / .loop_y_h_overlap.
        ;-----------------------------------------------------------------------
     57 cglobal fgy_32x32xn_16bpc, 6, 15, 21, dst, src, stride, fg_data, w, scaling, \
     58                                      grain_lut, offx, sby, see, offy, src_bak
     59 %define base r11-fg_min
     60    lea             r11, [fg_min]
     61    mov             r6d, r9m    ; bdmax
     62    mov             r9d, [fg_dataq+FGData.clip_to_restricted_range]
     63    mov             r7d, [fg_dataq+FGData.scaling_shift]
     64    mov            sbyd, sbym
     65    vpbroadcastd     m6, r9m    ; m6 = bdmax per dword (r9m is the stack slot, not r9d)
     66    shr             r6d, 11     ; is_12bpc
     67    vbroadcasti32x4  m7, [base+scale_mask]
     68    shlx           r10d, r9d, r6d            ; clip << is_12bpc: fg_min index
     69    vpbroadcastd    m10, [base+scale_shift+r7*4-32]
     70    lea             r9d, [r6+r9*4]           ; is_12bpc + clip*4: fg_max index
     71    vpbroadcastd     m8, [base+fg_min+r10*4]
     72    kxnorw           k1, k1, k1 ; 0xffff
     73    vpbroadcastd     m9, [base+fg_max+r9*4]
     74    mov             r12, 0xeeeeeeeeeeeeeeee
     75    vpbroadcastd    m19, [base+scale_rnd+r6*4]
     76    kshiftrb         k2, k1, 4  ; 0xf
     77    vpbroadcastq   xm20, [base+pw_27_17_17_27+r6*8]
     78    kmovq            k3, r12
     79    vpbroadcastd    m11, [base+scale_shift+r6*8+4]
     80    test           sbyd, sbyd
     81    setnz           r7b                      ; r7b = (sby != 0)
     82    vpbroadcastd    m12, [base+pw_27_17_17_27+r6*8+0]
     83    vpbroadcastd    m13, [base+pw_27_17_17_27+r6*8+4]
           ; r7b is 0 or 1, so this is non-zero only when bit 0 of overlap_flag
           ; is set and sby != 0
     84    test            [fg_dataq+FGData.overlap_flag], r7b
     85    jnz .v_overlap
     86 
           ; seed = ((((sby*37 + 178) & 0xff) << 8) | ((sby*173 + 105) & 0xff))
           ;        ^ FGData.seed
           ; (the low-byte product must not carry into bit 24; the multiplies are
           ; done in one 32-bit imul/add and the bytes swapped into place by rorx)
     87    imul           seed, sbyd, (173 << 24) | 37
     88    add            seed, (105 << 24) | 178
     89    rorx           seed, seed, 24
     90    movzx          seed, seew
     91    xor            seed, [fg_dataq+FGData.seed]
     92    lea        src_bakq, [srcq+wq*2]         ; src_bak = src + w*2 bytes
     93    neg              wq                      ; w now counts up from -w towards 0
     94    sub            dstq, srcq                ; dst is kept as an offset from src
     95 
     96 .loop_x:
           ; Advance the seed: shift right by one and put into bit 15 the XOR of
           ; seed bits 0, 1, 3 and 12. OR-ing in 0xeff4 forces every other bit of
           ; (seeb & seeh) to a constant, so the parity flag reflects that XOR.
           ; NOTE(review): relies on the seed fitting in 16 bits -- confirm
           ; FGData.seed's range.
     97    rorx             r6, seeq, 1
     98    or             seed, 0xeff4
     99    test           seeb, seeh
    100    lea            seed, [r6+0x8000]
    101    cmovp          seed, r6d                 ; updated seed
           ; offy = seed bits 8..11, offx = seed bits 12..15
           ; 164 = 2*82 and 747 = 9*82+9, i.e. offxy = (9 + 2*offy)*82 + 9 + 2*offx
    102    rorx          offyd, seed, 8
    103    rorx          offxq, seeq, 12
    104    and           offyd, 0xf
    105    imul          offyd, 164
    106    lea           offyd, [offyq+offxq*2+747] ; offy*stride+offx
    107 
    108    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \
    109                sby, see, offxy, src_bak
    110 
    111    mov      grain_lutq, grain_lutmp         ; .add_noise advances grain_lutq, so reload per column
    112    mov              hd, hm
    113 .loop_y:
           ; m4/m5 = 32 grain words for two consecutive LUT rows (82 words per row)
    114    movu             m4, [grain_lutq+offxyq*2+82*0]
    115    movu             m5, [grain_lutq+offxyq*2+82*2]
    116    call .add_noise
    117    sub              hb, 2
    118    jg .loop_y
    119    add              wq, 32
    120    jge .end
    121    lea            srcq, [src_bakq+wq*2]
    122    cmp byte [fg_dataq+FGData.overlap_flag], 0
    123    je .loop_x
    124    test           sbyd, sbyd
    125    jnz .hv_overlap
    126 
    127    ; horizontal overlap (without vertical overlap)
    128 .loop_x_h_overlap:
           ; same seed update as in .loop_x
    129    rorx             r6, seeq, 1
    130    or             seed, 0xeff4
    131    test           seeb, seeh
    132    lea            seed, [r6+0x8000]
    133    cmovp          seed, r6d                 ; updated seed
    134 
    135    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \
    136                sby, see, offy, src_bak, left_offxy
    137 
           ; offy still holds the previous column's offxy here. The +73 bias
           ; (= 32 + 41 words) pairs with the -82/+82 byte displacements used in
           ; the loads below, giving word offset +32 on row 0 and row 1.
    138    lea     left_offxyd, [offyq+73]          ; previous column's offy*stride+offx
    139    rorx          offyd, seed, 8
    140    rorx          offxq, seeq, 12
    141    and           offyd, 0xf
    142    imul          offyd, 164
    143    lea           offyd, [offyq+offxq*2+747] ; offy*stride+offx
    144 
    145    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \
    146                sby, see, offxy, src_bak, left_offxy
    147 
    148    mov      grain_lutq, grain_lutmp
    149    mov              hd, hm
    150 .loop_y_h_overlap:
    151    movu             m4, [grain_lutq+offxyq*2+82*0]
    152    movu             m5, [grain_lutq+offxyq*2+82*2]
           ; xm17 = two left-block grain words for row 0 (dword 0) and row 1 (dword 1)
    153    movd           xm17, [grain_lutq+left_offxyq*2-82*1]
    154    pinsrd         xm17, [grain_lutq+left_offxyq*2+82*1], 1
           ; interleave to (left, cur) word pairs and take the weighted sums:
           ; xm16 = scale_rnd + left*w0 + cur*w1 with the weights held in xm20
    155    punpckldq      xm16, xm4, xm5
    156    punpcklwd      xm17, xm16
    157    mova           xm16, xm19
    158    vpdpwssd       xm16, xm20, xm17
    159    psrad          xm16, 1
    160    packssdw       xm16, xm16
    161    vpsravw        xm16, xm11
           ; k2 = 0x0f: replace only the first two words (4 bytes) of each row
    162    vmovdqu8     m4{k2}, m16
    163    vpalignr     m5{k2}, m16, m16, 4
    164    call .add_noise
    165    sub              hb, 2
    166    jg .loop_y_h_overlap
    167    add              wq, 32
    168    jge .end
    169    lea            srcq, [src_bakq+wq*2]
    170    test           sbyd, sbyd
    171    jnz .hv_overlap
    172    jmp .loop_x_h_overlap
    173 
    174 .v_overlap:
           ; Build two seeds at once. The high half uses the same constants as the
           ; non-overlap path (173/105 and 37/178) with sby; the low half uses
           ; 188 = 105 - 173 (mod 256) and 141 = 178 - 37, i.e. the same formula
           ; evaluated for sby - 1. sby is reduced to 8 bits first.
    175    movzx          sbyd, sbyb
    176    imul           seed, [fg_dataq+FGData.seed], 0x00010001
    177    imul            r7d, sbyd, 173 * 0x00010001
    178    imul           sbyd, 37 * 0x01000100
    179    add             r7d, (105 << 16) | 188
    180    add            sbyd, (178 << 24) | (141 << 8)
    181    and             r7d, 0x00ff00ff
    182    and            sbyd, 0xff00ff00
    183    xor            seed, r7d
    184    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
    185    lea        src_bakq, [srcq+wq*2]
    186    neg              wq
    187    sub            dstq, srcq
    188 
           ; Seed update for both 16-bit halves: setp captures the feedback parity
           ; of each half in r7d bits 16 and 0, the `or 0x00010001` / xor pair
           ; turns it into the inverted-parity bit, and rorx moves that bit to the
           ; top of each half while shifting the rest right by one.
    189    ; we assume from the block above that bits 8-15 of r7d are zeroed
    190    mov             r6d, seed
    191    or             seed, 0xeff4eff4
    192    test           seeb, seeh
    193    setp            r7b                     ; parity of top_seed
    194    shr            seed, 16
    195    shl             r7d, 16
    196    test           seeb, seeh
    197    setp            r7b                     ; parity of cur_seed
    198    or              r6d, 0x00010001
    199    xor             r7d, r6d
    200    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
    201 
    202    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \
    203                sby, see, offy, src_bak, _, top_offxy
    204 
    205    rorx          offyd, seed, 8
    206    rorx          offxd, seed, 12
    207    and           offyd, 0xf000f
    208    and           offxd, 0xf000f
    209    imul          offyd, 164
           ; 0x10001*747 biases both halves; +32*82 lands in the low (top) half
           ; only, moving top_offxy 32 LUT rows further down
    210    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    211    lea           offyd, [offyq+offxq*2+0x10001*747+32*82]
    212 
    213    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \
    214                sby, see, offxy, src_bak, _, top_offxy
    215 
    216    mov      grain_lutq, grain_lutmp
    217    mov              hd, hm
    218    movzx    top_offxyd, offxyw              ; split the packed pair
    219    shr          offxyd, 16
    220 
           ; rows 0/1 of the current block (m16, m17) and of the top block (m0, m1),
           ; interleaved as (top, cur) word pairs for .add_noise_v
    221    movu            m16, [grain_lutq+offxyq*2+82*0]
    222    movu             m0, [grain_lutq+top_offxyq*2+82*0]
    223    movu            m17, [grain_lutq+offxyq*2+82*2]
    224    movu             m1, [grain_lutq+top_offxyq*2+82*2]
    225    punpckhwd        m4, m0, m16
    226    punpcklwd        m0, m16
    227    punpckhwd        m5, m1, m17
    228    punpcklwd        m1, m17
    229    call .add_noise_v
    230    sub              hb, 2
    231    jg .loop_y                               ; remaining rows need no vertical blend
    232    add              wq, 32
    233    jge .end
    234    lea            srcq, [src_bakq+wq*2]
    235 
    236    ; since fg_dataq.overlap is guaranteed to be set, we never jump back
    237    ; to .v_overlap, and instead always fall-through to .hv_overlap
    238 .hv_overlap:
           ; same two-seed update as in .v_overlap
    239    ; we assume from the block above that bits 8-15 of r7d are zeroed
    240    mov             r6d, seed
    241    or             seed, 0xeff4eff4
    242    test           seeb, seeh
    243    setp            r7b                     ; parity of top_seed
    244    shr            seed, 16
    245    shl             r7d, 16
    246    test           seeb, seeh
    247    setp            r7b                     ; parity of cur_seed
    248    or              r6d, 0x00010001
    249    xor             r7d, r6d
    250    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
    251 
    252    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \
    253                sby, see, offy, src_bak, left_offxy, top_offxy, topleft_offxy
    254 
           ; derive the left / top-left offsets from the previous column's
           ; offsets before those registers are recomputed
    255    lea  topleft_offxyd, [top_offxyq+73]
    256    lea     left_offxyd, [offyq+73]
    257    rorx          offyd, seed, 8
    258    rorx          offxd, seed, 12
    259    and           offyd, 0xf000f
    260    and           offxd, 0xf000f
    261    imul          offyd, 164
    262    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    263    lea           offyd, [offyq+offxq*2+0x10001*747+32*82]
    264 
    265    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \
    266                sby, see, offxy, src_bak, left_offxy, top_offxy, topleft_offxy
    267 
    268    mov      grain_lutq, grain_lutmp
    269    mov              hd, hm
    270    movzx    top_offxyd, offxyw
    271    shr          offxyd, 16
    272 
           ; Row 0: m5 = cur, m0 = top, xm17 = {left, topleft} word pairs.
           ; Row 1: m2 = cur, m1 = top, xm18 = {left, topleft} word pairs.
    273    movu             m5, [grain_lutq+offxyq*2+82*0]
    274    movu             m0, [grain_lutq+top_offxyq*2+82*0]
    275    movd           xm17, [grain_lutq+left_offxyq*2-82*1]
    276    pinsrd         xm17, [grain_lutq+topleft_offxyq*2-82*1], 1
    277    movu             m2, [grain_lutq+offxyq*2+82*2]
    278    movu             m1, [grain_lutq+top_offxyq*2+82*2]
    279    movd           xm18, [grain_lutq+left_offxyq*2+82*1]
    280    pinsrd         xm18, [grain_lutq+topleft_offxyq*2+82*1], 1
           ; horizontal blend of the two leftmost words of cur and top, per row:
           ; xm16 = row 0 {cur0, cur1, top0, top1}, xm17 = same for row 1
    281    punpckldq      xm16, xm5, xm0
    282    punpcklwd      xm17, xm16
    283    mova           xm16, xm19
    284    vpdpwssd       xm16, xm20, xm17
    285    punpckldq      xm17, xm2, xm1
    286    punpcklwd      xm18, xm17
    287    mova           xm17, xm19
    288    vpdpwssd       xm17, xm20, xm18
           ; interleave full rows as (top, cur) word pairs for the vertical blend
    289    punpckhwd        m4, m0, m5
    290    punpcklwd        m0, m5
    291    punpckhwd        m5, m1, m2
    292    punpcklwd        m1, m2
    293    psrad          xm16, 1
    294    psrad          xm17, 1
    295    packssdw       xm16, xm17
    296    vpsravw        xm16, xm11
           ; reorder {cur0, cur1, top0, top1} to {top0, cur0, top1, cur1} and write
           ; it over the first four words (k2 = 0x0f) of the interleaved rows
    297    vpshuflw     m0{k2}, m16, q1302
    298    punpckhqdq     xm16, xm16
    299    vpshuflw     m1{k2}, m16, q1302
    300    call .add_noise_v
    301    sub              hb, 2
    302    jg .loop_y_h_overlap                     ; remaining rows: horizontal blend only
    303    add              wq, 32
    304    lea            srcq, [src_bakq+wq*2]
    305    jl .hv_overlap                           ; flags still come from the add; lea leaves them intact
    306 .end:
    307    RET
    308 ALIGN function_align
        ;-----------------------------------------------------------------------
        ; .add_noise_v -- vertical overlap blend, then falls through to .add_noise
        ;   In:  m0 / m4 = low / high interleaved (top, cur) grain words, row 0
        ;        m1 / m5 = the same for row 1
        ;        m12, m13 = blend weights for row 0 / row 1, m19 = rounding bias,
        ;        m11 = final arithmetic shift count
        ;   Out: m4, m5 = blended grain words for rows 0 and 1
        ;   Clobbers: m2, m3
        ;
        ; .add_noise -- scale two rows of grain and add them to the source
        ;   In:  m4, m5 = grain words for two rows; srcq = first source row;
        ;        dstq = dst - src (so [dstq+srcq] is the destination);
        ;        m6 = bdmax, m7 = scale_mask, m8/m9 = clamp bounds,
        ;        m10 = scaling pre-shift count, k1 = 0xffff, k3 = 0xee..ee
        ;   Out: two 64-byte rows stored; srcq advanced by 2*stride;
        ;        grain_lutq advanced by 82*4 bytes (two LUT rows of 82 words)
        ;   Clobbers: m0-m5, m16, m17, k4
        ;-----------------------------------------------------------------------
    309 .add_noise_v:
           ; each accumulator starts at the rounding bias, then receives
           ; top*w0 + cur*w1 for every (top, cur) word pair
    310    mova             m2, m19
    311    vpdpwssd         m2, m12, m4
    312    mova             m3, m19
    313    vpdpwssd         m3, m13, m5
    314    mova             m4, m19
    315    vpdpwssd         m4, m12, m0
    316    mova             m5, m19
    317    vpdpwssd         m5, m13, m1
    318    REPX   {psrad x, 1}, m2, m3, m4, m5
    319    packssdw         m4, m2                  ; row 0: low + high halves back to words
    320    packssdw         m5, m3                  ; row 1
    321    vpsravw          m4, m11
    322    vpsravw          m5, m11
    323 .add_noise:
    324    mova             m0, [srcq+strideq*0]
    325    mova             m1, [srcq+strideq*1]
           ; Look up scaling[pixel] with dword gathers at byte granularity; only
           ; the lowest byte of each gathered dword is kept. Even pixels (low word
           ; of each dword) are masked with bdmax; odd pixels (high word) are only
           ; gathered where px <= bdmax. A gather clears its mask register, hence
           ; the k4 reloads from k1.
    326    kmovw            k4, k1
    327    pand            m16, m6, m0
    328    psrld            m3, m0, 16
    329    vpgatherdd   m2{k4}, [scalingq+m16]
    330    vpcmpud          k4, m3, m6, 2 ; px <= bdmax
    331    vpgatherdd  m16{k4}, [scalingq+m3]
    332    kmovw            k4, k1
    333    pand            m17, m6, m1
    334    vpgatherdd   m3{k4}, [scalingq+m17]
           ; merge: keep byte 0 (even pixel), put the odd pixel's byte into byte 2,
           ; zero bytes 1 and 3 -> one zero-extended scaling value per word
    335    vpshufb      m2{k3}, m16, m7
    336    psrld           m16, m1, 16
    337    vpcmpud          k4, m16, m6, 2
    338    vpgatherdd  m17{k4}, [scalingq+m16]
    339    vpshufb      m3{k3}, m17, m7
           ; pre-shift the scaling values so that pmulhrsw's rounded >> 15 acts as
           ; the scaling_shift rounding shift
           ; NOTE(review): holds if the count in m10 equals 15 - scaling_shift,
           ; i.e. scaling_shift in 8..11 -- confirm
    340    vpsllvw          m2, m10
    341    vpsllvw          m3, m10
    342    pmulhrsw         m4, m2
    343    pmulhrsw         m5, m3
    344    add      grain_lutq, 82*4
    345    paddw            m0, m4
    346    paddw            m1, m5
    347    pmaxsw           m0, m8                  ; clamp to [fg_min, fg_max]
    348    pmaxsw           m1, m8
    349    pminsw           m0, m9
    350    pminsw           m1, m9
    351    mova    [dstq+srcq], m0
    352    add            srcq, strideq
    353    mova    [dstq+srcq], m1
    354    add            srcq, strideq
    355    ret
    356 
    357 %macro FGUV_FN 3 ; name, ss_hor, ss_ver
    358 cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 22, dst, src, stride, fg_data, w, scaling, \
    359                                           grain_lut, h, sby, luma, lstride, uv_pl, is_id
    360 %define base r12-fg_min
    361    lea             r12, [fg_min]
    362    mov             r9d, r13m            ; bdmax
    363    mov             r7d, [fg_dataq+FGData.scaling_shift]
    364    mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
    365    mov            r11d, is_idm
    366    kxnorw           k1, k1, k1          ; 0xffff
    367    vpbroadcastd     m5, r13m
    368    mov             r13, 0xeeeeeeeeeeeeeeee
    369    vbroadcasti32x4  m6, [base+scale_mask]
    370    shr             r9d, 11              ; is_12bpc
    371    vpbroadcastd     m7, [base+scale_shift+r7*4-32]
    372    shlx           r10d, r6d, r9d
    373    mov            sbyd, sbym
    374    shlx            r6d, r6d, r11d
    375    vpbroadcastd     m8, [base+fg_min+r10*4]
    376    lea             r6d, [r9+r6*2]
    377    vpbroadcastd     m9, [base+fg_max+r6*4]
    378    kmovq            k2, r13
    379    vpbroadcastd    m20, [base+scale_rnd+r9*4]
    380    packssdw         m4, m5, m5
    381    vpbroadcastd    m21, [base+scale_shift+r9*8+4]
    382 %if %2
    383    mova            m12, [pb_0to63] ; pw_even
    384    mov            r13d, 0x0101
    385    vpbroadcastq    m10, [base+pw_23_22+r9*8]
    386    kmovw            k3, r13d
    387 %if %3
    388    pshufd          m11, m10, q0000
    389 %else
    390    vpbroadcastd   ym16, [base+pw_27_17_17_27+r9*8+0]
    391    vpbroadcastd    m11, [base+pw_27_17_17_27+r9*8+4]
    392    vmovdqu16   m11{k1}, m16
    393 %endif
    394    psrlw           m13, m12, 8          ; pw_odd
    395 %else
    396    vpbroadcastq    m10, [base+pw_27_17_17_27+r9*8]
    397    kshiftrb         k3, k1, 7           ; 0x01
    398    kshiftrb         k4, k1, 4           ; 0x0f
    399    pshufd          m11, m10, q0000
    400 %endif
    401    mov        lstrideq, r10mp
    402    test           sbyd, sbyd
    403    setnz           r7b
    404    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
    405    jne .csfl
    406 
    407 %macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
    408    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
    409                _, sby, see, lstride
    410 
    411 %if %1
    412    mov             r6d, r11m
    413    vpbroadcastd     m0, [base+uv_offset_mul+r9*4]
    414    vpbroadcastd     m1, [base+pb_8_9_0_1]
    415    vpbroadcastd    m14, [fg_dataq+FGData.uv_offset+r6*4]
    416    vbroadcasti32x4 m15, [fg_dataq+FGData.uv_mult+r6*4]
    417    pmaddwd         m14, m0
    418    pshufb          m15, m1 ; { uv_luma_mult, uv_mult }
    419 %endif
    420    test            [fg_dataq+FGData.overlap_flag], r7b
    421    jnz %%v_overlap
    422 
    423    imul           seed, sbyd, (173 << 24) | 37
    424    add            seed, (105 << 24) | 178
    425    rorx           seed, seed, 24
    426    movzx          seed, seew
    427    xor            seed, [fg_dataq+FGData.seed]
    428 
    429    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
    430                offx, offy, see, lstride, luma
    431 
    432    mov           lumaq, r9mp
    433    lea             r12, [srcq+wq*2]
    434    lea             r13, [dstq+wq*2]
    435    lea             r14, [lumaq+wq*(2<<%2)]
    436    mov            r9mp, r12
    437    mov           r10mp, r13
    438    mov           r11mp, r14
    439    neg              wq
    440 
    441 %%loop_x:
    442    rorx             r6, seeq, 1
    443    or             seed, 0xeff4
    444    test           seeb, seeh
    445    lea            seed, [r6+0x8000]
    446    cmovp          seed, r6d               ; updated seed
    447    rorx          offyd, seed, 8
    448    rorx          offxq, seeq, 12
    449    and           offyd, 0xf
    450    imul          offyd, 164>>%3
    451    lea           offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx
    452 
    453    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
    454                h, offxy, see, lstride, luma
    455 
    456    mov      grain_lutq, grain_lutmp
    457    mov              hd, hm
    458 %%loop_y:
    459 %if %2
    460    movu           ym18, [grain_lutq+offxyq*2+82*0]
    461    vinserti32x8    m18, [grain_lutq+offxyq*2+82*2], 1
    462    movu           ym19, [grain_lutq+offxyq*2+82*4]
    463    vinserti32x8    m19, [grain_lutq+offxyq*2+82*6], 1
    464 %else
    465    movu            m18, [grain_lutq+offxyq*2+82*0]
    466    movu            m19, [grain_lutq+offxyq*2+82*2]
    467 %endif
    468    call %%add_noise
    469    sub              hb, 2<<%2
    470    jg %%loop_y
    471    add              wq, 32>>%2
    472    jge .end
    473    mov            srcq, r9mp
    474    mov            dstq, r10mp
    475    mov           lumaq, r11mp
    476    lea            srcq, [srcq+wq*2]
    477    lea            dstq, [dstq+wq*2]
    478    lea           lumaq, [lumaq+wq*(2<<%2)]
    479    cmp byte [fg_dataq+FGData.overlap_flag], 0
    480    je %%loop_x
    481    cmp       dword r8m, 0 ; sby
    482    jne %%hv_overlap
    483 
    484    ; horizontal overlap (without vertical overlap)
    485 %%loop_x_h_overlap:
    486    rorx             r6, seeq, 1
    487    or             seed, 0xEFF4
    488    test           seeb, seeh
    489    lea            seed, [r6+0x8000]
    490    cmovp          seed, r6d               ; updated seed
    491 
    492    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
    493                offx, offy, see, lstride, luma, left_offxy
    494 
    495    lea     left_offxyd, [offyq+(32>>%2)]  ; previous column's offy*stride+offx
    496    rorx          offyd, seed, 8
    497    rorx          offxq, seeq, 12
    498    and           offyd, 0xf
    499    imul          offyd, 164>>%3
    500    lea           offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
    501 
    502    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
    503                h, offxy, see, lstride, luma, left_offxy
    504 
    505    mov      grain_lutq, grain_lutmp
    506    mov              hd, hm
    507 %%loop_y_h_overlap:
    508 %if %2
    509    movu           ym18, [grain_lutq+offxyq*2+82*0]
    510    vinserti32x8    m18, [grain_lutq+offxyq*2+82*2], 1
    511    movu           ym19, [grain_lutq+offxyq*2+82*4]
    512    vinserti32x8    m19, [grain_lutq+offxyq*2+82*6], 1
    513    movd           xm16, [grain_lutq+left_offxyq*2+82*0]
    514    vinserti32x4    m16, [grain_lutq+left_offxyq*2+82*2], 2
    515    movd           xm17, [grain_lutq+left_offxyq*2+82*4]
    516    vinserti32x4    m17, [grain_lutq+left_offxyq*2+82*6], 2
    517    punpckldq       m16, m17
    518    punpckldq       m17, m18, m19
    519    punpcklwd       m16, m17
    520    mova            m17, m20
    521    vpdpwssd        m17, m16, m10
    522    psrad           m17, 1
    523    packssdw        m17, m17
    524    vpsravw         m17, m21
    525 %else
    526    movu            m18, [grain_lutq+offxyq*2+82*0]
    527    movu            m19, [grain_lutq+offxyq*2+82*2]
    528    movd           xm16, [grain_lutq+left_offxyq*2+82*0]
    529    pinsrd         xm16, [grain_lutq+left_offxyq*2+82*2], 1
    530    punpckldq      xm17, xm18, xm19
    531    punpcklwd      xm16, xm17
    532    mova           xm17, xm20
    533    vpdpwssd       xm17, xm16, xm10
    534    psrad          xm17, 1
    535    packssdw       xm17, xm17
    536    vpsravw        xm17, xm21
    537 %endif
    538    vmovdqa32   m18{k3}, m17
    539    vpshufd     m19{k3}, m17, q0321
    540    call %%add_noise
    541    sub              hb, 2<<%2
    542    jg %%loop_y_h_overlap
    543    add              wq, 32>>%2
    544    jge .end
    545    mov            srcq, r9mp
    546    mov            dstq, r10mp
    547    mov           lumaq, r11mp
    548    lea            srcq, [srcq+wq*2]
    549    lea            dstq, [dstq+wq*2]
    550    lea           lumaq, [lumaq+wq*(2<<%2)]
    551    cmp       dword r8m, 0 ; sby
    552    jne %%hv_overlap
    553    jmp %%loop_x_h_overlap
    554 
    555 %%v_overlap:
    556    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
    557                _, sby, see, lstride
    558 
    559    movzx          sbyd, sbyb
    560    imul           seed, [fg_dataq+FGData.seed], 0x00010001
    561    imul            r7d, sbyd, 173 * 0x00010001
    562    imul           sbyd, 37 * 0x01000100
    563    add             r7d, (105 << 16) | 188
    564    add            sbyd, (178 << 24) | (141 << 8)
    565    and             r7d, 0x00ff00ff
    566    and            sbyd, 0xff00ff00
    567    xor            seed, r7d
    568    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
    569 
    570    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
    571                offx, offy, see, lstride, luma, _, top_offxy
    572 
    573    mov           lumaq, r9mp
    574    lea             r12, [srcq+wq*2]
    575    lea             r13, [dstq+wq*2]
    576    lea             r14, [lumaq+wq*(2<<%2)]
    577    mov            r9mp, r12
    578    mov           r10mp, r13
    579    mov           r11mp, r14
    580    neg              wq
    581 
    582    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    583    mov             r6d, seed
    584    or             seed, 0xeff4eff4
    585    test           seeb, seeh
    586    setp            r7b                     ; parity of top_seed
    587    shr            seed, 16
    588    shl             r7d, 16
    589    test           seeb, seeh
    590    setp            r7b                     ; parity of cur_seed
    591    or              r6d, 0x00010001
    592    xor             r7d, r6d
    593    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
    594 
    595    rorx          offyd, seed, 8
    596    rorx          offxd, seed, 12
    597    and           offyd, 0xf000f
    598    and           offxd, 0xf000f
    599    imul          offyd, 164>>%3
    600    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    601    lea           offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
    602 
    603    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
    604                h, offxy, see, lstride, luma, _, top_offxy
    605 
    606    mov      grain_lutq, grain_lutmp
    607    mov              hd, hm
    608    movzx    top_offxyd, offxyw
    609    shr          offxyd, 16
    610 
    611 %if %3
    612    movu           ym16, [grain_lutq+offxyq*2+82*0]
    613    movu            ym1, [grain_lutq+top_offxyq*2+82*0]
    614    vbroadcasti32x8 m18, [grain_lutq+offxyq*2+82*2]
    615    movu           ym19, [grain_lutq+offxyq*2+82*4]
    616    vinserti32x8    m19, [grain_lutq+offxyq*2+82*6], 1
    617    punpcklwd      ym17, ym1, ym16
    618    punpckhwd       ym1, ym16
    619 %elif %2
    620    movu           ym18, [grain_lutq+offxyq*2+82*0]
    621    vinserti32x8    m18, [grain_lutq+offxyq*2+82*2], 1
    622    movu           ym17, [grain_lutq+top_offxyq*2+82*0]
    623    vinserti32x8    m17, [grain_lutq+top_offxyq*2+82*2], 1
    624    movu           ym19, [grain_lutq+offxyq*2+82*4]
    625    vinserti32x8    m19, [grain_lutq+offxyq*2+82*6], 1
    626    punpcklwd       m16, m17, m18
    627    punpckhwd       m17, m18
    628 %else
    629    movu            m18, [grain_lutq+offxyq*2+82*0]
    630    movu            m19, [grain_lutq+top_offxyq*2+82*0]
    631    movu             m2, [grain_lutq+offxyq*2+82*2]
    632    movu            m16, [grain_lutq+top_offxyq*2+82*2]
    633    punpckhwd        m1, m19, m18
    634    punpcklwd       m19, m18
    635    punpckhwd       m18, m2, m16
    636    punpcklwd        m2, m16
    637 %endif
    638    call %%add_noise_v
    639    sub              hb, 2<<%2
    640    jg %%loop_y
    641    add              wq, 32>>%2
    642    jge .end
    643    mov            srcq, r9mp
    644    mov            dstq, r10mp
    645    mov           lumaq, r11mp
    646    lea            srcq, [srcq+wq*2]
    647    lea            dstq, [dstq+wq*2]
    648    lea           lumaq, [lumaq+wq*(2<<%2)]
    649 
    650    ; since fg_dataq.overlap is guaranteed to be set, we never jump back
    651    ; to %%v_overlap, and instead always fall-through to %%hv_overlap
    652 %%hv_overlap:
    653    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    654    mov             r6d, seed
    655    or             seed, 0xeff4eff4
    656    test           seeb, seeh
    657    setp            r7b                     ; parity of top_seed
    658    shr            seed, 16
    659    shl             r7d, 16
    660    test           seeb, seeh
    661    setp            r7b                     ; parity of cur_seed
    662    or              r6d, 0x00010001
    663    xor             r7d, r6d
    664    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
    665 
    666    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
    667                offx, offy, see, lstride, luma, left_offxy, top_offxy, topleft_offxy
    668 
    669    lea  topleft_offxyq, [top_offxyq+(32>>%2)]
    670    lea     left_offxyq, [offyq+(32>>%2)]
    671    rorx          offyd, seed, 8
    672    rorx          offxd, seed, 12
    673    and           offyd, 0xf000f
    674    and           offxd, 0xf000f
    675    imul          offyd, 164>>%3
    676    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    677    lea           offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
    678 
    679    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
    680                h, offxy, see, lstride, luma, left_offxy, top_offxy, topleft_offxy
    681 
    682    mov      grain_lutq, grain_lutmp
    683    mov              hd, hm
    684    movzx    top_offxyd, offxyw
    685    shr          offxyd, 16
    686 
    687    ; grain = grain_lut[offy+y][offx+x]
    688 %if %2
    689    movd           xm16, [grain_lutq+left_offxyq*2+82*0]
    690    vinserti32x4    m16, [grain_lutq+left_offxyq*2+82*2], 2
    691    movd           xm17, [grain_lutq+left_offxyq*2+82*4]
    692    vinserti32x4    m17, [grain_lutq+left_offxyq*2+82*6], 2
    693    movu           ym18, [grain_lutq+offxyq*2+82*0]
    694    vinserti32x8    m18, [grain_lutq+offxyq*2+82*2], 1
    695    movu           ym19, [grain_lutq+offxyq*2+82*4]
    696    vinserti32x8    m19, [grain_lutq+offxyq*2+82*6], 1
    697    punpckldq       m16, m17
    698    punpckldq       m17, m18, m19
    699    punpcklwd       m16, m17
    700    movu            ym1, [grain_lutq+top_offxyq*2+82*0]
    701    movd           xm17, [grain_lutq+topleft_offxyq*2+82*0]
    702    mova             m0, m20
    703    vpdpwssd         m0, m16, m10
    704 %if %3
    705    punpcklwd      xm17, xm1
    706    mova           xm16, xm20
    707    vpdpwssd       xm16, xm17, xm10
    708    psrad          xm16, 1
    709 %else
    710    vinserti32x8     m1, [grain_lutq+top_offxyq*2+82*2], 1
    711    vinserti32x4    m17, [grain_lutq+topleft_offxyq*2+82*2], 2
    712    punpcklwd       m17, m1
    713    mova            m16, m20
    714    vpdpwssd        m16, m17, m10
    715    psrad           m16, 1
    716 %endif
    717    psrad            m0, 1
    718    packssdw         m0, m16
    719    vpsravw          m0, m21
    720    vmovdqa32   m18{k3}, m0
    721    vpshufd     m19{k3}, m0, q0321
    722 %if %3
    723    vpunpckhdq  ym1{k3}, ym0, ym0
    724    punpcklwd      ym17, ym1, ym18
    725    punpckhwd       ym1, ym18
    726 %else
    727    vpunpckhdq   m1{k3}, m0, m0
    728    punpcklwd       m16, m1, m18
    729    punpckhwd       m17, m1, m18
    730 %endif
    731 %else
    732    movu            m18, [grain_lutq+offxyq*2+82*0]
    733    movu            m19, [grain_lutq+top_offxyq*2+82*0]
    734    movd           xm17, [grain_lutq+left_offxyq*2+82*0]
    735    pinsrd         xm17, [grain_lutq+topleft_offxyq*2+82*0], 1
    736    punpckldq      xm16, xm18, xm19
    737    punpcklwd      xm17, xm16
    738    movu             m2, [grain_lutq+offxyq*2+82*2]
    739    movu             m0, [grain_lutq+top_offxyq*2+82*2]
    740    movd           xm16, [grain_lutq+left_offxyq*2+82*2]
    741    pinsrd         xm16, [grain_lutq+topleft_offxyq*2+82*2], 1
    742    punpckldq       xm1, xm2, xm0
    743    punpcklwd       xm1, xm16, xm1
    744    mova           xm16, xm20
    745    vpdpwssd       xm16, xm17, xm10
    746    mova           xm17, xm20
    747    vpdpwssd       xm17, xm1, xm10
    748    punpckhwd        m1, m19, m18
    749    punpcklwd       m19, m18
    750    punpckhwd       m18, m2, m0
    751    punpcklwd        m2, m0
    752    psrad          xm16, 1
    753    psrad          xm17, 1
    754    packssdw       xm16, xm17
    755    vpsravw        xm16, xm21
    756    vpshuflw    m19{k4}, m16, q1302
    757    punpckhqdq     xm16, xm16
    758    vpshuflw     m2{k4}, m16, q3120
    759 %endif
    760    call %%add_noise_v
    761    sub              hb, 2<<%2
    762    jg %%loop_y_h_overlap
    763    add              wq, 32>>%2
    764    jge .end
    765    mov            srcq, r9mp
    766    mov            dstq, r10mp
    767    mov           lumaq, r11mp
    768    lea            srcq, [srcq+wq*2]
    769    lea            dstq, [dstq+wq*2]
    770    lea           lumaq, [lumaq+wq*(2<<%2)]
    771    jmp %%hv_overlap
    772 
    773 ALIGN function_align
        ; ------------------------------------------------------------------
        ; %%add_noise_v / %%add_noise (subroutine, entered with `call`)
        ;
        ; Macro parameters as used in this block:
        ;   %1  1 in the first expansion of the enclosing macro, 0 in the
        ;       expansion placed after the .csfl label
        ;   %2  set for the 420 and 422 instantiations (presumably the
        ;       horizontal chroma subsampling flag - confirm at FGUV_FN)
        ;   %3  set for the 420 instantiation only (presumably the vertical
        ;       chroma subsampling flag - confirm at FGUV_FN)
        ;
        ; %%add_noise_v first finishes a blend of word pairs that the caller
        ; left interleaved in registers: each dword result is
        ;   (m20 + pair . m11) >> 1, saturated to a signed word, then shifted
        ; right arithmetically by the per-word counts in m21. m20, m11 and
        ; m21 are initialised outside this view (presumably rounding bias,
        ; overlap weights and grain shift - TODO confirm). Execution then
        ; falls through into %%add_noise.
        ;
        ; %%add_noise expects the grain values in m18/m19, applies the
        ; luma-dependent scaling to them, adds the result to two source
        ; vectors, clamps to [m8, m9] and stores to dstq. It advances
        ; grain_lutq, lumaq, srcq and dstq before returning.
        ; ------------------------------------------------------------------
    774 %%add_noise_v:
    775 %if %3
        ; Only 256-bit results are computed here; they are merged into m18
        ; under mask k1, the remaining lanes of m18 keep the caller's values.
    776    mova           ym16, ym20
    777    vpdpwssd       ym16, ym17, ym11
    778    mova           ym17, ym20
    779    vpdpwssd       ym17, ym1, ym11
    780    psrad          ym16, 1
    781    psrad          ym17, 1
    782    packssdw       ym16, ym17
    783    vpsravw     m18{k1}, m16, m21
    784 %elif %2
        ; Full-width blend of the pairs in m16 and m17, result in m18.
    785    mova            m18, m20
    786    vpdpwssd        m18, m16, m11
    787    mova            m16, m20
    788    vpdpwssd        m16, m17, m11
    789    psrad           m18, 1
    790    psrad           m16, 1
    791    packssdw        m18, m16
    792    vpsravw         m18, m21
    793 %else
        ; Four accumulations (pairs in m1, m18, m19, m2) producing two
        ; blended grain vectors in m18 and m19.
    794    mova            m16, m20
    795    vpdpwssd        m16, m1, m11
    796    mova            m17, m20
    797    vpdpwssd        m17, m18, m11
    798    mova            m18, m20
    799    vpdpwssd        m18, m19, m11
    800    mova            m19, m20
    801    vpdpwssd        m19, m2, m11
    802    REPX   {psrad x, 1}, m16, m17, m18, m19
    803    packssdw        m18, m16
    804    packssdw        m19, m17
    805    vpsravw         m18, m21
    806    vpsravw         m19, m21
    807 %endif
    808 %%add_noise:
    809 %if %2
        ; Read four luma rows (row step is lstride<<%3, so every other row
        ; when %3 is set). Each pair of rows is rearranged twice with the
        ; word-permute indices in m12 and m13 and the two arrangements are
        ; averaged with rounding (pavgw) into m2 / m3. m12/m13 are defined
        ; elsewhere; presumably they pick even/odd columns so this averages
        ; horizontal neighbours - TODO confirm.
    810    mova             m2, [lumaq+lstrideq*(0<<%3)]
    811    mova             m0, [lumaq+lstrideq*(1<<%3)]
    812    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
    813    mova             m3, [lumaq+lstrideq*(0<<%3)]
    814    mova             m1, [lumaq+lstrideq*(1<<%3)]
    815    mova            m16, m12
    816    vpermi2w        m16, m2, m0
    817    vpermt2w         m2, m13, m0
    818    mova            m17, m12
    819    vpermi2w        m17, m3, m1
    820    vpermt2w         m3, m13, m1
    821    pavgw            m2, m16
    822    pavgw            m3, m17
    823 %elif %1
        ; No luma averaging in this configuration: two luma rows are loaded as is.
    824    mova             m2, [lumaq+lstrideq*0]
    825    mova             m3, [lumaq+lstrideq*1]
    826 %endif
    827 %if %2
        ; First source vector: two 256-bit rows packed into one zmm.
    828    mova           ym16, [srcq+strideq*0]
    829    vinserti32x8    m16, [srcq+strideq*1], 1
    830    lea            srcq, [srcq+strideq*2]
    831 %else
    832    mova            m16, [srcq+strideq*0]
    833 %endif
    834 %if %1
        ; Combine luma (m2) and source (m16): words are interleaved into
        ; (luma, src) pairs and each dword becomes m14 + pair . m15.
        ; m0 holds the results for the high interleave, m2 for the low one.
    835    punpckhwd       m17, m2, m16
    836    mova             m0, m14
    837    vpdpwssd         m0, m17, m15
    838    punpcklwd       m17, m2, m16
    839    mova             m2, m14
    840    vpdpwssd         m2, m17, m15
    841 %endif
    842 %if %2
        ; Second source vector (next two rows; srcq was advanced above).
    843    mova           ym17, [srcq+strideq*0]
    844    vinserti32x8    m17, [srcq+strideq*1], 1
    845 %else
    846    mova            m17, [srcq+strideq*1]
    847 %endif
    848 %if %1
        ; Finish the first combination (>>6, pack to unsigned words), repeat
        ; it for m3/m17, then cap both results at m4 (unsigned minimum).
    849    psrad            m0, 6
    850    psrad            m2, 6
    851    packusdw         m2, m0
    852    punpckhwd        m0, m3, m17
    853    mova             m1, m14
    854    vpdpwssd         m1, m15, m0
    855    punpcklwd        m0, m3, m17
    856    mova             m3, m14
    857    vpdpwssd         m3, m15, m0
    858    psrad            m1, 6
    859    psrad            m3, 6
    860    packusdw         m3, m1
    861    pminuw           m2, m4
    862    pminuw           m3, m4
    863 
        ; .add_noise_main is emitted only in the %1 = 1 expansion; the
        ; %1 = 0 expansion jumps here from its %else branch below.
        ; On entry m2/m3 hold the 16-bit scaling indices and m16/m17 the
        ; source pixels.
    864 .add_noise_main:
    865    ; scaling[luma_src]
        ; Each dword of m2/m3 carries two word indices: the low one is
        ; isolated with the mask in m5, the high one with a 16-bit shift.
        ; k5 is reloaded from k1 before every gather because vpgatherdd
        ; clears its mask register. The two gather results per vector are
        ; merged with a byte shuffle (control m6) under mask k2.
    866    kmovw            k5, k1
    867    pand             m1, m5, m2
    868    vpgatherdd   m0{k5}, [scalingq+m1]
    869    kmovw            k5, k1
    870    psrld            m2, 16
    871    vpgatherdd   m1{k5}, [scalingq+m2]
    872    vpshufb      m0{k2}, m1, m6
    873    kmovw            k5, k1
    874    psrld            m1, m3, 16
    875    vpgatherdd   m2{k5}, [scalingq+m1]
    876    kmovw            k5, k1
    877    pand             m3, m5
    878    vpgatherdd   m1{k5}, [scalingq+m3]
    879    vpshufb      m1{k2}, m2, m6
    880 
    881    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
        ; The scaling words are pre-shifted left by the counts in m7 so that
        ; pmulhrsw (rounded high multiply) yields the scaled noise in m18/m19.
    882    vpsllvw          m0, m7
    883    vpsllvw          m1, m7
    884    pmulhrsw        m18, m0
    885    pmulhrsw        m19, m1
        ; Advance the grain, luma and source pointers for the next call.
    886    add      grain_lutq, 82*(4<<%2)
    887    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
    888    lea            srcq, [srcq+strideq*2]
        ; dst = clamp(src + noise, m8, m9)
    889    paddw           m16, m18
    890    paddw           m17, m19
    891    pmaxsw          m16, m8
    892    pmaxsw          m17, m8
    893    pminsw          m16, m9
    894    pminsw          m17, m9
    895 %if %2
        ; Each zmm holds two 256-bit rows: store the halves to consecutive rows.
    896    mova          [dstq+strideq*0], ym16
    897    vextracti32x8 [dstq+strideq*1], m16, 1
    898    lea            dstq, [dstq+strideq*2]
    899    mova          [dstq+strideq*0], ym17
    900    vextracti32x8 [dstq+strideq*1], m17, 1
    901 %else
    902    mova [dstq+strideq*0], m16
    903    mova [dstq+strideq*1], m17
    904 %endif
    905    lea            dstq, [dstq+strideq*2]
    906    ret
    907 %else
        ; %1 = 0 expansion: luma is used as the scaling index directly,
        ; only masked with m4, then the shared tail above is reused.
    908 %if %2
    909    pand             m2, m4
    910    pand             m3, m4
    911 %else
    912    pand             m2, m4, [lumaq+lstrideq*0]
    913    pand             m3, m4, [lumaq+lstrideq*1]
    914 %endif
    915    jmp .add_noise_main
    916 %endif
    917 %endmacro
    918 
    919    %%FGUV_32x32xN_LOOP 1, %2, %3 ; first expansion, %1 = 1: luma and source are combined before the scaling lookup
    920 .csfl:
        ; Second expansion, %1 = 0: luma alone (masked with m4) indexes the
        ; scaling table. This expansion must follow the first one because it
        ; jumps to .add_noise_main, which only the %1 = 1 expansion emits.
        ; NOTE(review): "csfl" presumably means chroma scaling from luma;
        ; the branch that targets .csfl is outside this view.
    921    %%FGUV_32x32xN_LOOP 0, %2, %3
    922 .end:
        ; Common exit, reached by `jge .end` once wq is no longer negative.
    923    RET
    924 %endmacro
    925 
    926 FGUV_FN 420, 1, 1 ; arguments: layout tag, then the %2 and %3 flags forwarded to %%FGUV_32x32xN_LOOP
    927 FGUV_FN 422, 1, 0 ; NOTE(review): %2/%3 are presumably the horizontal/vertical subsampling flags - confirm at the FGUV_FN header
    928 FGUV_FN 444, 0, 0 ; neither flag set: the full-width code paths are assembled
    929 
    930 %endif