tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

filmgrain_avx2.asm (65296B)


      1 ; Copyright © 2019-2022, VideoLAN and dav1d authors
      2 ; Copyright © 2019-2022, Two Orioles, LLC
      3 ; All rights reserved.
      4 ;
      5 ; Redistribution and use in source and binary forms, with or without
      6 ; modification, are permitted provided that the following conditions are met:
      7 ;
      8 ; 1. Redistributions of source code must retain the above copyright notice, this
      9 ;    list of conditions and the following disclaimer.
     10 ;
     11 ; 2. Redistributions in binary form must reproduce the above copyright notice,
     12 ;    this list of conditions and the following disclaimer in the documentation
     13 ;    and/or other materials provided with the distribution.
     14 ;
     15 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     16 ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     17 ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     18 ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     19 ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     20 ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     21 ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     22 ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     24 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25 
     26 %include "config.asm"
     27 %include "ext/x86/x86inc.asm"
     28 %include "x86/filmgrain_common.asm"
     29 
     30 %if ARCH_X86_64
     31 
     32 SECTION_RODATA 32
     33 pb_mask:       db  0,128,128,  0,128,  0,  0,128,128,  0,  0,128,  0,128,128,  0  ; pshufb LUT: sets bit 15 of the next 4 LFSR seeds
     34 gen_shufE:     db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15  ; pshufb patterns building overlapping
     35 gen_shufA:     db  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9  ; neighbor-word pairs for pmaddwd
     36 gen_shufB:     db  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11
     37 gen_shufC:     db  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13
     38 gen_shufD:     db  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
     39 ; note: the order of (some of) the following constants matter
     40 pb_27_17:      times 2 db 27, 17
     41 byte_blend:            db  0,  0,  0, -1  ; vpblendvb masks (read at +0 and +1) for sliding-window splices
     42 pb_27_17_17_27:        db 27, 17, 17, 27,  0, 32,  0, 32
     43 pb_17_27:      times 2 db 17, 27
     44 pb_1:          times 4 db 1
     45 pb_23_22:              db 23, 22,  0, 32,  0, 32,  0, 32
     46 next_upperbit_mask:    dw 0x100B, 0x2016, 0x402C, 0x8058  ; per-word feedback-tap masks for the 4 parallel seeds
     47 pw_seed_xor:   times 2 dw 0xb524  ; chroma seed XOR constants, indexed by uv plane
     48               times 2 dw 0x49d8
     49 fg_min:        times 4 db 0       ; clip limits; 16/240/235 presumably the limited
     50               times 4 db 16      ; (video) range variants -- TODO confirm
     51 fg_max:        times 4 db 255
     52               times 4 db 240
     53               times 4 db 235
     54 pd_m65536:             dd -65536
     55 pw_8:          times 2 dw 8
     56 pw_1024:       times 2 dw 1024
     57 hmul_bits:             dw 32768, 16384,  8192,  4096  ; pmulhuw by 2^15>>n == logical right shift by n+1
     58 round:                 dw  2048,  1024,   512         ; grain rounding, indexed by grain_scale_shift
     59 mul_bits:              dw   256,   128,    64,    32,    16  ; left-shift multipliers for the seed update
     60 round_vals:            dw    32,    64,   128,   256,   512  ; AR rounding terms, indexed by ar_coeff_shift
     61 pw_1:                  dw 1
     62 
     63 %macro JMP_TABLE 2-*  ; args: function name, isa suffix, then one AR-lag number per table entry
     64    %1_8bpc_%2_table:
     65    %xdefine %%base %1_8bpc_%2_table
     66    %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
     67    %rep %0 - 2          ; one dd entry per variadic (lag) argument
     68        dd %%prefix %+ .ar%3 - %%base  ; offset of the .ar<lag> label relative to the table base
     69        %rotate 1
     70    %endrep
     71 %endmacro
     72 
     73 JMP_TABLE generate_grain_y,      avx2, 0, 1, 2, 3  ; per-function jump tables of .ar0-.ar3 offsets
     74 JMP_TABLE generate_grain_uv_420, avx2, 0, 1, 2, 3
     75 JMP_TABLE generate_grain_uv_422, avx2, 0, 1, 2, 3
     76 JMP_TABLE generate_grain_uv_444, avx2, 0, 1, 2, 3
     77 
     78 SECTION .text
     79 
     80 INIT_YMM avx2
     81 cglobal generate_grain_y_8bpc, 2, 9, 8, buf, fg_data
        ; Fill the 82x73 luma grain buffer: four 16-bit LFSR seeds are advanced
        ; in parallel, each output seed (>>5) indexes gaussian_sequence, and the
        ; looked-up values are rounded and stored as int8. Afterwards control
        ; tail-jumps to the AR filter (.ar0-.ar3) selected by ar_coeff_lag.
     82 %define base r4-generate_grain_y_8bpc_avx2_table
     83    lea              r4, [generate_grain_y_8bpc_avx2_table]
     84    vpbroadcastw    xm0, [fg_dataq+FGData.seed]  ; xm0 = seed in every word
     85    mov             r6d, [fg_dataq+FGData.grain_scale_shift]
     86    movq            xm1, [base+next_upperbit_mask]
     87    movsxd           r5, [fg_dataq+FGData.ar_coeff_lag]  ; lag 0-3: jump-table index
     88    movq            xm4, [base+mul_bits]
     89    movq            xm5, [base+hmul_bits]
     90    mov              r7, -73*82           ; r7 = -(grain bytes left to write)
     91    mova            xm6, [base+pb_mask]
     92    sub            bufq, r7               ; bufq -> buffer end; stores go via [bufq+r7]
     93    vpbroadcastw    xm7, [base+round+r6*2] ; rounding constant for grain_scale_shift
     94    lea              r6, [gaussian_sequence]
     95    movsxd           r5, [r4+r5*4]        ; r5 = offset of the .ar<lag> handler
     96 .loop:
     97    pand            xm2, xm0, xm1
     98    psrlw           xm3, xm2, 10
     99    por             xm2, xm3            ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    100    pmullw          xm2, xm4            ; bits 0x0f00 are set
    101    pmulhuw         xm0, xm5            ; per-word right shift (by 1-4) via mulhi
    102    pshufb          xm3, xm6, xm2       ; set 15th bit for next 4 seeds
    103    psllq           xm2, xm3, 30
    104    por             xm2, xm3
    105    psllq           xm3, xm2, 15
    106    por             xm2, xm0            ; aggregate each bit into next seed's high bit
    107    por             xm3, xm2            ; 4 next output seeds
    108    pshuflw         xm0, xm3, q3333     ; xm0 = newest seed replicated to the low words
    109    psrlw           xm3, 5              ; seeds>>5 = gaussian_sequence indices
    110    pand            xm2, xm0, xm1       ; second, interleaved seed-update round
    111    movq             r2, xm3            ; r2 = 4 LUT indices (16 bits each)
    112    psrlw           xm3, xm2, 10
    113    por             xm2, xm3
    114    pmullw          xm2, xm4
    115    pmulhuw         xm0, xm5
    116    movzx           r3d, r2w
    117    pshufb          xm3, xm6, xm2
    118    psllq           xm2, xm3, 30
    119    por             xm2, xm3
    120    psllq           xm3, xm2, 15
    121    por             xm0, xm2
    122    movd            xm2, [r6+r3*2]      ; gather grain value 0
    123    rorx             r3, r2, 32
    124    por             xm3, xm0
    125    shr             r2d, 16
    126    pinsrw          xm2, [r6+r2*2], 1
    127    pshuflw         xm0, xm3, q3333
    128    movzx           r2d, r3w
    129    psrlw           xm3, 5
    130    pinsrw          xm2, [r6+r2*2], 2
    131    shr             r3d, 16
    132    movq             r2, xm3            ; r2 = 4 more LUT indices
    133    pinsrw          xm2, [r6+r3*2], 3
    134    movzx           r3d, r2w
    135    pinsrw          xm2, [r6+r3*2], 4
    136    rorx             r3, r2, 32
    137    shr             r2d, 16
    138    pinsrw          xm2, [r6+r2*2], 5
    139    movzx           r2d, r3w
    140    pinsrw          xm2, [r6+r2*2], 6
    141    shr             r3d, 16
    142    pinsrw          xm2, [r6+r3*2], 7
    143    pmulhrsw        xm2, xm7            ; apply grain_scale_shift rounding
    144    packsswb        xm2, xm2            ; saturate to int8
    145    movq      [bufq+r7], xm2            ; store 8 grain bytes
    146    add              r7, 8
    147    jl .loop
    148 
    149    ; auto-regression code
    150    add              r5, r4             ; resolve table offset to an address
    151    jmp              r5                 ; tail-jump to .ar0/.ar1/.ar2/.ar3
    152 
    153 .ar1:
        ; AR lag-1: the three top-row taps (cf0-cf2) plus the rounding term are
        ; computed 4 pixels at a time with pmaddwd; the serial left tap
        ; (cf3 * previous output) and the shift/clamp to [-128,127] run in a
        ; scalar inner loop, since each pixel depends on the one just written.
    154    DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0
    155    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    156    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]  ; left-neighbor coefficient (serial)
    157    movd            xm5, [fg_dataq+FGData.ar_coeffs_y]
    158    mova            xm2, [base+gen_shufC]
    159    DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0
    160    pinsrb          xm5, [base+pb_1], 3  ; byte 3 := 1 so the rnd word passes through pmaddwd
    161    vpbroadcastw    xm3, [base+round_vals+shiftq*2-12]    ; rnd
    162    pmovsxbw        xm5, xm5
    163    pshufd          xm4, xm5, q0000      ; xm4 = {cf0,cf1} pairs
    164    pshufd          xm5, xm5, q1111      ; xm5 = {cf2,1} pairs (the 1 multiplies rnd)
    165    sub            bufq, 82*73-(82*3+79) ; -> row 3; the 3-row/3-col border stays untouched
    166    mov              hd, 70              ; 70 output rows
    167    mov            mind, -128
    168    mov            maxd, 127
    169 .y_loop_ar1:
    170    mov              xq, -76
    171    movsx         val3d, byte [bufq+xq-1]  ; left neighbor seed for this row
    172 .x_loop_ar1:
    173    pmovsxbw        xm1, [bufq+xq-82-3]  ; y=-1, x=[-3,+4] as words
    174    pshufb          xm0, xm1, xm2
    175    punpckhwd       xm1, xm3             ; interleave top words with rnd
    176    pmaddwd         xm0, xm4
    177    pmaddwd         xm1, xm5
    178    paddd           xm0, xm1             ; xm0 = 4 dwords of top-row contributions
    179 .x_loop_ar1_inner:
    180    movd          val0d, xm0             ; top contribution for the current x
    181    psrldq          xm0, 4
    182    imul          val3d, cf3d            ; cf3 * left (previous output)
    183    add           val3d, val0d
    184    movsx         val0d, byte [bufq+xq]
    185    sarx          val3d, val3d, shiftd   ; >> ar_coeff_shift
    186    add           val3d, val0d
    187    cmp           val3d, maxd            ; clamp to [min,max]
    188    cmovns        val3d, maxd
    189    cmp           val3d, mind
    190    cmovs         val3d, mind
    191    mov       [bufq+xq], val3b
    192    ; keep val3d in-place as left for next x iteration
    193    inc              xq
    194    jz .x_loop_ar1_end
    195    test             xb, 3                ; vectorized terms cover 4 pixels
    196    jnz .x_loop_ar1_inner
    197    jmp .x_loop_ar1
    198 .x_loop_ar1_end:
    199    add            bufq, 82
    200    dec              hd
    201    jg .y_loop_ar1
    202 .ar0:                                    ; lag 0: grain is used unfiltered, just return
    203    RET
    204 
    205 .ar2:
        ; AR lag-2: 12 coefficients. The taps from the two rows above are
        ; computed vectorized for 4 pixels at a time; the two serial left taps
        ; (cf10,cf11) plus the shift run in a scalar inner loop that slides the
        ; y=0 byte window one pixel per iteration.
    206 %if WIN64
    207    %assign stack_size_padded 168
    208    SUB             rsp, stack_size_padded
    209    WIN64_PUSH_XMM   16, 8                ; xmm8-15 are callee-saved on Win64
    210 %endif
    211    DEFINE_ARGS buf, fg_data, h, x
    212    mov             r6d, [fg_dataq+FGData.ar_coeff_shift]
    213    pmovsxbw        xm7, [fg_dataq+FGData.ar_coeffs_y+0]    ; cf0-7
    214    movd            xm9, [fg_dataq+FGData.ar_coeffs_y+8]    ; cf8-11
    215    vpbroadcastd   xm10, [base+round_vals-14+r6*2]
    216    movd           xm11, [base+byte_blend+1]  ; blend mask for the sliding y=0 window
    217    pmovsxbw        xm9, xm9
    218    pshufd          xm4, xm7, q0000       ; {cf0,cf1} pairs
    219    mova           xm12, [base+gen_shufA]
    220    pshufd          xm5, xm7, q3333       ; {cf6,cf7} pairs
    221    mova           xm13, [base+gen_shufB]
    222    pshufd          xm6, xm7, q1111       ; {cf2,cf3} pairs
    223    mova           xm14, [base+gen_shufC]
    224    pshufd          xm7, xm7, q2222       ; {cf4,cf5} pairs
    225    mova           xm15, [base+gen_shufD]
    226    pshufd          xm8, xm9, q0000       ; {cf8,cf9} pairs
    227    psrld          xm10, 16               ; xm10 = rnd in each dword
    228    pshufd          xm9, xm9, q1111       ; {cf10,cf11} pairs (left taps)
    229    sub            bufq, 82*73-(82*3+79)  ; -> row 3; 3-row/3-col border untouched
    230    mov              hd, 70
    231 .y_loop_ar2:
    232    mov              xq, -76
    233 .x_loop_ar2:
    234    pmovsxbw        xm0, [bufq+xq-82*2-2]   ; y=-2,x=[-2,+5]
    235    pmovsxbw        xm1, [bufq+xq-82*1-2]   ; y=-1,x=[-2,+5]
    236    pshufb          xm2, xm0, xm12
    237    pmaddwd         xm2, xm4
    238    pshufb          xm3, xm1, xm13
    239    pmaddwd         xm3, xm5
    240    paddd           xm2, xm3
    241    pshufb          xm3, xm0, xm14
    242    pmaddwd         xm3, xm6
    243    punpckhqdq      xm0, xm0             ; y=-2,x=[+2,+5]
    244    punpcklwd       xm0, xm1
    245    pmaddwd         xm0, xm7
    246    pshufb          xm1, xm15
    247    pmaddwd         xm1, xm8
    248    paddd           xm2, xm10
    249    paddd           xm2, xm3
    250    paddd           xm0, xm1
    251    paddd           xm2, xm0             ; xm2 = top contributions + rnd for 4 pixels
    252    movq            xm0, [bufq+xq-2]        ; y=0,x=[-2,+5]
    253 .x_loop_ar2_inner:
    254    pmovsxbw        xm1, xm0
    255    pmaddwd         xm3, xm9, xm1        ; cf10*px[x-2] + cf11*px[x-1]
    256    psrldq          xm1, 4                  ; y=0,x=0
    257    paddd           xm3, xm2
    258    psrldq          xm2, 4                  ; shift top to next pixel
    259    psrad           xm3, [fg_dataq+FGData.ar_coeff_shift]
    260    ; don't packssdw since we only care about one value
    261    paddw           xm3, xm1             ; add current grain (only the low word matters)
    262    packsswb        xm3, xm3
    263    pextrb    [bufq+xq], xm3, 0
    264    pslldq          xm3, 2
    265    vpblendvb       xm0, xm3, xm11       ; splice the new pixel into the window
    266    psrldq          xm0, 1
    267    inc              xq
    268    jz .x_loop_ar2_end
    269    test             xb, 3                ; vectorized terms cover 4 pixels
    270    jnz .x_loop_ar2_inner
    271    jmp .x_loop_ar2
    272 .x_loop_ar2_end:
    273    add            bufq, 82
    274    dec              hd
    275    jg .y_loop_ar2
    276    RET
    277 
    278 INIT_YMM avx2
    279 .ar3:
        ; AR lag-3: 24 coefficients. The coefficient word-pairs are prestaged on
        ; the stack ([rsp+16*0..16*10]) because 16 vector registers are not
        ; enough; taps from y=-3..-1 are computed vectorized for 4 pixels, and
        ; the serial left taps (cf21-cf23) run in the scalar inner loop.
    280 %if WIN64
    281    ALLOC_STACK   16*14
    282    %assign stack_size stack_size - 16*4
    283    WIN64_PUSH_XMM   12, 8
    284 %else
    285    ALLOC_STACK   16*12
    286 %endif
    287    mov             r6d, [fg_dataq+FGData.ar_coeff_shift]
    288    movd           xm11, [base+byte_blend]  ; blend mask for the y=0 sliding window
    289    pmovsxbw         m1, [fg_dataq+FGData.ar_coeffs_y+ 0]   ; cf0-15
    290    pmovsxbw        xm2, [fg_dataq+FGData.ar_coeffs_y+16]   ; cf16-23
    291    pshufd           m0, m1, q0000
    292    mova    [rsp+16* 0], m0               ; {cf0,cf1} pairs | {cf8,cf9} pairs
    293    pshufd           m0, m1, q1111
    294    mova    [rsp+16* 2], m0               ; {cf2,cf3} | {cf10,cf11}
    295    pshufd           m0, m1, q2222
    296    mova    [rsp+16* 4], m0               ; {cf4,cf5} | {cf12,cf13}
    297    pshufd           m1, m1, q3333
    298    mova    [rsp+16* 6], m1               ; {cf6,cf7} | {cf14,cf15}
    299    pshufd          xm0, xm2, q0000
    300    mova    [rsp+16* 8], xm0              ; {cf16,cf17} pairs
    301    pshufd          xm0, xm2, q1111
    302    mova    [rsp+16* 9], xm0              ; {cf18,cf19} pairs
    303    psrldq          xm7, xm2, 10          ; xm7 = cf21,cf22,cf23 (serial left taps)
    304    mova             m8, [base+gen_shufA]
    305    pinsrw          xm2, [base+pw_1], 5
    306    mova             m9, [base+gen_shufC]
    307    pshufd          xm2, xm2, q2222       ; {cf20,1} pairs (the 1 multiplies rnd)
    308    movu            m10, [base+gen_shufE]
    309    vpbroadcastw    xm6, [base+round_vals-12+r6*2]  ; rnd word
    310    pinsrw          xm7, [base+round_vals+r6*2-10], 3  ; word 3 := rnd term
    311    mova    [rsp+16*10], xm2
    312    DEFINE_ARGS buf, fg_data, h, x
    313    sub            bufq, 82*73-(82*3+79)  ; -> row 3; 3-row/3-col border untouched
    314    mov              hd, 70
    315 .y_loop_ar3:
    316    mov              xq, -76
    317 .x_loop_ar3:
    318    movu            xm5, [bufq+xq-82*3-3]    ; y=-3,x=[-3,+12]
    319    vinserti128      m5, [bufq+xq-82*2-3], 1 ; y=-2,x=[-3,+12]
    320    movu            xm4, [bufq+xq-82*1-3]    ; y=-1,x=[-3,+12]
    321    punpcklbw        m3, m5, m5           ; sign-extend bytes to words
    322    punpckhwd        m5, m4               ; via punpck + arithmetic shift
    323    psraw            m3, 8
    324    punpcklbw        m5, m5
    325    psraw            m5, 8
    326    punpcklbw       xm4, xm4
    327    psraw           xm4, 8
    328    pshufb           m0, m3, m8
    329    pmaddwd          m0, [rsp+16*0]
    330    pshufb           m1, m3, m9
    331    pmaddwd          m1, [rsp+16*2]
    332    shufps           m2, m3, m5, q1032
    333    paddd            m0, m1
    334    pshufb           m1, m2, m8
    335    vperm2i128       m3, m4, 0x21
    336    pmaddwd          m1, [rsp+16*4]
    337    shufps          xm2, xm3, q1021
    338    vpblendd         m2, m3, 0xf0
    339    pshufb           m2, m10
    340    paddd            m0, m1
    341    pmaddwd          m2, [rsp+16*6]
    342    pshufb          xm1, xm4, xm9
    343    pmaddwd         xm1, [rsp+16*8]
    344    shufps          xm4, xm5, q1132
    345    paddd            m0, m2
    346    pshufb          xm2, xm4, xm8
    347    pshufd          xm4, xm4, q2121
    348    pmaddwd         xm2, [rsp+16*9]
    349    punpcklwd       xm4, xm6             ; pair last tap with rnd
    350    pmaddwd         xm4, [rsp+16*10]     ; cf20*tap + rnd via pmaddwd
    351    vextracti128    xm3, m0, 1
    352    paddd           xm0, xm1
    353    movq            xm1, [bufq+xq-3]        ; y=0,x=[-3,+4]
    354    paddd           xm2, xm4
    355    paddd           xm0, xm2
    356    paddd           xm0, xm3             ; xm0 = 4 dwords of top contributions
    357 .x_loop_ar3_inner:
    358    pmovsxbw        xm2, xm1
    359    pmaddwd         xm2, xm7             ; left taps cf21-23 (+rnd in dword 1)
    360    pshufd          xm3, xm2, q1111
    361    paddd           xm2, xm0                ; add top
    362    paddd           xm2, xm3                ; left+cur
    363    psrldq          xm0, 4
    364    psrad           xm2, [fg_dataq+FGData.ar_coeff_shift]
    365    ; don't packssdw since we only care about one value
    366    packsswb        xm2, xm2
    367    pextrb    [bufq+xq], xm2, 0
    368    pslldq          xm2, 3
    369    vpblendvb       xm1, xm2, xm11       ; splice the new pixel into the window
    370    psrldq          xm1, 1
    371    inc              xq
    372    jz .x_loop_ar3_end
    373    test             xb, 3                ; vectorized terms cover 4 pixels
    374    jnz .x_loop_ar3_inner
    375    jmp .x_loop_ar3
    376 .x_loop_ar3_end:
    377    add            bufq, 82
    378    dec              hd
    379    jg .y_loop_ar3
    380    RET
    381 
    382 %macro GEN_GRAIN_UV_FN 3 ; ss_name, ss_x, ss_y
    383 INIT_XMM avx2
    384 cglobal generate_grain_uv_%1_8bpc, 4, 10, 16, buf, bufy, fg_data, uv
    385 %define base r4-generate_grain_uv_%1_8bpc_avx2_table
    386    lea              r4, [generate_grain_uv_%1_8bpc_avx2_table]
    387    vpbroadcastw    xm0, [fg_dataq+FGData.seed]
    388    mov             r6d, [fg_dataq+FGData.grain_scale_shift]
    389    movq            xm1, [base+next_upperbit_mask]
    390    movq            xm4, [base+mul_bits]
    391    movq            xm5, [base+hmul_bits]
    392    mova            xm6, [base+pb_mask]
    393    vpbroadcastw    xm7, [base+round+r6*2]
    394    vpbroadcastd    xm2, [base+pw_seed_xor+uvq*4]
    395    pxor            xm0, xm2
    396    lea              r6, [gaussian_sequence]
    397 %if %2
    398    mov             r7d, 73-35*%3
    399    add            bufq, 44
    400 .loop_y:
    401    mov              r5, -44
    402 %else
    403    mov              r5, -73*82
    404    sub            bufq, r5
    405 %endif
    406 .loop:
    407    pand            xm2, xm0, xm1
    408    psrlw           xm3, xm2, 10
    409    por             xm2, xm3            ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    410    pmullw          xm2, xm4            ; bits 0x0f00 are set
    411    pmulhuw         xm0, xm5
    412    pshufb          xm3, xm6, xm2       ; set 15th bit for next 4 seeds
    413    psllq           xm2, xm3, 30
    414    por             xm2, xm3
    415    psllq           xm3, xm2, 15
    416    por             xm2, xm0            ; aggregate each bit into next seed's high bit
    417    por             xm2, xm3            ; 4 next output seeds
    418    pshuflw         xm0, xm2, q3333
    419    psrlw           xm2, 5
    420    movq             r8, xm2
    421    movzx           r9d, r8w
    422    movd            xm2, [r6+r9*2]
    423    rorx             r9, r8, 32
    424    shr             r8d, 16
    425    pinsrw          xm2, [r6+r8*2], 1
    426    movzx           r8d, r9w
    427    pinsrw          xm2, [r6+r8*2], 2
    428    shr             r9d, 16
    429    pinsrw          xm2, [r6+r9*2], 3
    430    pmulhrsw        xm2, xm7
    431    packsswb        xm2, xm2
    432    movd      [bufq+r5], xm2
    433    add              r5, 4
    434    jl .loop
    435 %if %2
    436    add            bufq, 82
    437    dec             r7d
    438    jg .loop_y
    439 %endif
    440 
    441    ; auto-regression code
    442    movsxd           r6, [fg_dataq+FGData.ar_coeff_lag]
    443    movsxd           r6, [base+generate_grain_uv_%1_8bpc_avx2_table+r6*4]
    444    add              r6, r4
    445    jmp              r6
    446 
    447 INIT_YMM avx2
    448 .ar0:
    449    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
    450    imul            uvd, 28
    451    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    452    movd            xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq]
    453    movd            xm3, [base+hmul_bits+shiftq*2]
    454    DEFINE_ARGS buf, bufy, h
    455    pmovsxbw        xm2, xm2
    456 %if %2
    457    vpbroadcastd     m7, [base+pb_1]
    458    vpbroadcastw     m6, [base+hmul_bits+2+%3*2]
    459 %endif
    460    vpbroadcastw     m2, xm2
    461    vpbroadcastw     m3, xm3
    462    pxor            m12, m12
    463 %if %2
    464    sub            bufq, 82*(73-35*%3)+82-(82*3+41)
    465 %else
    466    sub            bufq, 82*70-3
    467 %endif
    468    add           bufyq, 3+82*3
    469    mov              hd, 70-35*%3
    470 .y_loop_ar0:
    471 %if %2
    472    ; first 32 pixels
    473    movu            xm4, [bufyq]
    474    vinserti128      m4, [bufyq+32], 1
    475 %if %3
    476    movu            xm0, [bufyq+82]
    477    vinserti128      m0, [bufyq+82+32], 1
    478 %endif
    479    movu            xm5, [bufyq+16]
    480    vinserti128      m5, [bufyq+48], 1
    481 %if %3
    482    movu            xm1, [bufyq+82+16]
    483    vinserti128      m1, [bufyq+82+48], 1
    484 %endif
    485    pmaddubsw        m4, m7, m4
    486 %if %3
    487    pmaddubsw        m0, m7, m0
    488 %endif
    489    pmaddubsw        m5, m7, m5
    490 %if %3
    491    pmaddubsw        m1, m7, m1
    492    paddw            m4, m0
    493    paddw            m5, m1
    494 %endif
    495    pmulhrsw         m4, m6
    496    pmulhrsw         m5, m6
    497 %else
    498    xor             r3d, r3d
    499    ; first 32x2 pixels
    500 .x_loop_ar0:
    501    movu             m4, [bufyq+r3]
    502    pcmpgtb          m0, m12, m4
    503    punpckhbw        m5, m4, m0
    504    punpcklbw        m4, m0
    505 %endif
    506    pmullw           m4, m2
    507    pmullw           m5, m2
    508    pmulhrsw         m4, m3
    509    pmulhrsw         m5, m3
    510 %if %2
    511    movu             m1, [bufq]
    512 %else
    513    movu             m1, [bufq+r3]
    514 %endif
    515    pcmpgtb          m8, m12, m1
    516    punpcklbw        m0, m1, m8
    517    punpckhbw        m1, m8
    518    paddw            m0, m4
    519    paddw            m1, m5
    520    packsswb         m0, m1
    521 %if %2
    522    movu         [bufq], m0
    523 %else
    524    movu      [bufq+r3], m0
    525    add             r3d, 32
    526    cmp             r3d, 64
    527    jl .x_loop_ar0
    528 %endif
    529 
    530    ; last 6/12 pixels
    531    movu            xm4, [bufyq+32*2]
    532 %if %2
    533 %if %3
    534    movu            xm5, [bufyq+32*2+82]
    535 %endif
    536    pmaddubsw       xm4, xm7, xm4
    537 %if %3
    538    pmaddubsw       xm5, xm7, xm5
    539    paddw           xm4, xm5
    540 %endif
    541    movq            xm0, [bufq+32]
    542    pmulhrsw        xm4, xm6
    543    pmullw          xm4, xm2
    544    pmulhrsw        xm4, xm3
    545    pcmpgtb         xm5, xm12, xm0
    546    punpcklbw       xm5, xm0, xm5
    547    paddw           xm4, xm5
    548    packsswb        xm4, xm4
    549    pblendw         xm0, xm4, xm0, 1000b
    550    movq      [bufq+32], xm0
    551 %else
    552    movu            xm0, [bufq+64]
    553    pcmpgtb         xm1, xm12, xm4
    554    punpckhbw       xm5, xm4, xm1
    555    punpcklbw       xm4, xm1
    556    pmullw          xm5, xm2
    557    pmullw          xm4, xm2
    558    vpblendd        xm1, xm3, xm12, 0x0c
    559    pmulhrsw        xm5, xm1
    560    pmulhrsw        xm4, xm3
    561    pcmpgtb         xm1, xm12, xm0
    562    punpckhbw       xm8, xm0, xm1
    563    punpcklbw       xm0, xm1
    564    paddw           xm5, xm8
    565    paddw           xm0, xm4
    566    packsswb        xm0, xm5
    567    movu      [bufq+64], xm0
    568 %endif
    569    add            bufq, 82
    570    add           bufyq, 82<<%3
    571    dec              hd
    572    jg .y_loop_ar0
    573    RET
    574 
    575 INIT_XMM avx2
    576 .ar1:
    577    DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x, shift
    578    imul            uvd, 28
    579    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    580    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
    581    movd            xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
    582    pinsrb          xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3
    583    DEFINE_ARGS buf, bufy, h, val0, val3, cf3, min, max, x, shift
    584    pmovsxbw        xm4, xm4
    585    pshufd          xm5, xm4, q1111
    586    pshufd          xm4, xm4, q0000
    587    pmovsxwd        xm3, [base+round_vals+shiftq*2-12]    ; rnd
    588 %if %2
    589    vpbroadcastd    xm7, [base+pb_1]
    590    vpbroadcastw    xm6, [base+hmul_bits+2+%3*2]
    591 %endif
    592    vpbroadcastd    xm3, xm3
    593 %if %2
    594    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
    595 %else
    596    sub            bufq, 82*70-(82-3)
    597 %endif
    598    add           bufyq, 79+82*3
    599    mov              hd, 70-35*%3
    600    mov            mind, -128
    601    mov            maxd, 127
    602 .y_loop_ar1:
    603    mov              xq, -(76>>%2)
    604    movsx         val3d, byte [bufq+xq-1]
    605 .x_loop_ar1:
    606    pmovsxbw        xm0, [bufq+xq-82-1]     ; top/left
    607 %if %2
    608    movq            xm8, [bufyq+xq*2]
    609 %if %3
    610    movq            xm9, [bufyq+xq*2+82]
    611 %endif
    612 %endif
    613    psrldq          xm2, xm0, 2             ; top
    614    psrldq          xm1, xm0, 4             ; top/right
    615 %if %2
    616    pmaddubsw       xm8, xm7, xm8
    617 %if %3
    618    pmaddubsw       xm9, xm7, xm9
    619    paddw           xm8, xm9
    620 %endif
    621    pmulhrsw        xm8, xm6
    622 %else
    623    pmovsxbw        xm8, [bufyq+xq]
    624 %endif
    625    punpcklwd       xm0, xm2
    626    punpcklwd       xm1, xm8
    627    pmaddwd         xm0, xm4
    628    pmaddwd         xm1, xm5
    629    paddd           xm0, xm1
    630    paddd           xm0, xm3
    631 .x_loop_ar1_inner:
    632    movd          val0d, xm0
    633    psrldq          xm0, 4
    634    imul          val3d, cf3d
    635    add           val3d, val0d
    636    sarx          val3d, val3d, shiftd
    637    movsx         val0d, byte [bufq+xq]
    638    add           val3d, val0d
    639    cmp           val3d, maxd
    640    cmovns        val3d, maxd
    641    cmp           val3d, mind
    642    cmovs         val3d, mind
    643    mov  byte [bufq+xq], val3b
    644    ; keep val3d in-place as left for next x iteration
    645    inc              xq
    646    jz .x_loop_ar1_end
    647    test             xq, 3
    648    jnz .x_loop_ar1_inner
    649    jmp .x_loop_ar1
    650 
    651 .x_loop_ar1_end:
    652    add            bufq, 82
    653    add           bufyq, 82<<%3
    654    dec              hd
    655    jg .y_loop_ar1
    656    RET
    657 
    658 .ar2:
    659    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
    660    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    661    imul            uvd, 28
    662    vpbroadcastw   xm13, [base+round_vals-12+shiftq*2]
    663    pmovsxbw        xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+0]   ; cf0-7
    664    pmovsxbw        xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+8]   ; cf8-12
    665    pinsrw          xm0, [base+pw_1], 5
    666 %if %2
    667    vpbroadcastw   xm12, [base+hmul_bits+2+%3*2]
    668    vpbroadcastd   xm11, [base+pb_1]
    669 %endif
    670    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
    671    pshufd          xm4, xm7, q0000
    672    pshufd          xm5, xm7, q3333
    673    pshufd          xm6, xm7, q1111
    674    pshufd          xm7, xm7, q2222
    675    pshufd          xm8, xm0, q0000
    676    pshufd          xm9, xm0, q1111
    677    pshufd         xm10, xm0, q2222
    678 %if %2
    679    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
    680 %else
    681    sub            bufq, 82*70-(82-3)
    682 %endif
    683    add           bufyq, 79+82*3
    684    mov              hd, 70-35*%3
    685 .y_loop_ar2:
    686    mov              xq, -(76>>%2)
    687 
    688 .x_loop_ar2:
    689    pmovsxbw        xm0, [bufq+xq-82*2-2]   ; y=-2,x=[-2,+5]
    690    pmovsxbw        xm1, [bufq+xq-82*1-2]   ; y=-1,x=[-2,+5]
    691    pshufb          xm2, xm0, [base+gen_shufA]
    692    pmaddwd         xm2, xm4
    693    pshufb          xm3, xm1, [base+gen_shufB]
    694    pmaddwd         xm3, xm5
    695    paddd           xm2, xm3
    696    pshufb          xm3, xm0, [base+gen_shufC]
    697    pmaddwd         xm3, xm6
    698    punpckhqdq      xm0, xm0                 ; y=-2,x=[+2,+5]
    699    punpcklwd       xm0, xm1
    700    pmaddwd         xm0, xm7
    701    pshufb          xm1, [gen_shufD]
    702    pmaddwd         xm1, xm8
    703    paddd           xm2, xm3
    704    paddd           xm0, xm1
    705    paddd           xm2, xm0
    706 
    707 %if %2
    708    movq            xm0, [bufyq+xq*2]
    709 %if %3
    710    movq            xm3, [bufyq+xq*2+82]
    711 %endif
    712    pmaddubsw       xm0, xm11, xm0
    713 %if %3
    714    pmaddubsw       xm3, xm11, xm3
    715    paddw           xm0, xm3
    716 %endif
    717    pmulhrsw        xm0, xm12
    718 %else
    719    pmovsxbw        xm0, [bufyq+xq]
    720 %endif
    721    punpcklwd       xm0, xm13
    722    pmaddwd         xm0, xm10
    723    paddd           xm2, xm0
    724 
    725    movq            xm0, [bufq+xq-2]        ; y=0,x=[-2,+5]
    726 .x_loop_ar2_inner:
    727    pmovsxbw        xm0, xm0
    728    pmaddwd         xm3, xm0, xm9
    729    psrldq          xm0, 2
    730    paddd           xm3, xm2
    731    psrldq          xm2, 4                  ; shift top to next pixel
    732    psrad           xm3, [fg_dataq+FGData.ar_coeff_shift]
    733    pslldq          xm3, 2
    734    paddw           xm3, xm0
    735    pblendw         xm0, xm3, 00000010b
    736    packsswb        xm0, xm0
    737    pextrb    [bufq+xq], xm0, 1
    738    inc              xq
    739    jz .x_loop_ar2_end
    740    test             xb, 3
    741    jnz .x_loop_ar2_inner
    742    jmp .x_loop_ar2
    743 
    744 .x_loop_ar2_end:
    745    add            bufq, 82
    746    add           bufyq, 82<<%3
    747    dec              hd
    748    jg .y_loop_ar2
    749    RET
    750 
    751 INIT_YMM avx2
    752 .ar3:
    753    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
    754    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    755    imul            uvd, 28
    756    pmovsxbw         m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-15
    757    pmovsxbw        xm1, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-23
    758    vpbroadcastb    xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq+24] ; cf24 [luma]
    759    movd           xm13, [base+round_vals-10+shiftq*2]
    760    vpbroadcastd   xm14, [base+round_vals-14+shiftq*2]
    761    pshufd           m6, m0, q0000
    762    pshufd           m7, m0, q1111
    763    pshufd           m8, m0, q2222
    764    pshufd           m9, m0, q3333
    765    pshufd         xm10, xm1, q0000
    766    pshufd         xm11, xm1, q1111
    767    pshufhw        xm12, xm1, q0000
    768    psraw           xm2, 8
    769    palignr        xm13, xm1, 10
    770    punpckhwd      xm12, xm2                     ; interleave luma cf
    771    psrld          xm14, 16
    772    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
    773 %if %2
    774    vpbroadcastw   xm15, [base+hmul_bits+2+%3*2]
    775    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
    776 %else
    777    sub            bufq, 82*70-(82-3)
    778 %endif
    779    add           bufyq, 79+82*3
    780    mov              hd, 70-35*%3
    781 .y_loop_ar3:
    782    mov              xq, -(76>>%2)
    783 .x_loop_ar3:
    784    vbroadcasti128   m3, [bufq+xq-82*2-3]         ; y=-2,x=[-3,+12
    785    palignr         xm1, xm3, [bufq+xq-82*3-9], 6 ; y=-3,x=[-3,+12]
    786    vbroadcasti128   m4, [bufq+xq-82*1-3]    ; y=-1,x=[-3,+12]
    787    vpblendd         m3, m1, 0x0f
    788    pxor             m0, m0
    789    pcmpgtb          m2, m0, m3
    790    pcmpgtb          m0, m4
    791    punpcklbw        m1, m3, m2
    792    punpckhbw        m3, m2
    793    punpcklbw        m2, m4, m0
    794    punpckhbw       xm4, xm0
    795    pshufb           m0, m1, [base+gen_shufA]
    796    pmaddwd          m0, m6
    797    pshufb           m5, m1, [base+gen_shufC]
    798    pmaddwd          m5, m7
    799    shufps           m1, m3, q1032
    800    paddd            m0, m5
    801    pshufb           m5, m1, [base+gen_shufA]
    802    pmaddwd          m5, m8
    803    shufps          xm1, xm3, q2121
    804    vpblendd         m1, m2, 0xf0
    805    pshufb           m1, [base+gen_shufE]
    806    pmaddwd          m1, m9
    807    paddd            m0, m5
    808    pshufb          xm3, xm2, [base+gen_shufC]
    809    paddd            m0, m1
    810    pmaddwd         xm3, xm10
    811    palignr         xm1, xm4, xm2, 2
    812    punpckhwd       xm1, xm2, xm1
    813    pmaddwd         xm1, xm11
    814    palignr         xm4, xm2, 12
    815    paddd           xm3, xm1
    816 %if %2
    817    vpbroadcastd    xm5, [base+pb_1]
    818    movq            xm1, [bufyq+xq*2]
    819    pmaddubsw       xm1, xm5, xm1
    820 %if %3
    821    movq            xm2, [bufyq+xq*2+82]
    822    pmaddubsw       xm5, xm2
    823    paddw           xm1, xm5
    824 %endif
    825    pmulhrsw        xm1, xm15
    826 %else
    827    pmovsxbw        xm1, [bufyq+xq]
    828 %endif
    829    punpcklwd       xm4, xm1
    830    pmaddwd         xm4, xm12
    831    movq            xm1, [bufq+xq-3]        ; y=0,x=[-3,+4]
    832    vextracti128    xm2, m0, 1
    833    paddd           xm0, xm14
    834    paddd           xm3, xm4
    835    paddd           xm0, xm3
    836    paddd           xm0, xm2
    837 .x_loop_ar3_inner:
    838    pmovsxbw        xm1, xm1
    839    pmaddwd         xm2, xm13, xm1
    840    pshuflw         xm3, xm2, q1032
    841    paddd           xm2, xm0                ; add top
    842    paddd           xm2, xm3                ; left+cur
    843    psrldq          xm0, 4
    844    psrad           xm2, [fg_dataq+FGData.ar_coeff_shift]
    845    psrldq          xm1, 2
    846    ; don't packssdw, we only care about one value
    847    punpckldq       xm2, xm2
    848    pblendw         xm1, xm2, 0100b
    849    packsswb        xm1, xm1
    850    pextrb    [bufq+xq], xm1, 2
    851    inc              xq
    852    jz .x_loop_ar3_end
    853    test             xb, 3
    854    jnz .x_loop_ar3_inner
    855    jmp .x_loop_ar3
    856 .x_loop_ar3_end:
    857    add            bufq, 82
    858    add           bufyq, 82<<%3
    859    dec              hd
    860    jg .y_loop_ar3
    861    RET
    862 %endmacro
    863 
INIT_YMM avx2
;------------------------------------------------------------------------------
; fgy_32x32xn_8bpc(dst, src, stride, fg_data, w, scaling, grain_lut,
;                  h, sby, see, overlap)
;
; Apply synthesized film grain to one strip of 32-pixel-wide luma blocks:
;     dst = clip(src + round2(scaling[src] * grain_lut[offy+y][offx+x],
;                             scaling_shift))
; Four variants of the inner loop handle the overlap cases: none,
; horizontal only, vertical only, and horizontal+vertical.
;
; Registers held constant across all loops:
;     m7  = 0 (punpck zero reg; temporarily clobbered in the hv path)
;     m8  = pd_m65536 (0xffff0000 per dword) — gather index mask
;     m9  = round2 multiplier selected by scaling_shift (mul_bits table)
;     m10 = fg_min, m11 = fg_max — clip bounds (full vs restricted range)
;     m12 = pw_1024 — pmulhrsw rounding for the overlap blends
;     xm13 = pb_27_17_17_27 — horizontal overlap blend weights
;------------------------------------------------------------------------------
cglobal fgy_32x32xn_8bpc, 6, 13, 15, dst, src, stride, fg_data, w, scaling, \
                                     grain_lut, h, sby, see, overlap
%define base r9-pd_m65536
    lea              r9, [pd_m65536]
    mov             r6d, [fg_dataq+FGData.scaling_shift]
    mov             r7d, [fg_dataq+FGData.clip_to_restricted_range]
    mov            sbyd, sbym
    mov        overlapd, [fg_dataq+FGData.overlap_flag]
    vpbroadcastd     m8, [base+pd_m65536]
    vpbroadcastw     m9, [base+mul_bits+r6*2-14]
    vpbroadcastd    m10, [base+fg_min+r7*4]
    ; *8 (vs *4 above) steps over an intermediate fg_max entry —
    ; NOTE(review): table layout is defined at the top of the file; confirm
    vpbroadcastd    m11, [base+fg_max+r7*8]
    vpbroadcastd    m12, [base+pw_1024]
    movq           xm13, [base+pb_27_17_17_27]
    test           sbyd, sbyd
    setnz           r7b                     ; r7b = (sby != 0), i.e. a row above exists
    pxor             m7, m7
    test            r7b, overlapb
    jnz .vertical_overlap

    ; derive the per-row seed from sby and the frame seed
    ; (AV1 film grain PRNG seed derivation)
    imul           seed, sbyd, (173 << 24) | 37
    add            seed, (105 << 24) | 178
    rorx           seed, seed, 24
    movzx          seed, seew
    xor            seed, [fg_dataq+FGData.seed]

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                offx, offy, see, overlap

    lea        src_bakq, [srcq+wq]
    neg              wq
    sub            dstq, srcq               ; dst is addressed as [dstq+srcq],
                                            ; so only srcq advances per row

.loop_x:
    ; advance the 16-bit seed: parity of the masked seed (cmovp) selects
    ; whether the rotated-in feedback bit is set
    rorx             r6, seeq, 1
    or             seed, 0xEFF4
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d                ; updated seed

    ; derive the grain_lut offsets for this 32-wide column from the seed
    rorx          offyd, seed, 8
    rorx          offxq, seeq, 12
    and           offyd, 0xf
    imul          offyd, 164
    lea           offyd, [offyq+offxq*2+747] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                h, offxy, see, overlap

    mov              hd, hm
    mov      grain_lutq, grain_lutmp
.loop_y:
    ; src
    mova             m2, [srcq]
    punpcklbw        m0, m2, m7
    punpckhbw        m1, m2, m7

    ; scaling[src]
    ; vpgatherdd zeroes its mask operand on completion, so the all-lanes
    ; mask m8 is spilled to m6 and restored around each gather. Even
    ; pixels are isolated into the low word of each dword (pandn with
    ; pd_m65536), odd pixels shifted down; the -2 displacement on the odd
    ; gathers lands the looked-up byte in the high word, so pblendw 0xaa
    ; re-interleaves even/odd results in one blend.
    pandn            m4, m8, m0
    mova             m6, m8
    vpgatherdd       m2, [scalingq+m4-0], m8
    psrld            m3, m0, 16
    mova             m8, m6
    vpgatherdd       m4, [scalingq+m3-2], m6
    pandn            m5, m8, m1
    mova             m6, m8
    vpgatherdd       m3, [scalingq+m5-0], m8
    pblendw          m2, m4, 0xaa
    psrld            m4, m1, 16
    mova             m8, m6
    vpgatherdd       m5, [scalingq+m4-2], m6
    pblendw          m3, m5, 0xaa

    ; grain = grain_lut[offy+y][offx+x]
    movu             m5, [grain_lutq+offxyq]
    punpcklbw        m4, m5, m7
    punpckhbw        m5, m7

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmaddubsw        m2, m4
    pmaddubsw        m3, m5
    pmulhrsw         m2, m9
    pmulhrsw         m3, m9

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m1, m3
    packuswb         m0, m1
    pmaxub           m0, m10
    pminub           m0, m11
    mova    [dstq+srcq], m0

    add            srcq, strideq
    add      grain_lutq, 82                 ; grain_lut row stride is 82 bytes
    dec              hd
    jg .loop_y

    add              wq, 32                 ; next 32-wide column
    jge .end
    lea            srcq, [src_bakq+wq]
    test       overlapd, overlapd
    jz .loop_x

    ; r8m = sbym
    cmp       dword r8m, 0
    jne .loop_x_hv_overlap

    ; horizontal overlap (without vertical overlap)
.loop_x_h_overlap:
    ; same seed update as .loop_x
    rorx             r6, seeq, 1
    or             seed, 0xEFF4
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d                ; updated seed

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                offx, offy, see, left_offxy

    lea     left_offxyd, [offyq+32]         ; previous column's offy*stride+offx
    rorx          offyd, seed, 8
    rorx          offxq, seeq, 12
    and           offyd, 0xf
    imul          offyd, 164
    lea           offyd, [offyq+offxq*2+747] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                h, offxy, see, left_offxy

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
.loop_y_h_overlap:
    ; src
    mova             m2, [srcq]
    punpcklbw        m0, m2, m7
    punpckhbw        m1, m2, m7

    ; scaling[src] (same gather/mask-restore dance as .loop_y)
    pandn            m4, m8, m0
    mova             m6, m8
    vpgatherdd       m2, [scalingq+m4-0], m8
    psrld            m3, m0, 16
    mova             m8, m6
    vpgatherdd       m4, [scalingq+m3-2], m6
    pandn            m5, m8, m1
    mova             m6, m8
    vpgatherdd       m3, [scalingq+m5-0], m8
    pblendw          m2, m4, 0xaa
    psrld            m4, m1, 16
    mova             m8, m6
    vpgatherdd       m5, [scalingq+m4-2], m6
    pblendw          m3, m5, 0xaa

    ; grain = grain_lut[offy+y][offx+x]
    ; blend the seam pixels of this column's grain with the previous
    ; column's using the xm13 weights, then vpblendd 0xfe reinserts just
    ; the blended low dword into the unmodified row
    movu             m5, [grain_lutq+offxyq]
    movd            xm4, [grain_lutq+left_offxyq]
    punpcklbw       xm4, xm5
    pmaddubsw       xm4, xm13, xm4
    pmulhrsw        xm4, xm12
    packsswb        xm4, xm4
    vpblendd         m4, m5, 0xfe
    punpckhbw        m5, m7
    punpcklbw        m4, m7

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmaddubsw        m2, m4
    pmaddubsw        m3, m5
    pmulhrsw         m2, m9
    pmulhrsw         m3, m9

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m1, m3
    packuswb         m0, m1
    pmaxub           m0, m10
    pminub           m0, m11
    mova    [dstq+srcq], m0

    add            srcq, strideq
    add      grain_lutq, 82
    dec              hd
    jg .loop_y_h_overlap

    add              wq, 32
    jge .end
    lea            srcq, [src_bakq+wq]

    ; r8m = sbym
    cmp       dword r8m, 0
    jne .loop_x_hv_overlap
    jmp .loop_x_h_overlap

.vertical_overlap:
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                unused, sby, see, overlap

    ; build a packed pair of seeds: the current row's and the row above's,
    ; kept as (cur_seed << 16) | top_seed so both advance in lock-step
    movzx          sbyd, sbyb
    imul           seed, [fg_dataq+FGData.seed], 0x00010001
    imul            r7d, sbyd, 173 * 0x00010001
    imul           sbyd, 37 * 0x01000100
    add             r7d, (105 << 16) | 188
    add            sbyd, (178 << 24) | (141 << 8)
    and             r7d, 0x00ff00ff
    and            sbyd, 0xff00ff00
    xor            seed, r7d
    xor            seed, sbyd               ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                offx, offy, see, overlap

    lea        src_bakq, [srcq+wq]
    neg              wq
    sub            dstq, srcq               ; dst addressed as [dstq+srcq]

.loop_x_v_overlap:
    vpbroadcastd    m14, [pb_27_17]         ; v-overlap weights, first line

    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    ; advance both packed seeds at once; r7d collects the two parity
    ; feedback bits (top in bit 16, cur in bit 0) before the rotate
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            r7b                     ; parity of top_seed
    shr            seed, 16
    shl             r7d, 16
    test           seeb, seeh
    setp            r7b                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed

    ; compute both offsets (cur and top) in parallel from the packed seed
    rorx          offyd, seed, 8
    rorx          offxd, seed, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyd, [offyq+offxq*2+0x10001*747+32*82]

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                h, offxy, see, overlap, top_offxy

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
    movzx    top_offxyd, offxyw             ; split packed offsets
    shr          offxyd, 16
.loop_y_v_overlap:
    ; src
    mova             m2, [srcq]
    punpcklbw        m0, m2, m7
    punpckhbw        m1, m2, m7

    ; scaling[src] (same gather/mask-restore dance as .loop_y)
    pandn            m4, m8, m0
    mova             m6, m8
    vpgatherdd       m2, [scalingq+m4-0], m8
    psrld            m3, m0, 16
    mova             m8, m6
    vpgatherdd       m4, [scalingq+m3-2], m6
    pandn            m5, m8, m1
    mova             m6, m8
    vpgatherdd       m3, [scalingq+m5-0], m8
    pblendw          m2, m4, 0xaa
    psrld            m4, m1, 16
    mova             m8, m6
    vpgatherdd       m5, [scalingq+m4-2], m6
    pblendw          m3, m5, 0xaa

    ; grain = grain_lut[offy+y][offx+x]
    ; blend the whole row of this block's grain with the block above's,
    ; weighted by m14 (27/17 on the first line, 17/27 on the second)
    movu             m6, [grain_lutq+offxyq]
    movu             m4, [grain_lutq+top_offxyq]
    punpcklbw        m5, m4, m6
    punpckhbw        m4, m6
    pmaddubsw        m5, m14, m5
    pmaddubsw        m4, m14, m4
    pmulhrsw         m5, m12
    pmulhrsw         m4, m12
    packsswb         m5, m4
    punpcklbw        m4, m5, m7
    punpckhbw        m5, m7

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmaddubsw        m2, m4
    pmaddubsw        m3, m5
    pmulhrsw         m2, m9
    pmulhrsw         m3, m9

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m1, m3
    packuswb         m0, m1
    pmaxub           m0, m10
    pminub           m0, m11
    mova    [dstq+srcq], m0

    add            srcq, strideq
    add      grain_lutq, 82
    dec              hb
    jz .end_y_v_overlap
    vpbroadcastd    m14, [pb_17_27] ; swap weights for second v-overlap line
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines
    ; the top bit of hd acts as a one-shot counter: it is clear on the
    ; first pass (add doesn't carry -> loop once more) and set on the
    ; second (add carries -> fall through)
    add              hd, 0x80000000
    jnc .loop_y_v_overlap
    jmp .loop_y
.end_y_v_overlap:
    add              wq, 32
    jge .end
    lea            srcq, [src_bakq+wq]

    ; since fg_dataq.overlap is guaranteed to be set, we never jump
    ; back to .loop_x_v_overlap, and instead always fall-through to
    ; h+v overlap
.loop_x_hv_overlap:
    vpbroadcastd    m14, [pb_27_17]

    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    ; same dual-seed update as .loop_x_v_overlap
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            r7b                     ; parity of top_seed
    shr            seed, 16
    shl             r7d, 16
    test           seeb, seeh
    setp            r7b                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, top_offxy, topleft_offxy

    ; the previous column's offsets are the current ones shifted 32 right
    lea  topleft_offxyd, [top_offxyq+32]
    lea     left_offxyd, [offyq+32]
    rorx          offyd, seed, 8
    rorx          offxd, seed, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyd, [offyq+offxq*2+0x10001*747+32*82]

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, top_offxy, topleft_offxy

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
    movzx    top_offxyd, offxyw
    shr          offxyd, 16
.loop_y_hv_overlap:
    ; src
    mova             m2, [srcq]
    punpcklbw        m0, m2, m7
    punpckhbw        m1, m2, m7

    ; scaling[src] (same gather/mask-restore dance as .loop_y)
    pandn            m4, m8, m0
    mova             m6, m8
    vpgatherdd       m2, [scalingq+m4-0], m8
    psrld            m3, m0, 16
    mova             m8, m6
    vpgatherdd       m4, [scalingq+m3-2], m6
    pandn            m5, m8, m1
    mova             m6, m8
    vpgatherdd       m3, [scalingq+m5-0], m8
    pblendw          m2, m4, 0xaa
    psrld            m4, m1, 16
    mova             m8, m6
    vpgatherdd       m5, [scalingq+m4-2], m6
    pblendw          m3, m5, 0xaa

    ; grain = grain_lut[offy+y][offx+x]
    ; m7 (the zero register) is borrowed as scratch here and re-zeroed
    ; with pxor once the blends are done
    movu             m6, [grain_lutq+offxyq]
    movd            xm7, [grain_lutq+left_offxyq]
    movu             m4, [grain_lutq+top_offxyq]
    movd            xm5, [grain_lutq+topleft_offxyq]
    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklbw       xm7, xm6
    punpcklbw       xm5, xm4
    pmaddubsw       xm7, xm13, xm7
    pmaddubsw       xm5, xm13, xm5
    pmulhrsw        xm7, xm12
    pmulhrsw        xm5, xm12
    packsswb        xm7, xm7
    packsswb        xm5, xm5
    vpblendd         m7, m6, 0xfe
    vpblendd         m5, m4, 0xfe
    ; followed by v interpolation (top | cur -> cur)
    punpckhbw        m4, m6
    punpcklbw        m5, m7
    pmaddubsw        m4, m14, m4
    pmaddubsw        m5, m14, m5
    pmulhrsw         m4, m12
    pmulhrsw         m5, m12
    pxor             m7, m7                 ; restore the zero register
    packsswb         m5, m4
    punpcklbw        m4, m5, m7
    punpckhbw        m5, m7

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmaddubsw        m2, m4
    pmaddubsw        m3, m5
    pmulhrsw         m2, m9
    pmulhrsw         m3, m9

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m1, m3
    packuswb         m0, m1
    pmaxub           m0, m10
    pminub           m0, m11
    mova    [dstq+srcq], m0

    add            srcq, strideq
    add      grain_lutq, 82
    dec              hb
    jz .end_y_hv_overlap
    vpbroadcastd    m14, [pb_17_27] ; swap weights for second v-overlap line
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines
    add              hd, 0x80000000
    jnc .loop_y_hv_overlap
    jmp .loop_y_h_overlap
.end_y_hv_overlap:
    add              wq, 32
    lea            srcq, [src_bakq+wq]      ; lea preserves flags: jl tests the add
    jl .loop_x_hv_overlap
.end:
    RET
   1292 
   1293 %macro FGUV_FN 3 ; name, ss_hor, ss_ver
   1294 cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
   1295                                          grain_lut, h, sby, luma, overlap, uv_pl, is_id
   1296 %define base r11-pd_m65536
   1297    lea             r11, [pd_m65536]
   1298    mov             r6d, [fg_dataq+FGData.scaling_shift]
   1299    mov             r7d, [fg_dataq+FGData.clip_to_restricted_range]
   1300    mov             r9d, is_idm
   1301    mov            sbyd, sbym
   1302    mov        overlapd, [fg_dataq+FGData.overlap_flag]
   1303    vpbroadcastd     m8, [base+pd_m65536]
   1304    vpbroadcastw     m9, [base+mul_bits+r6*2-14]
   1305    vpbroadcastd    m10, [base+fg_min+r7*4]
   1306    shlx            r7d, r7d, r9d
   1307    vpbroadcastd    m11, [base+fg_max+r7*4]
   1308    vpbroadcastd    m12, [base+pw_1024]
   1309    pxor             m7, m7
   1310    test           sbyd, sbyd
   1311    setnz           r7b
   1312    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
   1313    jne .csfl
   1314 
   1315 %macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
   1316    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
   1317                h, sby, see, overlap, uv_pl
   1318 %if %1
   1319    mov             r6d, uv_plm
   1320    vpbroadcastd     m0, [base+pw_8]
   1321    vbroadcasti128  m14, [fg_dataq+FGData.uv_mult+r6*4]
   1322    vpbroadcastw    m15, [fg_dataq+FGData.uv_offset+r6*4]
   1323    pshufb          m14, m0 ; uv_luma_mult, uv_mult
   1324 %elif %2
   1325    vpbroadcastq    m15, [base+pb_23_22]
   1326 %else
   1327    vpbroadcastq   xm15, [base+pb_27_17_17_27]
   1328 %endif
   1329 %if %3
   1330    vpbroadcastw    m13, [base+pb_23_22]
   1331 %elif %2
   1332    pshufd          m13, [base+pb_27_17], q0000 ; 8x27_17, 8x17_27
   1333 %endif
   1334    test            r7b, overlapb
   1335    jnz %%vertical_overlap
   1336 
   1337    imul           seed, sbyd, (173 << 24) | 37
   1338    add            seed, (105 << 24) | 178
   1339    rorx           seed, seed, 24
   1340    movzx          seed, seew
   1341    xor            seed, [fg_dataq+FGData.seed]
   1342 
   1343    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
   1344                unused2, unused3, see, overlap, unused4, unused5, lstride
   1345 
   1346    mov           lumaq, r9mp
   1347    lea             r12, [srcq+wq]
   1348    lea             r13, [dstq+wq]
   1349    lea             r14, [lumaq+wq*(1+%2)]
   1350    mov           r11mp, r12
   1351    mov           r12mp, r13
   1352    mov        lstrideq, r10mp
   1353    neg              wq
   1354 
   1355 %%loop_x:
   1356    rorx             r6, seeq, 1
   1357    or             seed, 0xEFF4
   1358    test           seeb, seeh
   1359    lea            seed, [r6+0x8000]
   1360    cmovp          seed, r6d               ; updated seed
   1361 
   1362    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
   1363                offx, offy, see, overlap, unused1, unused2, lstride
   1364 
   1365    rorx          offyd, seed, 8
   1366    rorx          offxq, seeq, 12
   1367    and           offyd, 0xf
   1368    imul          offyd, 164>>%3
   1369    lea           offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)]  ; offy*stride+offx
   1370 
   1371    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
   1372                h, offxy, see, overlap, unused1, unused2, lstride
   1373 
   1374    mov      grain_lutq, grain_lutmp
   1375    mov              hd, hm
   1376 %%loop_y:
   1377    ; src
   1378 %if %2
   1379    mova            xm3, [lumaq+lstrideq*0+ 0]
   1380    vinserti128      m3, [lumaq+lstrideq*(1+%3) +0], 1
   1381    vpbroadcastd     m2, [pb_1]
   1382    mova            xm0, [lumaq+lstrideq*0+16]
   1383    vinserti128      m0, [lumaq+lstrideq*(1+%3)+16], 1
   1384    mova            xm1, [srcq]
   1385    vinserti128      m1, [srcq+strideq], 1
   1386    pmaddubsw        m3, m2
   1387    pmaddubsw        m0, m2
   1388    pavgw            m3, m7
   1389    pavgw            m0, m7
   1390 %else
   1391    mova             m2, [lumaq]
   1392    mova             m1, [srcq]
   1393 %endif
   1394 %if %1
   1395 %if %2
   1396    packuswb         m2, m3, m0             ; luma
   1397 %endif
   1398    punpckhbw        m3, m2, m1
   1399    punpcklbw        m2, m1                 ; { luma, chroma }
   1400    pmaddubsw        m3, m14
   1401    pmaddubsw        m2, m14
   1402    psraw            m3, 6
   1403    psraw            m2, 6
   1404    paddw            m3, m15
   1405    paddw            m2, m15
   1406    packuswb         m2, m3                 ; pack+unpack = clip
   1407 %endif
   1408 %if %1 || %2 == 0
   1409    punpcklbw        m3, m2, m7
   1410    punpckhbw        m0, m2, m7
   1411 %endif
   1412 
   1413    ; scaling[luma_src]
   1414    pandn            m4, m8, m3
   1415    mova             m6, m8
   1416    vpgatherdd       m2, [scalingq+m4-0], m8
   1417    psrld            m3, 16
   1418    mova             m8, m6
   1419    vpgatherdd       m4, [scalingq+m3-2], m6
   1420    pandn            m5, m8, m0
   1421    mova             m6, m8
   1422    vpgatherdd       m3, [scalingq+m5-0], m8
   1423    psrld            m0, 16
   1424    mova             m8, m6
   1425    vpgatherdd       m5, [scalingq+m0-2], m6
   1426    pblendw          m2, m4, 0xaa
   1427    pblendw          m3, m5, 0xaa
   1428 
   1429    ; grain = grain_lut[offy+y][offx+x]
   1430 %if %2
   1431    movu            xm5, [grain_lutq+offxyq+ 0]
   1432    vinserti128      m5, [grain_lutq+offxyq+82], 1
   1433 %else
   1434    movu             m5, [grain_lutq+offxyq]
   1435 %endif
   1436    punpcklbw        m4, m5, m7
   1437    punpckhbw        m5, m7
   1438 
   1439    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
   1440    pmaddubsw        m2, m4
   1441    pmaddubsw        m3, m5
   1442    pmulhrsw         m2, m9
   1443    pmulhrsw         m3, m9
   1444 
   1445    ; unpack chroma_source
   1446    punpcklbw        m0, m1, m7
   1447    punpckhbw        m1, m7
   1448 
   1449    ; dst = clip_pixel(src, noise)
   1450    paddw            m0, m2
   1451    paddw            m1, m3
   1452    packuswb         m0, m1
   1453    pmaxub           m0, m10
   1454    pminub           m0, m11
   1455 %if %2
   1456    mova         [dstq], xm0
   1457    vextracti128 [dstq+strideq], m0, 1
   1458 %else
   1459    mova         [dstq], m0
   1460 %endif
   1461 
   1462 %if %2
   1463    lea            srcq, [srcq+strideq*2]
   1464    lea            dstq, [dstq+strideq*2]
   1465    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
   1466 %else
   1467    add            srcq, strideq
   1468    add            dstq, strideq
   1469    add           lumaq, lstrideq
   1470 %endif
   1471    add      grain_lutq, 82<<%2
   1472    sub              hb, 1+%2
   1473    jg %%loop_y
   1474 
   1475    add              wq, 32>>%2
   1476    jge .end
   1477    mov            srcq, r11mp
   1478    mov            dstq, r12mp
   1479    lea           lumaq, [r14+wq*(1+%2)]
   1480    add            srcq, wq
   1481    add            dstq, wq
   1482    test       overlapd, overlapd
   1483    jz %%loop_x
   1484 
   1485    ; r8m = sbym
   1486    cmp       dword r8m, 0
   1487    jne %%loop_x_hv_overlap
   1488 
   1489    ; horizontal overlap (without vertical overlap)
   1490 %%loop_x_h_overlap:
   1491    rorx             r6, seeq, 1
   1492    or             seed, 0xEFF4
   1493    test           seeb, seeh
   1494    lea            seed, [r6+0x8000]
   1495    cmovp          seed, r6d               ; updated seed
   1496 
   1497    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
   1498                offx, offy, see, left_offxy, unused1, unused2, lstride
   1499 
   1500    lea     left_offxyd, [offyq+(32>>%2)]         ; previous column's offy*stride+offx
   1501    rorx          offyd, seed, 8
   1502    rorx          offxq, seeq, 12
   1503    and           offyd, 0xf
   1504    imul          offyd, 164>>%3
   1505    lea           offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)]  ; offy*stride+offx
   1506 
   1507    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
   1508                h, offxy, see, left_offxy, unused1, unused2, lstride
   1509 
   1510    mov      grain_lutq, grain_lutmp
   1511    mov              hd, hm
   1512 %%loop_y_h_overlap:
   1513    ; src
   1514 %if %2
   1515    mova            xm3, [lumaq+lstrideq*0+ 0]
   1516    vinserti128      m3, [lumaq+lstrideq*(1+%3)+ 0], 1
   1517    vpbroadcastd     m2, [pb_1]
   1518    mova            xm0, [lumaq+lstrideq*0+16]
   1519    vinserti128      m0, [lumaq+lstrideq*(1+%3)+16], 1
   1520    mova            xm1, [srcq]
   1521    vinserti128      m1, [srcq+strideq], 1
   1522    pmaddubsw        m3, m2
   1523    pmaddubsw        m0, m2
   1524    pavgw            m3, m7
   1525    pavgw            m0, m7
   1526 %else
   1527    mova             m2, [lumaq]
   1528    mova             m1, [srcq]
   1529 %endif
   1530 %if %1
   1531 %if %2
   1532    packuswb         m2, m3, m0             ; luma
   1533 %endif
   1534    punpckhbw        m3, m2, m1
   1535    punpcklbw        m2, m1                 ; { luma, chroma }
   1536    pmaddubsw        m3, m14
   1537    pmaddubsw        m2, m14
   1538    psraw            m3, 6
   1539    psraw            m2, 6
   1540    paddw            m3, m15
   1541    paddw            m2, m15
   1542    packuswb         m2, m3                 ; pack+unpack = clip
   1543 %endif
   1544 %if %1 || %2 == 0
   1545    punpcklbw        m3, m2, m7
   1546    punpckhbw        m0, m2, m7
   1547 %endif
   1548 
   1549    ; scaling[luma_src]
   1550    pandn            m4, m8, m3
   1551    mova             m6, m8
   1552    vpgatherdd       m2, [scalingq+m4-0], m8
   1553    psrld            m3, 16
   1554    mova             m8, m6
   1555    vpgatherdd       m4, [scalingq+m3-2], m6
   1556    pandn            m5, m8, m0
   1557    mova             m6, m8
   1558    vpgatherdd       m3, [scalingq+m5-0], m8
   1559    psrld            m0, 16
   1560    mova             m8, m6
   1561    vpgatherdd       m5, [scalingq+m0-2], m6
   1562    pblendw          m2, m4, 0xaa
   1563    pblendw          m3, m5, 0xaa
   1564 
   1565    ; grain = grain_lut[offy+y][offx+x]
   1566 %if %2
   1567    movu            xm5, [grain_lutq+offxyq+ 0]
   1568    vinserti128      m5, [grain_lutq+offxyq+82], 1
   1569    movd            xm4, [grain_lutq+left_offxyq+ 0]
   1570    vinserti128      m4, [grain_lutq+left_offxyq+82], 1
   1571    punpcklbw        m4, m5
   1572 %if %1
   1573    vpbroadcastq     m0, [pb_23_22]
   1574    pmaddubsw        m4, m0, m4
   1575 %else
   1576    pmaddubsw        m4, m15, m4
   1577 %endif
   1578    pmulhrsw         m4, m12
   1579    packsswb         m4, m4
   1580    vpblendd         m4, m5, 0xee
   1581 %else
   1582    movu             m5, [grain_lutq+offxyq]
   1583    movd            xm4, [grain_lutq+left_offxyq]
   1584    punpcklbw       xm4, xm5
   1585 %if %1
   1586    movq            xm0, [pb_27_17_17_27]
   1587    pmaddubsw       xm4, xm0, xm4
   1588 %else
   1589    pmaddubsw       xm4, xm15, xm4
   1590 %endif
   1591    pmulhrsw        xm4, xm12
   1592    packsswb        xm4, xm4
   1593    vpblendd         m4, m5, 0xfe
   1594 %endif
   1595    punpckhbw        m5, m7
   1596    punpcklbw        m4, m7
   1597 
   1598    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
   1599    pmaddubsw        m2, m4
   1600    pmaddubsw        m3, m5
   1601    pmulhrsw         m2, m9
   1602    pmulhrsw         m3, m9
   1603 
   1604    ; unpack chroma_source
   1605    punpcklbw        m0, m1, m7
   1606    punpckhbw        m1, m7
   1607 
   1608    ; dst = clip_pixel(src, noise)
   1609    paddw            m0, m2
   1610    paddw            m1, m3
   1611    packuswb         m0, m1
   1612    pmaxub           m0, m10
   1613    pminub           m0, m11
   1614 %if %2
   1615    mova         [dstq], xm0
   1616    vextracti128 [dstq+strideq], m0, 1
   1617 %else
   1618    mova         [dstq], m0
   1619 %endif
   1620 
   1621 %if %2
   1622    lea            srcq, [srcq+strideq*2]
   1623    lea            dstq, [dstq+strideq*2]
   1624    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
   1625 %else
   1626    add            srcq, strideq
   1627    add            dstq, strideq
   1628    add           lumaq, lstrideq
   1629 %endif
   1630    add      grain_lutq, 82*(1+%2)
   1631    sub              hb, 1+%2
   1632    jg %%loop_y_h_overlap
   1633 
   1634    add              wq, 32>>%2
   1635    jge .end
   1636    mov            srcq, r11mp
   1637    mov            dstq, r12mp
   1638    lea           lumaq, [r14+wq*(1+%2)]
   1639    add            srcq, wq
   1640    add            dstq, wq
   1641 
   1642    ; r8m = sbym
   1643    cmp       dword r8m, 0
   1644    jne %%loop_x_hv_overlap
   1645    jmp %%loop_x_h_overlap
   1646 
%%vertical_overlap:
   DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
               sby, see, overlap, unused1, unused2, lstride
   ; Derive the two 16-bit per-row grain seeds (current superblock row
   ; and the row above) from the frame seed and the sb row number, both
   ; computed at once in the halves of one 32-bit register.
   movzx          sbyd, sbyb
   imul           seed, [fg_dataq+FGData.seed], 0x00010001 ; duplicate frame seed into both halves
   imul            r7d, sbyd, 173 * 0x00010001             ; per-row hash term 1 in both halves
   imul           sbyd, 37 * 0x01000100                    ; per-row hash term 2 in both halves
   add             r7d, (105 << 16) | 188
   add            sbyd, (178 << 24) | (141 << 8)
   and             r7d, 0x00ff00ff         ; keep one hash byte per 16-bit half
   and            sbyd, 0xff00ff00
   xor            seed, r7d
   xor            seed, sbyd               ; (cur_seed << 16) | top_seed

   DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
               unused1, unused2, see, overlap, unused3, unused4, lstride

   ; Stash the row-end pointers in r11m/r12m and negate wq so the column
   ; loops can count up towards zero.
   mov           lumaq, r9mp
   lea             r12, [srcq+wq]
   lea             r13, [dstq+wq]
   lea             r14, [lumaq+wq*(1+%2)]  ; luma advances twice as fast when %2 (h subsampling)
   mov           r11mp, r12
   mov           r12mp, r13
   mov        lstrideq, r10mp
   neg              wq
%%loop_x_v_overlap:
   ; Advance both packed 16-bit seeds one LFSR step in parallel:
   ; 0xeff4 masks in the feedback taps, test+setp extracts the parity
   ; (feedback) bit, and rorx rotates it into the top of each half.
   ; NOTE(review): mirrors the scalar random-number update in the C
   ; reference; tap constants taken on trust from the surrounding code.
   ; we assume from the block above that bits 8-15 of r7d are zero'ed
   mov             r6d, seed
   or             seed, 0xeff4eff4
   test           seeb, seeh
   setp            r7b                     ; parity of top_seed
   shr            seed, 16
   shl             r7d, 16
   test           seeb, seeh
   setp            r7b                     ; parity of cur_seed
   or              r6d, 0x00010001
   xor             r7d, r6d
   rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed

   DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
               offx, offy, see, overlap, top_offxy, unused, lstride

   ; Extract 4-bit x/y grain offsets for the current and top rows from
   ; the packed seeds and turn them into byte offsets into the 82-wide
   ; grain LUT (stride halved by the vertical subsampling shift %3).
   rorx          offyd, seed, 8
   rorx          offxd, seed, 12
   and           offyd, 0xf000f
   and           offxd, 0xf000f
   imul          offyd, 164>>%3
   ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
   lea           offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

   DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
               h, offxy, see, overlap, top_offxy, unused, lstride

   mov      grain_lutq, grain_lutmp
   mov              hd, hm
   movzx    top_offxyd, offxyw
   shr          offxyd, 16
%if %2 == 0
   ; unsubsampled: one row per iteration; first row of the top blend
   ; uses the 27/17 weight pair (switched to 17/27 for the second row)
   vpbroadcastd    m13, [pb_27_17]
%endif
%%loop_y_v_overlap:
   ; src
%if %2
   ; 4:2:x -- horizontally average each luma pixel pair down to chroma
   ; resolution: pmaddubsw against pb_1 sums pairs, pavgw against m7
   ; (presumably zero) rounds: (a+b+1)>>1
   mova            xm3, [lumaq+lstrideq*0+ 0]
   vinserti128      m3, [lumaq+lstrideq*(1+%3)+ 0], 1
   vpbroadcastd     m2, [pb_1]
   mova            xm0, [lumaq+lstrideq*0+16]
   vinserti128      m0, [lumaq+lstrideq*(1+%3)+16], 1
   mova            xm1, [srcq]
   vinserti128      m1, [srcq+strideq], 1
   pmaddubsw        m3, m2
   pmaddubsw        m0, m2
   pavgw            m3, m7
   pavgw            m0, m7
%else
   mova             m2, [lumaq]
   mova             m1, [srcq]
%endif
%if %1
   ; remap luma through the chroma multipliers/offset (m14/m15 --
   ; presumably uv_luma_mult/uv_mult and uv_offset; set up earlier)
%if %2
   packuswb         m2, m3, m0             ; luma
%endif
   punpckhbw        m3, m2, m1
   punpcklbw        m2, m1                 ; { luma, chroma }
   pmaddubsw        m3, m14
   pmaddubsw        m2, m14
   psraw            m3, 6
   psraw            m2, 6
   paddw            m3, m15
   paddw            m2, m15
   packuswb         m2, m3                 ; pack+unpack = clip
%endif
%if %1 || %2 == 0
   punpcklbw        m3, m2, m7
   punpckhbw        m0, m2, m7
%endif

   ; scaling[luma_src]
   ; vpgatherdd consumes (zeroes) its mask operand, so the mask in m8
   ; is saved in m6 and restored around each gather; even and odd words
   ; are gathered separately, then re-interleaved with pblendw
   pandn            m4, m8, m3
   mova             m6, m8
   vpgatherdd       m2, [scalingq+m4-0], m8
   psrld            m3, 16
   mova             m8, m6
   vpgatherdd       m4, [scalingq+m3-2], m6
   pandn            m5, m8, m0
   mova             m6, m8
   vpgatherdd       m3, [scalingq+m5-0], m8
   psrld            m0, 16
   mova             m8, m6
   vpgatherdd       m5, [scalingq+m0-2], m6
   pblendw          m2, m4, 0xaa
   pblendw          m3, m5, 0xaa

   ; grain = grain_lut[offy+y][offx+x]
%if %3 == 0
   ; no vertical subsampling of the overlap: every processed row blends
   ; with the corresponding row of the block above (weights in m13)
%if %2
   movu            xm0, [grain_lutq+offxyq]
   vinserti128      m0, [grain_lutq+offxyq+82], 1
   movu            xm4, [grain_lutq+top_offxyq]
   vinserti128      m4, [grain_lutq+top_offxyq+82], 1
%else
   movu             m0, [grain_lutq+offxyq]
   movu             m4, [grain_lutq+top_offxyq]
%endif
   punpcklbw        m5, m4, m0
   punpckhbw        m4, m0
   pmaddubsw        m5, m13, m5
   pmaddubsw        m4, m13, m4
   pmulhrsw         m5, m12
   pmulhrsw         m4, m12
   packsswb         m5, m4
%else
   movq            xm4, [grain_lutq+offxyq]
   vinserti128      m4, [grain_lutq+offxyq+8], 1
   movq            xm5, [grain_lutq+top_offxyq]
   vinserti128      m5, [grain_lutq+top_offxyq+8], 1
   punpcklbw        m5, m4
   pmaddubsw        m5, m13, m5
   pmulhrsw         m5, m12
   vextracti128    xm4, m5, 1
   packsswb        xm5, xm4
   ; only interpolate first line, insert second line unmodified
   vinserti128      m5, [grain_lutq+offxyq+82], 1
%endif
   punpcklbw        m4, m5, m7
   punpckhbw        m5, m7

   ; noise = round2(scaling[luma_src] * grain, scaling_shift)
   pmaddubsw        m2, m4
   pmaddubsw        m3, m5
   pmulhrsw         m2, m9
   pmulhrsw         m3, m9

   ; unpack chroma_source
   punpcklbw        m0, m1, m7
   punpckhbw        m1, m7

   ; dst = clip_pixel(src, noise)
   paddw            m0, m2
   paddw            m1, m3
   packuswb         m0, m1
   pmaxub           m0, m10
   pminub           m0, m11
%if %2
   mova         [dstq], xm0
   vextracti128 [dstq+strideq], m0, 1
%else
   mova         [dstq], m0
%endif

   sub              hb, 1+%2
   jle %%end_y_v_overlap
%if %2
   lea            srcq, [srcq+strideq*2]
   lea            dstq, [dstq+strideq*2]
   lea           lumaq, [lumaq+lstrideq*(2<<%3)]
%else
   add            srcq, strideq
   add            dstq, strideq
   add           lumaq, lstrideq
%endif
   add      grain_lutq, 82<<%2
%if %2 == 0
   ; only the first two rows blend with the top block: swap in the
   ; 17/27 weights for row 1, using hd's top bit (set via this carry-
   ; generating add) as the iteration flag, then fall through to the
   ; overlap-free row loop
   vpbroadcastd    m13, [pb_17_27]
   add              hd, 0x80000000
   jnc %%loop_y_v_overlap
%endif
   jmp %%loop_y

%%end_y_v_overlap:
   add              wq, 32>>%2
   jge .end
   mov            srcq, r11mp
   mov            dstq, r12mp
   lea           lumaq, [r14+wq*(1+%2)]
   add            srcq, wq
   add            dstq, wq

   ; since fg_dataq.overlap is guaranteed to be set, we never jump
   ; back to .loop_x_v_overlap, and instead always fall-through to
   ; h+v overlap
%%loop_x_hv_overlap:
   ; Same packed LFSR seed step as %%loop_x_v_overlap; this column
   ; additionally blends with the column to its left (and the top-left
   ; corner), hence the extra left_offxy/topleft_offxy offsets below.
   ; we assume from the block above that bits 8-15 of r7d are zero'ed
   mov             r6d, seed
   or             seed, 0xeff4eff4
   test           seeb, seeh
   setp            r7b                     ; parity of top_seed
   shr            seed, 16
   shl             r7d, 16
   test           seeb, seeh
   setp            r7b                     ; parity of cur_seed
   or              r6d, 0x00010001
   xor             r7d, r6d
   rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed

   DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
               offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride

   ; the left/top-left neighbours are the previous column, 32 grain
   ; bytes (16 when horizontally subsampled) ahead of this one
   lea  topleft_offxyd, [top_offxyq+(32>>%2)]
   lea     left_offxyd, [offyq+(32>>%2)]
   rorx          offyd, seed, 8
   rorx          offxd, seed, 12
   and           offyd, 0xf000f
   and           offxd, 0xf000f
   imul          offyd, 164>>%3
   ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
   lea           offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

   DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
               h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride

   mov      grain_lutq, grain_lutmp
   mov              hd, hm
   movzx    top_offxyd, offxyw
   shr          offxyd, 16
%if %2 == 0
   ; unsubsampled: start the top blend with the 27/17 weight pair
   vpbroadcastd    m13, [pb_27_17]
%endif
%%loop_y_hv_overlap:
   ; src
%if %2
   ; 4:2:x -- average luma pixel pairs down to chroma resolution
   ; (pmaddubsw by pb_1 sums pairs, pavgw with m7 rounds)
   mova            xm3, [lumaq+lstrideq*0+ 0]
   vinserti128      m3, [lumaq+lstrideq*(1+%3)+ 0], 1
   vpbroadcastd     m2, [pb_1]
   mova            xm0, [lumaq+lstrideq*0+16]
   vinserti128      m0, [lumaq+lstrideq*(1+%3)+16], 1
   mova            xm1, [srcq]
   vinserti128      m1, [srcq+strideq], 1
   pmaddubsw        m3, m2
   pmaddubsw        m0, m2
   pavgw            m3, m7
   pavgw            m0, m7
%else
   mova             m2, [lumaq]
   mova             m1, [srcq]
%endif
%if %1
   ; remap luma through the chroma multipliers/offset in m14/m15
%if %2
   packuswb         m2, m3, m0             ; luma
%endif
   punpckhbw        m3, m2, m1
   punpcklbw        m2, m1                 ; { luma, chroma }
   pmaddubsw        m3, m14
   pmaddubsw        m2, m14
   psraw            m3, 6
   psraw            m2, 6
   paddw            m3, m15
   paddw            m2, m15
   packuswb         m2, m3                 ; pack+unpack = clip
%endif
%if %1 || %2 == 0
   punpcklbw        m3, m2, m7
   punpckhbw        m0, m2, m7
%endif

   ; scaling[luma_src]
   ; vpgatherdd zeroes its mask, so m8 is saved/restored through m6;
   ; even/odd words gathered separately and re-interleaved via pblendw
   pandn            m4, m8, m3
   mova             m6, m8
   vpgatherdd       m2, [scalingq+m4-0], m8
   psrld            m3, 16
   mova             m8, m6
   vpgatherdd       m4, [scalingq+m3-2], m6
   pandn            m5, m8, m0
   mova             m6, m8
   vpgatherdd       m3, [scalingq+m5-0], m8
   psrld            m0, 16
   mova             m8, m6
   vpgatherdd       m5, [scalingq+m0-2], m6
   pblendw          m2, m4, 0xaa
   pblendw          m3, m5, 0xaa

   ; grain = grain_lut[offy+y][offx+x]
%if %2
   movu            xm4, [grain_lutq+offxyq]
   vinserti128      m4, [grain_lutq+offxyq+82], 1
   movd            xm0, [grain_lutq+left_offxyq]
   vinserti128      m0, [grain_lutq+left_offxyq+82], 1
   movd            xm6, [grain_lutq+topleft_offxyq]
%if %3
   movq            xm5, [grain_lutq+top_offxyq]
   vinserti128      m5, [grain_lutq+top_offxyq+8], 1
%else
   vinserti128      m6, [grain_lutq+topleft_offxyq+82], 1
   movu            xm5, [grain_lutq+top_offxyq]
   vinserti128      m5, [grain_lutq+top_offxyq+82], 1
%endif

   ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
   punpcklbw        m0, m4
%if %3
   punpcklbw       xm6, xm5
%else
   punpcklbw        m6, m5
%endif
   punpcklqdq       m0, m6
%if %1
   vpbroadcastq     m6, [pb_23_22]
   pmaddubsw        m0, m6, m0
%else
   pmaddubsw        m0, m15, m0
%endif
   pmulhrsw         m0, m12
   packsswb         m0, m0
   vpblendd         m4, m0, 0x11           ; splice blended pixels into lane 0 of each row
%if %3
   pshuflw         xm0, xm0, q1032
   vpblendd         m5, m0, 0x01
%else
   pshuflw          m0, m0, q1032
   vpblendd         m5, m0, 0x11
%endif
%else
   movu             m4, [grain_lutq+offxyq]
   movd            xm0, [grain_lutq+left_offxyq]
   movu             m5, [grain_lutq+top_offxyq]
   movd            xm6, [grain_lutq+topleft_offxyq]
   punpcklbw       xm0, xm4
   punpcklbw       xm6, xm5
   punpcklqdq      xm0, xm6
%if %1
   vpbroadcastq    xm6, [pb_27_17_17_27]
   pmaddubsw       xm0, xm6, xm0
%else
   pmaddubsw       xm0, xm15, xm0
%endif
   pmulhrsw        xm0, xm12
   packsswb        xm0, xm0
   vpblendd         m4, m0, 0x01
   pshuflw         xm0, xm0, q1032
   vpblendd         m5, m0, 0x01
%endif

   ; followed by v interpolation (top | cur -> cur)
%if %3
   ; vertically subsampled: only the first grain row blends with the
   ; top block; the second row (upper lane of m4) passes through
   vpermq           m0, m4, q3120
   punpcklbw        m5, m0
   pmaddubsw        m5, m13, m5
   pmulhrsw         m5, m12
   vextracti128    xm0, m5, 1
   packsswb        xm5, xm0
   vpblendd         m5, m4, 0xf0
%else
   punpckhbw        m0, m5, m4
   punpcklbw        m5, m4
   pmaddubsw        m4, m13, m0
   pmaddubsw        m5, m13, m5
   pmulhrsw         m4, m12
   pmulhrsw         m5, m12
   packsswb         m5, m4
%endif
   punpcklbw        m4, m5, m7
   punpckhbw        m5, m7

   ; noise = round2(scaling[src] * grain, scaling_shift)
   pmaddubsw        m2, m4
   pmaddubsw        m3, m5
   pmulhrsw         m2, m9
   pmulhrsw         m3, m9

   ; unpack chroma source
   punpcklbw        m0, m1, m7
   punpckhbw        m1, m7

   ; dst = clip_pixel(src, noise)
   paddw            m0, m2
   paddw            m1, m3
   packuswb         m0, m1
   pmaxub           m0, m10
   pminub           m0, m11
%if %2
   mova         [dstq], xm0
   vextracti128 [dstq+strideq], m0, 1
%else
   mova         [dstq], m0
%endif

%if %2
   lea            srcq, [srcq+strideq*2]
   lea            dstq, [dstq+strideq*2]
   lea           lumaq, [lumaq+lstrideq*(2<<%3)]
%else
   add            srcq, strideq
   add            dstq, strideq
   add           lumaq, lstrideq
%endif
   add      grain_lutq, 82<<%2
   sub              hb, 1+%2
%if %2
   jg %%loop_y_h_overlap
%else
   ; unsubsampled: after the first row switch to the 17/27 top-blend
   ; weights, run one more h+v row, then continue with h-overlap only
   je %%end_y_hv_overlap
   vpbroadcastd    m13, [pb_17_27]
   add              hd, 0x80000000
   jnc %%loop_y_hv_overlap
   jmp %%loop_y_h_overlap
%endif

%%end_y_hv_overlap:
   ; next column block; all remaining columns use the h+v overlap path
   add              wq, 32>>%2
   jge .end
   mov            srcq, r11mp
   mov            dstq, r12mp
   lea           lumaq, [r14+wq*(1+%2)]
   add            srcq, wq
   add            dstq, wq
   jmp %%loop_x_hv_overlap
%endmacro
   2077 
   ; Instantiate the loop body twice: the %1=1 variant remaps luma
   ; through the chroma scaling parameters, while the .csfl entry
   ; (%1=0, "chroma scaled from luma") uses the luma values directly.
   %%FGUV_32x32xN_LOOP 1, %2, %3
.csfl:
   %%FGUV_32x32xN_LOOP 0, %2, %3
.end:
   RET
%endmacro
   2084 
; Instantiate the chroma grain-generation and grain-application
; functions for each chroma layout; arguments are
; (layout, horizontal subsampling, vertical subsampling).
GEN_GRAIN_UV_FN 420, 1, 1
FGUV_FN         420, 1, 1
GEN_GRAIN_UV_FN 422, 1, 0
FGUV_FN         422, 1, 0
GEN_GRAIN_UV_FN 444, 0, 0
FGUV_FN         444, 0, 0

%endif ; ARCH_X86_64