tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

filmgrain16_sse.asm (96046B)


      1 ; Copyright © 2021, VideoLAN and dav1d authors
      2 ; Copyright © 2021, Two Orioles, LLC
      3 ; All rights reserved.
      4 ;
      5 ; Redistribution and use in source and binary forms, with or without
      6 ; modification, are permitted provided that the following conditions are met:
      7 ;
      8 ; 1. Redistributions of source code must retain the above copyright notice, this
      9 ;    list of conditions and the following disclaimer.
     10 ;
     11 ; 2. Redistributions in binary form must reproduce the above copyright notice,
     12 ;    this list of conditions and the following disclaimer in the documentation
     13 ;    and/or other materials provided with the distribution.
     14 ;
     15 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     16 ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     17 ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     18 ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     19 ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     20 ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     21 ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     22 ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     24 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25 
     26 %include "config.asm"
     27 %include "ext/x86/x86inc.asm"
     28 %include "x86/filmgrain_common.asm"
     29 
     30 SECTION_RODATA 16
        ; Constant tables used by the grain generation / AR-filter code below.
     31 pd_16: times 4 dd 16
     32 pw_1: times 8 dw 1
     33 pw_16384: times 8 dw 16384
     34 pw_8192: times 8 dw 8192
        ; 23/22 pair followed by 0/32 filler words (overlap blend weights; the
        ; trailing 0,32 pairs keep unused lanes neutral).
     35 pw_23_22: dw 23, 22
     36          times 3 dw 0, 32
        ; per-nibble bit pattern consumed by pshufb in the seed-update loop
        ; ("set 15th bit for next 4 seeds" below)
     37 pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
     38 pw_27_17_17_27: dw 27, 17, 17, 27
     39                times 2 dw 0, 32
        ; per-lane masks for the LFSR next-bit computation
     40 rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
        ; seed xor constants, indexed by uv plane (2 words each)
     41 pw_seed_xor: times 2 dw 0xb524
     42             times 2 dw 0x49d8
     43 pb_1: times 4 db 1
        ; 2^15 >> n multiplier table: pmulhuw/pmulhrsw by hmul_bits[n] emulates
        ; a right shift by n
     44 hmul_bits: dw 32768, 16384, 8192, 4096
     45 round: dw 2048, 1024, 512
     46 mul_bits: dw 256, 128, 64, 32, 16
        ; rounding offsets indexed by ar_coeff_shift
     47 round_vals: dw 32, 64, 128, 256, 512, 1024
        ; clip bounds: 256*4-1=1023 / 256*16-1=4095 (10/12-bit full range);
        ; 240*4,235*4 / 240*16,235*16 are presumably the limited-range caps — NOTE(review)
     48 max: dw 256*4-1, 240*4, 235*4, 256*16-1, 240*16, 235*16
     49 min: dw 0, 16*4, 16*16
     50 ; these two should be next to each other
     51 pw_4: times 2 dw 4
     52 pw_16: times 2 dw 16
     53 
        ; JMP_TABLE name, lag0, lag1, ...
        ; Emits a dword table of offsets (relative to the table itself) to the
        ; function's .ar%2 labels, one entry per extra macro argument. Used by
        ; the ar_coeff_lag dispatch (movsxd + lea + jmp) in the grain functions.
     54 %macro JMP_TABLE 1-*
     55    %xdefine %1_table %%table
     56    %xdefine %%base %1_table
     57    %xdefine %%prefix mangle(private_prefix %+ _%1)
     58    %%table:
     59    %rep %0 - 1
     60        dd %%prefix %+ .ar%2 - %%base
     61        %rotate 1
     62    %endrep
     63 %endmacro
     64 
        ; One jump table per grain function, with entries for AR lags 0..3.
     65 JMP_TABLE generate_grain_y_16bpc_ssse3, 0, 1, 2, 3
     66 JMP_TABLE generate_grain_uv_420_16bpc_ssse3, 0, 1, 2, 3
     67 JMP_TABLE generate_grain_uv_422_16bpc_ssse3, 0, 1, 2, 3
     68 JMP_TABLE generate_grain_uv_444_16bpc_ssse3, 0, 1, 2, 3
     69 
     70 SECTION .text
     71
        ; On x86-32 all RODATA accesses must go through a PIC base register
        ; (loaded with LEA r, $$); on x86-64 RIP-relative addressing is used
        ; directly, so PIC_ptr is the identity.
     72 %if ARCH_X86_32
     73 %undef base
     74 %define PIC_ptr(a) base+a
     75 %else
     76 %define PIC_ptr(a) a
     77 %endif
     78
        ; m(x): expand to the platform-mangled symbol name for function x with
        ; the current instruction-set SUFFIX appended.
     79 %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
     80 
        ; Emulated word gather (no hardware gather in SSSE3): extracts 16-bit
        ; indices from the src xmm two at a time through a GPR (%5), then loads
        ; table entries from [%3 + index*%7] into dst (%1) with movd/pinsrw.
        ; %6 (cnt, default 8) is how many words to gather; when a separate
        ; tmp xmm (%8) is given, src is left unmodified.
     81 %macro vpgatherdw 5-8 8, 1 ; dst, src, base, tmp_gpr[x2], cnt, stride, tmp_xmm_reg
     82 %assign %%idx 0
     83 %define %%tmp %2
     84 %if %0 == 8
     85 %define %%tmp %8
     86 %endif
     87 %rep (%6/2)
     88 %if %%idx == 0
     89    movd        %5 %+ d, %2
     90    pshuflw       %%tmp, %2, q3232
     91 %else
     92    movd        %5 %+ d, %%tmp
     93 %if %6 == 8
        ; step the tmp register so the next pair of indices sits in the low dword
     94 %if %%idx == 2
     95    punpckhqdq    %%tmp, %%tmp
     96 %elif %%idx == 4
     97    psrlq         %%tmp, 32
     98 %endif
     99 %endif
    100 %endif
        ; split the dword in %5 into two word indices (%4 = low, %5 = high)
    101    movzx       %4 %+ d, %5 %+ w
    102    shr         %5 %+ d, 16
    103
    104 %if %%idx == 0
    105    movd             %1, [%3+%4*%7]
    106 %else
    107    pinsrw           %1, [%3+%4*%7], %%idx + 0
    108 %endif
    109    pinsrw           %1, [%3+%5*%7], %%idx + 1
    110 %assign %%idx %%idx+2
    111 %endrep
    112 %endmacro
    113 
        ; Broadcast the low dword of src into all four dword lanes of dst.
        ; The movd is skipped when dst and src are the same register.
    114 %macro SPLATD 2 ; dst, src
    115 %ifnidn %1, %2
    116    movd %1, %2
    117 %endif
    118    pshufd %1, %1, q0000
    119 %endmacro
    120 
        ; Broadcast the low word of src into all eight word lanes of dst
        ; (pshuflw fills the low qword, punpcklqdq duplicates it to the high one).
    121 %macro SPLATW 2 ; dst, src
    122 %ifnidn %1, %2
    123    movd %1, %2
    124 %endif
    125    pshuflw %1, %1, q0000
    126    punpcklqdq %1, %1
    127 %endmacro
    128 
    129 
    130 INIT_XMM ssse3
        ; generate_grain_y_16bpc(buf, fg_data, bdmax):
        ; fills the 82x73-word luma grain buffer from the seeded PRNG
        ; (gaussian_sequence lookups), then tail-jumps to the .ar0-.ar3
        ; handler selected by fg_data->ar_coeff_lag.
    131 %if ARCH_X86_64
    132 cglobal generate_grain_y_16bpc, 3, 8, 16, buf, fg_data, bdmax
    133    lea              r4, [pb_mask]
    134 %define base r4-pb_mask
    135 %else
    136 cglobal generate_grain_y_16bpc, 3, 6, 8, buf, fg_data, bdmax
    137    LEA              r4, $$
    138 %define base r4-$$
    139 %endif
    140    movq             m1, [base+rnd_next_upperbit_mask]
    141    movq             m4, [base+mul_bits]
    142    movq             m7, [base+hmul_bits]
    143    mov             r3d, [fg_dataq+FGData.grain_scale_shift]
    144    lea             r5d, [bdmaxq+1]
    145    shr             r5d, 11             ; 0 for 10bpc, 2 for 12bpc
    146    sub              r3, r5
    147    SPLATW           m6, [base+round+r3*2-2]
    148    mova             m5, [base+pb_mask]
    149    SPLATW           m0, [fg_dataq+FGData.seed]
        ; r3 counts bytes from -73*82*2 up to 0 so the loop ends on sign flag
    150    mov              r3, -73*82*2
    151    sub            bufq, r3
    152 %if ARCH_X86_64
    153    lea              r6, [gaussian_sequence]
    154 %endif
        ; produce 4 grain words per iteration: advance 4 LFSR seeds in parallel,
        ; gather from gaussian_sequence, scale and store
    155 .loop:
    156    pand             m2, m0, m1
    157    psrlw            m3, m2, 10
    158    por              m2, m3             ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    159    pmullw           m2, m4             ; bits 0x0f00 are set
    160    pshufb           m3, m5, m2         ; set 15th bit for next 4 seeds
    161    psllq            m2, m3, 30
    162    por              m2, m3
    163    psllq            m3, m2, 15
    164    por              m2, m3             ; aggregate each bit into next seed's high bit
    165    pmulhuw          m3, m0, m7
    166    por              m2, m3             ; 4 next output seeds
    167    pshuflw          m0, m2, q3333
    168    psrlw            m2, 5
    169 %if ARCH_X86_64
    170    vpgatherdw       m3, m2, r6, r5, r7, 4, 2
    171 %else
    172    vpgatherdw       m3, m2, base+gaussian_sequence, r5, r2, 4, 2
    173 %endif
    174    paddw            m3, m3             ; otherwise bpc=12 w/ grain_scale_shift=0
    175                                        ; shifts by 0, which pmulhrsw does not support
    176    pmulhrsw         m3, m6
    177    movq      [bufq+r3], m3
    178    add              r3, 4*2
    179    jl .loop
    180
    181    ; auto-regression code
        ; dispatch through the jump table built by JMP_TABLE above
    182    movsxd           r3, [fg_dataq+FGData.ar_coeff_lag]
    183    movsxd           r3, [base+generate_grain_y_16bpc_ssse3_table+r3*4]
    184    lea              r3, [r3+base+generate_grain_y_16bpc_ssse3_table]
    185    jmp              r3
    186 
        ; AR lag-1 filter: for each pixel, new = clip(old + ((cf0*topleft +
        ; cf1*top + cf2*topright + cf3*left + rnd) >> shift)). The left value
        ; carries a serial dependency, so it is kept in a GPR (val3d) while the
        ; three top taps are computed 4-at-a-time in SIMD.
    187 .ar1:
    188 %if WIN64
        ; Win64: shift lands in r0, so buf is rebuilt in a different reg and
        ; the extra GPR (r8) must be preserved (callee-saved on Win64).
    189    DEFINE_ARGS shift, fg_data, max, buf, val3, min, cf3, x, val0
    190    lea            bufq, [r0-2*(82*73-(82*3+79))]
    191    PUSH             r8
    192 %else
    193 %if ARCH_X86_64
    194    DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0
    195 %else ; x86-32
    196    DEFINE_ARGS buf, fg_data, min, val3, x, cf3, val0
    197    PUSH             r6
    198 %define shiftd r1d
    199 %endif
        ; rewind buf to (row 3, col -3): AR filtering starts on the 4th row
    200    sub            bufq, 2*(82*73-(82*3+79))
    201 %endif
    202    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
    203    movd             m4, [fg_dataq+FGData.ar_coeffs_y]
    204    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    205 %if WIN64
    206    DEFINE_ARGS shift, h, max, buf, val3, min, cf3, x, val0
    207 %elif ARCH_X86_64
    208    DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0
    209 %else ; x86-32
    210 %undef shiftd
    211    DEFINE_ARGS buf, shift, min, val3, x, cf3, val0
    212 %define hd dword r0m
    213 %define maxd dword minm
    214 %endif
        ; sign-extend the 4 coefficient bytes to words
    215 %if cpuflag(sse4)
    216    pmovsxbw         m4, m4
    217 %else
    218    pxor             m3, m3
    219    pcmpgtb          m3, m4
    220    punpcklbw        m4, m3
    221 %endif
    222    pinsrw           m4, [base+pw_1], 3
    223    pshufd           m5, m4, q1111
    224    pshufd           m4, m4, q0000
    225    SPLATW           m3, [base+round_vals+shiftq*2-12]    ; rnd
    226    mov              hd, 70
        ; grain is signed: min = ~max = -max-1 after halving bdmax
    227    sar            maxd, 1
    228    mov            mind, maxd
    229    xor            mind, -1
    230 .y_loop_ar1:
    231    mov              xq, -76
    232    movsx         val3d, word [bufq+xq*2-2]
    233 .x_loop_ar1:
        ; compute 4 pixels' worth of top-row contributions in one pass
    234    movu             m0, [bufq+xq*2-82*2-2]     ; top/left
    235    psrldq           m2, m0, 2                  ; top
    236    psrldq           m1, m0, 4                  ; top/right
    237    punpcklwd        m0, m2
    238    punpcklwd        m1, m3
    239    pmaddwd          m0, m4
    240    pmaddwd          m1, m5
    241    paddd            m0, m1
        ; serial part: fold in cf3*left, shift, add the old grain value, clamp
    242 .x_loop_ar1_inner:
    243    movd          val0d, m0
    244    psrldq           m0, 4
    245    imul          val3d, cf3d
    246    add           val3d, val0d
    247    sar           val3d, shiftb
    248    movsx         val0d, word [bufq+xq*2]
    249    add           val3d, val0d
    250    cmp           val3d, maxd
    251    cmovg         val3d, maxd
    252    cmp           val3d, mind
    253    cmovl         val3d, mind
    254    mov word [bufq+xq*2], val3w
    255    ; keep val3d in-place as left for next x iteration
        ; every 4 pixels, go back and recompute the SIMD top-row taps
    256    inc              xq
    257    jz .x_loop_ar1_end
    258    test             xq, 3
    259    jnz .x_loop_ar1_inner
    260    jmp .x_loop_ar1
    261
    262 .x_loop_ar1_end:
    263    add            bufq, 82*2
    264    dec              hd
    265    jg .y_loop_ar1
    266 %if WIN64
    267    POP              r8
    268 %elif ARCH_X86_32
    269    POP              r6
    270 %undef maxd
    271 %undef hd
    272 %endif
        ; lag 0 needs no filtering for luma, so .ar0 is just the shared return
    273 .ar0:
    274    RET
    275 
        ; AR lag-2 filter: 12 coefficients over a 5x5 neighbourhood (2 rows
        ; above plus 2 pixels to the left). Top-row taps are vectorized; the
        ; left-dependency is handled serially inside .x_loop_ar2_inner.
        ; On x86-32, xmm8-15 are emulated with stack slots.
    276 .ar2:
    277 %if ARCH_X86_32
    278    ALLOC_STACK -16*8
    279 %endif
    280    DEFINE_ARGS buf, fg_data, bdmax, shift
    281    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    282    movd             m0, [base+round_vals-12+shiftq*2]
    283    pshuflw          m0, m0, q0000
    284    movu             m6, [fg_dataq+FGData.ar_coeffs_y+0]    ; cf0-11
    285    pxor             m2, m2
    286    punpcklwd        m0, m2
        ; sign-extend coefficient bytes and splat each pair across a register
    287    pcmpgtb          m2, m6
    288    punpckhbw        m3, m6, m2
    289    punpcklbw        m6, m2
    290    pshufd           m2, m6, q3333
    291    pshufd           m1, m6, q2222
    292    pshufd           m7, m6, q1111
    293    pshufd           m6, m6, q0000
    294    pshufd           m4, m3, q1111
    295    pshufd           m3, m3, q0000
    296 %if ARCH_X86_64
    297    SWAP              0, 12
    298    SWAP              1, 8
    299    SWAP              2, 9
    300    SWAP              3, 10
    301    SWAP              4, 11
    302 %else
    303 %define m12 [rsp+0*16]
    304 %define m8 [rsp+1*16]
    305 %define m9 [rsp+2*16]
    306 %define m10 [rsp+3*16]
    307 %define m11 [rsp+4*16]
    308    mova            m12, m0
    309    mova             m8, m1
    310    mova             m9, m2
    311    mova            m10, m3
    312    mova            m11, m4
    313    mov          bdmaxd, bdmaxm
    314 %endif
    315    sar          bdmaxd, 1
    316    SPLATW           m0, bdmaxd                             ; max_grain
    317    pcmpeqw          m1, m1
    318 %if !cpuflag(sse4)
        ; build a one-word merge mask (used instead of pblendw pre-SSE4)
    319    pcmpeqw          m2, m2
    320    psrldq           m2, 14
    321    pslldq           m2, 2
    322    pxor             m2, m1
    323 %endif
    324    pxor             m1, m0                                 ; min_grain
    325 %if ARCH_X86_64
    326    SWAP              0, 13
    327    SWAP              1, 14
    328    SWAP              2, 15
    329 %else
    330 %define m13 [rsp+5*16]
    331 %define m14 [rsp+6*16]
    332    mova            m13, m0
    333    mova            m14, m1
    334 %if !cpuflag(sse4)
    335 %define m15 [rsp+7*16]
    336    mova            m15, m2
    337 %endif
    338 %endif
        ; rewind buf to (row 3, col -3), same origin as .ar1
    339    sub            bufq, 2*(82*73-(82*3+79))
    340    DEFINE_ARGS buf, fg_data, h, x
    341    mov              hd, 70
    342 .y_loop_ar2:
    343    mov              xq, -76
    344
    345 .x_loop_ar2:
        ; gather all 10 top-neighbour taps (2 rows) for 4 output pixels
    346    movu             m0, [bufq+xq*2-82*4-4]     ; y=-2,x=[-2,+5]
    347    movu             m1, [bufq+xq*2-82*2-4]     ; y=-1,x=[-2,+5]
    348    psrldq           m2, m0, 2
    349    psrldq           m3, m0, 4
    350    psrldq           m4, m0, 6
    351    psrldq           m5, m0, 8
    352    punpcklwd        m0, m2
    353    punpcklwd        m3, m4
    354    punpcklwd        m5, m1
    355    psrldq           m2, m1, 2
    356    psrldq           m4, m1, 4
    357    punpcklwd        m2, m4
    358    psrldq           m4, m1, 6
    359    psrldq           m1, 8
    360    punpcklwd        m4, m1
    361    pmaddwd          m0, m6
    362    pmaddwd          m3, m7
    363    pmaddwd          m5, m8
    364    pmaddwd          m2, m9
    365    pmaddwd          m4, m10
    366    paddd            m0, m3
    367    paddd            m5, m2
    368    paddd            m0, m4
    369    paddd            m0, m5                     ; accumulated top 2 rows
    370    paddd            m0, m12
    371
    372    movu             m1, [bufq+xq*2-4]      ; y=0,x=[-2,+5]
    373    pshufd           m4, m1, q3321
    374    pxor             m2, m2
    375    pcmpgtw          m2, m4
    376    punpcklwd        m4, m2                 ; in dwords, y=0,x=[0,3]
        ; serial part: cur = clip(old + ((taps + cf10*left2 + cf11*left1) >> shift))
    377 .x_loop_ar2_inner:
    378    pmaddwd          m2, m1, m11
    379    paddd            m2, m0
    380    psrldq           m0, 4                  ; shift top to next pixel
    381    psrad            m2, [fg_dataq+FGData.ar_coeff_shift]
    382    paddd            m2, m4
    383    packssdw         m2, m2
    384    pminsw           m2, m13
    385    pmaxsw           m2, m14
        ; splice the new pixel into the left-context register for the next step
    386    psrldq           m4, 4
    387    pslldq           m2, 2
    388    psrldq           m1, 2
    389 %if cpuflag(sse4)
    390    pblendw          m1, m2, 00000010b
    391 %else
    392    pand             m1, m15
    393    pandn            m3, m15, m2
    394    por              m1, m3
    395 %endif
    396    ; overwrite previous pixel, this should be ok
    397    movd  [bufq+xq*2-2], m1
    398    inc              xq
    399    jz .x_loop_ar2_end
    400    test             xq, 3
    401    jnz .x_loop_ar2_inner
    402    jmp .x_loop_ar2
    403
    404 .x_loop_ar2_end:
    405    add            bufq, 82*2
    406    dec              hd
    407    jg .y_loop_ar2
    408 %if ARCH_X86_32
    409 %undef m8
    410 %undef m9
    411 %undef m10
    412 %undef m11
    413 %undef m12
    414 %undef m13
    415 %undef m14
    416 %undef m15
    417 %endif
    418    RET
    419 
        ; AR lag-3 filter: 24 coefficients over a 7x7 neighbourhood (3 rows
        ; above plus 3 pixels to the left). Not all coefficient registers fit
        ; even on x86-64, so some splatted coefficients spill to a small
        ; stack scratch area addressed via `tmp`.
    420 .ar3:
    421    DEFINE_ARGS buf, fg_data, bdmax, shift
    422 %if WIN64
        ; Win64: carve an aligned 64-byte scratch block; r6 keeps the old rsp
    423    mov              r6, rsp
    424    and             rsp, ~15
    425    sub             rsp, 64
    426    %define         tmp  rsp
    427 %elif ARCH_X86_64
        ; SysV x86-64: reuse the red zone / caller area below the saved regs
    428    %define         tmp  rsp+stack_offset-72
    429 %else
    430    ALLOC_STACK  -16*12
    431    %define         tmp  rsp
    432    mov          bdmaxd, bdmaxm
    433 %endif
    434    sar          bdmaxd, 1
    435    SPLATW           m7, bdmaxd                                 ; max_grain
    436    pcmpeqw          m6, m6
    437 %if !cpuflag(sse4)
        ; pre-SSE4 merge mask (stands in for pblendw in the inner loop)
    438    pcmpeqw          m4, m4
    439    psrldq           m4, 14
    440    pslldq           m4, 4
    441    pxor             m4, m6
    442 %endif
    443    pxor             m6, m7                                    ; min_grain
    444    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    445
    446 %if ARCH_X86_64
    447    SWAP              6, 14
    448    SWAP              7, 15
    449 %else
    450 %define m14 [rsp+10*16]
    451 %define m15 [esp+11*16]
    452    mova            m14, m6
    453    mova            m15, m7
    454 %endif
    455
    456    ; build cf0-1 until 18-19 in m5-12 and r0/1
    457    pxor             m1, m1
    458    movu             m0, [fg_dataq+FGData.ar_coeffs_y+ 0]       ; cf0-15
    459    pcmpgtb          m1, m0
    460    punpckhbw        m2, m0, m1
    461    punpcklbw        m0, m1
    462
    463 %if cpuflag(sse4)
    464    pshufd           m4, m2, q3333
    465 %else
        ; without SSE4, m12 is needed as the merge mask later, so cf14-15
        ; are parked in the stack scratch area instead
    466    pshufd           m5, m2, q3333
    467    mova       [tmp+48], m5
    468 %endif
    469    pshufd           m3, m2, q2222
    470    pshufd           m1, m2, q0000
    471    pshufd           m2, m2, q1111
    472    pshufd           m7, m0, q2222
    473    pshufd           m6, m0, q1111
    474    pshufd           m5, m0, q0000
    475    pshufd           m0, m0, q3333
    476
    477 %if ARCH_X86_64
    478    SWAP              0, 8
    479    SWAP              1, 9
    480    SWAP              2, 10
    481    SWAP              3, 11
    482    SWAP              4, 12
    483 %else
    484 %define m8 [rsp+4*16]
    485 %define m9 [esp+5*16]
    486 %define m10 [rsp+6*16]
    487 %define m11 [esp+7*16]
    488 %define m12 [rsp+8*16]
    489    mova             m8, m0
    490    mova             m9, m1
    491    mova            m10, m2
    492    mova            m11, m3
    493    mova            m12, m4
    494 %endif
    495
    496    ; build cf20,round in r2
    497    ; build cf21-23,round*2 in m13
    498    pxor             m1, m1
    499    movq             m0, [fg_dataq+FGData.ar_coeffs_y+16]       ; cf16-23
    500    pcmpgtb          m1, m0
    501    punpcklbw        m0, m1
    502    pshufd           m1, m0, q0000
    503    pshufd           m2, m0, q1111
    504    mova       [tmp+ 0], m1
    505    mova       [tmp+16], m2
    506    psrldq           m3, m0, 10
    507    pinsrw           m3, [base+round_vals+shiftq*2-10], 3
    508
    509 %if ARCH_X86_64
    510    SWAP              3, 13
    511 %else
    512 %define m13 [esp+9*16]
    513    mova            m13, m3
    514 %endif
    515
    516    pinsrw           m0, [base+round_vals+shiftq*2-12], 5
    517    pshufd           m3, m0, q2222
    518    mova       [tmp+32], m3
    519
    520    DEFINE_ARGS buf, fg_data, h, x
        ; rewind buf to (row 3, col -3), same origin as .ar1/.ar2
    521    sub            bufq, 2*(82*73-(82*3+79))
    522    mov              hd, 70
    523 .y_loop_ar3:
    524    mov              xq, -76
    525
    526 .x_loop_ar3:
        ; row y=-3: 7 taps for 4 output pixels, paired up for pmaddwd
    527    movu             m0, [bufq+xq*2-82*6-6+ 0]      ; y=-3,x=[-3,+4]
    528    movd             m1, [bufq+xq*2-82*6-6+16]      ; y=-3,x=[+5,+6]
    529    palignr          m2, m1, m0, 2                  ; y=-3,x=[-2,+5]
    530    palignr          m1, m1, m0, 12                 ; y=-3,x=[+3,+6]
    531    punpckhwd        m3, m0, m2                     ; y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5]
    532    punpcklwd        m0, m2                         ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1]
    533    shufps           m2, m0, m3, q1032              ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3]
    534
    535    pmaddwd          m0, m5
    536    pmaddwd          m2, m6
    537    pmaddwd          m3, m7
    538    paddd            m0, m2
    539    paddd            m0, m3
    540    ; m0 = top line first 6 multiplied by cf, m1 = top line last entry
    541
        ; row y=-2, folding the y=-3 tail into the first pair
    542    movu             m2, [bufq+xq*2-82*4-6+ 0]      ; y=-2,x=[-3,+4]
    543    movd             m3, [bufq+xq*2-82*4-6+16]      ; y=-2,x=[+5,+6]
    544    punpcklwd        m1, m2                         ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0]
    545    palignr          m4, m3, m2, 2                  ; y=-3,x=[-2,+5]
    546    palignr          m3, m3, m2, 4                  ; y=-3,x=[-1,+6]
    547    punpckhwd        m2, m4, m3                     ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6]
    548    punpcklwd        m4, m3                         ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
    549    shufps           m3, m4, m2, q1032              ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
    550
    551    pmaddwd          m1, m8
    552    pmaddwd          m4, m9
    553    pmaddwd          m3, m10
    554    pmaddwd          m2, m11
    555    paddd            m1, m4
    556    paddd            m3, m2
    557    paddd            m0, m1
    558    paddd            m0, m3
    559    ; m0 = top 2 lines multiplied by cf
    560
        ; row y=-1, plus the rounding constant (via pw_1 pairing with [tmp+32])
    561    movu             m1, [bufq+xq*2-82*2-6+ 0]      ; y=-1,x=[-3,+4]
    562    movd             m2, [bufq+xq*2-82*2-6+16]      ; y=-1,x=[+5,+6]
    563    palignr          m3, m2, m1, 2                  ; y=-1,x=[-2,+5]
    564    palignr          m2, m2, m1, 12                 ; y=-1,x=[+3,+6]
    565    punpckhwd        m4, m1, m3                     ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5]
    566    punpcklwd        m1, m3                         ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
    567    shufps           m3, m1, m4, q1032              ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3]
    568    punpcklwd        m2, [base+pw_1]
    569
    570 %if cpuflag(sse4)
    571    pmaddwd          m1, m12
    572 %else
    573    pmaddwd          m1, [tmp+48]
    574 %endif
    575    pmaddwd          m3, [tmp+ 0]
    576    pmaddwd          m4, [tmp+16]
    577    pmaddwd          m2, [tmp+32]
    578    paddd            m1, m3
    579    paddd            m4, m2
    580    paddd            m0, m1
    581    paddd            m0, m4
    582    ; m0 = top 3 lines multiplied by cf plus rounding for downshift
    583
    584    movu             m1, [bufq+xq*2-6]      ; y=0,x=[-3,+4]
        ; serial part: fold in the 3 left taps (m13), shift, clamp, and splice
        ; the new pixel back into the current-row register for the next step
    585 .x_loop_ar3_inner:
    586    pmaddwd          m2, m1, m13
    587    pshufd           m3, m2, q1111
    588    paddd            m2, m3                 ; left+cur
    589    paddd            m2, m0                 ; add top
    590    psrldq           m0, 4
    591    psrad            m2, [fg_dataq+FGData.ar_coeff_shift]
    592    packssdw         m2, m2
    593    pminsw           m2, m15
    594    pmaxsw           m2, m14
    595    pslldq           m2, 4
    596    psrldq           m1, 2
    597 %if cpuflag(sse4)
    598    pblendw          m1, m2, 00000100b
    599 %else
    600    pand             m1, m12
    601    pandn            m3, m12, m2
    602    por              m1, m3
    603 %endif
    604    ; overwrite a couple of pixels, should be ok
    605    movq  [bufq+xq*2-4], m1
    606    inc              xq
    607    jz .x_loop_ar3_end
    608    test             xq, 3
    609    jnz .x_loop_ar3_inner
    610    jmp .x_loop_ar3
    611
    612 .x_loop_ar3_end:
    613    add            bufq, 82*2
    614    dec              hd
    615    jg .y_loop_ar3
    616 %if WIN64
    617    mov             rsp, r6
    618 %elif ARCH_X86_32
    619 %undef m8
    620 %undef m9
    621 %undef m10
    622 %undef m11
    623 %undef m12
    624 %undef m13
    625 %undef m14
    626 %undef m15
    627 %endif
    628    RET
    629 
    630 %macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y
    631 INIT_XMM ssse3
    632 %if ARCH_X86_64
    633 cglobal generate_grain_uv_%1_16bpc, 4, 11, 16, buf, bufy, fg_data, uv, bdmax, x, gaussian_reg, h, pic_reg
    634 %define base r8-pb_mask
    635    lea              r8, [pb_mask]
    636    movifnidn    bdmaxd, bdmaxm
    637    lea             r6d, [bdmaxq+1]
    638 %else
    639 cglobal generate_grain_uv_%1_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h
    640 %define base r2-$$
    641    LEA              r2, $$
    642    mov        fg_dataq, r2m
    643    mov             r6d, r4m
    644    inc             r6d
    645 %endif
    646    movq             m1, [base+rnd_next_upperbit_mask]
    647    movq             m4, [base+mul_bits]
    648    movq             m7, [base+hmul_bits]
    649    mov             r5d, [fg_dataq+FGData.grain_scale_shift]
    650    shr             r6d, 11             ; 0 for 10bpc, 2 for 12bpc
    651    sub              r5, r6
    652    SPLATW           m6, [base+round+r5*2-2]
    653    mova             m5, [base+pb_mask]
    654    SPLATW           m0, [fg_dataq+FGData.seed]
    655 %if ARCH_X86_64
    656    SPLATW           m2, [base+pw_seed_xor+uvq*4]
    657 %else
    658    mov             r5d, r3m
    659    SPLATW           m2, [base+pw_seed_xor+r5*4]
    660 %endif
    661    pxor             m0, m2
    662 %if ARCH_X86_64
    663    lea              r6, [gaussian_sequence]
    664 %endif
    665 %if %2
    666    mov              hd, 73-35*%3
    667    add            bufq, 44*2
    668 .loop_y:
    669    mov              xq, -44
    670 %else
    671    mov              xq, -82*73
    672    add            bufq, 82*73*2
    673 %endif
    674 .loop_x:
    675    pand             m2, m0, m1
    676    psrlw            m3, m2, 10
    677    por              m2, m3             ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    678    pmullw           m2, m4             ; bits 0x0f00 are set
    679    pshufb           m3, m5, m2         ; set 15th bit for next 4 seeds
    680    psllq            m2, m3, 30
    681    por              m2, m3
    682    psllq            m3, m2, 15
    683    por              m2, m3             ; aggregate each bit into next seed's high bit
    684    pmulhuw          m3, m0, m7
    685    por              m2, m3             ; 4 next output seeds
    686    pshuflw          m0, m2, q3333
    687    psrlw            m2, 5
    688 %if ARCH_X86_64
    689    vpgatherdw       m3, m2, r6, r9, r10, 4, 2
    690 %else
    691    vpgatherdw       m3, m2, base+gaussian_sequence, r5, r6, 4, 2
    692 %endif
    693    paddw            m3, m3             ; otherwise bpc=12 w/ grain_scale_shift=0
    694                                        ; shifts by 0, which pmulhrsw does not support
    695    pmulhrsw         m3, m6
    696    movq    [bufq+xq*2], m3
    697    add              xq, 4
    698    jl .loop_x
    699 %if %2
    700    add            bufq, 82*2
    701    dec              hd
    702    jg .loop_y
    703 %endif
    704 
    705    ; auto-regression code
    706    movsxd           r5, [fg_dataq+FGData.ar_coeff_lag]
    707    movsxd           r5, [base+generate_grain_uv_%1_16bpc_ssse3_table+r5*4]
    708    lea              r5, [r5+base+generate_grain_uv_%1_16bpc_ssse3_table]
    709    jmp              r5
    710 
    711 .ar0:
    712 %if ARCH_X86_64
    713    DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
    714 %else
    715    DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift
    716    ALLOC_STACK  -16*2
    717    mov           bufyq, r1m
    718    mov             uvd, r3m
    719 %endif
    720    imul            uvd, 28
    721    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    722    movd             m4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
    723    SPLATW           m3, [base+hmul_bits+shiftq*2-10]
    724 %if ARCH_X86_64
    725    sar          bdmaxd, 1
    726    SPLATW           m1, bdmaxd                     ; max_gain
    727 %else
    728    SPLATW           m1, r4m
    729    psraw            m1, 1
    730 %endif
    731    pcmpeqw          m7, m7
    732    pxor             m7, m1                         ; min_grain
    733 %if ARCH_X86_64
    734    SWAP              1, 14
    735    DEFINE_ARGS buf, bufy, h, x
    736 %else
    737 %define m14 [rsp+0*16]
    738    mova            m14, m1
    739    DEFINE_ARGS buf, bufy, pic_reg, h, x
    740 %endif
    741    pxor             m5, m5
    742    pcmpgtb          m5, m4
    743    punpcklbw        m4, m5
    744 %if %2
    745    SPLATW           m6, [base+hmul_bits+2+%3*2]
    746 %endif
    747    SPLATW           m4, m4
    748    pxor             m5, m5
    749 %if %2
    750 %if !cpuflag(sse4)
    751    pcmpeqw          m2, m2
    752    pslldq           m2, 12
    753 %if ARCH_X86_64
    754    SWAP              2, 12
    755 %else
    756 %define m12 [rsp+1*16]
    757    mova            m12, m2
    758 %endif
    759 %endif
    760 %endif
    761 %if %2
    762    sub            bufq, 2*(82*(73-35*%3)+82-(82*3+41))
    763 %else
    764    sub            bufq, 2*(82*70-3)
    765 %endif
    766    add           bufyq, 2*(3+82*3)
    767    mov              hd, 70-35*%3
    768 .y_loop_ar0:
    769    ; first 32 pixels
    770    xor              xd, xd
    771 .x_loop_ar0:
    772    movu             m0, [bufyq+xq*(2<<%2)]
    773 %if %2
    774 %if %3
    775    movu             m2, [bufyq+xq*4+82*2]
    776    paddw            m0, m2
    777 %endif
    778    movu             m1, [bufyq+xq*4     +16]
    779 %if %3
    780    movu             m2, [bufyq+xq*4+82*2+16]
    781    paddw            m1, m2
    782 %endif
    783    phaddw           m0, m1
    784    pmulhrsw         m0, m6
    785 %endif
    786    punpckhwd        m1, m0, m5
    787    punpcklwd        m0, m5
    788    REPX {pmaddwd x, m4}, m0, m1
    789    REPX {psrad x, 5}, m0, m1
    790    packssdw         m0, m1
    791    pmulhrsw         m0, m3
    792    movu             m1, [bufq+xq*2]
    793    paddw            m0, m1
    794    pminsw           m0, m14
    795    pmaxsw           m0, m7
    796    cmp              xd, 72-40*%2
    797    je .end
    798    movu    [bufq+xq*2], m0
    799    add              xd, 8
    800    jmp .x_loop_ar0
    801 
    802    ; last 6/4 pixels
    803 .end:
    804 %if %2
    805 %if cpuflag(sse4)
    806    pblendw          m0, m1, 11000000b
    807 %else
    808    pand             m1, m12
    809    pandn            m2, m12, m0
    810    por              m0, m1, m2
    811 %endif
    812    movu    [bufq+xq*2], m0
    813 %else
    814    movq    [bufq+xq*2], m0
    815 %endif
    816 
    817    add            bufq, 82*2
    818    add           bufyq, 82*(2<<%3)
    819    dec              hd
    820    jg .y_loop_ar0
    821 %if ARCH_X86_32
    822 %undef m12
    823 %undef m14
    824 %endif
    825    RET
    826 
    827 .ar1:
        ; --- AR(1) chroma path ---
        ; Vector part: m4/m5 hold sign-extended coefficient word pairs from
        ; ar_coeffs_uv; pmaddwd combines the (top-left,top) and (top-right,
        ; luma) pairs, plus the rounding constant in m3, producing 4
        ; x-positions worth of predictions at once.  Scalar part
        ; (.x_loop_ar1_inner): cf3*left is added serially because each output
        ; becomes the next pixel's "left" tap; the sum is >> shift and clamped
        ; to [mind, maxd].  %2/%3 are the horizontal/vertical chroma
        ; subsampling flags (cf. "76>>%2" row width and "82*2<<%3" luma row
        ; stride below).
    828 %if ARCH_X86_64
    829    DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x
    830 %else
    831    RESET_STACK_STATE
    832    DEFINE_ARGS buf, shift, pic_reg, fg_data, uv, bufy, cf3
    833    mov           bufyq, r1m
    834    mov             uvd, r3m
    835 %endif
    836    imul            uvd, 28
    837    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
    838    movq             m4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
    839 %if WIN64
    840    DEFINE_ARGS shift, bufy, h, buf, max, cf3, min, val3, x, val0
    841 %if %2
    842    lea            bufq, [r0-2*(82*(73-35*%3)+44-(82*3+41))]
    843 %else
    844    lea            bufq, [r0-2*(82*69+3)]
    845 %endif
    846 %else
    847 %if ARCH_X86_64
    848    DEFINE_ARGS buf, bufy, h, shift, max, cf3, min, val3, x, val0
    849 %else
    850    DEFINE_ARGS buf, shift, pic_reg, fg_data, val0, bufy, cf3
    851 %define hd dword r1m
    852 %define mind dword r3m
    853 %define maxd dword r4m
    854 %endif
    855 %if %2
    856    sub            bufq, 2*(82*(73-35*%3)+44-(82*3+41))
    857 %else
    858    sub            bufq, 2*(82*69+3)
    859 %endif
    860 %endif
    861 %if ARCH_X86_64
    862    mov          shiftd, [r2+FGData.ar_coeff_shift]
    863 %else
    864    mov          shiftd, [r3+FGData.ar_coeff_shift]
    865 %endif
        ; sign-extend the coefficient bytes to words and split them into the
        ; two pmaddwd operand vectors m4/m5
    866    pxor             m5, m5
    867    pcmpgtb          m5, m4
    868    punpcklbw        m4, m5                 ; cf0-4 in words
    869    pshuflw          m4, m4, q2100
    870    psrldq           m4, 2                  ; cf0-3,4 in words
    871    pshufd           m5, m4, q1111
    872    pshufd           m4, m4, q0000
    873    movd             m3, [base+round_vals+shiftq*2-12]    ; rnd
    874    pxor             m6, m6
    875    punpcklwd        m3, m6
    876 %if %2
    877    SPLATW           m6, [base+hmul_bits+2+%3*2]
    878 %endif
    879    SPLATD           m3, m3
    880    add           bufyq, 2*(79+82*3)
    881    mov              hd, 70-35*%3
    882    sar            maxd, 1                 ; max_grain = bdmax>>1
    883 %if ARCH_X86_64
    884    mov            mind, maxd
    885    xor            mind, -1                ; min_grain = ~max_grain
    886 %else
    887    DEFINE_ARGS buf, shift, val3, x, val0, bufy, cf3
    888    mov              r2, maxd
    889    xor              r2, -1                 ; min_grain = ~max_grain
    890    mov            mind, r2
    891 %endif
    892 .y_loop_ar1:
    893    mov              xq, -(76>>%2)
    894    movsx         val3d, word [bufq+xq*2-2] ; "left" tap seeded from x=-1
    895 .x_loop_ar1:
    896    movu             m0, [bufq+xq*2-82*2-2] ; top/left
    897 %if %2
    898    movu             m7, [bufyq+xq*4]
    899 %if %3
    900    movu             m1, [bufyq+xq*4+82*2]
    901    phaddw           m7, m1
    902 %else
    903    phaddw           m7, m7
    904 %endif
    905 %else
    906    movq             m7, [bufyq+xq*2]
    907 %endif
    908    psrldq           m2, m0, 2              ; top
    909    psrldq           m1, m0, 4              ; top/right
    910    punpcklwd        m0, m2
    911 %if %2
    912 %if %3
    913    pshufd           m2, m7, q3232
    914    paddw            m7, m2
    915 %endif
        ; scale the phadd-summed subsampled luma back down to one sample
    916    pmulhrsw         m7, m6
    917 %endif
    918    punpcklwd        m1, m7
    919    pmaddwd          m0, m4
    920    pmaddwd          m1, m5
    921    paddd            m0, m1
    922    paddd            m0, m3                 ; m0 = 4 dword predictions (top taps + luma + rnd)
    923 .x_loop_ar1_inner:
    924    movd          val0d, m0                 ; prediction for current x
    925    psrldq           m0, 4                  ; advance to next x's prediction
    926    imul          val3d, cf3d
    927    add           val3d, val0d
    928    sar           val3d, shiftb
    929    movsx         val0d, word [bufq+xq*2]
    930    add           val3d, val0d
    931    cmp           val3d, maxd
    932    cmovg         val3d, maxd
    933    cmp           val3d, mind
    934    cmovl         val3d, mind
    935    mov word [bufq+xq*2], val3w
    936    ; keep val3d in-place as left for next x iteration
    937    inc              xq
    938    jz .x_loop_ar1_end
        ; m0 carried 4 predictions; recompute the vector part every 4 x
    939    test             xq, 3
    940    jnz .x_loop_ar1_inner
    941    jmp .x_loop_ar1
    942 
    943 .x_loop_ar1_end:
    944    add            bufq, 82*2
    945    add           bufyq, 82*2<<%3
    946    dec              hd
    947    jg .y_loop_ar1
    948 %if ARCH_X86_32
    949 %undef maxd
    950 %undef mind
    951 %undef hd
    952 %endif
    953    RET
    954 
    955 .ar2:
        ; --- AR(2) chroma path ---
        ; Prediction: pmaddwd taps over rows y=-2 and y=-1 (coefficients in
        ; m6-m10), a luma term with the rounding constant folded in via the
        ; {luma,1}*{cf,rnd} pairing (m12), and the two already-filtered left
        ; neighbours of the current row applied serially (m11) in
        ; .x_loop_ar2_inner; result is >> ar_coeff_shift and clamped to
        ; [min_grain, max_grain] (m14/m13).
    956 %if ARCH_X86_64
    957    DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
    958 %else
    959    DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift
    960    ALLOC_STACK  -16*8
    961    mov           bufyq, r1m
    962    mov             uvd, r3m
    963 %endif
    964    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    965    imul            uvd, 28
    966 %if ARCH_X86_64
    967    sar          bdmaxd, 1
    968    SPLATW           m5, bdmaxd                 ; max_grain
    969 %else
    970    SPLATW           m5, r4m
    971    psraw            m5, 1
    972 %endif
    973    pcmpeqw          m6, m6
    974 %if !cpuflag(sse4)
        ; m7 = all-ones except word lane 1: mask used to emulate
        ; "pblendw m1, m3, 00000010b" in the inner loop
    975    pcmpeqw          m7, m7
    976    psrldq           m7, 14
    977    pslldq           m7, 2
    978    pxor             m7, m6
    979 %endif
    980    pxor             m6, m5                    ; min_grain
    981 %if %2 && cpuflag(sse4)
    982    SPLATW           m7, [base+hmul_bits+2+%3*2]
    983 %endif
    984 
        ; keep clamp limits and mask/scale live as m13-m15
        ; (registers on x86-64, stack spills on x86-32)
    985 %if ARCH_X86_64
    986    SWAP              5, 13
    987    SWAP              6, 14
    988    SWAP              7, 15
    989 %else
    990 %define m13 [rsp+5*16]
    991 %define m14 [rsp+6*16]
    992 %define m15 [rsp+7*16]
    993    mova            m13, m5
    994    mova            m14, m6
    995    mova            m15, m7
    996 %endif
    997 
    998    ; coef values
    999    movu             m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0]
   1000    pxor             m1, m1
   1001    pcmpgtb          m1, m0
   1002    punpckhbw        m2, m0, m1
   1003    punpcklbw        m0, m1
   1004    pinsrw           m2, [base+round_vals-12+shiftq*2], 5  ; fold rounding constant into coeff word 5
   1005 
   1006    pshufd           m6, m0, q0000
   1007    pshufd           m7, m0, q1111
   1008    pshufd           m1, m0, q3333
   1009    pshufd           m0, m0, q2222
   1010    pshufd           m3, m2, q1111
   1011    pshufd           m4, m2, q2222
   1012    pshufd           m2, m2, q0000
   1013 
   1014 %if ARCH_X86_64
   1015    SWAP              0, 8
   1016    SWAP              1, 9
   1017    SWAP              2, 10
   1018    SWAP              3, 11
   1019    SWAP              4, 12
   1020 %else
   1021 %define m8 [rsp+0*16]
   1022 %define m9 [rsp+1*16]
   1023 %define m10 [rsp+2*16]
   1024 %define m11 [rsp+3*16]
   1025 %define m12 [rsp+4*16]
   1026    mova             m8, m0
   1027    mova             m9, m1
   1028    mova            m10, m2
   1029    mova            m11, m3
   1030    mova            m12, m4
   1031 %endif
   1032 
   1033 %if ARCH_X86_64
   1034    DEFINE_ARGS buf, bufy, fg_data, h, x
   1035 %else
   1036    DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x
   1037 %endif
        ; point buf/bufy at the first sample to generate (stride 82 words)
   1038 %if %2
   1039    sub            bufq, 2*(82*(73-35*%3)+44-(82*3+41))
   1040 %else
   1041    sub            bufq, 2*(82*69+3)
   1042 %endif
   1043    add           bufyq, 2*(79+82*3)
   1044    mov              hd, 70-35*%3
   1045 .y_loop_ar2:
   1046    mov              xq, -(76>>%2)
   1047 
   1048 .x_loop_ar2:
   1049    movu             m0, [bufq+xq*2-82*4-4]     ; y=-2,x=[-2,+5]
   1050    movu             m5, [bufq+xq*2-82*2-4]     ; y=-1,x=[-2,+5]
   1051    psrldq           m4, m0, 2                  ; y=-2,x=[-1,+5]
   1052    psrldq           m1, m0, 4                  ; y=-2,x=[-0,+5]
   1053    psrldq           m3, m0, 6                  ; y=-2,x=[+1,+5]
   1054    psrldq           m2, m0, 8                  ; y=-2,x=[+2,+5]
   1055    punpcklwd        m0, m4                     ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
   1056    punpcklwd        m1, m3                     ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
   1057    punpcklwd        m2, m5                     ; y=-2/-1,x=[+2/-2,+3/-1,+4/+0,+5/+1]
   1058    pmaddwd          m0, m6
   1059    pmaddwd          m1, m7
   1060    pmaddwd          m2, m8
   1061    paddd            m0, m1
   1062    paddd            m0, m2
   1063    psrldq           m3, m5, 2                  ; y=-1,x=[-1,+5]
   1064    psrldq           m1, m5, 4                  ; y=-1,x=[-0,+5]
   1065    psrldq           m4, m5, 6                  ; y=-1,x=[+1,+5]
   1066    psrldq           m2, m5, 8                  ; y=-1,x=[+2,+5]
   1067    punpcklwd        m3, m1
   1068    punpcklwd        m4, m2
   1069    pmaddwd          m3, m9
   1070    pmaddwd          m4, m10
   1071    paddd            m3, m4
   1072    paddd            m0, m3                     ; m0 = top-two-row taps for 4 x
   1073 
   1074    ; luma component & rounding
   1075 %if %2
   1076    movu             m1, [bufyq+xq*4]
   1077 %if %3
   1078    movu             m2, [bufyq+xq*4+82*2]
   1079    phaddw           m1, m2
   1080    pshufd           m2, m1, q3232
   1081    paddw            m1, m2
   1082 %else
   1083    phaddw           m1, m1
   1084 %endif
   1085 %if cpuflag(sse4)
   1086    pmulhrsw         m1, m15
   1087 %elif %3
   1088    pmulhrsw         m1, [base+pw_8192]
   1089 %else
   1090    pmulhrsw         m1, [base+pw_16384]
   1091 %endif
   1092 %else
   1093    movq             m1, [bufyq+xq*2]
   1094 %endif
   1095    punpcklwd        m1, [base+pw_1]        ; {luma,1} pairs: pmaddwd with m12={cf,rnd} adds the round term
   1096    pmaddwd          m1, m12
   1097    paddd            m0, m1
   1098 
   1099    movu             m1, [bufq+xq*2-4]      ; y=0,x=[-2,+5]
   1100    pshufd           m2, m1, q3321
   1101    pxor             m3, m3
   1102    pcmpgtw          m3, m2
   1103    punpcklwd        m2, m3                 ; y=0,x=[0,3] in dword
   1104 .x_loop_ar2_inner:
   1105    pmaddwd          m3, m1, m11
   1106    paddd            m3, m0
   1107    psrldq           m0, 4                  ; shift top to next pixel
   1108    psrad            m3, [fg_dataq+FGData.ar_coeff_shift]
   1109    ; only dword 0 is used; packssdw below just enables the word-wise clamp
   1110    paddd            m3, m2
   1111    packssdw         m3, m3
   1112    pminsw           m3, m13
   1113    pmaxsw           m3, m14
   1114    psrldq           m1, 2
   1115    pslldq           m3, 2
   1116    psrldq           m2, 4
   1117 %if cpuflag(sse4)
   1118    pblendw          m1, m3, 00000010b
   1119 %else
   1120    pand             m1, m15
   1121    pandn            m4, m15, m3
   1122    por              m1, m4
   1123 %endif
   1124    ; overwrite previous pixel, should be ok
   1125    movd  [bufq+xq*2-2], m1
   1126    inc              xq
   1127    jz .x_loop_ar2_end
   1128    test             xq, 3
   1129    jnz .x_loop_ar2_inner
   1130    jmp .x_loop_ar2
   1131 
   1132 .x_loop_ar2_end:
   1133    add            bufq, 82*2
   1134    add           bufyq, 82*2<<%3
   1135    dec              hd
   1136    jg .y_loop_ar2
   1137 %if ARCH_X86_32
   1138 %undef m13
   1139 %undef m14
   1140 %undef m15
   1141 %endif
   1142    RET
   1143 
   1144 .ar3:
        ; --- AR(3) chroma path ---
        ; Prediction taps cover the three rows above (x=-3..+3 each); their
        ; coefficient vectors are staged in [tmp+16*0..5] and m6-m10, with a
        ; luma tap paired into m10 and the rounding constant in m12.  The
        ; three left neighbours of the current row are applied serially in
        ; .x_loop_ar3_inner via m13, then the sum is >> ar_coeff_shift and
        ; clamped to [min_grain, max_grain] (m15/m14).
   1145 %if ARCH_X86_64
   1146    DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
   1147 %if WIN64
        ; align the stack and reserve 96 bytes = 6 x 16 for the staged
        ; coefficient vectors written to [tmp+16*0..5] below
   1148    mov              r6, rsp
   1149    and             rsp, ~15
   1150    sub             rsp, 96
   1151    %define         tmp  rsp
   1152 %else
   1153    %define         tmp  rsp+stack_offset-120
   1154 %endif
   1155 %else
   1156    DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift
   1157    ALLOC_STACK  -16*14
   1158    mov           bufyq, r1m
   1159    mov             uvd, r3m
   1160    %define         tmp  rsp
   1161 %endif
   1162    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
   1163    imul            uvd, 28
   1164    SPLATW           m4, [base+round_vals-12+shiftq*2]
   1165    pxor             m5, m5
   1166    pcmpgtw          m5, m4
   1167    punpcklwd        m4, m5                     ; rounding constant as dwords
   1168 %if ARCH_X86_64
   1169    sar          bdmaxd, 1
   1170    SPLATW           m6, bdmaxd                 ; max_grain
   1171 %else
   1172    SPLATW           m6, r4m
   1173    psraw            m6, 1
   1174 %endif
   1175    pcmpeqw          m7, m7
   1176 %if !cpuflag(sse4)
        ; m3 = all-ones except word lane 2: mask used to emulate
        ; "pblendw m1, m2, 00000100b" in the inner loop
   1177    pcmpeqw          m3, m3
   1178    psrldq           m3, 14
   1179    pslldq           m3, 4
   1180    pxor             m3, m7
   1181 %endif
   1182    pxor             m7, m6                     ; min_grain
   1183 %if %2 && cpuflag(sse4)
   1184    SPLATW           m3, [base+hmul_bits+2+%3*2]
   1185 %endif
   1186 
   1187 %if ARCH_X86_64
   1188    SWAP              3, 11
   1189    SWAP              4, 12
   1190    SWAP              6, 14
   1191    SWAP              7, 15
   1192 %else
   1193 %define m11 [rsp+ 9*16]
   1194 %define m12 [rsp+10*16]
   1195 %define m14 [rsp+12*16]
   1196 %define m15 [rsp+13*16]
   1197    mova            m11, m3
   1198    mova            m12, m4
   1199    mova            m14, m6
   1200    mova            m15, m7
   1201 %endif
   1202 
   1203    ; cf from y=-3,x=-3 until y=-1,x=-2
   1204    movu             m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]
   1205    pxor             m1, m1
   1206    pcmpgtb          m1, m0
   1207    punpckhbw        m2, m0, m1
   1208    punpcklbw        m0, m1
   1209    pshufd           m1, m0, q0000
   1210    pshufd           m3, m0, q1111
   1211    pshufd           m4, m0, q2222
   1212    pshufd           m0, m0, q3333
   1213    pshufd           m5, m2, q0000
   1214    pshufd           m6, m2, q1111
   1215    mova     [tmp+16*0], m1
   1216    mova     [tmp+16*1], m3
   1217    mova     [tmp+16*2], m4
   1218    mova     [tmp+16*3], m0
   1219    mova     [tmp+16*4], m5
   1220    mova     [tmp+16*5], m6
   1221    pshufd           m6, m2, q2222
   1222    pshufd           m7, m2, q3333
   1223 
   1224    ; cf from y=-1,x=-1 to y=0,x=-1 + luma component
   1225    movu             m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+16]
   1226    pxor             m1, m1
   1227    pcmpgtb          m1, m0
   1228    punpckhbw        m2, m0, m1                 ; luma
   1229    punpcklbw        m0, m1
   1230    pshufd           m3, m0, q3232
   1231    psrldq           m5, m0, 10
   1232    ; y=0,x=[-3 to -1] + "1.0" for current pixel
   1233    pinsrw           m5, [base+round_vals-10+shiftq*2], 3
   1234    ; y=-1,x=[-1 to +2]
   1235    pshufd           m1, m0, q0000
   1236    pshufd           m0, m0, q1111
   1237    ; y=-1,x=+3 + luma
   1238    punpcklwd        m3, m2
   1239    pshufd           m3, m3, q0000
   1240 
   1241 %if ARCH_X86_64
   1242    SWAP              1, 8
   1243    SWAP              0, 9
   1244    SWAP              3, 10
   1245    SWAP              5, 13
   1246    DEFINE_ARGS buf, bufy, fg_data, h, x
   1247 %else
   1248 %define m8  [rsp+ 6*16]
   1249 %define m9  [rsp+ 7*16]
   1250 %define m10 [rsp+ 8*16]
   1251 %define m13 [rsp+11*16]
   1252    mova             m8, m1
   1253    mova             m9, m0
   1254    mova            m10, m3
   1255    mova            m13, m5
   1256    DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x
   1257 %endif
        ; point buf/bufy at the first sample to generate (stride 82 words)
   1258 %if %2
   1259    sub            bufq, 2*(82*(73-35*%3)+44-(82*3+41))
   1260 %else
   1261    sub            bufq, 2*(82*69+3)
   1262 %endif
   1263    add           bufyq, 2*(79+82*3)
   1264    mov              hd, 70-35*%3
   1265 .y_loop_ar3:
   1266    mov              xq, -(76>>%2)
   1267 
   1268 .x_loop_ar3:
   1269    ; first line
   1270    movu             m0, [bufq+xq*2-82*6-6+ 0]      ; y=-3,x=[-3,+4]
   1271    movd             m1, [bufq+xq*2-82*6-6+16]      ; y=-3,x=[+5,+6]
   1272    palignr          m2, m1, m0, 2                  ; y=-3,x=[-2,+5]
   1273    palignr          m1, m1, m0, 12                 ; y=-3,x=[+3,+6]
   1274    punpckhwd        m3, m0, m2                     ; y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5]
   1275    punpcklwd        m0, m2                         ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1]
   1276    shufps           m2, m0, m3, q1032              ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3]
   1277 
   1278    pmaddwd          m0, [tmp+0*16]
   1279    pmaddwd          m2, [tmp+1*16]
   1280    pmaddwd          m3, [tmp+2*16]
   1281    paddd            m0, m2
   1282    paddd            m0, m3                         ; first 6 x of top y
   1283 
   1284    ; second line [m0/1 are busy]
   1285    movu             m2, [bufq+xq*2-82*4-6+ 0]      ; y=-2,x=[-3,+4]
   1286    movd             m3, [bufq+xq*2-82*4-6+16]      ; y=-2,x=[+5,+6]
   1287    punpcklwd        m1, m2                         ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0]
   1288    palignr          m4, m3, m2, 2                  ; y=-2,x=[-2,+5]
   1289    palignr          m3, m3, m2, 4                  ; y=-2,x=[-1,+6]
   1290    punpckhwd        m5, m4, m3                     ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6]
   1291    punpcklwd        m4, m3                         ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
   1292    shufps           m3, m4, m5, q1032              ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
   1293    pmaddwd          m1, [tmp+3*16]
   1294    pmaddwd          m4, [tmp+4*16]
   1295    pmaddwd          m3, [tmp+5*16]
   1296    pmaddwd          m5, m6
   1297    paddd            m1, m4
   1298    paddd            m3, m5
   1299    paddd            m0, m1
   1300    paddd            m0, m3                         ; top 2 lines
   1301 
   1302    ; third line [m0 is busy] & luma + round
   1303    movu             m1, [bufq+xq*2-82*2-6+ 0]      ; y=-1,x=[-3,+4]
   1304    movd             m2, [bufq+xq*2-82*2-6+16]      ; y=-1,x=[+5,+6]
   1305 %if %2
   1306    movu             m5, [bufyq+xq*4]
   1307 %if %3
   1308    movu             m4, [bufyq+xq*4+82*2]
   1309    phaddw           m5, m4
   1310 %else
   1311    phaddw           m5, m5
   1312 %endif
   1313 %else
   1314    movq             m5, [bufyq+xq*2]
   1315 %endif
   1316    palignr          m3, m2, m1, 2                  ; y=-1,x=[-2,+5]
   1317    palignr          m2, m2, m1, 12                 ; y=-1,x=[+3,+6]
   1318 %if %3
   1319    pshufd           m4, m5, q3232
   1320    paddw            m5, m4
   1321 %endif
   1322 %if %2
        ; scale the phadd-summed subsampled luma back down to one sample
   1323 %if cpuflag(sse4)
   1324    pmulhrsw         m5, m11
   1325 %elif %3
   1326    pmulhrsw         m5, [base+pw_8192]
   1327 %else
   1328    pmulhrsw         m5, [base+pw_16384]
   1329 %endif
   1330 %endif
   1331    punpckhwd        m4, m1, m3                     ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5]
   1332    punpcklwd        m1, m3                         ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
   1333    shufps           m3, m1, m4, q1032              ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3]
   1334    punpcklwd        m2, m5
   1335    pmaddwd          m1, m7
   1336    pmaddwd          m3, m8
   1337    pmaddwd          m4, m9
   1338    pmaddwd          m2, m10
   1339    paddd            m1, m3
   1340    paddd            m4, m2
   1341    paddd            m0, m12                        ; += round
   1342    paddd            m1, m4
   1343    paddd            m0, m1
   1344 
   1345    movu             m1, [bufq+xq*2-6]      ; y=0,x=[-3,+4]
   1346 .x_loop_ar3_inner:
   1347    pmaddwd          m2, m1, m13
   1348    pshufd           m3, m2, q1111
   1349    paddd            m2, m3                 ; left+cur
   1350    paddd            m2, m0                 ; add top
   1351    psrldq           m0, 4                  ; shift top prediction to next x
   1352    psrad            m2, [fg_dataq+FGData.ar_coeff_shift]
   1353    packssdw         m2, m2
   1354    pminsw           m2, m14
   1355    pmaxsw           m2, m15
   1356    pslldq           m2, 4
   1357    psrldq           m1, 2
   1358 %if cpuflag(sse4)
   1359    pblendw          m1, m2, 00000100b
   1360 %else
   1361    pand             m1, m11
   1362    pandn            m3, m11, m2
   1363    por              m1, m3
   1364 %endif
   1365    ; overwrite previous pixels, should be ok
   1366    movq  [bufq+xq*2-4], m1
   1367    inc              xq
   1368    jz .x_loop_ar3_end
   1369    test             xq, 3
   1370    jnz .x_loop_ar3_inner
   1371    jmp .x_loop_ar3
   1372 
   1373 .x_loop_ar3_end:
   1374    add            bufq, 82*2
   1375    add           bufyq, 82*2<<%3
   1376    dec              hd
   1377    jg .y_loop_ar3
   1378 %if WIN64
   1379    mov             rsp, r6
   1380 %elif ARCH_X86_32
   1381 %undef m8
   1382 %undef m9
   1383 %undef m10
   1384 %undef m11
   1385 %undef m12
   1386 %undef m13
   1387 %undef m14
   1388 %undef m15
   1389 %endif
   1390    RET
   1391 %endmacro
   1392 
        ; instantiate the chroma grain generator for each layout:
        ; args = (name, horizontal subsampling, vertical subsampling),
        ; consumed as %2/%3 inside the macro ("76>>%2", "82*2<<%3")
   1393 generate_grain_uv_fn 420, 1, 1
   1394 generate_grain_uv_fn 422, 1, 0
   1395 generate_grain_uv_fn 444, 0, 0
   1396 
   1397 %macro SCRATCH 3 ; src_mreg, alias_idx, stack_slot
        ; Persist m%1 under the name m%2: a plain register rename (SWAP) on
        ; x86-64 where xmm registers are plentiful; on x86-32 the vector is
        ; spilled to stack slot %3 and m%2 becomes a memory alias for it.
   1398 %if ARCH_X86_64
   1399    SWAP             %1, %2
   1400 %else
   1401    mova [rsp+%3*mmsize], m%1
   1402 %define m%2 [rsp+%3*mmsize]
   1403 %endif
   1404 %endmacro
   1405 
   1406 INIT_XMM ssse3
   1407 %if ARCH_X86_32
   1408 %if STACK_ALIGNMENT < mmsize
   1409 cglobal fgy_32x32xn_16bpc, 0, 7, 8, 0-(8 * mmsize + 12 * gprsize), \
   1410        dst, src, scaling, unused1, fg_data, picptr, unused2
   1411    ; copy stack arguments to new position post-alignment, so that we
   1412    ; don't have to keep the old stack location in a separate register
   1413    mov              r0, r0m
   1414    mov              r1, r2m
   1415    mov              r2, r4m
   1416    mov              r3, r6m
   1417    mov              r4, r7m
   1418    mov              r5, r8m
   1419 
   1420 %define r0m [rsp+8*mmsize+ 3*gprsize]
   1421 %define r2m [rsp+8*mmsize+ 5*gprsize]
   1422 %define r4m [rsp+8*mmsize+ 7*gprsize]
   1423 %define r6m [rsp+8*mmsize+ 9*gprsize]
   1424 %define r7m [rsp+8*mmsize+10*gprsize]
   1425 %define r8m [rsp+8*mmsize+11*gprsize]
   1426 
   1427    mov             r0m, r0
   1428    mov             r2m, r1
   1429    mov             r4m, r2
   1430    mov             r6m, r3
   1431    mov             r7m, r4
   1432    mov             r8m, r5
   1433 %else
   1434 cglobal fgy_32x32xn_16bpc, 0, 7, 8, 8 * mmsize + 4 * gprsize, \
   1435        dst, src, scaling, unused1, fg_data, picptr, unused2
   1436 %endif
   1437    mov            srcq, srcm
   1438    mov        scalingq, r5m
   1439    mov        fg_dataq, r3m
   1440 %if STACK_ALIGNMENT < mmsize
   1441    mov              r6, r9m
   1442 
   1443 %define r9m [rsp+8*mmsize+ 4*gprsize]
   1444 %define r3m [rsp+8*mmsize+ 6*gprsize]
   1445 %define r5m [rsp+8*mmsize+ 8*gprsize]
   1446 
   1447    mov             r9m, r6
   1448 %endif
   1449    LEA              r5, $$
   1450 %define base r5-$$
   1451    mov             r5m, picptrq
   1452 %else
   1453 cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
   1454    lea              r8, [pb_mask]
   1455 %define base r8-pb_mask
   1456 %endif
   1457    mov             r6d, [fg_dataq+FGData.scaling_shift]
   1458    SPLATW           m3, [base+mul_bits+r6*2-14]
   1459    mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
   1460 %if ARCH_X86_32
   1461    DECLARE_REG_TMP   0, 3
   1462 %else
   1463    DECLARE_REG_TMP   9, 10
   1464 %endif
   1465    mov             t0d, r9m        ; bdmax
   1466    sar             t0d, 11         ; is_12bpc
   1467    inc             t0d
   1468    mov             t1d, r6d
   1469    imul            t1d, t0d
   1470    dec             t0d
   1471    SPLATW           m5, [base+min+t1*2]
   1472    lea             t0d, [t0d*3]
   1473    lea             t0d, [r6d*2+t0d]
   1474    SPLATW           m4, [base+max+t0*2]
   1475    SPLATW           m2, r9m
   1476 
   1477    pcmpeqw          m1, m1
   1478    psraw            m7, m2, 1              ; max_grain
   1479    pxor             m1, m7                 ; min_grain
   1480    SPLATD           m6, [base+pd_16]
   1481 
   1482    SCRATCH           1,  9, 0
   1483    SCRATCH           2, 10, 1
   1484    SCRATCH           3, 11, 2
   1485    SCRATCH           4, 12, 3
   1486    SCRATCH           5, 13, 4
   1487    SCRATCH           6, 14, 5
   1488    SCRATCH           7, 15, 6
   1489 
   1490    mova             m6, [base+pw_27_17_17_27]   ; for horizontal filter
   1491 
   1492 %if ARCH_X86_32
   1493    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused2
   1494    DECLARE_REG_TMP   0
   1495 %else
   1496    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \
   1497                sby, see
   1498    DECLARE_REG_TMP   7
   1499 %endif
   1500 
   1501    mov            sbyd, r8m
   1502    movzx           t0d, byte [fg_dataq+FGData.overlap_flag]
   1503    test            t0d, t0d
   1504    jz .no_vertical_overlap
   1505    test           sbyd, sbyd
   1506    jnz .vertical_overlap
   1507 .no_vertical_overlap:
   1508    mov       dword r8m, t0d
   1509 
   1510 %if ARCH_X86_32
   1511    DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused
   1512    imul           seed, (173 << 24) | 37
   1513 %else
   1514    imul           seed, sbyd, (173 << 24) | 37
   1515 %endif
   1516    add            seed, (105 << 24) | 178
   1517    rol            seed, 8
   1518    movzx          seed, seew
   1519    xor            seed, [fg_dataq+FGData.seed]
   1520 
   1521 %if ARCH_X86_32
   1522    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
   1523 
   1524    mov             r3m, seed
   1525    mov              wq, r4m
   1526 %else
   1527    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
   1528                unused1, unused2, see, src_bak
   1529 %endif
   1530 
   1531    lea        src_bakq, [srcq+wq*2]
   1532    mov            r9mp, src_bakq
   1533    neg              wq
   1534    sub           dstmp, srcq
   1535 %if ARCH_X86_32
   1536    mov             r4m, wq
   1537 %endif
   1538 
   1539 .loop_x:
   1540 %if ARCH_X86_32
   1541    mov            seed, r3m
   1542 %endif
   1543    mov             r6d, seed
   1544    or             seed, 0xEFF4
   1545    shr             r6d, 1
   1546    test           seeb, seeh
   1547    lea            seed, [r6+0x8000]
   1548    cmovp          seed, r6d                ; updated seed
   1549 
   1550 %if ARCH_X86_32
   1551    mov             r3m, seed
   1552 
   1553    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
   1554 
   1555    mov           offxd, offyd
   1556 %else
   1557    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
   1558                offx, offy, see, src_bak
   1559 
   1560    mov           offyd, seed
   1561    mov           offxd, seed
   1562 %endif
   1563    ror           offyd, 8
   1564    shr           offxd, 12
   1565    and           offyd, 0xf
   1566    imul          offyd, 164
   1567    lea           offyq, [offyq+offxq*2+747] ; offy*stride+offx
   1568 
   1569 %if ARCH_X86_32
   1570    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
   1571 %else
   1572    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
   1573                h, offxy, see, src_bak
   1574 %endif
   1575 
   1576 .loop_x_odd:
   1577    movzx            hd, word r7m
   1578    mov      grain_lutq, grain_lutmp
   1579 .loop_y:
   1580    ; src
   1581    pand             m0, m10, [srcq+ 0]
   1582    pand             m1, m10, [srcq+16]          ; m0-1: src as word
   1583 
   1584    ; scaling[src]
   1585 %if ARCH_X86_32
   1586    vpgatherdw       m2, m0, scalingq-1, r0, r5, 8, 1, m4
   1587    vpgatherdw       m3, m1, scalingq-1, r0, r5, 8, 1, m4
   1588 %else
   1589    vpgatherdw       m2, m0, scalingq-1, r11, r13, 8, 1, m4
   1590    vpgatherdw       m3, m1, scalingq-1, r11, r13, 8, 1, m4
   1591 %endif
   1592    REPX   {psrlw x, 8}, m2, m3
   1593 
   1594    ; grain = grain_lut[offy+y][offx+x]
   1595    movu             m4, [grain_lutq+offxyq*2]
   1596    movu             m5, [grain_lutq+offxyq*2+16]
   1597 
   1598    ; noise = round2(scaling[src] * grain, scaling_shift)
   1599    REPX {pmullw x, m11}, m2, m3
   1600    pmulhrsw         m4, m2
   1601    pmulhrsw         m5, m3
   1602 
   1603    ; dst = clip_pixel(src, noise)
   1604    paddw            m0, m4
   1605    paddw            m1, m5
   1606    pmaxsw           m0, m13
   1607    pmaxsw           m1, m13
   1608    pminsw           m0, m12
   1609    pminsw           m1, m12
   1610    movifnidn      dstq, dstmp
   1611    mova [dstq+srcq+ 0], m0
   1612    mova [dstq+srcq+16], m1
   1613 
   1614    add            srcq, r2mp               ; src += stride
   1615    add      grain_lutq, 82*2
   1616    dec              hd
   1617    jg .loop_y
   1618 
   1619 %if ARCH_X86_32
   1620    add            r4mp, 16
   1621 %else
   1622    add              wq, 16
   1623 %endif
   1624    jge .end
   1625 %if ARCH_X86_32
   1626    mov            srcq, r9mp
   1627    add            srcq, r4mp
   1628    add            srcq, r4mp
   1629 %else
   1630    mov        src_bakq, r9mp
   1631    lea            srcq, [src_bakq+wq*2]
   1632 %endif
   1633    btc       dword r8m, 2
   1634    jc .next_blk
   1635    add          offxyd, 16
   1636    test      dword r8m, 2
   1637    jz .loop_x_odd
   1638 %if ARCH_X86_32
   1639    add dword [rsp+8*mmsize+1*gprsize], 16
   1640 %else
   1641    add            r12d, 16                 ; top_offxy += 16
   1642 %endif
   1643    jmp .loop_x_odd_v_overlap
   1644 
   1645 .next_blk:
   1646    test      dword r8m, 1
   1647    jz .loop_x
   1648 
   1649    ; r8m = sbym
   1650    test      dword r8m, 2
   1651    jnz .loop_x_hv_overlap
   1652 
   1653    ; horizontal overlap (without vertical overlap)
   1654 .loop_x_h_overlap:
   1655 %if ARCH_X86_32
   1656    add          offxyd, 16
   1657    mov [rsp+8*mmsize+0*gprsize], offxyd
   1658    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
   1659    mov            seed, r3m
   1660 %endif
   1661 
   1662    mov             r6d, seed
   1663    or             seed, 0xEFF4
   1664    shr             r6d, 1
   1665    test           seeb, seeh
   1666    lea            seed, [r6+0x8000]
   1667    cmovp          seed, r6d                ; updated seed
   1668 
   1669 %if ARCH_X86_32
   1670    mov             r3m, seed
   1671 
   1672    DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx
   1673 
   1674    mov           offxd, offyd
   1675 %else
   1676    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
   1677                offx, offy, see, src_bak, left_offxy
   1678 
   1679    lea     left_offxyd, [offyd+16]         ; previous column's offy*stride+offx
   1680 
   1681    mov           offyd, seed
   1682    mov           offxd, seed
   1683 %endif
   1684    ror           offyd, 8
   1685    shr           offxd, 12
   1686    and           offyd, 0xf
   1687    imul          offyd, 164
   1688    lea           offyq, [offyq+offxq*2+747] ; offy*stride+offx
   1689 
   1690 %if ARCH_X86_32
   1691    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
   1692 %else
   1693    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
   1694                h, offxy, see, src_bak, left_offxy
   1695 %endif
   1696 
   1697    mov              hd, dword r7m
   1698    mov      grain_lutq, grain_lutmp
   1699 .loop_y_h_overlap:
   1700    ; grain = grain_lut[offy+y][offx+x]
   1701    movu             m5, [grain_lutq+offxyq*2]
   1702 %if ARCH_X86_32
   1703    mov              r5, [rsp+8*mmsize+0*gprsize]
    ; tail of .loop_y_h_overlap: blend the previous column's grain into the
    ; leftmost pixels of the current grain row.  m5 holds the current grain
    ; row and m6 the horizontal blend weights (both set up before this
    ; excerpt -- NOTE(review): confirm against the full function).
    movd             m4, [grain_lutq+r5*2]
%else
    movd             m4, [grain_lutq+left_offxyq*2]
%endif
    ; (left*w0 + cur*w1 + bias(m14)) >> 5, clamped to [m9, m15], then the
    ; blended low pixels are merged back with the untouched high pixels
    punpcklwd        m4, m5
    pmaddwd          m4, m6
    paddd            m4, m14
    psrad            m4, 5
    packssdw         m4, m4
    pminsw           m4, m15
    pmaxsw           m4, m9
    shufps           m4, m5, q3210

    ; src
    pand             m0, m10, [srcq+ 0]
    pand             m1, m10, [srcq+16]          ; m0-1: src as word

    ; scaling[src] (byte LUT gathered per pixel; -1 + psrlw 8 extracts it)
%if ARCH_X86_32
    vpgatherdw       m2, m0, scalingq-1, r0, r5, 8, 1, m5
    vpgatherdw       m3, m1, scalingq-1, r0, r5, 8, 1, m5
%else
    vpgatherdw       m2, m0, scalingq-1, r13, r14, 8, 1, m5
    vpgatherdw       m3, m1, scalingq-1, r13, r14, 8, 1, m5
%endif
    REPX   {psrlw x, 8}, m2, m3

    ; noise = round2(scaling[src] * grain, scaling_shift)
    movu             m5, [grain_lutq+offxyq*2+16]
    REPX {pmullw x, m11}, m2, m3                  ; m11 = scaling_shift multiplier
    pmulhrsw         m4, m2
    pmulhrsw         m5, m3

    ; dst = clip_pixel(src, noise), clamped to [m13, m12]
    paddw            m0, m4
    paddw            m1, m5
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    movifnidn      dstq, dstmp
    mova [dstq+srcq+ 0], m0                       ; dstmp holds dst-src, so
    mova [dstq+srcq+16], m1                       ; dst+src = real dst pointer

    add            srcq, r2mp                     ; next row (r2mp = stride)
    add      grain_lutq, 82*2                     ; grain_lut rows are 82 words
    dec              hd
    jg .loop_y_h_overlap

    ; advance to the next 16-pixel column (w counts up from -w to 0)
%if ARCH_X86_32
    add            r4mp, 16
%else
    add              wq, 16
%endif
    jge .end
%if ARCH_X86_32
    mov            srcq, r9mp                     ; r9mp = src + 2*w (saved base)
    add            srcq, r4mp
    add            srcq, r4mp
%else
    mov        src_bakq, r9mp
    lea            srcq, [src_bakq+wq*2]
%endif
    or        dword r8m, 4                        ; mark odd-column state in the flag word
    add          offxyd, 16

    ; r8m = sbym
    test      dword r8m, 2
    jz .loop_x_odd
%if ARCH_X86_32
    add dword [rsp+8*mmsize+1*gprsize], 16
%else
    add            r12d, 16                 ; top_offxy += 16
%endif
    jmp .loop_x_odd_v_overlap

.end:
    RET
   1782 
.vertical_overlap:
    or              t0d, 2                  ; remember v-overlap in the flag word
    mov             r8m, t0d

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \
                sby, see
%endif

    ; derive two 16-bit seeds at once from FGData.seed and sby, packed as
    ; (cur_seed << 16) | top_seed; the 0x00010001 multiplies duplicate the
    ; base seed into both halves before per-row perturbation
    movzx          sbyd, sbyb
%if ARCH_X86_32
    imul             r4, [fg_dataq+FGData.seed], 0x00010001
    DEFINE_ARGS dst, src, scaling, sby, see, picptr, unused
%else
    imul           seed, [fg_dataq+FGData.seed], 0x00010001
%endif
    imul            t0d, sbyd, 173 * 0x00010001
    imul           sbyd, 37 * 0x01000100
    add             t0d, (105 << 16) | 188
    add            sbyd, (178 << 24) | (141 << 8)
    and             t0d, 0x00ff00ff
    and            sbyd, 0xff00ff00
    xor            seed, t0d
%if ARCH_X86_32
    xor            sbyd, seed

    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak

    mov             r3m, seed
    mov              wq, r4m
%else
    xor            seed, sbyd               ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                unused1, unused2, see, src_bak
%endif

    ; src_bak = src + 2*w; w is negated so it counts up toward zero,
    ; and dst is stored as (dst - src) so one pointer walks both planes
    lea        src_bakq, [srcq+wq*2]
    mov            r9mp, src_bakq
    neg              wq
    sub           dstmp, srcq
%if ARCH_X86_32
    mov             r4m, wq
%endif
   1829 
.loop_x_v_overlap:
%if ARCH_X86_32
    mov              r5, r5m
    SPLATD           m7, [base+pw_27_17_17_27]   ; first-row vertical blend weights
    mov            seed, r3m
%else
    SPLATD           m7, [pw_27_17_17_27]
%endif

    ; advance both 16-bit LFSR seeds (top and cur) in one pass; each half is
    ; shifted right by one with the parity of its low/high byte pair feeding
    ; the new top bit (0xeff4 masks in the feedback taps for the parity test)
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            t0b                     ; parity of top_seed
    shr            seed, 16
    shl             t0d, 16
    test           seeb, seeh
    setp            t0b                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor             t0d, r6d
    mov            seed, t0d
    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed

%if ARCH_X86_32
    mov             r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx

    mov           offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, src_bak, unused, top_offxy
    mov           offyd, seed
    mov           offxd, seed
%endif
    ; extract two 4-bit x/y offsets per seed half (cur and top simultaneously)
    ror           offyd, 8
    ror           offxd, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyq, [offyq+offxq*2+0x10001*747+32*82]

%if ARCH_X86_32
    DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, src_bak, unused, top_offxy
%endif

    ; split the packed pair: low word -> top row offset, high word -> current
    movzx    top_offxyd, offxyw
%if ARCH_X86_32
    mov [rsp+8*mmsize+1*gprsize], top_offxyd

    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif
    shr          offxyd, 16
   1888 
.loop_x_odd_v_overlap:
%if ARCH_X86_32
    mov              r5, r5m
%endif
    SPLATD           m7, [PIC_ptr(pw_27_17_17_27)]
    mov              hd, dword r7m
    mov      grain_lutq, grain_lutmp
.loop_y_v_overlap:
    ; grain = grain_lut[offy+y][offx+x]
    movu             m3, [grain_lutq+offxyq*2]
%if ARCH_X86_32
    mov              r5, [rsp+8*mmsize+1*gprsize]    ; top_offxy spill slot
    movu             m2, [grain_lutq+r5*2]
%else
    movu             m2, [grain_lutq+top_offxyq*2]
%endif
    ; vertical blend of top and current grain rows:
    ; (top*w0 + cur*w1 + m14) >> 5, clamped to [m9, m15]
    punpckhwd        m4, m2, m3
    punpcklwd        m2, m3
    REPX {pmaddwd x, m7}, m4, m2
    REPX {paddd   x, m14}, m4, m2
    REPX {psrad   x, 5}, m4, m2
    packssdw         m2, m4
    pminsw           m2, m15
    pmaxsw           m2, m9
    movu             m4, [grain_lutq+offxyq*2+16]
%if ARCH_X86_32
    movu             m3, [grain_lutq+r5*2+16]
%else
    movu             m3, [grain_lutq+top_offxyq*2+16]
%endif
    punpckhwd        m5, m3, m4
    punpcklwd        m3, m4
    REPX {pmaddwd x, m7}, m5, m3
    REPX {paddd   x, m14}, m5, m3
    REPX {psrad   x, 5}, m5, m3
    packssdw         m3, m5
    pminsw           m3, m15
    pmaxsw           m3, m9

    ; src
    pand             m0, m10, [srcq+ 0]          ; m0-1: src as word
    pand             m1, m10, [srcq+16]          ; m0-1: src as word

    ; scaling[src]
    ; noise = round2(scaling[src] * grain, scaling_shift)
%if ARCH_X86_32
    vpgatherdw       m4, m0, scalingq-1, r0, r5, 8, 1, m5
%else
    vpgatherdw       m4, m0, scalingq-1, r11, r13, 8, 1, m5
%endif
    psrlw            m4, 8
    pmullw           m4, m11
    pmulhrsw         m4, m2
%if ARCH_X86_32
    vpgatherdw       m5, m1, scalingq-1, r0, r5, 8, 1, m2
%else
    vpgatherdw       m5, m1, scalingq-1, r11, r13, 8, 1, m2
%endif
    psrlw            m5, 8
    pmullw           m5, m11
    pmulhrsw         m5, m3

    ; dst = clip_pixel(src, noise)
    paddw            m0, m4
    paddw            m1, m5
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    movifnidn      dstq, dstmp
    mova [dstq+srcq+ 0], m0
    mova [dstq+srcq+16], m1

    add            srcq, r2mp
    add      grain_lutq, 82*2
    dec              hw
    jz .end_y_v_overlap
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines
%if ARCH_X86_32
    mov              r5, r5m
%endif
    SPLATD           m7, [PIC_ptr(pw_27_17_17_27)+4]  ; second-row weights (17/27)
    ; bit 16 of hd acts as a 2-iteration counter: the first toggle sets it
    ; (run the second overlap row), the second clears it and we fall through
    ; to the plain per-row loop
    xor              hd, 0x10000
    test             hd, 0x10000
    jnz .loop_y_v_overlap
    jmp .loop_y

.end_y_v_overlap:
%if ARCH_X86_32
    add            r4mp, 16
%else
    add              wq, 16
%endif
    jge .end_hv
%if ARCH_X86_32
    mov            srcq, r9mp
    add            srcq, r4mp
    add            srcq, r4mp
%else
    mov        src_bakq, r9mp
    lea            srcq, [src_bakq+wq*2]
%endif
    ; toggle the odd-column bit; every second column re-derives fresh
    ; offsets (with h-overlap), the others just shift the current ones
    btc       dword r8m, 2
    jc .next_blk_v
%if ARCH_X86_32
    add dword [rsp+8*mmsize+1*gprsize], 16
%else
    add      top_offxyd, 16
%endif
    add          offxyd, 16
    jmp .loop_x_odd_v_overlap
   2001 
.next_blk_v:
    ; since fg_dataq.overlap is guaranteed to be set, we never jump
    ; back to .loop_x_v_overlap, and instead always fall-through to
    ; h+v overlap

.loop_x_hv_overlap:
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak

    ; previous column's cur/top offsets (+16) become this column's
    ; left/topleft offsets, kept in dedicated stack slots on x86-32
    mov              r0, [rsp+8*mmsize+1*gprsize]
    add              r3, 16
    add              r0, 16
    mov [rsp+8*mmsize+0*gprsize], r3 ; left_offxy
    mov [rsp+8*mmsize+2*gprsize], r0 ; topleft_offxy

    mov            seed, r3m
    xor              r0, r0
%else
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
%endif
    ; dual-seed LFSR update, identical to .loop_x_v_overlap
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            t0b                     ; parity of top_seed
    shr            seed, 16
    shl             t0d, 16
    test           seeb, seeh
    setp            t0b                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor             t0d, r6d
    mov            seed, t0d
    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed

%if ARCH_X86_32
    mov             r3m, seed

    DEFINE_ARGS  dst, src, scaling, offy, w, picptr, offx

    mov           offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy

    lea  topleft_offxyq, [top_offxyq+16]
    lea     left_offxyq, [offyq+16]
    mov           offyd, seed
    mov           offxd, seed
%endif
    ror           offyd, 8
    ror           offxd, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyq, [offyq+offxq*2+0x10001*747+32*82]

%if ARCH_X86_32
    DEFINE_ARGS top_offxy, src, scaling, offxy, w, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy
%endif

    movzx    top_offxyd, offxyw
%if ARCH_X86_32
    mov [rsp+8*mmsize+1*gprsize], top_offxyd

    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif
    shr          offxyd, 16
   2072 
%if ARCH_X86_32
    mov              r5, r5m
%endif
    SPLATD           m7, [PIC_ptr(pw_27_17_17_27)]   ; first-row vertical weights

    movzx            hd, word r7m                    ; h (row count) from stack arg
    mov      grain_lutq, grain_lutmp
.loop_y_hv_overlap:
    ; grain = grain_lut[offy+y][offx+x]
    movu             m2, [grain_lutq+offxyq*2]
%if ARCH_X86_32
    mov              r0, [rsp+8*mmsize+1*gprsize] ; top_offxy
    mov              r5, [rsp+8*mmsize+0*gprsize] ; left_offxy
    movu             m4, [grain_lutq+r0*2]
    movd             m5, [grain_lutq+r5*2]
    mov              r5, [rsp+8*mmsize+2*gprsize] ; topleft_offxy
    movd             m3, [grain_lutq+r5*2]
%else
    movu             m4, [grain_lutq+top_offxyq*2]
    movd             m5, [grain_lutq+left_offxyq*2]
    movd             m3, [grain_lutq+topleft_offxyq*2]
%endif
    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklwd        m5, m2
    punpcklwd        m3, m4
    REPX {pmaddwd x, m6}, m5, m3          ; m6 = horizontal blend weights
    REPX {paddd   x, m14}, m5, m3
    REPX {psrad   x, 5}, m5, m3
    packssdw         m5, m3
    pminsw           m5, m15
    pmaxsw           m5, m9
    shufps           m3, m5, m2, q3210    ; merge blended pixel into cur row
    shufps           m5, m4, q3232        ; merge blended pixel into top row
    ; followed by v interpolation (top | cur -> cur)
    movu             m0, [grain_lutq+offxyq*2+16]
%if ARCH_X86_32
    movu             m1, [grain_lutq+r0*2+16]
%else
    movu             m1, [grain_lutq+top_offxyq*2+16]
%endif
    punpcklwd        m2, m5, m3
    punpckhwd        m5, m3
    punpcklwd        m3, m1, m0
    punpckhwd        m1, m0
    REPX {pmaddwd x, m7}, m2, m5, m3, m1
    REPX {paddd   x, m14}, m2, m5, m3, m1
    REPX {psrad   x, 5}, m2, m5, m3, m1
    packssdw         m2, m5
    packssdw         m3, m1
    REPX {pminsw x, m15}, m2, m3
    REPX {pmaxsw x, m9}, m2, m3

    ; src
    pand             m0, m10, [srcq+ 0]
    pand             m1, m10, [srcq+16]          ; m0-1: src as word

    ; scaling[src]
    ; noise = round2(scaling[src] * grain, scaling_shift)
%if ARCH_X86_32
    vpgatherdw       m4, m0, scalingq-1, r0, r5, 8, 1, m5
%else
    vpgatherdw       m4, m0, scalingq-1, r14, r10, 8, 1, m5
%endif
    psrlw            m4, 8
    pmullw           m4, m11
    pmulhrsw         m2, m4
%if ARCH_X86_32
    vpgatherdw       m5, m1, scalingq-1, r0, r5, 8, 1, m4
%else
    vpgatherdw       m5, m1, scalingq-1, r14, r10, 8, 1, m4
%endif
    psrlw            m5, 8
    pmullw           m5, m11
    pmulhrsw         m3, m5

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m1, m3
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    movifnidn      dstq, dstmp
    mova [dstq+srcq+ 0], m0
    mova [dstq+srcq+16], m1

    add            srcq, r2mp
    add      grain_lutq, 82*2
    dec              hw
    jz .end_y_hv_overlap
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines
%if ARCH_X86_32
    mov              r5, r5m
%endif
    SPLATD           m7, [PIC_ptr(pw_27_17_17_27)+4]
    ; bit 16 of hd = 2-iteration counter, same trick as .loop_y_v_overlap
    xor              hd, 0x10000
    test             hd, 0x10000
    jnz .loop_y_hv_overlap
    jmp .loop_y_h_overlap

.end_y_hv_overlap:
    or        dword r8m, 4
%if ARCH_X86_32
    add            r4mp, 16
%else
    add              wq, 16
%endif
    jge .end_hv
%if ARCH_X86_32
    mov              r5, r5m
    add          offxyd, 16
    add dword [rsp+8*mmsize+1*gprsize], 16 ; top_offxy += 16
    mov            srcq, r9mp
    add            srcq, r4mp
    add            srcq, r4mp
%else
    add          offxyd, 16
    add      top_offxyd, 16
    mov        src_bakq, r9mp
    lea            srcq, [src_bakq+wq*2]
%endif
    jmp .loop_x_odd_v_overlap
   2196 
.end_hv:
    RET
%if ARCH_X86_32
    ; x86-32: declare stack-backed homes for r0-r9 (DECLARE_ARG macro --
    ; NOTE(review): defined in filmgrain_common.asm, outside this excerpt)
    DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
%endif
   2202 
%macro FGUV_FN 3 ; name, ss_hor, ss_ver
; Chroma film-grain entry point, instantiated per subsampling mode
; (ss_hor/ss_ver select 4:2:0 / 4:2:2 / 4:4:4 handling).
INIT_XMM ssse3
%if ARCH_X86_32
%if STACK_ALIGNMENT < mmsize
; Unaligned-stack x86-32: copy all incoming stack args into our own
; aligned spill area so the rNm aliases below are stable.
cglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 0-(8 * mmsize + 16 * gprsize), \
        tmp, src, scaling, h, fg_data, picptr, unused
    mov              r0, r0m
    mov              r1, r1m
    mov              r2, r2m
    mov              r4, r3m
    mov              r3, r4m
    mov              r5, r5m
%define r0m [rsp+8*mmsize+ 3*gprsize]
%define r1m [rsp+8*mmsize+ 4*gprsize]
%define r2m [rsp+8*mmsize+ 5*gprsize]
%define r3m [rsp+8*mmsize+ 6*gprsize]
%define r4m [rsp+8*mmsize+ 7*gprsize]
%define r5m [rsp+8*mmsize+ 8*gprsize]
    mov             r0m, r0
    mov             r2m, r2
    mov             r4m, r3
    mov             r5m, r5

    mov              r0, r6m
    mov              r2, r7m
    mov              r3, r8m
    mov              r5, r9m
%define r6m [rsp+8*mmsize+ 9*gprsize]
%define r7m [rsp+8*mmsize+10*gprsize]
%define r8m [rsp+8*mmsize+11*gprsize]
%define r9m [rsp+8*mmsize+12*gprsize]
    mov             r6m, r0
    mov             r7m, r2
    mov             r8m, r3
    mov             r9m, r5

    mov              r2, r10m
    mov              r3, r11m
    mov              r5, r12m
    mov              r0, r13m
%define r10m [rsp+8*mmsize+13*gprsize]
%define r11m [rsp+8*mmsize+14*gprsize]
%define r12m [rsp+8*mmsize+15*gprsize]
    mov            r10m, r2
    mov            r11m, r3
    mov            r12m, r5

    SPLATW           m2, r13m               ; m2 = bdmax broadcast
%else
cglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 8 * mmsize + (4) * gprsize, \
        tmp, src, scaling, h, fg_data, picptr, unused
    mov            srcq, srcm
    mov        fg_dataq, r3m
%endif
    LEA              r5, $$
%define base r5-$$

    DECLARE_REG_TMP   0, 2, 3
%else
cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
                                     grain_lut, h, sby, luma, lstride, uv_pl, is_id
%define base r8-pb_mask
    lea              r8, [pb_mask]

    DECLARE_REG_TMP   9, 10, 11
%endif
    mov             r6d, [fg_dataq+FGData.scaling_shift]
    SPLATW           m3, [base+mul_bits+r6*2-14]
    mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
%if STACK_ALIGNMENT >= mmsize
    mov             t0d, r13m               ; bdmax
%endif
    sar             t0d, 11                 ; is_12bpc
    inc             t0d
    ; t1 indexes the min/max clip tables from is_12bpc, restricted-range
    ; and is_id (r12m) -- NOTE(review): table layout is in the rodata
    ; section outside this excerpt
    mov             t1d, r6d
    imul            t1d, t0d
    dec             t0d
    SPLATW           m5, [base+min+t1*2]
    lea             t1d, [t0d*3]
    mov             t2d, r12m
    inc             t2d
    imul            r6d, t2d
    add             t1d, r6d
    SPLATW           m4, [base+max+t1*2]
%if STACK_ALIGNMENT >= mmsize
    SPLATW           m2, r13m
%endif

    SCRATCH           2, 10, 2              ; m10 = bdmax
    SCRATCH           3, 11, 3              ; m11 = scaling_shift multiplier
    SCRATCH           4, 12, 4              ; m12 = clip max
    SCRATCH           5, 13, 5              ; m13 = clip min

%define mzero m7

%if %3
    SPLATD           m2, [base+pw_23_22]    ; ss_ver: 23/22 overlap weights
%endif

%if ARCH_X86_32
    mov        scalingq, r5m
    mov             r5m, r5                 ; keep PIC base reachable via r5m
%else
    mov           r13mp, strideq
%endif

    ; m8 = grain max (bdmax>>1 ^ -1), m9 = grain min (bdmax>>1)
    pcmpeqw          m0, m0
    psraw            m1, m10, 1
    pxor             m0, m1

    SCRATCH           0,  8, 0
    SCRATCH           1,  9, 1

    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
    jne .csfl
   2318 
%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_h, ss_v
; Inner macro shared by the csfl / not-csfl paths; %1 selects whether the
; chroma scaling input is derived from luma via uv_mult/uv_luma_mult.
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap

    DECLARE_REG_TMP    0
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap

    DECLARE_REG_TMP    9
%endif

%if %1
    ; not-csfl: build the packed {uv_luma_mult, uv_mult} coefficients and
    ; the scaled uv_offset for the per-pixel luma/chroma mix below
    mov             r6d, r11m
    SPLATW           m0, [fg_dataq+FGData.uv_mult+r6*4]
    SPLATW           m1, [fg_dataq+FGData.uv_luma_mult+r6*4]
    punpcklwd        m6, m1, m0
    SPLATW           m5, [fg_dataq+FGData.uv_offset+r6*4]
    SPLATD           m7, [base+pw_4+t0*4]
    pmullw           m5, m7
%else
    SPLATD           m6, [base+pd_16]       ; rounding bias for overlap blends
%if %2
    mova             m5, [base+pw_23_22]
%else
    mova             m5, [base+pw_27_17_17_27]
%endif
%endif

    SCRATCH           6, 14, 6
    SCRATCH           5, 15, 7

%if ARCH_X86_32
    DECLARE_REG_TMP   0
%else
    DECLARE_REG_TMP   7
%endif

    mov            sbyd, r8m
    mov             t0d, [fg_dataq+FGData.overlap_flag]
    test            t0d, t0d
    jz %%no_vertical_overlap
    test           sbyd, sbyd
    jnz %%vertical_overlap

%%no_vertical_overlap:
    mov             r8m, t0d
    ; scalar seed derivation from sby (single 16-bit seed; the dual-seed
    ; variant lives under %%vertical_overlap)
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap
    imul           seed, (173 << 24) | 37
%else
    imul           seed, sbyd, (173 << 24) | 37
%endif
    add            seed, (105 << 24) | 178
    rol            seed, 8
    movzx          seed, seew
    xor            seed, [fg_dataq+FGData.seed]
%if ARCH_X86_32
    mov             r3m, seed

    DEFINE_ARGS dst, src, scaling, see, w, picptr, luma

    ; save end-of-row pointers for src/dst/luma; luma stride is scaled by
    ; the horizontal subsampling factor (2<<%2 bytes per chroma pixel)
    mov            dstq, r0mp
    mov           lumaq, r9mp
    mov              wq, r4m
    lea              r3, [srcq+wq*2]
    mov            r1mp, r3
    lea              r3, [dstq+wq*2]
    mov           r11mp, r3
    lea              r3, [lumaq+wq*(2<<%2)]
    mov           r12mp, r3
%if %3
    shl           r10mp, 1
%endif
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                unused2, unused3, see, unused4, unused5, unused6, luma, lstride

    mov        lstrideq, r10mp
%if %3
    add        lstrideq, lstrideq           ; ss_ver: skip every other luma row
%endif
    mov           lumaq, r9mp
    lea             r10, [srcq+wq*2]
    lea             r11, [dstq+wq*2]
    lea             r12, [lumaq+wq*(2<<%2)]
    mov           r10mp, r10
    mov           r11mp, r11
    mov           r12mp, r12
%endif
    neg              wq                     ; w counts up from -w to 0
%if ARCH_X86_32
    mov           r4mp, wq
%endif
   2412 
%%loop_x:
%if ARCH_X86_32
    mov            seed, r3m
%endif

    ; advance the 16-bit LFSR once: shift right by one, with the parity of
    ; the tapped bits (mask 0xEFF4) selecting whether 0x8000 is fed back in
    mov             r6d, seed
    or             seed, 0xEFF4
    shr             r6d, 1
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d               ; updated seed

%if ARCH_X86_32
    mov             r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx

    mov           offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, unused1, unused2, unused3, luma, lstride

    mov           offxd, seed
    mov           offyd, seed
%endif
    ; 4-bit x/y grain offsets; stride 164>>%3 and the constant term are the
    ; subsampling-adjusted equivalents of the luma 164/747 values
    ror           offyd, 8
    shr           offxd, 12
    and           offyd, 0xf
    imul          offyd, 164>>%3
    lea           offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)]  ; offy*stride+offx

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, unused1, unused2, unused3, luma, lstride
%endif
   2450 
%if %2 == 0
%%loop_x_odd:
%endif
    mov              hd, r7m
    mov      grain_lutq, grain_lutmp
%%loop_y:
    ; src
    mova             m0, [srcq]
    mova             m1, [srcq+16]          ; m0-1: src as word

    ; luma_src
    pxor          mzero, mzero
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut

    mov           lumaq, r9m
%endif
    mova             m4, [lumaq+ 0]
    mova             m6, [lumaq+(16<<%2)]
%if %2
    ; ss_h: fold horizontal luma pairs -- phaddw sums neighbours, then
    ; pavgw with zero gives the rounded average (a+b+1)>>1
    phaddw           m4, [lumaq+16]
    phaddw           m6, [lumaq+48]
%endif
%if ARCH_X86_32
    add           lumaq, r10mp
    mov             r9m, lumaq
%endif
%if %2
    pavgw            m4, mzero
    pavgw            m6, mzero
%endif

%if %1
    ; not-csfl: mix = (luma*uv_luma_mult + chroma*uv_mult) >> 6 + offset
    punpckhwd        m3, m4, m0
    punpcklwd        m4, m0
    punpckhwd        m5, m6, m1
    punpcklwd        m6, m1                 ; { luma, chroma }
    REPX {pmaddwd x, m14}, m3, m4, m5, m6
    REPX {psrad   x, 6}, m3, m4, m5, m6
    packssdw         m4, m3
    packssdw         m6, m5
    REPX {paddw x, m15}, m4, m6
    REPX {pmaxsw x, mzero}, m4, m6
    REPX {pminsw x, m10}, m4, m6             ; clip_pixel()
%else
    REPX  {pand x, m10}, m4, m6
%endif

    ; scaling[luma_src]
%if ARCH_X86_32
    vpgatherdw       m3, m4, scalingq-1, r0, r5, 8, 1
    vpgatherdw       m5, m6, scalingq-1, r0, r5, 8, 1
%else
    vpgatherdw       m3, m4, scalingq-1, r10, r12, 8, 1
    vpgatherdw       m5, m6, scalingq-1, r10, r12, 8, 1
%endif
    REPX   {psrlw x, 8}, m3, m5

    ; grain = grain_lut[offy+y][offx+x]
    movu             m4, [grain_lutq+offxyq*2]
    movu             m6, [grain_lutq+offxyq*2+16]

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    REPX {pmullw x, m11}, m3, m5
    pmulhrsw         m4, m3
    pmulhrsw         m6, m5

    ; dst = clip_pixel(src, noise)
    paddw            m0, m4
    paddw            m1, m6
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    movifnidn      dstq, dstmp
    mova      [dstq+ 0], m0
    mova      [dstq+16], m1

%if ARCH_X86_32
    add            srcq, r2mp
    add            dstq, r2mp
    mov           dstmp, dstq
%else
    add            srcq, r13mp
    add            dstq, r13mp
    add           lumaq, lstrideq
%endif
    add      grain_lutq, 82*2
    dec              hd
    jg %%loop_y

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, w, picptr, luma

    mov              wq, r4mp
%endif
    add              wq, 16
    jge %%end
    ; rewind src/dst/luma to the saved row bases and step to the next column
%if ARCH_X86_32
    mov            srcq, r1mp
%else
    mov            srcq, r10mp
%endif
    mov            dstq, r11mp
    mov           lumaq, r12mp
    lea            srcq, [srcq+wq*2]
    lea            dstq, [dstq+wq*2]
    lea           lumaq, [lumaq+wq*(2<<%2)]
%if ARCH_X86_32
    mov             r0m, dstq
    mov             r9m, lumaq
    mov             r4m, wq
%endif
%if %2 == 0
    ; unsubsampled: alternate columns reuse the shifted offsets (odd-column
    ; bit toggled via btc) instead of re-deriving them
    btc       dword r8m, 2
    jc %%next_blk
    add          offxyd, 16
    test      dword r8m, 2
    jz %%loop_x_odd
%if ARCH_X86_32
    add dword [rsp+8*mmsize+1*gprsize], 16
%else
    add            r11d, 16
%endif
    jmp %%loop_x_odd_v_overlap
%%next_blk:
%endif
    test      dword r8m, 1
    je %%loop_x

    ; r8m = sbym
    test      dword r8m, 2
    jnz %%loop_x_hv_overlap
   2584 
    ; horizontal overlap (without vertical overlap)
%%loop_x_h_overlap:
%if ARCH_X86_32
    add          offxyd, 16
    mov [rsp+8*mmsize+0*gprsize], offxyd   ; left_offxy spill slot

    DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut

    mov            seed, r3m
%endif
    ; single-seed LFSR advance, same as %%loop_x
    mov             r6d, seed
    or             seed, 0xEFF4
    shr             r6d, 1
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d               ; updated seed

%if ARCH_X86_32
    mov             r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx

    mov           offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, unused1, unused2, luma, lstride

    lea     left_offxyd, [offyd+16]         ; previous column's offy*stride+offx
    mov           offxd, seed
    mov           offyd, seed
%endif
    ror           offyd, 8
    shr           offxd, 12
    and           offyd, 0xf
    imul          offyd, 164>>%3
    lea           offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)]  ; offy*stride+offx

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, unused1, unused2, luma, lstride
%endif

    mov              hd, r7m
    mov      grain_lutq, grain_lutmp
%%loop_y_h_overlap:
    mova             m0, [srcq]
    mova             m1, [srcq+16]

    ; luma_src
    pxor          mzero, mzero
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
    mov           lumaq, r9m
%endif
    mova             m4, [lumaq+ 0]
    mova             m6, [lumaq+(16<<%2)]
%if %2
    ; ss_h: rounded average of horizontal luma pairs (phaddw + pavgw 0)
    phaddw           m4, [lumaq+16]
    phaddw           m6, [lumaq+48]
%endif
%if ARCH_X86_32
    add           lumaq, r10mp
    mov             r9m, lumaq
%endif
%if %2
    pavgw            m4, mzero
    pavgw            m6, mzero
%endif

%if %1
    ; not-csfl luma/chroma mix, identical to the %%loop_y version
    punpckhwd        m3, m4, m0
    punpcklwd        m4, m0
    punpckhwd        m5, m6, m1
    punpcklwd        m6, m1                 ; { luma, chroma }
    REPX {pmaddwd x, m14}, m3, m4, m5, m6
    REPX {psrad   x, 6}, m3, m4, m5, m6
    packssdw         m4, m3
    packssdw         m6, m5
    REPX {paddw x, m15}, m4, m6
    REPX {pmaxsw x, mzero}, m4, m6
    REPX {pminsw x, m10}, m4, m6             ; clip_pixel()
%else
    REPX  {pand x, m10}, m4, m6
%endif

    ; grain = grain_lut[offy+y][offx+x]
    movu             m7, [grain_lutq+offxyq*2]
%if ARCH_X86_32
    mov              r5, [rsp+8*mmsize+0*gprsize]
    movd             m5, [grain_lutq+r5*2]
%else
    movd             m5, [grain_lutq+left_offxyq*2+ 0]
%endif
    ; blend the leftmost grain pixel with the previous column; weights are
    ; 23/22 when horizontally subsampled, 27/17 otherwise
    punpcklwd        m5, m7                ; {left0, cur0}
%if %1
%if ARCH_X86_32
    mov              r5, r5m
%endif
%if %2
    pmaddwd          m5, [PIC_ptr(pw_23_22)]
%else
    pmaddwd          m5, [PIC_ptr(pw_27_17_17_27)]
%endif
    paddd            m5, [PIC_ptr(pd_16)]
%else
    pmaddwd          m5, m15
    paddd            m5, m14
%endif
    psrad            m5, 5
    packssdw         m5, m5
    pmaxsw           m5, m8
    pminsw           m5, m9
    shufps           m5, m7, q3210          ; merge blended pixel back in
    movu             m3, [grain_lutq+offxyq*2+16]

    ; scaling[luma_src]
%if ARCH_X86_32
    vpgatherdw       m7, m4, scalingq-1, r0, r5, 8, 1
    vpgatherdw       m4, m6, scalingq-1, r0, r5, 8, 1
%else
    vpgatherdw       m7, m4, scalingq-1, r2, r12, 8, 1
    vpgatherdw       m4, m6, scalingq-1, r2, r12, 8, 1
%endif
    REPX   {psrlw x, 8}, m7, m4

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    REPX {pmullw x, m11}, m7, m4
    pmulhrsw         m5, m7
    pmulhrsw         m3, m4

    ; dst = clip_pixel(src, noise)
    paddw            m0, m5
    paddw            m1, m3
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    movifnidn      dstq, dstmp
    mova      [dstq+ 0], m0
    mova      [dstq+16], m1

%if ARCH_X86_32
    add            srcq, r2mp
    add            dstq, r2mp
    mov           dstmp, dstq
%else
    add            srcq, r13mp
   2734    add            dstq, r13mp
   2735    add           lumaq, lstrideq
   2736 %endif
   2737    add      grain_lutq, 82*2
   2738    dec              hd
   2739    jg %%loop_y_h_overlap
   2740 
   2741 %if ARCH_X86_32
   2742    DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut
   2743    mov              wq, r4mp
   2744 %endif
   2745    add              wq, 16
   2746    jge %%end
   2747 %if ARCH_X86_32
   2748    mov            srcq, r1mp
   2749 %else
   2750    mov            srcq, r10mp
   2751 %endif
   2752    mov            dstq, r11mp
   2753    mov           lumaq, r12mp
   2754    lea            srcq, [srcq+wq*2]
   2755    lea            dstq, [dstq+wq*2]
   2756    lea           lumaq, [lumaq+wq*(2<<%2)]
   2757 %if ARCH_X86_32
   2758    mov            r0mp, dstq
   2759    mov            r9mp, lumaq
   2760    mov             r4m, wq
   2761 %endif
   2762 
   2763 %if %2
   2764    ; r8m = sbym
   2765    test      dword r8m, 2
   2766    jne %%loop_x_hv_overlap
   2767    jmp %%loop_x_h_overlap
   2768 %else
   2769    or        dword r8m, 4
   2770    add          offxyd, 16
   2771 
   2772    ; r8m = sbym
   2773    test      dword r8m, 2
   2774    jz %%loop_x_odd
   2775 %if ARCH_X86_32
   2776    add dword [rsp+8*mmsize+1*gprsize], 16
   2777 %else
   2778    add            r11d, 16                 ; top_offxy += 16
   2779 %endif
   2780    jmp %%loop_x_odd_v_overlap
   2781 %endif
   2782 
   2783 %%end:
   2784    RET
   2785 
        ; --------------------------------------------------------------
        ; Entry for superblock rows that overlap the row above.
        ; Derives the packed (cur_seed << 16) | top_seed PRNG state from
        ; FGData.seed and the superblock row index (sby), and caches
        ; end-of-row src/dst/luma pointers for the per-column restarts.
        ; --------------------------------------------------------------
   2786 %%vertical_overlap:
   2787    or              t0d, 2
   2788    mov             r8m, t0d
   2789 
   2790 %if ARCH_X86_32
   2791    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
   2792 %else
   2793    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
   2794                sby, see, unused1, unused2, unused3, lstride
   2795 %endif
   2796 
   2797    movzx          sbyd, sbyb
        ; Duplicate the 16-bit seed into both halves (0x00010001), then
        ; mix in two sby-derived byte patterns; the masks keep the two
        ; 16-bit lanes (cur/top) independent.
   2798 %if ARCH_X86_32
   2799    imul             r4, [fg_dataq+FGData.seed], 0x00010001
   2800 
   2801    DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
   2802 %else
   2803    imul           seed, [fg_dataq+FGData.seed], 0x00010001
   2804 %endif
   2805    imul            t0d, sbyd, 173 * 0x00010001
   2806    imul           sbyd, 37 * 0x01000100
   2807    add             t0d, (105 << 16) | 188
   2808    add            sbyd, (178 << 24) | (141 << 8)
   2809    and             t0d, 0x00ff00ff
   2810    and            sbyd, 0xff00ff00
   2811    xor            seed, t0d
   2812 %if ARCH_X86_32
   2813    xor            sbyd, seed
   2814 
   2815    DEFINE_ARGS dst, src, scaling, see, w, picptr, luma
   2816 
        ; 32-bit: spill seed and the end-of-row pointers to stack slots.
   2817    mov             r3m, seed
   2818    mov            dstq, r0mp
   2819    mov           lumaq, r9mp
   2820    mov              wq, r4m
   2821    lea              r3, [srcq+wq*2]
   2822    mov            r1mp, r3
   2823    lea              r3, [dstq+wq*2]
   2824    mov           r11mp, r3
   2825    lea              r3, [lumaq+wq*(2<<%2)]
   2826    mov           r12mp, r3
   2827 %if %3
        ; Vertically subsampled: double the luma stride (one chroma row
        ; per two luma rows).
   2828    shl           r10mp, 1
   2829 %endif
   2830 %else
   2831    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
   2832 
   2833    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
   2834                unused1, unused2, see, unused3, unused4, unused5, luma, lstride
   2835 
   2836    mov        lstrideq, r10mp
   2837 %if %3
   2838    add        lstrideq, lstrideq
   2839 %endif
   2840    mov           lumaq, r9mp
   2841    lea             r10, [srcq+wq*2]
   2842    lea             r11, [dstq+wq*2]
   2843    lea             r12, [lumaq+wq*(2<<%2)]
   2844    mov           r10mp, r10
   2845    mov           r11mp, r11
   2846    mov           r12mp, r12
   2847 %endif
        ; w becomes a negative offset that counts up to 0 at end of row.
   2848    neg              wq
   2849 %if ARCH_X86_32
   2850    mov             r4m, wq
   2851 %endif
   2852 
        ; --------------------------------------------------------------
        ; Column loop for blocks overlapping the row above (v overlap).
        ; Updates both 16-bit PRNG lanes (cur and top) in lock-step via
        ; the parity trick below, computes cur/top grain_lut offsets,
        ; then blends top and current grain rows with the weights in m2.
        ; --------------------------------------------------------------
   2853 %%loop_x_v_overlap:
   2854 %if ARCH_X86_32
   2855    mov            seed, r3m
   2856    xor             t0d, t0d
   2857 %else
   2858    ; we assume from the block above that bits 8-15 of r7d are zero'ed
   2859 %endif
        ; LFSR-style update of both packed seeds: setp captures the
        ; parity of each 16-bit lane after or-ing in the tap mask, and
        ; ror folds the new bits back in. Order is flag-critical.
   2860    mov             r6d, seed
   2861    or             seed, 0xeff4eff4
   2862    test           seeb, seeh
   2863    setp            t0b                     ; parity of top_seed
   2864    shr            seed, 16
   2865    shl             t0d, 16
   2866    test           seeb, seeh
   2867    setp            t0b                     ; parity of cur_seed
   2868    or              r6d, 0x00010001
   2869    xor             t0d, r6d
   2870    mov            seed, t0d
   2871    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed
   2872 %if ARCH_X86_32
   2873    mov             r3m, seed
   2874 
   2875    DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
   2876 
   2877    mov           offxd, offyd
   2878 %else
   2879    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
   2880                offx, offy, see, unused1, top_offxy, unused2, luma, lstride
   2881 
   2882    mov           offyd, seed
   2883    mov           offxd, seed
   2884 %endif
        ; Extract the 4-bit x/y offsets for both lanes at once; the
        ; 0x10001 multiplier applies the base adjustment to the cur and
        ; top halves simultaneously.
   2885    ror           offyd, 8
   2886    ror           offxd, 12
   2887    and           offyd, 0xf000f
   2888    and           offxd, 0xf000f
   2889    imul          offyd, 164>>%3
   2890    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
   2891    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
   2892 
   2893 %if ARCH_X86_32
   2894    DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
   2895 %else
   2896    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
   2897                h, offxy, see, unused1, top_offxy, unused2, luma, lstride
   2898 %endif
        ; Split the packed value: low word -> top_offxy, high word -> offxy.
   2899    movzx    top_offxyd, offxyw
   2900 %if ARCH_X86_32
   2901    mov [rsp+8*mmsize+1*gprsize], top_offxyd
   2902    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
   2903 %endif
   2904    shr          offxyd, 16
   2905 
   2906 %if %2 == 0
        ; Unsubsampled: odd columns re-enter here with offsets advanced
        ; by 16 instead of re-running the PRNG.
   2907 %%loop_x_odd_v_overlap:
   2908 %endif
   2909 %if %3 == 0
   2910 %if ARCH_X86_32
   2911    mov              r5, r5m
   2912 %endif
        ; m2 = vertical blend weights for the first overlap row (27/17);
        ; switched to 17/27 after the first row (see btc below).
   2913    SPLATD           m2, [PIC_ptr(pw_27_17_17_27)]
   2914 %endif
   2915 
   2916    mov              hd, r7m
   2917    mov      grain_lutq, grain_lutmp
   2918 %%loop_y_v_overlap:
   2919    ; grain = grain_lut[offy+y][offx+x]
   2920    movu             m3, [grain_lutq+offxyq*2]
   2921 %if ARCH_X86_32
   2922    mov              r0, [rsp+mmsize*8+gprsize*1] ; top_offxy
   2923    movu             m5, [grain_lutq+r0*2]
   2924 %else
   2925    movu             m5, [grain_lutq+top_offxyq*2]
   2926 %endif
        ; Vertical blend (left 8 samples): weighted top/cur sum, +16,
        ; >>5, clamp to grain min/max (m8/m9).
   2927    punpckhwd        m7, m5, m3
   2928    punpcklwd        m5, m3                 ; {top/cur interleaved}
   2929    REPX {pmaddwd x, m2}, m7, m5
   2930 %if %1
   2931 %if ARCH_X86_32
   2932    mov              r5, r5m
   2933 %endif
   2934    REPX  {paddd x, [PIC_ptr(pd_16)]}, m7, m5
   2935 %else
   2936    REPX  {paddd x, m14}, m7, m5
   2937 %endif
   2938    REPX   {psrad x, 5}, m7, m5
   2939    packssdw         m3, m5, m7
   2940    pmaxsw           m3, m8
   2941    pminsw           m3, m9
   2942 
   2943    ; grain = grain_lut[offy+y][offx+x]
        ; Same vertical blend for the right 8 samples.
   2944    movu             m4, [grain_lutq+offxyq*2+16]
   2945 %if ARCH_X86_32
   2946    movu             m5, [grain_lutq+r0*2+16]
   2947 %else
   2948    movu             m5, [grain_lutq+top_offxyq*2+16]
   2949 %endif
   2950    punpckhwd        m7, m5, m4
   2951    punpcklwd        m5, m4                 ; {top/cur interleaved}
   2952    REPX {pmaddwd x, m2}, m7, m5
   2953 %if %1
   2954    REPX  {paddd x, [PIC_ptr(pd_16)]}, m7, m5
   2955 %else
   2956    REPX  {paddd x, m14}, m7, m5
   2957 %endif
   2958    REPX   {psrad x, 5}, m7, m5
   2959    packssdw         m4, m5, m7
   2960    pmaxsw           m4, m8
   2961    pminsw           m4, m9
   2962 
   2963    ; src
   2964    mova             m0, [srcq]
   2965    mova             m1, [srcq+16]
   2966 
   2967    ; luma_src
        ; Same luma fetch/downsample/map sequence as the h-overlap loop.
   2968    pxor          mzero, mzero
   2969 %if ARCH_X86_32
   2970    DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
   2971 
   2972    mov           lumaq, r9mp
   2973 %endif
   2974    mova             m5, [lumaq+ 0]
   2975    mova             m6, [lumaq+(16<<%2)]
   2976 %if %2
   2977    phaddw           m5, [lumaq+16]
   2978    phaddw           m6, [lumaq+48]
   2979 %endif
   2980 %if ARCH_X86_32
   2981    add           lumaq, r10mp
   2982    mov            r9mp, lumaq
   2983 %endif
   2984 %if %2
   2985    pavgw            m5, mzero
   2986    pavgw            m6, mzero
   2987 %endif
   2988 
   2989 %if %1
   2990    punpckhwd        m7, m5, m0
   2991    punpcklwd        m5, m0
   2992    REPX {pmaddwd x, m14}, m7, m5
   2993    REPX {psrad   x, 6}, m7, m5
   2994    packssdw         m5, m7
   2995    punpckhwd        m7, m6, m1
   2996    punpcklwd        m6, m1                 ; { luma, chroma }
   2997    REPX {pmaddwd x, m14}, m7, m6
   2998    REPX {psrad   x, 6}, m7, m6
   2999    packssdw         m6, m7
   3000    pxor          mzero, mzero
   3001    REPX {paddw x, m15}, m5, m6
   3002    REPX {pmaxsw x, mzero}, m5, m6
   3003    REPX {pminsw x, m10}, m5, m6            ; clip_pixel()
   3004 %else
   3005    REPX  {pand x, m10}, m5, m6
   3006 %endif
   3007 
   3008    ; scaling[luma_src]
   3009 %if ARCH_X86_32
   3010    vpgatherdw       m7, m5, scalingq-1, r0, r5, 8, 1
   3011    vpgatherdw       m5, m6, scalingq-1, r0, r5, 8, 1
   3012 %else
   3013    vpgatherdw       m7, m5, scalingq-1, r10, r12, 8, 1
   3014    vpgatherdw       m5, m6, scalingq-1, r10, r12, 8, 1
   3015 %endif
   3016    REPX   {psrlw x, 8}, m7, m5
   3017 
   3018    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
   3019    REPX {pmullw x, m11}, m7, m5
   3020    pmulhrsw         m3, m7
   3021    pmulhrsw         m4, m5
   3022 
   3023    ; dst = clip_pixel(src, noise)
   3024    paddw            m0, m3
   3025    paddw            m1, m4
   3026    pmaxsw           m0, m13
   3027    pmaxsw           m1, m13
   3028    pminsw           m0, m12
   3029    pminsw           m1, m12
   3030    movifnidn      dstq, dstmp
   3031    mova      [dstq+ 0], m0
   3032    mova      [dstq+16], m1
   3033 
   3034    dec              hw
   3035    jle %%end_y_v_overlap
   3035? ; (fallthrough advances one row, same as the h-overlap loop)
   3036 %if ARCH_X86_32
   3037    add            srcq, r2mp
   3038    add            dstq, r2mp
   3039    mov           dstmp, dstq
   3040 %else
   3041    add            srcq, r13mp
   3042    add            dstq, r13mp
   3043    add           lumaq, lstrideq
   3044 %endif
   3045    add      grain_lutq, 82*2
   3046 %if %3
        ; Vertically subsampled: only one blended row, the rest of the
        ; block uses the plain per-row loop.
   3047    jmp %%loop_y
   3048 %else
        ; Unsubsampled: two blended rows; btc on bit 16 of h toggles
        ; between the 27/17 and 17/27 weight rows exactly once.
   3049    btc              hd, 16
   3050    jc %%loop_y
   3051 %if ARCH_X86_32
   3052    mov              r5, r5m
   3053 %endif
   3054    SPLATD           m2, [PIC_ptr(pw_27_17_17_27)+4]
   3055    jmp %%loop_y_v_overlap
   3056 %endif
   3057 
   3058 %%end_y_v_overlap:
        ; Column finished: step w and rebase row pointers, as above.
   3059 %if ARCH_X86_32
   3060    DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut
   3061 
   3062    mov              wq, r4m
   3063 %endif
   3064    add              wq, 16
   3065    jge %%end_hv
   3066 %if ARCH_X86_32
   3067    mov            srcq, r1mp
   3068 %else
   3069    mov            srcq, r10mp
   3070 %endif
   3071    mov            dstq, r11mp
   3072    mov           lumaq, r12mp
   3073    lea            srcq, [srcq+wq*2]
   3074    lea            dstq, [dstq+wq*2]
   3075    lea           lumaq, [lumaq+wq*(2<<%2)]
   3076 %if ARCH_X86_32
   3077    mov            r0mp, dstq
   3078    mov            r9mp, lumaq
   3079    mov             r4m, wq
   3080 %endif
   3081 
   3082 %if %2
   3083    ; since fg_dataq.overlap is guaranteed to be set, we never jump
   3084    ; back to .loop_x_v_overlap, and instead always fall-through to
   3085    ; h+v overlap
   3086 %else
        ; Unsubsampled: alternate between a fresh hv-overlap column and
        ; an odd column that reuses offsets shifted by 16 (bit 2 of r8m).
   3087    btc       dword r8m, 2
   3088    jc %%loop_x_hv_overlap
   3089    add          offxyd, 16
   3090 %if ARCH_X86_32
   3091    add dword [rsp+8*mmsize+1*gprsize], 16
   3092 %else
   3093    add            r11d, 16
   3094 %endif
   3095    jmp %%loop_x_odd_v_overlap
   3096 %endif
   3097 
        ; --------------------------------------------------------------
        ; Column loop for blocks overlapping both the row above and the
        ; column to the left (h+v overlap). The left and topleft grain
        ; columns are blended horizontally first, then top/cur rows are
        ; blended vertically with the weights in m2.
        ; --------------------------------------------------------------
   3098 %%loop_x_hv_overlap:
   3099 %if ARCH_X86_32
   3100    DEFINE_ARGS dst, src, scaling, offxy, w, picptr, grain_lut
   3101 
        ; 32-bit: left/topleft offsets are the previous column's cur/top
        ; offsets + 16; keep them in stack slots 0 and 2.
   3102    mov             t0d, [rsp+mmsize*8+gprsize*1] ; top_offxy
   3103    add          offxyd, 16
   3104    add             t0d, 16
   3105    mov [rsp+mmsize*8+gprsize*0], offxyd ; left_offxyd
   3106    mov [rsp+mmsize*8+gprsize*2], t0d ; topleft_offxyd
   3107 
   3108    DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut
   3109 
   3110    mov            seed, r3m
   3111    xor             t0d, t0d
   3112 %else
   3113    ; we assume from the block above that bits 8-15 of r7d are zero'ed
   3114 %endif
        ; Same dual-lane PRNG update as %%loop_x_v_overlap.
   3115    mov             r6d, seed
   3116    or             seed, 0xeff4eff4
   3117    test           seeb, seeh
   3118    setp            t0b                     ; parity of top_seed
   3119    shr            seed, 16
   3120    shl             t0d, 16
   3121    test           seeb, seeh
   3122    setp            t0b                     ; parity of cur_seed
   3123    or              r6d, 0x00010001
   3124    xor             t0d, r6d
   3125    mov            seed, t0d
   3126    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed
   3127 %if ARCH_X86_32
   3128    mov             r3m, seed
   3129 
   3130    DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
   3131 
   3132    mov           offxd, offyd
   3133 %else
   3134    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
   3135                offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride
   3136 
        ; 64-bit: derive left/topleft from the previous column before
        ; offyq/top_offxyq are overwritten below.
   3137    lea  topleft_offxyq, [top_offxyq+16]
   3138    lea     left_offxyq, [offyq+16]
   3139    mov           offyd, seed
   3140    mov           offxd, seed
   3141 %endif
   3142    ror           offyd, 8
   3143    ror           offxd, 12
   3144    and           offyd, 0xf000f
   3145    and           offxd, 0xf000f
   3146    imul          offyd, 164>>%3
   3147    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
   3148    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
   3149 
   3150 %if ARCH_X86_32
   3151    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, top_offxy
   3152 %else
   3153    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
   3154                h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride
   3155 %endif
   3156    movzx    top_offxyd, offxyw
   3157 %if ARCH_X86_32
   3158    mov [rsp+8*mmsize+1*gprsize], top_offxyd
   3159 
   3160    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
   3161 %endif
   3162    shr          offxyd, 16
   3163 
   3164 %if %3 == 0
        ; First-overlap-row vertical weights (27/17); swapped to 17/27
        ; after the first row via the btc toggle below.
   3165 %if ARCH_X86_32
   3166    mov              r5, r5m
   3167 %endif
   3168    SPLATD           m2, [PIC_ptr(pw_27_17_17_27)]
   3169 %endif
   3170 
   3171    mov              hd, r7m
   3172    mov      grain_lutq, grain_lutmp
   3173 %%loop_y_hv_overlap:
   3174    ; grain = grain_lut[offy+y][offx+x]
        ; Gather left, cur, top and topleft grain; the left/topleft
        ; samples are packed into m5 for a single horizontal pmaddwd.
   3175 %if ARCH_X86_32
   3176    mov              r5, [rsp+8*mmsize+0*gprsize] ; left_offxy
   3177    mov              r0, [rsp+8*mmsize+1*gprsize] ; top_offxy
   3178    movd             m5, [grain_lutq+r5*2]
   3179 %else
   3180    movd             m5, [grain_lutq+left_offxyq*2]
   3181 %endif
   3182    movu             m7, [grain_lutq+offxyq*2]
   3183 %if ARCH_X86_32
   3184    mov              r5, [rsp+8*mmsize+2*gprsize]
   3185    movu             m4, [grain_lutq+r0*2]
   3186 %if %2
   3187    pinsrw           m5, [grain_lutq+r5*2], 2
   3188 %else
   3189    movd             m3, [grain_lutq+r5*2]
   3190 %endif
   3191 %else
   3192    movu             m4, [grain_lutq+top_offxyq*2]
   3193 %if %2
   3194    pinsrw           m5, [grain_lutq+topleft_offxyq*2], 2 ; { left, _, top/left }
   3195 %else
   3196    movd             m3, [grain_lutq+topleft_offxyq*2]
   3197 %endif
   3198 %endif
   3199 %if %2 == 0
   3200    punpckldq        m5, m3
   3201 %endif
   3202    punpckldq        m3, m7, m4             ; { cur0/1,top0/1,cur2/3,top2/3 }
   3203    punpcklwd        m5, m3                 ; { left/cur0,_/cur1,topleft/top0,_/top1 }
        ; Horizontal blend of left->cur and topleft->top in one shot;
        ; weights 23/22 (subsampled) or 27/17 (full width).
   3204 %if %1
   3205 %if ARCH_X86_32
   3206    mov              r5, r5m
   3207 %endif
   3208 %if %2
   3209    movddup          m0, [PIC_ptr(pw_23_22)]
   3210 %else
   3211    movddup          m0, [PIC_ptr(pw_27_17_17_27)]
   3212 %endif
   3213 %else
   3214    pshufd           m0, m15, q1010
   3215 %endif
   3216    pmaddwd          m5, m0
   3217 %if %1
   3218    paddd            m5, [PIC_ptr(pd_16)]
   3219 %else
   3220    paddd            m5, m14
   3221 %endif
   3222    psrad            m5, 5
   3223    packssdw         m5, m5
   3224    pmaxsw           m5, m8
   3225    pminsw           m5, m9
        ; Re-assemble the h-filtered samples back into full cur and top
        ; rows before the vertical blend.
   3226    shufps           m5, m3, q3210          ; cur0/1,top0/1,cur2/3,top2/3
   3227    shufps           m3, m5, m7, q3220      ; cur0-7 post-h_filter
   3228    shufps           m5, m4, q3231          ; top0-7 post-h_filter
   3229 
   3230    punpckhwd        m7, m5, m3
   3231    punpcklwd        m5, m3                 ; {top/cur interleaved}
   3232    REPX {pmaddwd x, m2}, m7, m5
   3233 %if %1
   3234    REPX  {paddd x, [PIC_ptr(pd_16)]}, m5, m7
   3235 %else
   3236    REPX  {paddd x, m14}, m5, m7
   3237 %endif
   3238    REPX   {psrad x, 5}, m5, m7
   3239    packssdw         m3, m5, m7
   3240    pmaxsw           m3, m8
   3241    pminsw           m3, m9
   3242 
   3243    ; right half
        ; Right 8 samples need only the vertical top/cur blend.
   3244    movu             m4, [grain_lutq+offxyq*2+16]
   3245 %if ARCH_X86_32
   3246    movu             m0, [grain_lutq+r0*2+16]
   3247 %else
   3248    movu             m0, [grain_lutq+top_offxyq*2+16]
   3249 %endif
   3250    punpckhwd        m1, m0, m4
   3251    punpcklwd        m0, m4                 ; {top/cur interleaved}
   3252    REPX {pmaddwd x, m2}, m1, m0
   3253 %if %1
   3254    REPX  {paddd x, [PIC_ptr(pd_16)]}, m1, m0
   3255 %else
   3256    REPX  {paddd x, m14}, m1, m0
   3257 %endif
   3258    REPX   {psrad x, 5}, m1, m0
   3259    packssdw         m4, m0, m1
   3260    pmaxsw           m4, m8
   3261    pminsw           m4, m9
   3262 
   3263    ; src
   3264    mova             m0, [srcq]
   3265    mova             m1, [srcq+16]
   3266 
   3267    ; luma_src
        ; Same luma fetch/downsample/map sequence as the other loops.
   3268    pxor          mzero, mzero
   3269 %if ARCH_X86_32
   3270    DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
   3271 
   3272    mov           lumaq, r9mp
   3273 %endif
   3274    mova             m6, [lumaq+ 0]
   3275    mova             m5, [lumaq+(16<<%2)]
   3276 %if %2
   3277    phaddw           m6, [lumaq+16]
   3278    phaddw           m5, [lumaq+48]
   3279 %endif
   3280 %if ARCH_X86_32
   3281    add           lumaq, r10mp
   3282    mov            r9mp, lumaq
   3283 %endif
   3284 %if %2
   3285    pavgw            m6, mzero
   3286    pavgw            m5, mzero
   3287 %endif
   3288 
   3289 %if %1
   3290    punpckhwd        m7, m6, m0
   3291    punpcklwd        m6, m0
   3292    REPX {pmaddwd x, m14}, m7, m6
   3293    REPX {psrad   x, 6}, m7, m6
   3294    packssdw         m6, m7
   3295    punpckhwd        m7, m5, m1
   3296    punpcklwd        m5, m1                 ; { luma, chroma }
   3297    REPX {pmaddwd x, m14}, m7, m5
   3298    REPX {psrad   x, 6}, m7, m5
   3299    packssdw         m5, m7
   3300    pxor          mzero, mzero
   3301    REPX {paddw x, m15}, m6, m5
   3302    REPX {pmaxsw x, mzero}, m6, m5
   3303    REPX {pminsw x, m10}, m6, m5            ; clip_pixel()
   3304 %else
   3305    REPX  {pand x, m10}, m6, m5
   3306 %endif
   3307 
   3308    ; scaling[luma_src]
   3309 %if ARCH_X86_32
   3310    vpgatherdw       m7, m6, scalingq-1, r0, r5, 8, 1
   3311    vpgatherdw       m6, m5, scalingq-1, r0, r5, 8, 1
   3312 %else
   3313 %if %3 == 0
   3314    ; register shortage :)
   3315    push            r12
   3316 %endif
   3317    vpgatherdw       m7, m6, scalingq-1, r2, r12, 8, 1
   3318    vpgatherdw       m6, m5, scalingq-1, r2, r12, 8, 1
   3319 %if %3 == 0
   3320    pop             r12
   3321 %endif
   3322 %endif
   3323    REPX   {psrlw x, 8}, m7, m6
   3324 
   3325    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
   3326    REPX {pmullw x, m11}, m7, m6
   3327    pmulhrsw         m3, m7
   3328    pmulhrsw         m4, m6
   3329 
   3330    ; dst = clip_pixel(src, noise)
   3331    paddw            m0, m3
   3332    paddw            m1, m4
   3333    pmaxsw           m0, m13
   3334    pmaxsw           m1, m13
   3335    pminsw           m0, m12
   3336    pminsw           m1, m12
   3337    movifnidn      dstq, dstmp
   3338    mova      [dstq+ 0], m0
   3339    mova      [dstq+16], m1
   3340 
   3341 %if ARCH_X86_32
   3342    add            srcq, r2mp
   3343    add            dstq, r2mp
   3344    mov           dstmp, dstq
   3345 %else
   3346    add            srcq, r13mp
   3347    add            dstq, r13mp
   3348    add           lumaq, lstrideq
   3349 %endif
   3350    add      grain_lutq, 82*2
   3351    dec              hw
   3352 %if %3
        ; Vertically subsampled: one hv row, then h-overlap only.
   3353    jg %%loop_y_h_overlap
   3354 %else
        ; Two hv rows (weight toggle via btc), then h-overlap only.
   3355    jle %%end_y_hv_overlap
   3356    btc              hd, 16
   3357    jc %%loop_y_h_overlap
   3358 %if ARCH_X86_32
   3359    mov              r5, r5m
   3360 %endif
   3361    SPLATD           m2, [PIC_ptr(pw_27_17_17_27)+4]
   3362    jmp %%loop_y_hv_overlap
   3363 %%end_y_hv_overlap:
   3364 %endif
        ; Column finished: step w, rebase row pointers, pick next entry.
   3365 %if ARCH_X86_32
   3366    DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut
   3367 
   3368    mov              wq, r4m
   3369 %endif
   3370    add              wq, 16
   3371    jge %%end_hv
   3372 %if ARCH_X86_32
   3373    mov            srcq, r1mp
   3374 %else
   3375    mov            srcq, r10mp
   3376 %endif
   3377    mov            dstq, r11mp
   3378    mov           lumaq, r12mp
   3379    lea            srcq, [srcq+wq*2]
   3380    lea            dstq, [dstq+wq*2]
   3381    lea           lumaq, [lumaq+wq*(2<<%2)]
   3382 %if ARCH_X86_32
   3383    mov           dstmp, dstq
   3384    mov            r9mp, lumaq
   3385    mov             r4m, wq
   3386 %endif
   3387 %if %2
   3388    jmp %%loop_x_hv_overlap
   3389 %else
   3390    or        dword r8m, 4
   3391    add          offxyd, 16
   3392 %if ARCH_X86_32
   3393    add dword [rsp+8*mmsize+1*gprsize], 16
   3394 %else
   3395    add            r11d, 16                 ; top_offxy += 16
   3396 %endif
   3397    jmp %%loop_x_odd_v_overlap
   3398 %endif
   3399 
   3400 %%end_hv:
   3401    RET
   3402 %endmacro
   3403 
        ; Instantiate the loop body twice inside FGUV_FN (header above
        ; this excerpt): once for the default entry point (%1 = 1) and
        ; once at the .csfl entry (%1 = 0, where the %1-gated luma
        ; mult/offset mapping is skipped in favor of a plain bitdepth
        ; mask). %2/%3 are forwarded from the FGUV_FN parameters.
   3404    %%FGUV_32x32xN_LOOP 1, %2, %3
   3405 .csfl:
   3406    %%FGUV_32x32xN_LOOP 0, %2, %3
   3407 
        ; With insufficient guaranteed stack alignment, arguments live in
        ; spill slots declared here (x86inc DECLARE_ARG).
   3408 %if STACK_ALIGNMENT < mmsize
   3409 DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
   3410 %endif
   3411 %endmacro
   3412 
        ; Emit one chroma film-grain function per layout:
        ;   FGUV_FN <name>, <ss_hor>, <ss_ver>
        ; 4:2:0 = both subsampled, 4:2:2 = horizontal only, 4:4:4 = none.
   3413 FGUV_FN 420, 1, 1
   3414 FGUV_FN 422, 1, 0
   3415 FGUV_FN 444, 0, 0