tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

filmgrain_sse.asm (90261B)


      1 ; Copyright © 2019-2021, VideoLAN and dav1d authors
      2 ; Copyright © 2019, Two Orioles, LLC
      3 ; All rights reserved.
      4 ;
      5 ; Redistribution and use in source and binary forms, with or without
      6 ; modification, are permitted provided that the following conditions are met:
      7 ;
      8 ; 1. Redistributions of source code must retain the above copyright notice, this
      9 ;    list of conditions and the following disclaimer.
     10 ;
     11 ; 2. Redistributions in binary form must reproduce the above copyright notice,
     12 ;    this list of conditions and the following disclaimer in the documentation
     13 ;    and/or other materials provided with the distribution.
     14 ;
     15 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     16 ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     17 ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     18 ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     19 ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     20 ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     21 ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     22 ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     24 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25 
     26 %include "config.asm"
     27 %include "ext/x86/x86inc.asm"
     28 %include "x86/filmgrain_common.asm"
     29 
        ; Read-only constant tables shared by the SSSE3 film grain routines below.
     30 SECTION_RODATA
     31 
     32 pw_1024: times 8 dw 1024
        ; NOTE(review): 27/17 and 23/22 look like edge-overlap blend weight pairs;
        ; they are consumed by code outside this chunk — confirm against the
        ; fgy/fguv overlap loops.
     33 pb_27_17_17_27: db 27, 17, 17, 27
     34                times 6 db 0, 32
     35 pb_23_22_h: db 23, 22
     36            times 7 db 0, 32
     37 pb_27_17: times 8 db 27, 17
     38 pb_17_27: times 8 db 17, 27
     39 pb_23_22: times 8 db 23, 22
        ; pb_mask/rnd_next_upperbit_mask drive the 4-seed pseudo-random update in
        ; the .loop blocks below (bit aggregation into each next seed's high bit).
     40 pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
     41 rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
     42 byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0
        ; Per-plane seed XOR: first pair for uv=0, second for uv=1 (indexed uvq*4).
     43 pw_seed_xor: times 2 dw 0xb524
     44             times 2 dw 0x49d8
     45 pb_1: times 4 db 1
        ; hmul_bits/round are indexed by grain_scale_shift (see [base+round+r2*2]);
        ; mul_bits/round_vals are indexed by ar_coeff_shift.
     46 hmul_bits: dw 32768, 16384, 8192, 4096
     47 round: dw 2048, 1024, 512
     48 mul_bits: dw 256, 128, 64, 32, 16
     49 round_vals: dw 32, 64, 128, 256, 512
        ; NOTE(review): max/min appear to be clip limits (full range 0..255,
        ; limited range 16..235/240) used by scaling code outside this chunk.
     50 max: dw 255, 240, 235
     51 min: dw 0, 16
     52 pw_1: dw 1
     53 
        ; JMP_TABLE name, isa, lags...
        ; Emits <name>_8bpc_<isa>_table: a table of 32-bit offsets, relative to
        ; the table itself, pointing at the .ar<lag> entry points of the mangled
        ; function symbol. %rotate makes %3 step through each trailing lag arg.
     54 %macro JMP_TABLE 2-*
     55    %xdefine %1_8bpc_%2_table %%table
     56    %xdefine %%base %1_8bpc_%2_table
     57    %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
     58    %%table:
     59    %rep %0 - 2
     60        dd %%prefix %+ .ar%3 - %%base
     61        %rotate 1
     62    %endrep
     63 %endmacro
     64 
        ; AR-lag dispatch tables (lags 0-3) for the luma generator and the three
        ; chroma subsampling variants; indexed with ar_coeff_lag at runtime.
     65 JMP_TABLE generate_grain_y, ssse3, 0, 1, 2, 3
     66 JMP_TABLE generate_grain_uv_420, ssse3, 0, 1, 2, 3
     67 JMP_TABLE generate_grain_uv_422, ssse3, 0, 1, 2, 3
     68 JMP_TABLE generate_grain_uv_444, ssse3, 0, 1, 2, 3
     69 
     70 SECTION .text
     71 
        ; On x86-32 there is no RIP-relative addressing, so RODATA references go
        ; through a runtime base register (each function loads it via LEA r4, $$);
        ; on x86-64 the symbol is used directly.
     72 %if ARCH_X86_32
     73 %define PIC_ptr(a) base+a
     74 %else
     75 %define PIC_ptr(a) a
     76 %endif
     77 
        ; SCRATCH src, alias, slot
        ; x86-32 only has xmm0-xmm7, so spill m<src> to stack slot <slot> and
        ; redefine m<alias> as that memory operand; on x86-64 (16 xmm regs) just
        ; swap the register numbers so the value lives in m<alias>.
     78 %macro SCRATCH 3
     79 %if ARCH_X86_32
     80    mova [rsp+%3*mmsize], m%1
     81 %define m%2 [rsp+%3*mmsize]
     82 %else
     83    SWAP             %1, %2
     84 %endif
     85 %endmacro
     86 
        ; generate_grain_y_8bpc(buf, fg_data)
        ; Fills a 73-row, 82-byte-stride grain buffer: each .loop iteration
        ; derives 4 successor seeds from the current seed (mask/shift/or LFSR-style
        ; update via rnd_next_upperbit_mask, mul_bits, pb_mask, hmul_bits), uses
        ; the 4 seed values (>>5) as word indices into gaussian_sequence, scales
        ; the looked-up words by the grain_scale_shift rounding constant
        ; (pmulhrsw), and stores 4 packed int8 grain values. Afterwards it
        ; tail-jumps to the .ar<lag> filter selected by fg_data->ar_coeff_lag.
     87 INIT_XMM ssse3
     88 cglobal generate_grain_y_8bpc, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data
     89    LEA              r4, $$
     90 %define base r4-$$
     91    movq             m1, [base+rnd_next_upperbit_mask]
     92    movq             m4, [base+mul_bits]
     93    movq             m7, [base+hmul_bits]
     94    mov             r2d, [fg_dataq+FGData.grain_scale_shift]
     95    movd             m2, [base+round+r2*2]
     96    movd             m0, [fg_dataq+FGData.seed]
     97    mova             m5, [base+pb_mask]
     98    pshuflw          m2, m2, q0000
     99    pshuflw          m0, m0, q0000
        ; r2 counts from -73*82 up to 0 so [bufq+r2] walks the whole buffer.
    100    mov              r2, -73*82
    101    sub            bufq, r2
    102    lea              r3, [base+gaussian_sequence]
    103 .loop:
    104    pand             m6, m0, m1
    105    psrlw            m3, m6, 10
    106    por              m6, m3            ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    107    pmullw           m6, m4            ; bits 0x0f00 are set
    108    pshufb           m3, m5, m6        ; set 15th bit for next 4 seeds
    109    psllq            m6, m3, 30
    110    por              m3, m6
    111    psllq            m6, m3, 15
    112    por              m3, m6            ; aggregate each bit into next seed's high bit
    113    pmulhuw          m6, m0, m7
    114    por              m3, m6            ; 4 next output seeds
    115    pshuflw          m0, m3, q3333
    116    psrlw            m3, 5
        ; Gather 4 gaussian_sequence words using the low 16 bits of each seed.
        ; x86-64 extracts all four indices from one 64-bit GPR; x86-32 does two
        ; 32-bit extractions with a q3232 shuffle in between.
    117 %if ARCH_X86_64
    118    movq             r6, m3
    119    mov              r8, r6
    120    movzx           r5d, r6w
    121    shr             r6d, 16
    122    shr              r8, 32
    123    movzx            r7, r8w
    124    shr              r8, 16
    125 
    126    movd             m6, [r3+r5*2]
    127    pinsrw           m6, [r3+r6*2], 1
    128    pinsrw           m6, [r3+r7*2], 2
    129    pinsrw           m6, [r3+r8*2], 3
    130 %else
    131    movd             r6, m3
    132    pshuflw          m3, m3, q3232
    133    movzx            r5, r6w
    134    shr              r6, 16
    135 
    136    movd             m6, [r3+r5*2]
    137    pinsrw           m6, [r3+r6*2], 1
    138 
    139    movd             r6, m3
    140    movzx            r5, r6w
    141    shr              r6, 16
    142 
    143    pinsrw           m6, [r3+r5*2], 2
    144    pinsrw           m6, [r3+r6*2], 3
    145 %endif
    146    pmulhrsw         m6, m2
    147    packsswb         m6, m6
    148    movd      [bufq+r2], m6
    149    add              r2, 4
    150    jl .loop
    151 
    152    ; auto-regression code
    153    movsxd           r2, [fg_dataq+FGData.ar_coeff_lag]
    154    movsxd           r2, [base+generate_grain_y_8bpc_ssse3_table+r2*4]
    155    lea              r2, [r2+base+generate_grain_y_8bpc_ssse3_table]
    156    jmp              r2
    157 
        ; Lag-1 AR filter over the interior of the grain buffer (70 rows of 76
        ; pixels, starting 3 rows + 3 cols in). Per outer iteration the vector
        ; part computes 4 pixels' worth of (cf0*topleft + cf1*top + cf2*topright
        ; + rnd) sums with pmaddwd; the scalar inner loop then serially adds
        ; cf3*left (which depends on the just-written pixel), shifts by
        ; ar_coeff_shift, adds the base grain value and clamps to [-128,127].
    158 .ar1:
    159 %if ARCH_X86_32
    160    DEFINE_ARGS buf, fg_data, cf3, unused, val3, min, max
    161 %elif WIN64
    162    DEFINE_ARGS shift, fg_data, cf3, buf, val3, min, max, x, val0
    163    mov            bufq, r0
    164 %else
    165    DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0
    166 %endif
    167    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
    168    movd             m4, [fg_dataq+FGData.ar_coeffs_y]
        ; shift ends up in ecx so 'sar val3d, shiftb' below can use cl.
    169    mov             ecx, [fg_dataq+FGData.ar_coeff_shift]
    170 %if ARCH_X86_32
        ; x86-32 is register-starved: cf3 and the row counter live in stack arg
        ; slots (r1m / r0mp) instead of registers.
    171    mov             r1m, cf3d
    172    DEFINE_ARGS buf, shift, val3, min, max, x, val0
    173 %define hd r0mp
    174 %define cf3d r1mp
    175 %elif WIN64
    176    DEFINE_ARGS shift, h, cf3, buf, val3, min, max, x, val0
    177 %else
    178    DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0
    179 %endif
        ; Sign-extend the packed int8 coefficients to int16; lane 3 of m4 is set
        ; to 1 so the rounding constant in m3 is added by the same pmaddwd.
    180    pxor             m6, m6
    181    pcmpgtb          m7, m6, m4
    182    punpcklbw        m4, m7
    183    pinsrw           m4, [base+pw_1], 3
    184    pshufd           m5, m4, q1111
    185    pshufd           m4, m4, q0000
    186    movd             m3, [base+round_vals+shiftq*2-12]    ; rnd
    187    pshuflw          m3, m3, q0000
    188    sub            bufq, 82*73-(82*3+79)
    189    mov              hd, 70
    190    mov            mind, -128
    191    mov            maxd, 127
    192 .y_loop_ar1:
    193    mov              xq, -76
    194    movsx         val3d, byte [bufq+xq-1]
    195 .x_loop_ar1:
    196    movq             m0, [bufq+xq-82-1]     ; top/left
    197    pcmpgtb          m7, m6, m0
    198    punpcklbw        m0, m7
    199    psrldq           m2, m0, 2              ; top
    200    psrldq           m1, m0, 4              ; top/right
    201    punpcklwd        m0, m2
    202    punpcklwd        m1, m3
    203    pmaddwd          m0, m4
    204    pmaddwd          m1, m5
    205    paddd            m0, m1
        ; m0 now holds 4 dword partial sums; consume one per inner iteration.
    206 .x_loop_ar1_inner:
    207    movd          val0d, m0
    208    psrldq           m0, 4
    209    imul          val3d, cf3d
    210    add           val3d, val0d
    211    sar           val3d, shiftb
    212    movsx         val0d, byte [bufq+xq]
    213    add           val3d, val0d
        ; clamp to [min, max] = [-128, 127] branchlessly via cmov.
    214    cmp           val3d, maxd
    215    cmovns        val3d, maxd
    216    cmp           val3d, mind
    217    cmovs         val3d, mind
    218    mov  byte [bufq+xq], val3b
    219    ; keep val3d in-place as left for next x iteration
    220    inc              xq
    221    jz .x_loop_ar1_end
    222    test             xq, 3
    223    jnz .x_loop_ar1_inner
    224    jmp .x_loop_ar1
    225 
    226 .x_loop_ar1_end:
    227    add            bufq, 82
    228    dec              hd
    229    jg .y_loop_ar1
        ; lag 0 needs no filtering, so .ar0 is just the shared return.
    230 .ar0:
    231    RET
    232 
        ; Lag-2 AR filter: 12 signed int8 coefficients covering the two rows
        ; above (x offsets -2..+2) plus the current row's two left neighbours.
        ; The top-row contribution for 4 output pixels is vectorized with
        ; pmaddwd against the pre-broadcast coefficient pairs (m8-m12, spilled
        ; via SCRATCH on x86-32); the left-neighbour part is serialized in
        ; .x_loop_ar2_inner because each pixel depends on the previous one,
        ; using byte_blend (m15) to splice the new pixel into the row vector.
    233 .ar2:
    234 %if ARCH_X86_32
    235    ALLOC_STACK -16*8
    236 %endif
    237    DEFINE_ARGS buf, fg_data, shift
    238    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    239    movd             m6, [base+round_vals-12+shiftq*2]
    240    movd             m7, [base+byte_blend+1]
    241    SCRATCH           7, 15, 7
    242    movq             m0, [fg_dataq+FGData.ar_coeffs_y+0]    ; cf0-7
    243    movd             m1, [fg_dataq+FGData.ar_coeffs_y+8]    ; cf8-11
        ; sign-extend int8 coeffs to int16 and broadcast them pairwise.
    244    pxor             m7, m7
    245    pshuflw          m6, m6, q0000
    246    punpcklwd        m6, m7
    247    pcmpgtb          m4, m7, m0
    248    pcmpgtb          m5, m7, m1
    249    punpcklbw        m0, m4
    250    punpcklbw        m1, m5
    251    DEFINE_ARGS buf, fg_data, h, x
    252    pshufd           m4, m1, q0000
    253    pshufd           m5, m1, q1111
    254    pshufd           m3, m0, q3333
    255    pshufd           m2, m0, q2222
    256    pshufd           m1, m0, q1111
    257    pshufd           m0, m0, q0000
    258    SCRATCH           0, 8,  0
    259    SCRATCH           1, 9,  1
    260    SCRATCH           2, 10, 2
    261    SCRATCH           3, 11, 3
    262    SCRATCH           4, 12, 4
    263    SCRATCH           5, 13, 5
    264    SCRATCH           6, 14, 6
    265    sub            bufq, 82*73-(82*3+79)
    266    mov              hd, 70
    267 .y_loop_ar2:
    268    mov              xq, -76
    269 
    270 .x_loop_ar2:
        ; load rows y=-2 and y=-1 into one register, sign-extend to int16 halves.
    271    movq             m0, [bufq+xq-82*2-2]   ; y=-2,x=[-2,+5]
    272    movhps           m0, [bufq+xq-82*1-2]   ; y=-1,x=[-2,+5]
    273    pcmpgtb          m2, m7, m0
    274    punpckhbw        m1, m0, m2
    275    punpcklbw        m0, m2
    276    psrldq           m5, m0, 2              ; y=-2,x=[-1,+5]
    277    psrldq           m3, m1, 2              ; y=-1,x=[-1,+5]
    278    psrldq           m4, m1, 4              ; y=-1,x=[+0,+5]
    279    punpcklwd        m2, m0, m5
    280    punpcklwd        m3, m4
    281    pmaddwd          m2, m8
    282    pmaddwd          m3, m11
    283    paddd            m2, m3
    284 
    285    psrldq           m4, m0, 4              ; y=-2,x=[+0,+5]
    286    psrldq           m5, m0, 6              ; y=-2,x=[+1,+5]
    287    psrldq           m6, m0, 8              ; y=-2,x=[+2,+5]
    288    punpcklwd        m4, m5
    289    punpcklwd        m6, m1
    290    psrldq           m5, m1, 6              ; y=-1,x=[+1,+5]
    291    psrldq           m1, m1, 8              ; y=-1,x=[+2,+5]
    292    punpcklwd        m5, m1
    293    pmaddwd          m4, m9
    294    pmaddwd          m6, m10
    295    pmaddwd          m5, m12
    296    paddd            m4, m6
    297    paddd            m2, m5
    298    paddd            m2, m4
        ; m14 = rounding constant; m2 = 4 dwords of top-rows contribution.
    299    paddd            m2, m14
    300 
    301    movq             m0, [bufq+xq-2]        ; y=0,x=[-2,+5]
    302 .x_loop_ar2_inner:
    303    pcmpgtb          m4, m7, m0
    304    punpcklbw        m1, m0, m4
    305    pmaddwd          m3, m1, m13
    306    paddd            m3, m2
    307    psrldq           m1, 4                  ; y=0,x=0
    308    psrldq           m2, 4                  ; shift top to next pixel
    309    psrad            m3, [fg_dataq+FGData.ar_coeff_shift]
    310    ; don't packssdw since we only care about one value
    311    paddw            m3, m1
    312    packsswb         m3, m3
        ; blend the single new int8 pixel into the current-row vector via m15.
    313    pslldq           m3, 2
    314    pand             m3, m15
    315    pandn            m1, m15, m0
    316    por              m0, m1, m3
    317    psrldq           m0, 1
    318    ; overwrite 2 pixels, but that's ok
    319    movd      [bufq+xq-1], m0
    320    inc              xq
    321    jz .x_loop_ar2_end
        ; re-vectorize the top-row sums every 4 pixels.
    322    test             xq, 3
    323    jnz .x_loop_ar2_inner
    324    jmp .x_loop_ar2
    325 
    326 .x_loop_ar2_end:
    327    add            bufq, 82
    328    dec              hd
    329    jg .y_loop_ar2
    330    RET
    331 
        ; Lag-3 AR filter: 24 signed int8 coefficients covering rows y=-3..-1
        ; (x offsets -3..+3) plus the current row's three left neighbours.
        ; The broadcast coefficient pairs live partly in [rsp+0..5*16] and
        ; partly in m8-m13 (SCRATCH slots on x86-32, hence ALLOC_STACK -16*14
        ; there vs 16*6 elsewhere). As in .ar2, the three top rows are summed
        ; with pmaddwd into 4 dword accumulators per outer iteration, and the
        ; serially-dependent left-neighbour part runs in .x_loop_ar3_inner.
    332 .ar3:
    333    DEFINE_ARGS buf, fg_data, shift
    334 %if ARCH_X86_32
    335    ALLOC_STACK  -16*14
    336 %elif WIN64
        ; manual stack adjustment on WIN64: keep x86inc's bookkeeping in sync.
    337    SUB             rsp, 16*6
    338 %assign stack_size_padded (stack_size_padded+16*6)
    339 %assign stack_size (stack_size+16*6)
    340 %else
    341    ALLOC_STACK  -16*6
    342 %endif
    343    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    344    movd             m6, [base+round_vals-12+shiftq*2]
    345    movd             m7, [base+byte_blend]
    346    movu             m0, [fg_dataq+FGData.ar_coeffs_y+ 0]   ; cf0-15
    347    movq             m2, [fg_dataq+FGData.ar_coeffs_y+16]   ; cf16-23
        ; sign-extend all 24 int8 coefficients to int16.
    348    pxor             m3, m3
    349    pcmpgtb          m4, m3, m0
    350    pcmpgtb          m3, m2
    351    pshuflw          m6, m6, q0000
    352    SCRATCH           6, 14, 12
    353    SCRATCH           7, 15, 13
    354    punpckhbw        m1, m0, m4
    355    punpcklbw        m0, m4
    356    punpcklbw        m2, m3
        ; broadcast coefficient pairs; the first six go to stack slots 0-5.
    357    pshufd           m3, m0, q1111
    358    pshufd           m4, m0, q2222
    359    pshufd           m5, m0, q3333
    360    pshufd           m0, m0, q0000
    361    mova    [rsp+ 0*16], m0
    362    mova    [rsp+ 1*16], m3
    363    mova    [rsp+ 2*16], m4
    364    mova    [rsp+ 3*16], m5
    365    pshufd           m6, m1, q1111
    366    pshufd           m7, m1, q2222
    367    pshufd           m5, m1, q3333
    368    pshufd           m1, m1, q0000
    369    pshufd           m3, m2, q1111
    370    psrldq           m0, m2, 10
        ; pair the last coefficient with 1 (pw_1) and with the rounding value so
        ; a single pmaddwd adds both the cur-pixel term and the rounding term.
    371    pinsrw           m2, [base+pw_1], 5
    372    pshufd           m4, m2, q2222
    373    pshufd           m2, m2, q0000
    374    pinsrw           m0, [base+round_vals+shiftq*2-10], 3
    375    mova    [rsp+ 4*16], m1
    376    mova    [rsp+ 5*16], m6
    377    SCRATCH           7, 8,  6
    378    SCRATCH           5, 9,  7
    379    SCRATCH           2, 10, 8
    380    SCRATCH           3, 11, 9
    381    SCRATCH           4, 12, 10
    382    SCRATCH           0, 13, 11
    383    DEFINE_ARGS buf, fg_data, h, x
    384    sub            bufq, 82*73-(82*3+79)
    385    mov              hd, 70
    386 .y_loop_ar3:
    387    mov              xq, -76
    388 
    389 .x_loop_ar3:
        ; row y=-3: 16 pixels sign-extended into m0 (low) / m2 (high).
    390    movu             m0, [bufq+xq-82*3-3]   ; y=-3,x=[-3,+12]
    391    pxor             m3, m3
    392    pcmpgtb          m3, m0
    393    punpckhbw        m2, m0, m3
    394    punpcklbw        m0, m3
    395 
    396    psrldq           m5, m0, 2
    397    psrldq           m6, m0, 4
    398    psrldq           m7, m0, 6
    399    punpcklwd        m4, m0, m5
    400    punpcklwd        m6, m7
    401    pmaddwd          m4, [rsp+ 0*16]
    402    pmaddwd          m6, [rsp+ 1*16]
    403    paddd            m4, m6
    404 
        ; row y=-2, combined with the tail of row y=-3 via palignr.
    405    movu             m1, [bufq+xq-82*2-3]   ; y=-2,x=[-3,+12]
    406    pxor             m5, m5
    407    pcmpgtb          m5, m1
    408    punpckhbw        m3, m1, m5
    409    punpcklbw        m1, m5
    410    palignr          m6, m2, m0, 10
    411    palignr          m7, m2, m0, 12
    412    psrldq           m0, 8
    413    punpcklwd        m0, m6
    414    punpcklwd        m7, m1
    415    pmaddwd          m0, [rsp+ 2*16]
    416    pmaddwd          m7, [rsp+ 3*16]
    417    paddd            m0, m7
    418    paddd            m0, m4
    419 
    420    psrldq           m4, m1, 2
    421    psrldq           m5, m1, 4
    422    psrldq           m6, m1, 6
    423    psrldq           m7, m1, 8
    424    punpcklwd        m4, m5
    425    punpcklwd        m6, m7
    426    pmaddwd          m4, [rsp+ 4*16]
    427    pmaddwd          m6, [rsp+ 5*16]
    428    paddd            m4, m6
    429    paddd            m0, m4
    430 
        ; row y=-1, combined with the tail of row y=-2 via palignr.
    431    movu             m2, [bufq+xq-82*1-3]   ; y=-1,x=[-3,+12]
    432    pxor             m7, m7
    433    pcmpgtb          m7, m2
    434    punpckhbw        m5, m2, m7
    435    punpcklbw        m2, m7
    436    palignr          m7, m3, m1, 10
    437    palignr          m3, m1, 12
    438    psrldq           m1, m2, 2
    439    punpcklwd        m7, m3
    440    punpcklwd        m3, m2, m1
    441    pmaddwd          m7, m8
    442    pmaddwd          m3, m9
    443    paddd            m7, m3
    444    paddd            m0, m7
    445 
    446    psrldq           m6, m2, 4
    447    psrldq           m1, m2, 6
    448    psrldq           m3, m2, 8
    449    palignr          m4, m5, m2, 10
    450    palignr          m5, m5, m2, 12
    451 
        ; m14 pairs the last y=-1 term with the rounding constant.
    452    punpcklwd        m6, m1
    453    punpcklwd        m3, m4
    454    punpcklwd        m5, m14
    455    pmaddwd          m6, m10
    456    pmaddwd          m3, m11
    457    pmaddwd          m5, m12
    458    paddd            m0, m6
    459    paddd            m3, m5
    460    paddd            m0, m3
    461 
        ; serial part: 3 left neighbours + current pixel, one output at a time.
    462    movq             m1, [bufq+xq-3]        ; y=0,x=[-3,+4]
    463 .x_loop_ar3_inner:
    464    pxor             m5, m5
    465    pcmpgtb          m5, m1
    466    punpcklbw        m2, m1, m5
    467    pmaddwd          m2, m13
    468    pshufd           m3, m2, q1111
    469    paddd            m2, m3                 ; left+cur
    470    paddd            m2, m0                 ; add top
    471    psrldq           m0, 4
    472    psrad            m2, [fg_dataq+FGData.ar_coeff_shift]
    473    ; don't packssdw since we only care about one value
    474    packsswb         m2, m2
        ; blend the single new int8 pixel into the row vector via m15 (byte_blend).
    475    pslldq           m2, 3
    476    pand             m2, m15
    477    pandn            m3, m15, m1
    478    por              m1, m2, m3
    479    movd    [bufq+xq-3], m1
    480    psrldq           m1, 1
    481    inc              xq
    482    jz .x_loop_ar3_end
        ; re-vectorize the top-rows sums every 4 pixels.
    483    test             xq, 3
    484    jnz .x_loop_ar3_inner
    485    jmp .x_loop_ar3
    486 
    487 .x_loop_ar3_end:
    488    add            bufq, 82
    489    dec              hd
    490    jg .y_loop_ar3
    491    RET
    492 
    493 %macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y
    494 INIT_XMM ssse3
    495 cglobal generate_grain_uv_%1_8bpc, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv
    496    movifnidn        r2, r2mp
    497    movifnidn        r3, r3mp
    498    LEA              r4, $$
    499 %define base r4-$$
    500    movq             m1, [base+rnd_next_upperbit_mask]
    501    movq             m4, [base+mul_bits]
    502    movq             m7, [base+hmul_bits]
    503    mov             r5d, [fg_dataq+FGData.grain_scale_shift]
    504    movd             m6, [base+round+r5*2]
    505    mova             m5, [base+pb_mask]
    506    movd             m0, [fg_dataq+FGData.seed]
    507    movd             m2, [base+pw_seed_xor+uvq*4]
    508    pxor             m0, m2
    509    pshuflw          m6, m6, q0000
    510    pshuflw          m0, m0, q0000
    511    lea              r6, [base+gaussian_sequence]
    512 %if %2
    513 %if ARCH_X86_64
    514    mov             r7d, 73-35*%3
    515 %else
    516    mov            r3mp, 73-35*%3
    517 %endif
    518    add            bufq, 44
    519 .loop_y:
    520    mov              r5, -44
    521 .loop_x:
    522 %else
    523    mov              r5, -82*73
    524    sub            bufq, r5
    525 .loop:
    526 %endif
    527    pand             m2, m0, m1
    528    psrlw            m3, m2, 10
    529    por              m2, m3             ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    530    pmullw           m2, m4             ; bits 0x0f00 are set
    531    pshufb           m3, m5, m2         ; set 15th bit for next 4 seeds
    532    psllq            m2, m3, 30
    533    por              m3, m2
    534    psllq            m2, m3, 15
    535    por              m3, m2             ; aggregate each bit into next seed's high bit
    536    pmulhuw          m2, m0, m7
    537    por              m2, m3             ; 4 next output seeds
    538    pshuflw          m0, m2, q3333
    539    psrlw            m2, 5
    540 %if ARCH_X86_64
    541    movd            r9d, m2
    542    pshuflw          m2, m2, q3232
    543    movzx            r8, r9w
    544    shr              r9, 16
    545 
    546    movd             m3, [r6+r8*2]
    547    pinsrw           m3, [r6+r9*2], 1
    548 
    549    movd            r9d, m2
    550    movzx            r8, r9w
    551    shr              r9, 16
    552 
    553    pinsrw           m3, [r6+r8*2], 2
    554    pinsrw           m3, [r6+r9*2], 3
    555 %else
    556    movd             r2, m2
    557    pshuflw          m2, m2, q3232
    558    movzx            r1, r2w
    559    shr              r2, 16
    560 
    561    movd             m3, [r6+r1*2]
    562    pinsrw           m3, [r6+r2*2], 1
    563 
    564    movd             r2, m2
    565    movzx            r1, r2w
    566    shr              r2, 16
    567 
    568    pinsrw           m3, [r6+r1*2], 2
    569    pinsrw           m3, [r6+r2*2], 3
    570 %endif
    571    pmulhrsw         m3, m6
    572    packsswb         m3, m3
    573    movd      [bufq+r5], m3
    574    add              r5, 4
    575 %if %2
    576    jl .loop_x
    577    add            bufq, 82
    578 %if ARCH_X86_64
    579    dec             r7d
    580 %else
    581    dec            r3mp
    582 %endif
    583    jg .loop_y
    584 %else
    585    jl .loop
    586 %endif
    587 
    588 %if ARCH_X86_32
    589    mov              r2, r2mp
    590 %endif
    591 
    592    ; auto-regression code
    593    movsxd           r5, [fg_dataq+FGData.ar_coeff_lag]
    594    movsxd           r5, [base+generate_grain_uv_%1_8bpc_ssse3_table+r5*4]
    595    lea              r5, [r5+base+generate_grain_uv_%1_8bpc_ssse3_table]
    596    jmp              r5
    597 
    598 .ar0:
    599    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
    600    movifnidn     bufyq, bufymp
    601 %if ARCH_X86_32
    602    ALLOC_STACK   -2*16
    603 %endif
    604    imul            uvd, 28
    605    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    606    movd             m5, [fg_dataq+FGData.ar_coeffs_uv+uvq]
    607    movd             m4, [base+hmul_bits+shiftq*2]
    608    DEFINE_ARGS buf, bufy, h, x
    609    pxor             m0, m0
    610    pcmpgtb          m0, m5
    611    punpcklbw        m5, m0
    612    movd             m7, [base+pb_1]
    613 %if %2
    614    movd             m6, [base+hmul_bits+2+%3*2]
    615 %endif
    616    pshuflw          m5, m5, q0000
    617    pshuflw          m4, m4, q0000
    618    pshufd           m7, m7, q0000
    619 %if %2
    620    pshuflw          m6, m6, q0000
    621 %endif
    622    punpcklqdq       m5, m5
    623    punpcklqdq       m4, m4
    624 %if %2
    625    punpcklqdq       m6, m6
    626 %endif
    627    pcmpeqw          m1, m1
    628    pslldq           m1, 12>>%2
    629    SCRATCH           1, 8, 0
    630    SCRATCH           4, 9, 1
    631 %if %2
    632    sub            bufq, 82*(73-35*%3)+82-(82*3+41)
    633 %else
    634    sub            bufq, 82*70-3
    635 %endif
    636    add           bufyq, 3+82*3
    637    mov              hd, 70-35*%3
    638 .y_loop_ar0:
    639    xor              xd, xd
    640 .x_loop_ar0:
    641    ; first 32 pixels
    642 %if %2
    643    movu             m1, [bufyq+xq*2]
    644 %if %3
    645    movu             m2, [bufyq+xq*2+82]
    646 %endif
    647    movu             m3, [bufyq+xq*2+16]
    648 %if %3
    649    movu             m4, [bufyq+xq*2+82+16]
    650 %endif
    651    pmaddubsw        m0, m7, m1
    652 %if %3
    653    pmaddubsw        m1, m7, m2
    654 %endif
    655    pmaddubsw        m2, m7, m3
    656 %if %3
    657    pmaddubsw        m3, m7, m4
    658    paddw            m0, m1
    659    paddw            m2, m3
    660 %endif
    661    pmulhrsw         m0, m6
    662    pmulhrsw         m2, m6
    663 %else
    664    movu             m0, [bufyq+xq]
    665    pxor             m6, m6
    666    pcmpgtb          m6, m0
    667    punpckhbw        m2, m0, m6
    668    punpcklbw        m0, m6
    669 %endif
    670    pmullw           m0, m5
    671    pmullw           m2, m5
    672    pmulhrsw         m0, m9
    673    pmulhrsw         m2, m9
    674    movu             m1, [bufq+xq]
    675    pxor             m4, m4
    676    pcmpgtb          m4, m1
    677    punpckhbw        m3, m1, m4
    678 %if %2
    679    punpcklbw        m1, m4
    680    paddw            m2, m3
    681    paddw            m0, m1
    682 %else
    683    punpcklbw        m6, m1, m4
    684    paddw            m2, m3
    685    paddw            m0, m6
    686 %endif
    687    packsswb         m0, m2
    688 %if %2
    689    movu      [bufq+xq], m0
    690    add              xd, 16
    691    cmp              xd, 32
    692    jl .x_loop_ar0
    693 
    694    ; last 6/12 pixels
    695    movu             m1, [bufyq+xq*(1+%2)]
    696 %if %3
    697    movu             m2, [bufyq+xq*2+82]
    698 %endif
    699    pmaddubsw        m0, m7, m1
    700 %if %3
    701    pmaddubsw        m1, m7, m2
    702    paddw            m0, m1
    703 %endif
    704    pmulhrsw         m0, m6
    705    pmullw           m0, m5
    706    pmulhrsw         m0, m9
    707    movq             m1, [bufq+xq]
    708    pxor             m4, m4
    709    pcmpgtb          m4, m1
    710    punpcklbw        m2, m1, m4
    711    paddw            m0, m2
    712    packsswb         m0, m0
    713    pandn            m2, m8, m0
    714    pand             m1, m8
    715    por              m2, m1
    716    movq      [bufq+xq], m2
    717 %else
    718    add              xd, 16
    719    cmp              xd, 80
    720    je .y_loop_final_ar0
    721    movu   [bufq+xq-16], m0
    722    jmp .x_loop_ar0
    723 .y_loop_final_ar0:
    724    pandn            m2, m8, m0
    725    pand             m1, m8
    726    por              m2, m1
    727    movu   [bufq+xq-16], m2
    728 %endif
    729 
    730    add            bufq, 82
    731    add           bufyq, 82<<%3
    732    dec              hd
    733    jg .y_loop_ar0
    734    RET
    735 
    736 .ar1:
    737 %if ARCH_X86_32
    738    RESET_STACK_STATE
    739 %endif
    740    DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x
    741    imul            uvd, 28
    742    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
    743    movd             m4, [fg_dataq+FGData.ar_coeffs_uv+uvq-1]
    744    pinsrw           m4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 2
    745 %if ARCH_X86_32
    746    mov            r3mp, cf3d
    747    DEFINE_ARGS buf, shift, fg_data, val3, min, max, x
    748 %elif WIN64
    749    DEFINE_ARGS shift, bufy, fg_data, buf, val3, cf3, min, max, x
    750    mov            bufq, r0
    751 %else
    752    DEFINE_ARGS buf, bufy, fg_data, shift, val3, cf3, min, max, x
    753 %endif
    754    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    755    movd             m3, [base+round_vals+shiftq*2-12]    ; rnd
    756 %if %2
    757    movd             m7, [base+pb_1]
    758    movd             m6, [base+hmul_bits+2+%3*2]
    759 %endif
    760    psrldq           m4, 1
    761 %if ARCH_X86_32
    762    DEFINE_ARGS buf, shift, val0, val3, min, max, x
    763 %elif WIN64
    764    DEFINE_ARGS shift, bufy, h, buf, val3, cf3, min, max, x, val0
    765 %else
    766    DEFINE_ARGS buf, bufy, h, shift, val3, cf3, min, max, x, val0
    767 %endif
    768    pxor             m5, m5
    769    punpcklwd        m3, m5
    770 %if %2
    771    punpcklwd        m6, m6
    772 %endif
    773    pcmpgtb          m5, m4
    774    punpcklbw        m4, m5
    775    pshufd           m5, m4, q1111
    776    pshufd           m4, m4, q0000
    777    pshufd           m3, m3, q0000
    778 %if %2
    779    pshufd           m7, m7, q0000
    780    pshufd           m6, m6, q0000
    781    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
    782 %else
    783    sub            bufq, 82*69+3
    784 %endif
    785 %if ARCH_X86_32
    786    add            r1mp, 79+82*3
    787    mov            r0mp, 70-35*%3
    788 %else
    789    add           bufyq, 79+82*3
    790    mov              hd, 70-35*%3
    791 %endif
    792    mov            mind, -128
    793    mov            maxd, 127
    794 .y_loop_ar1:
    795    mov              xq, -(76>>%2)
    796    movsx         val3d, byte [bufq+xq-1]
    797 .x_loop_ar1:
    798 %if %2
    799 %if ARCH_X86_32
    800    mov              r2, r1mp
    801    movq             m0, [r2+xq*2]
    802 %if %3
    803    movq             m1, [r2+xq*2+82]
    804 %endif
    805 %else
    806    movq             m0, [bufyq+xq*2]
    807 %if %3
    808    movq             m1, [bufyq+xq*2+82]
    809 %endif
    810 %endif
    811    pmaddubsw        m2, m7, m0
    812 %if %3
    813    pmaddubsw        m0, m7, m1
    814    paddw            m2, m0
    815 %endif
    816    pmulhrsw         m2, m6
    817 %else
    818 %if ARCH_X86_32
    819    mov              r2, r1mp
    820    movd             m2, [r2+xq]
    821 %else
    822    movd             m2, [bufyq+xq]
    823 %endif
    824    pxor             m0, m0
    825    pcmpgtb          m0, m2
    826    punpcklbw        m2, m0
    827 %endif
    828 
    829    movq             m0, [bufq+xq-82-1]     ; top/left
    830    pxor             m1, m1
    831    pcmpgtb          m1, m0
    832    punpcklbw        m0, m1
    833    psrldq           m1, m0, 4              ; top/right
    834    punpcklwd        m1, m2
    835    psrldq           m2, m0, 2              ; top
    836    punpcklwd        m0, m2
    837    pmaddwd          m0, m4
    838    pmaddwd          m1, m5
    839    paddd            m0, m1
    840    paddd            m0, m3
    841 .x_loop_ar1_inner:
    842    movd          val0d, m0
    843    psrldq           m0, 4
    844 %if ARCH_X86_32
    845    imul          val3d, r3mp
    846 %else
    847    imul          val3d, cf3d
    848 %endif
    849    add           val3d, val0d
    850    sar           val3d, shiftb
    851    movsx         val0d, byte [bufq+xq]
    852    add           val3d, val0d
    853    cmp           val3d, maxd
    854    cmovns        val3d, maxd
    855    cmp           val3d, mind
    856    cmovs         val3d, mind
    857    mov  byte [bufq+xq], val3b
    858    ; keep val3d in-place as left for next x iteration
    859    inc              xq
    860    jz .x_loop_ar1_end
    861    test             xq, 3
    862    jnz .x_loop_ar1_inner
    863    jmp .x_loop_ar1
    864 
    865 .x_loop_ar1_end:
    866    add            bufq, 82
    867 %if ARCH_X86_32
    868    add            r1mp, 82<<%3
    869    dec            r0mp
    870 %else
    871    add           bufyq, 82<<%3
    872    dec              hd
    873 %endif
    874    jg .y_loop_ar1
    875    RET
    876 
    877 .ar2:
    878 %if ARCH_X86_32
    879    ALLOC_STACK   -8*16
    880 %endif
    881    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
    882    movifnidn     bufyq, bufymp
    883    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    884    imul            uvd, 28
    885    movd             m7, [base+round_vals-12+shiftq*2]
    886    movu             m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0]   ; cf0-12
    887    pxor             m2, m2
    888    pcmpgtb          m2, m0
    889    punpckhbw        m1, m0, m2
    890    punpcklbw        m0, m2
    891    pinsrw           m1, [base+pw_1], 5
    892    punpcklwd        m7, m7
    893    pshufd           m7, m7, q0000
    894    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
    895    pshufd           m4, m1, q0000
    896    pshufd           m5, m1, q1111
    897    pshufd           m6, m1, q2222
    898    pshufd           m3, m0, q3333
    899    pshufd           m2, m0, q2222
    900    pshufd           m1, m0, q1111
    901    pshufd           m0, m0, q0000
    902    SCRATCH           0, 8,  0
    903    SCRATCH           1, 9,  1
    904    SCRATCH           2, 10, 2
    905    SCRATCH           3, 11, 3
    906    SCRATCH           4, 12, 4
    907    SCRATCH           5, 13, 5
    908    SCRATCH           6, 14, 6
    909    SCRATCH           7, 15, 7
    910 %if %2
    911    movd             m7, [base+hmul_bits+2+%3*2]
    912    movd             m6, [base+pb_1]
    913    punpcklwd        m7, m7
    914    pshufd           m6, m6, q0000
    915    pshufd           m7, m7, q0000
    916    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
    917 %else
    918    sub            bufq, 82*69+3
    919 %endif
    920    add           bufyq, 79+82*3
    921    mov              hd, 70-35*%3
    922 .y_loop_ar2:
    923    mov              xq, -(76>>%2)
    924 
    925 .x_loop_ar2:
    926    pxor             m2, m2
    927    movq             m0, [bufq+xq-82*2-2]   ; y=-2,x=[-2,+5]
    928    movhps           m0, [bufq+xq-82*1-2]   ; y=-1,x=[-2,+5]
    929    pcmpgtb          m2, m0
    930    punpckhbw        m1, m0, m2
    931    punpcklbw        m0, m2
    932    psrldq           m5, m0, 2              ; y=-2,x=[-1,+5]
    933    psrldq           m3, m1, 2              ; y=-1,x=[-1,+5]
    934    psrldq           m4, m1, 4              ; y=-1,x=[+0,+5]
    935    punpcklwd        m2, m0, m5
    936    punpcklwd        m3, m4
    937    pmaddwd          m2, m8
    938    pmaddwd          m3, m11
    939    paddd            m2, m3
    940 
    941    psrldq           m4, m0, 4              ; y=-2,x=[+0,+5]
    942    psrldq           m5, m0, 6              ; y=-2,x=[+1,+5]
    943    psrldq           m0, 8                  ; y=-2,x=[+2,+5]
    944    punpcklwd        m4, m5
    945    punpcklwd        m0, m1
    946    psrldq           m3, m1, 6              ; y=-1,x=[+1,+5]
    947    psrldq           m1, m1, 8              ; y=-1,x=[+2,+5]
    948    punpcklwd        m3, m1
    949    pmaddwd          m4, m9
    950    pmaddwd          m0, m10
    951    pmaddwd          m3, m12
    952    paddd            m4, m0
    953    paddd            m2, m3
    954    paddd            m2, m4
    955 
    956 %if %2
    957    movq             m1, [bufyq+xq*2]
    958 %if %3
    959    movq             m3, [bufyq+xq*2+82]
    960 %endif
    961    pmaddubsw        m0, m6, m1
    962 %if %3
    963    pmaddubsw        m1, m6, m3
    964    paddw            m0, m1
    965 %endif
    966    pmulhrsw         m0, m7
    967 %else
    968    movd             m0, [bufyq+xq]
    969    pxor             m1, m1
    970    pcmpgtb          m1, m0
    971    punpcklbw        m0, m1
    972 %endif
    973    punpcklwd        m0, m15
    974    pmaddwd          m0, m14
    975    paddd            m2, m0
    976 
    977    movq             m0, [bufq+xq-2]        ; y=0,x=[-2,+5]
    978    pxor             m4, m4
    979    movd             m5, [base+byte_blend+1]
    980    punpcklbw        m5, m5
    981 .x_loop_ar2_inner:
    982    pcmpgtb          m1, m4, m0
    983    punpcklbw        m0, m1
    984    pmaddwd          m3, m0, m13
    985    paddd            m3, m2
    986    psrldq           m2, 4                  ; shift top to next pixel
    987    psrad            m3, [fg_dataq+FGData.ar_coeff_shift]
    988    pslldq           m3, 4
    989    pand             m3, m5
    990    paddw            m0, m3
    991    packsswb         m0, m0
    992    movd    [bufq+xq-2], m0
    993    psrldq           m0, 1
    994    inc              xq
    995    jz .x_loop_ar2_end
    996    test             xq, 3
    997    jnz .x_loop_ar2_inner
    998    jmp .x_loop_ar2
    999 
   1000 .x_loop_ar2_end:
   1001    add            bufq, 82
   1002    add           bufyq, 82<<%3
   1003    dec              hd
   1004    jg .y_loop_ar2
   1005    RET
   1006 
   1007 .ar3:
   1008 %if ARCH_X86_32
   1009    RESET_STACK_STATE
   1010 %endif
   1011    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
   1012    movifnidn     bufyq, bufymp
   1013 %if ARCH_X86_32
   1014    ALLOC_STACK  -15*16
   1015 %else
   1016    SUB             rsp, 16*7
   1017 %assign stack_size_padded (stack_size_padded+16*7)
   1018 %assign stack_size (stack_size+16*7)
   1019 %endif
   1020    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
   1021    imul            uvd, 28
   1022 
   1023    movu             m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]   ; cf0-15
   1024    pxor             m3, m3
   1025    pcmpgtb          m3, m0
   1026    punpckhbw        m1, m0, m3
   1027    punpcklbw        m0, m3
   1028    pshufd           m2, m0, q1111
   1029    pshufd           m3, m0, q2222
   1030    pshufd           m4, m0, q3333
   1031    pshufd           m0, m0, q0000
   1032    pshufd           m5, m1, q1111
   1033    pshufd           m6, m1, q2222
   1034    pshufd           m7, m1, q3333
   1035    pshufd           m1, m1, q0000
   1036    mova    [rsp+ 0*16], m0
   1037    mova    [rsp+ 1*16], m2
   1038    mova    [rsp+ 2*16], m3
   1039    mova    [rsp+ 3*16], m4
   1040    mova    [rsp+ 4*16], m1
   1041    mova    [rsp+ 5*16], m5
   1042    mova    [rsp+ 6*16], m6
   1043    SCRATCH           7, 8, 7
   1044 
   1045    movu             m2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16]   ; cf16-24 [24=luma]
   1046    pxor             m4, m4
   1047    pcmpgtb          m4, m2
   1048    punpckhbw        m5, m2, m4
   1049    punpcklbw        m2, m4
   1050    pshufd           m4, m2, q3232
   1051    punpcklwd        m3, m4, m5
   1052    pshuflw          m5, m4, q3321
   1053    pshufd           m4, m3, q0000
   1054    pshufd           m3, m2, q1111
   1055    pshufd           m2, m2, q0000
   1056    pinsrw           m5, [base+round_vals+shiftq*2-10], 3
   1057    SCRATCH           2, 9,  8
   1058    SCRATCH           3, 10, 9
   1059    SCRATCH           4, 11, 10
   1060    SCRATCH           5, 12, 11
   1061 
   1062    movd             m2, [base+round_vals-12+shiftq*2]
   1063 %if %2
   1064    movd             m1, [base+pb_1]
   1065    movd             m3, [base+hmul_bits+2+%3*2]
   1066 %endif
   1067    pxor             m0, m0
   1068    punpcklwd        m2, m0
   1069 %if %2
   1070    punpcklwd        m3, m3
   1071 %endif
   1072    pshufd           m2, m2, q0000
   1073 %if %2
   1074    pshufd           m1, m1, q0000
   1075    pshufd           m3, m3, q0000
   1076    SCRATCH           1, 13, 12
   1077 %endif
   1078    SCRATCH           2, 14, 13
   1079 %if %2
   1080    SCRATCH           3, 15, 14
   1081 %endif
   1082 
   1083    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
   1084 %if %2
   1085    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
   1086 %else
   1087    sub            bufq, 82*69+3
   1088 %endif
   1089    add           bufyq, 79+82*3
   1090    mov              hd, 70-35*%3
   1091 .y_loop_ar3:
   1092    mov              xq, -(76>>%2)
   1093 
   1094 .x_loop_ar3:
   1095    movu             m0, [bufq+xq-82*3-3]   ; y=-3,x=[-3,+12]
   1096    pxor             m4, m4
   1097    pcmpgtb          m4, m0
   1098    punpckhbw        m3, m0, m4
   1099    punpcklbw        m0, m4
   1100 
   1101    psrldq           m5, m0, 2
   1102    psrldq           m6, m0, 4
   1103    psrldq           m7, m0, 6
   1104    punpcklwd        m4, m0, m5
   1105    punpcklwd        m6, m7
   1106    pmaddwd          m4, [rsp+ 0*16]
   1107    pmaddwd          m6, [rsp+ 1*16]
   1108    paddd            m4, m6
   1109 
   1110    palignr          m2, m3, m0, 10
   1111    palignr          m3, m0, 12
   1112    psrldq           m0, 8
   1113 
   1114    movu             m1, [bufq+xq-82*2-3]   ; y=-2,x=[-3,+12]
   1115    pxor             m6, m6
   1116    pcmpgtb          m6, m1
   1117    punpckhbw        m5, m1, m6
   1118    punpcklbw        m1, m6
   1119 
   1120    punpcklwd        m0, m2
   1121    punpcklwd        m3, m1
   1122    pmaddwd          m0, [rsp+ 2*16]
   1123    pmaddwd          m3, [rsp+ 3*16]
   1124    paddd            m0, m3
   1125    paddd            m0, m4
   1126 
   1127    movu             m2, [bufq+xq-82*1-3]   ; y=-1,x=[-3,+12]
   1128    pxor             m7, m7
   1129    pcmpgtb          m7, m2
   1130    punpckhbw        m6, m2, m7
   1131    punpcklbw        m2, m7
   1132 
   1133    palignr          m3, m5, m1, 10
   1134    palignr          m5, m1, 12
   1135    psrldq           m4, m2, 2
   1136 
   1137    punpcklwd        m3, m5
   1138    punpcklwd        m5, m2, m4
   1139    pmaddwd          m3, [rsp+ 6*16]
   1140    pmaddwd          m5, m8
   1141    paddd            m3, m5
   1142    paddd            m0, m3
   1143 
   1144    psrldq           m3, m1, 2
   1145    psrldq           m4, m1, 4
   1146    psrldq           m5, m1, 6
   1147    psrldq           m1, 8
   1148 
   1149    punpcklwd        m3, m4
   1150    punpcklwd        m5, m1
   1151    pmaddwd          m3, [rsp+ 4*16]
   1152    pmaddwd          m5, [rsp+ 5*16]
   1153    paddd            m3, m5
   1154    paddd            m0, m3
   1155 
   1156 %if %2
   1157    movq             m1, [bufyq+xq*2]
   1158 %if %3
   1159    movq             m3, [bufyq+xq*2+82]
   1160 %endif
   1161    pmaddubsw        m7, m13, m1
   1162 %if %3
   1163    pmaddubsw        m5, m13, m3
   1164    paddw            m7, m5
   1165 %endif
   1166    pmulhrsw         m7, m15
   1167 %else
   1168    movd             m7, [bufyq+xq]
   1169    pxor             m1, m1
   1170    pcmpgtb          m1, m7
   1171    punpcklbw        m7, m1
   1172 %endif
   1173 
   1174    psrldq           m1, m2, 4
   1175    psrldq           m3, m2, 6
   1176    palignr          m4, m6, m2, 10
   1177    palignr          m6, m2, 12
   1178    psrldq           m2, 8
   1179 
   1180    punpcklwd        m1, m3
   1181    punpcklwd        m2, m4
   1182    punpcklwd        m6, m7
   1183    pmaddwd          m1, m9
   1184    pmaddwd          m2, m10
   1185    pmaddwd          m6, m11
   1186    paddd            m1, m2
   1187    paddd            m0, m6
   1188    paddd            m0, m1
   1189    paddd            m0, m14
   1190 
   1191    movq             m1, [bufq+xq-3]        ; y=0,x=[-3,+4]
   1192    pxor             m4, m4
   1193    movd             m5, [base+byte_blend]
   1194 .x_loop_ar3_inner:
   1195    pcmpgtb          m2, m4, m1
   1196    punpcklbw        m3, m1, m2
   1197    pmaddwd          m2, m3, m12
   1198    pshufd           m3, m2, q1111
   1199    paddd            m2, m3                 ; left+cur
   1200    paddd            m2, m0                 ; add top
   1201    psrldq           m0, 4
   1202    psrad            m2, [fg_dataq+FGData.ar_coeff_shift]
   1203    ; don't packssdw, we only care about one value
   1204    packsswb         m2, m2
   1205    pandn            m3, m5, m1
   1206    pslld            m2, 24
   1207    pand             m2, m5
   1208    por              m1, m2, m3
   1209    movd    [bufq+xq-3], m1
   1210    psrldq           m1, 1
   1211    inc              xq
   1212    jz .x_loop_ar3_end
   1213    test             xq, 3
   1214    jnz .x_loop_ar3_inner
   1215    jmp .x_loop_ar3
   1216 
   1217 .x_loop_ar3_end:
   1218    add            bufq, 82
   1219    add           bufyq, 82<<%3
   1220    dec              hd
   1221    jg .y_loop_ar3
   1222    RET
   1223 %endmacro
   1224 
; Instantiate one chroma grain generator per subsampling layout.
; Args: (layout, ss_hor, ss_ver) — %2 halves the horizontal extent
; (76>>%2) and %3 the vertical one (70-35*%3, luma stride 82<<%3),
; as used throughout the macro body above.
generate_grain_uv_fn 420, 1, 1
generate_grain_uv_fn 422, 1, 0
generate_grain_uv_fn 444, 0, 0
   1228 
; Scalar emulation of a word gather: for each of the 8 word lanes i of src,
; load 16 bits from [base + src.w[i]] and insert them into lane i of dst.
; Index pairs are extracted to a GPR with movd, split with movzx/shr, and
; the table words are fetched with pinsrw (movd for the very first lane,
; which also initializes dst).
%macro vpgatherdw 5-6 ; dst, src, base, tmp_gpr[x2], tmp_xmm_reg
%assign %%idx 0
%define %%tmp %2                        ; no 6th arg: src doubles as the shuffle temp (and is clobbered)
%if %0 == 6
%define %%tmp %6
%endif
%rep 4                                  ; 4 iterations x 2 word lanes = 8 lanes
%if %%idx == 0
   movd        %5 %+ d, %2              ; grab index words 0-1
   pshuflw       %%tmp, %2, q3232       ; tmp low dword = words 2-3 (high qword of src copied as-is)
%else
   movd        %5 %+ d, %%tmp           ; grab the next index word pair
%if %%idx == 2
   punpckhqdq    %%tmp, %%tmp           ; bring words 4-7 down into the low qword
%elif %%idx == 4
   psrlq         %%tmp, 32              ; advance so words 6-7 sit at the bottom
%endif
%endif
   movzx       %4 %+ d, %5 %+ w         ; even-lane index (low 16 bits)
   shr         %5 %+ d, 16              ; odd-lane index

%if %%idx == 0
   movd             %1, [%3+%4]         ; first fetch initializes dst; lane 1 is overwritten below
%else
   pinsrw           %1, [%3+%4], %%idx + 0
%endif
   pinsrw           %1, [%3+%5], %%idx + 1
%assign %%idx %%idx+2
%endrep
%endmacro
   1259 
   1260 INIT_XMM ssse3
   1261 ; fgy_32x32xn(dst, src, stride, fg_data, w, scaling, grain_lut, h, sby)
   1262 %if ARCH_X86_32
   1263 %if STACK_ALIGNMENT < mmsize
   1264 cglobal fgy_32x32xn_8bpc, 0, 7, 16, 0 - (5 * mmsize + 16 * gprsize), \
   1265        dst, src, scaling, unused1, fg_data, picptr, unused2
   1266    ; copy stack arguments to new position post-alignment, so that we
   1267    ; don't have to keep the old stack location in a separate register
   1268    mov              r0, r0m
   1269    mov              r1, r2m
   1270    mov              r2, r4m
   1271    mov              r3, r6m
   1272    mov              r4, r7m
   1273    mov              r5, r8m
   1274 
   1275    mov [rsp+5*mmsize+ 4*gprsize], r0
   1276    mov [rsp+5*mmsize+ 6*gprsize], r1
   1277    mov [rsp+5*mmsize+ 8*gprsize], r2
   1278    mov [rsp+5*mmsize+10*gprsize], r3
   1279    mov [rsp+5*mmsize+11*gprsize], r4
   1280    mov [rsp+5*mmsize+12*gprsize], r5
   1281 %else
   1282 cglobal fgy_32x32xn_8bpc, 0, 7, 16, 5 * mmsize + 4 * gprsize, \
   1283        dst, src, scaling, unused1, fg_data, picptr, unused2
   1284 %endif
   1285    mov            srcq, srcm
   1286    mov        fg_dataq, r3m
   1287    mov        scalingq, r5m
   1288 %if STACK_ALIGNMENT < mmsize
   1289 %define r0m [rsp+5*mmsize+ 4*gprsize]
   1290 %define r1m [rsp+5*mmsize+ 5*gprsize]
   1291 %define r2m [rsp+5*mmsize+ 6*gprsize]
   1292 %define r3m [rsp+5*mmsize+ 7*gprsize]
   1293 %define r4m [rsp+5*mmsize+ 8*gprsize]
   1294 %define r5m [rsp+5*mmsize+ 9*gprsize]
   1295 %define r6m [rsp+5*mmsize+10*gprsize]
   1296 %define r7m [rsp+5*mmsize+11*gprsize]
   1297 %define r8m [rsp+5*mmsize+12*gprsize]
   1298 %endif
   1299    LEA              r5, pb_mask
   1300 %define base r5-pb_mask
   1301    mov             r5m, picptrq
   1302 %else
   1303 cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
   1304    lea              r7, [pb_mask]
   1305 %define base r7-pb_mask
   1306 %endif
   1307    mov             r6d, [fg_dataq+FGData.scaling_shift]
   1308    movd             m3, [base+mul_bits+r6*2-14]
   1309    mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
   1310    movd             m4, [base+max+r6*4]
   1311    movd             m5, [base+min+r6*2]
   1312    punpcklwd        m3, m3
   1313    punpcklwd        m4, m4
   1314    punpcklwd        m5, m5
   1315    pshufd           m3, m3, q0000
   1316    pshufd           m4, m4, q0000
   1317    pshufd           m5, m5, q0000
   1318    SCRATCH           3, 11, 0
   1319    SCRATCH           4, 12, 1
   1320    SCRATCH           5, 13, 2
   1321 
   1322 %if ARCH_X86_32
   1323    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
   1324 %else
   1325    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
   1326 %endif
   1327 
   1328    mov            sbyd, r8m
   1329    mov        overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1
   1330    test       overlapd, overlapd
   1331    jz .no_vertical_overlap
   1332    mova             m6, [base+pw_1024]
   1333    mova             m7, [base+pb_27_17_17_27]
   1334    SCRATCH           6, 14, 3
   1335    SCRATCH           7, 15, 4
   1336    test           sbyd, sbyd
   1337    jnz .vertical_overlap
   1338    ; fall-through
   1339 
   1340 .no_vertical_overlap:
   1341    mov             r8m, overlapd
   1342 %if ARCH_X86_32
   1343    DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused
   1344    imul           seed, (173 << 24) | 37
   1345 %else
   1346    imul           seed, sbyd, (173 << 24) | 37
   1347 %endif
   1348    add            seed, (105 << 24) | 178
   1349    rol            seed, 8
   1350    movzx          seed, seew
   1351    xor            seed, [fg_dataq+FGData.seed]
   1352 
   1353 %if ARCH_X86_32
   1354    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
   1355 
   1356    mov             r3m, seed
   1357    mov              wq, r4m
   1358 %else
   1359    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
   1360                unused1, unused2, see, unused3
   1361 %endif
   1362 
   1363    lea        src_bakq, [srcq+wq]
   1364    neg              wq
   1365    sub           dstmp, srcq
   1366 %if ARCH_X86_32
   1367    mov             r1m, src_bakq
   1368    mov             r4m, wq
   1369    DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3
   1370 %endif
   1371 
   1372 .loop_x:
   1373 %if ARCH_X86_32
   1374    mov            seed, r3m
   1375 %endif
   1376    mov             r6d, seed
   1377    or             seed, 0xEFF4
   1378    shr             r6d, 1
   1379    test           seeb, seeh
   1380    lea            seed, [r6+0x8000]
   1381    cmovp          seed, r6d                ; updated seed
   1382 %if ARCH_X86_32
   1383    mov             r3m, seed
   1384 
   1385    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
   1386 
   1387    mov           offxd, offyd
   1388 %else
   1389    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
   1390                offx, offy, see, unused
   1391 
   1392    mov           offyd, seed
   1393    mov           offxd, seed
   1394 %endif
   1395    ror           offyd, 8
   1396    shr           offxd, 12
   1397    and           offyd, 0xf
   1398    imul          offyd, 164
   1399    lea           offyq, [offyq+offxq*2+747] ; offy*stride+offx
   1400 
   1401 %if ARCH_X86_32
   1402    ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr,
   1403    ; r6m=grain_lut, r7m=h, r8m=overlap_v|h
   1404    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
   1405 %else
   1406    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
   1407                h, offxy, see, unused
   1408 %endif
   1409 
   1410 .loop_x_odd:
   1411    mov              hd, r7m
   1412    mov      grain_lutq, grain_lutmp
   1413 .loop_y:
   1414    ; src
   1415    mova             m0, [srcq]
   1416    pxor             m2, m2
   1417    punpckhbw        m1, m0, m2
   1418    punpcklbw        m0, m2                 ; m0-1: src as word
   1419 
   1420    ; scaling[src]
   1421 %if ARCH_X86_32
   1422    vpgatherdw       m4, m0, scalingq-1, r0, r5, m3
   1423    vpgatherdw       m5, m1, scalingq-1, r0, r5, m3
   1424 %else
   1425    vpgatherdw       m4, m0, scalingq-1, r12, r13, m3
   1426    vpgatherdw       m5, m1, scalingq-1, r12, r13, m3
   1427 %endif
   1428    REPX {psrlw x, 8}, m4, m5
   1429 
   1430    ; grain = grain_lut[offy+y][offx+x]
   1431    movu             m3, [grain_lutq+offxyq]
   1432    pcmpgtb          m7, m2, m3
   1433    punpcklbw        m2, m3, m7
   1434    punpckhbw        m3, m7
   1435 
   1436    ; noise = round2(scaling[src] * grain, scaling_shift)
   1437    pmullw           m2, m4
   1438    pmullw           m3, m5
   1439    pmulhrsw         m2, m11
   1440    pmulhrsw         m3, m11
   1441 
   1442    ; dst = clip_pixel(src, noise)
   1443    paddw            m0, m2
   1444    paddw            m1, m3
   1445    pmaxsw           m0, m13
   1446    pmaxsw           m1, m13
   1447    pminsw           m0, m12
   1448    pminsw           m1, m12
   1449    packuswb         m0, m1
   1450    movifnidn      dstq, dstmp
   1451    mova    [dstq+srcq], m0
   1452 
   1453    add            srcq, r2mp
   1454    add      grain_lutq, 82
   1455    dec              hd
   1456    jg .loop_y
   1457 
   1458 %if ARCH_X86_32
   1459    add            r4mp, 16
   1460 %else
   1461    add              wq, 16
   1462 %endif
   1463    jge .end
   1464 %if ARCH_X86_32
   1465    mov            srcq, r1mp
   1466    add            srcq, r4mp
   1467 %else
   1468    lea            srcq, [src_bakq+wq]
   1469 %endif
   1470    btc       dword r8m, 2
   1471    jc .next_blk
   1472 
   1473    add          offxyd, 16
   1474    test      dword r8m, 2              ; r8m & 2 = have_top_overlap
   1475    jz .loop_x_odd
   1476 
   1477 %if ARCH_X86_32
   1478    add dword [rsp+5*mmsize+1*gprsize], 16
   1479 %else
   1480    add            r11d, 16             ; top_offxyd
   1481 %endif
   1482    jnz .loop_x_odd_v_overlap
   1483 
   1484 .next_blk:
   1485    test      dword r8m, 1
   1486    jz .loop_x
   1487 
   1488    test      dword r8m, 2
   1489    jnz .loop_x_hv_overlap
   1490 
   1491    ; horizontal overlap (without vertical overlap)
   1492 .loop_x_h_overlap:
   1493 %if ARCH_X86_32
   1494    ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr,
   1495    ; r6m=grain_lut, r7m=h, r8m=overlap_v|h
   1496    DEFINE_ARGS dst, src, scaling, offxy, unused1, unused2, unused3
   1497 
   1498    add          offxyd, 16                 ; left_offxyd
   1499    mov [rsp+5*mmsize+0*gprsize], offxyd
   1500 
   1501    DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3
   1502 
   1503    mov            seed, r3m
   1504 %else
   1505    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
   1506                offx, offy, see, left_offxy
   1507 
   1508    lea     left_offxyd, [offyd+16]         ; previous column's offy*stride+offx
   1509 %endif
   1510 
   1511    mov             r6d, seed
   1512    or             seed, 0xEFF4
   1513    shr             r6d, 1
   1514    test           seeb, seeh
   1515    lea            seed, [r6+0x8000]
   1516    cmovp          seed, r6d                ; updated seed
   1517 
   1518 %if ARCH_X86_32
   1519    mov             r3m, seed
   1520 
   1521    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
   1522 
   1523    mov           offxd, offyd
   1524 %else
   1525    mov           offyd, seed
   1526    mov           offxd, seed
   1527 %endif
   1528    ror           offyd, 8
   1529    shr           offxd, 12
   1530    and           offyd, 0xf
   1531    imul          offyd, 164
   1532    lea           offyq, [offyq+offxq*2+747] ; offy*stride+offx
   1533 
   1534 %if ARCH_X86_32
   1535    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
   1536 %else
   1537    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
   1538                h, offxy, see, left_offxy
   1539 %endif
   1540 
   1541    mov              hd, r7m
   1542    mov      grain_lutq, grain_lutmp
   1543 .loop_y_h_overlap:
   1544    ; src
   1545    mova             m0, [srcq]
   1546    pxor             m2, m2
   1547    punpckhbw        m1, m0, m2
   1548    punpcklbw        m0, m2                 ; m0-1: src as word
   1549 
   1550    ; scaling[src]
   1551 %if ARCH_X86_32
   1552    vpgatherdw       m4, m0, scalingq-1, r0, r5, m3
   1553    vpgatherdw       m5, m1, scalingq-1, r0, r5, m3
   1554 %else
   1555    vpgatherdw       m4, m0, scalingq-1, r12, r13, m3
   1556    vpgatherdw       m5, m1, scalingq-1, r12, r13, m3
   1557 %endif
   1558    REPX {psrlw x, 8}, m4, m5
   1559 
   1560    ; grain = grain_lut[offy+y][offx+x]
   1561    movu             m3, [grain_lutq+offxyq]
   1562 %if ARCH_X86_32
   1563    mov              r5, [rsp+5*mmsize+0*gprsize]
   1564    movd             m7, [grain_lutq+r5]
   1565 %else
   1566    movd             m7, [grain_lutq+left_offxyq]
   1567 %endif
   1568    punpcklbw        m7, m3
   1569    pmaddubsw        m6, m15, m7
   1570    pmulhrsw         m6, m14
   1571    packsswb         m6, m6
   1572    shufps           m6, m3, q3210
   1573    pcmpgtb          m2, m6
   1574    punpcklbw        m7, m6, m2
   1575    punpckhbw        m6, m2
   1576 
   1577    ; noise = round2(scaling[src] * grain, scaling_shift)
   1578    pmullw           m7, m4
   1579    pmullw           m6, m5
   1580    pmulhrsw         m7, m11
   1581    pmulhrsw         m6, m11
   1582 
   1583    ; dst = clip_pixel(src, noise)
   1584    paddw            m0, m7
   1585    paddw            m1, m6
   1586    pmaxsw           m0, m13
   1587    pmaxsw           m1, m13
   1588    pminsw           m0, m12
   1589    pminsw           m1, m12
   1590    packuswb         m0, m1
   1591    movifnidn      dstq, dstmp
   1592    mova    [dstq+srcq], m0
   1593 
   1594    add            srcq, r2mp
   1595    add      grain_lutq, 82
   1596    dec              hd
   1597    jg .loop_y_h_overlap
   1598 
   1599 %if ARCH_X86_32
   1600    add            r4mp, 16
   1601 %else
   1602    add              wq, 16
   1603 %endif
   1604    jge .end
   1605 %if ARCH_X86_32
   1606    mov            srcq, r1m
   1607    add            srcq, r4m
   1608 %else
   1609    lea            srcq, [src_bakq+wq]
   1610 %endif
   1611    xor       dword r8m, 4
   1612    add          offxyd, 16
   1613 
   1614    ; since this half-block had left-overlap, the next does not
   1615    test      dword r8m, 2              ; have_top_overlap
   1616    jz .loop_x_odd
   1617 %if ARCH_X86_32
   1618    add dword [rsp+5*mmsize+1*gprsize], 16
   1619 %else
   1620    add            r11d, 16             ; top_offxyd
   1621 %endif
   1622    jmp .loop_x_odd_v_overlap
   1623 
   1624 .end:
   1625    RET
   1626 
   1627 .vertical_overlap:
   1628 %if ARCH_X86_32
   1629    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
   1630 %else
   1631    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap
   1632 %endif
   1633 
   1634    or         overlapd, 2                  ; top_overlap: overlap & 2
   1635    mov             r8m, overlapd
   1636    movzx          sbyd, sbyb
   1637 %if ARCH_X86_32
   1638    imul             r4, [fg_dataq+FGData.seed], 0x00010001
   1639    DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
   1640 %else
   1641    imul           seed, [fg_dataq+FGData.seed], 0x00010001
   1642 %endif
   1643    imul           tmpd, sbyd, 173 * 0x00010001
   1644    imul           sbyd, 37 * 0x01000100
   1645    add            tmpd, (105 << 16) | 188
   1646    add            sbyd, (178 << 24) | (141 << 8)
   1647    and            tmpd, 0x00ff00ff
   1648    and            sbyd, 0xff00ff00
   1649    xor            seed, tmpd
   1650 %if ARCH_X86_32
   1651    xor            sbyd, seed               ; (cur_seed << 16) | top_seed
   1652 
   1653    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
   1654 
   1655    mov             r3m, seed
   1656    mov              wq, r4m
   1657 %else
   1658    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
   1659 
   1660    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
   1661                tmp, unused2, see, unused3
   1662 %endif
   1663 
   1664    lea        src_bakq, [srcq+wq]
   1665    neg              wq
   1666    sub           dstmp, srcq
   1667 %if ARCH_X86_32
   1668    mov             r1m, src_bakq
   1669    mov             r4m, wq
   1670    DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
   1671 %endif
   1672 
   1673 .loop_x_v_overlap:
   1674 %if ARCH_X86_32
   1675    mov            seed, r3m
   1676 %endif
   1677    ; we assume from the block above that bits 8-15 of tmpd are zero'ed,
   1678    ; because of the 'and tmpd, 0x00ff00ff' above
   1679    mov             r6d, seed
   1680    or             seed, 0xeff4eff4
   1681    test           seeb, seeh
   1682    setp           tmpb                     ; parity of top_seed
   1683    shr            seed, 16
   1684    shl            tmpd, 16
   1685    test           seeb, seeh
   1686    setp           tmpb                     ; parity of cur_seed
   1687    or              r6d, 0x00010001
   1688    xor            tmpd, r6d
   1689    mov            seed, tmpd
   1690    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed
   1691 
   1692 %if ARCH_X86_32
   1693    mov             r3m, seed
   1694 
   1695    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
   1696 
   1697    mov           offxd, offyd
   1698 %else
   1699    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
   1700                offx, offy, see, unused, top_offxy
   1701 
   1702    mov           offyd, seed
   1703    mov           offxd, seed
   1704 %endif
   1705 
   1706    ror           offyd, 8
   1707    ror           offxd, 12
   1708    and           offyd, 0xf000f
   1709    and           offxd, 0xf000f
   1710    imul          offyd, 164
   1711    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
   1712    lea           offyq, [offyq+offxq*2+0x10001*747+32*82]
   1713 
   1714 %if ARCH_X86_32
   1715    DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
   1716 %else
   1717    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
   1718                h, offxy, see, unused, top_offxy
   1719 %endif
   1720 
   1721    movzx    top_offxyd, offxyw
   1722 %if ARCH_X86_32
   1723    mov [rsp+5*mmsize+1*gprsize], top_offxyd
   1724 
   1725    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
   1726 %endif
   1727    shr          offxyd, 16
   1728 
.loop_x_odd_v_overlap:
    ; (Re)load the vertical-overlap blend coefficients for the first
    ; overlapped row. On x86-32 a pointer to pb_27_17 is cached on the
    ; stack (advanced by mmsize after each row, see below); on x86-64 the
    ; coefficients live in m8.
%if ARCH_X86_32
    mov              r5, r5m
    lea              r5, [base+pb_27_17]
    mov [rsp+5*mmsize+12], r5
%else
    mova             m8, [pb_27_17]
%endif
    mov              hd, r7m
    mov      grain_lutq, grain_lutmp
.loop_y_v_overlap:
    ; src
    mova             m0, [srcq]
    pxor             m2, m2
    punpckhbw        m1, m0, m2
    punpcklbw        m0, m2                 ; m0-1: src as word

    ; scaling[src]
%if ARCH_X86_32
    vpgatherdw       m4, m0, scalingq-1, r0, r5, m3
    vpgatherdw       m5, m1, scalingq-1, r0, r5, m3
%else
    vpgatherdw       m4, m0, scalingq-1, r12, r13, m3
    vpgatherdw       m5, m1, scalingq-1, r12, r13, m3
%endif
    REPX {psrlw x, 8}, m4, m5

    ; grain = grain_lut[offy+y][offx+x]
    movu             m3, [grain_lutq+offxyq]
%if ARCH_X86_32
    mov              r5, [rsp+5*mmsize+1*gprsize]   ; spilled top_offxy
    movu             m7, [grain_lutq+r5]
%else
    movu             m7, [grain_lutq+top_offxyq]
%endif
    ; Interleave top and current grain bytes, then blend them with the
    ; 27/17 (row 0) or 17/27 (row 1) coefficients via pmaddubsw.
    punpckhbw        m6, m7, m3
    punpcklbw        m7, m3
%if ARCH_X86_32
    mov              r5, [rsp+5*mmsize+12]
    pmaddubsw        m3, [r5], m6
    pmaddubsw        m6, [r5], m7
%else
    pmaddubsw        m3, m8, m6
    pmaddubsw        m6, m8, m7
%endif
    pmulhrsw         m3, m14
    pmulhrsw         m6, m14
    packsswb         m6, m3
    ; Sign-extend the blended grain bytes back to words.
    pcmpgtb          m7, m2, m6
    punpcklbw        m2, m6, m7
    punpckhbw        m6, m7

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmullw           m2, m4
    pmullw           m6, m5
    pmulhrsw         m2, m11
    pmulhrsw         m6, m11

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m1, m6
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    packuswb         m0, m1
    movifnidn      dstq, dstmp
    mova    [dstq+srcq], m0

    ; Advance the blend coefficients for the second overlapped row:
    ; x86-32 steps the cached pointer from pb_27_17 to the next table,
    ; x86-64 reloads m8 with pb_17_27.
%if ARCH_X86_32
    add dword [rsp+5*mmsize+12], mmsize
%else
    mova             m8, [pb_17_27]
%endif
    add            srcq, r2mp
    add      grain_lutq, 82
    dec              hw
    jz .end_y_v_overlap
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines
    btc              hd, 16
    jnc .loop_y_v_overlap
    jmp .loop_y
   1812 
.end_y_v_overlap:
    ; Advance to the next 16-pixel column; w counts up toward 0.
%if ARCH_X86_32
    add            r4mp, 16
%else
    add              wq, 16
%endif
    jge .end_hv
%if ARCH_X86_32
    mov            srcq, r1mp
    add            srcq, r4mp
%else
    lea            srcq, [src_bakq+wq]
%endif
    ; Bit 2 of r8m toggles every column: alternating columns take the
    ; full h+v overlap path vs. the v-overlap-only path.
    btc       dword r8m, 2
    jc .loop_x_hv_overlap
    ; Odd column: reuse the previous offsets shifted right by 16 pixels.
    add          offxyd, 16
%if ARCH_X86_32
    add dword [rsp+5*mmsize+1*gprsize], 16
%else
    add      top_offxyd, 16
%endif
    jmp .loop_x_odd_v_overlap
.loop_x_hv_overlap:
    ; New column with both horizontal (left) and vertical (top) overlap:
    ; derive fresh seeds for the current and top grain blocks, then the
    ; four grain offsets (cur, top, left, topleft).
%if ARCH_X86_32
    mov              r5, r5m
    lea              r5, [base+pb_27_17]
    mov [rsp+5*mmsize+12], r5

    DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, src_bak

    ; left/topleft offsets are the previous column's offsets + 16 pixels.
    mov              r5, [rsp+5*mmsize+1*gprsize]
    mov              r4, offxyd
    add              r5, 16
    add              r4, 16
    mov [rsp+5*mmsize+2*gprsize], r5        ; topleft_offxy
    mov [rsp+5*mmsize+0*gprsize], r4        ; left_offxy

    DEFINE_ARGS tmp, src, scaling, see, w, picptr, src_bak

    xor            tmpd, tmpd
    mov            seed, r3m
%else
    mova             m8, [pb_27_17]

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                tmp, unused2, see, unused3

    ; we assume from the block above that bits 8-15 of tmpd are zero'ed
%endif
    ; Dual 16-bit LFSR update: seed holds (cur_seed << 16) | top_seed and
    ; both halves are stepped in one pass using the parity flag.
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp           tmpb                     ; parity of top_seed
    shr            seed, 16
    shl            tmpd, 16
    test           seeb, seeh
    setp           tmpb                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor            tmpd, r6d
    mov            seed, tmpd
    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed

%if ARCH_X86_32
    mov             r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx

    mov           offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, top_offxy, topleft_offxy

    lea  topleft_offxyq, [top_offxyq+16]
    lea     left_offxyq, [offyq+16]
    mov           offyd, seed
    mov           offxd, seed
%endif
    ; Extract two packed 4-bit grain offsets per half (cur and top).
    ror           offyd, 8
    ror           offxd, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyq, [offyq+offxq*2+0x10001*747+32*82]

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut

    movzx            r5, offxyw             ; top_offxy
    mov [rsp+5*mmsize+1*gprsize], r5
%else
    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, top_offxy, topleft_offxy

    movzx    top_offxyd, offxyw
%endif
    shr          offxyd, 16

    mov              hd, r7m
    mov      grain_lutq, grain_lutmp
.loop_y_hv_overlap:
    ; grain = grain_lut[offy+y][offx+x]
    movu             m3, [grain_lutq+offxyq]
%if ARCH_X86_32
    mov              r5, [rsp+5*mmsize+1*gprsize]   ; top_offxy
    mov              r0, [rsp+5*mmsize+0*gprsize]   ; left_offxy
    movu             m6, [grain_lutq+r5]
    mov              r5, [rsp+5*mmsize+2*gprsize]   ; topleft_offxy
    movd             m4, [grain_lutq+r0]
    movd             m7, [grain_lutq+r5]
%else
    movu             m6, [grain_lutq+top_offxyq]
    movd             m4, [grain_lutq+left_offxyq]
    movd             m7, [grain_lutq+topleft_offxyq]
%endif
    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    ; m15 holds the horizontal blend coefficients (set up in the function
    ; prologue, outside this view); only the low 4 pixels are blended,
    ; shufps then keeps the unblended upper 12 from the original rows.
    punpcklbw        m4, m3
    punpcklbw        m7, m6
    pmaddubsw        m2, m15, m4
    pmaddubsw        m4, m15, m7
    pmulhrsw         m2, m14
    pmulhrsw         m4, m14
    packsswb         m2, m2
    packsswb         m4, m4
    shufps           m2, m3, q3210
    shufps           m4, m6, q3210
    ; followed by v interpolation (top | cur -> cur)
    punpcklbw        m3, m4, m2
    punpckhbw        m4, m2
%if ARCH_X86_32
    mov              r5, [rsp+5*mmsize+12]          ; 27/17 or 17/27 coeffs
    pmaddubsw        m7, [r5], m4
    pmaddubsw        m4, [r5], m3
%else
    pmaddubsw        m7, m8, m4
    pmaddubsw        m4, m8, m3
%endif
    pmulhrsw         m7, m14
    pmulhrsw         m4, m14
    packsswb         m4, m7
    ; Sign-extend blended grain bytes to words.
    pxor             m2, m2
    pcmpgtb          m7, m2, m4
    punpcklbw        m3, m4, m7
    punpckhbw        m4, m7

    ; src
    mova             m0, [srcq]
    punpckhbw        m1, m0, m2
    punpcklbw        m0, m2                 ; m0-1: src as word

    ; scaling[src]
%if ARCH_X86_32
    vpgatherdw       m5, m0, scalingq-1, r0, r5, m7
    vpgatherdw       m6, m1, scalingq-1, r0, r5, m7
%else
    vpgatherdw       m5, m0, scalingq-1, r13, r14, m7
    vpgatherdw       m6, m1, scalingq-1, r13, r14, m7
%endif
    REPX {psrlw x, 8}, m5, m6

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmullw           m3, m5
    pmullw           m4, m6
    pmulhrsw         m3, m11
    pmulhrsw         m4, m11

    ; dst = clip_pixel(src, noise)
    paddw            m0, m3
    paddw            m1, m4
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    packuswb         m0, m1
    movifnidn      dstq, dstmp
    mova    [dstq+srcq], m0

    ; Switch 27/17 -> 17/27 vertical coefficients for the second row.
%if ARCH_X86_32
    add dword [rsp+5*mmsize+12], mmsize
%else
    mova             m8, [pb_17_27]
%endif
    add            srcq, r2mp
    add      grain_lutq, 82
    dec              hw
    jz .end_y_hv_overlap
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines
    btc              hd, 16
    jnc .loop_y_hv_overlap
    jmp .loop_y_h_overlap
   2005 
.end_y_hv_overlap:
    ; Advance to the next 16-pixel column; w counts up toward 0.
%if ARCH_X86_32
    add            r4mp, 16
%else
    add              wq, 16
%endif
    jge .end_hv
%if ARCH_X86_32
    mov            srcq, r1m
    add            srcq, r4m
%else
    lea            srcq, [src_bakq+wq]
%endif
    ; Flip the odd/even-column flag and continue with the v-overlap-only
    ; path, reusing the previous offsets shifted right by 16 pixels.
    xor       dword r8m, 4
    add          offxyd, 16
%if ARCH_X86_32
    add dword [rsp+5*mmsize+1*gprsize], 16
%else
    add      top_offxyd, 16
%endif
    jmp .loop_x_odd_v_overlap

.end_hv:
    RET
   2030 
%macro FGUV_FN 3 ; name, ss_hor, ss_ver
INIT_XMM ssse3
%if ARCH_X86_32
; fguv_32x32xn_i420_ssse3(dst, src, stride, fg_data, w, scaling, grain_lut, h,
;                         sby, luma, lstride, uv_pl, is_id)
%if STACK_ALIGNMENT < mmsize
; Unaligned-stack x86-32: copy the stack arguments into an aligned home
; area on our own frame so the r#m defines below can address them.
DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8
cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 0 - (7 * mmsize + (13 + 3) * gprsize), \
        tmp, src, scaling, h, fg_data, picptr, unused
    mov              r0, r0m
    mov              r1, r2m
    mov              r2, r4m
    mov              r3, r6m
    mov              r4, r7m
    mov [rsp+7*mmsize+3*gprsize], r0        ; dst
    mov [rsp+7*mmsize+5*gprsize], r1        ; stride
    mov [rsp+7*mmsize+7*gprsize], r2        ; w
    mov [rsp+7*mmsize+9*gprsize], r3        ; grain_lut
    mov [rsp+7*mmsize+10*gprsize], r4       ; h

    mov              r0, r8m
    mov              r1, r9m
    mov              r2, r10m
    mov              r4, r11m
    mov              r3, r12m
    mov [rsp+7*mmsize+11*gprsize], r0       ; sby
    mov [rsp+7*mmsize+12*gprsize], r1       ; luma
    mov [rsp+7*mmsize+13*gprsize], r2       ; lstride
    mov [rsp+7*mmsize+14*gprsize], r4       ; uv_pl
%else
cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 7 * mmsize + (4) * gprsize, \
        tmp, src, scaling, h, fg_data, picptr, unused
%endif
    mov            srcq, srcm
    mov        fg_dataq, r3m
    mov        scalingq, r5m
%if STACK_ALIGNMENT < mmsize
; Redirect the r#m argument accessors to the home area filled above.
%define r0m [rsp+7*mmsize+ 3*gprsize]
%define r1m [rsp+7*mmsize+ 4*gprsize]
%define r2m [rsp+7*mmsize+ 5*gprsize]
%define r3m [rsp+7*mmsize+ 6*gprsize]
%define r4m [rsp+7*mmsize+ 7*gprsize]
%define r5m [rsp+7*mmsize+ 8*gprsize]
%define r6m [rsp+7*mmsize+ 9*gprsize]
%define r7m [rsp+7*mmsize+10*gprsize]
%define r8m [rsp+7*mmsize+11*gprsize]
%define r9m [rsp+7*mmsize+12*gprsize]
%define r10m [rsp+7*mmsize+13*gprsize]
%define r11m [rsp+7*mmsize+14*gprsize]
%define r12m [rsp+7*mmsize+15*gprsize]
%endif
    LEA              r5, pb_mask
%define base r5-pb_mask
    mov             r5m, r5
%else
cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
                                          grain_lut, tmp, sby, luma, lstride, uv_pl, is_id
    lea              r8, [pb_mask]
%define base r8-pb_mask
%endif
    ; m11 = scaling-shift multiplier, m12 = max clip, m13 = min clip
    ; (broadcast to all words). For is_id the tighter restricted-range
    ; limits are selected via cmovne.
    mov             r6d, [fg_dataq+FGData.scaling_shift]
    movd             m3, [base+mul_bits+r6*2-14]
    mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
    lea            tmpd, [r6d*2]
%if ARCH_X86_32 && STACK_ALIGNMENT < mmsize
    test             r3, r3
%else
    cmp      dword r12m, 0                      ; is_idm
%endif
    movd             m5, [base+min+r6*2]
    cmovne          r6d, tmpd
    movd             m4, [base+max+r6*2]
    punpcklwd        m3, m3
    punpcklwd        m5, m5
    punpcklwd        m4, m4
    pshufd           m3, m3, q0000
    pshufd           m5, m5, q0000
    pshufd           m4, m4, q0000
    SCRATCH           3, 11, 0
    SCRATCH           4, 12, 1
    SCRATCH           5, 13, 2

    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
    jne .csfl
   2115 
%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
%endif

%if %1
    ; not-csfl: load the per-plane uv_mult/uv_luma_mult/uv_offset values.
    ; m14 = interleaved {luma_mult, uv_mult} bytes, m15 = uv_offset words.
    mov             r6d, dword r11m
    movd             m0, [fg_dataq+FGData.uv_mult+r6*4]
    movd             m1, [fg_dataq+FGData.uv_luma_mult+r6*4]
    punpcklbw        m6, m1, m0
    movd             m7, [fg_dataq+FGData.uv_offset+r6*4]
    punpcklwd        m6, m6
    punpcklwd        m7, m7
    pshufd           m6, m6, q0000
    pshufd           m7, m7, q0000
    SCRATCH           6, 14, 3
    SCRATCH           7, 15, 4
%endif

    mov            sbyd, r8m
    mov        overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1
    test       overlapd, overlapd
    jz %%no_vertical_overlap
    ; Overlap enabled: m8 = pw_1024 rounding constant, m9 = horizontal
    ; overlap blend coefficients (23/22 when subsampled, 27/17/17/27 not).
%if ARCH_X86_32
%if %2
    mova             m1, [base+pb_23_22_h]
%else
    mova             m1, [base+pb_27_17_17_27]
%endif
    mova             m0, [base+pw_1024]
%else
%if %2
    mova             m1, [pb_23_22_h]
%else
    mova             m1, [pb_27_17_17_27]
%endif
    mova             m0, [pw_1024]
%endif
    SCRATCH           0, 8, 5
    SCRATCH           1, 9, 6
    test           sbyd, sbyd
    jnz %%vertical_overlap
    ; fall-through
   2161 
%%no_vertical_overlap:
    mov             r8m, overlapd
    ; Derive the per-superblock-row grain seed from sby and FGData.seed.
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap
    imul           seed, (173 << 24) | 37
%else
    imul           seed, sbyd, (173 << 24) | 37
%endif
    add            seed, (105 << 24) | 178
    rol            seed, 8
    movzx          seed, seew
    xor            seed, [fg_dataq+FGData.seed]

%if ARCH_X86_32
    mov             r3m, seed

    DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak
%define luma_bakq lumaq

    mov              wq, r4m
%if %3
    ; Vertically subsampled: luma advances two rows per chroma row.
    shl           r10mp, 1
%endif
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                unused2, unused3, see, overlap, unused4, src_bak, lstride, luma_bak
    mov        lstrideq, r10mp
%endif

    ; Point src/luma at the right edge and count w up toward 0; keep the
    ; end pointers so each new column can be rebased from them.
    mov           lumaq, r9mp
    lea        src_bakq, [srcq+wq]
    lea       luma_bakq, [lumaq+wq*(1+%2)]
    neg              wq
    sub            r0mp, srcq                  ; r0m now holds dst-src
%if ARCH_X86_32
    mov             r1m, src_bakq
    mov            r11m, luma_bakq
    mov             r4m, wq

    DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
%else
    mov           r11mp, src_bakq
    mov           r12mp, strideq
%endif
   2207 
%%loop_x:
%if ARCH_X86_32
    mov            seed, r3m
%endif
    ; Step the 16-bit LFSR once per column.
    mov             r6d, seed
    or             seed, 0xEFF4
    shr             r6d, 1
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d               ; updated seed
%if ARCH_X86_32
    mov             r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx

    mov           offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, overlap, unused1, unused2, lstride

    mov           offyd, seed
    mov           offxd, seed
%endif
    ; Extract the 4-bit grain offsets; constants are scaled by the
    ; subsampling factors (%2 = ss_hor, %3 = ss_ver).
    ror           offyd, 8
    shr           offxd, 12
    and           offyd, 0xf
    imul          offyd, 164>>%3
    lea           offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))]  ; offy*stride+offx

%if ARCH_X86_32
    DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, overlap, unused1, unused2, lstride, luma_bak
%endif
   2243 
%%loop_x_odd:
    mov              hd, r7m
    mov      grain_lutq, grain_lutmp
%%loop_y:
    ; src
%if ARCH_X86_32
    mov           lumaq, r9mp
%endif
%if %2
    ; ss_hor: 32 luma bytes cover 16 chroma pixels; pmaddubsw with pb_1
    ; sums horizontal pairs and pavgw against 0 rounds the >>1.
    mova             m4, [lumaq+ 0]
    mova             m6, [lumaq+16]
    mova             m0, [srcq]
%if ARCH_X86_32
    add           lumaq, r10mp
    mov            r9mp, lumaq
    mov              r5, r5m
    movd             m7, [base+pb_1]
%else
    movd             m7, [pb_1]
%endif
    pshufd           m7, m7, q0000
    pxor             m2, m2
    pmaddubsw        m4, m7
    pmaddubsw        m6, m7
    pavgw            m4, m2
    pavgw            m6, m2
%else
    mova             m4, [lumaq]
    mova             m0, [srcq]
%if ARCH_X86_32
    add           lumaq, r10mp
    mov            r9mp, lumaq
%endif
    pxor             m2, m2
%endif

%if %1
    ; not-csfl: scaling input = clip((luma*luma_mult + chroma*uv_mult)>>6
    ; + uv_offset); the pack/unpack pair performs the 8-bit clip.
%if %2
    packuswb         m4, m6                 ; luma
%endif
    punpckhbw        m6, m4, m0
    punpcklbw        m4, m0                 ; { luma, chroma }
    pmaddubsw        m6, m14
    pmaddubsw        m4, m14
    psraw            m6, 6
    psraw            m4, 6
    paddw            m6, m15
    paddw            m4, m15
    packuswb         m4, m6                 ; pack+unpack = clip
    punpckhbw        m6, m4, m2
    punpcklbw        m4, m2
%elif %2 == 0
    punpckhbw        m6, m4, m2
    punpcklbw        m4, m2
%endif

    ; scaling[luma_src]
%if ARCH_X86_32
    vpgatherdw       m7, m4, scalingq-1, r0, r5
    vpgatherdw       m5, m6, scalingq-1, r0, r5
%else
    vpgatherdw       m7, m4, scalingq-1, r12, r2
    vpgatherdw       m5, m6, scalingq-1, r12, r2
%endif
    REPX {psrlw x, 8}, m7, m5

    ; unpack chroma_source
    punpckhbw        m1, m0, m2
    punpcklbw        m0, m2                 ; m0-1: src as word

    ; grain = grain_lut[offy+y][offx+x]
    movu             m3, [grain_lutq+offxyq+ 0]
    pcmpgtb          m6, m2, m3
    punpcklbw        m2, m3, m6
    punpckhbw        m3, m6

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmullw           m2, m7
    pmullw           m3, m5
    pmulhrsw         m2, m11
    pmulhrsw         m3, m11

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m1, m3
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    packuswb         m0, m1
    movifnidn      dstq, dstmp
    mova    [dstq+srcq], m0                 ; dstmp holds dst-src (see prologue)

%if ARCH_X86_32
    add            srcq, r2mp
    ; we already incremented lumaq above
%else
    add            srcq, r12mp
%if %3
    lea           lumaq, [lumaq+lstrideq*2]
%else
    add           lumaq, lstrideq
%endif
%endif
    add      grain_lutq, 82
    dec              hw
    jg %%loop_y
   2355 
%if ARCH_X86_32
    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut

    mov              wq, r4m
%endif
    ; Column done: advance 16 chroma pixels and rebase src/luma from the
    ; saved end pointers.
    add              wq, 16
    jge %%end
%if ARCH_X86_32
    mov            srcq, r1mp
    mov           lumaq, r11mp
%else
    mov            srcq, r11mp
%endif
    lea           lumaq, [luma_bakq+wq*(1+%2)]
    add            srcq, wq
%if ARCH_X86_32
    mov             r4m, wq
    mov             r9m, lumaq
%endif
%if %2 == 0
    ; Unsubsampled: odd columns reuse the previous offsets + 16; bit 2 of
    ; r8m tracks odd/even, bit 1 whether vertical overlap applies.
    ; adjust top_offxy
%if ARCH_X86_32
    add dword [rsp+7*mmsize+1*gprsize], 16
%else
    add            r11d, 16
%endif
    add          offxyd, 16
    btc       dword r8m, 2
    jc %%loop_x_even
    test      dword r8m, 2
    jz %%loop_x_odd
    jmp %%loop_x_odd_v_overlap
%%loop_x_even:
%endif
    test      dword r8m, 1
    jz %%loop_x

    ; r8m = sbym
    test      dword r8m, 2
    jne %%loop_x_hv_overlap
   2396 
    ; horizontal overlap (without vertical overlap)
%%loop_x_h_overlap:
    ; Remember the previous column's offxy as left_offxy (+16 pixels when
    ; horizontally subsampled), then generate new offsets as in %%loop_x.
%if ARCH_X86_32
%if %2
    lea              r6, [offxyd+16]
    mov [rsp+7*mmsize+0*gprsize], r6
%else
    mov [rsp+7*mmsize+0*gprsize], offxyd
%endif

    DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut

    mov            seed, r3m
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, unused1, unused2, lstride

%if %2
    lea     left_offxyd, [offyd+16]         ; previous column's offy*stride+offx
%else
    mov     left_offxyd, offyd
%endif
%endif
    ; Step the 16-bit LFSR once per column.
    mov             r6d, seed
    or             seed, 0xEFF4
    shr             r6d, 1
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d                ; updated seed

%if ARCH_X86_32
    mov             r3m, seed

    DEFINE_ARGS luma, src, scaling, offy, w, picptr, offx

    mov          offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, unused1, unused2, lstride

    mov           offyd, seed
    mov           offxd, seed
%endif
    ror           offyd, 8
    shr           offxd, 12
    and           offyd, 0xf
    imul          offyd, 164>>%3
    lea           offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)]  ; offy*stride+offx

%if ARCH_X86_32
    DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, unused1, unused2, lstride, luma_bak
%endif
   2452 
    mov              hd, r7m
    mov      grain_lutq, grain_lutmp
%%loop_y_h_overlap:
    ; src
%if ARCH_X86_32
    mov           lumaq, r9mp
%endif
%if %2
    ; ss_hor: average luma pairs down to chroma resolution (see %%loop_y).
    mova             m4, [lumaq+ 0]
    mova             m6, [lumaq+16]
    mova             m0, [srcq]
%if ARCH_X86_32
    add           lumaq, r10mp
    mov            r9mp, lumaq
    mov              r5, r5m
    movd             m7, [base+pb_1]
%else
    movd             m7, [pb_1]
%endif
    pshufd           m7, m7, q0000
    pxor             m2, m2
    pmaddubsw        m4, m7
    pmaddubsw        m6, m7
    pavgw            m4, m2
    pavgw            m6, m2
%else
    mova             m4, [lumaq]
    mova             m0, [srcq]
%if ARCH_X86_32
    add           lumaq, r10mp
    mov            r9mp, lumaq
%endif
    pxor             m2, m2
%endif

%if %1
    ; not-csfl: combine luma and chroma through uv_mult/uv_luma_mult and
    ; add uv_offset, clipping to 8 bits (pack+unpack).
%if %2
    packuswb         m4, m6                 ; luma
%endif
    punpckhbw        m6, m4, m0
    punpcklbw        m4, m0                 ; { luma, chroma }
    pmaddubsw        m6, m14
    pmaddubsw        m4, m14
    psraw            m6, 6
    psraw            m4, 6
    paddw            m6, m15
    paddw            m4, m15
    packuswb         m4, m6                 ; pack+unpack = clip
    punpckhbw        m6, m4, m2
    punpcklbw        m4, m2
%elif %2 == 0
    punpckhbw        m6, m4, m2
    punpcklbw        m4, m2
%endif

    ; scaling[luma_src]
%if ARCH_X86_32
    vpgatherdw       m7, m4, scalingq-1, r0, r5
    vpgatherdw       m5, m6, scalingq-1, r0, r5
%else
    vpgatherdw       m7, m4, scalingq-1, r12, r2
    vpgatherdw       m5, m6, scalingq-1, r12, r2
%endif
    REPX {psrlw x, 8}, m7, m5

    ; unpack chroma_source
    punpckhbw        m1, m0, m2
    punpcklbw        m0, m2                 ; m0-1: src as word

    ; grain = grain_lut[offy+y][offx+x]
    ; Blend the left column's grain into the low pixels (coeffs in m9,
    ; rounding constant pw_1024 in m8), keep the rest via shufps, then
    ; sign-extend to words.
    movu             m4, [grain_lutq+offxyq+ 0]
%if ARCH_X86_32
    mov              r0, [rsp+7*mmsize+0*gprsize]   ; left_offxy
    movd             m2, [grain_lutq+r0+ 0]
%else
    movd             m2, [grain_lutq+left_offxyq+ 0]
%endif
    punpcklbw        m2, m4
    pmaddubsw        m3, m9, m2
    pmulhrsw         m3, m8
    packsswb         m3, m3
    shufps           m3, m4, q3210
    pxor             m4, m4
    pcmpgtb          m4, m3
    punpcklbw        m2, m3, m4
    punpckhbw        m3, m4

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmullw           m2, m7
    pmullw           m3, m5
    pmulhrsw         m2, m11
    pmulhrsw         m3, m11

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m1, m3
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    packuswb         m0, m1
    movifnidn      dstq, dstmp
    mova    [dstq+srcq], m0

%if ARCH_X86_32
    add            srcq, r2mp
    ; lumaq has already been incremented above
%else
    add            srcq, r12mp
%if %3
    lea           lumaq, [lumaq+lstrideq*2]
%else
    add           lumaq, lstrideq
%endif
%endif
    add      grain_lutq, 82
    dec              hw
    jg %%loop_y_h_overlap
   2575 
%if ARCH_X86_32
    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut

    mov              wq, r4m
%endif
    ; Column done: advance 16 chroma pixels and rebase src/luma.
    add              wq, 16
    jge %%end
%if ARCH_X86_32
    mov            srcq, r1mp
    mov           lumaq, r11mp
%else
    mov            srcq, r11mp
%endif
    lea           lumaq, [luma_bakq+wq*(1+%2)]
    add            srcq, wq
%if ARCH_X86_32
    mov             r4m, wq
    mov             r9m, lumaq
%endif
%if %2 == 0
    ; Unsubsampled: flip the odd/even-column flag and reuse the previous
    ; offsets shifted right by 16 pixels.
    xor       dword r8m, 4
    ; adjust top_offxyd
%if ARCH_X86_32
    add dword [rsp+7*mmsize+1*gprsize], 16
%else
    add            r11d, 16
%endif
    add          offxyd, 16
%endif

    ; r8m = sbym
    test      dword r8m, 2
%if %2
    jne %%loop_x_hv_overlap
    jmp %%loop_x_h_overlap
%else
    jne %%loop_x_odd_v_overlap
    jmp %%loop_x_odd
%endif

%%end:
    RET
   2618 
%%vertical_overlap:
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap
%endif

    or         overlapd, 2                  ; top_overlap: overlap & 2
    mov             r8m, overlapd
    movzx          sbyd, sbyb
    ; Compute both the current and the top superblock-row seeds in one
    ; packed dword: seed = (cur_seed << 16) | top_seed.
%if ARCH_X86_32
    imul             r4, [fg_dataq+FGData.seed], 0x00010001
    DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
%else
    imul           seed, [fg_dataq+FGData.seed], 0x00010001
%endif
    imul           tmpd, sbyd, 173 * 0x00010001
    imul           sbyd, 37 * 0x01000100
    add            tmpd, (105 << 16) | 188
    add            sbyd, (178 << 24) | (141 << 8)
    and            tmpd, 0x00ff00ff
    and            sbyd, 0xff00ff00
    xor            seed, tmpd
%if ARCH_X86_32
    xor            sbyd, seed               ; (cur_seed << 16) | top_seed

    DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak

    mov             r3m, seed
    mov              wq, r4m
%if %3
    ; Vertically subsampled: luma advances two rows per chroma row.
    shl           r10mp, 1
%endif
%else
    xor            seed, sbyd               ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                tmp, unused2, see, overlap, unused3, src_bak, lstride, luma_bak

    mov        lstrideq, r10mp
%endif

    ; Same pointer bookkeeping as in %%no_vertical_overlap.
    mov           lumaq, r9mp
    lea        src_bakq, [srcq+wq]
    lea       luma_bakq, [lumaq+wq*(1+%2)]
    neg              wq
    sub            r0mp, srcq
%if ARCH_X86_32
    mov             r1m, src_bakq
    mov            r11m, luma_bakq
    mov             r4m, wq

    DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
%else
    mov           r11mp, src_bakq
    mov           r12mp, strideq
%endif
   2676 
%%loop_x_v_overlap:
%if ARCH_X86_32
    mov            seed, r3m
    xor            tmpd, tmpd
%endif
    ; we assume from the block above that bits 8-15 of tmpd are zero'ed
    ; Dual LFSR step: update cur_seed (high word) and top_seed (low word)
    ; in one pass, using the parity flag for each half's feedback bit.
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp           tmpb                     ; parity of top_seed
    shr            seed, 16
    shl            tmpd, 16
    test           seeb, seeh
    setp           tmpb                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor            tmpd, r6d
    mov            seed, tmpd
    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed

%if ARCH_X86_32
    mov             r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx

    mov           offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, overlap, top_offxy, unused, lstride

    mov           offxd, seed
    mov           offyd, seed
%endif
    ; Two packed 4-bit offsets per half (cur and top), scaled by the
    ; subsampling factors.
    ror           offyd, 8
    ror           offxd, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

%if ARCH_X86_32
    DEFINE_ARGS tmp, src, scaling, offxy, h, picptr, top_offxy
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, overlap, top_offxy, unused, lstride, luma_bak
%endif

    ; Split: low word = top_offxy, high word = current offxy.
    movzx    top_offxyd, offxyw
    shr          offxyd, 16
%if ARCH_X86_32
    mov [rsp+7*mmsize+1*gprsize], top_offxyd

    DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
%endif
   2731 
   2732 %%loop_x_odd_v_overlap:
   2733    mov              hd, r7m
   2734    mov      grain_lutq, grain_lutmp
   2735 %if ARCH_X86_32
   2736    mov              r5, r5m
   2737 %endif
   2738 %if %3
   2739    mova             m1, [PIC_ptr(pb_23_22)]
   2740 %else
   2741    mova             m1, [PIC_ptr(pb_27_17)]
   2742 %endif
   2743 %%loop_y_v_overlap:
   2744 %if ARCH_X86_32
   2745    mov           lumaq, r9mp
   2746 %endif
   2747 %if %2
   2748    mova             m4, [lumaq+ 0]
   2749    mova             m6, [lumaq+16]
   2750    mova             m0, [srcq]
   2751 %if ARCH_X86_32
   2752    add           lumaq, r10mp
   2753    mov            r9mp, lumaq
   2754    mov              r5, r5m
   2755    movd             m7, [base+pb_1]
   2756 %else
   2757    movd             m7, [pb_1]
   2758 %endif
   2759    pshufd           m7, m7, q0000
   2760    pxor             m2, m2
   2761    pmaddubsw        m4, m7
   2762    pmaddubsw        m6, m7
   2763    pavgw            m4, m2
   2764    pavgw            m6, m2
   2765 %else
   2766    mova             m4, [lumaq]
   2767    mova             m0, [srcq]
   2768 %if ARCH_X86_32
   2769    add           lumaq, r10mp
   2770    mov            r9mp, lumaq
   2771 %endif
   2772    pxor             m2, m2
   2773 %endif
   2774 
   2775 %if %1
   2776 %if %2
   2777    packuswb         m4, m6                 ; luma
   2778 %endif
   2779    punpckhbw        m6, m4, m0
   2780    punpcklbw        m4, m0                 ; { luma, chroma }
   2781    pmaddubsw        m6, m14
   2782    pmaddubsw        m4, m14
   2783    psraw            m6, 6
   2784    psraw            m4, 6
   2785    paddw            m6, m15
   2786    paddw            m4, m15
   2787    packuswb         m4, m6                 ; pack+unpack = clip
   2788    punpckhbw        m6, m4, m2
   2789    punpcklbw        m4, m2
   2790 %elif %2 == 0
   2791    punpckhbw        m6, m4, m2
   2792    punpcklbw        m4, m2
   2793 %endif
   2794 
   2795    ; scaling[luma_src]
   2796 %if ARCH_X86_32
   2797    vpgatherdw       m7, m4, scalingq-1, r0, r5
   2798    vpgatherdw       m5, m6, scalingq-1, r0, r5
   2799 %else
   2800    vpgatherdw       m7, m4, scalingq-1, r12, r2
   2801    vpgatherdw       m5, m6, scalingq-1, r12, r2
   2802 %endif
   2803    REPX {psrlw x, 8}, m7, m5
   2804 
   2805    ; grain = grain_lut[offy+y][offx+x]
   2806    movu             m3, [grain_lutq+offxyq]
   2807 %if ARCH_X86_32
   2808    mov              r0, [rsp+7*mmsize+1*gprsize]
   2809    movu             m4, [grain_lutq+r0]
   2810 %else
   2811    movu             m4, [grain_lutq+top_offxyq]
   2812 %endif
   2813    punpckhbw        m6, m4, m3
   2814    punpcklbw        m4, m3
   2815    pmaddubsw        m2, m1, m6
   2816    pmaddubsw        m3, m1, m4
   2817    pmulhrsw         m2, m8
   2818    pmulhrsw         m3, m8
   2819    packsswb         m3, m2
   2820    pxor             m6, m6
   2821    pcmpgtb          m6, m3
   2822    punpcklbw        m2, m3, m6
   2823    punpckhbw        m3, m6
   2824 
   2825    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
   2826    pmullw           m2, m7
   2827    pmullw           m3, m5
   2828    pmulhrsw         m2, m11
   2829    pmulhrsw         m3, m11
   2830 
   2831    ; unpack chroma_source
   2832    pxor             m4, m4
   2833    punpckhbw        m6, m0, m4
   2834    punpcklbw        m0, m4                 ; m0-1: src as word
   2835 
   2836 %if ARCH_X86_32
   2837    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
   2838 %endif
   2839 
   2840    ; dst = clip_pixel(src, noise)
   2841    paddw            m0, m2
   2842    paddw            m6, m3
   2843    pmaxsw           m0, m13
   2844    pmaxsw           m6, m13
   2845    pminsw           m0, m12
   2846    pminsw           m6, m12
   2847    packuswb         m0, m6
   2848    movifnidn      dstq, dstmp
   2849    mova    [dstq+srcq], m0
   2850 
   2851    dec              hw
   2852    je %%end_y_v_overlap
   2853 %if ARCH_X86_32
   2854    add            srcq, r2mp
   2855    ; lumaq has already been incremented above
   2856 %else
   2857    add            srcq, r12mp
   2858 %if %3
   2859    lea           lumaq, [lumaq+lstrideq*2]
   2860 %else
   2861    add           lumaq, lstrideq
   2862 %endif
   2863 %endif
   2864    add      grain_lutq, 82
   2865 %if %3 == 0
   2866    btc              hd, 16
   2867 %if ARCH_X86_32
   2868    mov              r5, r5m
   2869 %endif
   2870    mova             m1, [PIC_ptr(pb_17_27)]
   2871    jnc %%loop_y_v_overlap
   2872 %endif
   2873    jmp %%loop_y
   2874 
   2875 %%end_y_v_overlap:
   2876 %if ARCH_X86_32
   2877    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
   2878 
   2879    mov              wq, r4m
   2880 %endif
   2881    add              wq, 16
   2882    jge %%end_hv
   2883 %if ARCH_X86_32
   2884    mov            srcq, r1mp
   2885    mov           lumaq, r11mp
   2886 %else
   2887    mov            srcq, r11mp
   2888 %endif
   2889    lea           lumaq, [luma_bakq+wq*(1+%2)]
   2890    add            srcq, wq
   2891 %if ARCH_X86_32
   2892    mov             r4m, wq
   2893    mov             r9m, lumaq
   2894 %endif
   2895 
   2896 %if %2
   2897    ; since fg_dataq.overlap is guaranteed to be set, we never jump
   2898    ; back to .loop_x_v_overlap, and instead always fall-through to
   2899    ; h+v overlap
   2900 %else
   2901 %if ARCH_X86_32
   2902    add dword [rsp+7*mmsize+1*gprsize], 16
   2903 %else
   2904    add      top_offxyd, 16
   2905 %endif
   2906    add          offxyd, 16
   2907    btc       dword r8m, 2
   2908    jnc %%loop_x_odd_v_overlap
   2909 %endif
   2910 
   2911 %%loop_x_hv_overlap:
   2912 %if ARCH_X86_32
   2913    DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused
   2914 
   2915    mov              r6, [rsp+7*mmsize+1*gprsize]
   2916 %if %2
   2917    lea              r0, [r3d+16]
   2918    add              r6, 16
   2919    mov [rsp+7*mmsize+0*gprsize], r0        ; left_offxy
   2920 %else
   2921    mov [rsp+7*mmsize+0*gprsize], r3        ; left_offxy
   2922 %endif
   2923    mov [rsp+7*mmsize+2*gprsize], r6        ; topleft_offxy
   2924 
   2925    DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused
   2926 
   2927    mov            seed, r3m
   2928    xor            tmpd, tmpd
   2929 %else
   2930    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
   2931                tmp, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride
   2932 
   2933 %if %2
   2934    lea  topleft_offxyq, [top_offxyq+16]
   2935    lea     left_offxyq, [offxyq+16]
   2936 %else
   2937    mov  topleft_offxyq, top_offxyq
   2938    mov     left_offxyq, offxyq
   2939 %endif
   2940 
   2941    ; we assume from the block above that bits 8-15 of tmpd are zero'ed
   2942 %endif
   2943    mov             r6d, seed
   2944    or             seed, 0xeff4eff4
   2945    test           seeb, seeh
   2946    setp           tmpb                     ; parity of top_seed
   2947    shr            seed, 16
   2948    shl            tmpd, 16
   2949    test           seeb, seeh
   2950    setp           tmpb                     ; parity of cur_seed
   2951    or              r6d, 0x00010001
   2952    xor            tmpd, r6d
   2953    mov            seed, tmpd
   2954    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed
   2955 
   2956 %if ARCH_X86_32
   2957    mov             r3m, seed
   2958 
   2959    DEFINE_ARGS tmp, src, scaling, offy, w, picptr, offx
   2960 
   2961    mov           offxd, offyd
   2962 %else
   2963    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
   2964                offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride
   2965 
   2966    mov           offxd, seed
   2967    mov           offyd, seed
   2968 %endif
   2969    ror           offyd, 8
   2970    ror           offxd, 12
   2971    and           offyd, 0xf000f
   2972    and           offxd, 0xf000f
   2973    imul          offyd, 164>>%3
   2974    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
   2975    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
   2976 
   2977 %if ARCH_X86_32
   2978    DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
   2979 %else
   2980    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
   2981                h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride, luma_bak
   2982 %endif
   2983 
   2984    movzx    top_offxyd, offxyw
   2985    shr          offxyd, 16
   2986 %if ARCH_X86_32
   2987    mov [rsp+7*mmsize+1*gprsize], top_offxyd
   2988 %endif
   2989 
   2990    mov              hd, r7m
   2991    mov      grain_lutq, grain_lutmp
   2992 %if ARCH_X86_32
   2993    mov              r5, r5m
   2994 %endif
   2995 %if %3
   2996    mova             m3, [PIC_ptr(pb_23_22)]
   2997 %else
   2998    mova             m3, [PIC_ptr(pb_27_17)]
   2999 %endif
   3000 %%loop_y_hv_overlap:
   3001    ; grain = grain_lut[offy+y][offx+x]
   3002 %if ARCH_X86_32
   3003    mov              r0, [rsp+7*mmsize+2*gprsize]       ; topleft_offxy
   3004    mov              r5, [rsp+7*mmsize+1*gprsize]       ; top_offxy
   3005    movd             m1, [grain_lutq+r0]
   3006    mov              r0, [rsp+7*mmsize+0*gprsize]       ; left_offxy
   3007 %else
   3008    movd             m1, [grain_lutq+topleft_offxyq]
   3009 %endif
   3010    movu             m2, [grain_lutq+offxyq]
   3011 %if ARCH_X86_32
   3012    movu             m6, [grain_lutq+r5]
   3013    movd             m4, [grain_lutq+r0]
   3014 %else
   3015    movu             m6, [grain_lutq+top_offxyq]
   3016    movd             m4, [grain_lutq+left_offxyq]
   3017 %endif
   3018    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
   3019    punpcklbw        m1, m6
   3020    punpcklbw        m4, m2
   3021    pmaddubsw        m0, m9, m1
   3022    pmaddubsw        m1, m9, m4
   3023    REPX {pmulhrsw x, m8}, m0, m1
   3024    packsswb         m0, m1
   3025    shufps           m4, m0, m2, q3232
   3026    shufps           m0, m6, q3210
   3027    ; followed by v interpolation (top | cur -> cur)
   3028    punpcklbw        m2, m0, m4
   3029    punpckhbw        m0, m4
   3030    pmaddubsw        m4, m3, m0
   3031    pmaddubsw        m1, m3, m2
   3032    pmulhrsw         m4, m8
   3033    pmulhrsw         m1, m8
   3034    packsswb         m1, m4
   3035 
   3036    ; src
   3037 %if ARCH_X86_32
   3038    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
   3039 
   3040    mov           lumaq, r9mp
   3041 %endif
   3042 %if %2
   3043    mova             m4, [lumaq+ 0]
   3044    mova             m6, [lumaq+16]
   3045    mova             m0, [srcq]
   3046 %if ARCH_X86_32
   3047    add           lumaq, r10mp
   3048    mov            r9mp, lumaq
   3049    mov              r5, r5m
   3050    movd             m7, [base+pb_1]
   3051 %else
   3052    movd             m7, [pb_1]
   3053 %endif
   3054    pshufd           m7, m7, q0000
   3055    pxor             m2, m2
   3056    pmaddubsw        m4, m7
   3057    pmaddubsw        m6, m7
   3058    pavgw            m4, m2
   3059    pavgw            m6, m2
   3060 %else
   3061    mova             m4, [lumaq]
   3062    mova             m0, [srcq]
   3063 %if ARCH_X86_32
   3064    add           lumaq, r10mp
   3065    mov            r9mp, lumaq
   3066 %endif
   3067    pxor             m2, m2
   3068 %endif
   3069 
   3070 %if %1
   3071 %if %2
   3072    packuswb         m4, m6                 ; luma
   3073 %endif
   3074    punpckhbw        m6, m4, m0
   3075    punpcklbw        m4, m0                 ; { luma, chroma }
   3076    pmaddubsw        m6, m14
   3077    pmaddubsw        m4, m14
   3078    psraw            m6, 6
   3079    psraw            m4, 6
   3080    paddw            m6, m15
   3081    paddw            m4, m15
   3082    packuswb         m4, m6                 ; pack+unpack = clip
   3083    punpckhbw        m6, m4, m2
   3084    punpcklbw        m4, m2
   3085 %elif %2 == 0
   3086    punpckhbw        m6, m4, m2
   3087    punpcklbw        m4, m2
   3088 %endif
   3089 
   3090    ; scaling[src]
   3091 %if ARCH_X86_32
   3092    vpgatherdw       m7, m4, scalingq-1, r0, r5
   3093    vpgatherdw       m5, m6, scalingq-1, r0, r5
   3094 %else
   3095 %if %3
   3096    vpgatherdw       m7, m4, scalingq-1, r2, r12
   3097    vpgatherdw       m5, m6, scalingq-1, r2, r12
   3098 %else
   3099    vpgatherdw       m7, m4, scalingq-1, r2, r13
   3100    vpgatherdw       m5, m6, scalingq-1, r2, r13
   3101 %endif
   3102 %endif
   3103    REPX {psrlw x, 8}, m7, m5
   3104 
   3105    ; unpack grain
   3106    pxor             m4, m4
   3107    pcmpgtb          m4, m1
   3108    punpcklbw        m2, m1, m4
   3109    punpckhbw        m1, m4
   3110 
   3111    ; noise = round2(scaling[src] * grain, scaling_shift)
   3112    pmullw           m2, m7
   3113    pmullw           m1, m5
   3114    pmulhrsw         m2, m11
   3115    pmulhrsw         m1, m11
   3116 
   3117 %if ARCH_X86_32
   3118    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
   3119 %endif
   3120 
   3121    ; unpack chroma source
   3122    pxor             m4, m4
   3123    punpckhbw        m5, m0, m4
   3124    punpcklbw        m0, m4                 ; m0-1: src as word
   3125 
   3126    ; dst = clip_pixel(src, noise)
   3127    paddw            m0, m2
   3128    paddw            m5, m1
   3129    pmaxsw           m0, m13
   3130    pmaxsw           m5, m13
   3131    pminsw           m0, m12
   3132    pminsw           m5, m12
   3133    packuswb         m0, m5
   3134    movifnidn      dstq, dstmp
   3135    mova    [dstq+srcq], m0
   3136 
   3137 %if ARCH_X86_32
   3138    add            srcq, r2mp
   3139    ; lumaq has been adjusted above already
   3140 %else
   3141    add            srcq, r12mp
   3142 %if %3
   3143    lea           lumaq, [lumaq+lstrideq*(1+%2)]
   3144 %else
   3145    add           lumaq, r10mp
   3146 %endif
   3147 %endif
   3148    add      grain_lutq, 82
   3149    dec              hw
   3150 %if %3
   3151    jg %%loop_y_h_overlap
   3152 %else
   3153    jle %%end_y_hv_overlap
   3154 %if ARCH_X86_32
   3155    mov              r5, r5m
   3156 %endif
   3157    mova             m3, [PIC_ptr(pb_17_27)]
   3158    btc              hd, 16
   3159    jnc %%loop_y_hv_overlap
   3160 %if ARCH_X86_64
   3161    mov        lstrideq, r10mp
   3162 %endif
   3163    jmp %%loop_y_h_overlap
   3164 %%end_y_hv_overlap:
   3165 %if ARCH_X86_64
   3166    mov        lstrideq, r10mp
   3167 %endif
   3168 %endif
   3169 
   3170 %if ARCH_X86_32
   3171    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
   3172 
   3173    mov              wq, r4m
   3174 %endif
   3175    add              wq, 16
   3176    jge %%end_hv
   3177 %if ARCH_X86_32
   3178    mov            srcq, r1mp
   3179    mov           lumaq, r11mp
   3180 %else
   3181    mov            srcq, r11mp
   3182 %endif
   3183    lea           lumaq, [luma_bakq+wq*(1+%2)]
   3184    add            srcq, wq
   3185 %if ARCH_X86_32
   3186    mov             r4m, wq
   3187    mov             r9m, lumaq
   3188 %endif
   3189 %if %2
   3190    jmp %%loop_x_hv_overlap
   3191 %else
   3192 %if ARCH_X86_32
   3193    add dword [rsp+7*mmsize+1*gprsize], 16
   3194 %else
   3195    add      top_offxyd, 16
   3196 %endif
   3197    add          offxyd, 16
   3198    xor       dword r8m, 4
   3199    jmp %%loop_x_odd_v_overlap
   3200 %endif
   3201 
   3202 %%end_hv:
   3203    RET
   3204 %endmacro
   3205 
   3206    %%FGUV_32x32xN_LOOP 1, %2, %3
   3207 .csfl:
   3208    %%FGUV_32x32xN_LOOP 0, %2, %3
   3209 %endmacro
   3210 
   3211 FGUV_FN 420, 1, 1
   3212 
   3213 %if STACK_ALIGNMENT < mmsize
   3214 DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
   3215 %endif
   3216 
   3217 FGUV_FN 422, 1, 0
   3218 
   3219 %if STACK_ALIGNMENT < mmsize
   3220 DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
   3221 %endif
   3222 
   3223 FGUV_FN 444, 0, 0