tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

filmgrain16_avx2.asm (74089B)


      1 ; Copyright © 2021-2022, VideoLAN and dav1d authors
      2 ; Copyright © 2021-2022, Two Orioles, LLC
      3 ; All rights reserved.
      4 ;
      5 ; Redistribution and use in source and binary forms, with or without
      6 ; modification, are permitted provided that the following conditions are met:
      7 ;
      8 ; 1. Redistributions of source code must retain the above copyright notice, this
      9 ;    list of conditions and the following disclaimer.
     10 ;
     11 ; 2. Redistributions in binary form must reproduce the above copyright notice,
     12 ;    this list of conditions and the following disclaimer in the documentation
     13 ;    and/or other materials provided with the distribution.
     14 ;
     15 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     16 ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     17 ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     18 ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     19 ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     20 ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     21 ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     22 ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     24 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25 
     26 %include "config.asm"
     27 %include "ext/x86/x86inc.asm"
     28 %include "x86/filmgrain_common.asm"
     29 
     30 %if ARCH_X86_64
     31 
     32 SECTION_RODATA 16
        ; Read-only constants for 16 bpc film grain generation. Where two
        ; values are stacked under one label (grain_max/grain_min, fg_max,
        ; fg_min, uv_offset_mul), the first entry appears to be the 10 bpc
        ; variant and the second the 12 bpc one, selected by bit depth at
        ; runtime -- TODO confirm against the scaling code later in the file.
     33 pb_mask:       db  0,128,128,  0,128,  0,  0,128,128,  0,  0,128,  0,128,128,  0
     34 gen_shufA:     db  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9
     35 gen_shufB:     db  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13
     36 next_upperbit_mask:    dw 0x100B, 0x2016, 0x402C, 0x8058
     37 pw_27_17_17_27:        dw 27, 17, 17, 27
     38 pw_23_22:              dw 23, 22, 0, 32
     39 pw_seed_xor:   times 2 dw 0xb524
     40               times 2 dw 0x49d8
     41 gen_ar0_shift: times 4 db 128
     42               times 4 db 64
     43               times 4 db 32
     44               times 4 db 16
     45 pd_16:                 dd 16
     46 pd_m65536:             dd -65536
     47 pb_1:          times 4 db 1
     48 grain_max:     times 2 dw  511
     49               times 2 dw 2047
     50 grain_min:     times 2 dw -512
     51               times 2 dw -2048
     52 fg_max:        times 2 dw 1023
     53               times 2 dw 4095
     54               times 2 dw 960
     55               times 2 dw 3840
     56               times 2 dw 940
     57               times 2 dw 3760
     58 fg_min:        times 2 dw 0
     59               times 2 dw 64
     60               times 2 dw 256
     61 uv_offset_mul:         dd 256
     62                       dd 1024
     63 hmul_bits:             dw 32768, 16384,  8192,  4096
     64 round:                 dw  2048,  1024,   512
     65 mul_bits:              dw   256,   128,    64,    32,    16,     8
     66 round_vals:            dw    32,    64,   128,   256,   512,  1024
     67 pb_8_9_0_1:            db 8, 9, 0, 1
     68 
     69 %macro JMP_TABLE 1-*
        ; %1 = function name, remaining args = AR lag values (0..3).
        ; Emits a table of 32-bit offsets, relative to the table base, to the
        ; function's .ar0/.ar1/... local labels. The grain generators index
        ; this table with FGData.ar_coeff_lag and tail-jump to the selected
        ; auto-regression filter.
     70    %xdefine %1_table %%table
     71    %xdefine %%base %1_table
     72    %xdefine %%prefix mangle(private_prefix %+ _%1)
     73    %%table:
     74    %rep %0 - 1
     75        dd %%prefix %+ .ar%2 - %%base
     76        %rotate 1
     77    %endrep
     78 %endmacro
     79 
     80 JMP_TABLE generate_grain_y_16bpc_avx2, 0, 1, 2, 3
     81 JMP_TABLE generate_grain_uv_420_16bpc_avx2, 0, 1, 2, 3
     82 JMP_TABLE generate_grain_uv_422_16bpc_avx2, 0, 1, 2, 3
     83 JMP_TABLE generate_grain_uv_444_16bpc_avx2, 0, 1, 2, 3
     84 
     85 SECTION .text
     86 
     87 %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
     89 INIT_YMM avx2
     90 cglobal generate_grain_y_16bpc, 3, 9, 14, buf, fg_data, bdmax
     91 %define base r4-generate_grain_y_16bpc_avx2_table
        ; Fills the 73x82-word luma grain buffer back-to-front via the
        ; negative byte index r3, 8 grain words per iteration: four 16-bit
        ; LFSR seeds are kept packed in xm0, each seed's output indexes
        ; gaussian_sequence, and the result is scaled according to
        ; grain_scale_shift (adjusted for bit depth via the round table).
        ; Afterwards control tail-jumps to .ar0-.ar3 as selected by
        ; FGData.ar_coeff_lag through the jump table at r4.
     92    lea              r4, [generate_grain_y_16bpc_avx2_table]
     93    vpbroadcastw    xm0, [fg_dataq+FGData.seed]
     94    mov             r6d, [fg_dataq+FGData.grain_scale_shift]
     95    movq            xm1, [base+next_upperbit_mask]
     96    mov              r3, -73*82*2
     97    movsxd           r5, [fg_dataq+FGData.ar_coeff_lag]
     98    lea             r7d, [bdmaxq+1]
     99    movq            xm4, [base+mul_bits]
    100    shr             r7d, 11             ; 0 for 10bpc, 2 for 12bpc
    101    movq            xm5, [base+hmul_bits]
    102    sub              r6, r7
    103    mova            xm6, [base+pb_mask]
    104    sub            bufq, r3
    105    vpbroadcastw    xm7, [base+round+r6*2-2]
    106    lea              r6, [gaussian_sequence]
    107    movsxd           r5, [r4+r5*4]
    108 .loop:
        ; Two seed-update rounds per iteration; scalar gaussian_sequence
        ; lookups are interleaved with the vector LFSR math to hide latency.
    109    pand            xm2, xm0, xm1
    110    psrlw           xm3, xm2, 10
    111    por             xm2, xm3            ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    112    pmullw          xm2, xm4            ; bits 0x0f00 are set
    113    pmulhuw         xm0, xm5
    114    pshufb          xm3, xm6, xm2       ; set 15th bit for next 4 seeds
    115    psllq           xm2, xm3, 30
    116    por             xm2, xm3
    117    psllq           xm3, xm2, 15
    118    por             xm2, xm0            ; aggregate each bit into next seed's high bit
    119    por             xm3, xm2            ; 4 next output seeds
    120    pshuflw         xm0, xm3, q3333
    121    psrlw           xm3, 5
    122    pand            xm2, xm0, xm1
    123    movq             r7, xm3
    124    psrlw           xm3, xm2, 10
    125    por             xm2, xm3
    126    pmullw          xm2, xm4
    127    pmulhuw         xm0, xm5
    128    movzx           r8d, r7w
    129    pshufb          xm3, xm6, xm2
    130    psllq           xm2, xm3, 30
    131    por             xm2, xm3
    132    psllq           xm3, xm2, 15
    133    por             xm0, xm2
    134    movd            xm2, [r6+r8*2]
    135    rorx             r8, r7, 32
    136    por             xm3, xm0
    137    shr             r7d, 16
    138    pinsrw          xm2, [r6+r7*2], 1
    139    pshuflw         xm0, xm3, q3333
    140    movzx           r7d, r8w
    141    psrlw           xm3, 5
    142    pinsrw          xm2, [r6+r7*2], 2
    143    shr             r8d, 16
    144    movq             r7, xm3
    145    pinsrw          xm2, [r6+r8*2], 3
    146    movzx           r8d, r7w
    147    pinsrw          xm2, [r6+r8*2], 4
    148    rorx             r8, r7, 32
    149    shr              r7d, 16
    150    pinsrw          xm2, [r6+r7*2], 5
    151    movzx           r7d, r8w
    152    pinsrw          xm2, [r6+r7*2], 6
    153    shr             r8d, 16
    154    pinsrw          xm2, [r6+r8*2], 7
    155    paddw           xm2, xm2            ; otherwise bpc=12 w/ grain_scale_shift=0
    156    pmulhrsw        xm2, xm7            ; shifts by 0, which pmulhrsw does not support
    157    mova      [bufq+r3], xm2
    158    add              r3, 8*2
    159    jl .loop
    160 
    161    ; auto-regression code
    162    add              r5, r4
    163    jmp              r5
    164 
    165 .ar1:
        ; AR(1) luma filter. Per pixel: the SIMD part precomputes
        ; top-left*cf0 + top*cf1 + top-right*cf2 + rnd for 4 pixels at a
        ; time; the serial inner loop adds left*cf3 (the value just written,
        ; hence the carried val3d), shifts right by ar_coeff_shift and
        ; clamps to [min, max] with max = bdmax>>1 and min = ~max.
    166    DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0
    167    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    168    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
    169    movd            xm4, [fg_dataq+FGData.ar_coeffs_y]
    170    DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0
    171    pinsrb          xm4, [base+pb_1], 3
    172    pmovsxbw        xm4, xm4
    173    pshufd          xm5, xm4, q1111
    174    pshufd          xm4, xm4, q0000
    175    vpbroadcastw    xm3, [base+round_vals+shiftq*2-12]    ; rnd
    176    sub            bufq, 2*(82*73-(82*3+79))
    177    mov              hd, 70
    178    sar            maxd, 1
    179    mov            mind, maxd
    180    xor            mind, -1                               ; min = ~max
    181 .y_loop_ar1:
    182    mov              xq, -76
    183    movsx         val3d, word [bufq+xq*2-2]
    184 .x_loop_ar1:
    185    movu            xm0, [bufq+xq*2-82*2-2]     ; top/left
    186    psrldq          xm2, xm0, 2                 ; top
    187    psrldq          xm1, xm0, 4                 ; top/right
    188    punpcklwd       xm0, xm2
    189    punpcklwd       xm1, xm3
    190    pmaddwd         xm0, xm4
    191    pmaddwd         xm1, xm5
    192    paddd           xm0, xm1
    193 .x_loop_ar1_inner:
    194    movd          val0d, xm0
    195    psrldq          xm0, 4
    196    imul          val3d, cf3d
    197    add           val3d, val0d
    198    sarx          val3d, val3d, shiftd
    199    movsx         val0d, word [bufq+xq*2]
    200    add           val3d, val0d
    201    cmp           val3d, maxd
    202    cmovg         val3d, maxd
    203    cmp           val3d, mind
    204    cmovl         val3d, mind
    205    mov word [bufq+xq*2], val3w
    206    ; keep val3d in-place as left for next x iteration
    207    inc              xq
    208    jz .x_loop_ar1_end
    209    test             xb, 3
    210    jnz .x_loop_ar1_inner
    211    jmp .x_loop_ar1
    212 .x_loop_ar1_end:
    213    add            bufq, 82*2
    214    dec              hd
    215    jg .y_loop_ar1
    216 .ar0:
        ; AR(0) for luma: nothing to filter, the grain buffer is final.
    217    RET
    218 
    219 .ar2:
        ; AR(2) luma filter: 12 coefficients covering the two rows above
        ; plus two left neighbours. The row contributions for 4 pixels are
        ; computed in SIMD with both rows interleaved in one ymm; the
        ; serial inner loop folds in the two left neighbours (cf10,cf11 in
        ; xm9), shifts by ar_coeff_shift and clamps to [min_grain,max_grain].
    220    DEFINE_ARGS buf, fg_data, bdmax, shift
    221    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    222    movq            xm0, [fg_dataq+FGData.ar_coeffs_y+5]    ; cf5-11
    223    vinserti128      m0, [fg_dataq+FGData.ar_coeffs_y+0], 1 ; cf0-4
    224    vpbroadcastw   xm10, [base+round_vals-12+shiftq*2]
    225    pxor             m1, m1
    226    punpcklwd      xm10, xm1
    227    pcmpgtb          m1, m0
    228    punpcklbw        m0, m1                                 ; cf5-11,0-4
    229    vpermq           m1, m0, q3333                          ; cf4
    230    vbroadcasti128  m11, [base+gen_shufA]
    231    pshufd           m6, m0, q0000                          ; cf[5,6], cf[0-1]
    232    vbroadcasti128  m12, [base+gen_shufB]
    233    pshufd           m7, m0, q1111                          ; cf[7,8], cf[2-3]
    234    punpckhwd       xm1, xm0
    235    pshufhw         xm9, xm0, q2121
    236    pshufd          xm8, xm1, q0000                         ; cf[4,9]
    237    sar          bdmaxd, 1
    238    punpckhqdq      xm9, xm9                                ; cf[10,11]
    239    movd            xm4, bdmaxd                             ; max_grain
    240    pcmpeqd         xm5, xm5
    241    sub            bufq, 2*(82*73-(82*3+79))
    242    pxor            xm5, xm4                                ; min_grain
    243    DEFINE_ARGS buf, fg_data, h, x
    244    mov              hd, 70
    245 .y_loop_ar2:
    246    mov              xq, -76
    247 .x_loop_ar2:
    248    vbroadcasti128   m2, [bufq+xq*2-82*4-4]        ; y=-2,x=[-2,+5]
    249    vinserti128      m1, m2, [bufq+xq*2-82*2-4], 0 ; y=-1,x=[-2,+5]
    250    pshufb           m0, m1, m11                   ; y=-1/-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
    251    pmaddwd          m0, m6
    252    punpckhwd       xm2, xm1                       ; y=-2/-1 interleaved, x=[+2,+5]
    253    pshufb           m1, m12                       ; y=-1/-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
    254    pmaddwd          m1, m7
    255    pmaddwd         xm2, xm8
    256    paddd            m0, m1
    257    vextracti128    xm1, m0, 1
    258    paddd           xm0, xm10
    259    paddd           xm2, xm0
    260    movu            xm0, [bufq+xq*2-4]      ; y=0,x=[-2,+5]
    261    paddd           xm2, xm1
    262    pmovsxwd        xm1, [bufq+xq*2]        ; in dwords, y=0,x=[0,3]
    263 .x_loop_ar2_inner:
    264    pmaddwd         xm3, xm9, xm0
    265    psrldq          xm0, 2
    266    paddd           xm3, xm2
    267    psrldq          xm2, 4                  ; shift top to next pixel
    268    psrad           xm3, [fg_dataq+FGData.ar_coeff_shift]
    269    ; skip packssdw because we only care about one value
    270    paddd           xm3, xm1
    271    pminsd          xm3, xm4
    272    psrldq          xm1, 4
    273    pmaxsd          xm3, xm5
    274    pextrw  [bufq+xq*2], xm3, 0
    275    punpcklwd       xm3, xm3
    276    pblendw         xm0, xm3, 0010b         ; insert result as left neighbour
    277    inc              xq
    278    jz .x_loop_ar2_end
    279    test             xb, 3
    280    jnz .x_loop_ar2_inner
    281    jmp .x_loop_ar2
    282 .x_loop_ar2_end:
    283    add            bufq, 82*2
    284    dec              hd
    285    jg .y_loop_ar2
    286    RET
    287 
    288 .ar3:
        ; AR(3) luma filter: 24 coefficients over the three rows above plus
        ; three left neighbours. Row sums for 4 pixels are computed in
        ; SIMD; the serial inner loop adds the left neighbours via xm10
        ; (cf21-23 with 2*rnd blended into the high word), shifts by
        ; ar_coeff_shift and clamps to [min_grain, max_grain].
    289    DEFINE_ARGS buf, fg_data, bdmax, shift
    290    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    291    sar          bdmaxd, 1
    292    movq            xm7, [fg_dataq+FGData.ar_coeffs_y+ 0]    ; cf0-6
    293    movd            xm0, [fg_dataq+FGData.ar_coeffs_y+14]    ; cf14-16
    294    pinsrb          xm7, [fg_dataq+FGData.ar_coeffs_y+13], 7 ; cf0-6,13
    295    pinsrb          xm0, [base+pb_1], 3                      ; cf14-16,pb_1
    296    movd            xm1, [fg_dataq+FGData.ar_coeffs_y+21]    ; cf21-23
    297    vinserti128      m7, [fg_dataq+FGData.ar_coeffs_y+ 7], 1 ; cf7-13
    298    vinserti128      m0, [fg_dataq+FGData.ar_coeffs_y+17], 1 ; cf17-20
    299    vpbroadcastw   xm11, [base+round_vals+shiftq*2-12]
    300    movd           xm12, bdmaxd                              ; max_grain
    301    punpcklbw        m7, m7                                  ; sign-extension
    302    punpcklbw        m0, m0                                  ; sign-extension
    303    punpcklbw       xm1, xm1
    304    REPX   {psraw x, 8}, m7, m0, xm1
    305    pshufd           m4, m7, q0000                           ; cf[0,1] | cf[7,8]
    306    pshufd           m5, m7, q1111                           ; cf[2,3] | cf[9,10]
    307    pshufd           m6, m7, q2222                           ; cf[4,5] | cf[11,12]
    308    pshufd          xm7, xm7, q3333                          ; cf[6,13]
    309    pshufd           m8, m0, q0000                           ; cf[14,15] | cf[17,18]
    310    pshufd           m9, m0, q1111                           ; cf[16],pw_1 | cf[19,20]
    311    paddw           xm0, xm11, xm11
    312    pcmpeqd        xm13, xm13
    313    pblendw        xm10, xm1, xm0, 00001000b
    314    pxor           xm13, xm12                                ; min_grain
    315    DEFINE_ARGS buf, fg_data, h, x
    316    sub            bufq, 2*(82*73-(82*3+79))
    317    mov              hd, 70
    318 .y_loop_ar3:
    319    mov              xq, -76
    320 .x_loop_ar3:
    321    movu            xm0, [bufq+xq*2-82*6-6+ 0]      ; y=-3,x=[-3,+4]
    322    vinserti128      m0, [bufq+xq*2-82*4-6+ 0], 1   ; y=-3/-2,x=[-3,+4]
    323    movq            xm1, [bufq+xq*2-82*6-6+16]      ; y=-3,x=[+5,+8]
    324    vinserti128      m1, [bufq+xq*2-82*4-6+16], 1   ; y=-3/-2,x=[+5,+12]
    325    palignr          m3, m1, m0, 2                  ; y=-3/-2,x=[-2,+5]
    326    palignr          m1, m0, 12                     ; y=-3/-2,x=[+3,+6]
    327    punpckhwd        m2, m0, m3                     ; y=-3/-2,x=[+1/+2,+2/+3,+3/+4,+4/+5]
    328    punpcklwd        m0, m3                         ; y=-3/-2,x=[-3/-2,-2/-1,-1/+0,+0/+1]
    329    shufps           m3, m0, m2, q1032              ; y=-3/-2,x=[-1/+0,+0/+1,+1/+2,+2/+3]
    330    pmaddwd          m0, m4
    331    pmaddwd          m2, m6
    332    pmaddwd          m3, m5
    333    paddd            m0, m2
    334    movu            xm2, [bufq+xq*2-82*2-6+ 0]      ; y=-1,x=[-3,+4]
    335    vinserti128      m2, [bufq+xq*2-82*2-6+ 6], 1   ; y=-1,x=[+1,+8]
    336    paddd            m0, m3
    337    psrldq           m3, m2, 2
    338    punpcklwd        m3, m2, m3                     ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
    339    pmaddwd          m3, m8                         ;      x=[+0/+1,+1/+2,+2/+3,+3/+4]
    340    paddd            m0, m3
    341    psrldq           m3, m2, 4
    342    psrldq           m2, 6
    343    vpblendd         m2, m11, 0x0f                  ; rounding constant
    344    punpcklwd        m3, m2                         ; y=-1,x=[-1/rnd,+0/rnd,+1/rnd,+2/rnd]
    345    pmaddwd          m3, m9                         ;      x=[+2/+3,+3/+4,+4/+5,+5,+6]
    346    vextracti128    xm2, m1, 1
    347    punpcklwd       xm1, xm2
    348    pmaddwd         xm1, xm7                        ; y=-3/-2 interleaved,x=[+3,+4,+5,+6]
    349    paddd            m0, m3
    350    vextracti128    xm2, m0, 1
    351    paddd           xm0, xm1
    352    movu            xm1, [bufq+xq*2-6]        ; y=0,x=[-3,+4]
    353    paddd           xm0, xm2
    354 .x_loop_ar3_inner:
    355    pmaddwd         xm2, xm1, xm10
    356    pshuflw         xm3, xm2, q1032
    357    paddd           xm2, xm0                ; add top
    358    paddd           xm2, xm3                ; left+cur
    359    psrldq          xm0, 4
    360    psrad           xm2, [fg_dataq+FGData.ar_coeff_shift]
    361    ; skip packssdw because we only care about one value
    362    pminsd          xm2, xm12
    363    pmaxsd          xm2, xm13
    364    pextrw  [bufq+xq*2], xm2, 0
    365    pslldq          xm2, 4
    366    psrldq          xm1, 2
    367    pblendw         xm1, xm2, 0100b         ; insert result as left neighbour
    368    inc              xq
    369    jz .x_loop_ar3_end
    370    test             xb, 3
    371    jnz .x_loop_ar3_inner
    372    jmp .x_loop_ar3
    373 .x_loop_ar3_end:
    374    add            bufq, 82*2
    375    dec              hd
    376    jg .y_loop_ar3
    377    RET
    378 
    379 %macro GEN_GRAIN_UV_FN 3 ; ss_name, ss_x, ss_y
        ; Chroma grain generator: same LFSR + gaussian_sequence scheme as
        ; the luma version, but the seed is XORed with a per-plane constant
        ; (pw_seed_xor[uv]) and only 4 grain words are produced per
        ; iteration. With horizontal subsampling (%2) only 44-word rows are
        ; written; without it the full 82*73 buffer is filled.
    380 INIT_XMM avx2
    381 cglobal generate_grain_uv_%1_16bpc, 4, 11, 8, buf, bufy, fg_data, uv, bdmax
    382 %define base r8-generate_grain_uv_%1_16bpc_avx2_table
    383    lea              r8, [generate_grain_uv_%1_16bpc_avx2_table]
    384    movifnidn    bdmaxd, bdmaxm
    385    vpbroadcastw    xm0, [fg_dataq+FGData.seed]
    386    mov             r5d, [fg_dataq+FGData.grain_scale_shift]
    387    movq            xm1, [base+next_upperbit_mask]
    388    lea             r6d, [bdmaxq+1]
    389    movq            xm4, [base+mul_bits]
    390    shr             r6d, 11             ; 0 for 10bpc, 2 for 12bpc
    391    movq            xm5, [base+hmul_bits]
    392    sub              r5, r6
    393    mova            xm6, [base+pb_mask]
    394    vpbroadcastd    xm2, [base+pw_seed_xor+uvq*4]
    395    vpbroadcastw    xm7, [base+round+r5*2-2]
    396    pxor            xm0, xm2
    397    lea              r6, [gaussian_sequence]
    398 %if %2
    399    mov             r7d, 73-35*%3
    400    add            bufq, 44*2
    401 .loop_y:
    402    mov              r5, -44*2
    403 %else
    404    mov              r5, -82*73*2
    405    sub            bufq, r5
    406 %endif
    407 .loop_x:
    408    pand            xm2, xm0, xm1
    409    psrlw           xm3, xm2, 10
    410    por             xm2, xm3            ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    411    pmullw          xm2, xm4            ; bits 0x0f00 are set
    412    pmulhuw         xm0, xm5
    413    pshufb          xm3, xm6, xm2       ; set 15th bit for next 4 seeds
    414    psllq           xm2, xm3, 30
    415    por             xm2, xm3
    416    psllq           xm3, xm2, 15
    417    por             xm2, xm0            ; aggregate each bit into next seed's high bit
    418    por             xm2, xm3            ; 4 next output seeds
    419    pshuflw         xm0, xm2, q3333
    420    psrlw           xm2, 5
    421    movq            r10, xm2
    422    movzx           r9d, r10w
    423    movd            xm2, [r6+r9*2]
    424    rorx             r9, r10, 32
    425    shr            r10d, 16
    426    pinsrw          xm2, [r6+r10*2], 1
    427    movzx          r10d, r9w
    428    pinsrw          xm2, [r6+r10*2], 2
    429    shr             r9d, 16
    430    pinsrw          xm2, [r6+r9*2], 3
    431    paddw           xm2, xm2            ; otherwise bpc=12 w/ grain_scale_shift=0
    432    pmulhrsw        xm2, xm7            ; shifts by 0, which pmulhrsw does not support
    433    movq      [bufq+r5], xm2
    434    add              r5, 8
    435    jl .loop_x
    436 %if %2
    437    add            bufq, 82*2
    438    dec             r7d
    439    jg .loop_y
    440 %endif
    441 
    442    ; auto-regression code
    443    movsxd           r6, [fg_dataq+FGData.ar_coeff_lag]
    444    movsxd           r6, [r8+r6*4]
    445    add              r6, r8
    446    jmp              r6
    447 
    448 INIT_YMM avx2
    449 .ar0:
        ; AR(0) chroma: each chroma grain value is predicted from the
        ; co-located luma grain only. Luma is (optionally) downsampled via
        ; phaddw/pmulhrsw, doubled, multiplied by the single AR coefficient
        ; (pre-shifted into m4), added to the chroma grain and clamped to
        ; [min_grain, max_grain].
    450    DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
    451    imul            uvd, 28
    452    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    453    vpbroadcastb     m0, [fg_dataq+FGData.ar_coeffs_uv+uvq]
    454    sar          bdmaxd, 1
    455    vpbroadcastd     m4, [base+gen_ar0_shift-24+shiftq*4]
    456    movd            xm6, bdmaxd
    457    pcmpeqw          m7, m7
    458    pmaddubsw        m4, m0  ; ar_coeff << (14 - shift)
    459    vpbroadcastw     m6, xm6 ; max_grain
    460    pxor             m7, m6  ; min_grain
    461    DEFINE_ARGS buf, bufy, h, x
    462 %if %2
    463    vpbroadcastw     m5, [base+hmul_bits+2+%3*2]
    464    sub            bufq, 2*(82*(73-35*%3)+82-(82*3+41))
    465 %else
    466    sub            bufq, 2*(82*70-3)
    467 %endif
    468    add           bufyq, 2*(3+82*3)
    469    mov              hd, 70-35*%3
    470 .y_loop_ar0:
    471 %if %2
    472    ; first 32 pixels
    473    movu            xm0, [bufyq+16*0]
    474    vinserti128      m0, [bufyq+16*2], 1
    475    movu            xm1, [bufyq+16*1]
    476    vinserti128      m1, [bufyq+16*3], 1
    477 %if %3
    478    movu            xm2, [bufyq+82*2+16*0]
    479    vinserti128      m2, [bufyq+82*2+16*2], 1
    480    movu            xm3, [bufyq+82*2+16*1]
    481    vinserti128      m3, [bufyq+82*2+16*3], 1
    482    paddw            m0, m2
    483    paddw            m1, m3
    484 %endif
    485    phaddw           m0, m1
    486    movu            xm1, [bufyq+16*4]
    487    vinserti128      m1, [bufyq+16*6], 1
    488    movu            xm2, [bufyq+16*5]
    489    vinserti128      m2, [bufyq+16*7], 1
    490 %if %3
    491    movu            xm3, [bufyq+82*2+16*4]
    492    vinserti128      m3, [bufyq+82*2+16*6], 1
    493    paddw            m1, m3
    494    movu            xm3, [bufyq+82*2+16*5]
    495    vinserti128      m3, [bufyq+82*2+16*7], 1
    496    paddw            m2, m3
    497 %endif
    498    phaddw           m1, m2
    499    pmulhrsw         m0, m5
    500    pmulhrsw         m1, m5
    501 %else
    502    xor              xd, xd
    503 .x_loop_ar0:
    504    movu             m0, [bufyq+xq*2]
    505    movu             m1, [bufyq+xq*2+32]
    506 %endif
    507    paddw            m0, m0
    508    paddw            m1, m1
    509    pmulhrsw         m0, m4
    510    pmulhrsw         m1, m4
    511 %if %2
    512    paddw            m0, [bufq+ 0]
    513    paddw            m1, [bufq+32]
    514 %else
    515    paddw            m0, [bufq+xq*2+ 0]
    516    paddw            m1, [bufq+xq*2+32]
    517 %endif
    518    pminsw           m0, m6
    519    pminsw           m1, m6
    520    pmaxsw           m0, m7
    521    pmaxsw           m1, m7
    522 %if %2
    523    movu      [bufq+ 0], m0
    524    movu      [bufq+32], m1
    525 
    526    ; last 6 pixels
    527    movu            xm0, [bufyq+32*4]
    528    movu            xm1, [bufyq+32*4+16]
    529 %if %3
    530    paddw           xm0, [bufyq+32*4+82*2]
    531    paddw           xm1, [bufyq+32*4+82*2+16]
    532 %endif
    533    phaddw          xm0, xm1
    534    movu            xm1, [bufq+32*2]
    535    pmulhrsw        xm0, xm5
    536    paddw           xm0, xm0
    537    pmulhrsw        xm0, xm4
    538    paddw           xm0, xm1
    539    pminsw          xm0, xm6
    540    pmaxsw          xm0, xm7
    541    vpblendd        xm0, xm1, 0x08  ; keep original words past the row end
    542    movu    [bufq+32*2], xm0
    543 %else
    544    movu [bufq+xq*2+ 0], m0
    545    movu [bufq+xq*2+32], m1
    546    add              xd, 32
    547    cmp              xd, 64
    548    jl .x_loop_ar0
    549 
    550    ; last 12 pixels
    551    movu             m0, [bufyq+64*2]
    552    movu             m1, [bufq+64*2]
    553    paddw            m0, m0
    554    pmulhrsw         m0, m4
    555    paddw            m0, m1
    556    pminsw           m0, m6
    557    pmaxsw           m0, m7
    558    vpblendd         m0, m1, 0xc0   ; keep original words past the row end
    559    movu    [bufq+64*2], m0
    560 %endif
    561    add            bufq, 82*2
    562    add           bufyq, 82*2<<%3
    563    dec              hd
    564    jg .y_loop_ar0
    565    RET
    566 
    567 INIT_XMM avx2
    568 .ar1:
        ; AR(1) chroma: like luma AR(1) (top-left/top/top-right + serial
        ; left neighbour), with a 4th tap applied to the downsampled
        ; co-located luma grain (coefficient from ar_coeffs_uv+4).
    569    DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x, shift
    570    imul            uvd, 28
    571    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    572    movsx          cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
    573    movd            xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
    574    pinsrb          xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3
    575    DEFINE_ARGS buf, bufy, h, val0, max, cf3, min, val3, x, shift
    576    pmovsxbw        xm4, xm4
    577    pshufd          xm5, xm4, q1111
    578    pshufd          xm4, xm4, q0000
    579    pmovsxwd        xm3, [base+round_vals+shiftq*2-12]    ; rnd
    580    vpbroadcastw    xm6, [base+hmul_bits+2+%3*2]
    581    vpbroadcastd    xm3, xm3
    582 %if %2
    583    sub            bufq, 2*(82*(73-35*%3)+44-(82*3+41))
    584 %else
    585    sub            bufq, 2*(82*69+3)
    586 %endif
    587    add           bufyq, 2*(79+82*3)
    588    mov              hd, 70-35*%3
    589    sar            maxd, 1
    590    mov            mind, maxd
    591    xor            mind, -1                ; min = ~max
    592 .y_loop_ar1:
    593    mov              xq, -(76>>%2)
    594    movsx         val3d, word [bufq+xq*2-2]
    595 .x_loop_ar1:
    596    movu            xm0, [bufq+xq*2-82*2-2] ; top/left
    597 %if %2
    598    movu            xm2, [bufyq+xq*4]
    599 %else
    600    movq            xm2, [bufyq+xq*2]
    601 %endif
    602 %if %2
    603 %if %3
    604    phaddw          xm2, [bufyq+xq*4+82*2]
    605    punpckhqdq      xm1, xm2, xm2
    606    paddw           xm2, xm1
    607 %else
    608    phaddw          xm2, xm2
    609 %endif
    610    pmulhrsw        xm2, xm6
    611 %endif
    612    psrldq          xm1, xm0, 4             ; top/right
    613    punpcklwd       xm1, xm2
    614    psrldq          xm2, xm0, 2             ; top
    615    punpcklwd       xm0, xm2
    616    pmaddwd         xm1, xm5
    617    pmaddwd         xm0, xm4
    618    paddd           xm1, xm3
    619    paddd           xm0, xm1
    620 .x_loop_ar1_inner:
    621    movd          val0d, xm0
    622    psrldq          xm0, 4
    623    imul          val3d, cf3d
    624    add           val3d, val0d
    625    sarx          val3d, val3d, shiftd
    626    movsx         val0d, word [bufq+xq*2]
    627    add           val3d, val0d
    628    cmp           val3d, maxd
    629    cmovg         val3d, maxd
    630    cmp           val3d, mind
    631    cmovl         val3d, mind
    632    mov word [bufq+xq*2], val3w
    633    ; keep val3d in-place as left for next x iteration
    634    inc              xq
    635    jz .x_loop_ar1_end
    636    test             xb, 3
    637    jnz .x_loop_ar1_inner
    638    jmp .x_loop_ar1
    639 .x_loop_ar1_end:
    640    add            bufq, 82*2
    641    add           bufyq, 82*2<<%3
    642    dec              hd
    643    jg .y_loop_ar1
    644    RET
    645 
    646 INIT_YMM avx2
    647 .ar2:
        ; AR(2) chroma: two rows of chroma grain context plus a luma term;
        ; the (optionally downsampled) luma sample is interleaved with the
        ; rounding constant and folded in through the m6 coefficient pair.
        ; Inner loop serially applies the two left neighbours (xm7), shifts
        ; and clamps, as in the luma AR(2) path.
    648 %if WIN64
    649    %assign stack_size_padded 136
    650    SUB             rsp, stack_size_padded
    651    WIN64_PUSH_XMM 13 + %2, 8
    652 %endif
    653    DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
    654    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    655    imul            uvd, 28
    656    vbroadcasti128  m10, [base+gen_shufA]
    657    sar          bdmaxd, 1
    658    vbroadcasti128  m11, [base+gen_shufB]
    659    movd            xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 5]
    660    pinsrb          xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+12], 4
    661    pinsrb          xm7, [base+pb_1], 5
    662    pinsrw          xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+10], 3
    663    movhps          xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]
    664    pinsrb          xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 9], 13
    665    pmovsxbw         m7, xm7
    666    movd            xm8, bdmaxd             ; max_grain
    667    pshufd           m4, m7, q0000
    668    vpbroadcastw   xm12, [base+round_vals-12+shiftq*2]
    669    pshufd           m5, m7, q1111
    670    pcmpeqd         xm9, xm9
    671    pshufd           m6, m7, q2222
    672    pxor            xm9, xm8                ; min_grain
    673    pshufd          xm7, xm7, q3333
    674    DEFINE_ARGS buf, bufy, fg_data, h, x
    675 %if %2
    676    vpbroadcastw   xm13, [base+hmul_bits+2+%3*2]
    677    sub            bufq, 2*(82*(73-35*%3)+44-(82*3+41))
    678 %else
    679    sub            bufq, 2*(82*69+3)
    680 %endif
    681    add           bufyq, 2*(79+82*3)
    682    mov              hd, 70-35*%3
    683 .y_loop_ar2:
    684    mov              xq, -(76>>%2)
    685 .x_loop_ar2:
    686    vbroadcasti128   m3, [bufq+xq*2-82*2-4]        ; y=-1,x=[-2,+5]
    687    vinserti128      m2, m3, [bufq+xq*2-82*4-4], 1 ; y=-2,x=[-2,+5]
    688    pshufb           m0, m2, m10                   ; y=-1/-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
    689    pmaddwd          m0, m4
    690    pshufb           m1, m2, m11                   ; y=-1/-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
    691    pmaddwd          m1, m5
    692    punpckhwd        m2, m3                        ; y=-2/-1 interleaved, x=[+2,+5]
    693 %if %2
    694    movu            xm3, [bufyq+xq*4]
    695 %if %3
    696    paddw           xm3, [bufyq+xq*4+82*2]
    697 %endif
    698    phaddw          xm3, xm3
    699    pmulhrsw        xm3, xm13
    700 %else
    701    movq            xm3, [bufyq+xq*2]
    702 %endif
    703    punpcklwd       xm3, xm12                   ; luma, round interleaved
    704    vpblendd         m2, m3, 0x0f
    705    pmaddwd          m2, m6
    706    paddd            m1, m0
    707    movu            xm0, [bufq+xq*2-4]      ; y=0,x=[-2,+5]
    708    paddd            m2, m1
    709    vextracti128    xm1, m2, 1
    710    paddd           xm2, xm1
    711    pshufd          xm1, xm0, q3321
    712    pmovsxwd        xm1, xm1                ; y=0,x=[0,3] in dword
    713 .x_loop_ar2_inner:
    714    pmaddwd         xm3, xm7, xm0
    715    paddd           xm3, xm2
    716    psrldq          xm2, 4                  ; shift top to next pixel
    717    psrad           xm3, [fg_dataq+FGData.ar_coeff_shift]
    718    ; we do not need to packssdw since we only care about one value
    719    paddd           xm3, xm1
    720    psrldq          xm1, 4
    721    pminsd          xm3, xm8
    722    pmaxsd          xm3, xm9
    723    pextrw  [bufq+xq*2], xm3, 0
    724    psrldq          xm0, 2
    725    pslldq          xm3, 2
    726    pblendw         xm0, xm3, 00000010b     ; insert result as left neighbour
    727    inc              xq
    728    jz .x_loop_ar2_end
    729    test             xb, 3
    730    jnz .x_loop_ar2_inner
    731    jmp .x_loop_ar2
    732 .x_loop_ar2_end:
    733    add            bufq, 82*2
    734    add           bufyq, 82*2<<%3
    735    dec              hd
    736    jg .y_loop_ar2
    737    RET
    738 
    739 .ar3:
    740 %if WIN64
    741    %assign stack_offset 32
    742    %assign stack_size_padded 152
    743    SUB             rsp, stack_size_padded
    744    WIN64_PUSH_XMM 14 + %2, 8
    745 %endif
    746    DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
    747    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    748    imul            uvd, 28
    749    vpbroadcastw   xm11, [base+round_vals-12+shiftq*2]
    750    sar          bdmaxd, 1
    751    movq            xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]
    752    pinsrb          xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+24], 7 ; luma
    753    movhps          xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 7]
    754    pmovsxbw         m7, xm7
    755 %if %2
    756    vpbroadcastw   xm14, [base+hmul_bits+2+%3*2]
    757 %endif
    758    pshufd           m4, m7, q0000
    759    pshufd           m5, m7, q1111
    760    pshufd           m6, m7, q2222
    761    pshufd           m7, m7, q3333
    762    movd            xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+14]
    763    pinsrb          xm0, [base+pb_1], 3
    764    pinsrd          xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+21], 1
    765    pinsrd          xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+17], 2
    766    pmovsxbw         m0, xm0
    767    movd           xm12, bdmaxd                 ; max_grain
    768    pshufd           m8, m0, q0000
    769    pshufd           m9, m0, q1111
    770    pcmpeqd        xm13, xm13
    771    punpckhqdq     xm10, xm0, xm0
    772    pxor           xm13, xm12                   ; min_grain
    773    pinsrw         xm10, [base+round_vals-10+shiftq*2], 3
    774    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
    775 %if %2
    776    sub            bufq, 2*(82*(73-35*%3)+44-(82*3+41))
    777 %else
    778    sub            bufq, 2*(82*69+3)
    779 %endif
    780    add           bufyq, 2*(79+82*3)
    781    mov              hd, 70-35*%3
    782 .y_loop_ar3:
    783    mov              xq, -(76>>%2)
    784 .x_loop_ar3:
    785    movu            xm2, [bufq+xq*2-82*6-6+ 0]    ; y=-3,x=[-3,+4]
    786    vinserti128      m2, [bufq+xq*2-82*4-6+ 0], 1 ; y=-3/-2,x=[-3,+4]
    787    movq            xm1, [bufq+xq*2-82*6-6+16]    ; y=-3,x=[+5,+8]
    788    vinserti128      m1, [bufq+xq*2-82*4-6+16], 1 ; y=-3/-2,x=[+5,+12]
    789    palignr          m3, m1, m2, 2                ; y=-3/-2,x=[-2,+5]
    790    palignr          m1, m2, 12                   ; y=-3/-2,x=[+3,+6]
    791    punpcklwd        m0, m2, m3                   ; y=-3/-2,x=[-3/-2,-2/-1,-1/+0,+0/+1]
    792    punpckhwd        m2, m3                       ; y=-3/-2,x=[+1/+2,+2/+3,+3/+4,+4/+5]
    793    shufps           m3, m0, m2, q1032            ; y=-3/-2,x=[-1/+0,+0/+1,+1/+2,+2/+3]
    794    pmaddwd          m0, m4
    795    pmaddwd          m2, m6
    796    pmaddwd          m3, m5
    797    paddd            m0, m2
    798    paddd            m0, m3
    799    movu            xm2, [bufq+xq*2-82*2-6+ 0]    ; y=-1,x=[-3,+4]
    800    vinserti128      m2, [bufq+xq*2-82*2-6+ 6], 1 ; y=-1,x=[+1,+8]
    801 %if %2
    802    movu            xm3, [bufyq+xq*4]
    803 %if %3
    804    paddw           xm3, [bufyq+xq*4+82*2]
    805 %endif
    806    phaddw          xm3, xm3
    807    pmulhrsw        xm3, xm14
    808 %else
    809    movq            xm3, [bufyq+xq*2]
    810 %endif
    811    punpcklwd        m1, m3
    812    pmaddwd          m1, m7
    813    paddd            m0, m1
    814    psrldq           m1, m2, 4
    815    psrldq           m3, m2, 6
    816    vpblendd         m3, m11, 0x0f                ; rounding constant
    817    punpcklwd        m1, m3                       ; y=-1,x=[-1/rnd,+0/rnd,+1/rnd,+2/rnd]
    818    pmaddwd          m1, m9                       ;      x=[+2/+3,+3/+4,+4/+5,+5,+6]
    819    psrldq           m3, m2, 2
    820    punpcklwd        m2, m3                       ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
    821    pmaddwd          m2, m8                       ;      x=[+0/+1,+1/+2,+2/+3,+3/+4]
    822    paddd            m0, m1
    823    movu            xm1, [bufq+xq*2-6]            ; y=0,x=[-3,+4]
    824    paddd            m0, m2
    825    vextracti128    xm2, m0, 1
    826    paddd           xm0, xm2
    827 .x_loop_ar3_inner:
    828    pmaddwd         xm2, xm1, xm10
    829    pshuflw         xm3, xm2, q1032
    830    paddd           xm2, xm0                      ; add top
    831    paddd           xm2, xm3                      ; left+cur
    832    psrldq          xm0, 4
    833    psrad           xm2, [fg_dataq+FGData.ar_coeff_shift]
    834    psrldq          xm1, 2
    835    ; no need to packssdw since we only care about one value
    836    pminsd          xm2, xm12
    837    pmaxsd          xm2, xm13
    838    pextrw  [bufq+xq*2], xm2, 0
    839    pslldq          xm2, 4
    840    pblendw         xm1, xm2, 00000100b
    841    inc              xq
    842    jz .x_loop_ar3_end
    843    test             xb, 3
    844    jnz .x_loop_ar3_inner
    845    jmp .x_loop_ar3
    846 .x_loop_ar3_end:
    847    add            bufq, 82*2
    848    add           bufyq, 82*2<<%3
    849    dec              hd
    850    jg .y_loop_ar3
    851    RET
    852 %endmacro
    853 
;---------------------------------------------------------------------------
; fgy_32x32xn_16bpc: apply luma film grain to a row of 32x32 blocks.
;   noise = round2(scaling[src] * grain_lut[offy+y][offx+x], scaling_shift)
;   dst   = clip(src + noise, fg_min, fg_max)
; A per-block PRNG seed selects the grain offsets; dedicated code paths
; blend the leading pixels/rows of each block with the previous column /
; row ("h" / "v" / "h+v" overlap) using the pw_27_17_17_27 weights.
; NOTE(review): the exact C prototype is not visible in this file chunk -
; confirm argument order against the film grain template headers.
;
; Constant vector registers, live for the whole function:
;   m8  = bdmax (broadcast; masks pixels into scaling-table indices)
;   m9  = grain_min        m10 = grain_max     (per-bitdepth grain clamps)
;   m11 = mul_bits entry selected by scaling_shift
;   m12 = fg_min           m13 = fg_max        (output pixel clamps)
;   m14 = pd_16 (rounding bias for the overlap blends, >> 5)
;---------------------------------------------------------------------------
cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, \
                                      grain_lut, unused, sby, see
%define base r11-grain_min
   lea             r11, [grain_min]
   mov             r6d, r9m ; bdmax
   mov             r9d, [fg_dataq+FGData.clip_to_restricted_range]
   mov             r7d, [fg_dataq+FGData.scaling_shift]
   mov            sbyd, sbym
   vpbroadcastd     m8, r9m
   shr             r6d, 11  ; is_12bpc
   vpbroadcastd     m9, [base+grain_min+r6*4]
   shlx           r10d, r9d, r6d              ; fg_min index: clip << is_12bpc
   vpbroadcastd    m10, [base+grain_max+r6*4]
   lea             r9d, [r6+r9*4]             ; fg_max index: is_12bpc + clip*4
   vpbroadcastw    m11, [base+mul_bits+r7*2-12]
   vpbroadcastd    m12, [base+fg_min+r10*4]
   vpbroadcastd    m13, [base+fg_max+r9*4]
   test           sbyd, sbyd
   setnz           r7b
   vpbroadcastd    m14, [base+pd_16]
   test            [fg_dataq+FGData.overlap_flag], r7b
   jnz .vertical_overlap

   ; no vertical overlap: derive this superblock row's starting seed from
   ; sby and the frame-level FGData seed
   imul           seed, sbyd, (173 << 24) | 37
   add            seed, (105 << 24) | 178
   rorx           seed, seed, 24
   movzx          seed, seew
   xor            seed, [fg_dataq+FGData.seed]

   DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
               offx, offy, see, src_bak

   lea        src_bakq, [srcq+wq*2]
   neg              wq
   sub            dstq, srcq               ; dst is addressed as dstq+srcq below

.loop_x:
   rorx             r6, seeq, 1
   or             seed, 0xEFF4
   test           seeb, seeh
   lea            seed, [r6+0x8000]
   cmovp          seed, r6d                ; updated seed
   rorx          offyd, seed, 8
   rorx          offxq, seeq, 12
   and           offyd, 0xf
   imul          offyd, 164
   lea           offyd, [offyq+offxq*2+747] ; offy*stride+offx

   DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
               h, offxy, see, src_bak

   mov      grain_lutq, grain_lutmp
   mov              hd, hm
.loop_y:
   ; scaling[src]
   ; (dword gathers of the byte scaling table; the -2 address offset places
   ; the odd words' scaling byte so pblendw 0x55 can merge even/odd lanes.
   ; AVX2 gathers zero their mask register, hence the m6<->m9 round-trips
   ; that keep m9 = grain_min alive.)
   mova             m0, [srcq+ 0]
   mova             m1, [srcq+32]
   pand             m4, m8, m0
   psrld            m3, m0, 16
   mova             m6, m9
   vpgatherdd       m2, [scalingq+m4-0], m9
   pand             m3, m8
   mova             m9, m6
   vpgatherdd       m4, [scalingq+m3-2], m6
   pand             m5, m8, m1
   mova             m6, m9
   vpgatherdd       m3, [scalingq+m5-0], m9
   pblendw          m4, m2, 0x55
   psrld            m2, m1, 16
   mova             m9, m6
   pand             m2, m8
   vpgatherdd       m5, [scalingq+m2-2], m6
   pblendw          m5, m3, 0x55

   ; noise = round2(scaling[src] * grain, scaling_shift)
   pmaddubsw        m4, m11
   pmaddubsw        m5, m11
   paddw            m4, m4
   paddw            m5, m5
   pmulhrsw         m4, [grain_lutq+offxyq*2]
   pmulhrsw         m5, [grain_lutq+offxyq*2+32]

   ; dst = clip_pixel(src, noise)
   paddw            m0, m4
   paddw            m1, m5
   pmaxsw           m0, m12
   pmaxsw           m1, m12
   pminsw           m0, m13
   pminsw           m1, m13
   mova [dstq+srcq+ 0], m0
   mova [dstq+srcq+32], m1

   add            srcq, strideq
   add      grain_lutq, 82*2
   dec              hd
   jg .loop_y
   add              wq, 32
   jge .end
   lea            srcq, [src_bakq+wq*2]
   cmp byte [fg_dataq+FGData.overlap_flag], 0
   je .loop_x
   movq            xm7, [pw_27_17_17_27]    ; h-overlap blend weights
   cmp       dword r8m, 0 ; sby
   jne .loop_x_hv_overlap

   ; horizontal overlap (without vertical overlap)
.loop_x_h_overlap:
   rorx             r6, seeq, 1
   or             seed, 0xEFF4
   test           seeb, seeh
   lea            seed, [r6+0x8000]
   cmovp          seed, r6d                ; updated seed

   DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
               offx, offy, see, src_bak, left_offxy

   lea     left_offxyd, [offyq+32]         ; previous column's offy*stride+offx
   rorx          offyd, seed, 8
   rorx          offxq, seeq, 12
   and           offyd, 0xf
   imul          offyd, 164
   lea           offyd, [offyq+offxq*2+747] ; offy*stride+offx

   DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
               h, offxy, see, src_bak, left_offxy

   mov      grain_lutq, grain_lutmp
   mov              hd, hm
.loop_y_h_overlap:
   ; scaling[src] (same gather scheme as .loop_y)
   mova             m0, [srcq+ 0]
   mova             m1, [srcq+32]
   pand             m4, m8, m0
   psrld            m3, m0, 16
   mova             m6, m9
   vpgatherdd       m2, [scalingq+m4-0], m9
   pand             m3, m8
   mova             m9, m6
   vpgatherdd       m4, [scalingq+m3-2], m6
   pand             m5, m8, m1
   mova             m6, m9
   vpgatherdd       m3, [scalingq+m5-0], m9
   pblendw          m4, m2, 0x55
   psrld            m2, m1, 16
   mova             m9, m6
   pand             m2, m8
   vpgatherdd       m5, [scalingq+m2-2], m6
   pblendw          m5, m3, 0x55

   ; grain = grain_lut[offy+y][offx+x]
   ; blend the first pixel with the previous column's grain, then clamp
   ; back to the grain range before scaling
   movu             m3, [grain_lutq+offxyq*2]
   movd            xm6, [grain_lutq+left_offxyq*2]
   punpcklwd       xm6, xm3
   pmaddwd         xm6, xm7
   paddd           xm6, xm14
   psrad           xm6, 5
   packssdw        xm6, xm6
   pmaxsw          xm6, xm9
   pminsw          xm6, xm10
   vpblendd         m3, m6, 0x01

   ; noise = round2(scaling[src] * grain, scaling_shift)
   pmaddubsw        m4, m11
   pmaddubsw        m5, m11
   paddw            m4, m4
   paddw            m5, m5
   pmulhrsw         m4, m3
   pmulhrsw         m5, [grain_lutq+offxyq*2+32]

   ; dst = clip_pixel(src, noise)
   paddw            m0, m4
   paddw            m1, m5
   pmaxsw           m0, m12
   pmaxsw           m1, m12
   pminsw           m0, m13
   pminsw           m1, m13
   mova [dstq+srcq+ 0], m0
   mova [dstq+srcq+32], m1

   add            srcq, strideq
   add      grain_lutq, 82*2
   dec              hd
   jg .loop_y_h_overlap
   add              wq, 32
   jge .end
   lea            srcq, [src_bakq+wq*2]
   cmp       dword r8m, 0 ; sby
   jne .loop_x_hv_overlap
   jmp .loop_x_h_overlap

.vertical_overlap:
   DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \
               sby, see, src_bak

   movzx          sbyd, sbyb
   ; compute two 16-bit seeds at once in one 32-bit register:
   ; (cur_seed << 16) | top_seed
   imul           seed, [fg_dataq+FGData.seed], 0x00010001
   imul            r7d, sbyd, 173 * 0x00010001
   imul           sbyd, 37 * 0x01000100
   add             r7d, (105 << 16) | 188
   add            sbyd, (178 << 24) | (141 << 8)
   and             r7d, 0x00ff00ff
   and            sbyd, 0xff00ff00
   xor            seed, r7d
   xor            seed, sbyd               ; (cur_seed << 16) | top_seed

   lea        src_bakq, [srcq+wq*2]
   neg              wq
   sub            dstq, srcq

.loop_x_v_overlap:
   vpbroadcastd    m15, [pw_27_17_17_27]

   ; we assume from the block above that bits 8-15 of r7d are zero'ed
   mov             r6d, seed
   or             seed, 0xeff4eff4
   test           seeb, seeh
   setp            r7b                     ; parity of top_seed
   shr            seed, 16
   shl             r7d, 16
   test           seeb, seeh
   setp            r7b                     ; parity of cur_seed
   or              r6d, 0x00010001
   xor             r7d, r6d
   rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed

   DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
               offx, offy, see, src_bak, unused, top_offxy

   rorx          offyd, seed, 8
   rorx          offxd, seed, 12
   and           offyd, 0xf000f
   and           offxd, 0xf000f
   imul          offyd, 164
   ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
   lea           offyd, [offyq+offxq*2+0x10001*747+32*82]

   DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
               h, offxy, see, src_bak, unused, top_offxy

   mov      grain_lutq, grain_lutmp
   mov              hd, hm
   movzx    top_offxyd, offxyw             ; unpack (cur_offxy << 16) | top_offxy
   shr          offxyd, 16
.loop_y_v_overlap:
   ; scaling[src] (same gather scheme as .loop_y; merged with pblendw 0xaa)
   mova             m0, [srcq+ 0]
   mova             m1, [srcq+32]
   pand             m4, m8, m0
   psrld            m3, m0, 16
   mova             m6, m9
   vpgatherdd       m2, [scalingq+m4-0], m9
   pand             m3, m8
   mova             m9, m6
   vpgatherdd       m4, [scalingq+m3-2], m6
   pand             m5, m8, m1
   mova             m6, m9
   vpgatherdd       m3, [scalingq+m5-0], m9
   pblendw          m2, m4, 0xaa
   psrld            m4, m1, 16
   mova             m9, m6
   pand             m4, m8
   vpgatherdd       m5, [scalingq+m4-2], m6
   pblendw          m3, m5, 0xaa

   ; grain = grain_lut[offy+y][offx+x]
   ; blend the whole row with the row above (weights in m15), then clamp
   movu             m6, [grain_lutq+offxyq*2]
   movu             m5, [grain_lutq+top_offxyq*2]
   punpcklwd        m4, m5, m6
   punpckhwd        m5, m6
   pmaddwd          m4, m15
   pmaddwd          m5, m15
   movu             m7, [grain_lutq+offxyq*2+32]
   movu             m6, [grain_lutq+top_offxyq*2+32]
   paddd            m4, m14
   paddd            m5, m14
   psrad            m4, 5
   psrad            m5, 5
   packssdw         m4, m5
   punpcklwd        m5, m6, m7
   punpckhwd        m6, m7
   pmaddwd          m5, m15
   pmaddwd          m6, m15
   paddd            m5, m14
   paddd            m6, m14
   psrad            m5, 5
   psrad            m6, 5
   packssdw         m5, m6
   pmaxsw           m4, m9
   pmaxsw           m5, m9
   pminsw           m4, m10
   pminsw           m5, m10

   ; noise = round2(scaling[src] * grain, scaling_shift)
   pmaddubsw        m2, m11
   pmaddubsw        m3, m11
   paddw            m2, m2
   paddw            m3, m3
   pmulhrsw         m4, m2
   pmulhrsw         m5, m3

   ; dst = clip_pixel(src, noise)
   paddw            m0, m4
   paddw            m1, m5
   pmaxsw           m0, m12
   pmaxsw           m1, m12
   pminsw           m0, m13
   pminsw           m1, m13
   mova [dstq+srcq+ 0], m0
   mova [dstq+srcq+32], m1

   add            srcq, strideq
   add      grain_lutq, 82*2
   dec              hb
   jz .end_y_v_overlap
   vpbroadcastd    m15, [pw_27_17_17_27+4] ; swap weights for second v-overlap line
   ; 2 lines get vertical overlap, then fall back to non-overlap code for
   ; remaining (up to) 30 lines
   add              hd, 0x80000000
   jnc .loop_y_v_overlap
   jmp .loop_y
.end_y_v_overlap:
   add              wq, 32
   jge .end
   lea            srcq, [src_bakq+wq*2]

   ; since fg_dataq.overlap is guaranteed to be set, we never jump
   ; back to .loop_x_v_overlap, and instead always fall-through to
   ; h+v overlap

.loop_x_hv_overlap:
   vpbroadcastd    m15, [pw_27_17_17_27]

   ; we assume from the block above that bits 8-15 of r7d are zero'ed
   mov             r6d, seed
   or             seed, 0xeff4eff4
   test           seeb, seeh
   setp            r7b                     ; parity of top_seed
   shr            seed, 16
   shl             r7d, 16
   test           seeb, seeh
   setp            r7b                     ; parity of cur_seed
   or              r6d, 0x00010001
   xor             r7d, r6d
   rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed

   DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
               offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy

   lea  topleft_offxyd, [top_offxyq+32]    ; previous column, row above
   lea     left_offxyd, [offyq+32]         ; previous column, current row
   rorx          offyd, seed, 8
   rorx          offxd, seed, 12
   and           offyd, 0xf000f
   and           offxd, 0xf000f
   imul          offyd, 164
   ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
   lea           offyd, [offyq+offxq*2+0x10001*747+32*82]

   DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
               h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy

   mov      grain_lutq, grain_lutmp
   mov              hd, hm
   movzx    top_offxyd, offxyw             ; unpack (cur_offxy << 16) | top_offxy
   shr          offxyd, 16
.loop_y_hv_overlap:
   ; scaling[src] (same gather scheme as .loop_y; merged with pblendw 0xaa)
   mova             m0, [srcq+ 0]
   mova             m1, [srcq+32]
   pand             m4, m8, m0
   psrld            m3, m0, 16
   mova             m6, m9
   vpgatherdd       m2, [scalingq+m4-0], m9
   pand             m3, m8
   mova             m9, m6
   vpgatherdd       m4, [scalingq+m3-2], m6
   pand             m5, m8, m1
   mova             m6, m9
   vpgatherdd       m3, [scalingq+m5-0], m9
   pblendw          m2, m4, 0xaa
   psrld            m4, m1, 16
   mova             m9, m6
   pand             m4, m8
   vpgatherdd       m5, [scalingq+m4-2], m6
   pblendw          m3, m5, 0xaa

   ; grain = grain_lut[offy+y][offx+x]
   movu             m7, [grain_lutq+offxyq*2]
   movd            xm6, [grain_lutq+left_offxyq*2]
   movu             m5, [grain_lutq+top_offxyq*2]
   movd            xm4, [grain_lutq+topleft_offxyq*2]
   ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
   punpcklwd       xm6, xm7
   punpcklwd       xm4, xm5
   punpcklqdq      xm6, xm4
   movddup         xm4, [pw_27_17_17_27]
   pmaddwd         xm6, xm4
   paddd           xm6, xm14
   psrad           xm6, 5
   packssdw        xm6, xm6
   pmaxsw          xm6, xm9
   pminsw          xm6, xm10
   pshuflw         xm4, xm6, q1032
   vpblendd         m6, m7, 0xfe
   vpblendd         m4, m5, 0xfe
   ; followed by v interpolation (top | cur -> cur)
   punpckhwd        m5, m7
   pmaddwd          m5, m15
   punpcklwd        m4, m6
   pmaddwd          m4, m15
   movu             m7, [grain_lutq+offxyq*2+32]
   movu             m6, [grain_lutq+top_offxyq*2+32]
   paddd            m5, m14
   paddd            m4, m14
   psrad            m5, 5
   psrad            m4, 5
   packssdw         m4, m5
   punpcklwd        m5, m6, m7
   punpckhwd        m6, m7
   pmaddwd          m5, m15
   pmaddwd          m6, m15
   paddd            m5, m14
   paddd            m6, m14
   psrad            m5, 5
   psrad            m6, 5
   packssdw         m5, m6
   pmaxsw           m4, m9
   pmaxsw           m5, m9
   pminsw           m4, m10
   pminsw           m5, m10

   ; noise = round2(scaling[src] * grain, scaling_shift)
   pmaddubsw        m2, m11
   pmaddubsw        m3, m11
   paddw            m2, m2
   paddw            m3, m3
   pmulhrsw         m4, m2
   pmulhrsw         m5, m3

   ; dst = clip_pixel(src, noise)
   paddw            m0, m4
   paddw            m1, m5
   pmaxsw           m0, m12
   pmaxsw           m1, m12
   pminsw           m0, m13
   pminsw           m1, m13
   mova [dstq+srcq+ 0], m0
   mova [dstq+srcq+32], m1

   add            srcq, strideq
   add      grain_lutq, 82*2
   dec              hb
   jz .end_y_hv_overlap
   vpbroadcastd    m15, [pw_27_17_17_27+4] ; swap weights for second v-overlap line
   ; 2 lines get vertical overlap, then fall back to non-overlap code for
   ; remaining (up to) 30 lines
   add              hd, 0x80000000
   jnc .loop_y_hv_overlap
   movq            xm7, [pw_27_17_17_27]   ; restore h-only blend weights
   jmp .loop_y_h_overlap
.end_y_hv_overlap:
   add              wq, 32
   lea            srcq, [src_bakq+wq*2]
   jl .loop_x_hv_overlap
.end:
   RET
   1320 
   1321 %macro FGUV_FN 3 ; name, ss_hor, ss_ver
   1322 cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
   1323                                           grain_lut, h, sby, luma, lstride, uv_pl, is_id
   1324 %define base r12-grain_min
   1325    lea             r12, [grain_min]
   1326    mov             r9d, r13m               ; bdmax
   1327    mov             r7d, [fg_dataq+FGData.scaling_shift]
   1328    mov            r11d, is_idm
   1329    mov            sbyd, sbym
   1330    vpbroadcastw    m11, [base+mul_bits+r7*2-12]
   1331    mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
   1332    shr             r9d, 11                 ; is_12bpc
   1333    vpbroadcastd     m8, [base+grain_min+r9*4]
   1334    shlx           r10d, r6d, r9d
   1335    vpbroadcastd     m9, [base+grain_max+r9*4]
   1336    vpbroadcastw    m10, r13m
   1337    shlx            r6d, r6d, r11d
   1338    vpbroadcastd    m12, [base+fg_min+r10*4]
   1339    lea             r6d, [r9+r6*2]
   1340    vpbroadcastd    m13, [base+fg_max+r6*4]
   1341    test           sbyd, sbyd
   1342    setnz           r7b
   1343    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
   1344    jne .csfl
   1345 
   1346 %macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
   1347    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
   1348                unused, sby, see, overlap
   1349 
   1350 %if %1
   1351    mov             r6d, r11m
   1352    vpbroadcastd     m0, [base+pb_8_9_0_1]
   1353    vpbroadcastd     m1, [base+uv_offset_mul+r9*4]
   1354    vbroadcasti128  m14, [fg_dataq+FGData.uv_mult+r6*4]
   1355    vpbroadcastd    m15, [fg_dataq+FGData.uv_offset+r6*4]
   1356    pshufb          m14, m0 ; { uv_luma_mult, uv_mult }
   1357    pmaddwd         m15, m1
   1358 %else
   1359 %if %2
   1360    vpbroadcastq    m15, [base+pw_23_22]
   1361 %else
   1362    vpbroadcastq    m15, [base+pw_27_17_17_27]
   1363 %endif
   1364    vpbroadcastd    m14, [base+pd_16]
   1365 %endif
   1366    test            [fg_dataq+FGData.overlap_flag], r7b
   1367    jnz %%vertical_overlap
   1368 
   1369    imul           seed, sbyd, (173 << 24) | 37
   1370    add            seed, (105 << 24) | 178
   1371    rorx           seed, seed, 24
   1372    movzx          seed, seew
   1373    xor            seed, [fg_dataq+FGData.seed]
   1374 
   1375    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
   1376                unused2, unused3, see, unused4, unused5, unused6, luma, lstride
   1377 
   1378    mov           lumaq, r9mp
   1379    mov        lstrideq, r10mp
   1380    lea             r10, [srcq+wq*2]
   1381    lea             r11, [dstq+wq*2]
   1382    lea             r12, [lumaq+wq*(2<<%2)]
   1383    mov            r9mp, r10
   1384    mov           r11mp, r11
   1385    mov           r12mp, r12
   1386    neg              wq
   1387 
   1388 %%loop_x:
   1389    rorx             r6, seeq, 1
   1390    or             seed, 0xEFF4
   1391    test           seeb, seeh
   1392    lea            seed, [r6+0x8000]
   1393    cmovp          seed, r6d               ; updated seed
   1394 
   1395    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
   1396                offx, offy, see, unused1, unused2, unused3, luma, lstride
   1397 
   1398    rorx          offyd, seed, 8
   1399    rorx          offxq, seeq, 12
   1400    and           offyd, 0xf
   1401    imul          offyd, 164>>%3
   1402    lea           offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))]  ; offy*stride+offx
   1403 
   1404    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
   1405                h, offxy, see, unused1, unused2, unused3, luma, lstride
   1406 
   1407    mov      grain_lutq, grain_lutmp
   1408    mov              hd, hm
   1409 %%loop_y:
   1410    ; luma_src
   1411 %if %2
   1412    mova            xm2, [lumaq+lstrideq*0+ 0]
   1413    vinserti128      m2, [lumaq+lstrideq*0+32], 1
   1414    mova            xm4, [lumaq+lstrideq*0+16]
   1415    vinserti128      m4, [lumaq+lstrideq*0+48], 1
   1416    mova            xm3, [lumaq+lstrideq*(1<<%3)+ 0]
   1417    vinserti128      m3, [lumaq+lstrideq*(1<<%3)+32], 1
   1418    mova            xm5, [lumaq+lstrideq*(1<<%3)+16]
   1419    vinserti128      m5, [lumaq+lstrideq*(1<<%3)+48], 1
   1420    phaddw           m2, m4
   1421    phaddw           m3, m5
   1422    pxor             m4, m4
   1423    pavgw            m2, m4
   1424    pavgw            m3, m4
   1425 %elif %1
   1426    mova             m2, [lumaq+ 0]
   1427    mova             m3, [lumaq+32]
   1428 %endif
   1429 %if %1
   1430    mova             m0, [srcq]
   1431 %if %2
   1432    mova             m1, [srcq+strideq]
   1433 %else
   1434    mova             m1, [srcq+32]
   1435 %endif
   1436    punpckhwd        m4, m2, m0
   1437    punpcklwd        m2, m0
   1438    punpckhwd        m5, m3, m1
   1439    punpcklwd        m3, m1                 ; { luma, chroma }
   1440    REPX {pmaddwd x, m14}, m4, m2, m5, m3
   1441    REPX {paddd   x, m15}, m4, m2, m5, m3
   1442    REPX {psrad   x, 6  }, m4, m2, m5, m3
   1443    packusdw         m2, m4
   1444    packusdw         m3, m5
   1445    pminuw           m2, m10
   1446    pminuw           m3, m10                ; clip_pixel()
   1447 %elif %2
   1448    pand             m2, m10
   1449    pand             m3, m10
   1450 %else
   1451    pand             m2, m10, [lumaq+ 0]
   1452    pand             m3, m10, [lumaq+32]
   1453 %endif
   1454 
   1455    ; scaling[luma_src]
   1456    vpbroadcastd     m7, [pd_m65536]
   1457    pandn            m4, m7, m2
   1458    mova             m6, m7
   1459    vpgatherdd       m5, [scalingq+m4-0], m7
   1460    psrld            m2, 16
   1461    mova             m7, m6
   1462    vpgatherdd       m4, [scalingq+m2-2], m6
   1463    pblendw          m4, m5, 0x55
   1464    pandn            m5, m7, m3
   1465    mova             m6, m7
   1466    vpgatherdd       m2, [scalingq+m5-0], m7
   1467    psrld            m3, 16
   1468    vpgatherdd       m5, [scalingq+m3-2], m6
   1469    pblendw          m5, m2, 0x55
   1470 
   1471    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
   1472    pmaddubsw        m4, m11
   1473    pmaddubsw        m5, m11
   1474    paddw            m4, m4
   1475    paddw            m5, m5
   1476    pmulhrsw         m4, [grain_lutq+offxyq*2]
   1477 %if %2
   1478    pmulhrsw         m5, [grain_lutq+offxyq*2+82*2]
   1479 %else
   1480    pmulhrsw         m5, [grain_lutq+offxyq*2+32]
   1481 %endif
   1482 
   1483    ; dst = clip_pixel(src, noise)
   1484 %if %1
   1485    paddw            m0, m4
   1486    paddw            m1, m5
   1487 %else
   1488    paddw            m0, m4, [srcq]
   1489 %if %2
   1490    paddw            m1, m5, [srcq+strideq]
   1491 %else
   1492    paddw            m1, m5, [srcq+32]
   1493 %endif
   1494 %endif
   1495    pmaxsw           m0, m12
   1496    pmaxsw           m1, m12
   1497    pminsw           m0, m13
   1498    pminsw           m1, m13
   1499    mova         [dstq], m0
   1500 %if %2
   1501    mova [dstq+strideq], m1
   1502    lea            srcq, [srcq+strideq*2]
   1503    lea            dstq, [dstq+strideq*2]
   1504    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
   1505 %else
   1506    mova      [dstq+32], m1
   1507    add            srcq, strideq
   1508    add            dstq, strideq
   1509    add           lumaq, lstrideq
   1510 %endif
   1511    add      grain_lutq, 82*(2<<%2)
   1512 %if %2
   1513    sub              hb, 2
   1514 %else
   1515    dec              hb
   1516 %endif
   1517    jg %%loop_y
   1518    add              wq, 32>>%2
   1519    jge .end
   1520    mov            srcq, r9mp
   1521    mov            dstq, r11mp
   1522    mov           lumaq, r12mp
   1523    lea            srcq, [srcq+wq*2]
   1524    lea            dstq, [dstq+wq*2]
   1525    lea           lumaq, [lumaq+wq*(2<<%2)]
   1526    cmp byte [fg_dataq+FGData.overlap_flag], 0
   1527    je %%loop_x
   1528    cmp       dword r8m, 0 ; sby
   1529    jne %%loop_x_hv_overlap
   1530 
   1531    ; horizontal overlap (without vertical overlap)
   1532 %%loop_x_h_overlap:
   1533    rorx             r6, seeq, 1
   1534    or             seed, 0xEFF4
   1535    test           seeb, seeh
   1536    lea            seed, [r6+0x8000]
   1537    cmovp          seed, r6d               ; updated seed
   1538 
   1539    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
   1540                offx, offy, see, left_offxy, unused1, unused2, luma, lstride
   1541 
   1542    lea     left_offxyd, [offyq+(32>>%2)]         ; previous column's offy*stride+offx
   1543    rorx          offyd, seed, 8
   1544    rorx          offxq, seeq, 12
   1545    and           offyd, 0xf
   1546    imul          offyd, 164>>%3
   1547    lea           offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)]  ; offy*stride+offx
   1548 
   1549    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
   1550                h, offxy, see, left_offxy, unused1, unused2, luma, lstride
   1551 
   1552    mov      grain_lutq, grain_lutmp
   1553    mov              hd, hm
   1554 %%loop_y_h_overlap:
   1555    ; luma_src
   1556 %if %2
   1557    mova            xm2, [lumaq+lstrideq*0+ 0]
   1558    vinserti128      m2, [lumaq+lstrideq*0+32], 1
   1559    mova            xm4, [lumaq+lstrideq*0+16]
   1560    vinserti128      m4, [lumaq+lstrideq*0+48], 1
   1561    mova            xm3, [lumaq+lstrideq*(1<<%3)+ 0]
   1562    vinserti128      m3, [lumaq+lstrideq*(1<<%3)+32], 1
   1563    mova            xm5, [lumaq+lstrideq*(1<<%3)+16]
   1564    vinserti128      m5, [lumaq+lstrideq*(1<<%3)+48], 1
   1565    phaddw           m2, m4
   1566    phaddw           m3, m5
   1567    pxor             m4, m4
   1568    pavgw            m2, m4
   1569    pavgw            m3, m4
   1570 %elif %1
   1571    mova             m2, [lumaq]
   1572    mova             m3, [lumaq+32]
   1573 %endif
   1574 %if %1
   1575    mova             m0, [srcq]
   1576 %if %2
   1577    mova             m1, [srcq+strideq]
   1578 %else
   1579    mova             m1, [srcq+32]
   1580 %endif
   1581    punpckhwd        m4, m2, m0
   1582    punpcklwd        m2, m0
   1583    punpckhwd        m5, m3, m1
   1584    punpcklwd        m3, m1                 ; { luma, chroma }
   1585    REPX {pmaddwd x, m14}, m4, m2, m5, m3
   1586    REPX {paddd   x, m15}, m4, m2, m5, m3
   1587    REPX {psrad   x, 6  }, m4, m2, m5, m3
   1588    packusdw         m2, m4
   1589    packusdw         m3, m5
   1590    pminuw           m2, m10                ; clip_pixel()
   1591    pminuw           m3, m10
   1592 %elif %2
   1593    pand             m2, m10
   1594    pand             m3, m10
   1595 %else
   1596    pand             m2, m10, [lumaq+ 0]
   1597    pand             m3, m10, [lumaq+32]
   1598 %endif
   1599 
   1600    ; scaling[luma_src]
   1601    vpbroadcastd     m7, [pd_m65536]
   1602    pandn            m4, m7, m2
   1603    mova             m6, m7
   1604    vpgatherdd       m5, [scalingq+m4-0], m7
   1605    psrld            m2, 16
   1606    mova             m7, m6
   1607    vpgatherdd       m4, [scalingq+m2-2], m6
   1608    pblendw          m4, m5, 0x55
   1609    pandn            m5, m7, m3
   1610    mova             m6, m7
   1611    vpgatherdd       m2, [scalingq+m5-0], m7
   1612    psrld            m3, 16
   1613    vpgatherdd       m5, [scalingq+m3-2], m6
   1614    pblendw          m5, m2, 0x55
   1615 
   1616    ; grain = grain_lut[offy+y][offx+x]
   1617    movu             m2, [grain_lutq+offxyq*2]
   1618 %if %2
   1619    movu             m3, [grain_lutq+offxyq*2+82*2]
   1620 %else
   1621    movu             m3, [grain_lutq+offxyq*2+32]
   1622 %endif
   1623    movd            xm6, [grain_lutq+left_offxyq*2]
   1624 %if %2
   1625    pinsrw          xm6, [grain_lutq+left_offxyq*2+82*2], 2 ; {left0, left1}
   1626    punpckldq       xm7, xm2, xm3           ; {cur0, cur1}
   1627    punpcklwd       xm6, xm7                ; {left0, cur0, left1, cur1}
   1628 %else
   1629    punpcklwd       xm6, xm2
   1630 %endif
   1631 %if %1
   1632 %if %2
   1633    vpbroadcastq    xm7, [pw_23_22]
   1634 %else
   1635    movq            xm7, [pw_27_17_17_27]
   1636 %endif
   1637    pmaddwd         xm6, xm7
   1638    vpbroadcastd    xm7, [pd_16]
   1639    paddd           xm6, xm7
   1640 %else
   1641    pmaddwd         xm6, xm15
   1642    paddd           xm6, xm14
   1643 %endif
   1644    psrad           xm6, 5
   1645    packssdw        xm6, xm6
   1646    pmaxsw          xm6, xm8
   1647    pminsw          xm6, xm9
   1648    vpblendd         m2, m6, 0x01
   1649 %if %2
   1650    pshuflw         xm6, xm6, q1032
   1651    vpblendd         m3, m6, 0x01
   1652 %endif
   1653 
   1654    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
   1655    pmaddubsw        m4, m11
   1656    pmaddubsw        m5, m11
   1657    paddw            m4, m4
   1658    paddw            m5, m5
   1659    pmulhrsw         m2, m4
   1660    pmulhrsw         m3, m5
   1661 
   1662    ; dst = clip_pixel(src, noise)
   1663 %if %1
   1664    paddw            m0, m2
   1665    paddw            m1, m3
   1666 %else
   1667    paddw            m0, m2, [srcq]
   1668 %if %2
   1669    paddw            m1, m3, [srcq+strideq]
   1670 %else
   1671    paddw            m1, m3, [srcq+32]
   1672 %endif
   1673 %endif
   1674    pmaxsw           m0, m12
   1675    pmaxsw           m1, m12
   1676    pminsw           m0, m13
   1677    pminsw           m1, m13
   1678    mova         [dstq], m0
   1679 %if %2
   1680    mova [dstq+strideq], m1
   1681    lea            srcq, [srcq+strideq*2]
   1682    lea            dstq, [dstq+strideq*2]
   1683    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
   1684 %else
   1685    mova      [dstq+32], m1
   1686    add            srcq, strideq
   1687    add            dstq, strideq
   1688    add           lumaq, r10mp
   1689 %endif
   1690    add      grain_lutq, 82*(2<<%2)
   1691 %if %2
   1692    sub              hb, 2
   1693 %else
   1694    dec              hb
   1695 %endif
   1696    jg %%loop_y_h_overlap
   1697    add              wq, 32>>%2
   1698    jge .end
   1699    mov            srcq, r9mp
   1700    mov            dstq, r11mp
   1701    mov           lumaq, r12mp
   1702    lea            srcq, [srcq+wq*2]
   1703    lea            dstq, [dstq+wq*2]
   1704    lea           lumaq, [lumaq+wq*(2<<%2)]
   1705    cmp       dword r8m, 0 ; sby
   1706    jne %%loop_x_hv_overlap
   1707    jmp %%loop_x_h_overlap
   1708 
   1709 %%vertical_overlap:
   1710    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
   1711                sby, see, unused1, unused2, unused3, lstride
   1712 
   1713    movzx          sbyd, sbyb
   1714    imul           seed, [fg_dataq+FGData.seed], 0x00010001
   1715    imul            r7d, sbyd, 173 * 0x00010001
   1716    imul           sbyd, 37 * 0x01000100
   1717    add             r7d, (105 << 16) | 188
   1718    add            sbyd, (178 << 24) | (141 << 8)
   1719    and             r7d, 0x00ff00ff
   1720    and            sbyd, 0xff00ff00
   1721    xor            seed, r7d
   1722    xor            seed, sbyd               ; (cur_seed << 16) | top_seed
   1723 
   1724    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
   1725                offx, offy, see, unused1, top_offxy, unused2, luma, lstride
   1726 
   1727    mov           lumaq, r9mp
   1728    mov        lstrideq, r10mp
   1729    lea             r10, [srcq+wq*2]
   1730    lea             r11, [dstq+wq*2]
   1731    lea             r12, [lumaq+wq*(2<<%2)]
   1732    mov            r9mp, r10
   1733    mov           r11mp, r11
   1734    mov           r12mp, r12
   1735    neg              wq
   1736 
   1737 %%loop_x_v_overlap:
   1738    ; we assume from the block above that bits 8-15 of r7d are zero'ed
   1739    mov             r6d, seed
   1740    or             seed, 0xeff4eff4
   1741    test           seeb, seeh
   1742    setp            r7b                     ; parity of top_seed
   1743    shr            seed, 16
   1744    shl             r7d, 16
   1745    test           seeb, seeh
   1746    setp            r7b                     ; parity of cur_seed
   1747    or              r6d, 0x00010001
   1748    xor             r7d, r6d
   1749    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
   1750 
   1751    rorx          offyd, seed, 8
   1752    rorx          offxd, seed, 12
   1753    and           offyd, 0xf000f
   1754    and           offxd, 0xf000f
   1755    imul          offyd, 164>>%3
   1756    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
   1757    lea           offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
   1758 
   1759    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
   1760                h, offxy, see, unused1, top_offxy, unused2, luma, lstride
   1761 
   1762    mov      grain_lutq, grain_lutmp
   1763    mov              hd, hm
   1764    movzx    top_offxyd, offxyw
   1765    shr          offxyd, 16
   1766 %if %2 == 0
   1767    lea             r10, [pw_27_17_17_27]
   1768 %endif
   1769 %%loop_y_v_overlap:
   1770    ; luma_src
   1771 %if %2
   1772    mova            xm2, [lumaq+lstrideq*0+ 0]
   1773    vinserti128      m2, [lumaq+lstrideq*0+32], 1
   1774    mova            xm4, [lumaq+lstrideq*0+16]
   1775    vinserti128      m4, [lumaq+lstrideq*0+48], 1
   1776    mova            xm3, [lumaq+lstrideq*(1<<%3)+ 0]
   1777    vinserti128      m3, [lumaq+lstrideq*(1<<%3)+32], 1
   1778    mova            xm5, [lumaq+lstrideq*(1<<%3)+16]
   1779    vinserti128      m5, [lumaq+lstrideq*(1<<%3)+48], 1
   1780    phaddw           m2, m4
   1781    phaddw           m3, m5
   1782    pxor             m4, m4
   1783    pavgw            m2, m4
   1784    pavgw            m3, m4
   1785 %elif %1
   1786    mova             m2, [lumaq]
   1787    mova             m3, [lumaq+32]
   1788 %endif
   1789 %if %1
   1790    mova             m0, [srcq]
   1791 %if %2
   1792    mova             m1, [srcq+strideq]
   1793 %else
   1794    mova             m1, [srcq+32]
   1795 %endif
   1796    punpckhwd        m4, m2, m0
   1797    punpcklwd        m2, m0
   1798    punpckhwd        m5, m3, m1
   1799    punpcklwd        m3, m1                 ; { luma, chroma }
   1800    REPX {pmaddwd x, m14}, m4, m2, m5, m3
   1801    REPX {paddd   x, m15}, m4, m2, m5, m3
   1802    REPX {psrad   x, 6  }, m4, m2, m5, m3
   1803    packusdw         m2, m4
   1804    packusdw         m3, m5
   1805    pminuw           m2, m10                ; clip_pixel()
   1806    pminuw           m3, m10
   1807 %elif %2
   1808    pand             m2, m10
   1809    pand             m3, m10
   1810 %else
   1811    pand             m2, m10, [lumaq+ 0]
   1812    pand             m3, m10, [lumaq+32]
   1813 %endif
   1814 
   1815    ; scaling[luma_src]
   1816    vpbroadcastd     m7, [pd_m65536]
   1817    pandn            m4, m7, m2
   1818    mova             m6, m7
   1819    vpgatherdd       m5, [scalingq+m4-0], m7
   1820    psrld            m2, 16
   1821    mova             m7, m6
   1822    vpgatherdd       m4, [scalingq+m2-2], m6
   1823    pblendw          m4, m5, 0x55
   1824    pandn            m5, m7, m3
   1825    mova             m6, m7
   1826    vpgatherdd       m2, [scalingq+m5-0], m7
   1827    psrld            m3, 16
   1828    vpgatherdd       m5, [scalingq+m3-2], m6
   1829    pblendw          m5, m2, 0x55
   1830 
   1831    ; grain = grain_lut[offy+y][offx+x]
   1832    movu             m6, [grain_lutq+offxyq*2]
   1833    movu             m3, [grain_lutq+top_offxyq*2]
   1834    punpcklwd        m2, m3, m6
   1835    punpckhwd        m3, m6                 ; { top, cur }
   1836 %if %3
   1837    vpbroadcastd     m0, [pw_23_22]
   1838 %elif %2
   1839    vpbroadcastd     m0, [pw_27_17_17_27]
   1840 %else
   1841    vpbroadcastd     m0, [r10]
   1842 %endif
   1843    REPX {pmaddwd x, m0}, m2, m3
   1844 %if %1
   1845    vpbroadcastd     m1, [pd_16]
   1846    REPX  {paddd x, m1}, m2, m3
   1847 %else
   1848    REPX {paddd x, m14}, m2, m3
   1849 %endif
   1850    REPX   {psrad x, 5}, m2, m3
   1851    packssdw         m2, m3
   1852 %if %2
   1853    movu             m3, [grain_lutq+offxyq*2+82*2]
   1854 %else
   1855    movu             m3, [grain_lutq+offxyq*2+32]
   1856 %endif
   1857 %if %3
   1858    pmaxsw           m2, m8
   1859    pminsw           m2, m9
   1860 %else
   1861 %if %2
   1862    movu             m7, [grain_lutq+top_offxyq*2+82*2]
   1863    punpckhwd        m6, m3, m7             ; { cur, top }
   1864    punpcklwd        m3, m7
   1865 %else
   1866    movu             m7, [grain_lutq+top_offxyq*2+32]
   1867    punpckhwd        m6, m7, m3
   1868    punpcklwd        m3, m7, m3             ; { top, cur }
   1869 %endif
   1870    pmaddwd          m6, m0
   1871    pmaddwd          m3, m0
   1872 %if %1
   1873    paddd            m6, m1
   1874    paddd            m3, m1
   1875 %else
   1876    paddd            m6, m14
   1877    paddd            m3, m14
   1878 %endif
   1879    psrad            m6, 5
   1880    psrad            m3, 5
   1881    packssdw         m3, m6
   1882    pmaxsw           m2, m8
   1883    pmaxsw           m3, m8
   1884    pminsw           m2, m9
   1885    pminsw           m3, m9
   1886 %endif
   1887 
   1888    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
   1889    pmaddubsw        m4, m11
   1890    pmaddubsw        m5, m11
   1891    paddw            m4, m4
   1892    paddw            m5, m5
   1893    pmulhrsw         m2, m4
   1894    pmulhrsw         m3, m5
   1895 
   1896    ; dst = clip_pixel(src, noise)
   1897    paddw            m0, m2, [srcq]
   1898 %if %2
   1899    paddw            m1, m3, [srcq+strideq]
   1900 %else
   1901    paddw            m1, m3, [srcq+32]
   1902 %endif
   1903    pmaxsw           m0, m12
   1904    pmaxsw           m1, m12
   1905    pminsw           m0, m13
   1906    pminsw           m1, m13
   1907    mova         [dstq], m0
   1908 %if %2
   1909    mova [dstq+strideq], m1
   1910    sub              hb, 2
   1911 %else
   1912    mova      [dstq+32], m1
   1913    dec              hb
   1914 %endif
   1915    jle %%end_y_v_overlap
   1916 %if %2
   1917    lea            srcq, [srcq+strideq*2]
   1918    lea            dstq, [dstq+strideq*2]
   1919    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
   1920 %else
   1921    add            srcq, strideq
   1922    add            dstq, strideq
   1923    add           lumaq, lstrideq
   1924 %endif
   1925    add      grain_lutq, 82*(2<<%2)
   1926 %if %2
   1927    jmp %%loop_y
   1928 %else
   1929    add              hd, 0x80000000
   1930    jc %%loop_y
   1931    add             r10, 4
   1932    jmp %%loop_y_v_overlap
   1933 %endif
   1934 %%end_y_v_overlap:
   1935    add              wq, 32>>%2
   1936    jge .end
   1937    mov            srcq, r9mp
   1938    mov            dstq, r11mp
   1939    mov           lumaq, r12mp
   1940    lea            srcq, [srcq+wq*2]
   1941    lea            dstq, [dstq+wq*2]
   1942    lea           lumaq, [lumaq+wq*(2<<%2)]
   1943 
   1944    ; since fg_dataq.overlap is guaranteed to be set, we never jump
   1945    ; back to .loop_x_v_overlap, and instead always fall-through to
   1946    ; h+v overlap
   1947 %%loop_x_hv_overlap:
   1948    ; we assume from the block above that bits 8-15 of r7d are zero'ed
   1949    mov             r6d, seed
   1950    or             seed, 0xeff4eff4
   1951    test           seeb, seeh
   1952    setp            r7b                     ; parity of top_seed
   1953    shr            seed, 16
   1954    shl             r7d, 16
   1955    test           seeb, seeh
   1956    setp            r7b                     ; parity of cur_seed
   1957    or              r6d, 0x00010001
   1958    xor             r7d, r6d
   1959    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
   1960 
   1961    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
   1962                offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride
   1963 
   1964 %if %2 == 0
   1965    lea             r14, [pw_27_17_17_27]
   1966 %endif
   1967    lea  topleft_offxyq, [top_offxyq+(32>>%2)]
   1968    lea     left_offxyq, [offyq+(32>>%2)]
   1969    rorx          offyd, seed, 8
   1970    rorx          offxd, seed, 12
   1971    and           offyd, 0xf000f
   1972    and           offxd, 0xf000f
   1973    imul          offyd, 164>>%3
   1974    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
   1975    lea           offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
   1976 
   1977    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
   1978                h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride
   1979 
   1980    mov      grain_lutq, grain_lutmp
   1981    mov              hd, hm
   1982    movzx    top_offxyd, offxyw
   1983    shr          offxyd, 16
   1984 %%loop_y_hv_overlap:
   1985    ; luma_src
   1986 %if %2
   1987    mova            xm2, [lumaq+lstrideq*0+ 0]
   1988    vinserti128      m2, [lumaq+lstrideq*0+32], 1
   1989    mova            xm4, [lumaq+lstrideq*0+16]
   1990    vinserti128      m4, [lumaq+lstrideq*0+48], 1
   1991    mova            xm3, [lumaq+lstrideq*(1<<%3)+ 0]
   1992    vinserti128      m3, [lumaq+lstrideq*(1<<%3)+32], 1
   1993    mova            xm5, [lumaq+lstrideq*(1<<%3)+16]
   1994    vinserti128      m5, [lumaq+lstrideq*(1<<%3)+48], 1
   1995    phaddw           m2, m4
   1996    phaddw           m3, m5
   1997    pxor             m4, m4
   1998    pavgw            m2, m4
   1999    pavgw            m3, m4
   2000 %elif %1
   2001    mova             m2, [lumaq]
   2002    mova             m3, [lumaq+32]
   2003 %endif
   2004 %if %1
   2005    mova             m0, [srcq]
   2006 %if %2
   2007    mova             m1, [srcq+strideq]
   2008 %else
   2009    mova             m1, [srcq+32]
   2010 %endif
   2011    punpckhwd        m4, m2, m0
   2012    punpcklwd        m2, m0
   2013    punpckhwd        m5, m3, m1
   2014    punpcklwd        m3, m1                 ; { luma, chroma }
   2015    REPX {pmaddwd x, m14}, m4, m2, m5, m3
   2016    REPX {paddd   x, m15}, m4, m2, m5, m3
   2017    REPX {psrad   x, 6  }, m4, m2, m5, m3
   2018    packusdw         m2, m4
   2019    packusdw         m3, m5
   2020    pminuw           m2, m10                ; clip_pixel()
   2021    pminuw           m3, m10
   2022 %elif %2
   2023    pand             m2, m10
   2024    pand             m3, m10
   2025 %else
   2026    pand             m2, m10, [lumaq+ 0]
   2027    pand             m3, m10, [lumaq+32]
   2028 %endif
   2029 
   2030    ; scaling[luma_src]
   2031    vpbroadcastd     m7, [pd_m65536]
   2032    pandn            m4, m7, m2
   2033    mova             m6, m7
   2034    vpgatherdd       m5, [scalingq+m4-0], m7
   2035    psrld            m2, 16
   2036    mova             m7, m6
   2037    vpgatherdd       m4, [scalingq+m2-2], m6
   2038    pblendw          m4, m5, 0x55
   2039    pandn            m5, m7, m3
   2040    mova             m6, m7
   2041    vpgatherdd       m2, [scalingq+m5-0], m7
   2042    psrld            m3, 16
   2043    vpgatherdd       m5, [scalingq+m3-2], m6
   2044    pblendw          m5, m2, 0x55
   2045 
   2046    ; grain = grain_lut[offy+y][offx+x]
   2047    movu             m0, [grain_lutq+offxyq*2]
   2048    movd            xm2, [grain_lutq+left_offxyq*2]
   2049    movu             m6, [grain_lutq+top_offxyq*2]
   2050 %if %2
   2051    pinsrw          xm2, [grain_lutq+left_offxyq*2+82*2], 2
   2052    movu             m3, [grain_lutq+offxyq*2+82*2]
   2053    punpckldq       xm1, xm0, xm3           ; { cur0, cur1 }
   2054 %if %3
   2055    vinserti128      m2, [grain_lutq+topleft_offxyq*2], 1 ; { left0, left1, top/left }
   2056    vinserti128      m1, [grain_lutq+top_offxyq*2], 1     ; { cur0, cur1, top0 }
   2057 %else
   2058    vinserti128      m2, [grain_lutq+topleft_offxyq*2+82*2], 1
   2059    vpbroadcastd     m7, [grain_lutq+topleft_offxyq*2]
   2060    vpblendd         m2, m7, 0x20
   2061    movd            xm7, [grain_lutq+top_offxyq*2+82*2]
   2062    punpckldq       xm7, xm6
   2063    vinserti128      m1, xm7, 1
   2064    movu             m7, [grain_lutq+top_offxyq*2+82*2]
   2065 %endif
   2066    punpcklwd        m2, m1                 ; { cur, left }
   2067 %if %1
   2068    vpbroadcastq     m1, [pw_23_22]
   2069    pmaddwd          m2, m1
   2070    vpbroadcastd     m1, [pd_16]
   2071    paddd            m2, m1
   2072    psrad            m2, 5
   2073    packssdw         m2, m2
   2074    vpermq           m2, m2, q3120
   2075 %else
   2076    pmaddwd          m2, m15
   2077    paddd            m2, m14
   2078    psrad            m2, 5
   2079    vextracti128    xm1, m2, 1
   2080    packssdw        xm2, xm1
   2081 %endif
   2082 %else
   2083    pinsrd          xm2, [grain_lutq+topleft_offxyq*2], 1
   2084    movu             m3, [grain_lutq+offxyq*2+32]
   2085    movu             m7, [grain_lutq+top_offxyq*2+32]
   2086    punpckldq       xm1, xm0, xm6
   2087    punpcklwd       xm2, xm1                ; { cur, left }
   2088 %if %1
   2089    movddup         xm1, [pw_27_17_17_27]
   2090    pmaddwd         xm2, xm1
   2091    vpbroadcastd     m1, [pd_16]
   2092    paddd           xm2, xm1
   2093 %else
   2094    pmaddwd         xm2, xm15
   2095    paddd           xm2, xm14
   2096 %endif
   2097    psrad           xm2, 5
   2098    packssdw        xm2, xm2
   2099 %endif
   2100    pmaxsw          xm2, xm8
   2101    pminsw          xm2, xm9
   2102    vpblendd         m0, m2, 0x01
   2103 %if %2
   2104    pshufd          xm2, xm2, q0321
   2105    vpblendd         m3, m2, 0x01
   2106 %if %3 == 0
   2107    pshufd          xm2, xm2, q0321
   2108    vpblendd         m7, m2, 0x01
   2109 %endif
   2110 %endif
   2111    pshuflw         xm2, xm2, q1032
   2112    vpblendd         m2, m6, 0xfe
   2113    punpckhwd        m6, m0                 ; { top, cur }
   2114    punpcklwd        m2, m0
   2115 %if %3
   2116    vpbroadcastd     m0, [pw_23_22]
   2117 %elif %2
   2118    vpbroadcastd     m0, [pw_27_17_17_27]
   2119 %else
   2120    vpbroadcastd     m0, [r14]
   2121 %endif
   2122    pmaddwd          m6, m0
   2123    pmaddwd          m2, m0
   2124 %if %1
   2125    paddd            m6, m1
   2126    paddd            m2, m1
   2127 %else
   2128    paddd            m6, m14
   2129    paddd            m2, m14
   2130 %endif
   2131    psrad            m6, 5
   2132    psrad            m2, 5
   2133    packssdw         m2, m6
   2134 
   2135 %if %3
   2136    pmaxsw           m2, m8
   2137    pminsw           m2, m9
   2138 %else
   2139 %if %2
   2140    punpckhwd        m6, m3, m7
   2141    punpcklwd        m3, m7                 ; { cur, top }
   2142 %else
   2143    punpckhwd        m6, m7, m3
   2144    punpcklwd        m3, m7, m3             ; { top, cur }
   2145 %endif
   2146    REPX {pmaddwd x, m0}, m6, m3
   2147 %if %1
   2148    REPX  {paddd x, m1}, m6, m3
   2149 %else
   2150    REPX {paddd x, m14}, m6, m3
   2151 %endif
   2152    REPX   {psrad x, 5}, m6, m3
   2153    packssdw         m3, m6
   2154    pmaxsw           m2, m8
   2155    pmaxsw           m3, m8
   2156    pminsw           m2, m9
   2157    pminsw           m3, m9
   2158 %endif
   2159 
   2160    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
   2161    pmaddubsw        m4, m11
   2162    pmaddubsw        m5, m11
   2163    paddw            m4, m4
   2164    paddw            m5, m5
   2165    pmulhrsw         m2, m4
   2166    pmulhrsw         m3, m5
   2167 
   2168    ; dst = clip_pixel(src, noise)
   2169    paddw            m0, m2, [srcq]
   2170 %if %2
   2171    paddw            m1, m3, [srcq+strideq]
   2172 %else
   2173    paddw            m1, m3, [srcq+32]
   2174 %endif
   2175    pmaxsw           m0, m12
   2176    pmaxsw           m1, m12
   2177    pminsw           m0, m13
   2178    pminsw           m1, m13
   2179    mova         [dstq], m0
   2180 %if %2
   2181    mova [dstq+strideq], m1
   2182    lea            srcq, [srcq+strideq*2]
   2183    lea            dstq, [dstq+strideq*2]
   2184    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
   2185 %else
   2186    mova      [dstq+32], m1
   2187    add            srcq, strideq
   2188    add            dstq, strideq
   2189    add           lumaq, r10mp
   2190 %endif
   2191    add      grain_lutq, 82*(2<<%2)
   2192 %if %2
   2193    sub              hb, 2
   2194    jg %%loop_y_h_overlap
   2195 %else
   2196    dec              hb
   2197    jle %%end_y_hv_overlap
   2198    add              hd, 0x80000000
   2199    jc %%loop_y_h_overlap
   2200    add             r14, 4
   2201    jmp %%loop_y_hv_overlap
   2202 %endif
   2203 %%end_y_hv_overlap:
   2204    add              wq, 32>>%2
   2205    jge .end
   2206    mov            srcq, r9mp
   2207    mov            dstq, r11mp
   2208    mov           lumaq, r12mp
   2209    lea            srcq, [srcq+wq*2]
   2210    lea            dstq, [dstq+wq*2]
   2211    lea           lumaq, [lumaq+wq*(2<<%2)]
   2212    jmp %%loop_x_hv_overlap
   2213 %endmacro
   2214 
   2215    %%FGUV_32x32xN_LOOP 1, %2, %3
   2216 .csfl:
   2217    %%FGUV_32x32xN_LOOP 0, %2, %3
   2218 .end:
   2219    RET
   2220 %endmacro
   2221 
; Instantiate the chroma grain-table generation (GEN_GRAIN_UV_FN) and
; chroma film-grain application (FGUV_FN) entry points, one pair per
; chroma subsampling layout. Arguments are (layout, ss_x, ss_y): the
; second parameter selects horizontal subsampling (used as 32>>%2,
; 2<<%2 in the loop bodies above) and the third vertical subsampling
; (used as 1<<%3, 164>>%3).
GEN_GRAIN_UV_FN 420, 1, 1               ; 4:2:0 — subsampled in both dimensions
FGUV_FN 420,         1, 1
GEN_GRAIN_UV_FN 422, 1, 0               ; 4:2:2 — horizontally subsampled only
FGUV_FN 422,         1, 0
GEN_GRAIN_UV_FN 444, 0, 0               ; 4:4:4 — no chroma subsampling
FGUV_FN 444,         0, 0

%endif ; ARCH_X86_64