tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

filmgrain_avx512.asm (28704B)


      1 ; Copyright © 2022, VideoLAN and dav1d authors
      2 ; Copyright © 2022, Two Orioles, LLC
      3 ; All rights reserved.
      4 ;
      5 ; Redistribution and use in source and binary forms, with or without
      6 ; modification, are permitted provided that the following conditions are met:
      7 ;
      8 ; 1. Redistributions of source code must retain the above copyright notice, this
      9 ;    list of conditions and the following disclaimer.
     10 ;
     11 ; 2. Redistributions in binary form must reproduce the above copyright notice,
     12 ;    this list of conditions and the following disclaimer in the documentation
     13 ;    and/or other materials provided with the distribution.
     14 ;
     15 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     16 ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     17 ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     18 ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     19 ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     20 ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     21 ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     22 ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     24 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25 
     26 %include "config.asm"
     27 %include "ext/x86/x86inc.asm"
     28 %include "x86/filmgrain_common.asm"
     29 
     30 %if ARCH_X86_64
     31 
     32 SECTION_RODATA 64
     33 
        ; pb_even / pb_odd: byte indices 0,2,...,126 and 1,3,...,127. The chroma
        ; functions load them into m11/m12 and use them as vpermi2b/vpermt2b
        ; selectors over a pair of luma vectors, so that pavgb can average every
        ; even/odd byte pair when ss_hor is set.
     34 pb_even:       db  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
     35               db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
     36               db 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94
     37               db 96, 98,100,102,104,106,108,110,112,114,116,118,120,122,124,126
     38 pb_odd:        db  1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
     39               db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63
     40               db 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95
     41               db 97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127
        ; pshufb control (broadcast to every 128-bit lane): interleaves bytes 8..15
        ; with bytes 0..7 so that each 16-bit word holds { byte[i+8], byte[i] }.
     42 interleave_hl: db  8,  0,  9,  1, 10,  2, 11,  3, 12,  4, 13,  5, 14,  6, 15,  7
        ; Horizontal overlap weights, broadcast as a qword into m10 and applied
        ; with pmaddubsw to { left, cur } byte pairs. The { 0, 32 } pairs pass
        ; the current sample through unchanged after the rounded shift by 5.
        ; pb_27_17_17_27 is used without horizontal subsampling, pb_23_22_0_32
        ; with it.
     43 pb_27_17_17_27:        db 27, 17, 17, 27,  0, 32,  0, 32
     44 pb_23_22_0_32:         db 23, 22,  0, 32,  0, 32,  0, 32
        ; NOTE: the chroma code does a 32-byte broadcast starting at pb_27_17 and
        ; then picks dword 0 of each 128-bit lane, expecting 27,17 at byte offset 0
        ; and 17,27 at byte offset 16. The order and sizes of the five entries
        ; from pb_27_17 through pb_17_27 must therefore not change.
     45 pb_27_17:      times 2 db 27, 17
     46 pb_23_22:      times 2 db 23, 22
        ; pw_8 is used as a pshufb control (bytes 8,0,8,0), not as an arithmetic
        ; constant.
     47 pw_8:          times 2 dw 8
        ; pmulhrsw by 1024 is a rounded arithmetic shift right by 5.
     48 pw_1024:       times 2 dw 1024
     49 pb_17_27:      times 2 db 17, 27
        ; Upper clamp values, 4 bytes apiece. The luma function indexes with
        ; clip*8 (255 or 235); the chroma functions index with (clip << is_id)*4
        ; (255, 240 or 235).
     50 fg_max:        times 4 db 255
     51               times 4 db 240
     52               times 4 db 235
        ; Lower clamp values, indexed with clip*4. fg_min is also the anchor of
        ; the `base` addressing macro.
     53 fg_min:        times 4 db 0
     54               times 4 db 16
        ; pmulhrsw multipliers indexed with scaling_shift*4-32, i.e. the first
        ; entry corresponds to scaling_shift == 8. Each entry halves the previous
        ; one, so entry k performs a rounded shift right by 8+k.
     55 noise_rnd:     times 2 dw 128
     56               times 2 dw 64
     57               times 2 dw 32
     58               times 2 dw 16
     59 
     60 SECTION .text
     61 
     62 INIT_ZMM avx512icl
        ; fgy_32x32xn_8bpc: applies film grain to 8-bit luma. The picture is
        ; processed in columns of 32 pixels; each helper call handles two rows,
        ; with row N in the low 256 bits of a zmm register and row N+1 in the
        ; high 256 bits.
        ;
        ; Arguments (cglobal order): dst, src, stride, fg_data, w, scaling,
        ; grain_lut, h, sby. The first six are loaded into registers by cglobal;
        ; grain_lut, h and sby are fetched later through grain_lutmp/hm/sbym.
        ; `see` and `overlap` are extra named registers, not arguments.
        ;
        ; Register roles established by the prologue:
        ;   m0-m3  256-byte scaling table (4 x 64 bytes)
        ;   m4     interleave_hl shuffle control
        ;   m5     zero
        ;   m6     noise rounding multiplier, selected by scaling_shift
        ;   m7/m8  lower/upper clamp, selected by clip_to_restricted_range
        ;   m9     pw_1024 (rounded shift right by 5 via pmulhrsw)
        ;   m10    horizontal overlap weights
        ;   m12    vertical overlap weights for { top, cur } pairs:
        ;          27,17 for the first row and 17,27 for the second row
        ;   k1     used as a byte mask it selects bytes 0-3 and 32-35, i.e. the
        ;          first four pixels of each of the two rows
        ;
        ; dst is converted to (dst - src) so both pointers advance through srcq.
     63 cglobal fgy_32x32xn_8bpc, 6, 13, 22, dst, src, stride, fg_data, w, scaling, \
     64                                     grain_lut, h, sby, see, overlap
        ; [base+sym] resolves to [r11 + (sym - fg_min)]
     65 %define base r11-fg_min
     66    lea             r11, [fg_min]
           ; r6/r7 are the grain_lut/h register slots; those arguments are not
           ; loaded yet, so the registers are free to use as scratch here
     67    mov             r6d, [fg_dataq+FGData.scaling_shift]
     68    mov             r7d, [fg_dataq+FGData.clip_to_restricted_range]
     69    mov            sbyd, sbym
     70    mov        overlapd, [fg_dataq+FGData.overlap_flag]
     71    mov             r12, 0x0000000f0000000f ; h_overlap mask
     72    mova             m0, [scalingq+64*0]
     73    mova             m1, [scalingq+64*1]
     74    mova             m2, [scalingq+64*2]
     75    mova             m3, [scalingq+64*3]
     76    kmovq            k1, r12
     77    vbroadcasti32x4  m4, [base+interleave_hl]
     78    vpbroadcastd   ym16, [base+pb_27_17]
     79    vpbroadcastd    m12, [base+pb_17_27]
           ; the -32 rebases the index so that scaling_shift == 8 selects entry 0
     80    vpbroadcastd     m6, [base+noise_rnd+r6*4-32]
     81    test           sbyd, sbyd
     82    setnz           r6b                       ; r6b = (sby != 0)
     83    vpbroadcastd     m7, [base+fg_min+r7*4]
           ; stride of 8 skips the 240 entry: r7 == 1 selects 235
     84    vpbroadcastd     m8, [base+fg_max+r7*8]
     85    pxor             m5, m5
     86    vpbroadcastd     m9, [base+pw_1024]
     87    vpbroadcastq    m10, [base+pb_27_17_17_27]
           ; with qword granularity only mask bits 0-7 (= 0x0f) matter: the low
           ; 256 bits (first row) are overwritten with 27,17
     88    vmovdqa64   m12{k1}, m16
           ; vertical overlap is needed when sby != 0 and bit 0 of overlap_flag is set
     89    test            r6b, overlapb
     90    jnz .v_overlap
     91 
           ; seed = fg_data.seed ^ ((((sby*37 + 178) & 0xff) << 8) |
           ;                         ((sby*173 + 105) & 0xff))
           ; Both products are formed by a single imul (bits 0-23 and 24-31);
           ; rorx by 24 then swaps them into place and movzx keeps 16 bits.
     92    imul           seed, sbyd, (173 << 24) | 37
     93    add            seed, (105 << 24) | 178
     94    rorx           seed, seed, 24
     95    movzx          seed, seew
     96    xor            seed, [fg_dataq+FGData.seed]
     97 
     98    DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
     99                h, sby, see, overlap
    100 
           ; src_bak = one past the end of the row; w counts up from -w to 0
    101    lea        src_bakq, [srcq+wq]
    102    neg              wq
    103    sub            dstq, srcq
        ; --- column without any overlap ---
    104 .loop_x:
           ; LFSR step: `or` forces every bit except 0, 1, 3 and 12 to one, so
           ; the parity flag of (low byte & high byte) reflects the XOR of those
           ; four seed bits. Even parity keeps seed >> 1, odd parity also sets
           ; bit 15.
    105    rorx             r6, seeq, 1
    106    or             seed, 0xeff4
    107    test           seeb, seeh
    108    lea            seed, [r6+0x8000]
    109    cmovp          seed, r6d                 ; updated seed
           ; offy = bits 8-11, offx = bits 12-15 of the seed. The 64-bit rorx
           ; moves the seed's low 12 bits to the top of offxq, where the 32-bit
           ; lea discards them (assumes the seed fits in 16 bits).
    110    rorx          offyd, seed, 8
    111    rorx          offxq, seeq, 12
    112    and           offyd, 0xf
    113    imul          offyd, 164
           ; 82 is the grain_lut row pitch; 829 == 10*82 + 9, and the loads below
           ; read rows at offxy-82 and offxy
    114    lea           offxd, [offyq+offxq*2+829] ; offy*stride+offx
    115 
    116    DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
    117                h, sby, see, overlap
    118 
    119    mov      grain_lutq, grain_lutmp
    120    mov              hd, hm
    121 .loop_y:
           ; two grain rows of 32 bytes -> m21
    122    movu           ym21, [grain_lutq+offxyq-82]
    123    vinserti32x8    m21, [grain_lutq+offxyq+ 0], 1
    124    call .add_noise
    125    sub              hb, 2
    126    jg .loop_y
    127    add              wq, 32
    128    jge .end
    129    lea            srcq, [src_bakq+wq]
    130    test       overlapd, overlapd
    131    jz .loop_x
    132    test           sbyd, sbyd
    133    jnz .hv_overlap
    134 
        ; --- column with horizontal overlap only ---
    135 .loop_x_h_overlap:
    136    rorx             r6, seeq, 1
    137    or             seed, 0xeff4
    138    test           seeb, seeh
    139    lea            seed, [r6+0x8000]
    140    cmovp          seed, r6d                 ; updated seed
    141 
    142    DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
    143                h, sby, see, left_offxy
    144 
    145    rorx          offyd, seed, 8
    146    mov     left_offxyd, offxd               ; previous column's offy*stride+offx
    147    rorx          offxq, seeq, 12
    148    and           offyd, 0xf
    149    imul          offyd, 164
    150    lea           offxd, [offyq+offxq*2+829] ; offy*stride+offx
    151 
    152    DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
    153                h, sby, see, left_offxy
    154 
    155    mov      grain_lutq, grain_lutmp
    156    mov              hd, hm
    157 .loop_y_h_overlap:
    158    movu           ym20, [grain_lutq+offxyq-82]
    159    vinserti32x8    m20, [grain_lutq+offxyq+ 0], 1
           ; left neighbour: 4 bytes starting at column 32 of the previous
           ; block's two rows (-50 == -82 + 32); second row goes to lane 2 so it
           ; lines up with the second row of m20
    160    movd           xm19, [grain_lutq+left_offxyq-50]
    161    vinserti32x4    m19, [grain_lutq+left_offxyq+32], 2
    162    punpcklbw       m19, m20                 ; { left, cur } byte pairs
    163    pmaddubsw       m19, m10, m19
    164    pmulhrsw        m19, m9
           ; m21/m20 are produced directly in the layout .add_noise_h expects
    165    punpckhbw       m21, m20, m5
    166    packsswb    m20{k1}, m19, m19            ; replace first 4 pixels of each row
    167    punpcklbw       m20, m5, m20
    168    call .add_noise_h
    169    sub              hb, 2
    170    jg .loop_y_h_overlap
    171    add              wq, 32
    172    jge .end
    173    lea            srcq, [src_bakq+wq]
    174    test           sbyd, sbyd
    175    jnz .hv_overlap
    176    jmp .loop_x_h_overlap
    177 
        ; --- first column when vertical overlap is required ---
    178 .v_overlap:
    179    DEFINE_ARGS dst, src, stride, fg_data, w, offy, offx, \
    180                h, sby, see, overlap
    181 
           ; Build two 16-bit seeds in one register: the high half for the
           ; current row of blocks (sby), the low half for the one above. The
           ; low-half constants are the sby-1 equivalents modulo 256:
           ; 188 == 105 - 173 + 256 and 141 == 178 - 37.
    182    movzx           r6d, sbyb
    183    imul           seed, [fg_dataq+FGData.seed], 0x00010001
    184    imul            r7d, r6d, 173 * 0x00010001
    185    imul            r6d, 37 * 0x01000100
    186    add             r7d, (105 << 16) | 188
    187    add             r6d, (178 << 24) | (141 << 8)
    188    and             r7d, 0x00ff00ff
    189    and             r6d, 0xff00ff00
    190    xor            seed, r7d
    191    xor            seed, r6d     ; (cur_seed << 16) | top_seed
    192 
    193    DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
    194                h, sby, see, overlap
    195 
    196    lea        src_bakq, [srcq+wq]
    197    neg              wq
    198    sub            dstq, srcq
    199 
           ; same LFSR step as in .loop_x, applied to both 16-bit halves: the two
           ; parity bits are collected in r7d (bit 16 = top, bit 0 = cur),
           ; inverted through the xor with the forced-one bits of r6d, and
           ; rotated into bit 15 of each half
    200    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    201    mov             r6d, seed
    202    or             seed, 0xeff4eff4
    203    test           seeb, seeh
    204    setp            r7b          ; parity of top_seed
    205    shr            seed, 16
    206    shl             r7d, 16
    207    test           seeb, seeh
    208    setp            r7b          ; parity of cur_seed
    209    or              r6d, 0x00010001
    210    xor             r7d, r6d
    211    rorx           seed, r7d, 1  ; updated (cur_seed << 16) | top_seed
    212    rorx          offyd, seed, 8
    213    rorx          offxd, seed, 12
    214    and           offyd, 0xf000f
    215    and           offxd, 0xf000f
    216    imul          offyd, 164
           ; the extra 32*82 lands in the low (top) half only: the top block is
           ; read starting 32 grain rows further down
    217    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    218    lea           offxd, [offyq+offxq*2+0x10001*829+32*82]
    219 
    220    DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
    221                h, sby, see, overlap, top_offxy
    222 
    223    mov      grain_lutq, grain_lutmp
    224    mov              hd, hm
    225    movzx    top_offxyd, offxyw
    226    shr          offxyd, 16
    227    movu           ym19, [grain_lutq+offxyq-82]
    228    vinserti32x8    m19, [grain_lutq+offxyq+ 0], 1
    229    movu           ym21, [grain_lutq+top_offxyq-82]
    230    vinserti32x8    m21, [grain_lutq+top_offxyq+ 0], 1
           ; { top, cur } byte pairs, high and low halves of each lane
    231    punpckhbw       m20, m21, m19
    232    punpcklbw       m21, m19
    233    call .add_noise_v
           ; only the first two rows are blended vertically; the remaining rows
           ; of this column continue in the plain loop
    234    sub              hb, 2
    235    jg .loop_y
    236    add              wq, 32
    237    jge .end
    238    lea            srcq, [src_bakq+wq]
    239 
    240    ; since fg_dataq.overlap is guaranteed to be set, we never jump back
    241    ; to .v_overlap, and instead always fall-through to h+v overlap
        ; --- column with both horizontal and vertical overlap ---
    242 .hv_overlap:
    243    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    244    mov             r6d, seed
    245    or             seed, 0xeff4eff4
    246    test           seeb, seeh
    247    setp            r7b          ; parity of top_seed
    248    shr            seed, 16
    249    shl             r7d, 16
    250    test           seeb, seeh
    251    setp            r7b          ; parity of cur_seed
    252    or              r6d, 0x00010001
    253    xor             r7d, r6d
    254    rorx           seed, r7d, 1  ; updated (cur_seed << 16) | top_seed
    255 
    256    DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
    257                h, sby, see, left_offxy, top_offxy, topleft_offxy
    258 
           ; remember the previous column's offsets before computing new ones
    259    mov  topleft_offxyd, top_offxyd
    260    rorx          offyd, seed, 8
    261    mov     left_offxyd, offxd
    262    rorx          offxd, seed, 12
    263    and           offyd, 0xf000f
    264    and           offxd, 0xf000f
    265    imul          offyd, 164
    266    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    267    lea           offxd, [offyq+offxq*2+0x10001*829+32*82]
    268 
    269    DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
    270                h, sby, see, left_offxy, top_offxy, topleft_offxy
    271 
    272    mov      grain_lutq, grain_lutmp
    273    mov              hd, hm
    274    movzx    top_offxyd, offxyw
    275    shr          offxyd, 16
    276    movu           ym19, [grain_lutq+offxyq-82]
    277    vinserti32x8    m19, [grain_lutq+offxyq+ 0], 1
    278    movd           xm16, [grain_lutq+left_offxyq-50]
    279    vinserti32x4    m16, [grain_lutq+left_offxyq+32], 2
    280    movu           ym21, [grain_lutq+top_offxyq-82]
    281    vinserti32x8    m21, [grain_lutq+top_offxyq+ 0], 1
    282    movd           xm17, [grain_lutq+topleft_offxyq-50]
    283    vinserti32x4    m17, [grain_lutq+topleft_offxyq+32], 2
    284    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    285    punpcklbw       m16, m19
    286    pmaddubsw       m16, m10, m16
    287    punpcklbw       m17, m21
    288    pmaddubsw       m17, m10, m17
           ; high halves are not touched by the h blend, so they can be paired
           ; before the masked writes below
    289    punpckhbw       m20, m21, m19
    290    pmulhrsw        m16, m9
    291    pmulhrsw        m17, m9
    292    packsswb    m19{k1}, m16, m16
    293    packsswb    m21{k1}, m17, m17
    294    ; followed by v interpolation (top | cur -> cur)
    295    punpcklbw       m21, m19
    296    call .add_noise_v
           ; remaining rows of this column need horizontal overlap only
    297    sub              hb, 2
    298    jg .loop_y_h_overlap
    299    add              wq, 32
    300    lea            srcq, [src_bakq+wq]       ; lea leaves the flags of `add` intact
    301    jl .hv_overlap
    302 .end:
    303    RET
    304 ALIGN function_align
        ; .add_noise_v
        ;   in:  m21/m20 = { top, cur } grain byte pairs (low/high half of each lane)
        ;   Blends them with the vertical weights in m12, rounds, repacks the
        ;   result to 64 signed grain bytes in m21 and falls through.
    305 .add_noise_v:
    306    pmaddubsw       m20, m12, m20
    307    pmaddubsw       m21, m12, m21
    308    pmulhrsw        m20, m9
    309    pmulhrsw        m21, m9
    310    packsswb        m21, m20
        ; .add_noise
        ;   in:  m21 = 64 grain bytes (two rows)
        ;   Expands them to m20 = { 0, grain } pairs for bytes 0-7 of each lane
        ;   and m21 = { grain, 0 } pairs for bytes 8-15, then falls through.
    311 .add_noise:
    312    punpcklbw       m20, m5, m21
    313    punpckhbw       m21, m5
        ; .add_noise_h
        ;   in:  m20/m21 = grain pairs laid out as described for .add_noise
        ;   Loads two source rows, looks up scaling[src], multiplies by the
        ;   grain, applies the rounded shift in m6, adds the result to the
        ;   source, clamps to [m7, m8] and stores two rows.
        ;   Advances srcq by two rows and grain_lutq by two grain rows.
    314 .add_noise_h:
    315    mova           ym18, [srcq+strideq*0]
    316    vinserti32x8    m18, [srcq+strideq*1], 1
    317    mova            m19, m0
    318    punpcklbw       m16, m18, m5             ; source widened to words
    319    vpermt2b        m19, m18, m1 ; scaling[  0..127]
    320    vpmovb2m         k2, m18                  ; k2 = source bytes with bit 7 set
    321    punpckhbw       m17, m18, m5
    322    vpermi2b        m18, m2, m3  ; scaling[128..255]
    323    vmovdqu8    m19{k2}, m18     ; scaling[src]
           ; pair scaling[i+8] with scaling[i] so that each word lines up with
           ; the zero/grain pairs in m20 and m21
    324    pshufb          m19, m4
    325    pmaddubsw       m18, m19, m20
    326    pmaddubsw       m19, m21
    327    add      grain_lutq, 82*2
    328    pmulhrsw        m18, m6      ; noise
    329    pmulhrsw        m19, m6
    330    paddw           m16, m18
    331    paddw           m17, m19
    332    packuswb        m16, m17
    333    pmaxub          m16, m7
    334    pminub          m16, m8
           ; dstq holds (dst - src), so [dstq+srcq] is the destination row
    335    mova    [dstq+srcq], ym16
    336    add            srcq, strideq
    337    vextracti32x8 [dstq+srcq], m16, 1
    338    add            srcq, strideq
    339    ret
    340 
    341 %macro FGUV_FN 3 ; name, ss_hor, ss_ver
        ; Emits fguv_32x32xn_i<name>_8bpc, which applies film grain to one 8-bit
        ; chroma plane. %2/%3 are the horizontal/vertical subsampling flags.
        ;
        ; Every helper call handles 64 chroma pixels in one zmm register:
        ; 2 rows of 32 pixels without horizontal subsampling, 4 rows of 16
        ; pixels with it.
        ;
        ; The loop body is expanded twice from %%FGUV_32x32xN_LOOP: first for the
        ; path that derives the scaling index from luma and chroma via
        ; uv_luma_mult/uv_mult/uv_offset, then at .csfl for
        ; chroma_scaling_from_luma, which indexes with luma alone.
        ;
        ; Register roles established by the prologue (m0-m9 as in the luma
        ; function):
        ;   m10      horizontal overlap weights (depend on ss_hor)
        ;   m11/m12  pb_even/pb_odd luma selectors
        ;   k1       byte mask: first four pixels of every row held in a zmm
        ;   r5       luma stride (replaces the scaling pointer once m0-m3 are loaded)
        ;   r7b      sby != 0
        ;   stride3  3*stride, only set up (and only allocated) when ss_hor
    342 cglobal fguv_32x32xn_i%1_8bpc, 6, 14+%2, 22, dst, src, stride, fg_data, w, \
    343                                             scaling, grain_lut, h, sby, luma, \
    344                                             overlap, uv_pl, is_id, _, stride3
    345    lea             r11, [fg_min]
    346    mov             r6d, [fg_dataq+FGData.scaling_shift]
    347    mov             r7d, [fg_dataq+FGData.clip_to_restricted_range]
    348    mov             r9d, is_idm
    349    mov            sbyd, sbym
    350    mov        overlapd, [fg_dataq+FGData.overlap_flag]
    351 %if %2
    352    mov             r12, 0x000f000f000f000f ; h_overlap mask
    353    vpbroadcastq    m10, [base+pb_23_22_0_32]
    354    lea        stride3q, [strideq*3]
    355 %else
    356    mov             r12, 0x0000000f0000000f
    357    vpbroadcastq    m10, [base+pb_27_17_17_27]
    358 %endif
    359    mova             m0, [scalingq+64*0]
    360    mova             m1, [scalingq+64*1]
    361    mova             m2, [scalingq+64*2]
    362    mova             m3, [scalingq+64*3]
    363    kmovq            k1, r12
    364    vbroadcasti32x4  m4, [base+interleave_hl]
    365    vpbroadcastd     m6, [base+noise_rnd+r6*4-32]
    366    vpbroadcastd     m7, [base+fg_min+r7*4]
           ; r7 = clip << is_id selects 255, 240 or 235 from fg_max
    367    shlx            r7d, r7d, r9d
    368    vpbroadcastd     m8, [base+fg_max+r7*4]
    369    test           sbyd, sbyd
    370    setnz           r7b
    371    vpbroadcastd     m9, [base+pw_1024]
    372    mova            m11, [base+pb_even]
    373    mova            m12, [base+pb_odd]
    374    pxor             m5, m5
           ; r10mp is the stack slot of the 11th argument; the r10 register
           ; itself is named `overlap` and holds overlap_flag
    375    mov              r5, r10mp      ; lstride
    376    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
    377    jne .csfl
    378 
    379 %macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
    380    DEFINE_ARGS dst, src, stride, fg_data, w, lstride, grain_lut, \
    381                h, sby, see, overlap, uv_pl, _, _, stride3
    382 %if %1
           ; m14 = { uv_luma_mult, uv_mult } byte pairs, m15 = uv_offset words.
           ; pshufb with pw_8 picks bytes 8 and 0 of the 16 bytes loaded at
           ; uv_mult[uv_pl] (presumably uv_luma_mult sits 8 bytes after uv_mult
           ; in FGData - confirm against the struct definition).
    383    mov             r6d, uv_plm
    384    vpbroadcastd    m16, [base+pw_8]
    385    vbroadcasti32x4 m14, [fg_dataq+FGData.uv_mult+r6*4]
    386    vpbroadcastw    m15, [fg_dataq+FGData.uv_offset+r6*4]
    387    pshufb          m14, m16     ; uv_luma_mult, uv_mult
    388 %endif
    389    test            r7b, overlapb
    390    jnz %%v_overlap
    391 
           ; same per-row seed derivation as in the luma function
    392    imul           seed, sbyd, (173 << 24) | 37
    393    add            seed, (105 << 24) | 178
    394    rorx           seed, seed, 24
    395    movzx          seed, seew
    396    xor            seed, [fg_dataq+FGData.seed]
    397 
    398    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
    399                offx, offy, see, overlap, _, _, _, stride3
    400 
           ; NOTE: r11 is overwritten here, so `base` is no longer usable below;
           ; every constant load has to happen before this point.
           ; The end-of-row pointers for src and dst are spilled into the stack
           ; slots of the 12th and 13th arguments, which have been consumed by
           ; now; the luma one stays in r13.
    401    mov           lumaq, r9mp
    402    lea             r11, [srcq+wq]
    403    lea             r12, [dstq+wq]
    404    lea             r13, [lumaq+wq*(1+%2)]
    405    mov           r11mp, r11
    406    mov           r12mp, r12
    407    neg              wq
    408 
        ; --- column without any overlap ---
    409 %%loop_x:
           ; LFSR step: new bit 15 = XOR of seed bits 0, 1, 3 and 12
    410    rorx             r6, seeq, 1
    411    or             seed, 0xeff4
    412    test           seeb, seeh
    413    lea            seed, [r6+0x8000]
    414    cmovp          seed, r6d     ; updated seed
    415    rorx          offyd, seed, 8
    416    rorx          offxq, seeq, 12
    417    and           offyd, 0xf
           ; row step, column step and the base offset all shrink with subsampling
    418    imul          offyd, 164>>%3
    419    lea           offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
    420 
    421    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
    422                h, offxy, see, overlap, _, _, _, stride3
    423 
    424    mov      grain_lutq, grain_lutmp
    425    mov              hd, hm
    426 %%loop_y:
    427 %if %2
           ; four grain rows of 16 bytes
    428    movu           xm21, [grain_lutq+offxyq+82*0]
    429    vinserti128    ym21, [grain_lutq+offxyq+82*1], 1
    430    vinserti32x4    m21, [grain_lutq+offxyq+82*2], 2
    431    vinserti32x4    m21, [grain_lutq+offxyq+82*3], 3
    432 %else
           ; two grain rows of 32 bytes
    433    movu           ym21, [grain_lutq+offxyq+82*0]
    434    vinserti32x8    m21, [grain_lutq+offxyq+82*1], 1
    435 %endif
    436    call %%add_noise
    437    sub              hb, 2<<%2
    438    jg %%loop_y
    439    add              wq, 32>>%2
    440    jge .end
           ; rewind to the top of the next column
    441    mov            srcq, r11mp
    442    mov            dstq, r12mp
    443    lea           lumaq, [r13+wq*(1<<%2)]
    444    add            srcq, wq
    445    add            dstq, wq
    446    test       overlapd, overlapd
    447    jz %%loop_x
           ; the sby register has been reused, so re-read it from its stack slot
    448    cmp       dword r8m, 0       ; sby
    449    jne %%hv_overlap
    450 
    451    ; horizontal overlap (without vertical overlap)
    452 %%loop_x_h_overlap:
    453    rorx             r6, seeq, 1
    454    or             seed, 0xeff4
    455    test           seeb, seeh
    456    lea            seed, [r6+0x8000]
    457    cmovp          seed, r6d     ; updated seed
    458 
    459    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
    460                offx, offy, see, left_offxy, _, _, _, stride3
    461 
           ; the offy register still holds the previous column's offxy here
    462    lea     left_offxyd, [offyq+(32>>%2)]         ; previous column's offy*stride+offx
    463    rorx          offyd, seed, 8
    464    rorx          offxq, seeq, 12
    465    and           offyd, 0xf
    466    imul          offyd, 164>>%3
    467    lea           offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
    468 
    469    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
    470                h, offxy, see, left_offxy, _, _, _, stride3
    471 
    472    mov      grain_lutq, grain_lutmp
    473    mov              hd, hm
    474 %%loop_y_h_overlap:
    475 %if %2
    476    movu           xm20, [grain_lutq+offxyq     +82*0]
    477    movd           xm19, [grain_lutq+left_offxyq+82*0]
    478    vinserti32x4   ym20, [grain_lutq+offxyq     +82*1], 1
    479    vinserti32x4   ym19, [grain_lutq+left_offxyq+82*1], 1
    480    vinserti32x4    m20, [grain_lutq+offxyq     +82*2], 2
    481    vinserti32x4    m19, [grain_lutq+left_offxyq+82*2], 2
    482    vinserti32x4    m20, [grain_lutq+offxyq     +82*3], 3
    483    vinserti32x4    m19, [grain_lutq+left_offxyq+82*3], 3
    484 %else
    485    movu           ym20, [grain_lutq+offxyq     + 0]
    486    movd           xm19, [grain_lutq+left_offxyq+ 0]
    487    vinserti32x8    m20, [grain_lutq+offxyq     +82], 1
    488    vinserti32x4    m19, [grain_lutq+left_offxyq+82], 2
    489 %endif
    490    punpcklbw       m19, m20     ; { left, cur } byte pairs
    491    pmaddubsw       m19, m10, m19
           ; m21/m20 are produced directly in the layout %%add_noise_h expects
    492    punpckhbw       m21, m20, m5
    493    pmulhrsw        m19, m9
    494    vpacksswb   m20{k1}, m19, m19 ; replace the first 4 pixels of every row
    495    punpcklbw       m20, m5, m20
    496    call %%add_noise_h
    497    sub              hb, 2<<%2
    498    jg %%loop_y_h_overlap
    499    add              wq, 32>>%2
    500    jge .end
    501    mov            srcq, r11mp
    502    mov            dstq, r12mp
    503    lea           lumaq, [r13+wq*(1<<%2)]
    504    add            srcq, wq
    505    add            dstq, wq
    506    cmp       dword r8m, 0       ; sby
    507    jne %%hv_overlap
    508    jmp %%loop_x_h_overlap
    509 
        ; --- first column when vertical overlap is required ---
    510 %%v_overlap:
    511    DEFINE_ARGS dst, src, stride, fg_data, w, lstride, grain_lut, \
    512                _, sby, see, overlap, _, _, _, stride3
    513 
           ; two 16-bit seeds in one register: high half for this row of blocks,
           ; low half for the row above (188 and 141 are the sby-1 equivalents
           ; of 105 and 178 modulo 256)
    514    movzx          sbyd, sbyb
    515    imul           seed, [fg_dataq+FGData.seed], 0x00010001
    516    imul            r7d, sbyd, 173 * 0x00010001
    517    imul           sbyd, 37 * 0x01000100
    518    add             r7d, (105 << 16) | 188
    519    add            sbyd, (178 << 24) | (141 << 8)
    520    and             r7d, 0x00ff00ff
    521    and            sbyd, 0xff00ff00
    522    xor            seed, r7d
    523    xor            seed, sbyd    ; (cur_seed << 16) | top_seed
    524 
           ; m13 = vertical overlap weights for { top, cur } pairs, k3 = byte
           ; mask of the rows that get blended. These loads still go through
           ; `base`, so they must stay ahead of the r11 overwrite below.
    525 %if %3
           ; one blended row of 16 pixels
    526    vpbroadcastd    m13, [base+pb_23_22]
    527    kxnorw           k3, k3, k3  ; v_overlap mask
    528 %elif %2
           ; two blended rows of 16 pixels; the 32-byte load relies on the
           ; layout of the constants from pb_27_17 through pb_17_27
    529    vbroadcasti32x8 m13, [base+pb_27_17]
    530    kxnord           k3, k3, k3
    531    pshufd          m13, m13, q0000 ; 8x27_17, 8x17_27
    532 %else
           ; two blended rows of 32 pixels, set up as in the luma function
    533    vpbroadcastd   ym16, [base+pb_27_17]
    534    vpbroadcastd    m13, [base+pb_17_27]
    535    vmovdqa64   m13{k1}, m16
    536 %endif
    537 
    538    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
    539                offx, offy, see, overlap, top_offxy, _, _, stride3
    540 
    541    mov           lumaq, r9mp
    542    lea             r11, [srcq+wq]
    543    lea             r12, [dstq+wq]
    544    lea             r13, [lumaq+wq*(1<<%2)]
    545    mov           r11mp, r11
    546    mov           r12mp, r12
    547    neg              wq
    548 
    549    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    550    mov             r6d, seed
    551    or             seed, 0xeff4eff4
    552    test           seeb, seeh
    553    setp            r7b          ; parity of top_seed
    554    shr            seed, 16
    555    shl             r7d, 16
    556    test           seeb, seeh
    557    setp            r7b          ; parity of cur_seed
    558    or              r6d, 0x00010001
    559    xor             r7d, r6d
    560    rorx           seed, r7d, 1  ; updated (cur_seed << 16) | top_seed
    561    rorx          offyd, seed, 8
    562    rorx          offxd, seed, 12
    563    and           offyd, 0x000f000f
    564    and           offxd, 0x000f000f
    565    imul          offyd, 164>>%3
           ; the trailing (32>>%3)*82 is added to the low (top) half only
    566    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    567    lea           offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
    568 
    569    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
    570                h, offxy, see, overlap, top_offxy, _, _, stride3
    571 
    572    mov      grain_lutq, grain_lutmp
    573    mov              hd, hm
    574    movzx    top_offxyd, offxyw
    575    shr          offxyd, 16
    576 
    577 %if %3
    578    movu           xm18, [grain_lutq+offxyq+82*0]
    579    movu           xm20, [grain_lutq+top_offxyq+82*0]
    580    ; only interpolate first line, insert remaining line unmodified
    581    vbroadcasti128 ym21, [grain_lutq+offxyq+82*1]
    582    vinserti32x4    m21, [grain_lutq+offxyq+82*2], 2
    583    vinserti32x4    m21, [grain_lutq+offxyq+82*3], 3
    584    punpcklbw      xm19, xm20, xm18
    585    punpckhbw      xm20, xm18
    586 %elif %2
           ; interpolate the first two lines, keep lines 2 and 3 unmodified
    587    movu           xm18, [grain_lutq+offxyq+82*0]
    588    vinserti128    ym18, [grain_lutq+offxyq+82*1], 1
    589    movu           xm20, [grain_lutq+top_offxyq+82*0]
    590    vinserti32x4   ym20, [grain_lutq+top_offxyq+82*1], 1
    591    vbroadcasti32x4 m21, [grain_lutq+offxyq+82*2]
    592    vinserti32x4    m21, [grain_lutq+offxyq+82*3], 3
    593    punpcklbw      ym19, ym20, ym18
    594    punpckhbw      ym20, ym18
    595 %else
           ; the pairing of top and cur is done inside %%add_noise_v in this case
    596    movu           ym21, [grain_lutq+offxyq+82*0]
    597    vinserti32x8    m21, [grain_lutq+offxyq+82*1], 1
    598    movu           ym20, [grain_lutq+top_offxyq+82*0]
    599    vinserti32x8    m20, [grain_lutq+top_offxyq+82*1], 1
    600 %endif
    601    call %%add_noise_v
           ; only the first call of a column blends vertically; the remaining
           ; rows continue in the plain loop
    602    sub              hb, 2<<%2
    603    jg %%loop_y
    604    add              wq, 32>>%2
    605    jge .end
    606    mov            srcq, r11mp
    607    mov            dstq, r12mp
    608    lea           lumaq, [r13+wq*(1<<%2)]
    609    add            srcq, wq
    610    add            dstq, wq
    611 
        ; --- column with both horizontal and vertical overlap ---
    612 %%hv_overlap:
    613    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    614    mov             r6d, seed
    615    or             seed, 0xeff4eff4
    616    test           seeb, seeh
    617    setp            r7b          ; parity of top_seed
    618    shr            seed, 16
    619    shl             r7d, 16
    620    test           seeb, seeh
    621    setp            r7b          ; parity of cur_seed
    622    or              r6d, 0x00010001
    623    xor             r7d, r6d
    624    rorx           seed, r7d, 1  ; updated (cur_seed << 16) | top_seed
    625 
    626    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
    627                offx, offy, see, left_offxy, top_offxy, topleft_offxy, _, stride3
    628 
           ; derive the neighbour offsets from the previous column's values
           ; before they are overwritten
    629    lea  topleft_offxyd, [top_offxyq+(32>>%2)]
    630    lea     left_offxyd, [offyq+(32>>%2)]
    631    rorx          offyd, seed, 8
    632    rorx          offxd, seed, 12
    633    and           offyd, 0x000f000f
    634    and           offxd, 0x000f000f
    635    imul          offyd, 164>>%3
    636    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    637    lea           offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
    638 
    639    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
    640                h, offxy, see, left_offxy, top_offxy, topleft_offxy, _, stride3
    641 
    642    mov      grain_lutq, grain_lutmp
    643    mov              hd, hm
    644    movzx    top_offxyd, offxyw
    645    shr          offxyd, 16
    646 
    647 %if %2
    648    movu           xm21, [grain_lutq+offxyq+82*0]
    649    movd           xm16, [grain_lutq+left_offxyq+82*0]
    650    vinserti128    ym21, [grain_lutq+offxyq+82*1], 1
    651    vinserti128    ym16, [grain_lutq+left_offxyq+82*1], 1
    652    vinserti32x4    m21, [grain_lutq+offxyq+82*2], 2
    653    vinserti32x4    m16, [grain_lutq+left_offxyq+82*2], 2
    654    vinserti32x4    m21, [grain_lutq+offxyq+82*3], 3
    655    vinserti32x4    m16, [grain_lutq+left_offxyq+82*3], 3
    656    movd           xm18, [grain_lutq+topleft_offxyq+82*0]
    657    movu           xm20, [grain_lutq+top_offxyq]
    658    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    659    punpcklbw       m16, m21
    660 %if %3
    661    punpcklbw      xm18, xm20
    662 %else
    663    vinserti128    ym18, [grain_lutq+topleft_offxyq+82*1], 1
    664    vinserti128    ym20, [grain_lutq+top_offxyq+82*1], 1
    665    punpcklbw      ym18, ym20
    666 %endif
           ; low qword of each lane: { left, cur } pairs, high qword:
           ; { topleft, top } pairs, so one multiply blends both
    667    punpcklqdq      m16, m18
    668    pmaddubsw       m16, m10, m16
    669    pmulhrsw        m16, m9
    670    packsswb        m16, m16
    671    vmovdqu8    m21{k1}, m16
           ; rotating by 4 bytes brings the blended top pixels to bytes 0-3
    672 %if %3
    673    vpalignr   xm20{k1}, xm16, xm16, 4
    674    punpcklbw      xm19, xm20, xm21
    675    punpckhbw      xm20, xm21
    676 %else
    677    vpalignr   ym20{k1}, ym16, ym16, 4
    678    punpcklbw      ym19, ym20, ym21
    679    punpckhbw      ym20, ym21
    680 %endif
    681 %else
    682    movu           ym21, [grain_lutq+offxyq+82*0]
    683    vinserti32x8    m21, [grain_lutq+offxyq+82*1], 1
    684    movd           xm16, [grain_lutq+left_offxyq+82*0]
    685    vinserti32x4    m16, [grain_lutq+left_offxyq+82*1], 2
    686    movu           ym20, [grain_lutq+top_offxyq+82*0]
    687    vinserti32x8    m20, [grain_lutq+top_offxyq+82*1], 1
    688    movd           xm18, [grain_lutq+topleft_offxyq+82*0]
    689    vinserti32x4    m18, [grain_lutq+topleft_offxyq+82*1], 2
    690    punpcklbw       m16, m21
    691    punpcklbw       m18, m20
    692    punpcklqdq      m16, m18
    693    pmaddubsw       m16, m10, m16
    694    pmulhrsw        m16, m9
    695    packsswb        m16, m16
    696    vpalignr    m20{k1}, m16, m16, 4
    697    vmovdqu8    m21{k1}, m16
    698 %endif
    699    call %%add_noise_v
           ; remaining rows of this column need horizontal overlap only
    700    sub              hb, 2<<%2
    701    jg %%loop_y_h_overlap
    702    add              wq, 32>>%2
    703    jge .end
    704    mov            srcq, r11mp
    705    mov            dstq, r12mp
    706    lea           lumaq, [r13+wq*(1<<%2)]
    707    add            srcq, wq
    708    add            dstq, wq
    709    jmp %%hv_overlap
    710 ALIGN function_align
        ; %%add_noise_v
        ;   Blends { top, cur } grain pairs with the weights in m13 and leaves
        ;   64 signed grain bytes in m21, then falls through.
        ;   in (ss_hor):  m19/m20 = { top, cur } pairs of the rows to blend,
        ;                 m21 = unblended grain for the remaining rows; k3
        ;                 restricts the write to the blended rows
        ;   in (!ss_hor): m20 = top grain, m21 = current grain
    711 %%add_noise_v:
    712 %if %3
    713    pmaddubsw      xm19, xm13, xm19
    714    pmaddubsw      xm20, xm13, xm20
    715    pmulhrsw       xm19, xm9
    716    pmulhrsw       xm20, xm9
    717    vpacksswb   m21{k3}, m19, m20
    718 %elif %2
    719    pmaddubsw      ym19, ym13, ym19
    720    pmaddubsw      ym20, ym13, ym20
    721    pmulhrsw       ym19, ym9
    722    pmulhrsw       ym20, ym9
    723    vpacksswb   m21{k3}, m19, m20
    724 %else
    725    punpcklbw       m19, m20, m21
    726    punpckhbw       m20, m21
    727    pmaddubsw       m19, m13, m19
    728    pmaddubsw       m20, m13, m20
    729    pmulhrsw        m19, m9
    730    pmulhrsw        m20, m9
    731    packsswb        m21, m19, m20
    732 %endif
        ; %%add_noise
        ;   in: m21 = 64 grain bytes. Expands them to m20 = { 0, grain } pairs
        ;   (bytes 0-7 of each lane) and m21 = { grain, 0 } pairs (bytes 8-15),
        ;   then falls through.
    733 %%add_noise:
    734    punpcklbw       m20, m5, m21
    735    punpckhbw       m21, m5
        ; %%add_noise_h
        ;   in: m20/m21 = grain pairs as described above.
        ;   Loads luma into m18 and chroma into m17, builds the scaling index in
        ;   m18 and continues at .add_noise_main.
    736 %%add_noise_h:
           ; with ss_ver only every other luma row is read
    737    mova           ym18, [lumaq+lstrideq*(0<<%3)]
    738    vinserti32x8    m18, [lumaq+lstrideq*(1<<%3)], 1
    739 %if %2
           ; two more luma rows, then average horizontally adjacent luma bytes
           ; so that the 4x32 luma samples match the 4x16 chroma samples
    740    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
    741    mova           ym16, [lumaq+lstrideq*(0<<%3)]
    742    vinserti32x8    m16, [lumaq+lstrideq*(1<<%3)], 1
    743    mova           xm17, [srcq+strideq*0]
    744    mova            m19, m11
    745    vpermi2b        m19, m18, m16            ; even luma bytes
    746    vinserti128    ym17, [srcq+strideq*1], 1
    747    vpermt2b        m18, m12, m16            ; odd luma bytes
    748    vinserti32x4    m17, [srcq+strideq*2], 2
    749    pavgb           m18, m19
    750    vinserti32x4    m17, [srcq+stride3q ], 3
    751 %else
    752    mova           ym17, [srcq+strideq*0]
    753    vinserti32x8    m17, [srcq+strideq*1], 1
    754 %endif
    755 %if %1
           ; scaling index = clamp_u8(((luma*uv_luma_mult + chroma*uv_mult) >> 6)
           ;                          + uv_offset)
    756    punpckhbw       m19, m18, m17
    757    punpcklbw       m18, m17     ; { luma, chroma }
    758    pmaddubsw       m19, m14
    759    pmaddubsw       m18, m14
    760    psraw           m19, 6
    761    psraw           m18, 6
    762    paddw           m19, m15
    763    paddw           m18, m15
    764    packuswb        m18, m19
        ; Shared tail. This is a function-local label (not a %%label), so it is
        ; emitted once per function, by the first (not-csfl) expansion; the csfl
        ; expansion jumps here with the luma value itself as the scaling index.
        ;   in:  m18 = scaling index, m17 = chroma source, m20/m21 = grain pairs
        ;   Advances grain_lutq, lumaq, srcq and dstq to the next group of rows.
    765 .add_noise_main:
    766    mova            m19, m0
    767    vpermt2b        m19, m18, m1 ; scaling[  0..127]
    768    vpmovb2m         k2, m18                  ; k2 = indices with bit 7 set
    769    vpermi2b        m18, m2, m3  ; scaling[128..255]
    770    vmovdqu8    m19{k2}, m18     ; scaling[src]
    771    pshufb          m19, m4
    772    pmaddubsw       m18, m19, m20
    773    pmaddubsw       m19, m21
    774    add      grain_lutq, 82*2<<%2
    775    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
    776    lea            srcq, [srcq+strideq*(2<<%2)]
    777    pmulhrsw        m18, m6      ; noise
    778    pmulhrsw        m19, m6
    779    punpcklbw       m16, m17, m5 ; chroma
    780    punpckhbw       m17, m5
    781    paddw           m16, m18
    782    paddw           m17, m19
    783    packuswb        m16, m17
    784    pmaxub          m16, m7
    785    pminub          m16, m8
    786 %if %2
    787    mova          [dstq+strideq*0], xm16
    788    vextracti128  [dstq+strideq*1], ym16, 1
    789    vextracti32x4 [dstq+strideq*2], m16, 2
    790    vextracti32x4 [dstq+stride3q ], m16, 3
    791 %else
    792    mova          [dstq+strideq*0], ym16
    793    vextracti32x8 [dstq+strideq*1], m16, 1
    794 %endif
    795    lea            dstq, [dstq+strideq*(2<<%2)]
    796    ret
    797 %else
    798    jmp .add_noise_main
    799 %endif
    800 %endmacro
    801 
           ; The not-csfl expansion must come first: it defines .add_noise_main,
           ; which the csfl expansion reuses.
    802    %%FGUV_32x32xN_LOOP 1, %2, %3
    803 .csfl:
    804    %%FGUV_32x32xN_LOOP 0, %2, %3
    805 .end:
    806    RET
    807 %endmacro
    808 
    809 FGUV_FN 420, 1, 1 ; i420: ss_hor = 1, ss_ver = 1
    810 FGUV_FN 422, 1, 0 ; i422: ss_hor = 1, ss_ver = 0
    811 FGUV_FN 444, 0, 0 ; i444: no chroma subsampling
    812 
    813 %endif ; ARCH_X86_64