tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

ipred16_sse.asm (135959B)


      1 ; Copyright © 2021, VideoLAN and dav1d authors
      2 ; Copyright © 2021, Two Orioles, LLC
      3 ; All rights reserved.
      4 ;
      5 ; Redistribution and use in source and binary forms, with or without
      6 ; modification, are permitted provided that the following conditions are met:
      7 ;
      8 ; 1. Redistributions of source code must retain the above copyright notice, this
      9 ;    list of conditions and the following disclaimer.
     10 ;
     11 ; 2. Redistributions in binary form must reproduce the above copyright notice,
     12 ;    this list of conditions and the following disclaimer in the documentation
     13 ;    and/or other materials provided with the distribution.
     14 ;
     15 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     16 ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     17 ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     18 ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     19 ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     20 ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     21 ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     22 ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     24 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25 
     26 %include "config.asm"
     27 %include "ext/x86/x86inc.asm"
     28 
     29 SECTION_RODATA
     30 
     31 filter_shuf:   db  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,  4,  5,  2,  3, -1, -1 ; pshufb control, word pairs; -1 lanes produce zero
     32 pal_pred_shuf: db  0,  2,  4,  6,  8, 10, 12, 14,  1,  3,  5,  7,  9, 11, 13, 15 ; deinterleave lo/hi bytes of 8 packed words
     33 z_base_inc:    dw   0*64,   1*64,   2*64,   3*64,   4*64,   5*64,   6*64,   7*64 ; per-lane position steps in 1/64-pel units
     34 z_base_inc_z2: dw   7*64,   6*64,   5*64,   4*64,   3*64,   2*64,   1*64,   0*64 ; reversed variant for the z2 predictor
     35 z_upsample:    db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
     36 z2_upsample_l: db -1, -1, -2, -1, -3, -1, -4, -1,  8,  9,  8,  9, 10, 11, 12, 13 ; first 8 bytes double as words -1..-4 (see pw_m1to4 alias below)
     37               db  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13
     38 z2_top_shufA:  db  0,  1,  2,  3,  4,  5,  6,  7,  2,  3,  4,  5,  6,  7,  8,  9
     39 z2_top_shufB:  db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
     40 z2_left_shufA: db 14, 15, 12, 13, 10, 11,  8,  9, 12, 13, 10, 11,  8,  9,  6,  7
     41 z2_left_shufB: db 14, 15, 10, 11,  6,  7,  2,  3, 12, 13,  8,  9,  4,  5,  0,  1
     42 z_filt_wh16:   db 19, 19, 19, 23, 23, 23, 31, 31, 31, 47, 47, 47, 79, 79, 79, -1 ; edge-filter strength thresholds (angle-dependent)
     43 z_filt_t_w48:  db 55,127,  7,127, 15, 31, 39, 31,127, 39,127, 39,  7, 15, 31, 15
     44               db 39, 63,  3, 63,  3,  3, 19,  3, 47, 19, 47, 19,  3,  3,  3,  3
     45 z_filt_t_w16:  db 15, 31,  7, 15, 31,  7,  3, 31,  3,  3,  3,  3,  3,  3,  0,  0
     46 z_filt_wh4:    db  7,  7, 19,  7,
     47 z_filt_wh8:    db 19, 19, 11, 19, 11, 15, 15, 15, 23, 23, 23, 23, 39, 39, 39, 39 ; note: wh4 has only 4 entries and deliberately runs into wh8
     48 ALIGN 8
     49 pb_2_3:   times 4 db 2, 3 ; as a pshufb control: broadcast word 1 (bytes 2,3)
     50 z2_dy_offset:     dw 96*64, 96*64, 95*64, 95*64
     51 z_filt_k: times 4 dw 8 ; 3-tap smoothing kernel rows: 8, 6, 4, 5 (rows also aliased as pw_8/pw_4 below)
     52          times 4 dw 6
     53          times 4 dw 4
     54          times 4 dw 5
     55 pw_m3584: times 4 dw -3584
     56 pw_m3072: times 4 dw -3072
     57 pw_m2560: times 4 dw -2560
     58 pw_m2048: times 4 dw -2048
     59 pw_m1536: times 4 dw -1536
     60 pw_m1024: times 4 dw -1024
     61 pw_m512:  times 4 dw -512
     62 pw_1:     times 4 dw 1
     63 pw_2:     times 4 dw 2
     64 pw_3:     times 4 dw 3
     65 pw_62:    times 4 dw 62
     66 pw_256:   times 4 dw 256 ; also reused as a pshufb control: broadcast word 0 (bytes 0,1)
     67 pw_512:   times 4 dw 512 ; mid-gray for 10-bit; pw_2048 directly below is mid-gray for 12-bit (layout relied on by ipred_dc_128)
     68 pw_2048:  times 4 dw 2048
     69 
     70 %define pw_4 (z_filt_k+8*2) ; alias the "4" row of z_filt_k instead of storing a duplicate constant
     71 %define pw_8 (z_filt_k+8*0) ; alias the "8" row of z_filt_k
     72 %define pw_m1to4 z2_upsample_l ; first 4 words of z2_upsample_l are -1,-2,-3,-4
     73 
     74 ; JMP_TABLE name, isa, label...
     75 ; Emits a table of 32-bit offsets to <name>_<isa>'s local .labels, and defines
     76 ; <name>_<isa>_table biased by -2*4 so callers can index with values starting at 2
     77 ; (tzcnt of the smallest block size, 4).
     78 %macro JMP_TABLE 3-*
     79    %xdefine %1_%2_table (%%table - 2*4)
     80    %xdefine %%base mangle(private_prefix %+ _%1_%2)
     81    %%table:
     82    %rep %0 - 2
     83        dd %%base %+ .%3 - (%%table - 2*4) ; offset from the biased table base to the .label
     84        %rotate 1
     85    %endrep
     86 %endmacro
     87 
     88 ; The dc/cfl tables are shared between several entry points via fixed offsets,
     89 ; so the label order in the JMP_TABLE invocations below is load-bearing.
     90 %define ipred_dc_splat_16bpc_ssse3_table (ipred_dc_16bpc_ssse3_table + 10*4)
     91 %define ipred_dc_128_16bpc_ssse3_table   (ipred_dc_16bpc_ssse3_table + 15*4)
     92 %define ipred_cfl_splat_16bpc_ssse3_table (ipred_cfl_16bpc_ssse3_table + 8*4)
     93 
     94 JMP_TABLE ipred_dc_left_16bpc,    ssse3, h4, h8, h16, h32, h64
     95 JMP_TABLE ipred_dc_16bpc,         ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
     96                                         s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4, \
     97                                         s4-15*4, s8-15*4, s16c-15*4, s32c-15*4, s64-15*4
     98 JMP_TABLE ipred_h_16bpc,          ssse3, w4, w8, w16, w32, w64
     99 JMP_TABLE ipred_z1_16bpc,         ssse3, w4, w8, w16, w32, w64
    100 JMP_TABLE ipred_z2_16bpc,         ssse3, w4, w8, w16, w32, w64
    101 JMP_TABLE ipred_z3_16bpc,         ssse3, h4, h8, h16, h32, h64
    102 JMP_TABLE ipred_cfl_16bpc,        ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \
    103                                         s4-8*4, s8-8*4, s16-8*4, s32-8*4
    104 JMP_TABLE ipred_cfl_left_16bpc,   ssse3, h4, h8, h16, h32
    105 JMP_TABLE ipred_cfl_ac_444_16bpc, ssse3, w4, w8, w16, w32
    106 JMP_TABLE pal_pred_16bpc,         ssse3, w4, w8, w16, w32, w64
    107 
    108 cextern smooth_weights_1d_16bpc
    109 cextern smooth_weights_2d_16bpc
    110 cextern dr_intra_derivative
    111 cextern filter_intra_taps
    106 
    107 SECTION .text
    108 
    109 INIT_XMM ssse3
    110 ; DC_TOP: predict the whole block as the average of the w top neighbors.
    111 ; Reuses ipred_dc_left's .h* accumulator code (indexed by log2(w)) for the sum,
    112 ; and the dc_128 table's .s* routines for the splat store.
    113 cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h
    114    LEA                  r5, ipred_dc_left_16bpc_ssse3_table
    115    movd                 m4, wm
    116    tzcnt                wd, wm ; wd = log2(w)
    117    add                 tlq, 2 ; skip the top-left pixel (2 bytes/px); tlq now points at the top row
    118    movifnidn            hd, hm
    119    pxor                 m3, m3
    120    pavgw                m4, m3 ; m4 = (w+1)>>1 = w/2 rounding bias (w is a power of 2, m3 = 0)
    121    movd                 m5, wd ; shift count for the final divide
    122    movu                 m0, [tlq]
    123    movsxd               r6, [r5+wq*4] ; sum routine: dc_left's .h* chain, driven by w instead of h
    124    add                  r6, r5
    125    add                  r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table ; retarget r5 at the store-routine table
    126    movsxd               wq, [r5+wq*4]
    127    add                  wq, r5
    128    jmp                  r6 ; accumulate; that code ends with "jmp wq" into the .s* splat store
    126 
    127 ; DC_LEFT: predict the whole block as the average of the h left neighbors.
    128 ; The .h* labels below form a fall-through accumulation chain (h64 -> h32 -> ...)
    129 ; that is also entered directly by ipred_dc_top via the jump table.
    130 cglobal ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
    131    LEA                  r5, ipred_dc_left_16bpc_ssse3_table
    132    mov                  hd, hm
    133    movd                 m4, hm
    134    tzcnt               r6d, hd ; r6d = log2(h)
    135    sub                 tlq, hq
    136    tzcnt                wd, wm
    137    pxor                 m3, m3
    138    sub                 tlq, hq ; tlq -= 2*h total: 16-bit pixels, point at the bottom of the left column
    139    pavgw                m4, m3 ; m4 = h/2 rounding bias
    140    movd                 m5, r6d ; shift count = log2(h)
    141    movu                 m0, [tlq]
    142    movsxd               r6, [r5+r6*4] ; height-specific accumulator entry
    143    add                  r6, r5
    144    add                  r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table
    145    movsxd               wq, [r5+wq*4] ; width-specific splat-store routine, taken at the end of .h4
    146    add                  wq, r5
    147    jmp                  r6
    148 .h64:
    149    movu                 m2, [tlq+112]
    150    movu                 m1, [tlq+ 96]
    151    paddw                m0, m2
    152    movu                 m2, [tlq+ 80]
    153    paddw                m1, m2
    154    movu                 m2, [tlq+ 64]
    155    paddw                m0, m2
    156    paddw                m0, m1
    157 .h32:
    158    movu                 m1, [tlq+ 48]
    159    movu                 m2, [tlq+ 32]
    160    paddw                m1, m2
    161    paddw                m0, m1
    162 .h16:
    163    movu                 m1, [tlq+ 16]
    164    paddw                m0, m1
    165 .h8:
    166    movhlps              m1, m0
    167    paddw                m0, m1
    168 .h4:
    169    punpcklwd            m0, m3 ; widen the remaining 4 words to dwords (m3 = 0)
    170    paddd                m4, m0 ; add the rounding bias
    171    punpckhqdq           m0, m0
    172    paddd                m0, m4
    173    pshuflw              m4, m0, q1032
    174    paddd                m0, m4 ; horizontal reduction complete: low dword = sum + bias
    175    psrld                m0, m5 ; average = sum >> log2(n)
    176    lea            stride3q, [strideq*3]
    177    pshuflw              m0, m0, q0000
    178    punpcklqdq           m0, m0 ; broadcast the DC value to all 8 words
    179    jmp                  wq ; tail-jump to the width-specific splat store
    177 
    178 ; Full DC: mean of the w top and h left neighbors ((sum + (w+h)/2) / (w+h)).
    179 ; Dispatch: r6 -> .h* (sum left column), which falls through to "jmp wq" -> .w*
    180 ; (add top row, divide, splat) -> .s* store loop. When w+h is not a power of
    181 ; two the divide uses a Q16 reciprocal with pmulhuw (see .w*_mul paths).
    182 cglobal ipred_dc_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3
    183    movifnidn            hd, hm
    184    tzcnt               r6d, hd
    185    lea                 r5d, [wq+hq] ; total number of samples
    186    movd                 m4, r5d
    187    tzcnt               r5d, r5d ; log2(w+h) rounded down
    188    movd                 m5, r5d
    189    LEA                  r5, ipred_dc_16bpc_ssse3_table
    190    tzcnt                wd, wd
    191    movsxd               r6, [r5+r6*4] ; left-column sum routine (.h*)
    192    movsxd               wq, [r5+wq*4+5*4] ; top-row routine (.w*), after the 5 height entries
    193    pxor                 m3, m3
    194    psrlw                m4, 1 ; m4 = (w+h)/2 rounding bias
    195    add                  r6, r5
    196    add                  wq, r5
    197    lea            stride3q, [strideq*3]
    198    jmp                  r6
    199 .h4:
    200    movq                 m0, [tlq-8] ; 4 left pixels (2 bytes each)
    201    jmp                  wq
    202 .w4:
    203    movq                 m1, [tlq+2]
    204    paddw                m1, m0
    205    punpckhwd            m0, m3
    206    punpcklwd            m1, m3
    207    paddd                m0, m1
    208    paddd                m4, m0
    209    punpckhqdq           m0, m0
    210    paddd                m0, m4
    211    pshuflw              m1, m0, q1032
    212    paddd                m0, m1 ; low dword = biased sum of all neighbors
    213    cmp                  hd, 4
    214    jg .w4_mul
    215    psrlw                m0, 3 ; 4x4: divide by 8 directly
    216    jmp .w4_end
    217 .w4_mul:
    218    ; w+h is 3*2^k or 5*2^k: shift out the power of two, then multiply by a
    219    ; Q16 reciprocal (0xAAAB ~= 2/3, 0x6667 ~= 2/5) and shift once more.
    220    mov                 r2d, 0xAAAB
    221    mov                 r3d, 0x6667
    222    cmp                  hd, 16
    223    cmove               r2d, r3d ; h=16 -> total 20 -> /5, else totals 12/24 -> /3
    224    psrld                m0, 2
    225    movd                 m1, r2d
    226    pmulhuw              m0, m1
    227    psrlw                m0, 1
    228 .w4_end:
    229    pshuflw              m0, m0, q0000 ; splat DC into the low half
    230 .s4:
    231    movq   [dstq+strideq*0], m0
    232    movq   [dstq+strideq*1], m0
    233    movq   [dstq+strideq*2], m0
    234    movq   [dstq+stride3q ], m0
    235    lea                dstq, [dstq+strideq*4]
    236    sub                  hd, 4
    237    jg .s4
    238    RET
    239 .h8:
    240    mova                 m0, [tlq-16]
    241    jmp                  wq
    242 .w8:
    243    movu                 m1, [tlq+2]
    244    paddw                m0, m1
    245    punpcklwd            m1, m0, m3
    246    punpckhwd            m0, m3
    247    paddd                m0, m1
    248    paddd                m4, m0
    249    punpckhqdq           m0, m0
    250    paddd                m0, m4
    251    pshuflw              m1, m0, q1032
    252    paddd                m0, m1
    253    psrld                m0, m5 ; divide by the power-of-two part of w+h
    254    cmp                  hd, 8
    255    je .w8_end ; square: division already complete
    256    mov                 r2d, 0xAAAB
    257    mov                 r3d, 0x6667
    258    cmp                  hd, 32
    259    cmove               r2d, r3d ; h=32 -> total 40 -> /5, else /3
    260    movd                 m1, r2d
    261    pmulhuw              m0, m1
    262    psrlw                m0, 1
    263 .w8_end:
    264    pshuflw              m0, m0, q0000
    265    punpcklqdq           m0, m0
    266 .s8:
    267    mova   [dstq+strideq*0], m0
    268    mova   [dstq+strideq*1], m0
    269    mova   [dstq+strideq*2], m0
    270    mova   [dstq+stride3q ], m0
    271    lea                dstq, [dstq+strideq*4]
    272    sub                  hd, 4
    273    jg .s8
    274    RET
    275 .h16:
    276    mova                 m0, [tlq-32]
    277    paddw                m0, [tlq-16]
    278    jmp                  wq
    279 .w16:
    280    movu                 m1, [tlq+ 2]
    281    movu                 m2, [tlq+18]
    282    paddw                m1, m2
    283    paddw                m0, m1
    284    punpckhwd            m1, m0, m3
    285    punpcklwd            m0, m3
    286    paddd                m0, m1
    287    paddd                m4, m0
    288    punpckhqdq           m0, m0
    289    paddd                m0, m4
    290    pshuflw              m1, m0, q1032
    291    paddd                m0, m1
    292    psrld                m0, m5
    293    cmp                  hd, 16
    294    je .w16_end
    295    mov                 r2d, 0xAAAB
    296    mov                 r3d, 0x6667
    297    test                 hd, 8|32
    298    cmovz               r2d, r3d ; h=8/32 -> totals 24/48 -> /3; h=4/64 -> totals 20/80 -> /5
    299    movd                 m1, r2d
    300    pmulhuw              m0, m1
    301    psrlw                m0, 1
    302 .w16_end:
    303    pshuflw              m0, m0, q0000
    304    punpcklqdq           m0, m0
    305 .s16c:
    306    mova                 m1, m0 ; "c" entry: dc_splat callers arrive with only m0 set
    307 .s16:
    308    mova [dstq+strideq*0+16*0], m0
    309    mova [dstq+strideq*0+16*1], m1
    310    mova [dstq+strideq*1+16*0], m0
    311    mova [dstq+strideq*1+16*1], m1
    312    mova [dstq+strideq*2+16*0], m0
    313    mova [dstq+strideq*2+16*1], m1
    314    mova [dstq+stride3q +16*0], m0
    315    mova [dstq+stride3q +16*1], m1
    316    lea                dstq, [dstq+strideq*4]
    317    sub                  hd, 4
    318    jg .s16
    319    RET
    320 .h32:
    321    mova                 m0, [tlq-64]
    322    paddw                m0, [tlq-48]
    323    paddw                m0, [tlq-32]
    324    paddw                m0, [tlq-16]
    325    jmp                  wq
    326 .w32:
    327    movu                 m1, [tlq+ 2]
    328    movu                 m2, [tlq+18]
    329    paddw                m1, m2
    330    movu                 m2, [tlq+34]
    331    paddw                m0, m2
    332    movu                 m2, [tlq+50]
    333    paddw                m1, m2
    334    paddw                m0, m1
    335    punpcklwd            m1, m0, m3
    336    punpckhwd            m0, m3
    337    paddd                m0, m1
    338    paddd                m4, m0
    339    punpckhqdq           m0, m0
    340    paddd                m0, m4
    341    pshuflw              m1, m0, q1032
    342    paddd                m0, m1
    343    psrld                m0, m5
    344    cmp                  hd, 32
    345    je .w32_end
    346    mov                 r2d, 0xAAAB
    347    mov                 r3d, 0x6667
    348    cmp                  hd, 8
    349    cmove               r2d, r3d ; h=8 -> total 40 -> /5, else totals 48/96 -> /3
    350    movd                 m1, r2d
    351    pmulhuw              m0, m1
    352    psrlw                m0, 1
    353 .w32_end:
    354    pshuflw              m0, m0, q0000
    355    punpcklqdq           m0, m0
    356 .s32c:
    357    mova                 m1, m0 ; fill the extra store registers for splat callers
    358    mova                 m2, m0
    359    mova                 m3, m0
    360 .s32:
    361    mova [dstq+strideq*0+16*0], m0
    362    mova [dstq+strideq*0+16*1], m1
    363    mova [dstq+strideq*0+16*2], m2
    364    mova [dstq+strideq*0+16*3], m3
    365    mova [dstq+strideq*1+16*0], m0
    366    mova [dstq+strideq*1+16*1], m1
    367    mova [dstq+strideq*1+16*2], m2
    368    mova [dstq+strideq*1+16*3], m3
    369    lea                dstq, [dstq+strideq*2]
    370    sub                  hd, 2
    371    jg .s32
    372    RET
    373 .h64:
    374    mova                 m0, [tlq-128]
    375    mova                 m1, [tlq-112]
    376    paddw                m0, [tlq- 96]
    377    paddw                m1, [tlq- 80]
    378    paddw                m0, [tlq- 64]
    379    paddw                m1, [tlq- 48]
    380    paddw                m0, [tlq- 32]
    381    paddw                m1, [tlq- 16]
    382    paddw                m0, m1
    383    jmp                  wq
    384 .w64:
    385    movu                 m1, [tlq+  2]
    386    movu                 m2, [tlq+ 18]
    387    paddw                m1, m2
    388    movu                 m2, [tlq+ 34]
    389    paddw                m0, m2
    390    movu                 m2, [tlq+ 50]
    391    paddw                m1, m2
    392    movu                 m2, [tlq+ 66]
    393    paddw                m0, m2
    394    movu                 m2, [tlq+ 82]
    395    paddw                m1, m2
    396    movu                 m2, [tlq+ 98]
    397    paddw                m0, m2
    398    movu                 m2, [tlq+114]
    399    paddw                m1, m2
    400    paddw                m0, m1
    401    punpcklwd            m1, m0, m3
    402    punpckhwd            m0, m3
    403    paddd                m0, m1
    404    paddd                m4, m0
    405    punpckhqdq           m0, m0
    406    paddd                m0, m4
    407    pshuflw              m1, m0, q1032
    408    paddd                m0, m1
    409    psrld                m0, m5
    410    cmp                  hd, 64
    411    je .w64_end
    412    mov                 r2d, 0xAAAB
    413    mov                 r3d, 0x6667
    414    cmp                  hd, 16
    415    cmove               r2d, r3d ; h=16 -> total 80 -> /5, else totals 96/192 -> /3
    416    movd                 m1, r2d
    417    pmulhuw              m0, m1
    418    psrlw                m0, 1
    419 .w64_end:
    420    pshuflw              m0, m0, q0000
    421    punpcklqdq           m0, m0
    422 .s64:
    423    mova        [dstq+16*0], m0
    424    mova        [dstq+16*1], m0
    425    mova        [dstq+16*2], m0
    426    mova        [dstq+16*3], m0
    427    mova        [dstq+16*4], m0
    428    mova        [dstq+16*5], m0
    429    mova        [dstq+16*6], m0
    430    mova        [dstq+16*7], m0
    431    add                dstq, strideq
    432    dec                  hd
    433    jg .s64
    434    RET
    429 
    430 ; DC_128: no usable neighbors — fill with half the pixel range.
    431 ; NOTE(review): r8m appears to be bitdepth_max (0x3ff or 0xfff) — confirm against caller.
    432 cglobal ipred_dc_128_16bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
    433    mov                 r6d, r8m
    434    LEA                  r5, ipred_dc_128_16bpc_ssse3_table
    435    tzcnt                wd, wm
    436    shr                 r6d, 11 ; 0x3ff>>11 = 0 (10-bit), 0xfff>>11 = 1 (12-bit)
    437    movifnidn            hd, hm
    438    movsxd               wq, [r5+wq*4]
    439    movddup              m0, [r5-ipred_dc_128_16bpc_ssse3_table+pw_512+r6*8] ; 512 or 2048 (pw_512/pw_2048 are adjacent in rodata)
    440    add                  wq, r5
    441    lea            stride3q, [strideq*3]
    442    jmp                  wq ; tail-jump into ipred_dc's .s* store loop
    441 
    442 ; Vertical: replicate the top row down every row of the block.
    443 ; For w <= 32 the preloaded m0..m3 are stored by tail-jumping into the
    444 ; dc_splat .s* loops; w = 64 needs 8 registers and has its own loop.
    445 cglobal ipred_v_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3
    446    LEA                  r5, ipred_dc_splat_16bpc_ssse3_table
    447    movifnidn            hd, hm
    448    movu                 m0, [tlq+  2] ; top row, 2 bytes past the top-left pixel
    449    movu                 m1, [tlq+ 18]
    450    movu                 m2, [tlq+ 34]
    451    movu                 m3, [tlq+ 50]
    452    cmp                  wd, 64
    453    je .w64
    454    tzcnt                wd, wd
    455    movsxd               wq, [r5+wq*4]
    456    add                  wq, r5
    457    lea            stride3q, [strideq*3]
    458    jmp                  wq
    459 .w64:
    460    WIN64_SPILL_XMM 8 ; xmm6/xmm7 are callee-saved on Win64 — spill before use
    461    movu                 m4, [tlq+ 66]
    462    movu                 m5, [tlq+ 82]
    463    movu                 m6, [tlq+ 98]
    464    movu                 m7, [tlq+114]
    465 .w64_loop:
    466    mova        [dstq+16*0], m0
    467    mova        [dstq+16*1], m1
    468    mova        [dstq+16*2], m2
    469    mova        [dstq+16*3], m3
    470    mova        [dstq+16*4], m4
    471    mova        [dstq+16*5], m5
    472    mova        [dstq+16*6], m6
    473    mova        [dstq+16*7], m7
    474    add                dstq, strideq
    475    dec                  hd
    476    jg .w64_loop
    477    RET
    475 
    476 ; Horizontal: replicate each left-neighbor pixel across its row.
    477 ; Left pixels sit below tlq in memory, highest address = topmost row, so tlq
    478 ; walks downward. m2/m3 hold pshufb broadcast controls for the wide cases
    479 ; (the .w4/.w8 paths overwrite them, which is fine — they don't use them).
    480 cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h, stride3
    481 %define base r5-ipred_h_16bpc_ssse3_table
    482    tzcnt                wd, wm
    483    LEA                  r5, ipred_h_16bpc_ssse3_table
    484    movifnidn            hd, hm
    485    movsxd               wq, [r5+wq*4]
    486    movddup              m2, [base+pw_256] ; pshufb control: broadcast word 0
    487    movddup              m3, [base+pb_2_3] ; pshufb control: broadcast word 1
    488    add                  wq, r5
    489    lea            stride3q, [strideq*3]
    490    jmp                  wq
    491 .w4:
    492    sub                 tlq, 8 ; step up 4 left pixels
    493    movq                 m3, [tlq]
    494    pshuflw              m0, m3, q3333 ; topmost of the 4 rows
    495    pshuflw              m1, m3, q2222
    496    pshuflw              m2, m3, q1111
    497    pshuflw              m3, m3, q0000
    498    movq   [dstq+strideq*0], m0
    499    movq   [dstq+strideq*1], m1
    500    movq   [dstq+strideq*2], m2
    501    movq   [dstq+stride3q ], m3
    502    lea                dstq, [dstq+strideq*4]
    503    sub                  hd, 4
    504    jg .w4
    505    RET
    506 .w8:
    507    sub                 tlq, 8
    508    movq                 m3, [tlq]
    509    punpcklwd            m3, m3
    510    pshufd               m0, m3, q3333
    511    pshufd               m1, m3, q2222
    512    pshufd               m2, m3, q1111
    513    pshufd               m3, m3, q0000
    514    mova   [dstq+strideq*0], m0
    515    mova   [dstq+strideq*1], m1
    516    mova   [dstq+strideq*2], m2
    517    mova   [dstq+stride3q ], m3
    518    lea                dstq, [dstq+strideq*4]
    519    sub                  hd, 4
    520    jg .w8
    521    RET
    522 .w16:
    523    sub                 tlq, 4 ; two pixels (rows) per iteration
    524    movd                 m1, [tlq]
    525    pshufb               m0, m1, m3 ; broadcast the upper pixel ([tlq+2])
    526    pshufb               m1, m2 ; broadcast the lower pixel ([tlq+0])
    527    mova [dstq+strideq*0+16*0], m0
    528    mova [dstq+strideq*0+16*1], m0
    529    mova [dstq+strideq*1+16*0], m1
    530    mova [dstq+strideq*1+16*1], m1
    531    lea                dstq, [dstq+strideq*2]
    532    sub                  hd, 2
    533    jg .w16
    534    RET
    535 .w32:
    536    sub                 tlq, 4
    537    movd                 m1, [tlq]
    538    pshufb               m0, m1, m3
    539    pshufb               m1, m2
    540    mova [dstq+strideq*0+16*0], m0
    541    mova [dstq+strideq*0+16*1], m0
    542    mova [dstq+strideq*0+16*2], m0
    543    mova [dstq+strideq*0+16*3], m0
    544    mova [dstq+strideq*1+16*0], m1
    545    mova [dstq+strideq*1+16*1], m1
    546    mova [dstq+strideq*1+16*2], m1
    547    mova [dstq+strideq*1+16*3], m1
    548    lea                dstq, [dstq+strideq*2]
    549    sub                  hd, 2
    550    jg .w32
    551    RET
    552 .w64:
    553    sub                 tlq, 2 ; one pixel (row) per iteration
    554    movd                 m0, [tlq]
    555    pshufb               m0, m2
    556    mova        [dstq+16*0], m0
    557    mova        [dstq+16*1], m0
    558    mova        [dstq+16*2], m0
    559    mova        [dstq+16*3], m0
    560    mova        [dstq+16*4], m0
    561    mova        [dstq+16*5], m0
    562    mova        [dstq+16*6], m0
    563    mova        [dstq+16*7], m0
    564    add                dstq, strideq
    565    dec                  hd
    566    jg .w64
    567    RET
    564 
    565 ; Paeth: per pixel, pick left/top/topleft — whichever is closest to the
    566 ; gradient estimate base = left + top - topleft.
    567 cglobal ipred_paeth_16bpc, 4, 6, 8, dst, stride, tl, w, h, left
    568 %define base r5-ipred_paeth_16bpc_ssse3_table
    569    movifnidn            hd, hm
    570    pshuflw              m4, [tlq], q0000
    571    mov               leftq, tlq
    572    add                  hd, hd ; hd = 2*h = byte size of the left column
    573    punpcklqdq           m4, m4      ; topleft
    574    sub               leftq, hq ; leftq -> bottom of the left column
    575    and                  wd, ~7
    576    jnz .w8 ; w&~7 == 0 only for w = 4
    577    movddup              m5, [tlq+2] ; top
    578    psubw                m6, m5, m4 ; top - topleft
    579    pabsw                m7, m6 ; ldiff = |top - topleft| (distance of base from left)
    580 .w4_loop:
    581    movd                 m1, [leftq+hq-4] ; two left pixels for this row pair
    582    punpcklwd            m1, m1
    583    punpckldq            m1, m1      ; left
    584 ; PAETH: m1 = left, m4 = topleft, m5 = top, m6 = top-topleft, m7 = ldiff.
    585 ; Result in m0; clobbers m0, m2, m3.
    586 %macro PAETH 0
    587    paddw                m0, m6, m1 ; base - topleft
    588    psubw                m2, m4, m0  ; tldiff
    589    psubw                m0, m5      ; tdiff
    590    pabsw                m2, m2
    591    pabsw                m0, m0
    592    pminsw               m2, m0 ; min(tldiff, tdiff)
    593    pcmpeqw              m0, m2 ; lanes where top beats (or ties) topleft
    594    pand                 m3, m5, m0
    595    pandn                m0, m4
    596    por                  m0, m3 ; select top or topleft
    597    pcmpgtw              m3, m7, m2 ; lanes where ldiff > min: keep that choice...
    598    pand                 m0, m3
    599    pandn                m3, m1
    600    por                  m0, m3 ; ...otherwise left wins
    601 %endmacro
    602    PAETH
    603    movhps [dstq+strideq*0], m0 ; high half = upper row
    604    movq   [dstq+strideq*1], m0
    605    lea                dstq, [dstq+strideq*2]
    606    sub                  hd, 2*2
    607    jg .w4_loop
    608    RET
    609 .w8:
    610 %if ARCH_X86_32
    611    PUSH                 r6
    612    %define             r7d  hm ; reuse the h stack slot as the outer-loop counter
    613    %assign regs_used     7
    614 %elif WIN64
    615    movaps              r4m, m8 ; m8 exceeds the 8 declared xmm; xmm8 is callee-saved on Win64 — spill it manually
    616    PUSH                 r7
    617    %assign regs_used     8
    618 %endif
    619 %if ARCH_X86_64
    620    movddup              m8, [pw_256] ; pshufb control: broadcast word 0
    621 %endif
    622    lea                 tlq, [tlq+wq*2+2]
    623    neg                  wq ; process the top row in 8-pixel strips, right to left offsetting
    624    mov                 r7d, hd
    625 .w8_loop0:
    626    movu                 m5, [tlq+wq*2] ; top strip
    627    mov                  r6, dstq
    628    add                dstq, 16
    629    psubw                m6, m5, m4
    630    pabsw                m7, m6
    631 .w8_loop:
    632    movd                 m1, [leftq+hq-2] ; left pixel for this row
    633 %if ARCH_X86_64
    634    pshufb               m1, m8
    635 %else
    636    pshuflw              m1, m1, q0000
    637    punpcklqdq           m1, m1
    638 %endif
    639    PAETH
    640    mova               [r6], m0
    641    add                  r6, strideq
    642    sub                  hd, 1*2
    643    jg .w8_loop
    644    mov                  hd, r7d ; reset row counter for the next strip
    645    add                  wq, 8
    646    jl .w8_loop0
    647 %if WIN64
    648    movaps               m8, r4m ; restore the manually spilled xmm8
    649 %endif
    650    RET
    647 
    648 ; t0 register for the smooth_v/smooth_h outer loops (r7 needs 64-bit reg count).
    649 %if ARCH_X86_64
    650 DECLARE_REG_TMP 7
    651 %else
    652 DECLARE_REG_TMP 4
    653 %endif
    653 
    654 ; SMOOTH_V: per row, blend the top row towards the bottom-left pixel:
    655 ; pred = bottom + weight*(top - bottom), via pmulhrsw.
    656 ; NOTE(review): smooth_weights_1d_16bpc appears pre-scaled for pmulhrsw
    657 ; ((a*b + 0x4000) >> 15) — confirm against the weights table definition.
    658 cglobal ipred_smooth_v_16bpc, 4, 6, 6, dst, stride, tl, w, h, weights
    659    LEA            weightsq, smooth_weights_1d_16bpc
    660    mov                  hd, hm
    661    lea            weightsq, [weightsq+hq*4] ; weights block for this height
    662    neg                  hq ; count rows upward from -h to 0
    663    movd                 m5, [tlq+hq*2] ; bottom
    664    pshuflw              m5, m5, q0000
    665    punpcklqdq           m5, m5 ; broadcast bottom pixel
    666    cmp                  wd, 4
    667    jne .w8
    668    movddup              m4, [tlq+2]    ; top
    669    lea                  r3, [strideq*3]
    670    psubw                m4, m5         ; top - bottom
    671 .w4_loop:
    672    movq                 m1, [weightsq+hq*2] ; 4 row weights
    673    punpcklwd            m1, m1
    674    pshufd               m0, m1, q1100 ; weights for rows 0/1
    675    punpckhdq            m1, m1 ; weights for rows 2/3
    676    pmulhrsw             m0, m4
    677    pmulhrsw             m1, m4
    678    paddw                m0, m5
    679    paddw                m1, m5
    680    movq   [dstq+strideq*0], m0
    681    movhps [dstq+strideq*1], m0
    682    movq   [dstq+strideq*2], m1
    683    movhps [dstq+r3       ], m1
    684    lea                dstq, [dstq+strideq*4]
    685    add                  hq, 4
    686    jl .w4_loop
    687    RET
    688 .w8:
    689 %if ARCH_X86_32
    690    PUSH                 r6
    691    %assign regs_used     7
    692    mov                  hm, hq ; keep the (negative) row count in the stack slot
    693    %define              hq  hm
    694 %elif WIN64
    695    PUSH                 r7
    696    %assign regs_used     8
    697 %endif
    698 .w8_loop0: ; one iteration per 8-pixel-wide column strip
    699    mov                  t0, hq
    700    movu                 m4, [tlq+2]
    701    add                 tlq, 16
    702    mov                  r6, dstq
    703    add                dstq, 16
    704    psubw                m4, m5 ; top - bottom for this strip
    705 .w8_loop:
    706    movq                 m3, [weightsq+t0*2] ; 4 row weights
    707    punpcklwd            m3, m3
    708    pshufd               m0, m3, q0000
    709    pshufd               m1, m3, q1111
    710    pshufd               m2, m3, q2222
    711    pshufd               m3, m3, q3333
    712    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
    713    REPX   {paddw    x, m5}, m0, m1, m2, m3
    714    mova     [r6+strideq*0], m0
    715    mova     [r6+strideq*1], m1
    716    lea                  r6, [r6+strideq*2]
    717    mova     [r6+strideq*0], m2
    718    mova     [r6+strideq*1], m3
    719    lea                  r6, [r6+strideq*2]
    720    add                  t0, 4
    721    jl .w8_loop
    722    sub                  wd, 8
    723    jg .w8_loop0
    724    RET
    721 
    722 ; SMOOTH_H: per column, blend the left column towards the top-right pixel:
    723 ; pred = right + weight*(left - right), via pmulhrsw (same Q15 weight scheme
    724 ; as smooth_v).
    725 cglobal ipred_smooth_h_16bpc, 3, 6, 6, dst, stride, tl, w, h, weights
    726    LEA            weightsq, smooth_weights_1d_16bpc
    727    mov                  wd, wm
    728    movifnidn            hd, hm
    729    movd                 m5, [tlq+wq*2] ; right
    730    sub                 tlq, 8
    731    add                  hd, hd ; hd = 2*h (byte size of the left column)
    732    pshuflw              m5, m5, q0000
    733    sub                 tlq, hq ; tlq -> below the left column
    734    punpcklqdq           m5, m5 ; broadcast right pixel
    735    cmp                  wd, 4
    736    jne .w8
    737    movddup              m4, [weightsq+4*2] ; the 4 column weights for w=4
    738    lea                  r3, [strideq*3]
    739 .w4_loop:
    740    movq                 m1, [tlq+hq]   ; left
    741    punpcklwd            m1, m1
    742    psubw                m1, m5         ; left - right
    743    pshufd               m0, m1, q3322 ; rows 0/1 (upper pixels are at higher addresses)
    744    punpckldq            m1, m1 ; rows 2/3
    745    pmulhrsw             m0, m4
    746    pmulhrsw             m1, m4
    747    paddw                m0, m5
    748    paddw                m1, m5
    749    movhps [dstq+strideq*0], m0
    750    movq   [dstq+strideq*1], m0
    751    movhps [dstq+strideq*2], m1
    752    movq   [dstq+r3       ], m1
    753    lea                dstq, [dstq+strideq*4]
    754    sub                  hd, 4*2
    755    jg .w4_loop
    756    RET
    757 .w8:
    758    lea            weightsq, [weightsq+wq*4]
    759    neg                  wq ; 8-pixel column strips, counted up to 0
    760 %if ARCH_X86_32
    761    PUSH                 r6
    762    %assign regs_used     7
    763    %define              hd  hm
    764 %elif WIN64
    765    PUSH                 r7
    766    %assign regs_used     8
    767 %endif
    768 .w8_loop0: ; one iteration per 8-wide strip
    769    mov                 t0d, hd
    770    mova                 m4, [weightsq+wq*2] ; 8 column weights
    771    mov                  r6, dstq
    772    add                dstq, 16
    773 .w8_loop:
    774    movq                 m3, [tlq+t0*(1+ARCH_X86_32)] ; t0 indexes bytes on x86_64, words on x86_32
    775    punpcklwd            m3, m3
    776    psubw                m3, m5 ; left - right
    777    pshufd               m0, m3, q3333 ; topmost of the 4 rows
    778    pshufd               m1, m3, q2222
    779    pshufd               m2, m3, q1111
    780    pshufd               m3, m3, q0000
    781    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
    782    REPX   {paddw    x, m5}, m0, m1, m2, m3
    783    mova     [r6+strideq*0], m0
    784    mova     [r6+strideq*1], m1
    785    lea                  r6, [r6+strideq*2]
    786    mova     [r6+strideq*0], m2
    787    mova     [r6+strideq*1], m3
    788    lea                  r6, [r6+strideq*2]
    789    sub                 t0d, 4*(1+ARCH_X86_64) ; 4 rows = 8 bytes (x86_64) / 4 words (x86_32)
    790    jg .w8_loop
    791    add                  wq, 8
    792    jl .w8_loop0
    793    RET
    791 
    792 ; t0 register for the 2D smooth predictor below.
    793 %if ARCH_X86_64
    794 DECLARE_REG_TMP 10
    795 %else
    796 DECLARE_REG_TMP 3
    797 %endif
    797 
;---------------------------------------------------------------------
; ipred_smooth_16bpc (SSSE3)
; 2-D SMOOTH intra prediction, 16 bpc. Each output pixel is a weighted
; blend of the (top, bottom) pair via the per-row vertical weight and
; of the (left, right) pair via the per-column horizontal weights,
; both taken from smooth_weights_2d_16bpc. The two pmaddwd sums are
; combined, then scaled with psrld 8 / packssdw / pavgw-with-zero,
; i.e. effectively (sum + 256) >> 9 with rounding.
; In:  dst, stride, tl (top-left edge pointer), wm/hm = block size.
; The "bottom" sample is tl[-h], the "right" sample is tl[w]
; (loaded below and broadcast into m6/m7).
;---------------------------------------------------------------------
    798 cglobal ipred_smooth_16bpc, 3, 7, 8, dst, stride, tl, w, h, \
    799                                     h_weights, v_weights, top
    800    LEA          h_weightsq, smooth_weights_2d_16bpc
    801    mov                  wd, wm
    802    mov                  hd, hm
    803    movd                 m7, [tlq+wq*2] ; right
    804    lea          v_weightsq, [h_weightsq+hq*8]
    805    neg                  hq
    806    movd                 m6, [tlq+hq*2] ; bottom
    807    pshuflw              m7, m7, q0000
    808    pshuflw              m6, m6, q0000
    809    cmp                  wd, 4
    810    jne .w8
; w == 4: whole 4-wide block fits in one register pair; m4 holds
; interleaved (top, bottom) words, m5 the 4 horizontal weight pairs.
    811    movq                 m4, [tlq+2]    ; top
    812    mova                 m5, [h_weightsq+4*4]
    813    punpcklwd            m4, m6         ; top, bottom
    814    pxor                 m6, m6
; each iteration produces two 4-pixel rows (hq counts up from -h to 0)
    815 .w4_loop:
    816    movq                 m1, [v_weightsq+hq*4]
    817    sub                 tlq, 4
    818    movd                 m3, [tlq]      ; left
    819    pshufd               m0, m1, q0000
    820    pshufd               m1, m1, q1111
    821    pmaddwd              m0, m4
    822    punpcklwd            m3, m7         ; left, right
    823    pmaddwd              m1, m4
    824    pshufd               m2, m3, q1111
    825    pshufd               m3, m3, q0000
    826    pmaddwd              m2, m5
    827    pmaddwd              m3, m5
    828    paddd                m0, m2
    829    paddd                m1, m3
; scale: >>8, pack to words, then pavgw with 0 -> rounded >>1
    830    psrld                m0, 8
    831    psrld                m1, 8
    832    packssdw             m0, m1
    833    pavgw                m0, m6
    834    movq   [dstq+strideq*0], m0
    835    movhps [dstq+strideq*1], m0
    836    lea                dstq, [dstq+strideq*2]
    837    add                  hq, 2
    838    jl .w4_loop
    839    RET
; w >= 8: process the block in 8-column strips. On x86-32 the per-strip
; horizontal weights stay in memory (m8/m9 %defined as memory operands)
; and tlq/hq are spilled to stack args; on x86-64 they live in m8/m9
; with r8/r9 preserving tlq/hq across strips (WIN64 must save xmm8/9).
    840 .w8:
    841 %if ARCH_X86_32
    842    lea          h_weightsq, [h_weightsq+wq*4]
    843    mov                  t0, tlq
    844    mov                 r1m, tlq
    845    mov                 r2m, hq
    846    %define              m8  [h_weightsq+16*0]
    847    %define              m9  [h_weightsq+16*1]
    848 %else
    849 %if WIN64
    850    movaps              r4m, m8
    851    movaps              r6m, m9
    852    PUSH                 r7
    853    PUSH                 r8
    854 %endif
    855    PUSH                 r9
    856    PUSH                r10
    857    %assign       regs_used  11
    858    lea          h_weightsq, [h_weightsq+wq*8]
    859    lea                topq, [tlq+wq*2]
    860    neg                  wq
    861    mov                  r8, tlq
    862    mov                  r9, hq
    863 %endif
    864    punpcklqdq           m6, m6
; outer loop: one 8-column strip per iteration
    865 .w8_loop0:
    866 %if ARCH_X86_32
    867    movu                 m5, [t0+2]
    868    add                  t0, 16
    869    mov                 r0m, t0
    870 %else
    871    movu                 m5, [topq+wq*2+2]
    872    mova                 m8, [h_weightsq+wq*4+16*0]
    873    mova                 m9, [h_weightsq+wq*4+16*1]
    874 %endif
    875    mov                  t0, dstq
    876    add                dstq, 16
    877    punpcklwd            m4, m5, m6
    878    punpckhwd            m5, m6
; inner loop: one 8-pixel row of the current strip per iteration
    879 .w8_loop:
    880    movd                 m1, [v_weightsq+hq*4]
    881    sub                 tlq, 2
    882    movd                 m3, [tlq]      ; left
    883    pshufd               m1, m1, q0000
    884    pmaddwd              m0, m4, m1
    885    pshuflw              m3, m3, q0000
    886    pmaddwd              m1, m5
    887    punpcklwd            m3, m7         ; left, right
    888    pmaddwd              m2, m8, m3
    889    pmaddwd              m3, m9
    890    paddd                m0, m2
    891    paddd                m1, m3
    892    psrld                m0, 8
    893    psrld                m1, 8
    894    packssdw             m0, m1
    895    pxor                 m1, m1
    896    pavgw                m0, m1
    897    mova               [t0], m0
    898    add                  t0, strideq
    899    inc                  hq
    900    jl .w8_loop
; restore per-strip state and advance to the next 8 columns
    901 %if ARCH_X86_32
    902    mov                  t0, r0m
    903    mov                 tlq, r1m
    904    add          h_weightsq, 16*2
    905    mov                  hq, r2m
    906    sub            dword wm, 8
    907    jg .w8_loop0
    908 %else
    909    mov                 tlq, r8
    910    mov                  hq, r9
    911    add                  wq, 8
    912    jl .w8_loop0
    913 %endif
    914 %if WIN64
    915    movaps               m8, r4m
    916    movaps               m9, r6m
    917 %endif
    918    RET
    919 
    920 %if ARCH_X86_64
    921 cglobal ipred_z1_16bpc, 3, 8, 8, 16*18, dst, stride, tl, w, h, angle, dx
    922    %define            base  r7-$$
    923    %define          bdmaxm  r8m
    924    lea                  r7, [$$]
    925 %else
    926 cglobal ipred_z1_16bpc, 3, 7, 8, -16*18, dst, stride, tl, w, h, angle, dx
    927    %define            base  r1-$$
    928    %define        stridemp  [rsp+4*0]
    929    %define          bdmaxm  [rsp+4*1]
    930    mov                  r3, r8m
    931    mov            stridemp, r1
    932    mov              bdmaxm, r3
    933    LEA                  r1, $$
    934 %endif
    935    tzcnt                wd, wm
    936    movifnidn        angled, anglem
    937    movifnidn            hd, hm
    938    add                 tlq, 2
    939    movsxd               wq, [base+ipred_z1_16bpc_ssse3_table+wq*4]
    940    mov                 dxd, angled
    941    movddup              m0, [base+pw_256]
    942    and                 dxd, 0x7e
    943    movddup              m7, [base+pw_62]
    944    add              angled, 165 ; ~90
    945    lea                  wq, [base+wq+ipred_z1_16bpc_ssse3_table]
    946    movzx               dxd, word [base+dr_intra_derivative+dxq]
    947    xor              angled, 0x4ff ; d = 90 - angle
    948    jmp                  wq
    949 .w4:
    950    lea                 r3d, [angleq+88]
    951    test                r3d, 0x480
    952    jnz .w4_no_upsample ; !enable_intra_edge_filter || angle >= 40
    953    sar                 r3d, 9
    954    add                 r3d, hd
    955    cmp                 r3d, 8
    956    jg .w4_no_upsample ; h > 8 || (w == h && is_sm)
    957    movd                 m3, [tlq+14]
    958    movu                 m2, [tlq+ 0]  ; 1 2 3 4 5 6 7 8
    959    movd                 m1, bdmaxm
    960    pshufb               m3, m0
    961    palignr              m4, m3, m2, 4 ; 3 4 5 6 7 8 8 8
    962    paddw                m4, [tlq- 2]  ; 0 1 2 3 4 5 6 7
    963    add                 dxd, dxd
    964    mova           [rsp+32], m3
    965    palignr              m3, m2, 2     ; 2 3 4 5 6 7 8 8
    966    pshufb               m1, m0
    967    paddw                m3, m2        ; -1 * a + 9 * b + 9 * c + -1 * d
    968    psubw                m5, m3, m4    ; = (b + c - a - d + (b + c) << 3 + 8) >> 4
    969    movd                 m4, dxd
    970    psraw                m5, 3         ; = ((b + c - a - d) >> 3 + b + c + 1) >> 1
    971    paddw                m3, m5
    972    pxor                 m5, m5
    973    pmaxsw               m3, m5
    974    mov                 r3d, dxd
    975    pavgw                m3, m5
    976    pshufb               m4, m0
    977    pminsw               m3, m1
    978    punpcklwd            m1, m2, m3
    979    punpckhwd            m2, m3
    980    mova                 m3, [base+z_upsample]
    981    movifnidn       strideq, stridemp
    982    mova           [rsp+ 0], m1
    983    paddw                m5, m4, m4
    984    mova           [rsp+16], m2
    985    punpcklqdq           m4, m5 ; xpos0 xpos1
    986 .w4_upsample_loop:
    987    lea                 r2d, [r3+dxq]
    988    shr                 r3d, 6 ; base0
    989    movu                 m1, [rsp+r3*2]
    990    lea                 r3d, [r2+dxq]
    991    shr                 r2d, 6 ; base1
    992    movu                 m2, [rsp+r2*2]
    993    pshufb               m1, m3
    994    pshufb               m2, m3
    995    punpcklqdq           m0, m1, m2
    996    punpckhqdq           m1, m2
    997    pand                 m2, m7, m4 ; frac
    998    psllw                m2, 9      ; (a * (64 - frac) + b * frac + 32) >> 6
    999    psubw                m1, m0     ; = a + (((b - a) * frac + 32) >> 6)
   1000    pmulhrsw             m1, m2     ; = a + (((b - a) * (frac << 9) + 16384) >> 15)
   1001    paddw                m4, m5     ; xpos += dx
   1002    paddw                m0, m1
   1003    movq   [dstq+strideq*0], m0
   1004    movhps [dstq+strideq*1], m0
   1005    lea                dstq, [dstq+strideq*2]
   1006    sub                  hd, 2
   1007    jg .w4_upsample_loop
   1008    RET
   1009 .w4_no_upsample:
   1010    mov                 r3d, 7     ; max_base
   1011    test             angled, 0x400 ; !enable_intra_edge_filter
   1012    jnz .w4_main
   1013    lea                 r3d, [hq+3]
   1014    movd                 m1, r3d
   1015    movd                 m3, angled
   1016    shr              angled, 8 ; is_sm << 1
   1017    pxor                 m2, m2
   1018    pshufb               m1, m2
   1019    pshufb               m3, m2
   1020    pcmpeqb              m1, [base+z_filt_wh4]
   1021    pand                 m1, m3
   1022    pcmpgtb              m1, [base+z_filt_t_w48+angleq*8]
   1023    pmovmskb            r5d, m1
   1024    mov                 r3d, 7
   1025    test                r5d, r5d
   1026    jz .w4_main ; filter_strength == 0
   1027    pshuflw              m1, [tlq-2], q0000
   1028    movu                 m2, [tlq+16*0]
   1029    imul                r5d, 0x55555555
   1030    movd                 m3, [tlq+r3*2]
   1031    shr                 r5d, 30 ; filter_strength
   1032    movd           [rsp+12], m1
   1033    pshuflw              m3, m3, q0000
   1034    mova         [rsp+16*1], m2
   1035    lea                 r2d, [r3+2]
   1036    movq      [rsp+r3*2+18], m3
   1037    cmp                  hd, 8
   1038    cmovae              r3d, r2d
   1039    lea                 tlq, [rsp+16*1]
   1040    call .filter_edge
   1041 .w4_main:
   1042    lea                 tlq, [tlq+r3*2]
   1043    movd                 m4, dxd
   1044    movddup              m1, [base+z_base_inc] ; base_inc << 6
   1045    movd                 m6, [tlq] ; top[max_base_x]
   1046    shl                 r3d, 6
   1047    movd                 m3, r3d
   1048    pshufb               m4, m0
   1049    mov                 r5d, dxd ; xpos
   1050    pshufb               m6, m0
   1051    sub                  r5, r3
   1052    pshufb               m3, m0
   1053    paddw                m5, m4, m4
   1054    psubw                m3, m1 ; max_base_x
   1055    punpcklqdq           m4, m5 ; xpos0 xpos1
   1056    movifnidn       strideq, stridemp
   1057 .w4_loop:
   1058    lea                  r3, [r5+dxq]
   1059    sar                  r5, 6      ; base0
   1060    movq                 m0, [tlq+r5*2+0]
   1061    movq                 m1, [tlq+r5*2+2]
   1062    lea                  r5, [r3+dxq]
   1063    sar                  r3, 6      ; base1
   1064    movhps               m0, [tlq+r3*2+0]
   1065    movhps               m1, [tlq+r3*2+2]
   1066    pand                 m2, m7, m4
   1067    psllw                m2, 9
   1068    psubw                m1, m0
   1069    pmulhrsw             m1, m2
   1070    pcmpgtw              m2, m3, m4 ; xpos < max_base_x
   1071    paddw                m4, m5     ; xpos += dx
   1072    paddw                m0, m1
   1073    pand                 m0, m2
   1074    pandn                m2, m6
   1075    por                  m0, m2
   1076    movq   [dstq+strideq*0], m0
   1077    movhps [dstq+strideq*1], m0
   1078    sub                  hd, 2
   1079    jz .w4_end
   1080    lea                dstq, [dstq+strideq*2]
   1081    test                r5d, r5d
   1082    jl .w4_loop
   1083 .w4_end_loop:
   1084    movq   [dstq+strideq*0], m6
   1085    movq   [dstq+strideq*1], m6
   1086    lea                dstq, [dstq+strideq*2]
   1087    sub                  hd, 2
   1088    jg .w4_end_loop
   1089 .w4_end:
   1090    RET
   1091 .w8:
   1092    lea                 r3d, [angleq+88]
   1093    and                 r3d, ~0x7f
   1094    or                  r3d, hd
   1095    cmp                 r3d, 8
   1096    ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
   1097    movu                 m1, [tlq+ 0]  ; 1 2 3 4 5 6 7 8
   1098    movu                 m5, [tlq+ 2]  ; 2 3 4 5 6 7 8 9
   1099    movu                 m3, [tlq+ 4]  ; 3 4 5 6 7 8 9 a
   1100    paddw                m5, m1
   1101    paddw                m3, [tlq- 2]  ; 0 1 2 3 4 5 6 7
   1102    psubw                m2, m5, m3
   1103    movu                 m6, [tlq+18]  ; a b c d e f g _
   1104    psraw                m2, 3
   1105    movu                 m3, [tlq+20]  ; b c d e f g _ _
   1106    paddw                m5, m2
   1107    movu                 m2, [tlq+16]  ; 9 a b c d e f g
   1108    paddw                m6, m2
   1109    add                 dxd, dxd
   1110    cmp                  hd, 4
   1111    jne .w8_upsample_h8 ; awkward single-pixel edge case
   1112    pshuflw              m3, m3, q1110 ; b c c _ _ _ _ _
   1113 .w8_upsample_h8:
   1114    paddw                m3, [tlq+14]  ; 8 9 a b c d e f
   1115    psubw                m4, m6, m3
   1116    movd                 m3, bdmaxm
   1117    psraw                m4, 3
   1118    mov                 r3d, dxd
   1119    paddw                m6, m4
   1120    pxor                 m4, m4
   1121    pmaxsw               m5, m4
   1122    pmaxsw               m6, m4
   1123    pshufb               m3, m0
   1124    pavgw                m5, m4
   1125    pavgw                m6, m4
   1126    movd                 m4, dxd
   1127    pminsw               m5, m3
   1128    pminsw               m6, m3
   1129    mova                 m3, [base+z_upsample]
   1130    pshufb               m4, m0
   1131    movifnidn       strideq, stridemp
   1132    punpcklwd            m0, m1, m5
   1133    mova           [rsp+ 0], m0
   1134    punpckhwd            m1, m5
   1135    mova           [rsp+16], m1
   1136    punpcklwd            m0, m2, m6
   1137    mova           [rsp+32], m0
   1138    punpckhwd            m2, m6
   1139    mova           [rsp+48], m2
   1140    mova                 m5, m4
   1141 .w8_upsample_loop:
   1142    mov                 r2d, r3d
   1143    shr                 r2d, 6
   1144    movu                 m1, [rsp+r2*2+ 0]
   1145    movu                 m2, [rsp+r2*2+16]
   1146    add                 r3d, dxd
   1147    pshufb               m1, m3
   1148    pshufb               m2, m3
   1149    punpcklqdq           m0, m1, m2
   1150    punpckhqdq           m1, m2
   1151    pand                 m2, m7, m4
   1152    psllw                m2, 9
   1153    psubw                m1, m0
   1154    pmulhrsw             m1, m2
   1155    paddw                m4, m5
   1156    paddw                m0, m1
   1157    mova             [dstq], m0
   1158    add                dstq, strideq
   1159    dec                  hd
   1160    jg .w8_upsample_loop
   1161    RET
   1162 .w8_no_upsample:
   1163    lea                 r3d, [hq+7]
   1164    movd                 m1, r3d
   1165    and                 r3d, 7
   1166    or                  r3d, 8 ; imin(h+7, 15)
   1167    test             angled, 0x400
   1168    jnz .w8_main
   1169    movd                 m3, angled
   1170    shr              angled, 8 ; is_sm << 1
   1171    pxor                 m2, m2
   1172    pshufb               m1, m2
   1173    pshufb               m3, m2
   1174    movu                 m2, [base+z_filt_wh8]
   1175    psrldq               m4, [base+z_filt_t_w48+angleq*8], 4
   1176    pcmpeqb              m2, m1
   1177    pand                 m2, m3
   1178    pcmpgtb              m2, m4
   1179    pmovmskb            r5d, m2
   1180    test                r5d, r5d
   1181    jz .w8_main ; filter_strength == 0
   1182    pshuflw              m1, [tlq-2], q0000
   1183    movu                 m2, [tlq+16*0]
   1184    imul                r5d, 0x55555555
   1185    movu                 m3, [tlq+16*1]
   1186    movd                 m4, [tlq+r3*2]
   1187    shr                 r5d, 30 ; filter_strength
   1188    movd           [rsp+12], m1
   1189    mova         [rsp+16*1], m2
   1190    pshuflw              m4, m4, q0000
   1191    mova         [rsp+16*2], m3
   1192    lea                 r2d, [r3+2]
   1193    movq      [rsp+r3*2+18], m4
   1194    cmp                  hd, 16
   1195    cmovae              r3d, r2d
   1196    lea                 tlq, [rsp+16*1]
   1197    call .filter_edge
   1198 .w8_main:
   1199    lea                 tlq, [tlq+r3*2]
   1200    movd                 m5, dxd
   1201    mova                 m4, [base+z_base_inc]
   1202    shl                 r3d, 6
   1203    movd                 m6, [tlq] ; top[max_base_x]
   1204    movd                 m1, r3d
   1205    pshufb               m5, m0
   1206    mov                 r5d, dxd ; xpos
   1207    pshufb               m1, m0
   1208    sub                  r5, r3
   1209    psubw                m4, m1 ; max_base_x
   1210    pshufb               m6, m0
   1211    paddw                m4, m5
   1212    movifnidn       strideq, stridemp
   1213 .w8_loop:
   1214    mov                  r3, r5
   1215    sar                  r3, 6
   1216    movu                 m0, [tlq+r3*2+0]
   1217    movu                 m1, [tlq+r3*2+2]
   1218    pand                 m2, m7, m4
   1219    psllw                m2, 9
   1220    psubw                m1, m0
   1221    pmulhrsw             m1, m2
   1222    psraw                m2, m4, 15 ; xpos < max_base_x
   1223    paddw                m4, m5     ; xpos += dx
   1224    paddw                m0, m1
   1225    pand                 m0, m2
   1226    pandn                m2, m6
   1227    por                  m0, m2
   1228    mova             [dstq], m0
   1229    dec                  hd
   1230    jz .w8_end
   1231    add                dstq, strideq
   1232    add                  r5, dxq
   1233    jl .w8_loop
   1234 .w8_end_loop:
   1235    mova             [dstq], m6
   1236    add                dstq, strideq
   1237    dec                  hd
   1238    jg .w8_end_loop
   1239 .w8_end:
   1240    RET
   1241 .w16:
   1242 %if ARCH_X86_32
   1243    %define         strideq  r3
   1244 %endif
   1245    lea                 r3d, [hq+15]
   1246    movd                 m1, r3d
   1247    and                 r3d, 15
   1248    or                  r3d, 16 ; imin(h+15, 31)
   1249    test             angled, 0x400
   1250    jnz .w16_main
   1251    movd                 m3, angled
   1252    shr              angled, 8 ; is_sm << 1
   1253    pxor                 m2, m2
   1254    pshufb               m1, m2
   1255    pshufb               m3, m2
   1256    movq                 m4, [base+z_filt_t_w16+angleq*4]
   1257    pcmpeqb              m1, [base+z_filt_wh16]
   1258    pand                 m1, m3
   1259    pcmpgtb              m1, m4
   1260    pmovmskb            r5d, m1
   1261    test                r5d, r5d
   1262    jz .w16_main ; filter_strength == 0
   1263    pshuflw              m1, [tlq-2], q0000
   1264    movu                 m2, [tlq+16*0]
   1265    imul                r5d, 0x24924924
   1266    movu                 m3, [tlq+16*1]
   1267    movu                 m4, [tlq+16*2]
   1268    shr                 r5d, 30
   1269    movu                 m5, [tlq+16*3]
   1270    movd                 m6, [tlq+r3*2]
   1271    adc                 r5d, -1 ; filter_strength
   1272    movd           [rsp+12], m1
   1273    mova         [rsp+16*1], m2
   1274    mova         [rsp+16*2], m3
   1275    pshuflw              m6, m6, q0000
   1276    mova         [rsp+16*3], m4
   1277    mova         [rsp+16*4], m5
   1278    lea                 r2d, [r3+2]
   1279    movq      [rsp+r3*2+18], m6
   1280    cmp                  hd, 32
   1281    cmovae              r3d, r2d
   1282    lea                 tlq, [rsp+16*1]
   1283    call .filter_edge
   1284 .w16_main:
   1285    lea                 tlq, [tlq+r3*2]
   1286    movd                 m5, dxd
   1287    mova                 m4, [base+z_base_inc]
   1288    shl                 r3d, 6
   1289    movd                 m6, [tlq] ; top[max_base_x]
   1290    movd                 m1, r3d
   1291    pshufb               m5, m0
   1292    mov                 r5d, dxd ; xpos
   1293    pshufb               m1, m0
   1294    sub                  r5, r3
   1295    psubw                m4, m1 ; max_base_x
   1296    pshufb               m6, m0
   1297    paddw                m4, m5
   1298 .w16_loop:
   1299    mov                  r3, r5
   1300    sar                  r3, 6
   1301    movu                 m0, [tlq+r3*2+ 0]
   1302    movu                 m2, [tlq+r3*2+ 2]
   1303    pand                 m3, m7, m4
   1304    psllw                m3, 9
   1305    psubw                m2, m0
   1306    pmulhrsw             m2, m3
   1307    movu                 m1, [tlq+r3*2+16]
   1308    paddw                m0, m2
   1309    movu                 m2, [tlq+r3*2+18]
   1310    psubw                m2, m1
   1311    pmulhrsw             m2, m3
   1312    movddup              m3, [base+pw_m512]
   1313    paddw                m1, m2
   1314    psraw                m2, m4, 15
   1315    pcmpgtw              m3, m4
   1316    paddw                m4, m5
   1317    pand                 m0, m2
   1318    pandn                m2, m6
   1319    pand                 m1, m3
   1320    pandn                m3, m6
   1321    por                  m0, m2
   1322    mova        [dstq+16*0], m0
   1323    por                  m1, m3
   1324    mova        [dstq+16*1], m1
   1325    dec                  hd
   1326    jz .w16_end
   1327    movifnidn       strideq, stridemp
   1328    add                dstq, strideq
   1329    add                  r5, dxq
   1330    jl .w16_loop
   1331 .w16_end_loop:
   1332    mova        [dstq+16*0], m6
   1333    mova        [dstq+16*1], m6
   1334    add                dstq, strideq
   1335    dec                  hd
   1336    jg .w16_end_loop
   1337 .w16_end:
   1338    RET
   1339 .w32:
   1340    lea                 r3d, [hq+31]
   1341    and                 r3d, 31
   1342    or                  r3d, 32    ; imin(h+31, 63)
   1343    test             angled, 0x400 ; !enable_intra_edge_filter
   1344    jnz .w32_main
   1345    call .filter_copy
   1346    lea                 r5d, [r3+2]
   1347    cmp                  hd, 64
   1348    cmove               r3d, r5d
   1349    call .filter_edge_s3
   1350 .w32_main:
   1351    lea                 tlq, [tlq+r3*2]
   1352    movd                 m5, dxd
   1353    mova                 m4, [base+z_base_inc]
   1354    shl                 r3d, 6
   1355    movd                 m6, [tlq] ; top[max_base_x]
   1356    movd                 m1, r3d
   1357    pshufb               m5, m0
   1358    mov                 r5d, dxd ; xpos
   1359    pshufb               m1, m0
   1360    sub                  r5, r3
   1361    psubw                m4, m1 ; max_base_x
   1362    pshufb               m6, m0
   1363    paddw                m4, m5
   1364 .w32_loop:
   1365    mov                  r3, r5
   1366    sar                  r3, 6
   1367    movu                 m0, [tlq+r3*2+ 0]
   1368    movu                 m2, [tlq+r3*2+ 2]
   1369    pand                 m3, m7, m4
   1370    psllw                m3, 9
   1371    psubw                m2, m0
   1372    pmulhrsw             m2, m3
   1373    movu                 m1, [tlq+r3*2+16]
   1374    paddw                m0, m2
   1375    movu                 m2, [tlq+r3*2+18]
   1376    psubw                m2, m1
   1377    pmulhrsw             m2, m3
   1378    paddw                m1, m2
   1379    psraw                m2, m4, 15
   1380    pand                 m0, m2
   1381    pandn                m2, m6
   1382    por                  m0, m2
   1383    movddup              m2, [base+pw_m512]
   1384    pcmpgtw              m2, m4
   1385    pand                 m1, m2
   1386    pandn                m2, m6
   1387    mova        [dstq+16*0], m0
   1388    por                  m1, m2
   1389    mova        [dstq+16*1], m1
   1390    movu                 m0, [tlq+r3*2+32]
   1391    movu                 m2, [tlq+r3*2+34]
   1392    psubw                m2, m0
   1393    pmulhrsw             m2, m3
   1394    movu                 m1, [tlq+r3*2+48]
   1395    paddw                m0, m2
   1396    movu                 m2, [tlq+r3*2+50]
   1397    psubw                m2, m1
   1398    pmulhrsw             m2, m3
   1399    paddw                m1, m2
   1400    movddup              m2, [base+pw_m1024]
   1401    movddup              m3, [base+pw_m1536]
   1402    pcmpgtw              m2, m4
   1403    pcmpgtw              m3, m4
   1404    paddw                m4, m5
   1405    pand                 m0, m2
   1406    pandn                m2, m6
   1407    pand                 m1, m3
   1408    pandn                m3, m6
   1409    por                  m0, m2
   1410    mova        [dstq+16*2], m0
   1411    por                  m1, m3
   1412    mova        [dstq+16*3], m1
   1413    dec                  hd
   1414    jz .w32_end
   1415    movifnidn       strideq, stridemp
   1416    add                dstq, strideq
   1417    add                  r5, dxq
   1418    jl .w32_loop
   1419 .w32_end_loop:
   1420    REPX {mova [dstq+16*x], m6}, 0, 1, 2, 3
   1421    add                dstq, strideq
   1422    dec                  hd
   1423    jg .w32_end_loop
   1424 .w32_end:
   1425    RET
   1426 .w64:
   1427    lea                 r3d, [hq+63]
   1428    test             angled, 0x400 ; !enable_intra_edge_filter
   1429    jnz .w64_main
   1430    call .filter_copy
   1431    call .filter_edge_s3
   1432 .w64_main:
   1433    lea                 tlq, [tlq+r3*2]
   1434    movd                 m5, dxd
   1435    mova                 m4, [base+z_base_inc]
   1436    shl                 r3d, 6
   1437    movd                 m6, [tlq] ; top[max_base_x]
   1438    movd                 m1, r3d
   1439    pshufb               m5, m0
   1440    mov                 r5d, dxd ; xpos
   1441    pshufb               m1, m0
   1442    sub                  r5, r3
   1443    psubw                m4, m1 ; max_base_x
   1444    pshufb               m6, m0
   1445    paddw                m4, m5
   1446 .w64_loop:
   1447    mov                  r3, r5
   1448    sar                  r3, 6
   1449    movu                 m0, [tlq+r3*2+ 0]
   1450    movu                 m2, [tlq+r3*2+ 2]
   1451    pand                 m3, m7, m4
   1452    psllw                m3, 9
   1453    psubw                m2, m0
   1454    pmulhrsw             m2, m3
   1455    movu                 m1, [tlq+r3*2+16]
   1456    paddw                m0, m2
   1457    movu                 m2, [tlq+r3*2+18]
   1458    psubw                m2, m1
   1459    pmulhrsw             m2, m3
   1460    paddw                m1, m2
   1461    psraw                m2, m4, 15
   1462    pand                 m0, m2
   1463    pandn                m2, m6
   1464    por                  m0, m2
   1465    movddup              m2, [base+pw_m512]
   1466    pcmpgtw              m2, m4
   1467    pand                 m1, m2
   1468    pandn                m2, m6
   1469    mova        [dstq+16*0], m0
   1470    por                  m1, m2
   1471    mova        [dstq+16*1], m1
   1472    movu                 m0, [tlq+r3*2+32]
   1473    movu                 m2, [tlq+r3*2+34]
   1474    psubw                m2, m0
   1475    pmulhrsw             m2, m3
   1476    movu                 m1, [tlq+r3*2+48]
   1477    paddw                m0, m2
   1478    movu                 m2, [tlq+r3*2+50]
   1479    psubw                m2, m1
   1480    pmulhrsw             m2, m3
   1481    paddw                m1, m2
   1482    movddup              m2, [base+pw_m1024]
   1483    pcmpgtw              m2, m4
   1484    pand                 m0, m2
   1485    pandn                m2, m6
   1486    por                  m0, m2
   1487    movddup              m2, [base+pw_m1536]
   1488    pcmpgtw              m2, m4
   1489    pand                 m1, m2
   1490    pandn                m2, m6
   1491    mova        [dstq+16*2], m0
   1492    por                  m1, m2
   1493    mova        [dstq+16*3], m1
   1494    movu                 m0, [tlq+r3*2+64]
   1495    movu                 m2, [tlq+r3*2+66]
   1496    psubw                m2, m0
   1497    pmulhrsw             m2, m3
   1498    movu                 m1, [tlq+r3*2+80]
   1499    paddw                m0, m2
   1500    movu                 m2, [tlq+r3*2+82]
   1501    psubw                m2, m1
   1502    pmulhrsw             m2, m3
   1503    paddw                m1, m2
   1504    movddup              m2, [base+pw_m2048]
   1505    pcmpgtw              m2, m4
   1506    pand                 m0, m2
   1507    pandn                m2, m6
   1508    por                  m0, m2
   1509    movddup              m2, [base+pw_m2560]
   1510    pcmpgtw              m2, m4
   1511    pand                 m1, m2
   1512    pandn                m2, m6
   1513    mova        [dstq+16*4], m0
   1514    por                  m1, m2
   1515    mova        [dstq+16*5], m1
   1516    movu                 m0, [tlq+r3*2+96]
   1517    movu                 m2, [tlq+r3*2+98]
   1518    psubw                m2, m0
   1519    pmulhrsw             m2, m3
   1520    movu                 m1, [tlq+r3*2+112]
   1521    paddw                m0, m2
   1522    movu                 m2, [tlq+r3*2+114]
   1523    psubw                m2, m1
   1524    pmulhrsw             m2, m3
   1525    paddw                m1, m2
   1526    movddup              m2, [base+pw_m3072]
   1527    movddup              m3, [base+pw_m3584]
   1528    pcmpgtw              m2, m4
   1529    pcmpgtw              m3, m4
   1530    paddw                m4, m5
   1531    pand                 m0, m2
   1532    pandn                m2, m6
   1533    pand                 m1, m3
   1534    pandn                m3, m6
   1535    por                  m0, m2
   1536    mova        [dstq+16*6], m0
   1537    por                  m1, m3
   1538    mova        [dstq+16*7], m1
   1539    dec                  hd
   1540    jz .w64_end
   1541    movifnidn       strideq, stridemp
   1542    add                dstq, strideq
   1543    add                  r5, dxq
   1544    jl .w64_loop
   1545 .w64_end_loop:
   1546    REPX {mova [dstq+16*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
   1547    add                dstq, strideq
   1548    dec                  hd
   1549    jg .w64_end_loop
   1550 .w64_end:
   1551    RET
   1552 ALIGN function_align
   1553 .filter_copy:
   1554    pshuflw              m2, [tlq-2], q0000
   1555    pshuflw              m3, [tlq+r3*2], q0000
   1556    xor                 r5d, r5d
   1557    movd   [rsp+gprsize+12], m2
   1558 .filter_copy_loop:
   1559    movu                 m1, [tlq+r5*2+16*0]
   1560    movu                 m2, [tlq+r5*2+16*1]
   1561    add                 r5d, 16
   1562    mova [rsp+r5*2+gprsize-16*1], m1
   1563    mova [rsp+r5*2+gprsize-16*0], m2
   1564    cmp                 r5d, r3d
   1565    jle .filter_copy_loop
   1566    lea                 tlq, [rsp+gprsize+16*1]
   1567    movq       [tlq+r3*2+2], m3
   1568    ret
   1569 .filter_edge:
           ; Smooth the edge in place for filter strengths 1 and 2.
           ; In: tlq = edge pixels, r3d = pixel count, r5d = strength.
           ; out[x] = (k0*in[x] + k1*(in[x-1] + in[x+1]) + 8) >> 4
           ; with k0/k1 loaded from the z_filt_k table; strength 3 uses a
           ; wider kernel and is handled separately below.
   1570    cmp                 r5d, 3
   1571    je .filter_edge_s3
   1572    movddup              m4, [base+z_filt_k+r5*8-8] ; k0 (center tap)
   1573    movddup              m5, [base+z_filt_k+r5*8+8] ; k1 (neighbor tap)
   1574    xor                 r5d, r5d
   1575    movddup              m6, [base+pw_8]            ; rounding bias
   1576    movu                 m2, [tlq-2]
   1577    jmp .filter_edge_start
   1578 .filter_edge_loop:
           ; Results are written one iteration late so the unfiltered
           ; neighbors are still available as inputs for the next 8 pixels.
   1579    movu                 m2, [tlq+r5*2-2]
   1580    mova      [tlq+r5*2-16], m1
   1581 .filter_edge_start:
   1582    pmullw               m1, m4, [tlq+r5*2]
   1583    movu                 m3, [tlq+r5*2+2]
   1584    paddw                m2, m3
   1585    pmullw               m2, m5
   1586    add                 r5d, 8
   1587    paddw                m1, m6
   1588    paddw                m1, m2
   1589    psrlw                m1, 4
   1590    cmp                 r5d, r3d
   1591    jl .filter_edge_loop
   1592    mova      [tlq+r5*2-16], m1 ; flush the final 8 pixels
   1593    ret
   1594 .filter_edge_s3:
           ; Strength-3 smoothing: 5-tap (1 2 2 2 1)/8 kernel, evaluated as
           ; out[x] = (in[x-1] + in[x] + in[x+1]
           ;           + ((in[x-2] + in[x+2] + 4) >> 1)) >> 2
           ; where the inner half-sum comes from pavgw on (in[x-2]+3) and
           ; in[x+2], giving the +4 rounding of the full 8-tap divide.
   1595    movddup              m5, [base+pw_3]
   1596    xor                 r5d, r5d
   1597    movu                 m2, [tlq-2]
   1598    movu                 m3, [tlq-4]
   1599    jmp .filter_edge_s3_start
   1600 .filter_edge_s3_loop:
           ; As in .filter_edge, stores trail the loads by one iteration.
   1601    movu                 m2, [tlq+r5*2-2]
   1602    movu                 m3, [tlq+r5*2-4]
   1603    mova      [tlq+r5*2-16], m1
   1604 .filter_edge_s3_start:
   1605    paddw                m2, [tlq+r5*2+0]
   1606    paddw                m3, m5
   1607    movu                 m1, [tlq+r5*2+2]
   1608    movu                 m4, [tlq+r5*2+4]
   1609    add                 r5d, 8
   1610    paddw                m1, m2
   1611    pavgw                m3, m4
   1612    paddw                m1, m3
   1613    psrlw                m1, 2
   1614    cmp                 r5d, r3d
   1615    jl .filter_edge_s3_loop
   1616    mova      [tlq+r5*2-16], m1 ; flush the final 8 pixels
   1617    ret
   1618 
   1619 %if ARCH_X86_64
           ; Directional intra prediction, zone 2 (90 < angle < 180), 16 bpc.
           ; Predicts from both the top and the left edge.  The neighbors are
           ; staged in a stack buffer:
           ;   rsp+16*5 .. 16*12  left neighbors (copied from tlq-128..-2)
           ;   rsp+16*13          top-left pixel, replicated
           ;   rsp+16*14 ..       top neighbors (from tlq+2)
           ; so that a single signed index can address both edges.
   1620 cglobal ipred_z2_16bpc, 4, 12, 11, 16*24, dst, stride, tl, w, h, angle, dx, _, dy
   1621    %define            base  r7-$$
   1622    %define           maxwm  r6m
   1623    %define           maxhm  r7m
   1624    %define          bdmaxm  r8m
   1625    lea                  r7, [$$]
   1626    mov                  hd, hm
   1627    movddup              m8, [base+pw_62]
   1628    lea                 r9d, [wq-4]
   1629    shl                 r9d, 6
   1630    mova                 m9, [base+z2_top_shufA]
           ; r9d = ((w-4)<<6) | h; the upper part counts remaining 4-wide
           ; column strips, the low byte holds the row count.
   1631    or                  r9d, hd
   1632    mova                m10, [base+z2_left_shufA]
   1633 %else
           ; x86-32: not enough registers, so r9-r11 and the stride are
           ; emulated through dedicated stack slots via %defines.
   1634 cglobal ipred_z2_16bpc, 4, 7, 8, -16*27, dst, _, tl, w, h, angle, dx
   1635    %define            base  r1-$$
   1636    %define             r9b  byte  [rsp+16*26+4*0]
   1637    %define             r9d  dword [rsp+16*26+4*0]
   1638    %define            r10d  dword [rsp+16*26+4*1]
   1639    %define            r11d  dword [rsp+16*26+4*2]
   1640    %define           maxwm  [rsp+16*2+4*0]
   1641    %define           maxhm  [rsp+16*2+4*1]
   1642    %define          bdmaxm  [rsp+16*2+4*2]
   1643    %define        stridemp  [rsp+16*26+4*3]
   1644    %define         strideq  r3
   1645    %define             dyd  r4
   1646    %define             dyq  r4
   1647    mov            stridemp, r1
   1648    mov                 r1d, r6m
   1649    mov                 r4d, r7m
   1650    mov                 r5d, r8m
   1651    mov               maxwm, r1d
   1652    mov               maxhm, r4d
   1653    mov              bdmaxm, r5d
   1654    LEA                  r1, $$
   1655    lea                  hd, [wq-4]
   1656    mova                 m0, [base+z2_top_shufA]
   1657    shl                  hd, 6
   1658    mova                 m1, [base+z2_left_shufA]
   1659    or                   hd, hm
           ; The shuffle masks that live in m9/m10 on 64-bit are spilled to
           ; rsp+16*24 / rsp+16*25 here.
   1660    mova        [rsp+16*24], m0
   1661    mov                 r9d, hd
   1662    mova        [rsp+16*25], m1
   1663 %endif
   1664    tzcnt                wd, wd
   1665    movifnidn        angled, anglem
           ; Copy 64 left-neighbor pixels into the stack buffer (interleaved
           ; with scalar setup to hide load latency).
   1666    mova                 m0, [tlq-16*8]
   1667    mova                 m1, [tlq-16*7]
   1668    mova                 m2, [tlq-16*6]
   1669    mova                 m3, [tlq-16*5]
   1670    movsxd               wq, [base+ipred_z2_16bpc_ssse3_table+wq*4]
   1671 %if ARCH_X86_64
   1672    movzx               dxd, angleb
   1673 %else
   1674    movzx               dxd, byte anglem
   1675 %endif
   1676    mova                 m4, [tlq-16*4]
   1677    mova                 m5, [tlq-16*3]
   1678    mova                 m6, [tlq-16*2]
   1679    mova                 m7, [tlq-16*1]
   1680    mova        [rsp+16* 5], m0
   1681    xor              angled, 0x400
   1682    mova        [rsp+16* 6], m1
   1683    mov                 dyd, dxd
   1684    mova        [rsp+16* 7], m2
   1685    neg                 dxq
   1686    mova        [rsp+16* 8], m3
   1687    and                 dyd, ~1
   1688    mova        [rsp+16* 9], m4
   1689    and                 dxq, ~1
   1690    mova        [rsp+16*10], m5
   1691    lea                  wq, [base+ipred_z2_16bpc_ssse3_table+wq]
   1692    mova        [rsp+16*11], m6
   1693    pxor                 m3, m3
   1694    mova        [rsp+16*12], m7
           ; Per-pixel steps looked up from the derivative table.
   1695    movzx               dyd, word [base+dr_intra_derivative+dyq-90]  ; angle - 90
   1696    movzx               dxd, word [base+dr_intra_derivative+dxq+180] ; 180 - angle
   1697    movddup              m0, [base+pw_256] ; 4<<6
   1698    movd                 m4, [tlq]
   1699    movu                 m5, [tlq+16*0+2]
   1700    movu                 m6, [tlq+16*1+2]
   1701    movsldup             m1, [base+z2_dy_offset]
   1702    pshufb               m4, m0
   1703    movq                 m7, [base+z_base_inc+2]
           ; base_x index 112 maps to rsp+16*14 (the first top pixel);
           ; r11d is the xpos threshold below which only the left edge
           ; contributes.
   1704    mov                r11d, (112-4)<<6
   1705    mova        [rsp+16*13], m4
   1706    neg                 dxd
   1707    mova        [rsp+16*14], m5
           ; Pack the per-4-row base_y step (4) into the second word of dy;
           ; .w4_main splits it back out with pshuflw q1111.
   1708    or                  dyd, 4<<16
   1709    mova        [rsp+16*15], m6
   1710 %if ARCH_X86_64
   1711    lea                r10d, [dxq+(112<<6)] ; xpos
   1712 %else
   1713    mov           [rsp+8*3], dyd
   1714    lea                 r4d, [dxq+(112<<6)]
   1715    mov                r10d, r4d
   1716    movzx                hd, r9b
   1717 %endif
   1718    movq          [rsp+8*0], m1
   1719    movq          [rsp+8*1], m0
   1720    movq          [rsp+8*2], m7
   1721    jmp                  wq
   1722 .w4:
           ; w = 4.  Bit 0x400 of angle = !enable_intra_edge_filter; if set,
           ; skip all edge filtering/upsampling.
   1723    test             angled, 0x400
   1724    jnz .w4_main
   1725    lea                 r3d, [hq+2]
   1726    add              angled, 1022
           ; Replicate the last top pixel as right-side padding for the
           ; upsample/filter taps.
   1727    pshuflw              m1, m5, q3333
   1728    shl                 r3d, 6
   1729    movq      [rsp+16*14+8], m1
   1730    test                r3d, angled
   1731    jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
   1732    call .upsample_above
           ; Decide whether the left edge needs filtering: broadcast the
           ; size and angle bytes and compare against the per-size threshold
           ; tables; m7 becomes the per-byte strength mask consumed by
           ; .w8_filter_left.
   1733    sub              angled, 1075 ; angle - 53
   1734    lea                 r3d, [hq+3]
   1735    xor              angled, 0x7f ; 180 - angle
   1736    movd                 m2, r3d
   1737    movd                 m7, angled
   1738    shr              angled, 8 ; is_sm << 1
   1739    pshufb               m2, m3
   1740    pshufb               m7, m3
   1741    pcmpeqb              m2, [base+z_filt_wh4]
   1742    pand                 m7, m2
   1743    pcmpgtb              m7, [base+z_filt_t_w48+angleq*8]
   1744    jmp .w8_filter_left
   1745 .upsample_above: ; w4/w8
           ; Double the horizontal resolution of the top edge.  Interpolated
           ; pixels use the (-1 9 9 -1 +8) >> 4 kernel, built here as
           ; s = in[x]+in[x+1]; s += (s - (in[x-1]+in[x+2])) >> 3; then
           ; clamp to [0, bdmax] and halve with pavgw.  dx and the xpos
           ; fraction step are doubled to compensate, and the top shuffle /
           ; base offsets are switched to the upsampled (B) variants.
   1746    paddw                m2, m5, [tlq]
   1747    movu                 m1, [rsp+gprsize+16*14+2]
   1748    movu                 m4, [rsp+gprsize+16*14-4]
   1749 %if ARCH_X86_64
   1750    movd                 m6, r9m ; bdmax, offset due to call
   1751 %else
   1752    movd                 m6, [rsp+gprsize+16*2+4*2]
   1753 %endif
   1754    paddw                m4, m1
   1755    psubw                m1, m2, m4
   1756    pshufb               m6, m0
   1757    psraw                m1, 3
   1758    paddw                m2, m1
   1759    add                 dxd, dxd
   1760    pmaxsw               m2, m3
   1761    paddw                m7, m7
   1762    pavgw                m2, m3
   1763    pminsw               m2, m6
   1764 %if ARCH_X86_64
   1765    mova                 m9, [base+z2_top_shufB]
   1766    lea                r10d, [dxq+(113<<6)]
   1767    mov                r11d, (112-7)<<6
   1768 %else
   1769    mova                 m1, [base+z2_top_shufB]
   1770    lea                 r3d, [dxq+(113<<6)]
   1771    mov dword [rsp+gprsize+16*26+4*2], (112-7)<<6
   1772    mov [rsp+gprsize+16*26+4*1], r3d
   1773    mova [rsp+gprsize+16*24], m1
   1774 %endif
           ; Interleave interpolated and original pixels back into the
           ; top-edge buffer.
   1775    punpcklwd            m1, m2, m5
   1776    punpckhwd            m2, m5
   1777    movq  [rsp+gprsize+8*2], m7
   1778    mova [rsp+gprsize+16*14], m1
   1779    mova [rsp+gprsize+16*15], m2
   1780    ret
   1781 .w4_no_upsample_above:
           ; Top edge is not upsampled: compute its filter strength from the
           ; angle/size tables and apply it via .w8_filter_top (r3d = width),
           ; then check whether the left edge qualifies for upsampling.
   1782    lea                 r3d, [hq+3]
   1783    mov          [rsp+16*4], angled ; save; clobbered by the masks below
   1784    sub              angled, 1112 ; angle - 90
   1785    movd                 m2, r3d
   1786    mov                 r3d, 90
   1787    movd                 m1, angled
   1788    sub                 r3d, angled ; 180 - angle
   1789    shr              angled, 8 ; is_sm << 1
   1790    mova                 m4, [base+z_filt_wh4]
   1791    movd                 m7, r3d
   1792    mova                 m5, [base+z_filt_t_w48+angleq*8]
   1793    mov                 r3d, 4
   1794    call .w8_filter_top
   1795    mov              angled, [rsp+16*4]
   1796    lea                 r3d, [hq+2]
   1797    sub              angled, 139
   1798    shl                 r3d, 6
   1799    test                r3d, angled
   1800    jnz .w8_filter_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
   1801 .upsample_left: ; w4/w8
           ; Double the vertical resolution of the left edge with the same
           ; (-1 9 9 -1 +8) >> 4 kernel as .upsample_above, clamped to
           ; [0, bdmax].  dy is doubled to compensate and the left shuffle
           ; is switched to the upsampled (B) variant.
   1802    mova                 m2, [tlq-16]
   1803    lea                 r3d, [hq-4]
   1804    movu                 m3, [tlq-14]
   1805    movu                 m4, [rsp+16*12+4]
           ; z2_upsample_l selects the h-dependent padding permutation.
   1806    pshufb               m1, m2, [base+z2_upsample_l+r3*4]
   1807    movd                 m6, bdmaxm
   1808    pxor                 m5, m5
   1809    paddw                m3, m2
   1810    paddw                m4, m1
   1811    psubw                m1, m3, m4
   1812    movshdup             m4, [base+z2_dy_offset]
   1813    psraw                m1, 3
   1814    pshufb               m6, m0
   1815    paddw                m3, m1
   1816    pmaxsw               m3, m5
   1817    pavgw                m3, m5
   1818    pminsw               m3, m6
   1819 %if ARCH_X86_64
   1820    mova                m10, [base+z2_left_shufB]
   1821    add                 dyd, dyd
   1822 %else
   1823    mova                 m1, [base+z2_left_shufB]
   1824    shl     dword [rsp+8*3], 1
   1825    mova        [rsp+16*25], m1
   1826 %endif
           ; Interleave interpolated and original pixels back into the
           ; left-edge buffer.
   1827    punpckhwd            m1, m2, m3
   1828    punpcklwd            m2, m3
   1829    movq          [rsp+8*0], m4
   1830    mova        [rsp+16*12], m1
   1831    mova        [rsp+16*11], m2
   1832 .w4_main:
           ; Main prediction loop, shared by all widths: the block is
           ; processed in 4-column strips (outer count in the high bits of
           ; r9d), 4 rows per inner iteration.  For each row a base_x index
           ; is derived from xpos (r2d, stepped by dx) into the combined
           ; edge buffer; the 6-bit fraction (pw_62 mask, shifted <<9 for
           ; pmulhrsw) interpolates between adjacent top pixels.  Rows whose
           ; base_x falls left of the top-left corner are replaced with
           ; left-edge predictions addressed by the base_y bytes kept at
           ; rsp+8*6.
   1833    movd                 m6, dxd
   1834 %if ARCH_X86_64
   1835    movd                 m3, dyd
   1836 %else
   1837    movd                 m3, [rsp+8*3]
   1838 %endif
   1839    pshufb               m6, m0
   1840    movddup              m0, [rsp+8*2]
   1841    paddw                m7, m6, m6
   1842    movq                 m5, [base+pw_m1to4]
           ; Split dy (low word) and the per-strip step (word 1, see the
           ; "or dyd, 4<<16" in the setup) back apart.
   1843    pshuflw              m4, m3, q0000
   1844    punpcklqdq           m6, m7
   1845    pmullw               m4, m5
   1846    pshuflw              m3, m3, q1111
   1847    paddw                m6, m0
   1848    mov                 r2d, r10d
   1849    pshuflw              m0, m4, q3333
   1850    psubw                m4, [rsp+8*0]
   1851    movq          [rsp+8*3], m3
   1852    movq          [rsp+8*5], m0 ; dy*4
   1853    mov                  r5, dstq
   1854 .w4_loop0:
           ; Per-strip setup: split ypos into integer base_y (rsp+8*6) and
           ; fraction (rsp+8*7).
   1855    mova         [rsp+16*4], m6
   1856    movq          [rsp+8*4], m4
   1857 %if ARCH_X86_64
   1858    pand                 m0, m8, m4
   1859 %else
   1860    movq                 m0, [base+pw_62]
   1861    pand                 m0, m4
   1862 %endif
   1863    psraw                m4, 6
   1864    psllw                m0, 9 ; frac_y << 9
   1865    movq          [rsp+8*7], m0
   1866    pabsw                m4, m4
   1867    movq          [rsp+8*6], m4
   1868    movzx                hd, r9b
   1869 .w4_loop:
           ; Derive four base_x indices from xpos and gather top pixels.
   1870    lea                 r3d, [r2+dxq]
   1871    shr                 r2d, 6        ; base_x0
   1872    movu                 m2, [rsp+r2*2]
   1873    lea                 r2d, [r3+dxq]
   1874    shr                 r3d, 6        ; base_x1
   1875    movu                 m1, [rsp+r3*2]
   1876    lea                 r3d, [r2+dxq]
   1877    shr                 r2d, 6        ; base_x2
   1878    movu                 m3, [rsp+r2*2]
   1879    lea                 r2d, [r3+dxq]
   1880    shr                 r3d, 6        ; base_x3
   1881    movu                 m4, [rsp+r3*2]
   1882 %if ARCH_X86_64
   1883    REPX     {pshufb x, m9}, m2, m1, m3, m4
   1884 %else
   1885    mova                 m0, [rsp+16*24]
   1886    REPX     {pshufb x, m0}, m2, m1, m3, m4
   1887 %endif
   1888    punpcklqdq           m0, m2, m1
   1889    punpckhqdq           m2, m1
   1890    punpcklqdq           m1, m3, m4
   1891    punpckhqdq           m3, m4
           ; Interpolate between the two top-pixel candidates per lane with
           ; the 6-bit x fraction.
   1892 %if ARCH_X86_64
   1893    pand                 m5, m8, m6
   1894 %else
   1895    movddup              m5, [base+pw_62]
   1896    pand                 m5, m6
   1897 %endif
   1898    psllw                m5, 9
   1899    psubw                m2, m0
   1900    pmulhrsw             m2, m5
   1901    paddw                m5, m6, m7
   1902    psubw                m3, m1
   1903    paddw                m0, m2
   1904 %if ARCH_X86_64
   1905    pand                 m2, m8, m5
   1906 %else
   1907    movddup              m2, [base+pw_62]
   1908    pand                 m2, m5
   1909 %endif
   1910    psllw                m2, 9
   1911    pmulhrsw             m3, m2
   1912    paddw                m1, m3
           ; If every base_x is right of the top-left corner (index 111),
           ; the left edge is not needed for these rows.
   1913    cmp                 r3d, 111 ; topleft
   1914    jge .w4_toponly
   1915    mova        [rsp+16*22], m0
   1916    mova        [rsp+16*23], m1
           ; Gather four rows of left-edge pixels via the stored base_y
           ; byte indices.
   1917    movzx               r3d, byte [rsp+8*6+0] ; base_y0
   1918    movu                 m3, [rsp+r3*2]
   1919    movzx               r3d, byte [rsp+8*6+2] ; base_y1
   1920    movu                 m2, [rsp+r3*2]
   1921    movzx               r3d, byte [rsp+8*6+4] ; base_y2
   1922    movu                 m4, [rsp+r3*2]
   1923    movzx               r3d, byte [rsp+8*6+6] ; base_y3
   1924    movu                 m0, [rsp+r3*2]
   1925 %if ARCH_X86_64
   1926    REPX    {pshufb x, m10}, m3, m2, m4, m0
   1927 %else
   1928    mova                 m1, [rsp+16*25]
   1929    REPX     {pshufb x, m1}, m3, m2, m4, m0
   1930 %endif
   1931    punpcklwd            m1, m3, m2
   1932    punpckhwd            m3, m2     ; 01
   1933    punpcklwd            m2, m4, m0
   1934    punpckhwd            m4, m0     ; 23
   1935    punpckldq            m0, m1, m2 ; y0 d1
   1936    punpckhdq            m1, m2     ; y2 y3
   1937    punpckldq            m2, m3, m4
   1938    punpckhdq            m3, m4
           ; Interpolate the left-edge candidates with the y fraction, then
           ; merge top/left results using the xpos sign as a per-lane mask.
   1939    movddup              m4, [rsp+8*7]
   1940    psubw                m2, m0
   1941    psubw                m3, m1
   1942    pmulhrsw             m2, m4
   1943    pmulhrsw             m3, m4
   1944    psraw                m6, 15       ; base_x < topleft
   1945    psraw                m4, m5, 15
   1946    paddw                m0, m2
   1947    paddw                m1, m3
   1948    pand                 m0, m6
   1949    pandn                m6, [rsp+16*22]
   1950    pand                 m1, m4
   1951    pandn                m4, [rsp+16*23]
   1952    por                  m0, m6
   1953    por                  m1, m4
   1954 .w4_toponly:
           ; Store 4 rows of 4 pixels, then advance xpos/base_y.
   1955    movifnidn       strideq, stridemp
   1956    movq   [dstq+strideq*0], m0
   1957    movhps [dstq+strideq*1], m0
   1958    lea                dstq, [dstq+strideq*2]
   1959    movq   [dstq+strideq*0], m1
   1960    movhps [dstq+strideq*1], m1
   1961    sub                  hd, 4
   1962    jz .w4_end
   1963    movq                 m4, [rsp+8*6]
   1964    paddsw               m6, m5, m7   ; xpos += dx
   1965    movq                 m5, [rsp+8*3]
   1966    psubw                m4, m5
   1967    lea                dstq, [dstq+strideq*2]
   1968    movq          [rsp+8*6], m4
   1969    cmp                 r2d, r11d
   1970    jge .w4_loop
   1971 .w4_leftonly_loop:
           ; xpos has moved entirely left of the top edge: all remaining
           ; rows come from the left edge only, no masking needed.
   1972    movzx               r2d, byte [rsp+8*6+0] ; base_y0
   1973    movu                 m3, [rsp+r2*2]
   1974    movzx               r2d, byte [rsp+8*6+2] ; base_y1
   1975    movu                 m2, [rsp+r2*2]
   1976    movzx               r2d, byte [rsp+8*6+4] ; base_y2
   1977    movu                 m6, [rsp+r2*2]
   1978    movzx               r2d, byte [rsp+8*6+6] ; base_y3
   1979    movu                 m0, [rsp+r2*2]
   1980    psubw                m4, m5
   1981 %if ARCH_X86_64
   1982    REPX    {pshufb x, m10}, m3, m2, m6, m0
   1983 %else
   1984    mova                 m1, [rsp+16*25]
   1985    REPX     {pshufb x, m1}, m3, m2, m6, m0
   1986 %endif
   1987    movq          [rsp+8*6], m4
   1988    punpcklwd            m1, m3, m2
   1989    punpckhwd            m3, m2
   1990    punpcklwd            m2, m6, m0
   1991    punpckhwd            m6, m0
   1992    punpckldq            m0, m1, m2
   1993    punpckhdq            m1, m2
   1994    punpckldq            m2, m3, m6
   1995    punpckhdq            m3, m6
   1996    movddup              m6, [rsp+8*7]
   1997    psubw                m2, m0
   1998    psubw                m3, m1
   1999    pmulhrsw             m2, m6
   2000    pmulhrsw             m3, m6
   2001    paddw                m0, m2
   2002    paddw                m1, m3
   2003    movq   [dstq+strideq*0], m0
   2004    movhps [dstq+strideq*1], m0
   2005    lea                dstq, [dstq+strideq*2]
   2006    movq   [dstq+strideq*0], m1
   2007    movhps [dstq+strideq*1], m1
   2008    lea                dstq, [dstq+strideq*2]
   2009    sub                  hd, 4
   2010    jg .w4_leftonly_loop
   2011 .w4_end:
           ; Advance to the next 4-column strip (strip count lives in the
           ; high bits of r9d).
   2012    sub                 r9d, 1<<8
   2013    jl .w4_ret
   2014    movq                 m4, [rsp+8*5]
   2015    add                  r5, 8
   2016    mov                dstq, r5
   2017    paddw                m4, [rsp+8*4] ; base_y += 4*dy
   2018    movzx               r2d, word [rsp+8*1]
   2019    movddup              m6, [rsp+8*1]
   2020    paddw                m6, [rsp+16*4] ; base_x += (4 << upsample_above)
   2021    add                 r2d, r10d
   2022    mov                r10d, r2d
   2023    jmp .w4_loop0
   2024 .w4_ret:
   2025    RET
   2026 .w8:
           ; w = 8.  Same structure as .w4: decide between upsampling and
           ; filtering the top edge, then fall into the shared main loop.
   2027    test             angled, 0x400
   2028    jnz .w4_main
   2029    lea                 r3d, [angleq+126]
           ; Replicate the last top pixel as right-side padding.
   2030    pshufhw              m1, m5, q3333
   2031 %if ARCH_X86_64
   2032    mov                 r3b, hb
   2033 %else
   2034    xor                 r3b, r3b
   2035    or                  r3d, hd
   2036 %endif
   2037    movhps      [rsp+16*15], m1
   2038    cmp                 r3d, 8
   2039    ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
   2040    call .upsample_above
           ; Build the left-edge filter-strength mask (m7) from the lookup
           ; tables, as in .w4.
   2041    sub              angled, 53
   2042    lea                 r3d, [hq+7]
   2043    xor              angled, 0x7f ; 180 - angle
   2044    movu                 m1, [base+z_filt_wh8]
   2045    movd                 m2, r3d
   2046    movd                 m7, angled
   2047    shr              angled, 8 ; is_sm << 1
   2048    psrldq               m4, [base+z_filt_t_w48+angleq*8], 4
   2049    pshufb               m2, m3
   2050    pshufb               m7, m3
   2051    pcmpeqb              m2, m1
   2052    movq                 m1, [base+pw_512]
   2053    pand                 m7, m2
   2054    pcmpgtb              m7, m4
   2055    movq          [rsp+8*1], m1 ; 8<<6
   2056    jmp .w8_filter_left
   2057 .w8_no_upsample_above:
           ; Filter (rather than upsample) the top edge for w = 8, then
           ; check the left edge for upsampling eligibility.
   2058    lea                 r3d, [hq+7]
   2059    mov          [rsp+16*4], angled ; save; clobbered by the masks below
   2060    sub              angled, 90
   2061    movd                 m2, r3d
   2062    mov                 r3d, 90
   2063    movd                 m1, angled
   2064    sub                 r3d, angled ; 180 - angle
   2065    shr              angled, 8 ; is_sm << 1
   2066    movu                 m4, [base+z_filt_wh8]
   2067    movd                 m7, r3d
   2068    psrldq               m5, [base+z_filt_t_w48+angleq*8], 4
   2069    mov                 r3d, 8
   2070    call .w8_filter_top
   2071    mov                 r3d, [rsp+16*4]
   2072    sub                 r3d, 141
   2073 %if ARCH_X86_64
   2074    mov                 r3b, hb
   2075 %else
   2076    xor                 r3b, r3b
   2077    or                  r3d, hd
   2078 %endif
   2079    cmp                 r3d, 8
   2080    jbe .upsample_left ; angle > 140 && h <= 8 && !is_sm
   2081 .w8_filter_left:
           ; Apply the strength mask in m7 to the left edge: condense the
           ; per-byte mask into a scalar strength (the 0x55555555 multiply +
           ; shr 30 reduction), pad the edge below the last row, and run the
           ; shared z3 edge filter on the left-neighbor buffer.
   2082    pmovmskb            r5d, m7
   2083    test                r5d, r5d
   2084    jz .w4_main ; filter_strength == 0
   2085    imul                r5d, 0x55555555
   2086    neg                  hq
   2087    mov                  r3, tlq
   2088    movd                 m1, [tlq+hq*2]
   2089    shr                 r5d, 30 ; filter_strength
   2090    lea                 tlq, [rsp+16*13-2]
   2091    pshuflw              m1, m1, q0000
   2092    movq       [tlq+hq*2-6], m1
   2093    call mangle(private_prefix %+ _ipred_z3_16bpc_ssse3).filter_edge
   2094    jmp .filter_left_end
   2095 .w8_filter_top:
           ; Shared top-edge filter helper (w4/w8).
           ; In: m2 = size byte, m1 = angle delta, m7 = 180-angle, m3 = 0,
           ;     m4/m5 = threshold tables, r3d = width.
           ; Derives the filter strength from the table comparisons and, if
           ; nonzero, runs the z1 edge filter over the top buffer.  [dstq]
           ; is borrowed as a spill slot for tlq around the call.
   2096    REPX     {pshufb x, m3}, m2, m1, m7
   2097    pcmpeqb              m2, m4
   2098    pand                 m1, m2
   2099    pand                 m7, m2
   2100    pcmpgtb              m1, m5
   2101    pcmpgtb              m7, m5
   2102    pmovmskb            r5d, m1
   2103    test                r5d, r5d
   2104    jz .w8_filter_top_end ; filter_strength == 0
   2105    imul                r5d, 0x55555555
   2106    mov              [dstq], tlq
   2107    lea                 tlq, [rsp+16*14+gprsize]
   2108    shr                 r5d, 30 ; filter_strength
   2109    call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge
   2110 %if ARCH_X86_64
   2111    mov                 r3d, r7m ; maxw, offset due to call
   2112 %else
   2113    mov                 r3d, [rsp+16*2+4*1]
   2114 %endif
   2115    mov                 tlq, [dstq]
   2116    cmp                 r3d, 8
   2117    jge .w8_filter_top_end
           ; maxw < 8: restore the pixels beyond maxw from the unfiltered
           ; source edge.
   2118    movu                 m1, [tlq+r3*2+16*0+2]
   2119    movu                 m2, [tlq+r3*2+16*1+2]
   2120    movu [rsp+r3*2+16*14+gprsize], m1
   2121    movu [rsp+r3*2+16*15+gprsize], m2
   2122 .w8_filter_top_end:
   2123    ret
   2124 .w16:
           ; w = 16: no upsampling possible, only filtering.  Strengths come
           ; from the w16 tables; the 0x24924924 multiply + shr 30 + adc -1
           ; sequence condenses the per-byte mask into a scalar strength.
   2125    test             angled, 0x400
   2126    jnz .w4_main
   2127    lea                 r3d, [hq+15]
   2128    sub              angled, 90
   2129    movd                 m2, r3d
   2130    mov                 r3d, 90
   2131    movd                 m1, angled
   2132    sub                 r3d, angled ; 180 - angle
   2133    shr              angled, 8 ; is_sm << 1
   2134    movd                 m7, r3d
   2135    REPX     {pshufb x, m3}, m2, m1, m7
   2136    movq                 m4, [base+z_filt_t_w16+angleq*4]
   2137    pcmpeqb              m2, [base+z_filt_wh16]
   2138    pand                 m1, m2
   2139    pand                 m7, m2
   2140    pcmpgtb              m1, m4
   2141    pcmpgtb              m7, m4
   2142    pmovmskb            r5d, m1
   2143    test                r5d, r5d
   2144    jz .w16_filter_left ; filter_strength == 0
   2145    imul                r5d, 0x24924924
           ; Pad past the last top pixel, then filter the top buffer.
   2146    pshufhw              m6, m6, q3333
   2147    mov              [dstq], tlq
   2148    lea                 tlq, [rsp+16*14]
   2149    shr                 r5d, 30
   2150    movhps       [tlq+16*2], m6
   2151    adc                 r5d, -1 ; filter_strength
   2152    mov                 r3d, 16
   2153    call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge
   2154    mov                 r3d, maxwm
   2155    mov                 tlq, [dstq]
   2156    cmp                 r3d, 16
   2157    jge .w16_filter_left
           ; maxw < 16: restore pixels beyond maxw from the unfiltered edge.
   2158    movu                 m1, [tlq+r3*2+16*0+2]
   2159    movu                 m2, [tlq+r3*2+16*1+2]
   2160    movu   [rsp+r3*2+16*14], m1
   2161    movu   [rsp+r3*2+16*15], m2
   2162 .w16_filter_left:
           ; Same strength reduction for the left edge, then filter it.
   2163    pmovmskb            r5d, m7
   2164    test                r5d, r5d
   2165    jz .w4_main
   2166    imul                r5d, 0x24924924
   2167    neg                  hq
   2168    mov                  r3, tlq
   2169    movd                 m1, [tlq+hq*2]
   2170    shr                 r5d, 30
   2171    lea                 tlq, [rsp+16*13-2]
   2172    pshuflw              m1, m1, q0000
   2173    adc                 r5d, -1 ; filter_strength
   2174    movq       [tlq+hq*2-6], m1
   2175    call mangle(private_prefix %+ _ipred_z3_16bpc_ssse3).filter_edge
   2176    jmp .filter_left_end
   2177 .w32:
           ; w = 32: copy the extra top pixels (17..32) into the buffer; if
           ; edge filtering is enabled it is always the strength-3 kernel.
   2178    movu                 m1, [tlq+16*2+2]
   2179    movu                 m2, [tlq+16*3+2]
   2180    mova        [rsp+16*16], m1
   2181    mova        [rsp+16*17], m2
   2182    test             angled, 0x400
   2183    jnz .w4_main
   2184    mov              [dstq], tlq ; spill tlq around the filter call
   2185    lea                 tlq, [rsp+16*14]
   2186    pshufhw              m2, m2, q3333
   2187    mov                 r3d, 32
   2188    movhps       [tlq+16*4], m2 ; pad past the last top pixel
   2189    call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge_s3
           ; Restore any filtered pixels beyond maxw from the source edge,
           ; 16 pixels per step.
   2190    mov                 r3d, maxwm
   2191    mov                 tlq, [dstq]
   2192    cmp                 r3d, 32
   2193    jge .filter_left
   2194    movu                 m1, [tlq+r3*2+16*0+2]
   2195    movu                 m2, [tlq+r3*2+16*1+2]
   2196    movu   [rsp+r3*2+16*14], m1
   2197    movu   [rsp+r3*2+16*15], m2
   2198    cmp                 r3d, 16
   2199    jge .filter_left
   2200    movu                 m1, [tlq+r3*2+16*2+2]
   2201    movu                 m2, [tlq+r3*2+16*3+2]
   2202    movu   [rsp+r3*2+16*16], m1
   2203    movu   [rsp+r3*2+16*17], m2
   2204 .filter_left:
           ; Strength-3 filter for the left edge (w32/w64 path), with
           ; padding replicated below the last row.
   2205    neg                  hq
   2206    mov                  r3, tlq
   2207    pshuflw              m1, [tlq+hq*2], q0000
   2208    lea                 tlq, [rsp+16*13-2]
   2209    movq       [tlq+hq*2-6], m1
   2210    call mangle(private_prefix %+ _ipred_z3_16bpc_ssse3).filter_edge_s3
   2211 .filter_left_end:
           ; If maxh < h, restore the rows beyond maxh in the left buffer
           ; from the unfiltered neighbors (r3 = original tlq), stepping
           ; 16 rows (two registers) per check.
   2212    mov                 r2d, maxhm
   2213    cmp                 r2d, hd
   2214    jge .w4_main
   2215    neg                  r2
   2216    movu                 m1, [r3+r2*2-16*1]
   2217    movu                 m2, [r3+r2*2-16*2]
   2218    movu   [rsp+r2*2+16*12], m1
   2219    movu   [rsp+r2*2+16*11], m2
   2220    cmp                 r2d, -48
   2221    jle .w4_main
   2222    movu                 m1, [r3+r2*2-16*3]
   2223    movu                 m2, [r3+r2*2-16*4]
   2224    movu   [rsp+r2*2+16*10], m1
   2225    movu   [rsp+r2*2+16* 9], m2
   2226    cmp                 r2d, -32
   2227    jle .w4_main
   2228    movu                 m1, [r3+r2*2-16*5]
   2229    movu                 m2, [r3+r2*2-16*6]
   2230    movu   [rsp+r2*2+16* 8], m1
   2231    movu   [rsp+r2*2+16* 7], m2
   2232    cmp                 r2d, -16
   2233    jle .w4_main
   2234    movu                 m1, [r3+r2*2-16*7]
   2235    movu                 m2, [r3+r2*2-16*8]
   2236    movu   [rsp+r2*2+16* 6], m1
   2237    movu   [rsp+r2*2+16* 5], m2
   2238    jmp .w4_main
   2239 .w64:
           ; w = 64: copy top pixels 17..64 into the buffer; strength-3
           ; filtering when enabled, then maxw/maxh restoration as for w32.
   2240    movu                 m1, [tlq+16*2+2]
   2241    movu                 m2, [tlq+16*3+2]
   2242    movu                 m3, [tlq+16*4+2]
   2243    movu                 m4, [tlq+16*5+2]
   2244    movu                 m5, [tlq+16*6+2]
   2245    movu                 m6, [tlq+16*7+2]
   2246    mov              [dstq], tlq ; spill tlq around the filter call
   2247    lea                 tlq, [rsp+16*14]
   2248    mova         [tlq+16*2], m1
   2249    mova         [tlq+16*3], m2
   2250    mova         [tlq+16*4], m3
   2251    mova         [tlq+16*5], m4
   2252    mova         [tlq+16*6], m5
   2253    mova         [tlq+16*7], m6
   2254    test             angled, 0x400
   2255    jnz .w4_main
   2256    pshufhw              m6, m6, q3333
   2257    mov                 r3d, 64
   2258    movhps       [tlq+16*8], m6 ; pad past the last top pixel
   2259    call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge_s3
           ; Restore filtered pixels beyond maxw from the source edge,
           ; 16 pixels per step.
   2260    mov                 r3d, maxwm
   2261    mov                 tlq, [dstq]
   2262    cmp                 r3d, 64
   2263    jge .filter_left
   2264    movu                 m1, [tlq+r3*2+16*0+2]
   2265    movu                 m2, [tlq+r3*2+16*1+2]
   2266    movu   [rsp+r3*2+16*14], m1
   2267    movu   [rsp+r3*2+16*15], m2
   2268    cmp                 r3d, 48
   2269    jge .filter_left
   2270    movu                 m1, [tlq+r3*2+16*2+2]
   2271    movu                 m2, [tlq+r3*2+16*3+2]
   2272    movu   [rsp+r3*2+16*16], m1
   2273    movu   [rsp+r3*2+16*17], m2
   2274    cmp                 r3d, 32
   2275    jge .filter_left
   2276    movu                 m1, [tlq+r3*2+16*4+2]
   2277    movu                 m2, [tlq+r3*2+16*5+2]
   2278    movu   [rsp+r3*2+16*18], m1
   2279    movu   [rsp+r3*2+16*19], m2
   2280    cmp                 r3d, 16
   2281    jge .filter_left
   2282    movu                 m1, [tlq+r3*2+16*6+2]
   2283    movu                 m2, [tlq+r3*2+16*7+2]
   2284    movu   [rsp+r3*2+16*20], m1
   2285    movu   [rsp+r3*2+16*21], m2
   2286    jmp .filter_left
   2287 
   2288 %if ARCH_X86_64
   2289 cglobal ipred_z3_16bpc, 4, 9, 8, 16*18, dst, stride, tl, w, h, angle, dy, _, org_w
   2290    %define            base  r7-$$
   2291    lea                  r7, [$$]
   2292    mov              org_wd, wd
   2293 %else
   2294 cglobal ipred_z3_16bpc, 4, 7, 8, -16*18, dst, stride, tl, w, h, angle, dy
   2295    %define            base  r1-$$
   2296    %define          org_wd  r5
   2297    %define          org_wq  r5
   2298    movd                 m6, r8m ; pixel_max
   2299    mov          [dstq+4*0], strideq
   2300    LEA                  r1, $$
   2301    mov          [dstq+4*1], wd
   2302 %endif
   2303    tzcnt                hd, hm
   2304    movifnidn        angled, anglem
   2305    sub                 tlq, 2
   2306    movsxd               hq, [base+ipred_z3_16bpc_ssse3_table+hq*4]
   2307    sub              angled, 180
   2308    movddup              m0, [base+pw_256]
   2309    mov                 dyd, angled
   2310    neg                 dyd
   2311    xor              angled, 0x400
   2312    movddup              m7, [base+pw_62]
   2313    or                  dyq, ~0x7e
   2314    lea                  hq, [base+ipred_z3_16bpc_ssse3_table+hq]
   2315    movzx               dyd, word [base+dr_intra_derivative+45*2-1+dyq]
   2316    jmp                  hq
   2317 .h4:
           ; h == 4: first try the 2x intra-edge upsample path (doubles the
           ; resolution of the left edge before sampling).
   2318    lea                 r4d, [angleq+88]
   2319    test                r4d, 0x480
   2320    jnz .h4_no_upsample ; !enable_intra_edge_filter || angle >= 40
   2321    sar                 r4d, 9
   2322    add                 r4d, wd
   2323    cmp                 r4d, 8
   2324    jg .h4_no_upsample ; w > 8 || (w == 8 && is_sm)
   2325    mova                 m2, [tlq-14]   ; 7 6 5 4 3 2 1 0
   2326    movu                 m3, [tlq-12]   ; 8 7 6 5 4 3 2 1
   2327 %if ARCH_X86_64
   2328    movd                 m6, r8m        ; pixel_max (bitdepth clamp)
   2329 %endif
   2330    pshufb               m4, m2, m0
   2331    mov                 tlq, rsp
   2332    palignr              m1, m2, m4, 14 ; 8 8 7 6 5 4 3 2
   2333    add                 dyd, dyd       ; dy *= 2 to match the upsampled edge
   2334    palignr              m5, m2, m4, 12 ; 8 8 8 7 6 5 4 3
           ; Upsample filter: with S1 = b+c and S2 = a+d, the sequence
           ; S1 + ((S1-S2)>>3), clamp >= 0, pavgw with 0 computes
           ; (9*(b+c) - (a+d) + 8) >> 4, i.e. the (-1,9,9,-1)/16 kernel.
   2335    paddw                m1, m2
   2336    paddw                m3, m5
   2337    psubw                m5, m1, m3
   2338    mova                 m3, [base+z_upsample]
   2339    mova           [tlq+ 0], m4
   2340    movd                 m4, dyd
   2341    psraw                m5, 3
   2342    neg                 dyd
   2343    paddw                m1, m5
   2344    pxor                 m5, m5
   2345    lea                 r5d, [dyq+(16<<6)+63] ; ypos
   2346    pmaxsw               m1, m5
   2347    pshufb               m6, m0
   2348    shl                  wd, 3
   2349    pavgw                m1, m5
   2350    pshufb               m4, m0
   2351    pminsw               m1, m6         ; clamp interpolated pixels to pixel_max
   2352    sub                 rsp, wq
   2353    punpckhwd            m0, m1, m2     ; interleave filtered/original samples
   2354    paddw                m5, m4, m4     ; m5 = 2*dy (two rows per iteration)
   2355    punpcklwd            m1, m2
   2356    mova           [tlq+32], m0
   2357    movsd                m4, m5         ; m4 = ypos fractions: row1 | row0
   2358    mova           [tlq+16], m1
   2359 .h4_upsample_loop:
   2360    lea                 r4d, [r5+dyq]
   2361    sar                 r5d, 6          ; integer part of ypos -> base index
   2362    movu                 m2, [tlq+r5*2]
   2363    lea                 r5d, [r4+dyq]
   2364    sar                 r4d, 6
   2365    movu                 m1, [tlq+r4*2]
   2366    pshufb               m2, m3
   2367    pshufb               m1, m3
   2368    punpckhqdq           m0, m1, m2
   2369    punpcklqdq           m1, m2
           ; frac<<9 with pmulhrsw == (diff*frac + 32) >> 6: linear interpolation
           ; between the two neighbouring edge pixels.
   2370    pand                 m2, m7, m4
   2371    psllw                m2, 9
   2372    psubw                m1, m0
   2373    pmulhrsw             m1, m2
   2374    paddw                m4, m5
   2375    paddw                m0, m1
   2376    mova        [rsp+wq-16], m0
   2377    sub                  wd, 16
   2378    jg .h4_upsample_loop
   2379    or                  r3d, 4*2        ; r3d = h*2 = byte stride of a stacked column
   2380    jmp .end_transpose
   2381 .h4_no_upsample:
           ; Decide the edge filter strength from the z_filt_wh4/z_filt_t_w48
           ; tables (vectorized compares, result extracted via pmovmskb).
   2382    mov                 r4d, 7
   2383    test             angled, 0x400 ; !enable_intra_edge_filter
   2384    jnz .h4_main
   2385    lea                 r4d, [wq+3]
   2386    movd                 m1, r4d
   2387    movd                 m3, angled
   2388    shr              angled, 8 ; is_sm << 1
   2389    pxor                 m2, m2
   2390    pshufb               m1, m2
   2391    pshufb               m3, m2
   2392    pcmpeqb              m1, [base+z_filt_wh4]
   2393    pand                 m1, m3
   2394    pcmpgtb              m1, [base+z_filt_t_w48+angleq*8]
   2395    pmovmskb            r5d, m1
   2396    mov                 r4d, 7
   2397    test                r5d, r5d
   2398    jz .h4_main ; filter_strength == 0
           ; Copy the edge into stack scratch (around rsp+16*17), replicating
           ; both end pixels as padding, then smooth it in place.
   2399    pshuflw              m1, [tlq+2], q0000
   2400    imul                r5d, 0x55555555
   2401    mova                 m2, [tlq-14]
   2402    neg                  r4
   2403    movd                 m3, [tlq+r4*2]
   2404    shr                 r5d, 30        ; r5d = filter_strength (1..3)
   2405    movd        [rsp+16*17], m1
   2406    pshuflw              m3, m3, q0000
   2407    mova        [rsp+16*16], m2
   2408    lea                  r2, [r4-2]
   2409    movq [rsp+16*17+r4*2-10], m3
   2410    cmp                  wd, 8
   2411    cmovae               r4, r2
   2412    lea                 tlq, [rsp+16*17-2]
   2413    call .filter_edge
   2414 .h4_main:
   2415    movd                 m4, dyd
   2416    sub                 tlq, r4
   2417    movddup              m1, [base+z_base_inc_z2+8] ; base_inc << 6
   2418    sub                 tlq, r4
   2419    shl                 r4d, 6
   2420    movd                 m6, [tlq]      ; m6 = last edge pixel, used as padding
   2421    movd                 m3, r4d
   2422    pshufb               m4, m0
   2423    neg                 dyq
   2424    pshufb               m6, m0
   2425    lea                  r5, [dyq+r4+63] ; ypos
   2426    pshufb               m3, m0
   2427    shl                  wd, 3
   2428    paddw                m5, m4, m4
   2429    sub                 rsp, wq
   2430    psubw                m3, m1 ; max_base_y
   2431    movsd                m4, m5 ; ypos1 ypos0
   2432 .h4_loop:
           ; Two 4-pixel columns per iteration; columns are written to the
           ; stack and transposed to rows in .end_transpose.
   2433    lea                  r4, [r5+dyq]
   2434    sar                  r5, 6
   2435    movddup              m0, [tlq+r5*2-6]
   2436    movddup              m1, [tlq+r5*2-8]
   2437    lea                  r5, [r4+dyq]
   2438    sar                  r4, 6
   2439    movlps               m0, [tlq+r4*2-6]
   2440    movlps               m1, [tlq+r4*2-8]
   2441    pand                 m2, m7, m4
   2442    psllw                m2, 9
   2443    psubw                m1, m0
   2444    pmulhrsw             m1, m2         ; interpolate between neighbours
   2445    pcmpgtw              m2, m3, m4     ; mask: ypos still inside the edge?
   2446    paddw                m4, m5
   2447    paddw                m0, m1
   2448    pand                 m0, m2
   2449    pandn                m2, m6         ; beyond max_base_y -> padding pixel
   2450    por                  m0, m2
   2451    mova        [rsp+wq-16], m0
   2452    sub                  wd, 16
   2453    jz .h4_transpose
   2454    test                r5d, r5d
   2455    jg .h4_loop
   2456 .h4_end_loop:
           ; Remaining columns are entirely past the edge: fill with padding.
   2457    mova        [rsp+wq-16], m6
   2458    sub                  wd, 16
   2459    jg .h4_end_loop
   2460 .h4_transpose:
   2461    or                  r3d, 4*2
   2462    jmp .end_transpose
   2463 .h8:
           ; h == 8: same structure as .h4 -- optional 2x upsample, optional
           ; edge smoothing, then the sampling loop + final transpose.
   2464    lea                 r4d, [angleq+88]
   2465    and                 r4d, ~0x7f
   2466    or                  r4d, wd
   2467    cmp                 r4d, 8
   2468    ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8
   2469    mova                 m2, [tlq-30]  ; g f e d c b a 9
   2470    movu                 m1, [tlq-32]  ; _ g f e d c b a
   2471    movu                 m3, [tlq-16]  ; 9 8 7 6 5 4 3 2
   2472    paddw                m3, [tlq-14]  ; 8 7 6 5 4 3 2 1
   2473    pshufd               m4, m2, q2100 ; _ _ g f e d c b
   2474    paddw                m1, m2
   2475    movu                 m5, [tlq-28]  ; f e d c b a 9 8
   2476    add                 dyd, dyd      ; dy *= 2 for the upsampled edge
   2477    cmp                  wd, 8
   2478    je .h8_upsample_w8
   2479    pshufhw              m4, m2, q1000 ; _ _ _ _ c c c b
   2480 .h8_upsample_w8:
           ; (-1,9,9,-1)/16 upsample on two 8-pixel groups, via the same
           ; S1 + ((S1-S2)>>3), clamp, pavgw-with-0 trick as in .h4.
   2481    paddw                m4, m5
   2482    psubw                m5, m1, m4
   2483    movu                 m4, [tlq-18]  ; a 9 8 7 6 5 4 3
   2484    psraw                m5, 3
   2485    paddw                m1, m5
   2486    movu                 m5, [tlq-12]  ; 7 6 5 4 3 2 1 0
   2487 %if ARCH_X86_64
   2488    movd                 m6, r8m ; pixel_max
   2489 %endif
   2490    paddw                m4, m5
   2491    shl                  wd, 4
   2492    psubw                m5, m3, m4
   2493    movd                 m4, dyd
   2494    psraw                m5, 3
   2495    neg                 dyd
   2496    paddw                m3, m5
   2497    pshufb               m6, m0
   2498    mova                 m5, [tlq-14]
   2499    pshufb               m4, m0
   2500    pxor                 m0, m0
   2501    pmaxsw               m1, m0
   2502    pmaxsw               m3, m0
   2503    mov                 tlq, rsp
   2504    pavgw                m1, m0
   2505    pavgw                m3, m0
   2506    sub                 rsp, wq
   2507    pminsw               m1, m6        ; clamp to pixel_max
   2508    pminsw               m6, m3
   2509    mova                 m3, [base+z_upsample]
   2510    lea                 r5d, [dyq+(16<<6)+63] ; ypos
           ; Interleave filtered (odd) and original (even) samples into the
           ; 2x-resolution edge buffer at tlq.
   2511    punpcklwd            m0, m1, m2
   2512    mova         [tlq+16*0], m0
   2513    punpckhwd            m1, m2
   2514    mova         [tlq+16*1], m1
   2515    punpcklwd            m0, m6, m5
   2516    mova         [tlq+16*2], m0
   2517    punpckhwd            m6, m5
   2518    mova         [tlq+16*3], m6
   2519    mova                 m5, m4
   2520 .h8_upsample_loop:
   2521    mov                 r4d, r5d
   2522    sar                 r4d, 6
   2523    movu                 m1, [tlq+r4*2+16*0]
   2524    movu                 m2, [tlq+r4*2+16*1]
   2525    add                 r5d, dyd
   2526    pshufb               m2, m3
   2527    pshufb               m1, m3
   2528    punpckhqdq           m0, m1, m2
   2529    punpcklqdq           m1, m2
   2530    pand                 m2, m7, m4
   2531    psllw                m2, 9
   2532    psubw                m1, m0
   2533    pmulhrsw             m1, m2        ; (diff*frac + 32) >> 6
   2534    paddw                m4, m5
   2535    paddw                m0, m1
   2536    mova        [rsp+wq-16], m0
   2537    sub                  wd, 16
   2538    jg .h8_upsample_loop
   2539    or                  r3d, 8*2       ; column stride in bytes = h*2
   2540    jmp .end_transpose
   2541 .h8_no_upsample:
   2542    lea                 r4d, [wq+7]
   2543    movd                 m1, r4d
   2544    and                 r4d, 7
   2545    or                  r4d, 8 ; imin(w+7, 15)
   2546    test             angled, 0x400
   2547    jnz .h8_main
   2548    movd                 m3, angled
   2549    shr              angled, 8 ; is_sm << 1
   2550    pxor                 m2, m2
   2551    pshufb               m1, m2
   2552    pshufb               m3, m2
   2553    movu                 m2, [base+z_filt_wh8]
   2554    psrldq               m4, [base+z_filt_t_w48+angleq*8], 4
   2555    pcmpeqb              m2, m1
   2556    pand                 m2, m3
   2557    pcmpgtb              m2, m4
   2558    pmovmskb            r5d, m2
   2559    test                r5d, r5d
   2560    jz .h8_main ; filter_strength == 0
           ; Copy up to 16 edge pixels to stack scratch with end padding,
           ; then smooth with .filter_edge (strength in r5d).
   2561    pshuflw              m1, [tlq+2], q0000
   2562    imul                r5d, 0x55555555
   2563    mova                 m2, [tlq-16*1+2]
   2564    neg                  r4
   2565    mova                 m3, [tlq-16*2+2]
   2566    shr                 r5d, 30
   2567    movd                 m4, [tlq+r4*2]
   2568    movd        [rsp+16*17], m1
   2569    mova        [rsp+16*16], m2
   2570    pshuflw              m4, m4, q0000
   2571    mova        [rsp+16*15], m3
   2572    lea                  r2, [r4-2]
   2573    movq [rsp+16*17+r4*2-10], m4
   2574    cmp                  wd, 16
   2575    cmovae               r4, r2
   2576    lea                 tlq, [rsp+16*17-2]
   2577    call .filter_edge
   2578 .h8_main:
   2579    sub                 tlq, r4
   2580    movd                 m4, dyd
   2581    sub                 tlq, r4
   2582    shl                 r4d, 6
   2583    movd                 m6, [tlq]      ; padding pixel past the edge
   2584    movd                 m3, r4d
   2585    pshufb               m4, m0
   2586    neg                 dyq
   2587    pshufb               m6, m0
   2588    lea                  r5, [dyq+r4+63]
   2589    pshufb               m3, m0
   2590    shl                  wd, 4
   2591    mova                 m5, m4
   2592    sub                 rsp, wq
   2593    psubw                m3, [base+z_base_inc_z2] ; per-lane max_base_y
   2594 .h8_loop:
           ; One 8-pixel column per iteration, interpolated then clipped
           ; against max_base_y (beyond it, the padding pixel is selected).
   2595    mov                  r4, r5
   2596    sar                  r4, 6
   2597    movu                 m0, [tlq+r4*2-14]
   2598    movu                 m1, [tlq+r4*2-16]
   2599    pand                 m2, m7, m4
   2600    psllw                m2, 9
   2601    psubw                m1, m0
   2602    pmulhrsw             m1, m2
   2603    pcmpgtw              m2, m3, m4
   2604    paddw                m4, m5
   2605    paddw                m0, m1
   2606    pand                 m0, m2
   2607    pandn                m2, m6
   2608    por                  m0, m2
   2609    mova        [rsp+wq-16], m0
   2610    sub                  wd, 8*2
   2611    jz .h8_transpose
   2612    add                  r5, dyq
   2613    jg .h8_loop
   2614 .h8_end_loop:
   2615    mova        [rsp+wq-16], m6
   2616    sub                  wd, 8*2
   2617    jg .h8_end_loop
   2618 .h8_transpose:
   2619    or                  r3d, 8*2
   2620    jmp .end_transpose
   2621 .h16:
           ; h == 16: no upsample path at this size; optional edge smoothing,
           ; then two 8-pixel groups per column in the main loop.
   2622    lea                 r4d, [wq+15]
   2623    movd                 m1, r4d
   2624    and                 r4d, 15
   2625    or                  r4d, 16 ; imin(w+15, 31)
   2626    test             angled, 0x400
   2627    jnz .h16_main
   2628    movd                 m3, angled
   2629    shr              angled, 8 ; is_sm << 1
   2630    pxor                 m2, m2
   2631    pshufb               m1, m2
   2632    pshufb               m3, m2
   2633    movq                 m4, [base+z_filt_t_w16+angleq*4]
   2634    pcmpeqb              m1, [base+z_filt_wh16]
   2635    pand                 m1, m3
   2636    pcmpgtb              m1, m4
   2637    pmovmskb            r5d, m1
   2638    test                r5d, r5d
   2639    jz .h16_main ; filter_strength == 0
           ; Copy up to 32 edge pixels into stack scratch with end padding,
           ; then smooth in place.
   2640    pshuflw              m1, [tlq+2], q0000
   2641    mova                 m2, [tlq-16*1+2]
   2642    imul                r5d, 0x24924924
   2643    mova                 m3, [tlq-16*2+2]
   2644    neg                  r4
   2645    mova                 m4, [tlq-16*3+2]
   2646    shr                 r5d, 30
   2647    mova                 m5, [tlq-16*4+2]
   2648    movd                 m6, [tlq+r4*2]
   2649    adc                 r5d, -1 ; filter_strength
   2650    movd        [rsp+16*17], m1
   2651    mova        [rsp+16*16], m2
   2652    mova        [rsp+16*15], m3
   2653    pshuflw              m6, m6, q0000
   2654    mova        [rsp+16*14], m4
   2655    mova        [rsp+16*13], m5
   2656    lea                  r2, [r4-2]
   2657    movq [rsp+16*17+r4*2-10], m6
   2658    cmp                  wd, 32
   2659    cmovae               r4, r2
   2660    lea                 tlq, [rsp+16*17-2]
   2661    call .filter_edge
   2662 .h16_main:
   2663    sub                 tlq, r4
   2664    movd                 m5, dyd
   2665    sub                 tlq, r4
   2666    shl                 r4d, 6
   2667    movd                 m6, [tlq]      ; padding pixel past the edge
   2668    movd                 m3, r4d
   2669    pshufb               m5, m0
   2670    neg                 dyq
   2671    pshufb               m6, m0
   2672    lea                  r5, [dyq+r4+63]
   2673    pshufb               m3, m0
   2674    shl                  wd, 5
   2675    paddw                m4, m5, [base+z_base_inc_z2]
   2676    sub                 rsp, wq
   2677    psubw                m4, m3         ; m4 = biased ypos; sign/thresholds give clip masks
   2678 .h16_loop:
   2679    mov                  r4, r5
   2680    sar                  r4, 6
           ; Group 0 (pixels 0-7 of the column): interpolate.
   2681    movu                 m0, [tlq+r4*2-14]
   2682    movu                 m2, [tlq+r4*2-16]
   2683    pand                 m3, m7, m4
   2684    psllw                m3, 9
   2685    psubw                m2, m0
   2686    pmulhrsw             m2, m3
           ; Group 1 (pixels 8-15): interpolate with the same fraction.
   2687    movu                 m1, [tlq+r4*2-30]
   2688    paddw                m0, m2
   2689    movu                 m2, [tlq+r4*2-32]
   2690    psubw                m2, m1
   2691    pmulhrsw             m2, m3
           ; Clip masks: group 0 uses the sign of m4 (ypos >= 0), group 1 the
           ; -512 (= 8<<6) threshold; past the edge -> padding pixel m6.
   2692    movddup              m3, [base+pw_m512]
   2693    paddw                m1, m2
   2694    psraw                m2, m4, 15
   2695    pcmpgtw              m3, m4
   2696    paddw                m4, m5
   2697    pand                 m0, m2
   2698    pandn                m2, m6
   2699    pand                 m1, m3
   2700    pandn                m3, m6
   2701    por                  m0, m2
   2702    mova      [rsp+wq-16*1], m0
   2703    por                  m1, m3
   2704    mova      [rsp+wq-16*2], m1
   2705    sub                  wd, 16*2
   2706    jz .h16_transpose
   2707    add                  r5, dyq
   2708    jg .h16_loop
   2709 .h16_end_loop:
   2710    mova      [rsp+wq-16*1], m6
   2711    mova      [rsp+wq-16*2], m6
   2712    sub                  wd, 16*2
   2713    jg .h16_end_loop
   2714 .h16_transpose:
   2715    or                  r3d, 16*2
   2716    jmp .end_transpose
   2717 .h32:
           ; h == 32: four 8-pixel groups per column; the stack frame for the
           ; column buffer is grown incrementally (16*4 bytes per column).
   2718    lea                 r4d, [wq+31]
   2719    and                 r4d, 31
   2720    or                  r4d, 32 ; imin(w+31, 63)
   2721    test             angled, 0x400 ; !enable_intra_edge_filter
   2722    jnz .h32_main
   2723    call .filter_copy
   2724    lea                  r5, [r4-2]
   2725    cmp                  wd, 64
   2726    cmove                r4, r5
   2727    call .filter_edge_s3
   2728 .h32_main:
   2729    sub                 tlq, r4
   2730    movd                 m5, dyd
   2731    sub                 tlq, r4
   2732    shl                 r4d, 6
   2733    movd                 m6, [tlq]      ; padding pixel past the edge
   2734    movd                 m3, r4d
   2735    pshufb               m5, m0
   2736    neg                 dyq
   2737    pshufb               m6, m0
   2738    lea                  r5, [dyq+r4+63]
   2739    pshufb               m3, m0
   2740    paddw                m4, m5, [base+z_base_inc_z2]
   2741    psubw                m4, m3         ; biased ypos for the clip thresholds
   2742 .h32_loop:
   2743    mov                  r4, r5
   2744    sar                  r4, 6
           ; Groups 0-1 (pixels 0-15), clipped via sign bit and -512.
   2745    movu                 m0, [tlq+r4*2-14]
   2746    movu                 m3, [tlq+r4*2-16]
   2747    pand                 m2, m7, m4
   2748    psllw                m2, 9
   2749    psubw                m3, m0
   2750    pmulhrsw             m3, m2
   2751    movu                 m1, [tlq+r4*2-30]
   2752    paddw                m0, m3
   2753    movu                 m3, [tlq+r4*2-32]
   2754    psubw                m3, m1
   2755    pmulhrsw             m3, m2
   2756    sub                 rsp, 16*4       ; next column slot
   2757    paddw                m1, m3
   2758    psraw                m3, m4, 15
   2759    pand                 m0, m3
   2760    pandn                m3, m6
   2761    por                  m0, m3
   2762    movddup              m3, [base+pw_m512]
   2763    pcmpgtw              m3, m4
   2764    pand                 m1, m3
   2765    pandn                m3, m6
   2766    mova         [rsp+16*3], m0
   2767    por                  m1, m3
   2768    mova         [rsp+16*2], m1
           ; Groups 2-3 (pixels 16-31), clipped via -1024 and -1536
           ; (= 16<<6 and 24<<6 group offsets).
   2769    movu                 m0, [tlq+r4*2-46]
   2770    movu                 m3, [tlq+r4*2-48]
   2771    psubw                m3, m0
   2772    pmulhrsw             m3, m2
   2773    movu                 m1, [tlq+r4*2-62]
   2774    paddw                m0, m3
   2775    movu                 m3, [tlq+r4*2-64]
   2776    psubw                m3, m1
   2777    pmulhrsw             m3, m2
   2778    movddup              m2, [base+pw_m1024]
   2779    paddw                m1, m3
   2780    movddup              m3, [base+pw_m1536]
   2781    pcmpgtw              m2, m4
   2782    pcmpgtw              m3, m4
   2783    paddw                m4, m5
   2784    pand                 m0, m2
   2785    pandn                m2, m6
   2786    pand                 m1, m3
   2787    pandn                m3, m6
   2788    por                  m0, m2
   2789    mova         [rsp+16*1], m0
   2790    por                  m1, m3
   2791    mova         [rsp+16*0], m1
   2792    dec                  wd
   2793    jz .h32_transpose
   2794    add                  r5, dyq
   2795    jg .h32_loop
   2796 .h32_end_loop:
           ; Remaining columns lie past the edge: fill with the padding pixel.
   2797    sub                 rsp, 16*4
   2798    REPX {mova [rsp+16*x], m6}, 3, 2, 1, 0
   2799    dec                  wd
   2800    jg .h32_end_loop
   2801 .h32_transpose:
   2802    or                  r3d, 32*2
   2803    jmp .end_transpose
   2804 .h64:
           ; h == 64: eight 8-pixel groups per column (16*8 bytes each); clip
           ; thresholds step by -512 (8 pixels << 6) per group.
   2805    lea                 r4d, [wq+63]
   2806    test             angled, 0x400 ; !enable_intra_edge_filter
   2807    jnz .h64_main
   2808    call .filter_copy
   2809    call .filter_edge_s3
   2810 .h64_main:
   2811    sub                 tlq, r4
   2812    movd                 m5, dyd
   2813    sub                 tlq, r4
   2814    shl                 r4d, 6
   2815    movd                 m6, [tlq]      ; padding pixel past the edge
   2816    movd                 m3, r4d
   2817    pshufb               m5, m0
   2818    neg                 dyq
   2819    pshufb               m6, m0
   2820    lea                  r5, [dyq+r4+63]
   2821    pshufb               m3, m0
   2822    paddw                m4, m5, [base+z_base_inc_z2]
   2823    psubw                m4, m3         ; biased ypos for the clip thresholds
   2824 .h64_loop:
   2825    mov                  r4, r5
   2826    sar                  r4, 6
           ; Groups 0-1: interpolate, clip via sign bit / -512.
   2827    movu                 m0, [tlq+r4*2- 14]
   2828    movu                 m3, [tlq+r4*2- 16]
   2829    pand                 m2, m7, m4
   2830    psllw                m2, 9
   2831    psubw                m3, m0
   2832    pmulhrsw             m3, m2
   2833    movu                 m1, [tlq+r4*2- 30]
   2834    paddw                m0, m3
   2835    movu                 m3, [tlq+r4*2- 32]
   2836    psubw                m3, m1
   2837    pmulhrsw             m3, m2
   2838    sub                 rsp, 16*8       ; next column slot
   2839    paddw                m1, m3
   2840    psraw                m3, m4, 15
   2841    pand                 m0, m3
   2842    pandn                m3, m6
   2843    por                  m0, m3
   2844    movddup              m3, [base+pw_m512]
   2845    pcmpgtw              m3, m4
   2846    pand                 m1, m3
   2847    pandn                m3, m6
   2848    mova         [rsp+16*7], m0
   2849    por                  m1, m3
   2850    mova         [rsp+16*6], m1
           ; Groups 2-3: clip via -1024 / -1536.
   2851    movu                 m0, [tlq+r4*2- 46]
   2852    movu                 m3, [tlq+r4*2- 48]
   2853    psubw                m3, m0
   2854    pmulhrsw             m3, m2
   2855    movu                 m1, [tlq+r4*2- 62]
   2856    paddw                m0, m3
   2857    movu                 m3, [tlq+r4*2- 64]
   2858    psubw                m3, m1
   2859    pmulhrsw             m3, m2
   2860    paddw                m1, m3
   2861    movddup              m3, [base+pw_m1024]
   2862    pcmpgtw              m3, m4
   2863    pand                 m0, m3
   2864    pandn                m3, m6
   2865    por                  m0, m3
   2866    movddup              m3, [base+pw_m1536]
   2867    pcmpgtw              m3, m4
   2868    pand                 m1, m3
   2869    pandn                m3, m6
   2870    mova         [rsp+16*5], m0
   2871    por                  m1, m3
   2872    mova         [rsp+16*4], m1
           ; Groups 4-5: clip via -2048 / -2560.
   2873    movu                 m0, [tlq+r4*2- 78]
   2874    movu                 m3, [tlq+r4*2- 80]
   2875    psubw                m3, m0
   2876    pmulhrsw             m3, m2
   2877    movu                 m1, [tlq+r4*2- 94]
   2878    paddw                m0, m3
   2879    movu                 m3, [tlq+r4*2- 96]
   2880    psubw                m3, m1
   2881    pmulhrsw             m3, m2
   2882    paddw                m1, m3
   2883    movddup              m3, [base+pw_m2048]
   2884    pcmpgtw              m3, m4
   2885    pand                 m0, m3
   2886    pandn                m3, m6
   2887    por                  m0, m3
   2888    movddup              m3, [base+pw_m2560]
   2889    pcmpgtw              m3, m4
   2890    pand                 m1, m3
   2891    pandn                m3, m6
   2892    mova         [rsp+16*3], m0
   2893    por                  m1, m3
   2894    mova         [rsp+16*2], m1
           ; Groups 6-7: clip via -3072 / -3584.
   2895    movu                 m0, [tlq+r4*2-110]
   2896    movu                 m3, [tlq+r4*2-112]
   2897    psubw                m3, m0
   2898    pmulhrsw             m3, m2
   2899    movu                 m1, [tlq+r4*2-126]
   2900    paddw                m0, m3
   2901    movu                 m3, [tlq+r4*2-128]
   2902    psubw                m3, m1
   2903    pmulhrsw             m3, m2
   2904    movddup              m2, [base+pw_m3072]
   2905    paddw                m1, m3
   2906    movddup              m3, [base+pw_m3584]
   2907    pcmpgtw              m2, m4
   2908    pcmpgtw              m3, m4
   2909    paddw                m4, m5
   2910    pand                 m0, m2
   2911    pandn                m2, m6
   2912    pand                 m1, m3
   2913    pandn                m3, m6
   2914    por                  m0, m2
   2915    mova         [rsp+16*1], m0
   2916    por                  m1, m3
   2917    mova         [rsp+16*0], m1
   2918    dec                  wd
   2919    jz .h64_transpose
   2920    add                  r5, dyq
   2921    jg .h64_loop
   2922 .h64_end_loop:
           ; Remaining columns lie past the edge: fill with the padding pixel.
   2923    sub                 rsp, 16*8
   2924    REPX {mova [rsp+16*x], m6}, 7, 6, 5, 4, 3, 2, 1, 0
   2925    dec                  wd
   2926    jg .h64_end_loop
   2927 .h64_transpose:
   2928    add                 r3d, 64*2
   2929 .end_transpose:
           ; Transpose the column buffer on the stack (r3d = h*2 bytes per
           ; column) into dst rows, 4 columns x 4 rows of words per step,
           ; working from the bottom-right corner backwards.
   2930 %if ARCH_X86_64
   2931    lea                  r7, [strideq*3]
   2932 %else
           ; x86-32: stride and org_w were spilled through dstq-relative slots;
           ; reload them (NOTE(review): spill site is above the visible window).
   2933    mov             strideq, [dstq+4*0]
   2934    mov              org_wd, [dstq+4*1]
   2935 %endif
   2936    lea                 r4d, [r3*3]
   2937 .end_transpose_loop:
   2938    lea                  r2, [rsp+r3-8]
   2939    lea                  r6, [dstq+org_wq*2-8]
   2940 .end_transpose_loop_y:
           ; Gather 4 words from each of 4 consecutive columns ...
   2941    movq                 m0, [r2+r4  ]
   2942    movq                 m1, [r2+r3*2]
   2943    movq                 m2, [r2+r3*1]
   2944    movq                 m3, [r2+r3*0]
   2945    sub                  r2, 8
           ; ... and interleave into a 4x4 word transpose.
   2946    punpcklwd            m0, m1
   2947    punpcklwd            m2, m3
   2948    punpckhdq            m1, m0, m2
   2949    punpckldq            m0, m2
   2950    movhps   [r6+strideq*0], m1
   2951    movq     [r6+strideq*1], m1
   2952 %if ARCH_X86_64
   2953    movhps   [r6+strideq*2], m0
   2954    movq     [r6+r7       ], m0
   2955    lea                  r6, [r6+strideq*4]
   2956 %else
   2957    lea                  r6, [r6+strideq*2]
   2958    movhps   [r6+strideq*0], m0
   2959    movq     [r6+strideq*1], m0
   2960    lea                  r6, [r6+strideq*2]
   2961 %endif
   2962    cmp                  r2, rsp
   2963    jae .end_transpose_loop_y
   2964    lea                 rsp, [rsp+r3*4] ; release the 4 consumed columns
   2965    sub              org_wd, 4
   2966    jg .end_transpose_loop
   2967    RET
   2968 .filter_copy:
           ; Local subroutine: copy |r4| left-edge pixels into stack scratch
           ; ending at rsp+16*17, replicating the first and last pixel as
           ; padding at both ends. On return, tlq points into the copy and
           ; r4 is negated. (gprsize compensates for the return address.)
   2969    neg                  r4
   2970    pshuflw              m2, [tlq+2], q0000
   2971    xor                 r5d, r5d
   2972    pshuflw              m3, [tlq+r4*2], q0000
   2973    movq [rsp+gprsize+16*17], m2
   2974 .filter_copy_loop:
   2975    mova                 m1, [tlq+r5*2-16*1+2]
   2976    mova                 m2, [tlq+r5*2-16*2+2]
   2977    sub                  r5, 16
   2978    mova [rsp+r5*2+gprsize+16*18], m1
   2979    mova [rsp+r5*2+gprsize+16*17], m2
   2980    cmp                 r5d, r4d
   2981    jg .filter_copy_loop
   2982    lea                 tlq, [rsp+gprsize+16*17-2]
   2983    movq       [tlq+r4*2-8], m3
   2984    ret
   2985 .filter_edge:
           ; Local subroutine: smooth the edge in place with the strength-r5d
           ; kernel from z_filt_k:
           ;   out = (k_c*center + k_n*(prev+next) + 8) >> 4
           ; Strength 3 uses the wider 5-tap filter below instead.
           ; In: tlq = edge, r4 = -pixel count, r5d = strength. Clobbers m1-m6.
   2986    cmp                 r5d, 3
   2987    je .filter_edge_s3
   2988    movddup              m4, [base+z_filt_k+r5*8-8]
   2989    movddup              m5, [base+z_filt_k+r5*8+8]
   2990    xor                 r5d, r5d
   2991    movddup              m6, [base+pw_8]
   2992    movu                 m2, [tlq-12]
   2993    jmp .filter_edge_start
   2994 .filter_edge_loop:
   2995    movu                 m2, [tlq+r5*2-12]
   2996    mova       [tlq+r5*2+2], m1     ; write the previous iteration's result
   2997 .filter_edge_start:
   2998    pmullw               m1, m4, [tlq+r5*2-14]
   2999    movu                 m3, [tlq+r5*2-16]
   3000    sub                  r5, 8
   3001    paddw                m2, m3
   3002    pmullw               m2, m5
   3003    paddw                m1, m6
   3004    paddw                m1, m2
   3005    psrlw                m1, 4
   3006    cmp                 r5d, r4d
   3007    jg .filter_edge_loop
   3008    mova       [tlq+r5*2+2], m1
   3009    neg                 r4d
   3010    ret
   3011 .filter_edge_s3:
           ; Strength-3 smoothing: 5-tap (1 2 2 2 1)/8 kernel, with the two
           ; outer taps and the +4 rounding folded in via pavgw (the +3 bias
           ; makes (a+3 + e + 1)>>1 contribute the correctly rounded half).
   3012    movddup              m5, [base+pw_3]
   3013    xor                 r5d, r5d
   3014    movu                 m2, [tlq-12]
   3015    movu                 m3, [tlq-10]
   3016    jmp .filter_edge_s3_start
   3017 .filter_edge_s3_loop:
   3018    movu                 m2, [tlq+r5*2-12]
   3019    movu                 m3, [tlq+r5*2-10]
   3020    mova       [tlq+r5*2+2], m1     ; write the previous iteration's result
   3021 .filter_edge_s3_start:
   3022    paddw                m2, [tlq+r5*2-14]
   3023    paddw                m3, m5
   3024    movu                 m1, [tlq+r5*2-16]
   3025    movu                 m4, [tlq+r5*2-18]
   3026    sub                  r5, 8
   3027    paddw                m1, m2
   3028    pavgw                m3, m4
   3029    paddw                m1, m3
   3030    psrlw                m1, 2
   3031    cmp                 r5d, r4d
   3032    jg .filter_edge_s3_loop
   3033    mova       [tlq+r5*2+2], m1
   3034    neg                 r4d
   3035    ret
   3036 
   3037 %if ARCH_X86_64
   3038 cglobal ipred_filter_16bpc, 4, 7, 16, dst, stride, tl, w, h, filter
   3039 %else
   3040 cglobal ipred_filter_16bpc, 4, 7, 8, -16*8, dst, stride, tl, w, h, filter
        ; x86-32 has only 8 xmm regs: spill the 8 coefficient vectors to the
        ; stack and address them through these aliases.
   3041 %define  m8 [esp+16*0]
   3042 %define  m9 [esp+16*1]
   3043 %define m10 [esp+16*2]
   3044 %define m11 [esp+16*3]
   3045 %define m12 [esp+16*4]
   3046 %define m13 [esp+16*5]
   3047 %define m14 [esp+16*6]
   3048 %define m15 [esp+16*7]
   3049 %endif
   3050 %define base r6-$$
        ;-----------------------------------------------------------------------
        ; FILTER_PRED intra prediction, 16 bpc, SSSE3.
        ; In:  dst, stride, tl (top-left edge pointer), w, h,
        ;      filter mode (stack/filterm), bitdepth_max (r8m).
        ; Processes the block in 4x2 sub-blocks: one pass down the left column,
        ; then column passes to the right, each feeding on previously predicted
        ; pixels. Taps come from filter_intra_taps, selected by filter<<6.
        ;-----------------------------------------------------------------------
   3051    movifnidn            hd, hm
   3052    movd                 m6, r8m     ; bitdepth_max
   3053 %ifidn filterd, filterm
   3054    movzx           filterd, filterb
   3055 %else
   3056    movzx           filterd, byte filterm
   3057 %endif
   3058    LEA                  r6, $$
   3059    shl             filterd, 6      ; 64 bytes of taps per filter mode
   3060    movu                 m0, [tlq-6] ; __ l1 l0 tl t0 t1 t2 t3
   3061    mova                 m1, [base+filter_intra_taps+filterq+16*0]
   3062    mova                 m2, [base+filter_intra_taps+filterq+16*1]
   3063    mova                 m3, [base+filter_intra_taps+filterq+16*2]
   3064    mova                 m4, [base+filter_intra_taps+filterq+16*3]
   3065    pxor                 m5, m5
   3066 %if ARCH_X86_64
   3067    punpcklbw            m8, m5, m1  ; place 8-bit coefficients in the upper
   3068    punpckhbw            m9, m5, m1  ; half of each 16-bit word to avoid
   3069    punpcklbw           m10, m5, m2  ; having to perform sign-extension.
   3070    punpckhbw           m11, m5, m2
   3071    punpcklbw           m12, m5, m3
   3072    punpckhbw           m13, m5, m3
   3073    punpcklbw           m14, m5, m4
   3074    punpckhbw           m15, m5, m4
   3075 %else
        ; Same expansion as above, staged through m7 into the stack slots.
   3076    punpcklbw            m7, m5, m1
   3077    mova                 m8, m7
   3078    punpckhbw            m7, m5, m1
   3079    mova                 m9, m7
   3080    punpcklbw            m7, m5, m2
   3081    mova                m10, m7
   3082    punpckhbw            m7, m5, m2
   3083    mova                m11, m7
   3084    punpcklbw            m7, m5, m3
   3085    mova                m12, m7
   3086    punpckhbw            m7, m5, m3
   3087    mova                m13, m7
   3088    punpcklbw            m7, m5, m4
   3089    mova                m14, m7
   3090    punpckhbw            m7, m5, m4
   3091    mova                m15, m7
   3092 %endif
   3093    mova                 m7, [base+filter_shuf]
   3094    add                  hd, hd      ; h *= 2 (bytes per pixel)
   3095    mov                  r5, dstq
   3096    pshuflw              m6, m6, q0000
   3097    mov                  r6, tlq
   3098    punpcklqdq           m6, m6      ; broadcast bitdepth_max to all words
   3099    sub                 tlq, hq
   3100 .left_loop:
        ; Leftmost 4x2 sub-block: inputs are top row + left column pixels.
   3101    pshufb               m0, m7      ; tl t0 t1 t2 t3 l0 l1 __
   3102    pshufd               m1, m0, q0000
   3103    pmaddwd              m2, m8, m1
   3104    pmaddwd              m1, m9
   3105    pshufd               m4, m0, q1111
   3106    pmaddwd              m3, m10, m4
   3107    pmaddwd              m4, m11
   3108    paddd                m2, m3
   3109    paddd                m1, m4
   3110    pshufd               m4, m0, q2222
   3111    pmaddwd              m3, m12, m4
   3112    pmaddwd              m4, m13
   3113    paddd                m2, m3
   3114    paddd                m1, m4
   3115    pshufd               m3, m0, q3333
   3116    pmaddwd              m0, m14, m3
   3117    pmaddwd              m3, m15
   3118    paddd                m0, m2
   3119    paddd                m1, m3
        ; Taps sit in the high byte of each word (x256), so >>11 == x>>3;
        ; pavgw with 0 then completes the (x+8)>>4 rounding.
   3120    psrad                m0, 11     ; x >> 3
   3121    psrad                m1, 11
   3122    packssdw             m0, m1
   3123    pmaxsw               m0, m5
   3124    pavgw                m0, m5     ; (x + 8) >> 4
   3125    pminsw               m0, m6     ; clamp to bitdepth_max
   3126    movq   [dstq+strideq*0], m0
   3127    movhps [dstq+strideq*1], m0
   3128    movlps               m0, [tlq+hq-10] ; next two left pixels + top-left
   3129    lea                dstq, [dstq+strideq*2]
   3130    sub                  hd, 2*2
   3131    jg .left_loop
   3132    sub                  wd, 4
   3133    jz .end
   3134    sub                 tld, r6d     ; -h*2
   3135    sub                  r6, r5      ; tl-dst
   3136 .right_loop0:
        ; Subsequent 4-wide columns: "left" inputs come from the pixels
        ; predicted in the previous column (read back from dst).
   3137    add                  r5, 8
   3138    mov                  hd, tld
   3139    movu                 m0, [r5+r6] ; tl t0 t1 t2 t3 __ __ __
   3140    mov                dstq, r5
   3141 .right_loop:
   3142    pshufd               m2, m0, q0000
   3143    pmaddwd              m1, m8, m2
   3144    pmaddwd              m2, m9
   3145    pshufd               m4, m0, q1111
   3146    pmaddwd              m3, m10, m4
   3147    pmaddwd              m4, m11
   3148    pinsrw               m0, [dstq+strideq*0-2], 5 ; l0 from previous column
   3149    paddd                m1, m3
   3150    paddd                m2, m4
   3151    pshufd               m0, m0, q2222
   3152    movddup              m4, [dstq+strideq*1-8]
   3153    pmaddwd              m3, m12, m0
   3154    pmaddwd              m0, m13
   3155    paddd                m1, m3
   3156    paddd                m0, m2
   3157    pshuflw              m2, m4, q3333 ; l1 from previous column
   3158    punpcklwd            m2, m5
   3159    pmaddwd              m3, m14, m2
   3160    pmaddwd              m2, m15
   3161    paddd                m1, m3
   3162    paddd                m0, m2
   3163    psrad                m1, 11
   3164    psrad                m0, 11
   3165    packssdw             m0, m1
   3166    pmaxsw               m0, m5
   3167    pavgw                m0, m5        ; (x + 8) >> 4, as in .left_loop
   3168    pminsw               m0, m6
   3169    movhps [dstq+strideq*0], m0
   3170    movq   [dstq+strideq*1], m0
   3171    palignr              m0, m4, 14    ; build next iteration's input row
   3172    lea                dstq, [dstq+strideq*2]
   3173    add                  hd, 2*2
   3174    jl .right_loop
   3175    sub                  wd, 4
   3176    jg .right_loop0
   3177 .end:
   3178    RET
   3179 
   ; Pick a scratch register (t0) that is not needed for stack args on the
   ; active ABI: r7 on SysV AMD64 (UNIX64), r5 on Win64 / x86-32.
   3180 %if UNIX64
   3181 DECLARE_REG_TMP 7
   3182 %else
   3183 DECLARE_REG_TMP 5
   3184 %endif
   3185 
   ; CfL DC predictor using only the row above the block.
   ; Loads m4 = w (rounding bias before averaging) and m5 = log2(w) (shift),
   ; advances tlq past the top-left sample so [tlq] points at the top row,
   ; then tail-calls the shared .start of the "left" variant, which sums the
   ; w samples at [tlq], splats the DC and dispatches on block width.
   3186 cglobal ipred_cfl_top_16bpc, 4, 7, 8, dst, stride, tl, w, h, ac
   3187    LEA                  t0, ipred_cfl_left_16bpc_ssse3_table
   3188    movd                 m4, wd
   3189    tzcnt                wd, wd       ; wd = log2(w), used as table index
   3190    movifnidn            hd, hm
   3191    add                 tlq, 2       ; skip the top-left sample
   3192    movsxd               r6, [t0+wq*4] ; sum-loop entry for "h" = w samples
   3193    movd                 m5, wd
   3194    jmp mangle(private_prefix %+ _ipred_cfl_left_16bpc_ssse3.start)
   3195 
   ; CfL DC predictor using only the column left of the block.
   ; The left column is stored as h contiguous 16-bit samples ending at the
   ; original tlq, so tlq is rewound by h*2 and the .h* entries below sum them.
   ; Registers at .start (also entered from ipred_cfl_top with the top row):
   ;   m4 = sample count (halved to count/2 by pavgw with 0, used as rounding
   ;        bias since .h4 negates the pmaddwd-by--1 sum), m5 = log2(count),
   ;   m7 = pixel max splat (from r7m - presumably bitdepth_max; verify against
   ;        caller), m6 = 0, m3 = all-ones (pmaddwd multiplier of -1).
   ; r6 dispatches on height (sum loop), wq on width (splat/store loop in the
   ; ipred_cfl_splat table).
   3196 cglobal ipred_cfl_left_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
   3197    movifnidn            hd, hm
   3198    LEA                  t0, ipred_cfl_left_16bpc_ssse3_table
   3199    tzcnt                wd, wm
   3200    lea                 r6d, [hq*2]
   3201    movd                 m4, hd
   3202    sub                 tlq, r6      ; tlq -= h*2: start of left column
   3203    tzcnt               r6d, hd      ; r6d = log2(h)
   3204    movd                 m5, r6d
   3205    movsxd               r6, [t0+r6*4]
   3206 .start:
   3207    movd                 m7, r7m
   3208    movu                 m0, [tlq]
   3209    add                  r6, t0
   3210    add                  t0, ipred_cfl_splat_16bpc_ssse3_table-ipred_cfl_left_16bpc_ssse3_table
   3211    movsxd               wq, [t0+wq*4]
   3212    pxor                 m6, m6
   3213    pshuflw              m7, m7, q0000
   3214    pcmpeqw              m3, m3      ; m3 = all ones, pmaddwd weight -1
   3215    add                  wq, t0
   3216    movifnidn           acq, acmp
   3217    pavgw                m4, m6      ; m4 = count/2 (rounding bias)
   3218    punpcklqdq           m7, m7
   3219    jmp                  r6
   ; .h32/.h16/.h8 accumulate the remaining vectors into m0, falling through.
   3220 .h32:
   3221    movu                 m1, [tlq+48]
   3222    movu                 m2, [tlq+32]
   3223    paddw                m0, m1
   3224    paddw                m0, m2
   3225 .h16:
   3226    movu                 m1, [tlq+16]
   3227    paddw                m0, m1
   3228 .h8:
   3229    pshufd               m1, m0, q1032
   3230    paddw                m0, m1
   3231 .h4:
   3232    pmaddwd              m0, m3      ; horizontal pair-sums, negated
   3233    psubd                m4, m0      ; bias - (-sum) = sum + count/2
   3234    pshuflw              m0, m4, q1032
   3235    paddd                m0, m4
   3236    psrld                m0, m5      ; (sum + count/2) >> log2(count)
   3237    pshuflw              m0, m0, q0000
   3238    punpcklqdq           m0, m0      ; splat DC into all 8 words
   3239    jmp                  wq
   3240 
   ; Apply CfL to one vector of AC coefficients:
   ;   dst = clamp(dc + (alpha * ac) >> 6, 0, pixel_max)
   ; Expects: m0 = DC splat, m1 = alpha splat (signed), m2 = |alpha| << 9
   ; (so pmulhrsw computes (|ac| * |alpha| + 32) >> 6 with rounding),
   ; m6 = 0, m7 = pixel max splat. The sign of alpha*ac is reapplied via the
   ; two psignw steps; note the src register is clobbered.
   3241 %macro IPRED_CFL 2 ; dst, src
   3242    pabsw               m%1, m%2     ; |ac|
   3243    pmulhrsw            m%1, m2      ; (|ac| * |alpha| + 32) >> 6
   3244    psignw              m%2, m1      ; ac signed by alpha's sign
   3245    psignw              m%1, m%2     ; restore sign of alpha*ac
   3246    paddw               m%1, m0      ; + DC
   3247    pmaxsw              m%1, m6      ; clamp below at 0
   3248    pminsw              m%1, m7      ; clamp above at pixel max
   3249 %endmacro
   3250 
   ; Full CfL predictor: DC is the rounded average of the w top samples plus
   ; the h left samples.  r6 dispatches to .h* (load/sum the left column into
   ; m0), which jumps to wq = .w* (add the top row, finish the average, then
   ; fall into the matching .s* store loop).
   ; m4 = (w+h)/2 rounding bias, m5 = log2(w+h) shift (exact only when w == h;
   ; rectangular blocks fix up the division with a fixed-point reciprocal:
   ; 0xAAAB ~ 2/3 and 0x6667 ~ 2/5 in 0.16 fixed point, selecting whichever
   ; matches the w:h ratio).  m3 = all-ones (pmaddwd weight -1), m6 = 0,
   ; m7 = pixel max splat from r7m (presumably bitdepth_max; verify caller).
   3251 cglobal ipred_cfl_16bpc, 4, 7, 8, dst, stride, tl, w, h, ac, alpha
   3252    movifnidn            hd, hm
   3253    tzcnt               r6d, hd
   3254    lea                 t0d, [wq+hq]
   3255    movd                 m4, t0d
   3256    tzcnt               t0d, t0d
   3257    movd                 m5, t0d
   3258    LEA                  t0, ipred_cfl_16bpc_ssse3_table
   3259    tzcnt                wd, wd
   3260    movd                 m7, r7m
   3261    movsxd               r6, [t0+r6*4]
   3262    movsxd               wq, [t0+wq*4+4*4]  ; .w* entries follow the .h* ones
   3263    psrlw                m4, 1       ; (w+h)/2 rounding bias
   3264    pxor                 m6, m6
   3265    pshuflw              m7, m7, q0000
   3266    add                  r6, t0
   3267    add                  wq, t0
   3268    movifnidn           acq, acmp
   3269    pcmpeqw              m3, m3
   3270    punpcklqdq           m7, m7
   3271    jmp                  r6
   3272 .h4:
   3273    movq                 m0, [tlq-8]
   3274    jmp                  wq
   3275 .w4:
   3276    movq                 m1, [tlq+2]
   3277    paddw                m0, m1
   3278    pmaddwd              m0, m3      ; pair-sums, negated
   3279    psubd                m4, m0      ; bias + sum
   3280    pshufd               m0, m4, q1032
   3281    paddd                m0, m4
   3282    pshuflw              m4, m0, q1032
   3283    paddd                m0, m4
   3284    cmp                  hd, 4
   3285    jg .w4_mul
   3286    psrld                m0, 3       ; w == h == 4: plain shift
   3287    jmp .w4_end
   3288 .w4_mul:
   ; Rectangular 4xh: divide by 12 (h=8) or 20 (h=16) via reciprocal multiply.
   3289    mov                 r6d, 0xAAAB
   3290    mov                 r2d, 0x6667
   3291    cmp                  hd, 16
   3292    cmove               r6d, r2d
   3293    movd                 m1, r6d
   3294    psrld                m0, 2
   3295    pmulhuw              m0, m1
   3296    psrlw                m0, 1
   3297 .w4_end:
   3298    pshuflw              m0, m0, q0000
   3299    punpcklqdq           m0, m0      ; splat DC
   3300 .s4:
   ; Store loop: m1 = alpha splat, m2 = |alpha|<<9 for IPRED_CFL.
   3301    movd                 m1, alpham
   3302    lea                  r6, [strideq*3]
   3303    pshuflw              m1, m1, q0000
   3304    punpcklqdq           m1, m1
   3305    pabsw                m2, m1
   3306    psllw                m2, 9
   3307 .s4_loop:
   3308    mova                 m4, [acq+16*0]
   3309    mova                 m5, [acq+16*1]
   3310    add                 acq, 16*2
   3311    IPRED_CFL             3, 4
   3312    IPRED_CFL             4, 5
   3313    movq   [dstq+strideq*0], m3
   3314    movhps [dstq+strideq*1], m3
   3315    movq   [dstq+strideq*2], m4
   3316    movhps [dstq+r6       ], m4
   3317    lea                dstq, [dstq+strideq*4]
   3318    sub                  hd, 4
   3319    jg .s4_loop
   3320    RET
   3321 .h8:
   3322    mova                 m0, [tlq-16]
   3323    jmp                  wq
   3324 .w8:
   3325    movu                 m1, [tlq+2]
   3326    paddw                m0, m1
   3327    pmaddwd              m0, m3
   3328    psubd                m4, m0
   3329    pshufd               m0, m4, q1032
   3330    paddd                m0, m4
   3331    pshuflw              m4, m0, q1032
   3332    paddd                m0, m4
   3333    psrld                m0, m5
   3334    cmp                  hd, 8
   3335    je .w8_end
   ; Rectangular 8xh fixup: correct the power-of-two shift by 2/3 (or 2/5).
   3336    mov                 r6d, 0xAAAB
   3337    mov                 r2d, 0x6667
   3338    cmp                  hd, 32
   3339    cmove               r6d, r2d
   3340    movd                 m1, r6d
   3341    pmulhuw              m0, m1
   3342    psrlw                m0, 1
   3343 .w8_end:
   3344    pshuflw              m0, m0, q0000
   3345    punpcklqdq           m0, m0
   3346 .s8:
   3347    movd                 m1, alpham
   3348    pshuflw              m1, m1, q0000
   3349    punpcklqdq           m1, m1
   3350    pabsw                m2, m1
   3351    psllw                m2, 9
   3352 .s8_loop:
   3353    mova                 m4, [acq+16*0]
   3354    mova                 m5, [acq+16*1]
   3355    add                 acq, 16*2
   3356    IPRED_CFL             3, 4
   3357    IPRED_CFL             4, 5
   3358    mova   [dstq+strideq*0], m3
   3359    mova   [dstq+strideq*1], m4
   3360    lea                dstq, [dstq+strideq*2]
   3361    sub                  hd, 2
   3362    jg .s8_loop
   3363    RET
   3364 .h16:
   3365    mova                 m0, [tlq-32]
   3366    paddw                m0, [tlq-16]
   3367    jmp                  wq
   3368 .w16:
   3369    movu                 m1, [tlq+ 2]
   3370    movu                 m2, [tlq+18]
   3371    paddw                m1, m2
   3372    paddw                m0, m1
   3373    pmaddwd              m0, m3
   3374    psubd                m4, m0
   3375    pshufd               m0, m4, q1032
   3376    paddd                m0, m4
   3377    pshuflw              m4, m0, q1032
   3378    paddd                m0, m4
   3379    psrld                m0, m5
   3380    cmp                  hd, 16
   3381    je .w16_end
   ; 16xh, h in {4,8,32,64}: pick the reciprocal by testing the h bits.
   3382    mov                 r6d, 0xAAAB
   3383    mov                 r2d, 0x6667
   3384    test                 hd, 8|32
   3385    cmovz               r6d, r2d
   3386    movd                 m1, r6d
   3387    pmulhuw              m0, m1
   3388    psrlw                m0, 1
   3389 .w16_end:
   3390    pshuflw              m0, m0, q0000
   3391    punpcklqdq           m0, m0
   3392 .s16:
   3393    movd                 m1, alpham
   3394    pshuflw              m1, m1, q0000
   3395    punpcklqdq           m1, m1
   3396    pabsw                m2, m1
   3397    psllw                m2, 9
   3398 .s16_loop:
   3399    mova                 m4, [acq+16*0]
   3400    mova                 m5, [acq+16*1]
   3401    add                 acq, 16*2
   3402    IPRED_CFL             3, 4
   3403    IPRED_CFL             4, 5
   3404    mova        [dstq+16*0], m3
   3405    mova        [dstq+16*1], m4
   3406    add                dstq, strideq
   3407    dec                  hd
   3408    jg .s16_loop
   3409    RET
   3410 .h32:
   3411    mova                 m0, [tlq-64]
   3412    paddw                m0, [tlq-48]
   3413    paddw                m0, [tlq-32]
   3414    paddw                m0, [tlq-16]
   3415    jmp                  wq
   3416 .w32:
   3417    movu                 m1, [tlq+ 2]
   3418    movu                 m2, [tlq+18]
   3419    paddw                m1, m2
   3420    movu                 m2, [tlq+34]
   3421    paddw                m1, m2
   3422    movu                 m2, [tlq+50]
   3423    paddw                m1, m2
   3424    paddw                m0, m1
   3425    pmaddwd              m0, m3
   3426    psubd                m4, m0
   3427    pshufd               m0, m4, q1032
   3428    paddd                m0, m4
   3429    pshuflw              m4, m0, q1032
   3430    paddd                m0, m4
   3431    psrld                m0, m5
   3432    cmp                  hd, 32
   3433    je .w32_end
   3434    mov                 r6d, 0xAAAB
   3435    mov                 r2d, 0x6667
   3436    cmp                  hd, 8
   3437    cmove               r6d, r2d
   3438    movd                 m1, r6d
   3439    pmulhuw              m0, m1
   3440    psrlw                m0, 1
   3441 .w32_end:
   3442    pshuflw              m0, m0, q0000
   3443    punpcklqdq           m0, m0
   3444 .s32:
   3445    movd                 m1, alpham
   3446    pshuflw              m1, m1, q0000
   3447    punpcklqdq           m1, m1
   3448    pabsw                m2, m1
   3449    psllw                m2, 9
   3450 .s32_loop:
   ; Two 16-pixel halves per row (32 pixels = 4 xmm stores).
   3451    mova                 m4, [acq+16*0]
   3452    mova                 m5, [acq+16*1]
   3453    IPRED_CFL             3, 4
   3454    IPRED_CFL             4, 5
   3455    mova        [dstq+16*0], m3
   3456    mova        [dstq+16*1], m4
   3457    mova                 m4, [acq+16*2]
   3458    mova                 m5, [acq+16*3]
   3459    add                 acq, 16*4
   3460    IPRED_CFL             3, 4
   3461    IPRED_CFL             4, 5
   3462    mova        [dstq+16*2], m3
   3463    mova        [dstq+16*3], m4
   3464    add                dstq, strideq
   3465    dec                  hd
   3466    jg .s32_loop
   3467    RET
   3468 
   ; CfL predictor with no available neighbours: the DC is the fixed mid-grey
   ; value for the current bitdepth.  r7m (presumably bitdepth_max) >> 11
   ; selects 512/1024/2048 from the pw_512 table, then the routine jumps
   ; straight into the shared .s* store loops via the splat table.
   3469 cglobal ipred_cfl_128_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac
   3470    tzcnt                wd, wm
   3471    LEA                  t0, ipred_cfl_splat_16bpc_ssse3_table
   3472    mov                 r6d, r7m
   3473    movifnidn            hd, hm
   3474    shr                 r6d, 11     ; 0, 1 or 2 for 10/12-bit max values
   3475    movd                 m7, r7m
   3476    movsxd               wq, [t0+wq*4]
   3477    movddup              m0, [t0-ipred_cfl_splat_16bpc_ssse3_table+pw_512+r6*8]
   3478    pshuflw              m7, m7, q0000
   3479    pxor                 m6, m6
   3480    add                  wq, t0
   3481    movifnidn           acq, acmp
   3482    punpcklqdq           m7, m7
   3483    jmp                  wq
   3484 
   ; CfL AC coefficients, 4:2:0 subsampling: each output word is
   ; 2*(sum of a 2x2 luma block), accumulated via pmaddwd with pw_2 and a
   ; vertical add of two rows.  wpad/hpad request replication of the last
   ; valid column/row; m4 accumulates the running sum (as dwords) so the
   ; final .dc pass can subtract the rounded mean, leaving zero-mean AC.
   ; .dc and .hpad are shared entry points also jumped to by the 422/444
   ; variants below.
   3485 cglobal ipred_cfl_ac_420_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
   3486    movifnidn         hpadd, hpadm
   3487 %if ARCH_X86_32 && PIC
   ; Synthesize the pw_2 constant without a memory load (no PIC base needed):
   ; all-ones -> abs = 1 -> doubled = 2 per word.
   3488    pcmpeqw              m5, m5
   3489    pabsw                m5, m5
   3490    paddw                m5, m5
   3491 %else
   3492    movddup              m5, [pw_2]
   3493 %endif
   3494    mov                  hd, hm
   3495    shl               hpadd, 2
   3496    pxor                 m4, m4      ; m4 = sum accumulator
   3497    sub                  hd, hpadd
   3498    cmp            dword wm, 8
   3499    mov                  r5, acq     ; remember start of ac for .dc
   3500    jg .w16
   3501    je .w8
   3502    lea                  r3, [strideq*3]
   3503 .w4_loop:
   3504    pmaddwd              m0, m5, [ypxq+strideq*0]
   3505    pmaddwd              m1, m5, [ypxq+strideq*1]
   3506    pmaddwd              m2, m5, [ypxq+strideq*2]
   3507    pmaddwd              m3, m5, [ypxq+r3       ]
   3508    lea                ypxq, [ypxq+strideq*4]
   3509    paddd                m0, m1
   3510    paddd                m2, m3
   3511    paddd                m4, m0
   3512    packssdw             m0, m2      ; two output rows of 4 words
   3513    paddd                m4, m2
   3514    mova              [acq], m0
   3515    add                 acq, 16
   3516    sub                  hd, 2
   3517    jg .w4_loop
   3518    test              hpadd, hpadd
   3519    jz .dc
   ; Bottom padding: replicate the last row, keep the sum consistent.
   3520    punpckhqdq           m0, m0
   3521    pslld                m2, 2       ; last row sum * 4 (rows added per iter)
   3522 .w4_hpad:
   3523    mova         [acq+16*0], m0
   3524    paddd                m4, m2
   3525    mova         [acq+16*1], m0
   3526    add                 acq, 16*2
   3527    sub               hpadd, 4
   3528    jg .w4_hpad
   3529    jmp .dc
   3530 .w8:
   3531 %if ARCH_X86_32
   3532    cmp         dword wpadm, 0
   3533 %else
   3534    test              wpadd, wpadd
   3535 %endif
   3536    jnz .w8_wpad1
   3537 .w8_loop:
   3538    pmaddwd              m0, m5, [ypxq+strideq*0+16*0]
   3539    pmaddwd              m2, m5, [ypxq+strideq*1+16*0]
   3540    pmaddwd              m1, m5, [ypxq+strideq*0+16*1]
   3541    pmaddwd              m3, m5, [ypxq+strideq*1+16*1]
   3542    lea                ypxq, [ypxq+strideq*2]
   3543    paddd                m0, m2
   3544    paddd                m1, m3
   3545    paddd                m2, m0, m1  ; row sum kept for possible hpad
   3546    packssdw             m0, m1
   3547    paddd                m4, m2
   3548    mova              [acq], m0
   3549    add                 acq, 16
   3550    dec                  hd
   3551    jg .w8_loop
   3552 .w8_hpad:
   3553    test              hpadd, hpadd
   3554    jz .dc
   3555    pslld                m2, 2
   3556    mova                 m1, m0
   3557    jmp .hpad
   3558 .w8_wpad1:
   ; Right padding by one 4-wide chunk: replicate the last computed column.
   3559    pmaddwd              m0, m5, [ypxq+strideq*0]
   3560    pmaddwd              m1, m5, [ypxq+strideq*1]
   3561    lea                ypxq, [ypxq+strideq*2]
   3562    paddd                m0, m1
   3563    pshufd               m1, m0, q3333
   3564    paddd                m2, m0, m1
   3565    packssdw             m0, m1
   3566    paddd                m4, m2
   3567    mova              [acq], m0
   3568    add                 acq, 16
   3569    dec                  hd
   3570    jg .w8_wpad1
   3571    jmp .w8_hpad
   ; .w16 right-padding variants: replicate the last valid dword column into
   ; the missing 4-wide chunks (3, 2 or 1 chunks padded respectively).
   3572 .w16_wpad3:
   3573    pshufd               m3, m0, q3333
   3574    mova                 m1, m3
   3575    mova                 m2, m3
   3576    jmp .w16_wpad_end
   3577 .w16_wpad2:
   3578    pshufd               m1, m3, q3333
   3579    mova                 m2, m1
   3580    jmp .w16_wpad_end
   3581 .w16_wpad1:
   3582    pshufd               m2, m1, q3333
   3583    jmp .w16_wpad_end
   3584 .w16:
   3585    movifnidn         wpadd, wpadm
   3586    WIN64_SPILL_XMM       7
   3587 .w16_loop:
   3588    pmaddwd              m0, m5, [ypxq+strideq*0+16*0]
   3589    pmaddwd              m6, m5, [ypxq+strideq*1+16*0]
   3590    paddd                m0, m6
   3591    cmp               wpadd, 2
   3592    jg .w16_wpad3
   3593    pmaddwd              m3, m5, [ypxq+strideq*0+16*1]
   3594    pmaddwd              m6, m5, [ypxq+strideq*1+16*1]
   3595    paddd                m3, m6
   3596    je .w16_wpad2
   3597    pmaddwd              m1, m5, [ypxq+strideq*0+16*2]
   3598    pmaddwd              m6, m5, [ypxq+strideq*1+16*2]
   3599    paddd                m1, m6
   ; PF from the earlier "cmp wpadd, 2" is set when wpad == 1 (0xFF low byte).
   3600    jp .w16_wpad1
   3601    pmaddwd              m2, m5, [ypxq+strideq*0+16*3]
   3602    pmaddwd              m6, m5, [ypxq+strideq*1+16*3]
   3603    paddd                m2, m6
   3604 .w16_wpad_end:
   3605    lea                ypxq, [ypxq+strideq*2]
   3606    paddd                m6, m0, m3
   3607    packssdw             m0, m3
   3608    paddd                m6, m1
   3609    mova         [acq+16*0], m0
   3610    packssdw             m1, m2
   3611    paddd                m2, m6      ; m2 = full row sum (reused by .hpad)
   3612    mova         [acq+16*1], m1
   3613    add                 acq, 16*2
   3614    paddd                m4, m2
   3615    dec                  hd
   3616    jg .w16_loop
   3617    WIN64_RESTORE_XMM
   3618    add               hpadd, hpadd
   3619    jz .dc
   3620    paddd                m2, m2
   ; Shared bottom-padding loop: replicate last output row (m0:m1), adding
   ; its precomputed sum (m2) per replicated row pair.
   3621 .hpad:
   3622    mova         [acq+16*0], m0
   3623    mova         [acq+16*1], m1
   3624    paddd                m4, m2
   3625    mova         [acq+16*2], m0
   3626    mova         [acq+16*3], m1
   3627    add                 acq, 16*4
   3628    sub               hpadd, 4
   3629    jg .hpad
   ; Shared DC-removal pass: subtract the rounded mean from every coefficient.
   3630 .dc:
   3631    sub                  r5, acq ; -w*h*2
   3632    pshufd               m2, m4, q1032
   3633    tzcnt               r1d, r5d    ; log2(w*h) + 1
   3634    paddd                m2, m4
   3635    sub                 r1d, 2      ; log2(w*h) - 1
   3636    pshufd               m4, m2, q2301
   3637    movd                 m0, r1d
   3638    paddd                m2, m4      ; horizontal reduction of the sum
   3639    psrld                m2, m0
   3640    pxor                 m0, m0
   3641    pavgw                m2, m0      ; +1 >> 1: rounded mean
   3642    packssdw             m2, m2
   3643 .dc_loop:
   3644    mova                 m0, [acq+r5+16*0]
   3645    mova                 m1, [acq+r5+16*1]
   3646    psubw                m0, m2
   3647    psubw                m1, m2
   3648    mova      [acq+r5+16*0], m0
   3649    mova      [acq+r5+16*1], m1
   3650    add                  r5, 16*2
   3651    jl .dc_loop
   3652    RET
   3653 
   ; CfL AC coefficients, 4:2:2 subsampling: each output word is
   ; 4*(sum of a horizontal luma pair) via pmaddwd with pw_4 - the same
   ; scale as the 420 variant's 2*(2x2 sum).  Padding and the final
   ; mean-subtraction reuse the 420 function's .hpad/.dc entry points.
   3654 cglobal ipred_cfl_ac_422_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
   3655    movifnidn         hpadd, hpadm
   3656 %if ARCH_X86_32 && PIC
   ; Synthesize pw_4 without a memory load: all-ones -> abs = 1 -> << 2 = 4.
   3657    pcmpeqw              m5, m5
   3658    pabsw                m5, m5
   3659    psllw                m5, 2
   3660 %else
   3661    movddup              m5, [pw_4]
   3662 %endif
   3663    mov                  hd, hm
   3664    shl               hpadd, 2
   3665    pxor                 m4, m4      ; m4 = sum accumulator
   3666    sub                  hd, hpadd
   3667    cmp            dword wm, 8
   3668    mov                  r5, acq
   3669    jg .w16
   3670    je .w8
   3671    lea                  r3, [strideq*3]
   3672 .w4_loop:
   3673    pmaddwd              m0, m5, [ypxq+strideq*0]
   3674    pmaddwd              m3, m5, [ypxq+strideq*1]
   3675    pmaddwd              m1, m5, [ypxq+strideq*2]
   3676    pmaddwd              m2, m5, [ypxq+r3       ]
   3677    lea                ypxq, [ypxq+strideq*4]
   3678    paddd                m4, m0
   3679    packssdw             m0, m3
   3680    paddd                m3, m1
   3681    packssdw             m1, m2
   3682    paddd                m4, m2
   3683    paddd                m4, m3
   3684    mova         [acq+16*0], m0
   3685    mova         [acq+16*1], m1
   3686    add                 acq, 16*2
   3687    sub                  hd, 4
   3688    jg .w4_loop
   3689    test              hpadd, hpadd
   3690    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
   ; Bottom padding: replicate the final row of m1, sum scaled by 8 rows.
   3691    punpckhqdq           m1, m1
   3692    pslld                m2, 3
   3693    mova         [acq+16*0], m1
   3694    mova         [acq+16*1], m1
   3695    paddd                m4, m2
   3696    mova         [acq+16*2], m1
   3697    mova         [acq+16*3], m1
   3698    add                 acq, 16*4
   3699    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
   3700 .w8:
   3701 %if ARCH_X86_32
   3702    cmp         dword wpadm, 0
   3703 %else
   3704    test              wpadd, wpadd
   3705 %endif
   3706    jnz .w8_wpad1
   3707 .w8_loop:
   3708    pmaddwd              m0, m5, [ypxq+strideq*0+16*0]
   3709    pmaddwd              m2, m5, [ypxq+strideq*0+16*1]
   3710    pmaddwd              m1, m5, [ypxq+strideq*1+16*0]
   3711    pmaddwd              m3, m5, [ypxq+strideq*1+16*1]
   3712    lea                ypxq, [ypxq+strideq*2]
   3713    paddd                m4, m0
   3714    packssdw             m0, m2
   3715    paddd                m4, m2
   3716    mova         [acq+16*0], m0
   3717    paddd                m2, m1, m3  ; last row sum kept for .hpad
   3718    packssdw             m1, m3
   3719    paddd                m4, m2
   3720    mova         [acq+16*1], m1
   3721    add                 acq, 16*2
   3722    sub                  hd, 2
   3723    jg .w8_loop
   3724 .w8_hpad:
   3725    test              hpadd, hpadd
   3726    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
   3727    pslld                m2, 2
   3728    mova                 m0, m1
   3729    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
   3730 .w8_wpad1:
   ; Right padding: replicate the last valid dword column per row.
   3731    pmaddwd              m0, m5, [ypxq+strideq*0]
   3732    pmaddwd              m1, m5, [ypxq+strideq*1]
   3733    lea                ypxq, [ypxq+strideq*2]
   3734    pshufd               m2, m0, q3333
   3735    pshufd               m3, m1, q3333
   3736    paddd                m4, m0
   3737    packssdw             m0, m2
   3738    paddd                m4, m2
   3739    paddd                m2, m1, m3
   3740    packssdw             m1, m3
   3741    paddd                m4, m2
   3742    mova         [acq+16*0], m0
   3743    mova         [acq+16*1], m1
   3744    add                 acq, 16*2
   3745    sub                  hd, 2
   3746    jg .w8_wpad1
   3747    jmp .w8_hpad
   ; .w16 right-padding variants, as in the 420 function.
   3748 .w16_wpad3:
   3749    pshufd               m3, m0, q3333
   3750    mova                 m1, m3
   3751    mova                 m2, m3
   3752    jmp .w16_wpad_end
   3753 .w16_wpad2:
   3754    pshufd               m1, m3, q3333
   3755    mova                 m2, m1
   3756    jmp .w16_wpad_end
   3757 .w16_wpad1:
   3758    pshufd               m2, m1, q3333
   3759    jmp .w16_wpad_end
   3760 .w16:
   3761    movifnidn         wpadd, wpadm
   3762    WIN64_SPILL_XMM       7
   3763 .w16_loop:
   3764    pmaddwd              m0, m5, [ypxq+16*0]
   3765    cmp               wpadd, 2
   3766    jg .w16_wpad3
   3767    pmaddwd              m3, m5, [ypxq+16*1]
   3768    je .w16_wpad2
   3769    pmaddwd              m1, m5, [ypxq+16*2]
   ; PF from the earlier "cmp wpadd, 2" is set when wpad == 1.
   3770    jp .w16_wpad1
   3771    pmaddwd              m2, m5, [ypxq+16*3]
   3772 .w16_wpad_end:
   3773    add                ypxq, strideq
   3774    paddd                m6, m0, m3
   3775    packssdw             m0, m3
   3776    mova         [acq+16*0], m0
   3777    paddd                m6, m1
   3778    packssdw             m1, m2
   3779    paddd                m2, m6      ; m2 = full row sum (reused by .hpad)
   3780    mova         [acq+16*1], m1
   3781    add                 acq, 16*2
   3782    paddd                m4, m2
   3783    dec                  hd
   3784    jg .w16_loop
   3785    WIN64_RESTORE_XMM
   3786    add               hpadd, hpadd
   3787    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
   3788    paddd                m2, m2
   3789    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
   3790 
   ; CfL AC coefficients, 4:4:4 (no subsampling): each luma sample is simply
   ; scaled by 8 (psllw 3), matching the scale of the 420/422 variants.  The
   ; running sum in m4 is built by pmaddwd with pw_1 on the already-shifted
   ; values.  Dispatches on width via a jump table; padding and the final
   ; mean-subtraction reuse the 420 function's .hpad/.dc entry points.
   3791 cglobal ipred_cfl_ac_444_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
   3792 %define base r6-ipred_cfl_ac_444_16bpc_ssse3_table
   3793    LEA                  r6, ipred_cfl_ac_444_16bpc_ssse3_table
   3794    tzcnt                wd, wm
   3795    movifnidn         hpadd, hpadm
   3796    pxor                 m4, m4      ; m4 = sum accumulator
   3797    movsxd               wq, [r6+wq*4]
   3798    movddup              m5, [base+pw_1]
   3799    add                  wq, r6
   3800    mov                  hd, hm
   3801    shl               hpadd, 2
   3802    sub                  hd, hpadd
   3803    jmp                  wq
   3804 .w4:
   3805    lea                  r3, [strideq*3]
   3806    mov                  r5, acq     ; remember start of ac for .dc
   3807 .w4_loop:
   3808    movq                 m0, [ypxq+strideq*0]
   3809    movhps               m0, [ypxq+strideq*1]
   3810    movq                 m1, [ypxq+strideq*2]
   3811    movhps               m1, [ypxq+r3       ]
   3812    lea                ypxq, [ypxq+strideq*4]
   3813    psllw                m0, 3
   3814    psllw                m1, 3
   3815    mova         [acq+16*0], m0
   3816    pmaddwd              m0, m5
   3817    mova         [acq+16*1], m1
   3818    pmaddwd              m2, m5, m1
   3819    add                 acq, 16*2
   3820    paddd                m4, m0
   3821    paddd                m4, m2
   3822    sub                  hd, 4
   3823    jg .w4_loop
   3824    test              hpadd, hpadd
   3825    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
   ; Bottom padding: replicate the last row, add its sum per padded row pair.
   3826    punpckhqdq           m1, m1
   3827    mova         [acq+16*0], m1
   3828    pslld                m2, 2
   3829    mova         [acq+16*1], m1
   3830    punpckhqdq           m2, m2
   3831    mova         [acq+16*2], m1
   3832    paddd                m4, m2
   3833    mova         [acq+16*3], m1
   3834    add                 acq, 16*4
   3835    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
   3836 .w8:
   3837    mov                  r5, acq
   3838 .w8_loop:
   3839    mova                 m0, [ypxq+strideq*0]
   3840    mova                 m1, [ypxq+strideq*1]
   3841    lea                ypxq, [ypxq+strideq*2]
   3842    psllw                m0, 3
   3843    psllw                m1, 3
   3844    mova         [acq+16*0], m0
   3845    pmaddwd              m0, m5
   3846    mova         [acq+16*1], m1
   3847    pmaddwd              m2, m5, m1
   3848    add                 acq, 16*2
   3849    paddd                m4, m0
   3850    paddd                m4, m2
   3851    sub                  hd, 2
   3852    jg .w8_loop
   3853 .w8_hpad:
   3854    test              hpadd, hpadd
   3855    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
   3856    pslld                m2, 2
   3857    mova                 m0, m1
   3858    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
   3859 .w16_wpad2:
   ; Right padding: broadcast the last valid word into the missing half.
   3860    pshufhw              m3, m2, q3333
   3861    pshufhw              m1, m0, q3333
   3862    punpckhqdq           m3, m3
   3863    punpckhqdq           m1, m1
   3864    jmp .w16_wpad_end
   3865 .w16:
   3866    movifnidn         wpadd, wpadm
   3867    mov                  r5, acq
   3868 .w16_loop:
   3869    mova                 m2, [ypxq+strideq*0+16*0]
   3870    mova                 m0, [ypxq+strideq*1+16*0]
   3871    psllw                m2, 3
   3872    psllw                m0, 3
   3873    test              wpadd, wpadd
   3874    jnz .w16_wpad2
   3875    mova                 m3, [ypxq+strideq*0+16*1]
   3876    mova                 m1, [ypxq+strideq*1+16*1]
   3877    psllw                m3, 3
   3878    psllw                m1, 3
   3879 .w16_wpad_end:
   3880    lea                ypxq, [ypxq+strideq*2]
   3881    mova         [acq+16*0], m2
   3882    pmaddwd              m2, m5
   3883    mova         [acq+16*1], m3
   3884    pmaddwd              m3, m5
   3885    paddd                m4, m2
   3886    pmaddwd              m2, m5, m0
   3887    mova         [acq+16*2], m0
   3888    paddd                m4, m3
   3889    pmaddwd              m3, m5, m1
   3890    mova         [acq+16*3], m1
   3891    add                 acq, 16*4
   3892    paddd                m2, m3      ; m2 = last row sum (reused by .hpad)
   3893    paddd                m4, m2
   3894    sub                  hd, 2
   3895    jg .w16_loop
   3896    add               hpadd, hpadd
   3897    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
   3898    paddd                m2, m2
   3899    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
   ; .w32 right-padding variants: broadcast the last valid word into the
   ; missing 8-wide chunks (6, 4 or 2 columns of 4 padded respectively).
   3900 .w32_wpad6:
   3901    pshufhw              m1, m0, q3333
   3902    punpckhqdq           m1, m1
   3903    mova                 m2, m1
   3904    mova                 m3, m1
   3905    jmp .w32_wpad_end
   3906 .w32_wpad4:
   3907    pshufhw              m2, m1, q3333
   3908    punpckhqdq           m2, m2
   3909    mova                 m3, m2
   3910    jmp .w32_wpad_end
   3911 .w32_wpad2:
   3912    pshufhw              m3, m2, q3333
   3913    punpckhqdq           m3, m3
   3914    jmp .w32_wpad_end
   3915 .w32:
   3916    movifnidn         wpadd, wpadm
   3917    mov                  r5, acq
   3918    WIN64_SPILL_XMM       8
   3919 .w32_loop:
   3920    mova                 m0, [ypxq+16*0]
   3921    psllw                m0, 3
   3922    cmp               wpadd, 4
   3923    jg .w32_wpad6
   3924    mova                 m1, [ypxq+16*1]
   3925    psllw                m1, 3
   3926    je .w32_wpad4
   3927    mova                 m2, [ypxq+16*2]
   3928    psllw                m2, 3
   ; PF from the earlier "cmp wpadd, 4" is clear when wpad == 2.
   3929    jnp .w32_wpad2
   3930    mova                 m3, [ypxq+16*3]
   3931    psllw                m3, 3
   3932 .w32_wpad_end:
   3933    add                ypxq, strideq
   3934    pmaddwd              m6, m5, m0
   3935    mova         [acq+16*0], m0
   3936    pmaddwd              m7, m5, m1
   3937    mova         [acq+16*1], m1
   3938    paddd                m6, m7
   3939    pmaddwd              m7, m5, m2
   3940    mova         [acq+16*2], m2
   3941    paddd                m6, m7
   3942    pmaddwd              m7, m5, m3
   3943    mova         [acq+16*3], m3
   3944    add                 acq, 16*4
   3945    paddd                m6, m7      ; m6 = row sum
   3946    paddd                m4, m6
   3947    dec                  hd
   3948    jg .w32_loop
   3949 %if WIN64
   ; Keep the row sum across WIN64_RESTORE_XMM, which reloads xmm6/xmm7.
   3950    mova                 m5, m6
   3951    WIN64_RESTORE_XMM
   3952    SWAP                  5, 6
   3953 %endif
   3954    test              hpadd, hpadd
   3955    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
   3956 .w32_hpad_loop:
   3957    mova         [acq+16*0], m0
   3958    mova         [acq+16*1], m1
   3959    paddd                m4, m6
   3960    mova         [acq+16*2], m2
   3961    mova         [acq+16*3], m3
   3962    add                 acq, 16*4
   3963    dec               hpadd
   3964    jg .w32_hpad_loop
   3965    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
   3966 
;-----------------------------------------------------------------------
; void pal_pred_16bpc(pixel *dst, ptrdiff_t stride, const pixel *pal,
;                     const uint8_t *idx, int w, int h)
; Palette prediction, 16 bpc: expand packed 4-bit palette indices (two
; per idx byte) into 16-bit pixels through an 8-entry palette, with one
; width-specialized loop per block width (4..64), dispatched via a jump
; table indexed by log2(w).
;-----------------------------------------------------------------------
   3967 cglobal pal_pred_16bpc, 4, 5, 6, dst, stride, pal, idx, w, h
   3968 %define base r2-pal_pred_16bpc_ssse3_table
   3969 %if ARCH_X86_32
   3970    %define              hd  r2d              ; x86-32 is register-starved: reuse r2d as the height counter
   3971 %endif
   3972    mova                 m4, [palq]           ; load the 8 16-bit palette entries
   3973    LEA                  r2, pal_pred_16bpc_ssse3_table
   3974    tzcnt                wd, wm               ; wd = log2(w)
   3975    pshufb               m4, [base+pal_pred_shuf] ; m4: low bytes of all entries in one half, high bytes in the other
   3976    movsxd               wq, [r2+wq*4]
   3977    pshufd               m5, m4, q1032        ; m5: same with 64-bit halves swapped (high bytes in the low half)
   3978    add                  wq, r2
   3979    movifnidn            hd, hm
   3980    jmp                  wq                   ; dispatch to the per-width loop
; Common pattern in every width path below: psrlw-by-4 + punpcklbw turn
; the packed nibbles into one index byte per pixel. Stray neighbour-index
; bits left in bits 4-6 of each byte are harmless because pshufb only
; honours bit 7 (zeroing; never set here since indices are < 8) and bits
; 0-3 (selection). pshufb on m4 fetches each pixel's low byte, pshufb on
; m5 its high byte, and punpck{l,h}bw reassembles the 16-bit pixels.
   3981 .w4:
; 8 idx bytes = 16 indices = 4 rows of 4 pixels per iteration.
   3982    movq                 m0, [idxq]
   3983    add                idxq, 8
   3984    psrlw                m1, m0, 4
   3985    punpcklbw            m0, m1               ; one index byte per pixel
   3986    pshufb               m1, m4, m0           ; low bytes of the selected entries
   3987    pshufb               m2, m5, m0           ; high bytes of the selected entries
   3988    punpcklbw            m0, m1, m2           ; pixels for rows 0-1
   3989    punpckhbw            m1, m2               ; pixels for rows 2-3
   3990    movq   [dstq+strideq*0], m0
   3991    movhps [dstq+strideq*1], m0
   3992    lea                dstq, [dstq+strideq*2]
   3993    movq   [dstq+strideq*0], m1
   3994    movhps [dstq+strideq*1], m1
   3995    lea                dstq, [dstq+strideq*2]
   3996    sub                  hd, 4
   3997    jg .w4
   3998    RET
   3999 .w8:
; 16 idx bytes = 32 indices = 4 rows of 8 pixels per iteration.
   4000    movu                 m3, [idxq]
   4001    add                idxq, 16
   4002    psrlw                m1, m3, 4
   4003    punpcklbw            m0, m3, m1           ; index bytes for rows 0-1
   4004    punpckhbw            m3, m1               ; index bytes for rows 2-3
   4005    pshufb               m1, m4, m0
   4006    pshufb               m2, m5, m0
   4007    punpcklbw            m0, m1, m2
   4008    punpckhbw            m1, m2
   4009    mova   [dstq+strideq*0], m0
   4010    mova   [dstq+strideq*1], m1
   4011    lea                dstq, [dstq+strideq*2]
   4012    pshufb               m1, m4, m3
   4013    pshufb               m2, m5, m3
   4014    punpcklbw            m0, m1, m2
   4015    punpckhbw            m1, m2
   4016    mova   [dstq+strideq*0], m0
   4017    mova   [dstq+strideq*1], m1
   4018    lea                dstq, [dstq+strideq*2]
   4019    sub                  hd, 4
   4020    jg .w8
   4021    RET
   4022 .w16:
; 16 idx bytes = 32 indices = 2 rows of 16 pixels per iteration.
   4023    movu                 m3, [idxq]
   4024    add                idxq, 16
   4025    psrlw                m1, m3, 4
   4026    punpcklbw            m0, m3, m1           ; index bytes for row 0
   4027    punpckhbw            m3, m1               ; index bytes for row 1
   4028    pshufb               m1, m4, m0
   4029    pshufb               m2, m5, m0
   4030    punpcklbw            m0, m1, m2
   4031    punpckhbw            m1, m2
   4032    mova          [dstq+ 0], m0
   4033    mova          [dstq+16], m1
   4034    pshufb               m1, m4, m3
   4035    pshufb               m2, m5, m3
   4036    punpcklbw            m0, m1, m2
   4037    punpckhbw            m1, m2
   4038    mova  [dstq+strideq+ 0], m0
   4039    mova  [dstq+strideq+16], m1
   4040    lea                dstq, [dstq+strideq*2]
   4041    sub                  hd, 2
   4042    jg .w16
   4043    RET
   4044 .w32:
; 16 idx bytes = 32 indices = 1 row of 32 pixels per iteration.
   4045    movu                 m3, [idxq]
   4046    add                idxq, 16
   4047    psrlw                m1, m3, 4
   4048    punpcklbw            m0, m3, m1           ; index bytes for pixels 0-15
   4049    punpckhbw            m3, m1               ; index bytes for pixels 16-31
   4050    pshufb               m1, m4, m0
   4051    pshufb               m2, m5, m0
   4052    punpcklbw            m0, m1, m2
   4053    punpckhbw            m1, m2
   4054    mova        [dstq+16*0], m0
   4055    mova        [dstq+16*1], m1
   4056    pshufb               m1, m4, m3
   4057    pshufb               m2, m5, m3
   4058    punpcklbw            m0, m1, m2
   4059    punpckhbw            m1, m2
   4060    mova        [dstq+16*2], m0
   4061    mova        [dstq+16*3], m1
   4062    add                dstq, strideq
   4063    dec                  hd
   4064    jg .w32
   4065    RET
   4066 .w64:
; 32 idx bytes = 64 indices = 1 row of 64 pixels per iteration; the second
; idx load is interleaved mid-loop to hide its latency behind the shuffles.
   4067    movu                 m3, [idxq+16*0]
   4068    psrlw                m1, m3, 4
   4069    punpcklbw            m0, m3, m1           ; index bytes for pixels 0-15
   4070    punpckhbw            m3, m1               ; index bytes for pixels 16-31
   4071    pshufb               m1, m4, m0
   4072    pshufb               m2, m5, m0
   4073    punpcklbw            m0, m1, m2
   4074    punpckhbw            m1, m2
   4075    mova        [dstq+16*0], m0
   4076    mova        [dstq+16*1], m1
   4077    pshufb               m1, m4, m3
   4078    pshufb               m2, m5, m3
   4079    movu                 m3, [idxq+16*1]      ; start loading the second half early
   4080    add                idxq, 32
   4081    punpcklbw            m0, m1, m2
   4082    punpckhbw            m1, m2
   4083    mova        [dstq+16*2], m0
   4084    mova        [dstq+16*3], m1
   4085    psrlw                m1, m3, 4
   4086    punpcklbw            m0, m3, m1           ; index bytes for pixels 32-47
   4087    punpckhbw            m3, m1               ; index bytes for pixels 48-63
   4088    pshufb               m1, m4, m0
   4089    pshufb               m2, m5, m0
   4090    punpcklbw            m0, m1, m2
   4091    punpckhbw            m1, m2
   4092    mova        [dstq+16*4], m0
   4093    mova        [dstq+16*5], m1
   4094    pshufb               m1, m4, m3
   4095    pshufb               m2, m5, m3
   4096    punpcklbw            m0, m1, m2
   4097    punpckhbw            m1, m2
   4098    mova        [dstq+16*6], m0
   4099    mova        [dstq+16*7], m1
   4100    add                dstq, strideq
   4101    dec                  hd
   4102    jg .w64
   4103    RET