tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

ipred_avx2.asm (186558B)


      1 ; Copyright © 2018-2021, VideoLAN and dav1d authors
      2 ; Copyright © 2018, Two Orioles, LLC
      3 ; All rights reserved.
      4 ;
      5 ; Redistribution and use in source and binary forms, with or without
      6 ; modification, are permitted provided that the following conditions are met:
      7 ;
      8 ; 1. Redistributions of source code must retain the above copyright notice, this
      9 ;    list of conditions and the following disclaimer.
     10 ;
     11 ; 2. Redistributions in binary form must reproduce the above copyright notice,
     12 ;    this list of conditions and the following disclaimer in the documentation
     13 ;    and/or other materials provided with the distribution.
     14 ;
     15 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     16 ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     17 ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     18 ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     19 ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     20 ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     21 ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     22 ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     24 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25 
     26 %include "config.asm"
     27 %include "ext/x86/x86inc.asm"
     28 
     29 %if ARCH_X86_64
     30 
     31 SECTION_RODATA 64
     32 
        ; Emit one (w-128, 127-w) byte pair per argument. Storing the weights
        ; with these offsets keeps both operands in signed byte range so the
        ; smooth predictors can use pmaddubsw (see comment at the use site).
     33 %macro SMOOTH_WEIGHT_TABLE 1-* ; one or more weight values w
     34    %rep %0
     35        db %1-128, 127-%1 ; pair: (w-128, 127-w)
     36        %rotate 1
     37    %endrep
     38 %endmacro
     39 
     40 ; sm_weights[], but modified to precalculate x and 256-x with offsets to
     41 ; enable efficient use of pmaddubsw (which requires signed values)
     42 smooth_weights: SMOOTH_WEIGHT_TABLE         \
     43      0,   0, 255, 128, 255, 149,  85,  64, \
     44    255, 197, 146, 105,  73,  50,  37,  32, \
     45    255, 225, 196, 170, 145, 123, 102,  84, \
     46     68,  54,  43,  33,  26,  20,  17,  16, \
     47    255, 240, 225, 210, 196, 182, 169, 157, \
     48    145, 133, 122, 111, 101,  92,  83,  74, \
     49     66,  59,  52,  45,  39,  34,  29,  25, \
     50     21,  17,  14,  12,  10,   9,   8,   8, \
     51    255, 248, 240, 233, 225, 218, 210, 203, \
     52    196, 189, 182, 176, 169, 163, 156, 150, \
     53    144, 138, 133, 127, 121, 116, 111, 106, \
     54    101,  96,  91,  86,  82,  77,  73,  69, \
     55     65,  61,  57,  54,  50,  47,  44,  41, \
     56     38,  35,  32,  29,  27,  25,  22,  20, \
     57     18,  16,  15,  13,  12,  10,   9,   8, \
     58      7,   6,   6,   5,   5,   4,   4,   4
     59 
     60 pb_1to32:     db  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16
     61              db 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
     62 pb_32to1:     db 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17
     63 pb_16to1:     db 16, 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1
     64 z_filter_wh:  db  7,  7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
     65              db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1
     66 z_filter_k:   db  0, 16,  0, 16,  0, 20,  0, 20,  8, 16,  8, 16
     67              db 32, 16, 32, 16, 24, 20, 24, 20, 16, 16, 16, 16
     68              db  0,  0,  0,  0,  0,  0,  0,  0,  8,  0,  8,  0
     69 const \
     70 z_filter_s,   db  0,  0,  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7
     71              db  7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15
     72              db 15, 15, 15, 15, 15, 15, 15, 15 ; should be in one cache line
     73 pb_128:       times 4 db 128 ; those are just placed here for alignment.
     74 pb_36_m4:     times 2 db 36, -4
     75 z3_shuf:      db  8,  7,  7,  6,  6,  5,  5,  4,  4,  3,  3,  2,  2,  1,  1,  0
     76 z_filter_t0:  db 55,127, 39,127, 39,127,  7, 15, 31,  7, 15, 31,  0,  3, 31,  0
     77 z_filter_t1:  db 39, 63, 19, 47, 19, 47,  3,  3,  3,  3,  3,  3,  0,  0,  0,  0
     78 z_upsample1:  db  1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
     79 z_upsample2:  db  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  8,  8,  8
     80 z2_upsample:  db  7,  6, 15, 14,  5,  4, 13, 12,  3,  2, 11, 10,  1,  0,  9,  8
     81 z1_shuf_w4:   db  0,  1,  1,  2,  2,  3,  3,  4,  8,  9,  9, 10, 10, 11, 11, 12
     82 z2_shuf_h2:   db  3,  2,  7,  6, 11, 10, 15, 14,  2,  1,  6,  5, 10,  9, 14, 13
     83 z2_shuf_h4:   db  7,  6, 15, 14,  6,  5, 14, 13,  5,  4, 13, 12,  4,  3, 12, 11
     84 z3_shuf_w4:   db  4,  3,  3,  2,  2,  1,  1,  0, 12, 11, 11, 10, 10,  9,  9,  8
     85 z_transpose4: db  0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
     86 z_base_inc:   dw   0*64,   1*64,   2*64,   3*64,   4*64,   5*64,   6*64,   7*64
     87              dw  16*64,  17*64,  18*64,  19*64,  20*64,  21*64,  22*64,  23*64
     88 z2_base_inc:  dw   1*64,   2*64,   3*64,   4*64,   5*64,   6*64,   7*64,   8*64
     89              dw   9*64,  10*64,  11*64,  12*64,  13*64,  14*64,  15*64,  16*64
     90 z2_ymul:      dw  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16
     91 z2_y_shuf_h4: db 90, 90, 90, 90, 14, 14, 14, 14, 27, 27, 27, 27, 31, 31, 31, 31 ; 2, 6, 3, 7
     92              db 32, 32, 32, 32, 12, 12, 12, 12,  1,  0,  1,  0,  5, -1, -1, -1 ; 0, 4, 1, 5
     93 ; vpermd indices in bits 4..6 of filter_shuf1: 0, 2, 6, 4, 1, 3, 7, 5
     94 filter_shuf1: db 10,  4, 10,  4, 37,  6,  5,  6,103,  9,  7,  9, 72, -1,  8, -1
     95              db 16,  4,  0,  4, 53,  6,  5,  6,119, 11,  7, 11, 95, -1, 15, -1
     96 filter_shuf2: db  3,  4,  3,  4,  5,  6,  5,  6,  7,  2,  7,  2,  1, -1,  1, -1
     97 filter_shuf3: db  3,  4,  3,  4,  5,  6,  5,  6,  7, 11,  7, 11; 15, -1, 15, -1
     98 pb_127_m127:  times 2 db 127, -127
     99 ipred_v_shuf: db  0,  1,  0,  1,  4,  5,  4,  5,  8,  9,  8,  9, 12, 13, 12, 13
    100              db  2,  3,  2,  3,  6,  7,  6,  7, 10, 11, 10, 11, 14, 15, 14, 15
    101 ipred_h_shuf: db  7,  7,  7,  7,  3,  3,  3,  3,  5,  5,  5,  5,  1,  1,  1,  1
    102              db  6,  6,  6,  6,  2,  2,  2,  2,  4,  4,  4,  4;  0,  0,  0,  0
    103 pw_64:        times 2 dw 64
    104 
    105 cfl_ac_444_w16_pad1_shuffle: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1
    106                             times 9 db 7, -1
    107 cfl_ac_w16_pad_shuffle: ; w=16, w_pad=1
    108                        db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    109                        ; w=8, w_pad=1 as well as second half of previous one
    110 cfl_ac_w8_pad1_shuffle: db 0, 1, 2, 3, 4, 5
    111                        times 5 db 6, 7
    112                        ; w=16,w_pad=2
    113                        db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    114                        times 8 db 14, 15
    115                        ; w=16,w_pad=3
    116                        db 0, 1, 2, 3, 4, 5
    117                        times 13 db 6, 7
    118 pb_15to0:               db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
    119 
    120 %define pb_0to15 cfl_ac_w16_pad_shuffle
    121 %define pb_1  (ipred_h_shuf+12)
    122 %define pb_2  (ipred_h_shuf+20)
    123 %define pb_3  (ipred_h_shuf+ 4)
    124 %define pb_4  (ipred_h_shuf+24)
    125 %define pb_5  (ipred_h_shuf+ 8)
    126 %define pb_7  (ipred_h_shuf+ 0)
    127 %define pb_8  (z_upsample2 +12)
    128 %define pb_12 (z2_y_shuf_h4+20)
    129 %define pb_14 (z2_y_shuf_h4+ 4)
    130 %define pb_15 (z_filter_s  +32)
    131 %define pb_27 (z2_y_shuf_h4+ 8)
    132 %define pb_31 (z2_y_shuf_h4+12)
    133 %define pb_32 (z2_y_shuf_h4+16)
    134 %define pb_90 (z2_y_shuf_h4+ 0)
    135 %define pw_1  (z2_y_shuf_h4+24)
    136 %define pw_8  (z_filter_k  +32)
    137 
    138 pw_62:    times 2 dw 62
    139 pw_128:   times 2 dw 128
    140 pw_255:   times 2 dw 255
    141 pw_512:   times 2 dw 512
    142 
        ; JMP_TABLE name, isa, label1[, label2, ...]
        ; Builds a table of 32-bit offsets to the .label entry points of the
        ; mangled <name>_8bpc_<isa> function. The exported symbol
        ; <name>_<isa>_table points 2 entries (2*4 bytes) BEFORE the real
        ; table, and each stored offset is relative to that biased base, so
        ; dispatch code does: movsxd r, [table+idx*4]; add r, table; jmp r.
    143 %macro JMP_TABLE 3-*
    144    %xdefine %1_%2_table (%%table - 2*4) ; biased base, 2 entries early
    145    %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
    146    %%table:
    147    %rep %0 - 2 ; one dd per label argument
    148        dd %%base %+ .%3 - (%%table - 2*4)
    149        %rotate 1
    150    %endrep
    151 %endmacro
    152 
    153 %define ipred_dc_splat_avx2_table (ipred_dc_avx2_table + 10*4)
    154 %define ipred_cfl_splat_avx2_table (ipred_cfl_avx2_table + 8*4)
    155 
    156 JMP_TABLE ipred_smooth,     avx2, w4, w8, w16, w32, w64
    157 JMP_TABLE ipred_smooth_v,   avx2, w4, w8, w16, w32, w64
    158 JMP_TABLE ipred_smooth_h,   avx2, w4, w8, w16, w32, w64
    159 JMP_TABLE ipred_paeth,      avx2, w4, w8, w16, w32, w64
    160 JMP_TABLE ipred_filter,     avx2, w4, w8, w16, w32
    161 JMP_TABLE ipred_dc,         avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
    162                                  s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
    163 JMP_TABLE ipred_dc_left,    avx2, h4, h8, h16, h32, h64
    164 JMP_TABLE ipred_h,          avx2, w4, w8, w16, w32, w64
    165 JMP_TABLE ipred_z1,         avx2, w4, w8, w16, w32, w64
    166 JMP_TABLE ipred_z2,         avx2, w4, w8, w16, w32, w64
    167 JMP_TABLE ipred_z3,         avx2, h4, h8, h16, h32, h64
    168 JMP_TABLE ipred_cfl,        avx2, h4, h8, h16, h32, w4, w8, w16, w32, \
    169                                  s4-8*4, s8-8*4, s16-8*4, s32-8*4
    170 JMP_TABLE ipred_cfl_left,   avx2, h4, h8, h16, h32
    171 JMP_TABLE ipred_cfl_ac_420, avx2, w16_pad1, w16_pad2, w16_pad3
    172 JMP_TABLE ipred_cfl_ac_422, avx2, w16_pad1, w16_pad2, w16_pad3
    173 JMP_TABLE ipred_cfl_ac_444, avx2, w32_pad1, w32_pad2, w32_pad3, w4, w8, w16, w32
    174 JMP_TABLE pal_pred,         avx2, w4, w8, w16, w32, w64
    175 
    176 cextern dr_intra_derivative
    177 cextern filter_intra_taps
    178 
    179 SECTION .text
    180 
    181 INIT_YMM avx2
        ; DC_PRED from the top edge only: fill a w x h block with the rounded
        ; average of the w pixels above. Reuses ipred_dc_left's .h* reduction
        ; tail (via ipred_dc_left_avx2_table) and ipred_dc's .s* store loops
        ; (via ipred_dc_splat_avx2_table).
    182 cglobal ipred_dc_top_8bpc, 3, 7, 6, dst, stride, tl, w, h
    183    lea                  r5, [ipred_dc_left_avx2_table]
    184    tzcnt                wd, wm      ; wd = log2(width)
    185    inc                 tlq          ; top row starts at tl+1
    186    movu                 m0, [tlq]
    187    movifnidn            hd, hm
    188    mov                 r6d, 0x8000
    189    shrx                r6d, r6d, wd
    190    movd                xm3, r6d    ; 0x8000>>log2(w): pmulhrsw by this
                                           ; divides the pixel sum by w, rounded
    191    movsxd               r6, [r5+wq*4]
    192    pcmpeqd              m2, m2     ; m2 = all-ones = -1 bytes
    193    pmaddubsw            m0, m2     ; word lanes = -(pixel pair sums)
    194    add                  r6, r5     ; r6 = .h* reduction label (in dc_left)
    195    add                  r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table
    196    movsxd               wq, [r5+wq*4]
    197    add                  wq, r5     ; wq = .s* store loop for this width
    198    jmp                  r6
    199 
        ; DC_PRED from the left edge only: fill a w x h block with the rounded
        ; average of the h pixels to the left. The .h4..h64 labels below form
        ; a shared horizontal-sum tail also entered from ipred_dc_top (its
        ; pixel count is w rather than h, hence the unaligned .h64 load note).
    200 cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
    201    mov                  hd, hm ; zero upper half
    202    tzcnt               r6d, hd     ; r6d = log2(height)
    203    sub                 tlq, hq     ; left column = tl[-h..-1]
    204    tzcnt                wd, wm
    205    movu                 m0, [tlq]
    206    mov                 r5d, 0x8000
    207    shrx                r5d, r5d, r6d
    208    movd                xm3, r5d    ; 0x8000>>log2(h) for rounded /h below
    209    lea                  r5, [ipred_dc_left_avx2_table]
    210    movsxd               r6, [r5+r6*4]
    211    pcmpeqd              m2, m2     ; m2 = -1 bytes / -1 words
    212    pmaddubsw            m0, m2     ; word lanes = -(pixel pair sums)
    213    add                  r6, r5     ; r6 = .h* entry for this height
    214    add                  r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table
    215    movsxd               wq, [r5+wq*4]
    216    add                  wq, r5     ; wq = splat store loop for this width
    217    jmp                  r6
    218 .h64:
    219    movu                 m1, [tlq+32] ; unaligned when jumping here from dc_top
    220    pmaddubsw            m1, m2
    221    paddw                m0, m1
    222 .h32:
    223    vextracti128        xm1, m0, 1  ; fold upper ymm half into lower
    224    paddw               xm0, xm1
    225 .h16:
    226    punpckhqdq          xm1, xm0, xm0
    227    paddw               xm0, xm1
    228 .h8:
    229    psrlq               xm1, xm0, 32
    230    paddw               xm0, xm1
    231 .h4:
    232    pmaddwd             xm0, xm2    ; *-1 and pair-add: final sum, positive
    233    pmulhrsw            xm0, xm3    ; rounded divide by the pixel count
    234    lea            stride3q, [strideq*3]
    235    vpbroadcastb         m0, xm0    ; splat the DC value to all bytes
    236    mova                 m1, m0     ; second 32-byte lane for the .s64 loop
    237    jmp                  wq
    238 
        ; Full DC_PRED: fill a w x h block with the rounded average of the h
        ; left pixels plus the w top pixels, i.e. (lsum+tsum + (w+h)/2)/(w+h).
        ; Dispatch: first jump (r6) to the .h* label summing the left edge,
        ; which then jumps (wq) to the matching .w* label that adds the top
        ; edge, divides, and falls into the .s* splat store loop.
        ; Edge sums are kept NEGATED (pmaddubsw with -1 bytes); subtracting
        ; the rounding bias and then pmaddwd with -1 words flips everything
        ; positive while finishing the horizontal add.
        ; Division: shift by log2 of the power-of-two factor of w+h (xm5),
        ; then for rectangular blocks multiply by 0x5556 ~= 2^16/3 or
        ; 0x3334 ~= 2^16/5 (pmulhuw) to handle the remaining /3 or /5.
    239 cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
    240    movifnidn            hd, hm
    241    movifnidn            wd, wm
    242    tzcnt               r6d, hd
    243    lea                 r5d, [wq+hq] ; r5d = w+h = divisor
    244    movd                xm4, r5d
    245    tzcnt               r5d, r5d
    246    movd                xm5, r5d     ; xm5 = log2 of power-of-2 part of w+h
    247    lea                  r5, [ipred_dc_avx2_table]
    248    tzcnt                wd, wd
    249    movsxd               r6, [r5+r6*4]
    250    movsxd               wq, [r5+wq*4+5*4] ; .w* entries follow the 5 .h* entries
    251    pcmpeqd              m3, m3      ; m3 = -1 bytes / -1 words
    252    psrlw               xm4, 1       ; xm4 = (w+h)/2 rounding bias
    253    add                  r6, r5
    254    add                  wq, r5
    255    lea            stride3q, [strideq*3]
    256    jmp                  r6
    257 .h4:
    258    movd                xm0, [tlq-4] ; 4 left pixels
    259    pmaddubsw           xm0, xm3     ; negated pair sums
    260    jmp                  wq
    261 .w4:
    262    movd                xm1, [tlq+1] ; 4 top pixels
    263    pmaddubsw           xm1, xm3
    264    psubw               xm0, xm4     ; fold in bias (still negated)
    265    paddw               xm0, xm1
    266    pmaddwd             xm0, xm3     ; flip positive + horizontal add
    267    cmp                  hd, 4
    268    jg .w4_mul
    269    psrlw               xm0, 3       ; 4x4: (sum+4)/8
    270    jmp .w4_end
    271 .w4_mul:
    272    punpckhqdq          xm1, xm0, xm0
    273    lea                 r2d, [hq*2]
    274    mov                 r6d, 0x55563334
    275    paddw               xm0, xm1
    276    shrx                r6d, r6d, r2d ; h=8 -> 0x5556 (/3), h=16 -> 0x3334 (/5)
    277    psrlq               xm1, xm0, 32
    278    paddw               xm0, xm1
    279    movd                xm1, r6d
    280    psrlw               xm0, 2
    281    pmulhuw             xm0, xm1     ; finish divide by 12 or 20
    282 .w4_end:
    283    vpbroadcastb        xm0, xm0
    284 .s4:
    285    movd   [dstq+strideq*0], xm0
    286    movd   [dstq+strideq*1], xm0
    287    movd   [dstq+strideq*2], xm0
    288    movd   [dstq+stride3q ], xm0
    289    lea                dstq, [dstq+strideq*4]
    290    sub                  hd, 4
    291    jg .s4
    292    RET
    293 ALIGN function_align
    294 .h8:
    295    movq                xm0, [tlq-8]
    296    pmaddubsw           xm0, xm3
    297    jmp                  wq
    298 .w8:
    299    movq                xm1, [tlq+1]
    300    vextracti128        xm2, m0, 1
    301    pmaddubsw           xm1, xm3
    302    psubw               xm0, xm4
    303    paddw               xm0, xm2
    304    punpckhqdq          xm2, xm0, xm0
    305    paddw               xm0, xm2
    306    paddw               xm0, xm1
    307    psrlq               xm1, xm0, 32
    308    paddw               xm0, xm1
    309    pmaddwd             xm0, xm3
    310    psrlw               xm0, xm5     ; /2^log2 factor of w+h
    311    cmp                  hd, 8
    312    je .w8_end                       ; square: power-of-2 divisor, done
    313    mov                 r6d, 0x5556
    314    mov                 r2d, 0x3334
    315    cmp                  hd, 32
    316    cmove               r6d, r2d    ; h=32 -> /5, h=4 or 16 -> /3
    317    movd                xm1, r6d
    318    pmulhuw             xm0, xm1
    319 .w8_end:
    320    vpbroadcastb        xm0, xm0
    321 .s8:
    322    movq   [dstq+strideq*0], xm0
    323    movq   [dstq+strideq*1], xm0
    324    movq   [dstq+strideq*2], xm0
    325    movq   [dstq+stride3q ], xm0
    326    lea                dstq, [dstq+strideq*4]
    327    sub                  hd, 4
    328    jg .s8
    329    RET
    330 ALIGN function_align
    331 .h16:
    332    mova                xm0, [tlq-16]
    333    pmaddubsw           xm0, xm3
    334    jmp                  wq
    335 .w16:
    336    movu                xm1, [tlq+1]
    337    vextracti128        xm2, m0, 1
    338    pmaddubsw           xm1, xm3
    339    psubw               xm0, xm4
    340    paddw               xm0, xm2
    341    paddw               xm0, xm1
    342    punpckhqdq          xm1, xm0, xm0
    343    paddw               xm0, xm1
    344    psrlq               xm1, xm0, 32
    345    paddw               xm0, xm1
    346    pmaddwd             xm0, xm3
    347    psrlw               xm0, xm5
    348    cmp                  hd, 16
    349    je .w16_end
    350    mov                 r6d, 0x5556
    351    mov                 r2d, 0x3334
    352    test                 hb, 8|32
    353    cmovz               r6d, r2d    ; h=4 or 64 -> /5, h=8 or 32 -> /3
    354    movd                xm1, r6d
    355    pmulhuw             xm0, xm1
    356 .w16_end:
    357    vpbroadcastb        xm0, xm0
    358 .s16:
    359    mova   [dstq+strideq*0], xm0
    360    mova   [dstq+strideq*1], xm0
    361    mova   [dstq+strideq*2], xm0
    362    mova   [dstq+stride3q ], xm0
    363    lea                dstq, [dstq+strideq*4]
    364    sub                  hd, 4
    365    jg .s16
    366    RET
    367 ALIGN function_align
    368 .h32:
    369    mova                 m0, [tlq-32]
    370    pmaddubsw            m0, m3
    371    jmp                  wq
    372 .w32:
    373    movu                 m1, [tlq+1]
    374    pmaddubsw            m1, m3
    375    paddw                m0, m1
    376    vextracti128        xm1, m0, 1
    377    psubw               xm0, xm4
    378    paddw               xm0, xm1
    379    punpckhqdq          xm1, xm0, xm0
    380    paddw               xm0, xm1
    381    psrlq               xm1, xm0, 32
    382    paddw               xm0, xm1
    383    pmaddwd             xm0, xm3
    384    psrlw               xm0, xm5
    385    cmp                  hd, 32
    386    je .w32_end
    387    lea                 r2d, [hq*2]
    388    mov                 r6d, 0x33345556
    389    shrx                r6d, r6d, r2d ; h=8 -> 0x3334 (/5), h=16/64 -> 0x5556 (/3)
    390    movd                xm1, r6d
    391    pmulhuw             xm0, xm1
    392 .w32_end:
    393    vpbroadcastb         m0, xm0
    394 .s32:
    395    mova   [dstq+strideq*0], m0
    396    mova   [dstq+strideq*1], m0
    397    mova   [dstq+strideq*2], m0
    398    mova   [dstq+stride3q ], m0
    399    lea                dstq, [dstq+strideq*4]
    400    sub                  hd, 4
    401    jg .s32
    402    RET
    403 ALIGN function_align
    404 .h64:
    405    mova                 m0, [tlq-64]
    406    mova                 m1, [tlq-32]
    407    pmaddubsw            m0, m3
    408    pmaddubsw            m1, m3
    409    paddw                m0, m1
    410    jmp                  wq
    411 .w64:
    412    movu                 m1, [tlq+ 1]
    413    movu                 m2, [tlq+33]
    414    pmaddubsw            m1, m3
    415    pmaddubsw            m2, m3
    416    paddw                m0, m1
    417    paddw                m0, m2
    418    vextracti128        xm1, m0, 1
    419    psubw               xm0, xm4
    420    paddw               xm0, xm1
    421    punpckhqdq          xm1, xm0, xm0
    422    paddw               xm0, xm1
    423    psrlq               xm1, xm0, 32
    424    paddw               xm0, xm1
    425    pmaddwd             xm0, xm3
    426    psrlw               xm0, xm5
    427    cmp                  hd, 64
    428    je .w64_end
    429    mov                 r6d, 0x33345556
    430    shrx                r6d, r6d, hd ; h=16 -> 0x3334 (/5), h=32 -> 0x5556 (/3)
    431    movd                xm1, r6d
    432    pmulhuw             xm0, xm1
    433 .w64_end:
    434    vpbroadcastb         m0, xm0
    435    mova                 m1, m0
    436 .s64:
    437    mova [dstq+strideq*0+32*0], m0
    438    mova [dstq+strideq*0+32*1], m1
    439    mova [dstq+strideq*1+32*0], m0
    440    mova [dstq+strideq*1+32*1], m1
    441    mova [dstq+strideq*2+32*0], m0
    442    mova [dstq+strideq*2+32*1], m1
    443    mova [dstq+stride3q +32*0], m0
    444    mova [dstq+stride3q +32*1], m1
    445    lea                dstq, [dstq+strideq*4]
    446    sub                  hd, 4
    447    jg .s64
    448    RET
    449 
        ; DC_128_PRED: no edge pixels available, fill the block with the
        ; mid-gray constant 128. Splats pb_128 and tail-calls the shared
        ; .s* store loops in ipred_dc via ipred_dc_splat_avx2_table.
    450 cglobal ipred_dc_128_8bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
    451    lea                  r5, [ipred_dc_splat_avx2_table]
    452    tzcnt                wd, wm
    453    movifnidn            hd, hm
    454    movsxd               wq, [r5+wq*4]
    455    vpbroadcastd         m0, [r5-ipred_dc_splat_avx2_table+pb_128] ; all lanes = 128
    456    mova                 m1, m0     ; second lane for the .s64 loop
    457    add                  wq, r5
    458    lea            stride3q, [strideq*3]
    459    jmp                  wq
    460 
        ; V_PRED: replicate the row of pixels above (tl+1..tl+w) down every
        ; output row. Loads up to 64 top pixels into m0/m1 and tail-calls the
        ; shared .s* store loops in ipred_dc, which write them to each row.
    461 cglobal ipred_v_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
    462    lea                  r5, [ipred_dc_splat_avx2_table]
    463    tzcnt                wd, wm
    464    movu                 m0, [tlq+ 1] ; top pixels 0..31
    465    movu                 m1, [tlq+33] ; top pixels 32..63 (used for w64)
    466    movifnidn            hd, hm
    467    movsxd               wq, [r5+wq*4]
    468    add                  wq, r5
    469    lea            stride3q, [strideq*3]
    470    jmp                  wq
    471 
        ; One iteration of H_PRED for 4 rows: broadcast each of the next four
        ; left-edge pixels (tl[-1..-4]) across a register and store one full
        ; row from each, then loop back to .w<N> until h rows are done.
    472 %macro IPRED_H 2 ; w, store_type
    473    vpbroadcastb         m0, [tlq-1] ; row 0 pixel
    474    vpbroadcastb         m1, [tlq-2] ; row 1 pixel
    475    vpbroadcastb         m2, [tlq-3] ; row 2 pixel
    476    sub                 tlq, 4       ; walk up the left edge
    477    vpbroadcastb         m3, [tlq+0] ; row 3 pixel
    478    mov%2  [dstq+strideq*0], m0
    479    mov%2  [dstq+strideq*1], m1
    480    mov%2  [dstq+strideq*2], m2
    481    mov%2  [dstq+stride3q ], m3
    482    lea                dstq, [dstq+strideq*4]
    483    sub                  hd, 4
    484    jg .w%1
    485    RET
    486 ALIGN function_align
    487 %endmacro
    488 
    489 INIT_XMM avx2
        ; H_PRED: each output row is filled with its left-neighbour pixel.
        ; Widths 4-16 use xmm stores, w32 a ymm store, and w64 has an
        ; unrolled body storing two ymm registers per row.
    490 cglobal ipred_h_8bpc, 3, 6, 4, dst, stride, tl, w, h, stride3
    491    lea                  r5, [ipred_h_avx2_table]
    492    tzcnt                wd, wm
    493    movifnidn            hd, hm
    494    movsxd               wq, [r5+wq*4]
    495    add                  wq, r5
    496    lea            stride3q, [strideq*3]
    497    jmp                  wq
    498 .w4:
    499    IPRED_H               4, d ; dword stores
    500 .w8:
    501    IPRED_H               8, q ; qword stores
    502 .w16:
    503    IPRED_H              16, a ; aligned 16-byte stores
    504 INIT_YMM avx2
    505 .w32:
    506    IPRED_H              32, a ; aligned 32-byte stores
    507 .w64:
    508    vpbroadcastb         m0, [tlq-1]
    509    vpbroadcastb         m1, [tlq-2]
    510    vpbroadcastb         m2, [tlq-3]
    511    sub                 tlq, 4
    512    vpbroadcastb         m3, [tlq+0]
    513    mova [dstq+strideq*0+32*0], m0
    514    mova [dstq+strideq*0+32*1], m0
    515    mova [dstq+strideq*1+32*0], m1
    516    mova [dstq+strideq*1+32*1], m1
    517    mova [dstq+strideq*2+32*0], m2
    518    mova [dstq+strideq*2+32*1], m2
    519    mova [dstq+stride3q +32*0], m3
    520    mova [dstq+stride3q +32*1], m3
    521    lea                dstq, [dstq+strideq*4]
    522    sub                  hd, 4
    523    jg .w64
    524    RET
    525 
        ; Paeth selection for one vector of pixels. Inputs: m%1 = top row,
        ; m%2 = precomputed |left-topleft| (ldiff), m3 = left pixel splat,
        ; m4 = pb_1, m5 = topleft splat. Output in m0: for each byte, pick
        ; top, left or topleft, whichever has the smallest absolute
        ; difference from left+top-topleft (ties resolved as in AV1:
        ; left before top before topleft).
    526 %macro PAETH 2 ; top, ldiff
    527    pavgb                m1, m%1, m3 ; Calculating tldiff normally requires
    528    pxor                 m0, m%1, m3 ; 10-bit intermediates, but we can do it
    529    pand                 m0, m4      ; in 8-bit with some tricks which avoids
    530    psubusb              m2, m5, m1  ; having to unpack everything to 16-bit.
    531    psubb                m1, m0
    532    psubusb              m1, m5
    533    por                  m1, m2
    534    paddusb              m1, m1
    535    por                  m1, m0      ; min(tldiff, 255)
    536    psubusb              m2, m5, m3
    537    psubusb              m0, m3, m5
    538    por                  m2, m0      ; tdiff
    539    pminub               m2, m%2
    540    pcmpeqb              m0, m%2, m2 ; ldiff <= tdiff
    541    vpblendvb            m0, m%1, m3, m0
    542    pminub               m1, m2
    543    pcmpeqb              m1, m2      ; ldiff <= tldiff || tdiff <= tldiff
    544    vpblendvb            m0, m5, m0, m1
    545 %endmacro
    546 
        ; PAETH_PRED: per-pixel selection between left, top and topleft via
        ; the PAETH macro above. ldiff (|left - topleft|) only depends on the
        ; row, and tdiff/tldiff are computed inside the macro; the per-width
        ; loops broadcast the current left pixel(s) and walk tlq downwards.
    547 cglobal ipred_paeth_8bpc, 3, 6, 9, dst, stride, tl, w, h
    548 %define base r5-ipred_paeth_avx2_table
    549    lea                  r5, [ipred_paeth_avx2_table]
    550    tzcnt                wd, wm
    551    vpbroadcastb         m5, [tlq]   ; topleft
    552    movifnidn            hd, hm
    553    movsxd               wq, [r5+wq*4]
    554    vpbroadcastd         m4, [base+pb_1]
    555    add                  wq, r5
    556    jmp                  wq
    557 .w4:
    558    vpbroadcastd         m6, [tlq+1] ; top
    559    mova                 m8, [base+ipred_h_shuf] ; spreads 8 left pixels over 8 rows
    560    lea                  r3, [strideq*3]
    561    psubusb              m7, m5, m6
    562    psubusb              m0, m6, m5
    563    por                  m7, m0      ; ldiff
    564 .w4_loop:
    565    sub                 tlq, 8       ; consume 8 left pixels per iteration
    566    vpbroadcastq         m3, [tlq]
    567    pshufb               m3, m8      ; left
    568    PAETH                 6, 7
    569    vextracti128        xm1, m0, 1
    570    movd   [dstq+strideq*0], xm0
    571    movd   [dstq+strideq*1], xm1
    572    pextrd [dstq+strideq*2], xm0, 2
    573    pextrd [dstq+r3       ], xm1, 2
    574    cmp                  hd, 4
    575    je .ret                          ; h == 4: only the first 4 rows exist
    576    lea                dstq, [dstq+strideq*4]
    577    pextrd [dstq+strideq*0], xm0, 1
    578    pextrd [dstq+strideq*1], xm1, 1
    579    pextrd [dstq+strideq*2], xm0, 3
    580    pextrd [dstq+r3       ], xm1, 3
    581    lea                dstq, [dstq+strideq*4]
    582    sub                  hd, 8
    583    jg .w4_loop
    584 .ret:
    585    RET
    586 ALIGN function_align
    587 .w8:
    588    vpbroadcastq         m6, [tlq+1]
    589    mova                 m8, [base+ipred_h_shuf]
    590    lea                  r3, [strideq*3]
    591    psubusb              m7, m5, m6
    592    psubusb              m0, m6, m5
    593    por                  m7, m0      ; ldiff
    594 .w8_loop:
    595    sub                 tlq, 4       ; 4 left pixels -> 4 rows
    596    vpbroadcastd         m3, [tlq]
    597    pshufb               m3, m8
    598    PAETH                 6, 7
    599    vextracti128        xm1, m0, 1
    600    movq   [dstq+strideq*0], xm0
    601    movq   [dstq+strideq*1], xm1
    602    movhps [dstq+strideq*2], xm0
    603    movhps [dstq+r3       ], xm1
    604    lea                dstq, [dstq+strideq*4]
    605    sub                  hd, 4
    606    jg .w8_loop
    607    RET
    608 ALIGN function_align
    609 .w16:
    610    vbroadcasti128       m6, [tlq+1]
    611    mova                xm8, xm4 ; lower half = 1, upper half = 0
    612    psubusb              m7, m5, m6
    613    psubusb              m0, m6, m5
    614    por                  m7, m0      ; ldiff
    615 .w16_loop:
    616    sub                 tlq, 2       ; 2 left pixels -> 2 rows
    617    vpbroadcastd         m3, [tlq]
    618    pshufb               m3, m8
    619    PAETH                 6, 7
    620    mova         [dstq+strideq*0], xm0
    621    vextracti128 [dstq+strideq*1], m0, 1
    622    lea                dstq, [dstq+strideq*2]
    623    sub                  hd, 2
    624    jg .w16_loop
    625    RET
    626 ALIGN function_align
    627 .w32:
    628    movu                 m6, [tlq+1]
    629    psubusb              m7, m5, m6
    630    psubusb              m0, m6, m5
    631    por                  m7, m0      ; ldiff
    632 .w32_loop:
    633    dec                 tlq          ; one left pixel per row
    634    vpbroadcastb         m3, [tlq]
    635    PAETH                 6, 7
    636    mova             [dstq], m0
    637    add                dstq, strideq
    638    dec                  hd
    639    jg .w32_loop
    640    RET
    641 ALIGN function_align
    642 .w64:
    643    movu                 m6, [tlq+ 1] ; top halves: pixels 0..31 and 32..63
    644    movu                 m7, [tlq+33]
    645 %if WIN64
    646    movaps              r4m, xmm9    ; xmm6-15 are callee-saved on Win64
    647 %endif
    648    psubusb              m8, m5, m6
    649    psubusb              m0, m6, m5
    650    psubusb              m9, m5, m7
    651    psubusb              m1, m7, m5
    652    por                  m8, m0      ; ldiff for the left 32 columns
    653    por                  m9, m1      ; ldiff for the right 32 columns
    654 .w64_loop:
    655    dec                 tlq
    656    vpbroadcastb         m3, [tlq]
    657    PAETH                 6, 8
    658    mova        [dstq+32*0], m0
    659    PAETH                 7, 9
    660    mova        [dstq+32*1], m0
    661    add                dstq, strideq
    662    dec                  hd
    663    jg .w64_loop
    664 %if WIN64
    665    movaps             xmm9, r4m     ; restore the spilled register
    666 %endif
    667    RET
    668 
        ; Weighted blend for the smooth predictors: computes
        ; (w*a + (256-w)*b + 128) >> 8 per pixel using the offset weights
        ; emitted by SMOOTH_WEIGHT_TABLE (see identity below), then packs
        ; the two word vectors back to bytes into m0.
    669 %macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2]
    670    ; w * a         = (w - 128) * a + 128 * a
    671    ; (256 - w) * b = (127 - w) * b + 129 * b
    672    pmaddubsw            m0, m%3, m%1
    673    pmaddubsw            m1, m%4, m%2
    674    paddw                m0, m%5     ; m%5/m%6 carry the 128*a+129*b+128 part
    675    paddw                m1, m%6
    676    psrlw                m0, 8       ; >>8 completes the /256 rounding
    677    psrlw                m1, 8
    678    packuswb             m0, m1
    679 %endmacro
    680 
    681 cglobal ipred_smooth_v_8bpc, 3, 7, 0, dst, stride, tl, w, h, weights
    682 %define base r6-ipred_smooth_v_avx2_table
    683    lea                  r6, [ipred_smooth_v_avx2_table]
    684    tzcnt                wd, wm
    685    mov                  hd, hm
    686    movsxd               wq, [r6+wq*4]
    687    vpbroadcastd         m0, [base+pb_127_m127]
    688    vpbroadcastd         m1, [base+pw_128]
    689    lea            weightsq, [base+smooth_weights+hq*4]
    690    neg                  hq
    691    vpbroadcastb         m5, [tlq+hq] ; bottom
    692    add                  wq, r6
    693    jmp                  wq
    694 .w4:
    695    vpbroadcastd         m2, [tlq+1]
    696    punpcklbw            m2, m5 ; top, bottom
    697    mova                 m5, [base+ipred_v_shuf]
    698    lea                  r3, [strideq*3]
    699    punpckldq            m4, m5, m5
    700    punpckhdq            m5, m5
    701    pmaddubsw            m3, m2, m0
    702    paddw                m1, m2 ;   1 * top + 256 * bottom + 128, overflow is ok
    703    paddw                m3, m1 ; 128 * top + 129 * bottom + 128
    704 .w4_loop:
    705    vbroadcasti128       m1, [weightsq+hq*2]
    706    pshufb               m0, m1, m4
    707    pshufb               m1, m5
    708    SMOOTH                0, 1, 2, 2, 3, 3
    709    vextracti128        xm1, m0, 1
    710    movd   [dstq+strideq*0], xm0
    711    movd   [dstq+strideq*1], xm1
    712    pextrd [dstq+strideq*2], xm0, 1
    713    pextrd [dstq+r3       ], xm1, 1
    714    cmp                  hd, -4
    715    je .ret
    716    lea                dstq, [dstq+strideq*4]
    717    pextrd [dstq+strideq*0], xm0, 2
    718    pextrd [dstq+strideq*1], xm1, 2
    719    pextrd [dstq+strideq*2], xm0, 3
    720    pextrd [dstq+r3       ], xm1, 3
    721    lea                dstq, [dstq+strideq*4]
    722    add                  hq, 8
    723    jl .w4_loop
    724 .ret:
    725    RET
    726 ALIGN function_align
    727 .w8:
    728    vpbroadcastq         m2, [tlq+1]
    729    punpcklbw            m2, m5
    730    mova                 m5, [base+ipred_v_shuf]
    731    lea                  r3, [strideq*3]
    732    pshufd               m4, m5, q0000
    733    pshufd               m5, m5, q1111
    734    pmaddubsw            m3, m2, m0
    735    paddw                m1, m2
    736    paddw                m3, m1
    737 .w8_loop:
    738    vpbroadcastq         m1, [weightsq+hq*2]
    739    pshufb               m0, m1, m4
    740    pshufb               m1, m5
    741    SMOOTH                0, 1, 2, 2, 3, 3
    742    vextracti128        xm1, m0, 1
    743    movq   [dstq+strideq*0], xm0
    744    movq   [dstq+strideq*1], xm1
    745    movhps [dstq+strideq*2], xm0
    746    movhps [dstq+r3       ], xm1
    747    lea                dstq, [dstq+strideq*4]
    748    add                  hq, 4
    749    jl .w8_loop
    750    RET
    751 ALIGN function_align
    752 .w16:
    753    WIN64_SPILL_XMM       7
    754    vbroadcasti128       m3, [tlq+1]
    755    mova                 m6, [base+ipred_v_shuf]
    756    punpcklbw            m2, m3, m5
    757    punpckhbw            m3, m5
    758    pmaddubsw            m4, m2, m0
    759    pmaddubsw            m5, m3, m0
    760    paddw                m0, m1, m2
    761    paddw                m1, m3
    762    paddw                m4, m0
    763    paddw                m5, m1
    764 .w16_loop:
    765    vpbroadcastd         m1, [weightsq+hq*2]
    766    pshufb               m1, m6
    767    SMOOTH                1, 1, 2, 3, 4, 5
    768    mova         [dstq+strideq*0], xm0
    769    vextracti128 [dstq+strideq*1], m0, 1
    770    lea                dstq, [dstq+strideq*2]
    771    add                  hq, 2
    772    jl .w16_loop
    773    RET
    774 ALIGN function_align
    775 .w32:
    776    WIN64_SPILL_XMM       6
    777    movu                 m3, [tlq+1]
    778    punpcklbw            m2, m3, m5
    779    punpckhbw            m3, m5
    780    pmaddubsw            m4, m2, m0
    781    pmaddubsw            m5, m3, m0
    782    paddw                m0, m1, m2
    783    paddw                m1, m3
    784    paddw                m4, m0
    785    paddw                m5, m1
    786 .w32_loop:
    787    vpbroadcastw         m1, [weightsq+hq*2]
    788    SMOOTH                1, 1, 2, 3, 4, 5
    789    mova             [dstq], m0
    790    add                dstq, strideq
    791    inc                  hq
    792    jl .w32_loop
    793    RET
    794 ALIGN function_align
    795 .w64:
    796    WIN64_SPILL_XMM      11
    797    movu                 m4, [tlq+ 1]
    798    movu                 m8, [tlq+33]
    799    punpcklbw            m3, m4, m5
    800    punpckhbw            m4, m5
    801    punpcklbw            m7, m8, m5
    802    punpckhbw            m8, m5
    803    pmaddubsw            m5, m3, m0
    804    pmaddubsw            m6, m4, m0
    805    pmaddubsw            m9, m7, m0
    806    pmaddubsw           m10, m8, m0
    807    paddw                m2, m1, m3
    808    paddw                m5, m2
    809    paddw                m2, m1, m4
    810    paddw                m6, m2
    811    paddw                m0, m1, m7
    812    paddw                m9, m0
    813    paddw                m1, m8
    814    paddw               m10, m1
    815 .w64_loop:
    816    vpbroadcastw         m2, [weightsq+hq*2]
    817    SMOOTH                2, 2, 3, 4, 5, 6
    818    mova        [dstq+32*0], m0
    819    SMOOTH                2, 2, 7, 8, 9, 10
    820    mova        [dstq+32*1], m0
    821    add                dstq, strideq
    822    inc                  hq
    823    jl .w64_loop
    824    RET
    825 
    ;-------------------------------------------------------------------
    ; SMOOTH_H intra prediction, 8 bits/component, AVX2.
    ; Each output pixel blends the left-edge pixel of its row with the
    ; pixel to the right of the block ("right", broadcast from tl[w])
    ; using per-column smooth weights, i.e. per the inline math below:
    ;   dst = (128*left + 129*right + w*(left-right) + 128) >> 8
    ; Entry is through a jump table indexed by log2(width); the
    ; w16/w32/w64 paths share the .prep helper, which precomputes the
    ; 128*left + 129*right + 128 terms for every row on the stack.
    ;-------------------------------------------------------------------
    826 cglobal ipred_smooth_h_8bpc, 3, 7, 0, dst, stride, tl, w, h
    827 %define base r5-ipred_smooth_h_avx2_table
    828    lea                  r5, [ipred_smooth_h_avx2_table]
    829    mov                  wd, wm
    830    vpbroadcastb         m3, [tlq+wq] ; right
    831    tzcnt                wd, wd ; log2(w) indexes the jump table
    832    mov                  hd, hm
    833    movsxd               wq, [r5+wq*4]
    834    vpbroadcastd         m4, [base+pb_127_m127]
    835    vpbroadcastd         m5, [base+pw_128]
    836    add                  wq, r5
    837    jmp                  wq
    838 .w4:
    839    WIN64_SPILL_XMM       8
    840    vpbroadcastq         m6, [base+smooth_weights+4*2]
    841    mova                 m7, [base+ipred_h_shuf]
    842    sub                 tlq, 8
    843    sub                 tlq, hq ; so [tlq+hq] addresses the left column (h counts down)
    844    lea                  r3, [strideq*3]
    845 .w4_loop:
    846    vpbroadcastq         m2, [tlq+hq] ; left pixels for 8 rows
    847    pshufb               m2, m7
    848    punpcklbw            m1, m2, m3 ; left, right
    849    punpckhbw            m2, m3
    850    pmaddubsw            m0, m1, m4 ; 127 * left - 127 * right
    851    paddw                m0, m1     ; 128 * left + 129 * right
    852    pmaddubsw            m1, m6     ; apply per-column smooth weights
    853    paddw                m1, m5     ; + 128 for rounding
    854    paddw                m0, m1
    855    pmaddubsw            m1, m2, m4 ; same math for the high half
    856    paddw                m1, m2
    857    pmaddubsw            m2, m6
    858    paddw                m2, m5
    859    paddw                m1, m2
    860    psrlw                m0, 8      ; normalize back to pixel range
    861    psrlw                m1, 8
    862    packuswb             m0, m1
    863    vextracti128        xm1, m0, 1
    864    movd   [dstq+strideq*0], xm0
    865    movd   [dstq+strideq*1], xm1
    866    pextrd [dstq+strideq*2], xm0, 2
    867    pextrd [dstq+r3       ], xm1, 2
    868    cmp                  hd, 4
    869    je .ret
    870    lea                dstq, [dstq+strideq*4]
    871    pextrd [dstq+strideq*0], xm0, 1
    872    pextrd [dstq+strideq*1], xm1, 1
    873    pextrd [dstq+strideq*2], xm0, 3
    874    pextrd [dstq+r3       ], xm1, 3
    875    lea                dstq, [dstq+strideq*4]
    876    sub                  hd, 8
    877    jg .w4_loop
    878 .ret:
    879    RET
    880 ALIGN function_align
    881 .w8:
    882    WIN64_SPILL_XMM       8
    883    vbroadcasti128       m6, [base+smooth_weights+8*2]
    884    mova                 m7, [base+ipred_h_shuf]
    885    sub                 tlq, 4
    886    lea                  r3, [strideq*3]
    887    sub                 tlq, hq
    888 .w8_loop:
    889    vpbroadcastd         m2, [tlq+hq] ; left pixels for 4 rows
    890    pshufb               m2, m7
    891    punpcklbw            m1, m2, m3
    892    punpckhbw            m2, m3
    893    pmaddubsw            m0, m1, m4
    894    paddw                m0, m1
    895    pmaddubsw            m1, m6
    896    paddw                m1, m5
    897    paddw                m0, m1
    898    pmaddubsw            m1, m2, m4
    899    paddw                m1, m2
    900    pmaddubsw            m2, m6
    901    paddw                m2, m5
    902    paddw                m1, m2
    903    psrlw                m0, 8
    904    psrlw                m1, 8
    905    packuswb             m0, m1
    906    vextracti128        xm1, m0, 1
    907    movq   [dstq+strideq*0], xm0
    908    movq   [dstq+strideq*1], xm1
    909    movhps [dstq+strideq*2], xm0
    910    movhps [dstq+r3       ], xm1
    911    lea                dstq, [dstq+strideq*4]
    912    sub                  hd, 4
    913    jg .w8_loop
    914    RET
    915 ALIGN function_align
    916 .w16:
    917    ALLOC_STACK        32*4, 8
    918    lea                  r3, [rsp+64*2-4] ; end of the .prep output buffer
    919    call .prep ; only worthwhile for w16 and above
    920    sub                 tlq, 2
    921    vpbroadcastd        xm6, [base+pb_1]
    922    mova                xm7, [base+ipred_v_shuf+16]
    923    vinserti128          m7, [base+ipred_v_shuf+ 0], 1
    924    vbroadcasti128       m4, [base+smooth_weights+16*2]
    925    vbroadcasti128       m5, [base+smooth_weights+16*3]
    926 .w16_loop:
    927    vpbroadcastd         m1, [tlq+hq]  ; left pixels, 2 rows
    928    vpbroadcastd         m2, [r3+hq*2] ; precomputed blend terms
    929    pshufb               m1, m6
    930    punpcklbw            m1, m3
    931    pshufb               m2, m7
    932    SMOOTH                4, 5, 1, 1, 2, 2
    933    mova         [dstq+strideq*0], xm0
    934    vextracti128 [dstq+strideq*1], m0, 1
    935    lea                dstq, [dstq+strideq*2]
    936    sub                  hd, 2
    937    jg .w16_loop
    938    RET
    939 ALIGN function_align
    940 .w32:
    941    ALLOC_STACK        32*4
    942    lea                  r3, [rsp+64*2-2]
    943    call .prep
    944    dec                 tlq
    945    mova                xm4, [base+smooth_weights+16*4]
    946    vinserti128          m4, [base+smooth_weights+16*6], 1
    947    mova                xm5, [base+smooth_weights+16*5]
    948    vinserti128          m5, [base+smooth_weights+16*7], 1
    949 .w32_loop:
    950    vpbroadcastb         m1, [tlq+hq] ; one left pixel per row
    951    punpcklbw            m1, m3
    952    vpbroadcastw         m2, [r3+hq*2]
    953    SMOOTH                4, 5, 1, 1, 2, 2
    954    mova             [dstq], m0
    955    add                dstq, strideq
    956    dec                  hd
    957    jg .w32_loop
    958    RET
    959 ALIGN function_align
    960 .w64:
    961    ALLOC_STACK        32*4, 9
    962    lea                  r3, [rsp+64*2-2]
    963    call .prep
    964    add                  r5, smooth_weights+16*15-ipred_smooth_h_avx2_table ; r5 -> 64-wide weight tables
    965    dec                 tlq
    966    mova                xm5, [r5-16*7]
    967    vinserti128          m5, [r5-16*5], 1
    968    mova                xm6, [r5-16*6]
    969    vinserti128          m6, [r5-16*4], 1
    970    mova                xm7, [r5-16*3]
    971    vinserti128          m7, [r5-16*1], 1
    972    mova                xm8, [r5-16*2]
    973    vinserti128          m8, [r5-16*0], 1
    974 .w64_loop:
    975    vpbroadcastb         m2, [tlq+hq]
    976    punpcklbw            m2, m3
    977    vpbroadcastw         m4, [r3+hq*2]
    978    SMOOTH                5, 6, 2, 2, 4, 4 ; left 32 columns
    979    mova        [dstq+32*0], m0
    980    SMOOTH                7, 8, 2, 2, 4, 4 ; right 32 columns
    981    mova        [dstq+32*1], m0
    982    add                dstq, strideq
    983    dec                  hd
    984    jg .w64_loop
    985    RET
    986 ALIGN function_align
    ; Precompute 128*left + 129*right + 128 for up to 64 rows of left
    ; pixels into rsp[0..127] (two 32-byte stores per 32-pixel load;
    ; q3120 permute keeps lanes in row order after the unpacks).
    ; On return, r3 and tlq are rebased by -h so the loops above can
    ; index both with the down-counting h.
    987 .prep:
    988    vpermq               m2, [tlq-32*1], q3120
    989    punpckhbw            m1, m2, m3
    990    punpcklbw            m2, m3
    991    pmaddubsw            m0, m1, m4 ; 127 * left - 127 * right
    992    paddw                m1, m5     ;   1 * left + 256 * right + 128
    993    paddw                m0, m1     ; 128 * left + 129 * right + 128
    994    pmaddubsw            m1, m2, m4
    995    paddw                m2, m5
    996    paddw                m1, m2
    997    vpermq               m2, [tlq-32*2], q3120
    998    mova [rsp+gprsize+32*3], m0
    999    mova [rsp+gprsize+32*2], m1
   1000    punpckhbw            m1, m2, m3
   1001    punpcklbw            m2, m3
   1002    pmaddubsw            m0, m1, m4
   1003    paddw                m1, m5
   1004    paddw                m0, m1
   1005    pmaddubsw            m1, m2, m4
   1006    paddw                m2, m5
   1007    paddw                m1, m2
   1008    mova [rsp+gprsize+32*1], m0
   1009    mova [rsp+gprsize+32*0], m1
   1010    sub                  r3, hq
   1011    sub                 tlq, hq
   1012    sub                  r3, hq ; r3 -= 2*h total (buffer is indexed by h*2)
   1013    ret
   1014 
   1015 %macro SMOOTH_2D_END 6 ; src[1-2], mul[1-2], add[1-2]
   ; Finish a 2-D smooth prediction: weight the two source halves,
   ; add the precomputed terms, then average with m2/m3 and pack.
   ; NOTE(review): m2/m3 are implicit inputs — at the call sites in
   ; ipred_smooth they hold the other (horizontal) blend direction,
   ; so pavgw yields the rounded mean of both directions before the
   ; >>8 normalization.
   1016    pmaddubsw            m0, m%3, m%1
   1017    pmaddubsw            m1, m%4, m%2
   1018 %ifnum %5 ; add operand may be a register number or a memory ref
   1019    paddw                m0, m%5
   1020 %else
   1021    paddw                m0, %5
   1022 %endif
   1023 %ifnum %6
   1024    paddw                m1, m%6
   1025 %else
   1026    paddw                m1, %6
   1027 %endif
   1028    pavgw                m0, m2 ; rounded average with the other direction
   1029    pavgw                m1, m3
   1030    psrlw                m0, 8
   1031    psrlw                m1, 8
   1032    packuswb             m0, m1
   1033 %endmacro
   1034 
   ;-------------------------------------------------------------------
   ; SMOOTH intra prediction, 8 bits/component, AVX2.
   ; 2-D blend: a vertical component (top row vs. the bottom-left
   ; pixel, weighted per row by v_weights) and a horizontal component
   ; (left column vs. the top-right pixel, weighted per column by
   ; smooth_weights), combined by the rounded average in
   ; SMOOTH_2D_END. Dispatch is via a jump table indexed by log2(w);
   ; the w16/w32/w64 paths share the .prep_v helper below.
   ;-------------------------------------------------------------------
   1035 cglobal ipred_smooth_8bpc, 3, 7, 0, dst, stride, tl, w, h, v_weights
   1036 %define base r6-ipred_smooth_avx2_table
   1037    lea                  r6, [ipred_smooth_avx2_table]
   1038    mov                  wd, wm
   1039    vpbroadcastb         m4, [tlq+wq] ; right
   1040    tzcnt                wd, wd ; log2(w) indexes the jump table
   1041    mov                  hd, hm
   1042    mov                  r5, tlq
   1043    sub                  r5, hq ; r5 -> bottom-left pixel
   1044    movsxd               wq, [r6+wq*4]
   1045    vpbroadcastd         m5, [base+pb_127_m127]
   1046    vpbroadcastb         m0, [r5] ; bottom
   1047    vpbroadcastd         m3, [base+pw_255]
   1048    add                  wq, r6
   1049    lea          v_weightsq, [base+smooth_weights+hq*2] ; per-row vertical weights
   1050    jmp                  wq
   1051 .w4:
   1052    WIN64_SPILL_XMM      12
   1053    mova                m10, [base+ipred_h_shuf]
   1054    vpbroadcastq        m11, [base+smooth_weights+4*2]
   1055    mova                 m7, [base+ipred_v_shuf]
   1056    vpbroadcastd         m8, [tlq+1]
   1057    sub                 tlq, 8
   1058    lea                  r3, [strideq*3]
   1059    sub                 tlq, hq ; so [tlq+hq] addresses the left column (h counts down)
   1060    punpcklbw            m8, m0 ; top, bottom
   1061    pshufd               m6, m7, q2200
   1062    pshufd               m7, m7, q3311
   1063    pmaddubsw            m9, m8, m5
   1064    paddw                m3, m8 ;   1 * top + 255 * bottom + 255
   1065    paddw                m9, m3 ; 128 * top + 129 * bottom + 255
   1066 .w4_loop:
   1067    vpbroadcastq         m1, [tlq+hq] ; left pixels for 8 rows
   1068    pshufb               m1, m10
   1069    punpcklbw            m0, m1, m4 ; left, right
   1070    punpckhbw            m1, m4
   1071    pmaddubsw            m2, m0, m5 ; 127 * left - 127 * right
   1072    pmaddubsw            m3, m1, m5
   1073    paddw                m2, m0     ; 128 * left + 129 * right
   1074    paddw                m3, m1
   1075    pmaddubsw            m0, m11    ; apply horizontal smooth weights
   1076    pmaddubsw            m1, m11
   1077    paddw                m2, m0
   1078    paddw                m3, m1
   1079    vbroadcasti128       m1, [v_weightsq]
   1080    add          v_weightsq, 16
   1081    pshufb               m0, m1, m6
   1082    pshufb               m1, m7
   1083    SMOOTH_2D_END         0, 1, 8, 8, 9, 9
   1084    vextracti128        xm1, m0, 1
   1085    movd   [dstq+strideq*0], xm0
   1086    movd   [dstq+strideq*1], xm1
   1087    pextrd [dstq+strideq*2], xm0, 2
   1088    pextrd [dstq+r3       ], xm1, 2
   1089    cmp                  hd, 4
   1090    je .ret
   1091    lea                dstq, [dstq+strideq*4]
   1092    pextrd [dstq+strideq*0], xm0, 1
   1093    pextrd [dstq+strideq*1], xm1, 1
   1094    pextrd [dstq+strideq*2], xm0, 3
   1095    pextrd [dstq+r3       ], xm1, 3
   1096    lea                dstq, [dstq+strideq*4]
   1097    sub                  hd, 8
   1098    jg .w4_loop
   1099 .ret:
   1100    RET
   1101 ALIGN function_align
   1102 .w8:
   1103    WIN64_SPILL_XMM      12
   1104    mova                m10, [base+ipred_h_shuf]
   1105    vbroadcasti128      m11, [base+smooth_weights+8*2]
   1106    mova                 m7, [base+ipred_v_shuf]
   1107    vpbroadcastq         m8, [tlq+1]
   1108    sub                 tlq, 4
   1109    lea                  r3, [strideq*3]
   1110    sub                 tlq, hq
   1111    punpcklbw            m8, m0
   1112    pshufd               m6, m7, q0000
   1113    pshufd               m7, m7, q1111
   1114    pmaddubsw            m9, m8, m5
   1115    paddw                m3, m8
   1116    paddw                m9, m3
   1117 .w8_loop:
   1118    vpbroadcastd         m1, [tlq+hq] ; left pixels for 4 rows
   1119    pshufb               m1, m10
   1120    punpcklbw            m0, m1, m4
   1121    punpckhbw            m1, m4
   1122    pmaddubsw            m2, m0, m5
   1123    pmaddubsw            m3, m1, m5
   1124    paddw                m2, m0
   1125    paddw                m3, m1
   1126    pmaddubsw            m0, m11
   1127    pmaddubsw            m1, m11
   1128    paddw                m2, m0
   1129    paddw                m3, m1
   1130    vpbroadcastq         m1, [v_weightsq]
   1131    add          v_weightsq, 8
   1132    pshufb               m0, m1, m6
   1133    pshufb               m1, m7
   1134    SMOOTH_2D_END         0, 1, 8, 8, 9, 9
   1135    vextracti128        xm1, m0, 1
   1136    movq   [dstq+strideq*0], xm0
   1137    movq   [dstq+strideq*1], xm1
   1138    movhps [dstq+strideq*2], xm0
   1139    movhps [dstq+r3       ], xm1
   1140    lea                dstq, [dstq+strideq*4]
   1141    sub                  hd, 4
   1142    jg .w8_loop
   1143    RET
   1144 ALIGN function_align
   1145 .w16:
   ; NOTE(review): regs_used is temporarily lowered around ALLOC_STACK
   ; — presumably to control which GPRs x86inc saves; confirm against
   ; the x86inc ALLOC_STACK implementation.
   1146    %assign regs_used 4
   1147    ALLOC_STACK       -32*4, 14
   1148    %assign regs_used 7
   1149    vbroadcasti128      m11, [tlq+1]
   1150    lea                  r3, [rsp+64*2-4] ; end of the .prep_v output buffer
   1151    punpcklbw           m10, m11, m0 ; top, bottom
   1152    punpckhbw           m11, m0
   1153    call .prep_v
   1154    sub                 tlq, 2
   1155    pmaddubsw           m12, m10, m5
   1156    pmaddubsw           m13, m11, m5
   1157    vpbroadcastd        xm5, [base+pb_1]
   1158    mova                 m9, [base+ipred_v_shuf]
   1159    vbroadcasti128       m6, [base+smooth_weights+16*2]
   1160    vbroadcasti128       m7, [base+smooth_weights+16*3]
   1161    vperm2i128           m8, m9, m9, 0x01 ; swap the 128-bit lanes of m9
   1162    paddw                m0, m10, m3
   1163    paddw                m3, m11
   1164    paddw               m12, m0
   1165    paddw               m13, m3
   1166 .w16_loop:
   1167    vpbroadcastd         m3, [tlq+hq]
   1168    vpbroadcastd         m0, [r3+hq*2] ; precomputed horizontal terms
   1169    vpbroadcastd         m1, [v_weightsq]
   1170    add          v_weightsq, 4
   1171    pshufb               m3, m5
   1172    punpcklbw            m3, m4 ; left, right
   1173    pmaddubsw            m2, m3, m6
   1174    pmaddubsw            m3, m7
   1175    pshufb               m0, m8
   1176    pshufb               m1, m9
   1177    paddw                m2, m0
   1178    paddw                m3, m0
   1179    SMOOTH_2D_END         1, 1, 10, 11, 12, 13
   1180    mova         [dstq+strideq*0], xm0
   1181    vextracti128 [dstq+strideq*1], m0, 1
   1182    lea                dstq, [dstq+strideq*2]
   1183    sub                  hd, 2
   1184    jg .w16_loop
   1185    RET
   1186 ALIGN function_align
   1187 .w32:
   1188    %assign regs_used 4
   1189    ALLOC_STACK       -32*4, 11
   1190    %assign regs_used 7
   1191    movu                 m8, [tlq+1]
   1192    lea                  r3, [rsp+64*2-2]
   1193    punpcklbw            m7, m8, m0
   1194    punpckhbw            m8, m0
   1195    call .prep_v
   1196    dec                 tlq
   1197    pmaddubsw            m9, m7, m5
   1198    pmaddubsw           m10, m8, m5
   1199    mova                xm5, [base+smooth_weights+16*4]
   1200    vinserti128          m5, [base+smooth_weights+16*6], 1
   1201    mova                xm6, [base+smooth_weights+16*5]
   1202    vinserti128          m6, [base+smooth_weights+16*7], 1
   1203    paddw                m0, m7, m3
   1204    paddw                m3, m8
   1205    paddw                m9, m0
   1206    paddw               m10, m3
   1207 .w32_loop:
   1208    vpbroadcastb         m3, [tlq+hq] ; one left pixel per row
   1209    punpcklbw            m3, m4
   1210    vpbroadcastw         m0, [r3+hq*2]
   1211    vpbroadcastw         m1, [v_weightsq]
   1212    add          v_weightsq, 2
   1213    pmaddubsw            m2, m3, m5
   1214    pmaddubsw            m3, m6
   1215    paddw                m2, m0
   1216    paddw                m3, m0
   1217    SMOOTH_2D_END         1, 1, 7, 8, 9, 10
   1218    mova             [dstq], m0
   1219    add                dstq, strideq
   1220    dec                  hd
   1221    jg .w32_loop
   1222    RET
   1223 ALIGN function_align
   1224 .w64:
   1225    %assign regs_used 4
   1226    ALLOC_STACK       -32*8, 16
   1227    %assign regs_used 7
   1228    movu                m13, [tlq+1 ]
   1229    movu                m15, [tlq+33]
   1230    add                  r6, smooth_weights+16*15-ipred_smooth_avx2_table ; r6 -> 64-wide weight tables
   1231    lea                  r3, [rsp+64*2-2]
   1232    punpcklbw           m12, m13, m0
   1233    punpckhbw           m13, m0
   1234    punpcklbw           m14, m15, m0
   1235    punpckhbw           m15, m0
   1236    call .prep_v
   1237    dec                 tlq
   1238    pmaddubsw            m0, m12, m5
   1239    pmaddubsw            m1, m13, m5
   1240    pmaddubsw            m2, m14, m5
   1241    pmaddubsw            m5, m15, m5
   1242    mova                xm8, [r6-16*7]
   1243    vinserti128          m8, [r6-16*5], 1
   1244    mova                xm9, [r6-16*6]
   1245    vinserti128          m9, [r6-16*4], 1
   1246    mova               xm10, [r6-16*3]
   1247    vinserti128         m10, [r6-16*1], 1
   1248    mova               xm11, [r6-16*2]
   1249    vinserti128         m11, [r6-16*0], 1
   1250    lea                  r6, [rsp+32*4] ; stash vertical blend terms above the .prep_v buffer
   1251    paddw                m0, m3
   1252    paddw                m1, m3
   1253    paddw                m2, m3
   1254    paddw                m3, m5
   1255    paddw                m0, m12
   1256    paddw                m1, m13
   1257    paddw                m2, m14
   1258    paddw                m3, m15
   1259    mova          [r6+32*0], m0
   1260    mova          [r6+32*1], m1
   1261    mova          [r6+32*2], m2
   1262    mova          [r6+32*3], m3
   1263 .w64_loop:
   1264    vpbroadcastb         m5, [tlq+hq]
   1265    punpcklbw            m5, m4
   1266    vpbroadcastw         m6, [r3+hq*2]
   1267    vpbroadcastw         m7, [v_weightsq]
   1268    add          v_weightsq, 2
   1269    pmaddubsw            m2, m5, m8
   1270    pmaddubsw            m3, m5, m9
   1271    paddw                m2, m6
   1272    paddw                m3, m6
   1273    SMOOTH_2D_END         7, 7, 12, 13, [r6+32*0], [r6+32*1] ; left 32 columns
   1274    mova        [dstq+32*0], m0
   1275    pmaddubsw            m2, m5, m10
   1276    pmaddubsw            m3, m5, m11
   1277    paddw                m2, m6
   1278    paddw                m3, m6
   1279    SMOOTH_2D_END         7, 7, 14, 15, [r6+32*2], [r6+32*3] ; right 32 columns
   1280    mova        [dstq+32*1], m0
   1281    add                dstq, strideq
   1282    dec                  hd
   1283    jg .w64_loop
   1284    RET
   1285 ALIGN function_align
   ; Precompute 128*left + 129*right for up to 64 rows of left pixels
   ; into rsp[0..127] (q3120 permute keeps lanes in row order after
   ; the unpacks). On return, r3 and tlq are rebased by -h so the
   ; loops above can index both with the down-counting h.
   1286 .prep_v:
   1287    vpermq               m2, [tlq-32*1], q3120
   1288    punpckhbw            m1, m2, m4
   1289    punpcklbw            m2, m4
   1290    pmaddubsw            m0, m1, m5 ; 127 * left - 127 * right
   1291    paddw                m0, m1     ; 128 * left + 129 * right
   1292    pmaddubsw            m1, m2, m5
   1293    paddw                m1, m2
   1294    vpermq               m2, [tlq-32*2], q3120
   1295    mova [rsp+gprsize+32*3], m0
   1296    mova [rsp+gprsize+32*2], m1
   1297    punpckhbw            m1, m2, m4
   1298    punpcklbw            m2, m4
   1299    pmaddubsw            m0, m1, m5
   1300    paddw                m0, m1
   1301    pmaddubsw            m1, m2, m5
   1302    paddw                m1, m2
   1303    mova [rsp+gprsize+32*1], m0
   1304    mova [rsp+gprsize+32*0], m1
   1305    sub                  r3, hq
   1306    sub                 tlq, hq
   1307    sub                  r3, hq ; r3 -= 2*h total (buffer is indexed by h*2)
   1308    ret
   1309 
   1310 cglobal ipred_z1_8bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase
   1311    lea                  r6, [ipred_z1_avx2_table]
   1312    tzcnt                wd, wm
   1313    movifnidn        angled, anglem
   1314    movifnidn            hd, hm
   1315    lea                  r7, [dr_intra_derivative]
   1316    inc                 tlq
   1317    movsxd               wq, [r6+wq*4]
   1318    add                  wq, r6
   1319    mov                 dxd, angled
   1320    and                 dxd, 0x7e
   1321    add              angled, 165 ; ~90
   1322    movzx               dxd, word [r7+dxq]
   1323    xor              angled, 0x4ff ; d = 90 - angle
   1324    vpbroadcastd         m3, [pw_512]
   1325    vpbroadcastd         m4, [pw_62]
   1326    vpbroadcastd         m5, [pw_64]
   1327    jmp                  wq
   1328 .w4:
   1329    cmp              angleb, 40
   1330    jae .w4_no_upsample
   1331    lea                 r3d, [angleq-1024]
   1332    sar                 r3d, 7
   1333    add                 r3d, hd
   1334    jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
   1335    ALLOC_STACK         -32, 8
   1336    mova                xm1, [tlq-1]
   1337    pshufb              xm0, xm1, [z_upsample1]
   1338    pshufb              xm1, [z_upsample2]
   1339    vpbroadcastd        xm2, [pb_36_m4] ; upshifted by 2 to be able to reuse
   1340    add                 dxd, dxd        ; pw_512 (which is already in m3)
   1341    pmaddubsw           xm0, xm2        ; for rounding instead of pw_2048
   1342    pextrd         [rsp+16], xm1, 3 ; top[max_base_x]
   1343    pmaddubsw           xm1, xm2
   1344    movd                xm7, dxd
   1345    mov                 r3d, dxd ; xpos
   1346    vpbroadcastw         m7, xm7
   1347    paddw               xm1, xm0
   1348    movq                xm0, [tlq]
   1349    pmulhrsw            xm1, xm3
   1350    pslldq               m6, m7, 8
   1351    paddw               xm2, xm7, xm7
   1352    lea                  r2, [strideq*3]
   1353    paddw                m6, m7
   1354    packuswb            xm1, xm1
   1355    paddw                m6, m2 ; xpos2 xpos3 xpos0 xpos1
   1356    punpcklbw           xm0, xm1
   1357    psllw                m7, 2
   1358    mova              [rsp], xm0
   1359 .w4_upsample_loop:
   1360    lea                 r5d, [r3+dxq]
   1361    shr                 r3d, 6 ; base0
   1362    vpbroadcastq         m1, [rsp+r3]
   1363    lea                 r3d, [r5+dxq]
   1364    shr                 r5d, 6 ; base1
   1365    vpbroadcastq         m2, [rsp+r5]
   1366    lea                 r5d, [r3+dxq]
   1367    shr                 r3d, 6 ; base2
   1368    movq                xm0, [rsp+r3]
   1369    lea                 r3d, [r5+dxq]
   1370    shr                 r5d, 6 ; base3
   1371    movhps              xm0, [rsp+r5]
   1372    vpblendd             m1, m2, 0xc0
   1373    pand                 m2, m4, m6 ; frac
   1374    vpblendd             m0, m1, 0xf0
   1375    psubw                m1, m5, m2 ; 64-frac
   1376    psllw                m2, 8
   1377    por                  m1, m2     ; 64-frac, frac
   1378    pmaddubsw            m0, m1
   1379    paddw                m6, m7     ; xpos += dx
   1380    pmulhrsw             m0, m3
   1381    packuswb             m0, m0
   1382    vextracti128        xm1, m0, 1
   1383    movd   [dstq+strideq*2], xm0
   1384    pextrd [dstq+r2       ], xm0, 1
   1385    movd   [dstq+strideq*0], xm1
   1386    pextrd [dstq+strideq*1], xm1, 1
   1387    lea                dstq, [dstq+strideq*4]
   1388    sub                  hd, 4
   1389    jg .w4_upsample_loop
   1390    RET
   1391 ALIGN function_align
   1392 .filter_strength: ; w4/w8/w16
   1393    ; The C version uses a lot of branches, but we can do all the comparisons
   1394    ; in parallel and use popcnt to get the final filter strength value.
   1395 %define base r3-z_filter_t0
   1396    lea                  r3, [z_filter_t0]
   1397    movd                xm0, maxbased
   1398    movd                xm2, angled
   1399    shr              angled, 8 ; is_sm << 1
   1400    vpbroadcastb         m0, xm0
   1401    vpbroadcastb         m2, xm2
   1402    pcmpeqb              m1, m0, [base+z_filter_wh]
   1403    pand                 m1, m2
   1404    mova                xm2, [r3+angleq*8] ; upper ymm half zero in both cases
   1405    pcmpgtb              m1, m2
   1406    pmovmskb            r5d, m1
   1407    ret
   1408 .w4_no_upsample:
   1409    ALLOC_STACK         -16, 11
   1410    mov            maxbased, 7
   1411    test             angled, 0x400 ; !enable_intra_edge_filter
   1412    jnz .w4_main
   1413    lea            maxbased, [hq+3]
   1414    call .filter_strength
   1415    mov            maxbased, 7
   1416    test                r5d, r5d
   1417    jz .w4_main ; filter_strength == 0
   1418    popcnt              r5d, r5d
   1419    vpbroadcastd         m7, [base+pb_8]
   1420    vbroadcasti128       m2, [tlq-1]
   1421    pminub               m1, m7, [base+z_filter_s]
   1422    vpbroadcastd         m8, [base+z_filter_k-4+r5*4+12*0]
   1423    pminub               m7, [base+z_filter_s+8]
   1424    vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*1]
   1425    vpbroadcastd        m10, [base+z_filter_k-4+r5*4+12*2]
   1426    pshufb               m0, m2, m1
   1427    shufps               m1, m7, q2121
   1428    pmaddubsw            m0, m8
   1429    pshufb               m1, m2, m1
   1430    pmaddubsw            m1, m9
   1431    pshufb               m2, m7
   1432    pmaddubsw            m2, m10
   1433    paddw                m0, m1
   1434    paddw                m0, m2
   1435    pmulhrsw             m0, m3
   1436    mov                 r3d, 9
   1437    mov                 tlq, rsp
   1438    cmp                  hd, 4
   1439    cmovne         maxbased, r3d
   1440    vextracti128        xm1, m0, 1
   1441    packuswb            xm0, xm1
   1442    mova              [tlq], xm0
   1443 .w4_main:
   1444    movd                xm6, dxd
   1445    vpbroadcastq         m0, [z_base_inc] ; base_inc << 6
   1446    vpbroadcastb         m7, [tlq+maxbaseq]
   1447    shl            maxbased, 6
   1448    vpbroadcastw         m6, xm6
   1449    mov                 r3d, dxd ; xpos
   1450    movd                xm9, maxbased
   1451    vpbroadcastw         m9, xm9
   1452    vbroadcasti128       m8, [z1_shuf_w4]
   1453    psrlw                m7, 8  ; top[max_base_x]
   1454    paddw               m10, m6, m6
   1455    psubw                m9, m0 ; max_base_x
   1456    vpblendd             m6, m10, 0xcc
   1457    mova                xm0, xm10
   1458    paddw                m6, m0 ; xpos2 xpos3 xpos0 xpos1
   1459    paddw               m10, m10
   1460 .w4_loop:
   1461    lea                 r5d, [r3+dxq]
   1462    shr                 r3d, 6 ; base0
   1463    vpbroadcastq         m1, [tlq+r3]
   1464    lea                 r3d, [r5+dxq]
   1465    shr                 r5d, 6 ; base1
   1466    vpbroadcastq         m2, [tlq+r5]
   1467    lea                 r5d, [r3+dxq]
   1468    shr                 r3d, 6 ; base2
   1469    movq                xm0, [tlq+r3]
   1470    lea                 r3d, [r5+dxq]
   1471    shr                 r5d, 6 ; base3
   1472    movhps              xm0, [tlq+r5]
   1473    vpblendd             m1, m2, 0xc0
   1474    pand                 m2, m4, m6 ; frac
   1475    vpblendd             m0, m1, 0xf0
   1476    psubw                m1, m5, m2 ; 64-frac
   1477    psllw                m2, 8
   1478    pshufb               m0, m8
   1479    por                  m1, m2     ; 64-frac, frac
   1480    pmaddubsw            m0, m1
   1481    pcmpgtw              m1, m9, m6 ; base < max_base_x
   1482    pmulhrsw             m0, m3
   1483    paddw                m6, m10    ; xpos += dx
   1484    lea                  r5, [dstq+strideq*2]
   1485    vpblendvb            m0, m7, m0, m1
   1486    packuswb             m0, m0
   1487    vextracti128        xm1, m0, 1
   1488    movd   [r5  +strideq*0], xm0
   1489    pextrd [r5  +strideq*1], xm0, 1
   1490    movd   [dstq+strideq*0], xm1
   1491    pextrd [dstq+strideq*1], xm1, 1
   1492    sub                  hd, 4
   1493    jz .w4_end
   1494    lea                dstq, [dstq+strideq*4]
   1495    cmp                 r3d, maxbased
   1496    jb .w4_loop
   1497    packuswb            xm7, xm7
   1498    lea                  r6, [strideq*3]
   1499 .w4_end_loop:
   1500    movd   [dstq+strideq*0], xm7
   1501    movd   [dstq+strideq*1], xm7
   1502    movd   [dstq+strideq*2], xm7
   1503    movd   [dstq+r6       ], xm7
   1504    lea                dstq, [dstq+strideq*4]
   1505    sub                  hd, 4
   1506    jg .w4_end_loop
   1507 .w4_end:
   1508    RET
; z1 prediction, width 8 (angle < 90: predict from the top edge only).
; This is part of ipred_z1_8bpc -- the function prologue (which loads tlq =
; top edge pixels, dxd = per-row x advance in 1/64-pel units, angled, hd,
; and the vector constants in m3/m4/m5) lies above this excerpt.
; NOTE(review): m3 appears to be the pmulhrsw rounding constant and m4/m5
; the fractional mask / 64 (mirroring m13/m14/m15 in ipred_z2 below) --
; confirm against the prologue.
ALIGN function_align
.w8:
   ; Fold the upsampling conditions into one unsigned compare:
   ; low byte = h, upper bits nonzero unless the angle qualifies.
   lea                 r3d, [angleq+216]
   mov                 r3b, hb
   cmp                 r3d, 8
   ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
   ; 2x upsample of the top edge with the (-4,36,36,-4) kernel encoded in
   ; pb_36_m4; dx is doubled to compensate, and the interleaved
   ; original/interpolated samples are written to a stack buffer.
   ALLOC_STACK         -32, 8
   movu                xm2, [z_filter_s+6]
   mova                xm0, [tlq-1]
   movd                xm6, hd
   vinserti128          m0, [tlq+7], 1
   vpbroadcastb        xm6, xm6
   vbroadcasti128       m1, [z_upsample1]
   pminub              xm6, xm2
   vpbroadcastd         m7, [pb_36_m4]
   vinserti128          m2, xm6, 1
   add                 dxd, dxd
   pshufb               m1, m0, m1
   pshufb               m2, m0, m2
   movd                xm6, dxd
   pmaddubsw            m1, m7
   pmaddubsw            m2, m7
   vpbroadcastw         m6, xm6
   mov                 r3d, dxd
   psrldq               m0, 1
   lea                  r2, [strideq*3]
   paddw                m7, m6, m6
   paddw                m1, m2
   vpblendd             m6, m7, 0xf0
   pmulhrsw             m1, m3
   pslldq               m2, m7, 8
   paddw                m7, m7
   paddw                m6, m2
   packuswb             m1, m1
   punpcklbw            m0, m1
   mova              [rsp], m0 ; upsampled edge, interleaved orig/interp
.w8_upsample_loop:
   ; 4 output rows per iteration; r3d/r5d alternate as the integer part
   ; of xpos (xpos >> 6 = base index into the upsampled edge buffer).
   lea                 r5d, [r3+dxq]
   shr                 r3d, 6 ; base0
   movu                xm0, [rsp+r3]
   lea                 r3d, [r5+dxq]
   shr                 r5d, 6 ; base1
   vinserti128          m0, [rsp+r5], 1
   lea                 r5d, [r3+dxq]
   shr                 r3d, 6 ; base2
   pand                 m1, m4, m6
   psubw                m2, m5, m1
   psllw                m1, 8
   por                  m2, m1
   punpcklqdq           m1, m2, m2 ; frac0 frac1
   pmaddubsw            m0, m1     ; linear blend: 64-frac, frac byte pairs
   movu                xm1, [rsp+r3]
   lea                 r3d, [r5+dxq]
   shr                 r5d, 6 ; base3
   vinserti128          m1, [rsp+r5], 1
   punpckhqdq           m2, m2 ; frac2 frac3
   pmaddubsw            m1, m2
   pmulhrsw             m0, m3
   paddw                m6, m7 ; xpos += dx (4 rows' worth)
   pmulhrsw             m1, m3
   packuswb             m0, m1
   vextracti128        xm1, m0, 1
   movq   [dstq+strideq*0], xm0
   movhps [dstq+strideq*2], xm0
   movq   [dstq+strideq*1], xm1
   movhps [dstq+r2       ], xm1
   lea                dstq, [dstq+strideq*4]
   sub                  hd, 4
   jg .w8_upsample_loop
   RET
.w8_no_intra_edge_filter:
   and            maxbased, 7
   or             maxbased, 8 ; imin(h+7, 15)
   jmp .w8_main
.w8_no_upsample:
   ; No upsampling: optionally smooth the edge with a 3- or 5-tap filter
   ; (strength from .filter_strength) into a stack copy, then run the
   ; main interpolation loop on it.
   ALLOC_STACK         -32, 10
   lea            maxbased, [hq+7]
   test             angled, 0x400
   jnz .w8_no_intra_edge_filter
   call .filter_strength
   test                r5d, r5d
   jz .w8_main ; filter_strength == 0
   popcnt              r5d, r5d
   movu                xm2, [tlq]
   pminub              xm1, xm0, [base+z_filter_s+14]
   vinserti128          m2, [tlq-1], 1
   vinserti128          m1, [base+z_filter_s+ 0], 1
   vpbroadcastd         m7, [base+z_filter_k-4+r5*4+12*0]
   pminub              xm0, [base+z_filter_s+22]
   vinserti128          m0, [base+z_filter_s+ 8], 1
   pshufb               m6, m2, m1
   pmaddubsw            m6, m7
   vpbroadcastd         m7, [base+z_filter_k-4+r5*4+12*1]
   movzx               r3d, byte [tlq+15] ; last edge pixel, re-stored below
   shufps               m1, m0, q2121
   pshufb               m1, m2, m1
   pmaddubsw            m1, m7
   paddw                m1, m6
   sub                 r5d, 3
   jnz .w8_3tap
   ; filter_strength == 3 uses a 5-tap filter instead of a 3-tap one,
   ; which also results in an awkward edge case where out[w*2] is
   ; slightly different from out[max_base_x] when h > w.
   vpbroadcastd         m7, [z_filter_k+4*8]
   movzx               r2d, byte [tlq+14]
   pshufb               m2, m0
   pmaddubsw            m2, m7
   sub                 r2d, r3d
   lea                 r2d, [r2+r3*8+4]
   shr                 r2d, 3 ; (tlq[w*2-2] + tlq[w*2-1]*7 + 4) >> 3
   mov            [rsp+16], r2b
   paddw                m1, m2
.w8_3tap:
   ; Round, repoint tlq at the filtered stack copy, and clamp maxbase.
   pmulhrsw             m1, m3
   sar                 r5d, 1
   mov                 tlq, rsp
   add                 r5d, 17 ; w*2 + (filter_strength == 3)
   cmp                  hd, 16
   cmovns         maxbased, r5d
   mov            [tlq+r5], r3b
   vextracti128        xm0, m1, 1
   packuswb            xm0, xm1
   mova              [tlq], xm0
.w8_main:
   ; Main loop setup: m7 = last edge pixel (replicated, used past
   ; max_base_x), m9 = max_base_x<<6 minus the per-column base increments,
   ; m2 = xpos for two rows (row1 offset by dx via the 0xf0 blend).
   movd                xm2, dxd
   vbroadcasti128       m0, [z_base_inc]
   vpbroadcastw         m2, xm2
   vpbroadcastb         m7, [tlq+maxbaseq]
   shl            maxbased, 6
   movd                xm9, maxbased
   vbroadcasti128       m8, [z_filter_s+2]
   vpbroadcastw         m9, xm9
   psrlw                m7, 8
   psubw                m9, m0
   mov                 r3d, dxd
   paddw                m6, m2, m2
   vpblendd             m2, m6, 0xf0
.w8_loop:
   ; 2 rows per iteration: base = xpos>>6, frac = xpos&0x3e;
   ; pmaddubsw blends adjacent pixels by (64-frac, frac), pmulhrsw
   ; rounds, vpblendvb substitutes the last pixel where base >= max.
   lea                 r5d, [r3+dxq]
   shr                 r3d, 6
   pand                 m0, m4, m2
   psubw                m1, m5, m0
   psllw                m0, 8
   por                  m1, m0
   movu                xm0, [tlq+r3]
   lea                 r3d, [r5+dxq]
   shr                 r5d, 6 ; base1
   vinserti128          m0, [tlq+r5], 1
   pshufb               m0, m8
   pmaddubsw            m0, m1
   pcmpgtw              m1, m9, m2 ; base < max_base_x
   paddw                m2, m6     ; xpos += 2*dx
   pmulhrsw             m0, m3
   vpblendvb            m0, m7, m0, m1
   vextracti128        xm1, m0, 1
   packuswb            xm0, xm1
   movq   [dstq+strideq*0], xm0
   movhps [dstq+strideq*1], xm0
   sub                  hd, 2
   jz .w8_end
   lea                dstq, [dstq+strideq*2]
   cmp                 r3d, maxbased
   jb .w8_loop
   ; Past the edge: every remaining row is the replicated last pixel.
   packuswb            xm7, xm7
.w8_end_loop:
   movq   [dstq+strideq*0], xm7
   movq   [dstq+strideq*1], xm7
   lea                dstq, [dstq+strideq*2]
   sub                  hd, 2
   jg .w8_end_loop
.w8_end:
   RET
.w16_no_intra_edge_filter:
   and            maxbased, 15
   or             maxbased, 16 ; imin(h+15, 31)
   jmp .w16_main
ALIGN function_align
; z1 prediction, width 16. Same structure as .w8 but without an upsample
; path (16-wide blocks never upsample): optional 3/5-tap edge filtering
; into a stack copy, then a 2-rows-per-iteration interpolation loop that
; processes 16 pixels as two 8-byte pmaddubsw groups.
.w16:
   ALLOC_STACK         -64, 12
   lea            maxbased, [hq+15]
   test             angled, 0x400
   jnz .w16_no_intra_edge_filter
   call .filter_strength
   test                r5d, r5d
   jz .w16_main ; filter_strength == 0
   popcnt              r5d, r5d
   ; Edge filter: m10/m11 hold the 32 edge pixels, the z_filter_s shuffle
   ; tables pick left/center/right neighbors, z_filter_k the tap weights
   ; selected by strength (r5d).
   vpbroadcastd         m1, [base+pb_12]
   vbroadcasti128       m6, [base+z_filter_s+8]
   vinserti128          m2, m6, [base+z_filter_s], 0
   vinserti128          m6, [base+z_filter_s+16], 1
   mova               xm10, [tlq-1]
   vinserti128         m10, [tlq+3], 1
   vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*0]
   vbroadcasti128       m7, [base+z_filter_s+14]
   vinserti128          m8, m7, [base+z_filter_s+6], 0
   vinserti128          m7, [base+z_filter_s+22], 1
   psubw                m0, m1
   movu               xm11, [tlq+12]
   vinserti128         m11, [tlq+16], 1
   pminub               m8, m0 ; clamp shuffle indices to the valid edge
   pminub               m7, m0
   pshufb               m0, m10, m2
   shufps               m2, m6, q2121
   pmaddubsw            m0, m9
   pshufb               m1, m11, m8
   shufps               m8, m7, q2121
   pmaddubsw            m1, m9
   vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*1]
   movzx               r3d, byte [tlq+31] ; last edge pixel
   pshufb               m2, m10, m2
   pmaddubsw            m2, m9
   pshufb               m8, m11, m8
   pmaddubsw            m8, m9
   paddw                m0, m2
   paddw                m1, m8
   sub                 r5d, 3
   jnz .w16_3tap
   ; strength == 3: extra 5-tap term plus the out[w*2] edge-case sample
   ; (same scheme as the .w8 path).
   vpbroadcastd         m9, [z_filter_k+4*8]
   movzx               r2d, byte [tlq+30]
   pshufb              m10, m6
   pmaddubsw           m10, m9
   pshufb              m11, m7
   pmaddubsw           m11, m9
   sub                 r2d, r3d
   lea                 r2d, [r2+r3*8+4]
   shr                 r2d, 3 ; (tlq[w*2-2] + tlq[w*2-1]*7 + 4) >> 3
   mov            [rsp+32], r2b
   paddw                m0, m10
   paddw                m1, m11
.w16_3tap:
   ; Round, store the filtered edge on the stack, retarget tlq at it.
   pmulhrsw             m0, m3
   pmulhrsw             m1, m3
   sar                 r5d, 1
   mov                 tlq, rsp
   add                 r5d, 33
   cmp                  hd, 32
   cmovns         maxbased, r5d
   mov            [tlq+r5], r3b
   packuswb             m0, m1
   vpermq               m0, m0, q3120
   mova              [tlq], m0
.w16_main:
   ; m7 = replicated last pixel, m9/m10 = thresholds for the low/high
   ; 8-column groups (m10 = m9 - 64*8), m6 = xpos for two rows,
   ; m11 = 2*dx row advance.
   movd                xm6, dxd
   vbroadcasti128       m0, [z_base_inc]
   vpbroadcastb         m7, [tlq+maxbaseq]
   shl            maxbased, 6
   vpbroadcastw         m6, xm6
   movd                xm9, maxbased
   vbroadcasti128       m8, [z_filter_s+2]
   vpbroadcastw         m9, xm9
   mov                 r3d, dxd
   psubw                m9, m0
   paddw               m11, m6, m6
   psubw               m10, m9, m3 ; 64*8
   vpblendd             m6, m11, 0xf0
.w16_loop:
   ; 2 rows x 16 px: blend adjacent edge pixels by (64-frac, frac),
   ; then clamp columns past max_base_x to the last pixel.
   lea                 r5d, [r3+dxq]
   shr                 r3d, 6 ; base0
   pand                 m1, m4, m6
   psubw                m2, m5, m1
   psllw                m1, 8
   por                  m2, m1
   movu                xm0, [tlq+r3+0]
   movu                xm1, [tlq+r3+8]
   lea                 r3d, [r5+dxq]
   shr                 r5d, 6 ; base1
   vinserti128          m0, [tlq+r5+0], 1
   vinserti128          m1, [tlq+r5+8], 1
   pshufb               m0, m8
   pshufb               m1, m8
   pmaddubsw            m0, m2
   pmaddubsw            m1, m2
   pmulhrsw             m0, m3
   pmulhrsw             m1, m3
   packuswb             m0, m1
   pcmpgtw              m1, m9, m6  ; base < max (low 8 columns)
   pcmpgtw              m2, m10, m6 ; base < max (high 8 columns)
   packsswb             m1, m2
   paddw                m6, m11 ; xpos += 2*dx
   vpblendvb            m0, m7, m0, m1
   mova         [dstq+strideq*0], xm0
   vextracti128 [dstq+strideq*1], m0, 1
   sub                  hd, 2
   jz .w16_end
   lea                dstq, [dstq+strideq*2]
   cmp                 r3d, maxbased
   jb .w16_loop
.w16_end_loop:
   ; Remaining rows are entirely past the edge: fill with last pixel.
   mova   [dstq+strideq*0], xm7
   mova   [dstq+strideq*1], xm7
   lea                dstq, [dstq+strideq*2]
   sub                  hd, 2
   jg .w16_end_loop
.w16_end:
   RET
ALIGN function_align
; z1 prediction, width 32. The edge filter is unconditional here when
; intra-edge filtering is enabled (no strength lookup: z_filter_k+4*2 taps
; are used directly). The main loop emits one full 32-pixel row per
; iteration. m11..m14 stage the 64 edge pixels in four 2x16 chunks; the
; byte offsets in the load comments refer to edge indices.
.w32:
   ALLOC_STACK         -96, 15
   lea                 r3d, [hq+31]
   mov            maxbased, 63
   cmp                  hd, 32
   cmovs          maxbased, r3d ; maxbase = imin(h+31, 63)
   test             angled, 0x400 ; !enable_intra_edge_filter
   jnz .w32_main
   vbroadcasti128       m0, [pb_0to15]
   sub                 r3d, 29 ; h+2
   movu               xm13, [tlq+29]    ; 32-39
   movd                xm1, r3d
   movu               xm14, [tlq+37]    ; 40-47
   sub                 r3d, 8 ; h-6
   vinserti128         m14, [tlq+51], 1 ; 56-63
   vpbroadcastb        xm1, xm1
   mova               xm11, [tlq- 1]    ;  0- 7
   vinserti128         m11, [tlq+13], 1 ; 16-23
   movd                xm2, r3d
   movu               xm12, [tlq+ 5]    ;  8-15
   vinserti128         m12, [tlq+19], 1 ; 24-31
   pminub              xm1, xm0 ; clip 32x8
   mova                 m7, [z_filter_s+0]
   pshufb             xm13, xm1
   vpbroadcastd         m1, [pb_12]
   vpbroadcastb        xm2, xm2
   vinserti128         m13, [tlq+43], 1 ; 48-55
   vinserti128          m8, m7, [z_filter_s+4], 1
   vpblendd             m2, m1, 0xf0
   vinserti128          m7, [z_filter_s+12], 0
   pminub               m2, m0 ; clip 32x16 and 32x(32|64)
   ; 3-term filter accumulation: same tap (m9) applied to each of the
   ; four chunks, repeated for the three z_filter_k coefficient groups.
   vpbroadcastd         m9, [z_filter_k+4*2+12*0]
   pshufb              m14, m2
   pshufb               m0, m11, m8
   shufps               m8, m7, q1021
   pmaddubsw            m0, m9
   pshufb               m2, m12, m8
   pmaddubsw            m2, m9
   pshufb               m1, m13, m8
   pmaddubsw            m1, m9
   pshufb               m6, m14, m8
   pmaddubsw            m6, m9
   vpbroadcastd         m9, [z_filter_k+4*2+12*1]
   pshufb              m10, m11, m8
   shufps               m8, m7, q2121
   pmaddubsw           m10, m9
   paddw                m0, m10
   pshufb              m10, m12, m8
   pmaddubsw           m10, m9
   paddw                m2, m10
   pshufb              m10, m13, m8
   pmaddubsw           m10, m9
   paddw                m1, m10
   pshufb              m10, m14, m8
   pmaddubsw           m10, m9
   paddw                m6, m10
   vpbroadcastd         m9, [z_filter_k+4*2+12*2]
   pshufb              m11, m8
   pmaddubsw           m11, m9
   pshufb              m12, m7
   pmaddubsw           m12, m9
   movzx               r3d, byte [tlq+63]
   movzx               r2d, byte [tlq+62]
   paddw                m0, m11
   paddw                m2, m12
   pshufb              m13, m7
   pmaddubsw           m13, m9
   pshufb              m14, m7
   pmaddubsw           m14, m9
   paddw                m1, m13
   paddw                m6, m14
   sub                 r2d, r3d
   lea                 r2d, [r2+r3*8+4] ; edge case for 32x64
   pmulhrsw             m0, m3
   pmulhrsw             m2, m3
   pmulhrsw             m1, m3
   pmulhrsw             m6, m3
   shr                 r2d, 3 ; (tlq[62] + tlq[63]*7 + 4) >> 3
   mov            [rsp+64], r2b
   mov                 tlq, rsp ; tlq now points at the filtered copy
   mov            [tlq+65], r3b
   mov                 r3d, 65
   cmp                  hd, 64
   cmove          maxbased, r3d
   packuswb             m0, m2
   packuswb             m1, m6
   mova           [tlq+ 0], m0
   mova           [tlq+32], m1
.w32_main:
   ; One 32-pixel row per iteration; m9/m10 gate the low/high 16 columns
   ; against max_base_x, m7 is the replicated last edge pixel.
   movd                xm6, dxd
   vpbroadcastb         m7, [tlq+maxbaseq]
   shl            maxbased, 6
   vpbroadcastw         m6, xm6
   movd                xm9, maxbased
   vbroadcasti128       m8, [z_filter_s+2]
   vpbroadcastw         m9, xm9
   mov                 r5d, dxd
   psubw                m9, [z_base_inc]
   mova                m11, m6
   psubw               m10, m9, m3 ; 64*8
.w32_loop:
   mov                 r3d, r5d
   shr                 r3d, 6 ; base = xpos >> 6
   pand                 m1, m4, m6
   psubw                m2, m5, m1
   psllw                m1, 8
   por                  m2, m1 ; (64-frac, frac) byte pairs
   movu                 m0, [tlq+r3+0]
   movu                 m1, [tlq+r3+8]
   add                 r5d, dxd ; xpos += dx
   pshufb               m0, m8
   pshufb               m1, m8
   pmaddubsw            m0, m2
   pmaddubsw            m1, m2
   pmulhrsw             m0, m3
   pmulhrsw             m1, m3
   packuswb             m0, m1
   pcmpgtw              m1, m9, m6
   pcmpgtw              m2, m10, m6
   packsswb             m1, m2
   paddw                m6, m11
   vpblendvb            m0, m7, m0, m1 ; clamp past-the-edge columns
   mova             [dstq], m0
   dec                  hd
   jz .w32_end
   add                dstq, strideq
   cmp                 r5d, maxbased
   jb .w32_loop
   ; Fill remaining rows with the last pixel; handle odd row count first
   ; so the tail loop can run two rows at a time.
   test                 hb, 1
   jz .w32_end_loop
   mova             [dstq], m7
   add                dstq, strideq
   dec                  hd
   jz .w32_end
.w32_end_loop:
   mova   [dstq+strideq*0], m7
   mova   [dstq+strideq*1], m7
   lea                dstq, [dstq+strideq*2]
   sub                  hd, 2
   jg .w32_end_loop
.w32_end:
   RET
ALIGN function_align
; z1 prediction, width 64. Edge filtering covers up to 128 edge pixels,
; processed as two batches of four 2x16 chunks (m11..m14 reused for the
; second batch); results land in a 128-byte stack buffer. The main loop
; writes each 64-pixel row as two 32-byte halves, with the past-the-edge
; mask built arithmetically (psraw/packsswb + pb_1to32/pb_32) instead of
; the word-compare scheme used by the narrower paths.
.w64:
   ALLOC_STACK        -128, 16
   lea            maxbased, [hq+63]
   test             angled, 0x400 ; !enable_intra_edge_filter
   jnz .w64_main
   ; --- filter edge pixels 0-63 (first batch) ---
   mova               xm11, [tlq- 1]    ;  0- 7
   vinserti128         m11, [tlq+13], 1 ; 16-23
   movu               xm12, [tlq+ 5]    ;  8-15
   vinserti128         m12, [tlq+19], 1 ; 24-31
   mova                 m7, [z_filter_s+0]
   vinserti128          m8, m7, [z_filter_s+4], 1
   vinserti128          m7, [z_filter_s+12], 0
   vpbroadcastd         m9, [z_filter_k+4*2+12*0]
   movu               xm13, [tlq+29]    ; 32-39
   vinserti128         m13, [tlq+43], 1 ; 48-55
   movu               xm14, [tlq+37]    ; 40-47
   vinserti128         m14, [tlq+51], 1 ; 56-63
   pshufb               m0, m11, m8
   shufps               m8, m7, q1021
   pmaddubsw            m0, m9
   pshufb               m2, m12, m8
   pmaddubsw            m2, m9
   pshufb               m1, m13, m8
   pmaddubsw            m1, m9
   pshufb               m6, m14, m8
   pmaddubsw            m6, m9
   vpbroadcastd         m9, [z_filter_k+4*2+12*1]
   pshufb              m10, m11, m8
   shufps              m15, m8, m7, q2121 ; m15 kept live for batch 2
   pmaddubsw           m10, m9
   paddw                m0, m10
   pshufb              m10, m12, m15
   pmaddubsw           m10, m9
   paddw                m2, m10
   pshufb              m10, m13, m15
   pmaddubsw           m10, m9
   paddw                m1, m10
   pshufb              m10, m14, m15
   pmaddubsw           m10, m9
   paddw                m6, m10
   vpbroadcastd        m10, [z_filter_k+4*2+12*2]
   pshufb              m11, m15
   pmaddubsw           m11, m10
   pshufb              m12, m7
   pmaddubsw           m12, m10
   pshufb              m13, m7
   pmaddubsw           m13, m10
   pshufb              m14, m7
   pmaddubsw           m14, m10
   paddw                m0, m11
   paddw                m2, m12
   paddw                m1, m13
   paddw                m6, m14
   ; --- load edge pixels 64-127 (second batch) while batch 1 rounds ---
   movu               xm11, [tlq+ 61]    ;  64- 71
   vinserti128         m11, [tlq+ 75], 1 ;  80- 87
   movu               xm12, [tlq+ 69]    ;  72- 79
   vinserti128         m12, [tlq+ 83], 1 ;  88- 95
   movu               xm13, [tlq+ 93]    ;  96-103
   vinserti128         m13, [tlq+107], 1 ; 112-119
   movu               xm14, [tlq+101]    ; 104-111
   vinserti128         m14, [tlq+115], 1 ; 120-127
   pmulhrsw             m0, m3
   pmulhrsw             m2, m3
   pmulhrsw             m1, m3
   pmulhrsw             m6, m3
   lea                 r3d, [hq-20]
   mov                 tlq, rsp ; tlq now points at the filtered copy
   packuswb             m0, m2
   packuswb             m1, m6
   vpbroadcastd        xm2, [pb_14]
   vbroadcasti128       m6, [pb_0to15]
   mova         [tlq+32*0], m0
   mova         [tlq+32*1], m1
   ; Clamp shuffle indices so short edges (h < 64) replicate their last
   ; valid pixel instead of reading garbage.
   movd                xm0, r3d
   vpbroadcastd         m1, [pb_12]
   vpbroadcastb         m0, xm0
   paddb                m0, m2
   pminub               m0, m6 ; clip 64x16 and 64x32
   pshufb              m12, m0
   pminub               m1, m6 ; clip 64x64
   pshufb              m14, m1
   pshufb               m0, m11, m7
   pmaddubsw            m0, m10
   pshufb               m2, m12, m7
   pmaddubsw            m2, m10
   pshufb               m1, m13, m7
   pmaddubsw            m1, m10
   pshufb               m6, m14, m7
   pmaddubsw            m6, m10
   pshufb               m7, m11, m15
   pmaddubsw            m7, m9
   pshufb              m10, m12, m15
   pmaddubsw           m10, m9
   paddw                m0, m7
   pshufb               m7, m13, m15
   pmaddubsw            m7, m9
   paddw                m2, m10
   pshufb              m10, m14, m15
   pmaddubsw           m10, m9
   paddw                m1, m7
   paddw                m6, m10
   vpbroadcastd         m9, [z_filter_k+4*2+12*0]
   pshufb              m11, m8
   pmaddubsw           m11, m9
   pshufb              m12, m8
   pmaddubsw           m12, m9
   pshufb              m13, m8
   pmaddubsw           m13, m9
   pshufb              m14, m8
   pmaddubsw           m14, m9
   paddw                m0, m11
   paddw                m2, m12
   paddw                m1, m13
   paddw                m6, m14
   pmulhrsw             m0, m3
   pmulhrsw             m2, m3
   pmulhrsw             m1, m3
   pmulhrsw             m6, m3
   packuswb             m0, m2
   packuswb             m1, m6
   mova         [tlq+32*2], m0
   mova         [tlq+32*3], m1
.w64_main:
   ; m7 = replicated last edge pixel; m6 = base offset relative to
   ; maxbase (so the blend mask can be derived via psraw below);
   ; m10/m11 = pb_1to32 / pb_32 column-index constants for the mask.
   movd               xm12, dxd
   vpbroadcastb         m7, [tlq+maxbaseq]
   lea                 r3d, [dxq-64]
   shl            maxbased, 6
   vpbroadcastw        m12, xm12
   sub                 r3d, maxbased
   vbroadcasti128       m8, [z_filter_s+2]
   movd                xm6, r3d
   mov                 r5d, dxd
   mova                m10, [pb_1to32]
   vpbroadcastd        m11, [pb_32]
   vpbroadcastw         m6, xm6
.w64_loop:
   mov                 r3d, r5d
   shr                 r3d, 6 ; base = xpos >> 6
   ; columns 0-31
   movu                 m0, [tlq+r3+ 0]
   movu                 m1, [tlq+r3+ 8]
   pand                 m2, m4, m6
   psubw                m9, m5, m2
   psllw                m2, 8
   por                  m9, m2 ; (64-frac, frac) byte pairs
   pshufb               m0, m8
   pshufb               m1, m8
   pmaddubsw            m0, m9
   pmaddubsw            m1, m9
   psraw                m2, m6, 6 ; signed distance past maxbase
   pmulhrsw             m0, m3
   pmulhrsw             m1, m3
   packsswb             m2, m2
   paddb                m2, m10 ; per-column past-the-edge mask
   packuswb             m0, m1
   vpblendvb            m0, m7, m0, m2
   mova          [dstq+ 0], m0
   ; columns 32-63 (same frac, base shifted by 32)
   movu                 m0, [tlq+r3+32]
   movu                 m1, [tlq+r3+40]
   add                 r5d, dxd ; xpos += dx
   pshufb               m0, m8
   pshufb               m1, m8
   pmaddubsw            m0, m9
   pmaddubsw            m1, m9
   paddb                m2, m11 ; advance mask by 32 columns
   pmulhrsw             m0, m3
   pmulhrsw             m1, m3
   paddw                m6, m12
   packuswb             m0, m1
   vpblendvb            m0, m7, m0, m2
   mova          [dstq+32], m0
   dec                  hd
   jz .w64_end
   add                dstq, strideq
   cmp                 r5d, maxbased
   jb .w64_loop
.w64_end_loop:
   ; Remaining rows are entirely past the edge.
   mova          [dstq+ 0], m7
   mova          [dstq+32], m7
   add                dstq, strideq
   dec                  hd
   jg .w64_end_loop
.w64_end:
   RET
   2131 
; z2 directional prediction (90 < angle < 180): predicts from both the
; top and the left edge. Prologue: derive dy = derivative[angle-90] and
; dx = derivative[180-angle] from the dr_intra_derivative table, copy the
; edge pixels [tlq-64 .. tlq+31] into a stack buffer (rsp+0..95, with the
; top-left corner at rsp+64), negate both steps, then dispatch on
; log2(width) through ipred_z2_avx2_table. The body continues past this
; excerpt.
cglobal ipred_z2_8bpc, 3, 10, 16, 224, dst, stride, tl, w, h, angle, dx, dy
%define base r9-z_filter_t0
   lea                  r9, [ipred_z2_avx2_table]
   tzcnt                wd, wm
   movifnidn        angled, anglem
   movifnidn            hd, hm
   lea                 dxq, [dr_intra_derivative-90]
   movsxd               wq, [r9+wq*4]
   movzx               dyd, angleb
   xor              angled, 0x400 ; toggle enable_intra_edge_filter bit
   mov                  r8, dxq
   sub                 dxq, dyq
   add                  wq, r9
   add                  r9, z_filter_t0-ipred_z2_avx2_table
   mova                 m2, [tlq-64]
   mova                 m0, [tlq-32]
   mova                 m1, [tlq]
   ; Clear bit 0 so the byte angle forms a valid word offset into the
   ; word-sized derivative table.
   and                 dyd, ~1
   and                 dxq, ~1
   movzx               dyd, word [r8+dyq]  ; angle - 90
   movzx               dxd, word [dxq+270] ; 180 - angle
   vpbroadcastd        m13, [base+pw_512] ; pmulhrsw rounding constant
   vpbroadcastd        m14, [base+pw_62]  ; fractional-bits mask
   vpbroadcastd        m15, [base+pw_64]
   mova           [rsp+ 0], m2
   mova           [rsp+32], m0
   mova           [rsp+64], m1
   neg                 dxd ; x steps leftward, y steps upward
   neg                 dyd
   jmp                  wq
; z2, width 4: set up xpos/ypos state (r2d = xpos, xm5 = per-row y
; positions, r8d = x clamp), then decide whether the top edge gets
; 2x upsampling or smoothing before falling into the shared left-edge
; filtering path.
.w4:
   vpbroadcastq         m6, [base+z2_base_inc] ; base_inc << 6
   vbroadcasti128      m10, [base+z1_shuf_w4]
   vbroadcasti128      m11, [base+z2_shuf_h4]
   lea                 r2d, [dxq+(65<<6)] ; xpos
   movd                xm5, dyd
   mov                 r8d, (63-4)<<6
   mov                 dyq, -4
   pshuflw             xm5, xm5, q0000
   pmullw              xm5, [base+z2_ymul] ; ypos = dy * row index
   test             angled, 0x400
   jnz .w4_main ; !enable_intra_edge_filter
   ; Packed condition test: (h+2)<<6 vs the biased angle decides whether
   ; the top edge qualifies for upsampling.
   lea                 r3d, [hq+2]
   add              angled, 1022
   shl                 r3d, 6
   test                r3d, angled
   jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
   vpbroadcastd        xm3, [base+pb_4]
   call .upsample_above
   sub              angled, 1075 ; angle - 53
   lea                 r3d, [hq+3]
   xor              angled, 0x7f ; 180 - angle
   call .filter_strength
   jmp .w4_filter_left
ALIGN function_align
; Edge-filter strength lookup.
; In:  r3d = block-size key (callers pass h+3 or similar), angled.
; Out: r3d = comparison bitmask; callers take popcnt(r3d) as the filter
;      strength (0 if the mask is empty).
; Clobbers m0, m7, m8, m9 (m7/m8/m9 are deliberately left holding the
; broadcast angle / size-match mask / thresholds -- .w4_no_filter_above
; reuses them to redo the test for the left edge).
.filter_strength:
   movd                xm8, r3d
   mov                 r3d, angled
   movd                xm7, angled
   vpbroadcastb         m8, xm8
   shr                 r3d, 8 ; is_sm << 1
   vpbroadcastb         m7, xm7
   pcmpeqb              m8, [base+z_filter_wh] ; select the (w,h) row
   mova                xm9, [r9+r3*8]          ; angle thresholds (by is_sm)
   pand                 m0, m8, m7
   pcmpgtb              m0, m9 ; angle > threshold per strength level
   pmovmskb            r3d, m0
   ret
ALIGN function_align
; 2x-upsample the top edge (w4/w8 z2 paths) with the pb_36_m4 tap pairs
; (36, -4), doubling dx and advancing r2d (xpos) to match the finer grid.
; The interleaved original/interpolated edge is written back over the
; top-edge slot of the stack buffer (rsp+64, +gprsize for the call's
; return address). Expects xm1 = top edge pixels, xm3 = clip constant
; from the caller.
.upsample_above: ; w4/w8
   pshufb              xm2, xm1, [base+z_upsample1-2]
   pminub              xm3, [base+z_filter_s+4] ; clamp indices to edge end
   vpbroadcastd        xm4, [base+pb_36_m4]
   vbroadcasti128      m10, [base+pb_0to15]
   pshufb              xm3, xm1, xm3
   pmaddubsw           xm2, xm4
   pmaddubsw           xm3, xm4
   lea                 r2d, [r2+dxq+(1<<6)]
   add                 dxd, dxd ; dx *= 2 to match the upsampled grid
   paddw               xm2, xm3
   pmulhrsw            xm2, xm13
   sub                 r8d, 3<<6
   paddw                m6, m6
   packuswb            xm2, xm2
   punpcklbw           xm1, xm2 ; interleave orig/interpolated samples
   mova   [rsp+gprsize+64], xm1
   ret
ALIGN function_align
; 2x-upsample the left edge (h4/h8 z2 paths), mirroring .upsample_above:
; pb_36_m4 tap pairs, dy doubled, ypos (xm5) doubled and rebiased, and
; the interleaved result written to the left-edge slot of the stack
; buffer (rsp+48, +gprsize for the call's return address).
.upsample_left: ; h4/h8
   mov                 r3d, hd
   and                 r3d, 4 ; h-dependent index clamp (h4 vs h8)
   movd                xm2, [rsp+gprsize+64]
   movddup             xm0, [rsp+gprsize+56]
   movd                xm1, r3d
   palignr             xm2, xm0, 1
   vpbroadcastb        xm1, xm1
   pshufb              xm2, [base+z_filter_s+18]
   vpbroadcastd        xm3, [base+pb_36_m4]
   pmaxub              xm1, [base+z_upsample1-2]
   pshufb              xm1, xm0, xm1
   pmaddubsw           xm2, xm3
   pmaddubsw           xm1, xm3
   paddw               xm5, xm5 ; ypos *= 2 to match the upsampled grid
   add                 dyq, dyq
   paddw               xm1, xm2
   pmulhrsw            xm1, xm13
   vbroadcasti128      m11, [base+z2_upsample]
   paddw               xm5, xm15 ; rebias ypos by 64 (one fine step)
   packuswb            xm1, xm1
   punpcklbw           xm0, xm1 ; interleave orig/interpolated samples
   mova   [rsp+gprsize+48], xm0
   ret
; Top edge not upsampled: compute a filter strength for it and, if
; nonzero, smooth the 4 top pixels with the z_filter_k taps, clamp
; against max_width (r6m) so pixels beyond the visible edge keep their
; unfiltered values, and store the result back into the stack edge
; buffer at rsp+65 (just right of the top-left corner at rsp+64).
.w4_no_upsample_above:
   lea                 r3d, [hq+3]
   sub              angled, 1112 ; angle - 90
   call .filter_strength
   test                r3d, r3d
   jz .w4_no_filter_above
   popcnt              r3d, r3d ; filter strength (1-3)
   vpbroadcastd        xm2, [base+pb_4]
   pminub              xm2, [base+z_filter_s]
   vpbroadcastd        xm0, [base+z_filter_k-4+r3*4+12*0]
   vpbroadcastd        xm4, [base+z_filter_k-4+r3*4+12*1]
   pshufb              xm3, xm1, xm2 ; 00 01 12 23
   pshufd              xm2, xm2, q0321
   pmaddubsw           xm0, xm3, xm0
   pshufb              xm2, xm1, xm2 ; 12 23 34 44
   pmaddubsw           xm2, xm4
   vpbroadcastd        xm4, [base+z_filter_k-4+r3*4+12*2]
   punpckhqdq          xm3, xm3      ; 34 44 44 44
   pmaddubsw           xm3, xm4
   vpbroadcastd        xm4, r6m      ; max_width
   packssdw            xm4, xm4
   paddw               xm0, xm2
   paddw               xm0, xm3
   pmulhrsw            xm0, xm13
   packsswb            xm4, xm4
   psrlq               xm1, 8
   psubb               xm4, [base+pb_1to32] ; mask: column < max_width
   packuswb            xm0, xm0
   vpblendvb           xm0, xm1, xm4 ; keep unfiltered pixels past max_width
   movd           [rsp+65], xm0
   2274 .w4_no_filter_above:
   2275    lea                 r3d, [hq+2]
   2276    add              angled, 973 ; angle + 883
   2277    shl                 r3d, 6
   2278    test                r3d, angled
   2279    jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
   2280    vpbroadcastd        xm0, [base+pb_90]
   2281    psubb               xm0, xm7 ; 180 - angle
   2282    pand                xm0, xm8 ; reuse from previous filter_strength call
   2283    pcmpgtb             xm0, xm9
   2284    pmovmskb            r3d, xm0
   2285 .w4_filter_left:
   2286    test                r3d, r3d
   2287    jz .w4_main
   2288    popcnt              r3d, r3d
   2289    mov                 r5d, 10
   2290    cmp                  hd, 16
   2291    movu                xm2, [rsp+49]
   2292    vinserti128          m2, [rsp+43], 1
   2293    cmovs               r5d, hd
   2294    xor                 r5d, 15 ; h == 16 ? 5 : 15 - h
   2295    movd                xm0, r5d
   2296    vbroadcasti128       m1, [base+z_filter_s+12]
   2297    vbroadcasti128       m4, [base+z_filter_s+16]
   2298    vinserti128          m3, m1, [z_filter_s+8], 1   ; 56 67 78 89 9a ab bc cd   55 55 56 67 78 89 9a ab
   2299    vpblendd             m1, m4, 0x0f                ; 78 89 9a ab bc cd de ef   56 67 78 89 9a ab bc cd
   2300    vinserti128          m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff   78 89 9a ab bc cd de ef
   2301    vpbroadcastb         m0, xm0
   2302    pmaxub               m0, m3
   2303    vpbroadcastd         m3, [base+z_filter_k-4+r3*4+12*0]
   2304    pshufb               m0, m2, m0
   2305    pmaddubsw            m0, m3
   2306    vpbroadcastd         m3, [base+z_filter_k-4+r3*4+12*1]
   2307    pshufb               m1, m2, m1
   2308    pmaddubsw            m1, m3
   2309    vpbroadcastd         m3, [base+z_filter_k-4+r3*4+12*2]
   2310    pshufb               m2, m4
   2311    pmaddubsw            m2, m3
   2312    vpbroadcastd        xm4, r7m ; max_height
   2313    packssdw            xm4, xm4
   2314    paddw                m1, m0
   2315    paddw                m1, m2
   2316    pmulhrsw             m1, m13
   2317    packsswb            xm4, xm4
   2318    vextracti128        xm0, m1, 1
   2319    psubb               xm4, [base+pb_16to1]
   2320    packuswb            xm0, xm1
   2321    vpblendvb           xm0, [rsp+48], xm4
   2322    mova           [rsp+48], xm0
   2323    jmp .w4_main
   2324 .w4_upsample_left:
   2325    call .upsample_left
   ; z2 w==4 inner loop: each output pixel is interpolated either from
   ; the top edge (base_x advances by dx per column) or, once base_x
   ; crosses the top-left corner, from the left edge (base_y advances
   ; by dy per row, fetched with vpgatherdq).  Positions are 6.6
   ; fixed point: high bits = base index, low 6 bits = frac.
   ; NOTE(review): m13/m14/m15 are loaded before this chunk --
   ; presumably pw_512 (round), pw_62 (frac mask), pw_64; confirm.
   2326 .w4_main:
   2327    movd                xm0, dxd
   2328    mova                m12, [base+z2_y_shuf_h4]
   2329    lea                  r5, [rsp+56]  ; left-7
   2330    vpbroadcastw         m0, xm0
   2331    lea                  r9, [strideq*3]
   2332    psraw               xm1, xm5, 6
   2333    pand                xm5, xm14      ; frac_y
   2334    pxor                xm2, xm2
   2335    paddw                m7, m0, m0
   2336    psubw               xm4, xm2, xm1  ; base_y
   2337    vpblendd             m0, m7, 0xcc
   2338    mova                xm1, xm7
   2339    punpcklwd           xm4, xm2
   2340    paddw                m0, m1        ; xpos2 xpos3 xpos0 xpos1
   2341    psubw               xm1, xm15, xm5 ; 64-frac_y
   2342    psllw               xm5, 8
   2343    paddw                m7, m7
   2344    paddw                m6, m0
   ; Interleave (64-frac_y, frac_y) bytes so a single pmaddubsw
   ; does the two-tap vertical blend.
   2345    por                 xm5, xm1       ; 64-frac_y, frac_y
   2346    vpbroadcastq         m5, xm5
   2347 .w4_loop:
   ; Four top-edge rows per iteration: compute base_x0..3 and load
   ; the corresponding byte pairs from the edge buffer on the stack.
   2348    lea                 r3d, [r2+dxq]
   2349    shr                 r2d, 6         ; base_x0
   2350    vpbroadcastq         m1, [rsp+r2]
   2351    lea                 r2d, [r3+dxq]
   2352    shr                 r3d, 6         ; base_x1
   2353    vpbroadcastq         m2, [rsp+r3]
   2354    lea                 r3d, [r2+dxq]
   2355    shr                 r2d, 6         ; base_x2
   2356    movq                xm0, [rsp+r2]
   2357    lea                 r2d, [r3+dxq]
   2358    shr                 r3d, 6         ; base_x3
   2359    movhps              xm0, [rsp+r3]
   2360    vpblendd             m1, m2, 0xc0
   2361    pand                 m2, m14, m6   ; frac_x
   2362    vpblendd             m0, m1, 0xf0
   2363    psubw                m1, m15, m2   ; 64-frac_x
   2364    psllw                m2, 8
   2365    pshufb               m0, m10
   2366    por                  m1, m2        ; 64-frac_x, frac_x
   2367    pmaddubsw            m0, m1
   ; If every base_x is still inside the top edge, skip the
   ; left-edge gather entirely.
   2368    cmp                 r3d, 64
   2369    jge .w4_toponly
   2370    mova                 m1, m7        ; arbitrary negative value
   2371    vpgatherdq           m3, [r5+xm4], m1
   2372    pshufb               m1, m3, m11
   2373    vpermd               m1, m12, m1
   2374    pmaddubsw            m1, m5
   ; Sign of xpos selects per-byte between the top-edge and
   ; left-edge interpolation results.
   2375    psraw                m2, m6, 15    ; base_x < topleft
   2376    vpblendvb            m0, m1, m2
   2377 .w4_toponly:
   2378    pmulhrsw             m0, m13
   2379    paddw                m6, m7        ; xpos += dx
   2380    add                  r5, dyq
   2381    packuswb             m0, m0
   2382    vextracti128        xm1, m0, 1
   ; Rows come out in 2 3 0 1 order (see xpos shuffle above).
   2383    movd   [dstq+strideq*2], xm0
   2384    pextrd [dstq+r9       ], xm0, 1
   2385    movd   [dstq+strideq*0], xm1
   2386    pextrd [dstq+strideq*1], xm1, 1
   2387    sub                  hd, 4
   2388    jz .w4_end
   2389    lea                dstq, [dstq+strideq*4]
   2390    cmp                 r2d, r8d
   2391    jge .w4_loop
   ; Once base_x has run off the top edge for good, only the
   ; left-edge gather path remains.
   2392 .w4_leftonly_loop:
   2393    mova                 m1, m7
   2394    vpgatherdq           m2, [r5+xm4], m1
   2395    add                  r5, dyq
   2396    pshufb               m0, m2, m11
   2397    vpermd               m0, m12, m0
   2398    pmaddubsw            m0, m5
   2399    pmulhrsw             m0, m13
   2400    packuswb             m0, m0
   2401    vextracti128        xm1, m0, 1
   2402    movd   [dstq+strideq*2], xm0
   2403    pextrd [dstq+r9       ], xm0, 1
   2404    movd   [dstq+strideq*0], xm1
   2405    pextrd [dstq+strideq*1], xm1, 1
   2406    lea                dstq, [dstq+strideq*4]
   2407    sub                  hd, 4
   2408    jg .w4_leftonly_loop
   2409 .w4_end:
   2410    RET
   ; z2 w==8 entry: set up xpos/ypos stepping, then (unless the
   ; 0x400 "no edge filter" flag is set) optionally upsample or
   ; 3-tap-filter the top and left edges, mirroring the w4 path.
   2411 .w8:
   2412    vbroadcasti128       m6, [base+z2_base_inc] ; base_inc << 6
   2413    movd                xm5, dyd
   2414    vbroadcasti128      m10, [base+z_filter_s+2]
   2415    vbroadcasti128      m11, [base+z2_shuf_h4]
   2416    lea                 r2d, [dxq+(65<<6)] ; xpos
   2417    vpbroadcastw        xm5, xm5
   2418    mov                 r8d, (63-8)<<6
   2419    mov                 dyq, -4
   2420    pmullw              xm5, [base+z2_ymul]
   2421    test             angled, 0x400
   2422    jnz .w8_main
   ; Top-edge upsample condition folds angle and h into one compare
   ; (comment on the branch gives the decoded predicate).
   2423    lea                 r3d, [angleq+126]
   2424    mov                 r3b, hb
   2425    cmp                 r3d, 8
   2426    ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
   2427    vpbroadcastd        xm3, [base+pb_8]
   2428    movhps         [rsp+80], xm1
   2429    call .upsample_above
   2430    sub              angled, 53 ; angle - 53
   2431    lea                 r3d, [hq+7]
   2432    xor              angled, 0x7f ; 180 - angle
   2433    call .filter_strength
   2434    jmp .w8_filter_left
   2435 .w8_no_upsample_above:
   2436    lea                 r3d, [hq+7]
   2437    sub              angled, 90 ; angle - 90
   2438    call .filter_strength
   2439    test                r3d, r3d
   2440    jz .w8_no_filter_above
   2441    popcnt              r3d, r3d
   2442    vpbroadcastd        xm3, [base+pb_8]
   2443    pminub              xm3, [base+z_filter_s+8]
   2444    vpbroadcastd        xm0, [base+z_filter_k-4+r3*4+12*0]
   2445    vpbroadcastd        xm4, [base+z_filter_k-4+r3*4+12*1]
   2446    pshufb              xm2, xm1, [base+z_filter_s] ; 00 01 12 23 34 45 56 67
   2447    pmaddubsw           xm0, xm2, xm0
   2448    pshufb              xm3, xm1, xm3               ; 34 45 56 67 78 88 88 88
   2449    shufps              xm2, xm3, q2121             ; 12 23 34 45 56 67 78 88
   2450    pmaddubsw           xm2, xm4
   2451    vpbroadcastd        xm4, [base+z_filter_k-4+r3*4+12*2]
   2452    pmaddubsw           xm3, xm4
   2453    vpbroadcastd        xm4, r6m ; max_width
   2454    packssdw            xm4, xm4
   2455    paddw               xm0, xm2
   2456    paddw               xm0, xm3
   2457    pmulhrsw            xm0, xm13
   2458    packsswb            xm4, xm4
   2459    psrldq              xm1, 1
   ; Keep filtered top pixels only within max_width; otherwise the
   ; original (shifted) edge bytes in xm1 pass through.
   2460    psubb               xm4, [base+pb_1to32]
   2461    packuswb            xm0, xm0
   2462    vpblendvb           xm0, xm1, xm4
   2463    movq           [rsp+65], xm0
   2464 .w8_no_filter_above:
   2465    lea                 r3d, [angleq-51]
   2466    mov                 r3b, hb
   2467    cmp                 r3d, 8
   2468    jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm
   2469    vpbroadcastd         m0, [base+pb_90]
   2470    psubb                m0, m7
   2471    pand                 m0, m8
   2472    pcmpgtb              m0, m9
   2473    pmovmskb            r3d, m0
   ; Left-edge filter; h==32 needs an extra 16-pixel pass first.
   2474 .w8_filter_left:
   2475    test                r3d, r3d
   2476    jz .w8_main
   2477    popcnt              r3d, r3d
   2478    vpbroadcastd         m7, [base+z_filter_k-4+r3*4+12*0]
   2479    vpbroadcastd         m8, [base+z_filter_k-4+r3*4+12*1]
   2480    vpbroadcastd         m9, [base+z_filter_k-4+r3*4+12*2]
   2481    cmp                  hd, 32
   2482    jne .w8_filter_left_h16
   2483    movu                xm2, [rsp+27]
   2484    vinserti128          m2, [rsp+35], 1
   2485    vpbroadcastd        xm0, [base+pb_5]
   2486    vbroadcasti128       m3, [base+z_filter_s+ 8]
   2487    vbroadcasti128       m1, [base+z_filter_s+12]
   2488    vbroadcasti128       m4, [base+z_filter_s+16]
   2489    pmaxub               m3, m0
   2490    pshufb               m3, m2, m3
   2491    pmaddubsw            m3, m7
   2492    pshufb               m1, m2, m1
   2493    pmaddubsw            m1, m8
   2494    pshufb               m2, m4
   2495    pmaddubsw            m2, m9
   2496    paddw                m3, m1
   2497    paddw                m3, m2
   2498    pmulhrsw             m3, m13
   2499    jmp .w8_filter_left_top16
   2500 .w8_filter_left_h16:
   2501    mov                 r5d, 10
   2502    cmp                  hd, 16
   2503    cmovs               r5d, hd
   2504    xor                 r5d, 15 ; h == 16 ? 5 : 15 - h
   2505    movd                xm0, r5d
   2506    vpbroadcastb         m0, xm0
   2507 .w8_filter_left_top16:
   2508    vbroadcasti128       m1, [base+z_filter_s+12]
   2509    vinserti128          m2, m1, [base+z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd   55 55 56 67 78 89 9a ab
   2510    vbroadcasti128       m4, [base+z_filter_s+16]
   2511    vpblendd             m1, m4, 0x0f                   ; 78 89 9a ab bc cd de ef   56 67 78 89 9a ab bc cd
   2512    vinserti128          m4, [base+z_filter_s+20], 0    ; 9a ab bc cd de ef ff ff   78 89 9a ab bc cd de ef
   2513    pmaxub               m0, m2
   2514    movu                xm2, [rsp+49]
   2515    vinserti128          m2, [rsp+43], 1
   2516    pshufb               m0, m2, m0
   2517    pmaddubsw            m0, m7
   2518    vpbroadcastd         m7, r7m ; max_height
   2519    pshufb               m1, m2, m1
   2520    pmaddubsw            m1, m8
   2521    pshufb               m2, m4
   2522    pmaddubsw            m2, m9
   2523    packssdw             m7, m7
   2524    paddw                m1, m0
   2525    packsswb             m7, m7
   2526    paddw                m1, m2
   2527    pmulhrsw             m1, m13
   ; max_height clamp for the filtered left edge (bytes above the
   ; visible height keep their unfiltered values).
   2528    psubb                m7, [base+pb_32to1]
   2529    packuswb             m3, m1
   2530    vpermq               m3, m3, q1320
   2531    vpblendvb            m3, [rsp+32], m7
   2532    mova           [rsp+32], m3
   2533    jmp .w8_main
   2534 .w8_upsample_left:
   2535    call .upsample_left
   ; z2 w==8 inner loop: two xpos rows per ymm register, four rows
   ; per iteration.  Left-edge pixels are gathered per-row with
   ; vpgatherdq using precomputed base_y offsets (xm8/xm9), then
   ; blended against the top-edge interpolation by the sign of xpos.
   2536 .w8_main:
   2537    movd                xm3, dxd
   2538    lea                  r5, [rsp+56]  ; left-7
   2539    pshufd              xm1, xm5, q3120
   2540    pand                xm5, xm14
   2541    vpbroadcastw         m3, xm3
   2542    pxor                xm0, xm0
   2543    psubw               xm2, xm15, xm5
   2544    psraw               xm1, 6
   2545    lea                  r9, [strideq*3]
   2546    paddw                m7, m3, m3
   2547    psubw               xm9, xm0, xm1  ; base_y
   2548    psllw               xm5, 8
   2549    punpcklwd           xm8, xm9, xm0  ; base_y 0, 1, 4, 5
   2550    vpblendd             m3, m7, 0xf0  ; xpos0 xpos1
   ; Interleaved (64-frac_y, frac_y) weights for pmaddubsw.
   2551    por                 xm5, xm2       ; 64-frac_y, frac_y
   2552    punpckhwd           xm9, xm0       ; base_y 2, 3, 6, 7
   2553    paddw                m6, m3
   2554    vinserti128         m12, m5, xm5, 1
   2555 .w8_loop:
   2556    lea                 r3d, [r2+dxq]
   2557    shr                 r2d, 6         ; base_x0
   2558    movu                xm0, [rsp+r2]
   2559    lea                 r2d, [r3+dxq]
   2560    shr                 r3d, 6         ; base_x1
   2561    vinserti128          m0, [rsp+r3], 1
   2562    lea                 r3d, [r2+dxq]
   2563    shr                 r2d, 6         ; base_x2
   2564    movu                xm1, [rsp+r2]
   2565    lea                 r2d, [r3+dxq]
   2566    shr                 r3d, 6         ; base_x3
   2567    vinserti128          m1, [rsp+r3], 1
   ; Horizontal 2-tap interpolation of the top edge for rows 0/1
   ; (m0) and rows 2/3 (m1); m4 = xpos of rows 2/3.
   2568    pand                 m2, m14, m6
   2569    paddsw               m4, m6, m7
   2570    psubw                m5, m15, m2
   2571    psllw                m2, 8
   2572    pshufb               m0, m10
   2573    por                  m2, m5
   2574    pmaddubsw            m0, m2
   2575    pand                 m2, m14, m4
   2576    psubw                m5, m15, m2
   2577    psllw                m2, 8
   2578    pshufb               m1, m10
   2579    por                  m2, m5
   2580    pmaddubsw            m1, m2
   2581    cmp                 r3d, 64
   2582    jge .w8_toponly
   ; vpgatherdq clobbers its mask register, hence the save/restore
   ; dance around m7 (which also holds 2*dx).
   2583    mova                 m5, m7
   2584    vpgatherdq           m3, [r5+xm9], m7
   2585    mova                 m7, m5
   2586    vpgatherdq           m2, [r5+xm8], m5
   2587    pshufb               m3, m11
   2588    pshufb               m2, m11
   2589    punpckldq            m5, m2, m3    ; a0 b0 c0 d0 a1 b1 c1 d1   e0 f0 g0 h0 e1 f1 g1 h1
   2590    punpckhdq            m2, m3        ; a2 b2 c2 d2 a3 b3 c3 d3   e2 f2 g2 h2 e3 f3 g3 h3
   2591    vpermq               m5, m5, q3120 ; y0 y1
   2592    vpermq               m2, m2, q3120 ; y2 y3
   2593    pmaddubsw            m5, m12
   2594    pmaddubsw            m2, m12
   2595    psraw                m6, 15        ; base_x < topleft
   2596    vpblendvb            m0, m5, m6
   2597    psraw                m3, m4, 15
   2598    vpblendvb            m1, m2, m3
   2599 .w8_toponly:
   2600    pmulhrsw             m0, m13
   2601    pmulhrsw             m1, m13
   2602    paddw                m6, m4, m7     ; xpos += dx
   2603    add                  r5, dyq
   2604    packuswb             m0, m1
   2605    vextracti128        xm1, m0, 1
   2606    movq   [dstq+strideq*0], xm0
   2607    movhps [dstq+strideq*2], xm0
   2608    movq   [dstq+strideq*1], xm1
   2609    movhps [dstq+r9       ], xm1
   2610    sub                  hd, 4
   2611    jz .w8_end
   2612    lea                dstq, [dstq+strideq*4]
   2613    cmp                 r2d, r8d
   2614    jge .w8_loop
   ; Remaining rows read only from the left edge.
   2615 .w8_leftonly_loop:
   2616    mova                 m0, m7
   2617    vpgatherdq           m5, [r5+xm9], m7
   2618    mova                 m7, m0
   2619    vpgatherdq           m3, [r5+xm8], m0
   2620    add                  r5, dyq
   2621    pshufb               m2, m5, m11
   2622    pshufb               m1, m3, m11
   2623    punpckldq            m0, m1, m2
   2624    punpckhdq            m1, m2
   2625    vpermq               m0, m0, q3120
   2626    vpermq               m1, m1, q3120
   2627    pmaddubsw            m0, m12
   2628    pmaddubsw            m1, m12
   2629    pmulhrsw             m0, m13
   2630    pmulhrsw             m1, m13
   2631    packuswb             m0, m1
   2632    vextracti128        xm1, m0, 1
   2633    movq   [dstq+strideq*0], xm0
   2634    movhps [dstq+strideq*2], xm0
   2635    movq   [dstq+strideq*1], xm1
   2636    movhps [dstq+r9       ], xm1
   2637    lea                dstq, [dstq+strideq*4]
   2638    sub                  hd, 4
   2639    jg .w8_leftonly_loop
   2640 .w8_end:
   2641    RET
   ; z2 w==16 entry: filter the 16 top-edge pixels, then fall into
   ; the shared left-edge filter (.w16_filter_left is also the jmp
   ; target of the w32 path).  h selects between 16/32/64 left-edge
   ; filtering variants below.
   2642 .w16:
   2643    mov                 r8d, hd
   2644    test             angled, 0x400
   2645    jnz .w16_main
   2646    lea                 r3d, [hq+15]
   2647    sub              angled, 90
   2648    call .filter_strength
   2649    test                r3d, r3d
   2650    jz .w16_no_filter_above
   2651    popcnt              r3d, r3d
   2652    vbroadcasti128       m6, [tlq+1]
   2653    mova                xm2, [base+z_filter_s]
   2654    vinserti128          m2, [base+z_filter_s+14], 1 ; 00 01 12 23 34 45 56 67   67 78 89 9a ab bc cd de
   2655    movu                xm3, [base+z_filter_s+8]
   2656    vinserti128          m3, [base+z_filter_s+22], 1 ; 34 45 56 67 78 89 9a ab   ab bc cd de ef ff ff ff
   2657    vpblendd             m1, m6, 0xf0
   2658    vpbroadcastd         m0, [base+z_filter_k-4+r3*4+12*0]
   2659    vpbroadcastd         m4, [base+z_filter_k-4+r3*4+12*1]
   2660    vpbroadcastd         m5, [base+z_filter_k-4+r3*4+12*2]
   2661    pshufb               m2, m1, m2
   2662    pshufb               m1, m3
   2663    pmaddubsw            m0, m2, m0
   2664    shufps               m2, m1, q2121                ; 12 23 34 45 56 67 78 89   89 9a ab bc cd de ef ff
   2665    pmaddubsw            m2, m4
   2666    pmaddubsw            m1, m5
   2667    vpbroadcastd        xm4, r6m ; max_width
   2668    packssdw            xm4, xm4
   2669    paddw                m0, m2
   2670    paddw                m0, m1
   2671    pmulhrsw             m0, m13
   2672    packsswb            xm4, xm4
   2673    vextracti128        xm2, m0, 1
   ; max_width clamp: filtered pixels past the edge revert to the
   ; raw top row (xm6 = [tlq+1]).
   2674    psubb               xm4, [base+pb_1to32]
   2675    packuswb            xm0, xm2
   2676    vpblendvb           xm0, xm6, xm4
   2677    movu           [rsp+65], xm0
   2678 .w16_no_filter_above:
   2679    vpbroadcastd         m0, [base+pb_90]
   2680    psubb                m0, m7
   2681    pand                 m0, m8
   2682    pcmpgtb              m0, m9
   2683    pmovmskb            r3d, m0
   2684    test                r3d, r3d
   2685    jz .w16_main
   2686    popcnt              r3d, r3d
   2687    vpbroadcastd         m7, [base+z_filter_k-4+r3*4+12*0]
   2688    vpbroadcastd         m8, [base+z_filter_k-4+r3*4+12*1]
   2689    vpbroadcastd         m9, [base+z_filter_k-4+r3*4+12*2]
   ; Shared left-edge filter (also reached via jmp from .w32/.w64).
   ; m6 = replicated max_height clamp bytes.
   2690 .w16_filter_left:
   2691    vpbroadcastd         m6, r7m ; max_height
   2692    packssdw             m6, m6
   2693    packsswb             m6, m6
   2694    cmp                  hd, 32
   2695    jl .w16_filter_left_h16
   2696    vpbroadcastd        xm0, [base+pb_5]
   2697    vbroadcasti128      m10, [base+z_filter_s+ 8]
   2698    vbroadcasti128      m11, [base+z_filter_s+12]
   2699    vbroadcasti128      m12, [base+z_filter_s+16]
   2700    je .w16_filter_left_h32
   ; h == 64: filter the lowest 32 left pixels first.
   2701    movu                 m3, [tlq-69]
   2702    movu                 m5, [tlq-61]
   2703    pmaxub               m1, m10, m0
   2704    pshufb               m1, m3, m1
   2705    pmaddubsw            m1, m7
   2706    pshufb               m2, m3, m11
   2707    pmaddubsw            m2, m8
   2708    pshufb               m3, m12
   2709    pmaddubsw            m3, m9
   2710    paddw                m1, m2
   2711    pshufb               m2, m5, m10
   2712    pmaddubsw            m2, m7
   2713    pshufb               m4, m5, m11
   2714    pmaddubsw            m4, m8
   2715    pshufb               m5, m12
   2716    pmaddubsw            m5, m9
   2717    paddw                m1, m3
   2718    vpbroadcastd         m3, [base+pb_32]
   2719    paddb                m3, [base+pb_32to1]
   2720    paddw                m2, m4
   2721    paddw                m2, m5
   2722    pmulhrsw             m1, m13
   2723    pmulhrsw             m2, m13
   2724    psubb                m3, m6, m3
   2725    packuswb             m1, m2
   2726    vpblendvb            m1, [tlq-64], m3
   2727    mova              [rsp], m1
   2728    jmp .w16_filter_left_top32
   2729 .w16_filter_left_h32:
   2730    pmaxub              m10, m0
   ; Filter left pixels 17..32 (top half for h>=32).
   2731 .w16_filter_left_top32:
   2732    movu                xm2, [tlq-37]
   2733    vinserti128          m2, [tlq-29], 1
   2734    pshufb               m3, m2, m10
   2735    pshufb               m1, m2, m11
   2736    pshufb               m2, m12
   2737    pmaddubsw            m3, m7
   2738    pmaddubsw            m1, m8
   2739    pmaddubsw            m2, m9
   2740    paddw                m3, m1
   2741    paddw                m3, m2
   2742    pmulhrsw             m3, m13
   2743    jmp .w16_filter_left_top16
   2744 .w16_filter_left_h16:
   2745    mov                 r5d, 10
   2746    cmp                  hd, 16
   2747    cmovs               r5d, hd
   2748    xor                 r5d, 15 ; h == 16 ? 5 : 15 - h
   2749    movd                xm0, r5d
   2750    vpbroadcastb         m0, xm0
   ; Filter the top 16 left pixels, clamp to max_height, store back.
   2751 .w16_filter_left_top16:
   2752    movu                xm2, [tlq-15]
   2753    vinserti128          m2, [tlq-21], 1
   2754    vbroadcasti128       m1, [base+z_filter_s+12]
   2755    vbroadcasti128       m4, [base+z_filter_s+16]
   2756    vinserti128          m5, m1, [base+z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd   34 45 56 67 78 89 9a ab
   2757    vpblendd             m1, m4, 0x0f                   ; 78 89 9a ab bc cd de ef   56 67 78 89 9a ab bc cd
   2758    vinserti128          m4, [base+z_filter_s+20], 0    ; 9a ab bc cd de ef ff ff   78 89 9a ab bc cd de ef
   2759    pmaxub               m0, m5
   2760    pshufb               m0, m2, m0
   2761    pmaddubsw            m0, m7
   2762    pshufb               m1, m2, m1
   2763    pmaddubsw            m1, m8
   2764    pshufb               m2, m4
   2765    pmaddubsw            m2, m9
   2766    psubb                m6, [base+pb_32to1]
   2767    paddw                m1, m0
   2768    paddw                m1, m2
   2769    pmulhrsw             m1, m13
   2770    packuswb             m3, m1
   2771    vpermq               m3, m3, q1320
   2772    vpblendvb            m3, [tlq-32], m6
   2773    mova           [rsp+32], m3
   ; z2 main loop for w>=16.  The w32/w64 paths also land here:
   ; r8d's high byte counts the remaining 16-column strips (see
   ; lea r8d, [hq+(1<<8)] / [hq+(3<<8)] in .w32/.w64), processed one
   ; strip at a time via .w16_loop0.
   2774 .w16_main:
   2775    movd                xm1, dyd
   2776    vbroadcasti128      m10, [base+z_filter_s+2]
   2777    movd                xm7, dxd
   2778    vbroadcasti128      m11, [base+z2_shuf_h2]
   2779    vpbroadcastw         m1, xm1
   2780    vpbroadcastw         m7, xm7
   2781    mov                  r7, dstq
   2782    pmullw               m0, m1, [base+z2_ymul]
   ; [rsp+156] caches 16*dy (dy<<4) for advancing base_y per strip.
   2783    psllw               xm1, 4
   2784    paddw                m6, m7, [base+z2_base_inc]
   2785    lea                 r9d, [dxq+(65<<6)] ; xpos
   2786    movd          [rsp+156], xm1
   ; Per-strip setup: save ypos/xpos state, derive base_y offsets
   ; and the interleaved (64-frac_y, frac_y) weights in m12.
   2787 .w16_loop0:
   2788    mov                 r2d, r9d
   2789    mova          [rsp+160], m0
   2790    lea                  r5, [rsp+60] ; left-3
   2791    mova          [rsp+192], m6
   2792    pxor                 m1, m1
   2793    psraw                m2, m0, 6
   2794    pand                 m0, m14
   2795    psubw                m9, m1, m2   ; base_y
   2796    psubw               m12, m15, m0
   2797    punpcklwd            m8, m9, m1   ; base_y  0,  1,  2,  3,     8,  9, 10, 11
   2798    psllw                m0, 8
   2799    punpckhwd            m9, m1       ; base_y  4,  5,  6,  7,    12, 13, 14, 15
   2800    por                 m12, m0       ; 64-frac_y, frac_y
   2801 .w16_loop:
   ; Two rows per iteration; 16 top-edge byte pairs each.
   2802    lea                 r3d, [r2+dxq]
   2803    shr                 r2d, 6        ; base_x0
   2804    movu                xm0, [rsp+r2]
   2805    vinserti128          m0, [rsp+r2+8], 1
   2806    lea                 r2d, [r3+dxq]
   2807    shr                 r3d, 6        ; base_x1
   2808    movu                xm1, [rsp+r3]
   2809    vinserti128          m1, [rsp+r3+8], 1
   2810    pand                 m2, m14, m6
   2811    paddsw               m5, m6, m7
   2812    psubw                m3, m15, m2
   2813    psllw                m2, 8
   2814    pshufb               m0, m10
   2815    por                  m2, m3
   2816    pmaddubsw            m0, m2
   2817    pand                 m2, m14, m5
   2818    psubw                m3, m15, m2
   2819    psllw                m2, 8
   2820    pshufb               m1, m10
   2821    por                  m2, m3
   2822    pmaddubsw            m1, m2
   2823    cmp                 r3d, 64
   2824    jge .w16_toponly
   ; Gather left-edge dwords; the xpos sign words double as the
   ; gather mask so lanes already resolved from the top edge load
   ; nothing.
   2825    punpckhwd            m2, m5, m5   ; mask out unnecessary loads
   2826    vpgatherdd           m4, [r5+m9], m2
   2827    punpcklwd            m2, m5, m5
   2828    vpgatherdd           m3, [r5+m8], m2
   2829    pshufb               m4, m11      ; e0 f0 g0 h0 e1 f1 g1 h1   m0 n0 o0 p0 m1 n1 o1 p1
   2830    pshufb               m3, m11      ; a0 b0 c0 d0 a1 b1 c1 d1   i0 j0 k0 l0 i1 j1 k1 l1
   2831    punpcklqdq           m2, m3, m4   ; y0
   2832    punpckhqdq           m3, m4       ; y1
   2833    pmaddubsw            m2, m12
   2834    pmaddubsw            m3, m12
   2835    psraw                m6, 15       ; base_x < topleft
   2836    vpblendvb            m0, m2, m6
   2837    psraw                m6, m5, 15
   2838    vpblendvb            m1, m3, m6
   2839 .w16_toponly:
   2840    pmulhrsw             m0, m13
   2841    pmulhrsw             m1, m13
   2842    paddw                m6, m5, m7   ; xpos += dx
   2843    sub                  r5, 2
   2844    packuswb             m0, m1
   2845    vpermq               m0, m0, q3120
   2846    mova         [dstq+strideq*0], xm0
   2847    vextracti128 [dstq+strideq*1], m0, 1
   2848    sub                  hd, 2
   2849    jz .w16_end
   2850    lea                dstq, [dstq+strideq*2]
   2851    cmp                 r2d, (63-16)<<6
   2852    jge .w16_loop
   2853 .w16_leftonly_loop:
   ; vpgatherdd clobbers its mask; m7 (2*dx) is saved/restored.
   2854    mova                 m0, m7
   2855    vpgatherdd           m4, [r5+m9], m7
   2856    mova                 m7, m0
   2857    vpgatherdd           m3, [r5+m8], m0
   2858    sub                  r5, 2
   2859    pshufb               m2, m4, m11
   2860    pshufb               m1, m3, m11
   2861    punpcklqdq           m0, m1, m2
   2862    punpckhqdq           m1, m2
   2863    pmaddubsw            m0, m12
   2864    pmaddubsw            m1, m12
   2865    pmulhrsw             m0, m13
   2866    pmulhrsw             m1, m13
   2867    packuswb             m0, m1
   2868    vpermq               m0, m0, q3120
   2869    mova         [dstq+strideq*0], xm0
   2870    vextracti128 [dstq+strideq*1], m0, 1
   2871    lea                dstq, [dstq+strideq*2]
   2872    sub                  hd, 2
   2873    jg .w16_leftonly_loop
   2874 .w16_end:
   ; Next 16-column strip, if any (r8d high byte is the strip count).
   2875    sub                 r8d, 1<<8
   2876    jl .w16_ret
   2877    vpbroadcastd         m0, [rsp+156]
   2878    paddw                m0, [rsp+160] ; base_y += 16*dy
   ; base_x += 16*64: m13 (presumably pw_512) is added twice,
   ; 2*512 == 16<<6 -- confirm m13's value where it is loaded.
   2879    paddw                m6, m13, [rsp+192]
   2880    add                  r7, 16
   2881    add                 r9d, 16<<6
   2882    movzx                hd, r8b
   2883    mov                dstq, r7
   2884    paddw                m6, m13 ; base_x += 16*64
   2885    jmp .w16_loop0
   2886 .w16_ret:
   2887    RET
   ; z2 w==32: stash the second 16 top pixels on the stack, mark one
   ; extra 16-column strip in r8d's high byte, filter all 32 top
   ; pixels (fixed middle filter strength, z_filter_k+4*2), then
   ; share the left-edge filter and main loop with w16.
   2888 .w32:
   2889    mova                 m2, [tlq+32]
   2890    lea                 r8d, [hq+(1<<8)]
   2891    mova           [rsp+96], m2
   2892    test             angled, 0x400
   2893    jnz .w16_main
   2894    vpbroadcastd         m7, [base+z_filter_k+4*2+12*0]
   2895    vpbroadcastd         m8, [base+z_filter_k+4*2+12*1]
   2896    vpbroadcastd         m9, [base+z_filter_k+4*2+12*2]
   2897    mova                xm5, [base+z_filter_s]
   2898    vinserti128          m5, [base+z_filter_s+10], 1 ; 00 01 12 23 34 45 56 67   45 56 67 78 89 9a ab bc
   2899    vinserti128          m1, [tlq+11], 1
   2900    movu                xm6, [base+z_filter_s+12]
   2901    vinserti128          m6, [base+z_filter_s+22], 1 ; 56 67 78 89 9a ab bc cd   ab bc cd de ef ff ff ff
   2902    movu                xm3, [tlq+ 6]
   2903    vinserti128          m3, [tlq+17], 1
   2904    vpbroadcastd        m10, r6m ; max_width
   2905    packssdw            m10, m10
   2906    packsswb            m10, m10
   ; Also the jmp target of .w64 after it has filtered pixels 33-64.
   2907 .w32_filter_above:
   2908    pshufb               m0, m1, m5
   2909    shufps               m4, m5, m6, q1021           ; 12 23 34 45 56 67 78 89   67 78 89 9a ab bc cd de
   2910    pmaddubsw            m0, m7
   2911    pshufb               m2, m1, m4
   2912    shufps               m5, m6, q2132               ; 34 45 56 67 78 89 9a ab   89 9a ab bc cd de ef ff
   2913    pmaddubsw            m2, m8
   2914    pshufb               m1, m5
   2915    pmaddubsw            m1, m9
   2916    paddw                m0, m2
   2917    paddw                m0, m1
   2918    pshufb               m1, m3, m4
   2919    pmaddubsw            m1, m7
   2920    pshufb               m2, m3, m5
   2921    pmaddubsw            m2, m8
   2922    pshufb               m3, m6
   2923    pmaddubsw            m3, m9
   2924    paddw                m1, m2
   2925    paddw                m1, m3
   2926    pmulhrsw             m0, m13
   2927    pmulhrsw             m1, m13
   ; max_width clamp, then store the filtered top edge where the
   ; main loop reads it ([rsp+65]).
   2928    psubb               m10, [base+pb_1to32]
   2929    packuswb             m0, m1
   2930    vpblendvb            m0, [tlq+1], m10
   2931    movu           [rsp+65], m0
   2932    jmp .w16_filter_left
   ; z2 w==64: stash top pixels 33-64 (+ the 65th dword) on the
   ; stack, mark three extra strips in r8d, filter pixels 33-64 here
   ; with the fixed middle-strength kernel, then jump into the w32
   ; path to filter pixels 1-32 and continue.
   2933 .w64:
   2934    mova                 m2, [tlq+32]
   2935    mov                 r3d, [tlq+64]
   2936    lea                 r8d, [hq+(3<<8)]
   2937    mova          [rsp+ 96], m2
   2938    mov           [rsp+128], r3d
   2939    test             angled, 0x400
   2940    jnz .w16_main
   2941    vpbroadcastd         m7, [base+z_filter_k+4*2+12*0]
   2942    vpbroadcastd         m8, [base+z_filter_k+4*2+12*1]
   2943    vpbroadcastd         m9, [base+z_filter_k+4*2+12*2]
   2944    movu                xm6, [base+z_filter_s+ 4]
   2945    vinserti128          m6, [base+z_filter_s+10], 1 ; 12 23 34 45 56 67 78 89   45 56 67 78 89 9a ab bc
   2946    movu                xm3, [tlq+30]
   2947    vinserti128          m3, [tlq+43], 1
   2948    movu                xm5, [base+z_filter_s+16]
   2949    vinserti128          m5, [base+z_filter_s+22], 1 ; 78 89 9a ab bc cd de ef   ab bc cd de ef ff ff ff
   2950    pshufb               m0, m3, m6
   2951    shufps               m4, m6, m5, q1021           ; 34 45 56 67 78 89 9a ab   67 78 89 9a ab bc cd de
   2952    pmaddubsw            m0, m7
   2953    pshufb               m2, m3, m4
   2954    shufps               m6, m5, q2132               ; 56 67 78 89 9a ab bc cd   89 9a ab bc cd de ef ff
   2955    pmaddubsw            m2, m8
   2956    pshufb               m3, m6
   2957    pmaddubsw            m3, m9
   2958    paddw                m0, m2
   2959    paddw                m0, m3
   2960    movu                xm2, [tlq+36]
   2961    vinserti128          m2, [tlq+49], 1
   2962    vpbroadcastd        m10, r6m ; max_width
   2963    pshufb               m4, m2, m4
   2964    pmaddubsw            m4, m7
   2965    pshufb               m3, m2, m6
   2966    pmaddubsw            m3, m8
   2967    pshufb               m2, m5
   2968    pmaddubsw            m2, m9
   2969    packssdw            m10, m10
   2970    paddw                m3, m4
   2971    paddw                m2, m3
   2972    vpbroadcastd         m3, [base+pb_32]
   2973    pmulhrsw             m0, m13
   2974    pmulhrsw             m2, m13
   2975    packsswb            m10, m10
   2976    mova                xm5, [base+z_filter_s]
   2977    vinserti128          m5, [base+z_filter_s+6], 1
   ; Clamp mask offset by 32 since these are top pixels 33-64.
   2978    psubb                m3, m10, m3
   2979    psubb                m3, [base+pb_1to32]
   2980    vinserti128          m1, [tlq+13], 1
   2981    packuswb             m0, m2
   2982    vpblendvb            m0, [tlq+33], m3
   2983    movu                xm3, [tlq+ 6]
   2984    vinserti128          m3, [tlq+19], 1
   2985    movu           [rsp+97], m0
   2986    jmp .w32_filter_above
   2987 
   2988 cglobal ipred_z3_8bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase
; z3: directional intra prediction sourced from the left edge (angle in
; (180,270)). Dispatches on log2(h) through ipred_z3_avx2_table; dyd is the
; per-column position step looked up in the dr_intra_derivative table.
; m3/m4/m5 stay live throughout as pw_512/pw_62/pw_64 (rounding constant,
; fractional-position mask, and the "64" used to form 64-frac weights).
   2989    lea                  r6, [ipred_z3_avx2_table]
   2990    tzcnt                hd, hm                     ; log2(h) selects the jump-table entry
   2991    movifnidn        angled, anglem
   2992    lea                  r7, [dr_intra_derivative+45*2-1]
   2993    dec                 tlq                         ; left edge is addressed with negative offsets from tlq
   2994    movsxd               hq, [r6+hq*4]
   2995    sub              angled, 180
   2996    add                  hq, r6
   2997    mov                 dyd, angled
   2998    neg                 dyd
   2999    xor              angled, 0x400                  ; invert the enable_intra_edge_filter bit
   3000    or                  dyq, ~0x7e                  ; form (negative) byte offset into the derivative table
   3001    movzx               dyd, word [r7+dyq]          ; dy = derivative for this angle
   3002    vpbroadcastd         m3, [pw_512]               ; pmulhrsw rounding constant
   3003    vpbroadcastd         m4, [pw_62]                ; mask for the 6-bit fractional position
   3004    vpbroadcastd         m5, [pw_64]
   3005    mov              org_wd, wd                     ; original w, needed later by the transpose stores
   3006    jmp                  hq
   3007 .h4:
; h == 4. For sharp angles (d < 40) on small non-smooth blocks the edge is
; first 2x upsampled with a (-4,36,36,-4) filter, then sampled per column.
   3008    lea                  r7, [strideq*3]
   3009    cmp              angleb, 40
   3010    jae .h4_no_upsample
   3011    lea                 r4d, [angleq-1024]
   3012    sar                 r4d, 7
   3013    add                 r4d, wd
   3014    jg .h4_no_upsample ; !enable_intra_edge_filter || w > 8 || (w == 8 && is_sm)
   3015    ALLOC_STACK         -32, 9
   3016    movu                xm8, [tlq-7]
   3017    pshufb              xm0, xm8, [z_upsample1-4]
   3018    vpbroadcastb        xm2, xm8                    ; replicate the furthest edge pixel
   3019    pshufb              xm1, xm8, [z_filter_s+2]
   3020    mova           [rsp+16], xm2 ; top[max_base_y]
   3021    vpbroadcastd        xm2, [pb_36_m4]             ; upsample filter taps (36,-4)
   3022    add                 dyd, dyd                    ; positions double after 2x upsampling
   3023    pmaddubsw           xm0, xm2
   3024    pmaddubsw           xm1, xm2
   3025    movd                xm7, dyd
   3026    mov                 r2d, dyd                    ; r2 = running ypos for column 0
   3027    vpbroadcastw         m7, xm7
   3028    paddw               xm1, xm0
   3029    pmulhrsw            xm1, xm3                    ; round interpolated samples
   3030    pslldq               m6, m7, 8
   3031    paddw               xm2, xm7, xm7
   3032    paddw                m6, m7
   3033    packuswb            xm1, xm1
   3034    paddw                m6, m2                     ; m6 = per-column ypos offsets
   3035    punpcklbw           xm1, xm8                    ; interleave upsampled with original pixels
   3036    mova                xm8, [z_transpose4]
   3037    psllw                m7, 2                      ; m7 = dy*4, step per loop iteration
   3038    pshufb              xm1, [pb_15to0]             ; reverse so increasing offset walks down the edge
   3039    mova              [rsp], xm1
   3040 .h4_upsample_loop:
; gather 4 columns: base = ypos>>6 indexes the stacked edge, frac = ypos&62
   3041    lea                 r4d, [r2+dyq]
   3042    shr                 r2d, 6
   3043    vpbroadcastq         m1, [rsp+r2]
   3044    lea                 r2d, [r4+dyq]
   3045    shr                 r4d, 6
   3046    vpbroadcastq         m2, [rsp+r4]
   3047    lea                 r4d, [r2+dyq]
   3048    shr                 r2d, 6
   3049    movq                xm0, [rsp+r2]
   3050    lea                 r2d, [r4+dyq]
   3051    shr                 r4d, 6
   3052    movhps              xm0, [rsp+r4]
   3053    vpblendd             m1, m2, 0xc0
   3054    pand                 m2, m4, m6                 ; frac
   3055    vpblendd             m0, m1, 0xf0               ; all 4 columns gathered in m0
   3056    psubw                m1, m5, m2                 ; 64-frac
   3057    psllw                m2, 8
   3058    por                  m1, m2                     ; interleaved (64-frac, frac) weights
   3059    pmaddubsw            m0, m1
   3060    paddw                m6, m7                     ; ypos += dy*4
   3061    pmulhrsw             m0, m3
   3062    vextracti128        xm1, m0, 1
   3063    packuswb            xm1, xm0
   3064    pshufb              xm1, xm8                    ; 4x4 transpose: columns -> rows
   3065    movd   [dstq+strideq*0], xm1
   3066    pextrd [dstq+strideq*1], xm1, 1
   3067    pextrd [dstq+strideq*2], xm1, 2
   3068    pextrd [dstq+r7       ], xm1, 3
   3069    add                dstq, 4
   3070    sub                  wd, 4
   3071    jg .h4_upsample_loop
   3072    RET
   3073 ALIGN function_align
   3074 .filter_strength: ; h4/h8/h16
; Shared helper: returns a byte mask in r5d whose popcount is the edge
; filter strength for this block size / angle combination (0 = no filter).
; In: maxbased = edge length metric, angled = angle | (is_sm << 9).
; Clobbers m0-m2, r4; angled is shifted in place.
   3075 %define base r4-z_filter_t0
   3076    lea                  r4, [z_filter_t0]
   3077    movd                xm0, maxbased
   3078    movd                xm2, angled
   3079    shr              angled, 8 ; is_sm << 1
   3080    vpbroadcastb         m0, xm0
   3081    vpbroadcastb         m2, xm2
   3082    pcmpeqb              m1, m0, [base+z_filter_wh]  ; select the row matching this size
   3083    pand                 m1, m2
   3084    mova                xm2, [r4+angleq*8]           ; per-(is_sm,angle) thresholds
   3085    pcmpgtb              m1, m2
   3086    pmovmskb            r5d, m1
   3087    ret
   3088 .h4_no_upsample:
; No upsampling: optionally smooth the left edge with a 3/5-tap filter whose
; strength comes from .filter_strength, writing the result to the stack and
; repointing tlq at it.
   3089    ALLOC_STACK         -16, 12
   3090    mov            maxbased, 7
   3091    test             angled, 0x400 ; !enable_intra_edge_filter
   3092    jnz .h4_main
   3093    lea            maxbased, [wq+3]
   3094    call .filter_strength
   3095    mov            maxbased, 7
   3096    test                r5d, r5d
   3097    jz .h4_main ; filter_strength == 0
   3098    popcnt              r5d, r5d                    ; r5d = filter strength (1..3)
   3099    vpbroadcastd         m7, [base+pb_7]
   3100    vbroadcasti128       m2, [tlq-14]                ; edge pixels to be filtered
   3101    pmaxub               m1, m7, [base+z_filter_s-4] ; clamp shuffle indices at the edge ends
   3102    vpbroadcastd         m8, [base+z_filter_k-4+r5*4+12*0]
   3103    pmaxub               m7, [base+z_filter_s+4]
   3104    vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*1]
   3105    vpbroadcastd        m10, [base+z_filter_k-4+r5*4+12*2]
   3106    pshufb               m0, m2, m1
   3107    shufps               m1, m7, q2121
   3108    pmaddubsw            m0, m8
   3109    pshufb               m1, m2, m1
   3110    pmaddubsw            m1, m9
   3111    pshufb               m2, m7
   3112    pmaddubsw            m2, m10
   3113    paddw                m0, m1
   3114    paddw                m0, m2                     ; sum of the weighted taps
   3115    pmulhrsw             m0, m3                     ; round back to pixel range
   3116    mov                 r4d, 9
   3117    lea                 tlq, [rsp+15]               ; tlq now points at the filtered copy
   3118    cmp                  wd, 4
   3119    cmovne         maxbased, r4d                   ; wider blocks read 2 more edge pixels
   3120    vextracti128        xm1, m0, 1
   3121    packuswb            xm0, xm1
   3122    mova              [rsp], xm0
   3123 .h4_main:
; Main h4 loop. Register roles: m6 = ypos per column (order 2,3,0,1),
; m10 = dy*4 step, m9 = max_base_y comparison vector, m7 = clamp pixel
; (edge pixel at max_base, replicated), m8 = gather shuffle, xm11 = 4x4
; transpose shuffle. dyq is negated so bases walk down the edge.
   3124    movd                xm6, dyd
   3125    vpbroadcastq         m0, [z_base_inc] ; base_inc << 6
   3126    mov                  r4, tlq
   3127    sub                 tlq, 4
   3128    neg                 dyq
   3129    vpbroadcastw         m6, xm6
   3130    sub                  r4, maxbaseq
   3131    shl            maxbased, 6
   3132    vpbroadcastb         m7, [r4]
   3133    lea                  r4, [dyq+63] ; ypos
   3134    movd                xm9, maxbased
   3135    not            maxbased
   3136    vbroadcasti128       m8, [z3_shuf_w4]
   3137    add            maxbased, 64                     ; loop-exit threshold for the scalar base
   3138    vpbroadcastw         m9, xm9
   3139    psrlw                m7, 8  ; top[max_base_y]
   3140    paddw               m10, m6, m6
   3141    psubw                m9, m0 ; max_base_y
   3142    vpblendd             m6, m10, 0xcc
   3143    mova                xm0, xm10
   3144    paddw                m6, m0 ; ypos2 ypos3 ypos0 ypos1
   3145    paddw               m10, m10                   ; m10 = dy*4
   3146    mova               xm11, [z_transpose4]
   3147 .h4_loop:
   3148    lea                  r5, [r4+dyq]
   3149    sar                  r4, 6 ; base0
   3150    vpbroadcastq         m1, [tlq+r4]
   3151    lea                  r4, [r5+dyq]
   3152    sar                  r5, 6 ; base1
   3153    vpbroadcastq         m2, [tlq+r5]
   3154    lea                  r5, [r4+dyq]
   3155    sar                  r4, 6 ; base2
   3156    movq                xm0, [tlq+r4]
   3157    lea                  r4, [r5+dyq]
   3158    sar                  r5, 6 ; base3
   3159    movhps              xm0, [tlq+r5]
   3160    vpblendd             m1, m2, 0xc0
   3161    pand                 m2, m4, m6 ; frac
   3162    vpblendd             m0, m1, 0xf0
   3163    psubw                m1, m5, m2 ; 64-frac
   3164    psllw                m2, 8
   3165    pshufb               m0, m8
   3166    por                  m1, m2     ; 64-frac, frac
   3167    pmaddubsw            m0, m1
   3168    pcmpgtw              m1, m9, m6 ; base < max_base_y
   3169    pmulhrsw             m0, m3
   3170    paddw                m6, m10    ; ypos += dy
   3171    vpblendvb            m0, m7, m0, m1 ; clamp out-of-range lanes to the last edge pixel
   3172    vextracti128        xm1, m0, 1
   3173    packuswb            xm1, xm0
   3174    pshufb              xm1, xm11   ; transpose
   3175    movd   [dstq+strideq*0], xm1
   3176    pextrd [dstq+strideq*1], xm1, 1
   3177    pextrd [dstq+strideq*2], xm1, 2
   3178    pextrd [dstq+r7       ], xm1, 3
   3179    sub                  wd, 4
   3180    jz .h4_end
   3181    add                dstq, 4
   3182    cmp                 r4d, maxbased
   3183    jg .h4_loop
; all remaining columns are past max_base: fill with the clamp pixel
   3184    packuswb            xm7, xm7
   3185 .h4_end_loop:
   3186    movd   [dstq+strideq*0], xm7
   3187    movd   [dstq+strideq*1], xm7
   3188    movd   [dstq+strideq*2], xm7
   3189    movd   [dstq+r7       ], xm7
   3190    add                dstq, 4
   3191    sub                  wd, 4
   3192    jg .h4_end_loop
   3193 .h4_end:
   3194    RET
   3195 ALIGN function_align
   3196 .h8:
; h == 8. As with .h4, sharp angles on small blocks get a 2x upsampled edge
; first; the combined condition below folds angle, is_sm and w into one test.
   3197    lea                 r4d, [angleq+216]
   3198    mov                 r4b, wb
   3199    cmp                 r4d, 8
   3200    ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8
   3201    ALLOC_STACK         -32, 8
   3202    and                 r4d, 4
   3203    mova                xm0, [tlq-15]
   3204    vinserti128          m0, [tlq- 9], 1
   3205    movd                xm1, r4d
   3206    movu                xm2, [z_filter_s+2]
   3207    vinserti128          m2, [z_filter_s+6], 1
   3208    vpbroadcastb        xm1, xm1 ; w & 4
   3209    vpbroadcastd         m7, [pb_36_m4]              ; upsample filter taps (36,-4)
   3210    pmaxub              xm1, [z_upsample1-4] ; clip 4x8
   3211    vinserti128          m1, [z_upsample1], 1
   3212    add                 dyd, dyd                     ; positions double after 2x upsampling
   3213    pshufb               m1, m0, m1
   3214    pshufb               m2, m0, m2
   3215    vinserti128          m0, [tlq-7], 1
   3216    movd                xm6, dyd
   3217    pmaddubsw            m1, m7
   3218    pmaddubsw            m2, m7
   3219    vpbroadcastw         m6, xm6
   3220    mov                 r2d, dyd
   3221    lea                  r5, [strideq*3]
   3222    paddw                m7, m6, m6
   3223    paddw                m1, m2
   3224    vpblendd             m6, m7, 0xf0                ; m6 = per-column ypos offsets
   3225    pmulhrsw             m1, m3
   3226    pslldq               m2, m7, 8
   3227    paddw                m7, m7                      ; m7 = dy*4, step per iteration
   3228    paddw                m6, m2
   3229    vbroadcasti128       m2, [pb_15to0]
   3230    packuswb             m1, m1
   3231    punpcklbw            m1, m0                      ; interleave upsampled with original pixels
   3232    pshufb               m1, m2                      ; reverse so increasing offset walks down the edge
   3233    vextracti128   [rsp+ 0], m1, 1
   3234    mova           [rsp+16], xm1
   3235 .h8_upsample_loop:
; gather 4 columns of 8 pixels each; base = ypos>>6, frac = ypos&62
   3236    lea                 r4d, [r2+dyq]
   3237    shr                 r2d, 6 ; base0
   3238    movu                xm0, [rsp+r2]
   3239    lea                 r2d, [r4+dyq]
   3240    shr                 r4d, 6 ; base1
   3241    vinserti128          m0, [rsp+r4], 1
   3242    lea                 r4d, [r2+dyq]
   3243    shr                 r2d, 6 ; base2
   3244    pand                 m1, m4, m6
   3245    psubw                m2, m5, m1                  ; 64-frac
   3246    psllw                m1, 8
   3247    por                  m2, m1                      ; interleaved (64-frac, frac) weights
   3248    punpcklqdq           m1, m2, m2 ; frac0 frac1
   3249    pmaddubsw            m0, m1
   3250    movu                xm1, [rsp+r2]
   3251    lea                 r2d, [r4+dyq]
   3252    shr                 r4d, 6 ; base3
   3253    vinserti128          m1, [rsp+r4], 1
   3254    punpckhqdq           m2, m2 ; frac2 frac3
   3255    pmaddubsw            m1, m2
   3256    pmulhrsw             m0, m3
   3257    paddw                m6, m7                      ; ypos += dy*4
   3258    pmulhrsw             m1, m3
   3259    lea                  r4, [dstq+strideq*4]
   3260    psllw                m1, 8
   3261    por                  m0, m1                      ; interleave the two column pairs
   3262    vextracti128        xm1, m0, 1
   3263    punpcklbw           xm2, xm0, xm1               ; finish 4x8 transpose
   3264    punpckhbw           xm0, xm1
   3265    movd   [dstq+strideq*0], xm2
   3266    pextrd [dstq+strideq*1], xm2, 1
   3267    pextrd [dstq+strideq*2], xm2, 2
   3268    pextrd [dstq+r5       ], xm2, 3
   3269    movd   [r4  +strideq*0], xm0
   3270    pextrd [r4  +strideq*1], xm0, 1
   3271    pextrd [r4  +strideq*2], xm0, 2
   3272    pextrd [r4  +r5       ], xm0, 3
   3273    add                dstq, 4
   3274    sub                  wd, 4
   3275    jg .h8_upsample_loop
   3276    RET
   3277 .h8_no_intra_edge_filter:
   3278    and            maxbased, 7
   3279    or             maxbased, 8 ; imin(w+7, 15)
   3280    jmp .h8_main
   3281 .h8_no_upsample:
; No upsampling: optionally filter the edge (3 or 5 taps depending on
; strength) into a stack copy, then fall through to .h8_main.
   3282    ALLOC_STACK         -32, 10
   3283    lea            maxbased, [wq+7]
   3284    test             angled, 0x400
   3285    jnz .h8_no_intra_edge_filter
   3286    call .filter_strength
   3287    test                r5d, r5d
   3288    jz .h8_main ; filter_strength == 0
   3289    popcnt              r5d, r5d                    ; r5d = filter strength (1..3)
   3290    vpbroadcastd        xm6, [base+pb_15]
   3291    pcmpeqb             xm1, xm1
   3292    psubusb             xm6, xm0
   3293    psubb               xm6, xm1 ; w == 4 ? 5 : 1
   3294    movu                xm2, [tlq-16]
   3295    pmaxub              xm1, xm6, [base+z_filter_s]  ; clamp shuffle indices at the edge ends
   3296    vinserti128          m2, [tlq-14], 1
   3297    vinserti128          m1, [base+z_filter_s+12], 1
   3298    vpbroadcastd         m7, [base+z_filter_k-4+r5*4+12*0]
   3299    pmaxub              xm6, [base+z_filter_s+ 8]
   3300    vinserti128          m6, [base+z_filter_s+20], 1
   3301    pshufb               m0, m2, m1
   3302    pmaddubsw            m0, m7
   3303    vpbroadcastd         m7, [base+z_filter_k-4+r5*4+12*1]
   3304    movzx               r4d, byte [tlq-15]          ; furthest edge pixel, kept unfiltered
   3305    shufps               m1, m6, q2121
   3306    pshufb               m1, m2, m1
   3307    pmaddubsw            m1, m7
   3308    paddw                m0, m1
   3309    sub                 r5d, 3
   3310    jnz .h8_3tap
; strength 3: add the outer 5-tap contribution and recompute the end pixel
   3311    vpbroadcastd         m7, [z_filter_k+4*8]
   3312    movzx               r2d, byte [tlq-14]
   3313    pshufb               m2, m6
   3314    pmaddubsw            m2, m7
   3315    sub                 r2d, r4d
   3316    lea                 r2d, [r2+r4*8+4]            ; (p0*8 + (p1-p0) + 4) >> 3
   3317    shr                 r2d, 3
   3318    mov            [rsp+15], r2b
   3319    paddw                m0, m2
   3320 .h8_3tap:
   3321    pmulhrsw             m0, m3                     ; round back to pixel range
   3322    sar                 r5d, 1
   3323    lea                 tlq, [rsp+31]               ; tlq now points at the filtered copy
   3324    add                 r5d, 17
   3325    cmp                  wd, 16
   3326    cmovns         maxbased, r5d
   3327    neg                  r5
   3328    mov            [tlq+r5], r4b                   ; store the preserved end pixel
   3329    vextracti128        xm1, m0, 1
   3330    packuswb            xm0, xm1
   3331    mova           [tlq-15], xm0
   3332 .h8_main:
; Main h8 loop. Produces 2 columns of 8 pixels per iteration, interleaves
; them (partial transpose) and pushes 16 bytes onto a downward-growing stack
; buffer; .h8_transpose then finishes the 8x8 transposes and stores rows.
; m2 = ypos pair, m6 = dy*2 step, m9 = max_base vector, m7 = clamp pixel.
   3333    movd                xm2, dyd
   3334    vbroadcasti128       m0, [z_base_inc]
   3335    mov                  r4, tlq
   3336    sub                 tlq, 8
   3337    neg                 dyq
   3338    vpbroadcastw         m2, xm2
   3339    sub                  r4, maxbaseq
   3340    shl            maxbased, 6
   3341    vpbroadcastb         m7, [r4]
   3342    lea                  r4, [dyq+63]
   3343    movd                xm9, maxbased
   3344    not            maxbased
   3345    vbroadcasti128       m8, [z3_shuf]
   3346    add            maxbased, 64                    ; loop-exit threshold for the scalar base
   3347    vpbroadcastw         m9, xm9
   3348    psrlw                m7, 8
   3349    psubw                m9, m0
   3350    paddw                m6, m2, m2
   3351    vpblendd             m2, m6, 0x0f               ; m2 = ypos0 in low lane, ypos1 in high lane
   3352 .h8_loop:
   3353    lea                  r5, [r4+dyq]
   3354    sar                  r4, 6                      ; base0
   3355    pand                 m0, m4, m2                 ; frac
   3356    psubw                m1, m5, m0                 ; 64-frac
   3357    psllw                m0, 8
   3358    por                  m1, m0                     ; interleaved (64-frac, frac) weights
   3359    vbroadcasti128       m0, [tlq+r4]
   3360    lea                  r4, [r5+dyq]
   3361    sar                  r5, 6                      ; base1
   3362    vinserti128          m0, [tlq+r5], 0
   3363    sub                 rsp, 8*2                   ; push one 16-byte column pair
   3364    pshufb               m0, m8
   3365    pmaddubsw            m0, m1
   3366    pcmpgtw              m1, m9, m2                 ; base < max_base_y
   3367    paddw                m2, m6                     ; ypos += dy*2
   3368    pmulhrsw             m0, m3
   3369    vpblendvb            m0, m7, m0, m1             ; clamp out-of-range lanes
   3370    vextracti128        xm1, m0, 1
   3371    psllw               xm0, 8
   3372    por                 xm0, xm1 ; interleave rows (partial transpose)
   3373    mova              [rsp], xm0
   3374    sub                  wd, 2
   3375    jz .h8_transpose
   3376    cmp                 r4d, maxbased
   3377    jg .h8_loop
; remaining columns are past max_base: push copies of the clamp pixel
   3378    packuswb            xm0, xm7, xm7
   3379 .h8_end_loop:
   3380    sub                 rsp, 8*2
   3381    mova              [rsp], xm0
   3382    sub                  wd, 2
   3383    jg .h8_end_loop
   3384 .h8_transpose:
; finish the transpose of the stacked column pairs and store rows;
; dst is stepped backwards since columns were generated right-to-left
   3385    mova                xm2, [rsp+16*1]
   3386    sub              org_wd, 8
   3387    lea                  r2, [strideq*3]
   3388    lea                  r6, [dstq+org_wq]
   3389    cmovns             dstq, r6
   3390    punpcklwd           xm1, xm2, xm0
   3391    punpckhwd           xm2, xm0
   3392    lea                  r6, [dstq+strideq*4]
   3393    jge .h8_w8
; w == 4: only 4 output columns, store dwords directly
   3394    add                 rsp, 16*2
   3395    movd   [dstq+strideq*0], xm1
   3396    pextrd [dstq+strideq*1], xm1, 1
   3397    pextrd [dstq+strideq*2], xm1, 2
   3398    pextrd [dstq+r2       ], xm1, 3
   3399    movd   [r6  +strideq*0], xm2
   3400    pextrd [r6  +strideq*1], xm2, 1
   3401    pextrd [r6  +strideq*2], xm2, 2
   3402    pextrd [r6  +r2       ], xm2, 3
   3403    jmp .h8_end
   3404 .h8_w8_loop:
   3405    mova                xm0, [rsp+16*0]
   3406    mova                xm2, [rsp+16*1]
   3407    punpcklwd           xm1, xm2, xm0
   3408    punpckhwd           xm2, xm0
   3409 .h8_w8: ; w8/w16/w32
   3410    mova                xm0, [rsp+16*2]
   3411    mova                xm4, [rsp+16*3]
   3412    add                 rsp, 16*4                   ; pop 4 column pairs (8 columns)
   3413    punpcklwd           xm3, xm4, xm0
   3414    punpckhwd           xm4, xm0
   3415    punpckldq           xm0, xm3, xm1
   3416    punpckhdq           xm3, xm1
   3417    punpckldq           xm1, xm4, xm2
   3418    punpckhdq           xm4, xm2
   3419    movq   [dstq+strideq*0], xm0
   3420    movhps [dstq+strideq*1], xm0
   3421    movq   [dstq+strideq*2], xm3
   3422    movhps [dstq+r2       ], xm3
   3423    movq   [r6  +strideq*0], xm1
   3424    movhps [r6  +strideq*1], xm1
   3425    movq   [r6  +strideq*2], xm4
   3426    movhps [r6  +r2       ], xm4
   3427    sub                dstq, 8
   3428    sub                  r6, 8
   3429    sub              org_wd, 8
   3430    jge .h8_w8_loop
   3431 .h8_end:
   3432    RET
   3433 .h16_no_intra_edge_filter:
   3434    and            maxbased, 15
   3435    or             maxbased, 16 ; imin(w+15, 31)
   3436    jmp .h16_main
   3437 ALIGN function_align
   3438 .h16:
; h == 16. Optionally filter up to 32 edge pixels (two 16-pixel halves
; processed in parallel) into a stack copy before the main loop.
   3439    ALLOC_STACK         -64, 12
   3440    lea            maxbased, [wq+15]
   3441    test             angled, 0x400
   3442    jnz .h16_no_intra_edge_filter
   3443    call .filter_strength
   3444    test                r5d, r5d
   3445    jz .h16_main ; filter_strength == 0
   3446    popcnt              r5d, r5d                    ; r5d = filter strength (1..3)
   3447    vpbroadcastd        m11, [base+pb_27]
   3448    vpbroadcastd         m1, [base+pb_1]
   3449    vbroadcasti128       m6, [base+z_filter_s+12]
   3450    vinserti128          m2, m6, [base+z_filter_s+4], 0
   3451    vinserti128          m6, [base+z_filter_s+20], 1
   3452    movu               xm10, [tlq-18]               ; near half of the edge
   3453    vinserti128         m10, [tlq-14], 1
   3454    vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*0]
   3455    vbroadcasti128       m7, [base+z_filter_s+8]
   3456    vinserti128          m8, m7, [base+z_filter_s+0], 0
   3457    vinserti128          m7, [base+z_filter_s+16], 1
   3458    psubusb             m11, m0
   3459    por                  m1, m11                    ; index clamp for the far half
   3460    movu               xm11, [tlq-32]               ; far half of the edge
   3461    vinserti128         m11, [tlq-28], 1
   3462    pmaxub               m8, m1
   3463    pmaxub               m7, m1
   3464    pshufb               m0, m10, m2
   3465    shufps               m2, m6, q2121
   3466    pmaddubsw            m0, m9
   3467    pshufb               m1, m11, m8
   3468    shufps               m8, m7, q2121
   3469    pmaddubsw            m1, m9
   3470    vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*1]
   3471    movzx               r4d, byte [tlq-31]          ; furthest edge pixel, kept unfiltered
   3472    pshufb               m2, m10, m2
   3473    pmaddubsw            m2, m9
   3474    pshufb               m8, m11, m8
   3475    pmaddubsw            m8, m9
   3476    paddw                m0, m2
   3477    paddw                m1, m8
   3478    sub                 r5d, 3
   3479    jnz .h16_3tap
; strength 3: add the outer 5-tap contribution and recompute the end pixel
   3480    vpbroadcastd         m9, [z_filter_k+4*8]
   3481    movzx               r2d, byte [tlq-30]
   3482    pshufb              m10, m6
   3483    pmaddubsw           m10, m9
   3484    pshufb              m11, m7
   3485    pmaddubsw           m11, m9
   3486    sub                 r2d, r4d
   3487    lea                 r2d, [r2+r4*8+4]            ; (p0*8 + (p1-p0) + 4) >> 3
   3488    shr                 r2d, 3
   3489    mov            [rsp+31], r2b
   3490    paddw                m0, m10
   3491    paddw                m1, m11
   3492 .h16_3tap:
   3493    pmulhrsw             m0, m3                     ; round back to pixel range
   3494    pmulhrsw             m1, m3
   3495    sar                 r5d, 1
   3496    lea                 tlq, [rsp+63]               ; tlq now points at the filtered copy
   3497    add                 r5d, 33
   3498    cmp                  wd, 32
   3499    cmovns         maxbased, r5d
   3500    neg                  r5
   3501    mov            [tlq+r5], r4b                   ; store the preserved end pixel
   3502    packuswb             m0, m1
   3503    vpermq               m0, m0, q2031
   3504    mova           [tlq-31], m0
   3505 .h16_main:
; Main h16 loop. Produces 2 columns of 16 pixels per iteration and pushes
; 32 bytes onto a downward-growing stack buffer; .h16_transpose finishes the
; byte transposes and stores rows. m6 = ypos pair, m11 = dy*2 step, m9/m10 =
; max_base vectors for the upper/lower 8 pixels, m7 = clamp pixel.
   3506    movd                xm6, dyd
   3507    vbroadcasti128       m0, [z_base_inc]
   3508    mov                  r4, tlq
   3509    sub                 tlq, 8
   3510    neg                 dyq
   3511    vpbroadcastw         m6, xm6
   3512    sub                  r4, maxbaseq
   3513    shl            maxbased, 6
   3514    vpbroadcastb         m7, [r4]
   3515    lea                  r4, [dyq+63]
   3516    movd                xm9, maxbased
   3517    not            maxbased
   3518    vbroadcasti128       m8, [z3_shuf]
   3519    add            maxbased, 64                    ; loop-exit threshold for the scalar base
   3520    vpbroadcastw         m9, xm9
   3521    psubw                m9, m0
   3522    paddw               m11, m6, m6
   3523    psubw               m10, m9, m3 ; 64*8
   3524    vpblendd             m6, m11, 0xf0              ; m6 = ypos0 in low lane, ypos1 in high lane
   3525 .h16_loop:
   3526    lea                  r5, [r4+dyq]
   3527    sar                  r4, 6                      ; base0
   3528    pand                 m1, m4, m6                 ; frac
   3529    psubw                m2, m5, m1                 ; 64-frac
   3530    psllw                m1, 8
   3531    por                  m2, m1                     ; interleaved (64-frac, frac) weights
   3532    movu                xm0, [tlq+r4-0]
   3533    movu                xm1, [tlq+r4-8]
   3534    lea                  r4, [r5+dyq]
   3535    sar                  r5, 6                      ; base1
   3536    vinserti128          m0, [tlq+r5-0], 1
   3537    vinserti128          m1, [tlq+r5-8], 1
   3538    sub                 rsp, 32                    ; push one 32-byte column pair
   3539    pshufb               m0, m8
   3540    pshufb               m1, m8
   3541    pmaddubsw            m0, m2
   3542    pmaddubsw            m1, m2
   3543    pmulhrsw             m0, m3
   3544    pmulhrsw             m1, m3
   3545    packuswb             m0, m1
   3546    pcmpgtw              m1, m9, m6                 ; base < max_base_y (upper 8 pixels)
   3547    pcmpgtw              m2, m10, m6                ; base < max_base_y (lower 8 pixels)
   3548    packsswb             m1, m2
   3549    paddw                m6, m11                    ; ypos += dy*2
   3550    vpblendvb            m0, m7, m0, m1             ; clamp out-of-range lanes
   3551    vpermq               m0, m0, q3120
   3552    mova              [rsp], m0
   3553    sub                  wd, 2
   3554    jz .h16_transpose
   3555    cmp                 r4d, maxbased
   3556    jg .h16_loop
; remaining columns are past max_base: push copies of the clamp pixel
; (m0 = m7 so .h16_transpose sees the last stored block in m0)
   3557    mova                 m0, m7
   3558 .h16_end_loop:
   3559    sub                 rsp, 32
   3560    mova              [rsp], m7
   3561    sub                  wd, 2
   3562    jg .h16_end_loop
   3563 .h16_transpose:
; finish the transpose of the stacked column pairs and store rows;
; dst is stepped backwards since columns were generated right-to-left
   3564    mova                 m2, [rsp+32*1]
   3565    sub              org_wd, 8
   3566    lea                  r2, [strideq*3]
   3567    lea                  r6, [dstq+org_wq]
   3568    cmovns             dstq, r6
   3569    punpcklbw            m1, m2, m0
   3570    punpckhbw            m2, m0
   3571    lea                  r3, [strideq*5]
   3572    punpcklbw            m0, m1, m2
   3573    punpckhbw            m1, m2
   3574    lea                  r4, [strideq+r2*2] ; stride*7
   3575    jge .h16_w8
; w == 4: only 4 output columns, store dwords directly
   3576    add                 rsp, 32*2
   3577    movd   [dstq+strideq*0], xm0
   3578    pextrd [dstq+strideq*1], xm0, 1
   3579    pextrd [dstq+strideq*2], xm0, 2
   3580    pextrd [dstq+r2       ], xm0, 3
   3581    vextracti128        xm0, m0, 1
   3582    movd   [dstq+strideq*4], xm1
   3583    pextrd [dstq+r3       ], xm1, 1
   3584    pextrd [dstq+r2*2     ], xm1, 2
   3585    pextrd [dstq+r4       ], xm1, 3
   3586    lea                dstq, [dstq+strideq*8]
   3587    vextracti128        xm1, m1, 1
   3588    movd   [dstq+strideq*0], xm0
   3589    pextrd [dstq+strideq*1], xm0, 1
   3590    pextrd [dstq+strideq*2], xm0, 2
   3591    pextrd [dstq+r2       ], xm0, 3
   3592    movd   [dstq+strideq*4], xm1
   3593    pextrd [dstq+r3       ], xm1, 1
   3594    pextrd [dstq+r2*2     ], xm1, 2
   3595    pextrd [dstq+r4       ], xm1, 3
   3596    jmp .h16_end
   3597 .h16_w8_loop:
   3598    mova                 m0, [rsp+32*0]
   3599    mova                 m2, [rsp+32*1]
   3600    punpcklbw            m1, m2, m0
   3601    punpckhbw            m2, m0
   3602    punpcklbw            m0, m1, m2
   3603    punpckhbw            m1, m2
   3604 .h16_w8:
   3605    mova                 m2, [rsp+32*2]
   3606    mova                 m4, [rsp+32*3]
   3607    lea                  r6, [dstq+strideq*8]
   3608    add                 rsp, 32*4                   ; pop 4 column pairs (8 columns)
   3609    punpcklbw            m3, m4, m2
   3610    punpckhbw            m4, m2
   3611    punpcklbw            m2, m3, m4
   3612    punpckhbw            m3, m4
   3613    punpckldq            m4, m2, m0
   3614    punpckhdq            m2, m0
   3615    punpckldq            m0, m3, m1
   3616    punpckhdq            m3, m1
   3617    movq   [dstq+strideq*0], xm4
   3618    movhps [dstq+strideq*1], xm4
   3619    vextracti128        xm4, m4, 1
   3620    movq   [dstq+strideq*2], xm2
   3621    movhps [dstq+r2       ], xm2
   3622    vextracti128        xm2, m2, 1
   3623    movq   [dstq+strideq*4], xm0
   3624    movhps [dstq+r3       ], xm0
   3625    vextracti128        xm0, m0, 1
   3626    movq   [dstq+r2*2     ], xm3
   3627    movhps [dstq+r4       ], xm3
   3628    vextracti128        xm3, m3, 1
   3629    movq     [r6+strideq*0], xm4
   3630    movhps   [r6+strideq*1], xm4
   3631    movq     [r6+strideq*2], xm2
   3632    movhps   [r6+r2       ], xm2
   3633    movq     [r6+strideq*4], xm0
   3634    movhps   [r6+r3       ], xm0
   3635    movq     [r6+r2*2     ], xm3
   3636    movhps   [r6+r4       ], xm3
   3637    sub                dstq, 8
   3638    sub              org_wd, 8
   3639    jge .h16_w8_loop
   3640 .h16_end:
   3641    RET
   3642 ALIGN function_align
   3643 .h32:
   3644    ALLOC_STACK         -96, 15
   3645    lea            maxbased, [wq+31]
   3646    and            maxbased, 31
   3647    or             maxbased, 32 ; imin(w+31, 63)
   3648    test             angled, 0x400 ; !enable_intra_edge_filter
   3649    jnz .h32_main
   3650    vbroadcasti128       m0, [pb_0to15]
   3651    mov                 r4d, 21
   3652    mov                 r5d, 3
   3653    movu               xm11, [tlq-66]    ; 56-63
   3654    vinserti128         m11, [tlq-52], 1 ; 40-47
   3655    sub                 r4d, wd ; 21-w
   3656    cmovns              r5d, r4d
   3657    movu               xm12, [tlq-58]    ; 48-55
   3658    vinserti128         m12, [tlq-44], 1 ; 32-39
   3659    sub                 r4d, 8 ; 13-w
   3660    movd                xm1, r5d
   3661    movu               xm13, [tlq-34]    ; 24-31
   3662    vinserti128         m13, [tlq-20], 1 ;  8-15
   3663    movd                xm2, r4d
   3664    vpbroadcastb         m1, xm1
   3665    movu               xm14, [tlq-28]    ; 16-23
   3666    vinserti128         m14, [tlq-14], 1 ;  0- 7
   3667    vpbroadcastb         m2, xm2
   3668    pmaxsb               m1, m0 ; clip 16x32 and (32|64)x32
   3669    movu                 m7, [z_filter_s+4]
   3670    pshufb              m11, m1
   3671    vinserti128          m8, m7, [z_filter_s+8], 1
   3672    vinserti128          m7, [z_filter_s+16], 0
   3673    pmaxsb               m2, m0 ; clip 8x32
   3674    vpbroadcastd         m9, [z_filter_k+4*2+12*0]
   3675    pshufb              m12, m2
   3676    pshufb               m0, m11, m8
   3677    pmaddubsw            m0, m9
   3678    pshufb               m2, m12, m8
   3679    pmaddubsw            m2, m9
   3680    pshufb               m1, m13, m8
   3681    pmaddubsw            m1, m9
   3682    shufps               m8, m7, q1021
   3683    pshufb               m6, m14, m8
   3684    pmaddubsw            m6, m9
   3685    vpbroadcastd         m9, [z_filter_k+4*2+12*1]
   3686    pshufb              m10, m11, m8
   3687    pmaddubsw           m10, m9
   3688    paddw                m0, m10
   3689    pshufb              m10, m12, m8
   3690    pmaddubsw           m10, m9
   3691    paddw                m2, m10
   3692    pshufb              m10, m13, m8
   3693    pmaddubsw           m10, m9
   3694    shufps               m8, m7, q2121
   3695    paddw                m1, m10
   3696    pshufb              m10, m14, m8
   3697    pmaddubsw           m10, m9
   3698    paddw                m6, m10
   3699    vpbroadcastd         m9, [z_filter_k+4*2+12*2]
   3700    pshufb              m11, m8
   3701    pmaddubsw           m11, m9
   3702    pshufb              m12, m8
   3703    pmaddubsw           m12, m9
   3704    movzx               r4d, byte [tlq-63]
   3705    movzx               r2d, byte [tlq-62]
   3706    paddw                m0, m11
   3707    paddw                m2, m12
   3708    pshufb              m13, m8
   3709    pmaddubsw           m13, m9
   3710    pshufb              m14, m7
   3711    pmaddubsw           m14, m9
   3712    paddw                m1, m13
   3713    paddw                m6, m14
   3714    sub                 r2d, r4d
   3715    lea                 r2d, [r2+r4*8+4] ; edge case for 64x32
   3716    pmulhrsw             m0, m3
   3717    pmulhrsw             m2, m3
   3718    pmulhrsw             m1, m3
   3719    pmulhrsw             m6, m3
   3720    shr                 r2d, 3
   3721    mov            [rsp+31], r2b
   3722    lea                 tlq, [rsp+95]
   3723    mov            [tlq-65], r4b
   3724    mov                 r4d, 65
   3725    cmp                  wd, 64
   3726    cmove          maxbased, r4d
   3727    packuswb             m0, m2
   3728    packuswb             m1, m6
   3729    mova           [tlq-63], m0
   3730    mova           [tlq-31], m1
   3731 .h32_main:
   3732    movd                xm6, dyd
   3733    mov                  r4, tlq
   3734    sub                 tlq, 8
   3735    neg                 dyq
   3736    vpbroadcastw         m6, xm6
   3737    sub                  r4, maxbaseq
   3738    shl            maxbased, 6
   3739    vpbroadcastb         m7, [r4]
   3740    lea                  r4, [dyq+63]
   3741    movd                xm9, maxbased
   3742    not            maxbased
   3743    vbroadcasti128       m8, [z3_shuf]
   3744    add            maxbased, 64
   3745    vpbroadcastw         m9, xm9
   3746    psubw                m9, [z_base_inc]
   3747    mova                m11, m6
   3748    psubw               m10, m9, m3 ; 64*8
   3749 .h32_loop:
   3750    mov                  r5, r4
   3751    sar                  r5, 6
   3752    pand                 m1, m4, m6
   3753    psubw                m2, m5, m1
   3754    psllw                m1, 8
   3755    por                  m2, m1
   3756    movu                xm0, [tlq+r5- 0]
   3757    vinserti128          m0, [tlq+r5-16], 1
   3758    movu                xm1, [tlq+r5- 8]
   3759    vinserti128          m1, [tlq+r5-24], 1
   3760    sub                 rsp, 32
   3761    add                  r4, dyq
   3762    pshufb               m0, m8
   3763    pshufb               m1, m8
   3764    pmaddubsw            m0, m2
   3765    pmaddubsw            m1, m2
   3766    pmulhrsw             m0, m3
   3767    pmulhrsw             m1, m3
   3768    packuswb             m0, m1
   3769    pcmpgtw              m1, m9, m6
   3770    pcmpgtw              m2, m10, m6
   3771    packsswb             m1, m2
   3772    paddw                m6, m11
   3773    vpblendvb            m0, m7, m0, m1
   3774    mova              [rsp], m0
   3775    dec                  wd
   3776    jz .h32_transpose
   3777    cmp                 r4d, maxbased
   3778    jg .h32_loop
   3779 .h32_end_loop:
   3780    sub                 rsp, 32
   3781    mova              [rsp], m7
   3782    dec                  wd
   3783    jg .h32_end_loop
   3784 .h32_transpose:
   3785    lea                dstq, [dstq+org_wq-8]
   3786    lea                  r2, [strideq*3]
   3787    lea                  r3, [strideq*5]
   3788    lea                  r4, [strideq+r2*2] ; stride*7
   3789 .h32_w8_loop:
   3790    mova                 m7, [rsp+32*0]
   3791    mova                 m6, [rsp+32*1]
   3792    mova                 m5, [rsp+32*2]
   3793    mova                 m4, [rsp+32*3]
   3794    mova                 m3, [rsp+32*4]
   3795    mova                 m2, [rsp+32*5]
   3796    mova                 m1, [rsp+32*6]
   3797    mova                 m0, [rsp+32*7]
   3798    lea                  r6, [dstq+strideq*8]
   3799    add                 rsp, 32*8
   3800    punpcklbw            m8, m0, m1
   3801    punpckhbw            m0, m1
   3802    punpcklbw            m1, m2, m3
   3803    punpckhbw            m2, m3
   3804    punpcklbw            m3, m4, m5
   3805    punpckhbw            m4, m5
   3806    punpcklbw            m5, m6, m7
   3807    punpckhbw            m6, m7
   3808    punpcklwd            m7, m8, m1
   3809    punpckhwd            m8, m1
   3810    punpcklwd            m1, m0, m2
   3811    punpckhwd            m0, m2
   3812    punpcklwd            m2, m3, m5
   3813    punpckhwd            m3, m5
   3814    punpcklwd            m5, m4, m6
   3815    punpckhwd            m4, m6
   3816    punpckldq            m6, m7, m2
   3817    punpckhdq            m7, m2
   3818    punpckldq            m2, m8, m3
   3819    punpckhdq            m8, m3
   3820    punpckldq            m3, m1, m5
   3821    punpckhdq            m1, m5
   3822    punpckldq            m5, m0, m4
   3823    punpckhdq            m0, m4
   3824    movq   [dstq+strideq*0], xm6
   3825    movhps [dstq+strideq*1], xm6
   3826    vextracti128        xm6, m6, 1
   3827    movq   [dstq+strideq*2], xm7
   3828    movhps [dstq+r2       ], xm7
   3829    vextracti128        xm7, m7, 1
   3830    movq   [dstq+strideq*4], xm2
   3831    movhps [dstq+r3       ], xm2
   3832    vextracti128        xm2, m2, 1
   3833    movq   [dstq+r2*2     ], xm8
   3834    movhps [dstq+r4       ], xm8
   3835    vextracti128        xm8, m8, 1
   3836    movq     [r6+strideq*0], xm3
   3837    movhps   [r6+strideq*1], xm3
   3838    vextracti128        xm3, m3, 1
   3839    movq     [r6+strideq*2], xm1
   3840    movhps   [r6+r2       ], xm1
   3841    vextracti128        xm1, m1, 1
   3842    movq     [r6+strideq*4], xm5
   3843    movhps   [r6+r3       ], xm5
   3844    vextracti128        xm5, m5, 1
   3845    movq     [r6+r2*2     ], xm0
   3846    movhps   [r6+r4       ], xm0
   3847    lea                  r6, [r6+strideq*8]
   3848    vextracti128        xm0, m0, 1
   3849    movq     [r6+strideq*0], xm6
   3850    movhps   [r6+strideq*1], xm6
   3851    movq     [r6+strideq*2], xm7
   3852    movhps   [r6+r2       ], xm7
   3853    movq     [r6+strideq*4], xm2
   3854    movhps   [r6+r3       ], xm2
   3855    movq     [r6+r2*2     ], xm8
   3856    movhps   [r6+r4       ], xm8
   3857    lea                  r6, [r6+strideq*8]
   3858    movq     [r6+strideq*0], xm3
   3859    movhps   [r6+strideq*1], xm3
   3860    movq     [r6+strideq*2], xm1
   3861    movhps   [r6+r2       ], xm1
   3862    movq     [r6+strideq*4], xm5
   3863    movhps   [r6+r3       ], xm5
   3864    movq     [r6+r2*2     ], xm0
   3865    movhps   [r6+r4       ], xm0
   3866    sub                dstq, 8
   3867    sub              org_wd, 8
   3868    jg .h32_w8_loop
   3869    RET
   3870 ALIGN function_align
   3871 .h64:
   3872    ALLOC_STACK        -128, 16
   3873    lea            maxbased, [wq+63]
   3874    test             angled, 0x400 ; !enable_intra_edge_filter
   3875    jnz .h64_main
   3876    mov                 r4d, 21
   3877    vpbroadcastb       xm11, [tlq-127]
   3878    vpblendd           xm11, [tlq-130], 0x0e ; 120-127
   3879    sub                 r4d, wd ; 21-w
   3880    mov                 r5d, 3
   3881    vinserti128         m11, [tlq-116], 1    ; 104-111
   3882    movu                 m7, [z_filter_s+4]
   3883    cmp                  wd, 32
   3884    cmove               r4d, r5d
   3885    vinserti128          m8, m7, [z_filter_s+8], 1
   3886    vbroadcasti128       m6, [pb_0to15]
   3887    movd                xm1, r4d
   3888    vpbroadcastd         m9, [z_filter_k+4*2+12*0]
   3889    movu               xm12, [tlq-122]       ; 112-119
   3890    vinserti128         m12, [tlq-108], 1    ;  96-103
   3891    vpbroadcastb         m1, xm1
   3892    movu               xm13, [tlq- 98]       ;  88- 95
   3893    vinserti128         m13, [tlq- 84], 1    ;  72- 79
   3894    movu               xm14, [tlq- 90]       ;  80- 87
   3895    vinserti128         m14, [tlq- 76], 1    ;  64- 71
   3896    vinserti128          m7, [z_filter_s+16], 0
   3897    pshufb               m0, m11, m8
   3898    pmaddubsw            m0, m9
   3899    pshufb               m2, m12, m8
   3900    pmaddubsw            m2, m9
   3901    pmaxsb               m1, m6 ; clip (16|32)x64
   3902    pshufb              m13, m1
   3903    pshufb               m1, m13, m8
   3904    pmaddubsw            m1, m9
   3905    pshufb               m6, m14, m8
   3906    pmaddubsw            m6, m9
   3907    vpbroadcastd         m9, [z_filter_k+4*2+12*1]
   3908    shufps              m15, m8, m7, q1021
   3909    pshufb              m10, m11, m15
   3910    pmaddubsw           m10, m9
   3911    paddw                m0, m10
   3912    pshufb              m10, m12, m15
   3913    pmaddubsw           m10, m9
   3914    paddw                m2, m10
   3915    pshufb              m10, m13, m15
   3916    pmaddubsw           m10, m9
   3917    paddw                m1, m10
   3918    pshufb              m10, m14, m15
   3919    pmaddubsw           m10, m9
   3920    paddw                m6, m10
   3921    vpbroadcastd         m9, [z_filter_k+4*2+12*2]
   3922    shufps              m10, m8, m7, q2132
   3923    pshufb              m11, m10
   3924    pmaddubsw           m11, m9
   3925    pshufb              m12, m10
   3926    pmaddubsw           m12, m9
   3927    pshufb              m13, m10
   3928    pmaddubsw           m13, m9
   3929    pshufb              m14, m10
   3930    pmaddubsw           m14, m9
   3931    paddw                m0, m11
   3932    paddw                m2, m12
   3933    paddw                m1, m13
   3934    paddw                m6, m14
   3935    movu               xm11, [tlq-66]    ; 56-63
   3936    vinserti128         m11, [tlq-52], 1 ; 40-47
   3937    movu               xm12, [tlq-58]    ; 48-55
   3938    vinserti128         m12, [tlq-44], 1 ; 32-39
   3939    movu               xm13, [tlq-34]    ; 24-31
   3940    vinserti128         m13, [tlq-20], 1 ;  8-15
   3941    movu               xm14, [tlq-28]    ; 16-23
   3942    vinserti128         m14, [tlq-14], 1 ;  0- 7
   3943    pmulhrsw             m0, m3
   3944    pmulhrsw             m2, m3
   3945    pmulhrsw             m1, m3
   3946    pmulhrsw             m6, m3
   3947    lea                 tlq, [rsp+127]
   3948    packuswb             m0, m2
   3949    packuswb             m1, m6
   3950    mova          [tlq-127], m0
   3951    mova          [tlq- 95], m1
   3952    pshufb               m0, m11, m10
   3953    pmaddubsw            m0, m9
   3954    pshufb               m2, m12, m10
   3955    pmaddubsw            m2, m9
   3956    pshufb               m1, m13, m10
   3957    pmaddubsw            m1, m9
   3958    pshufb               m6, m14, m7
   3959    pmaddubsw            m6, m9
   3960    vpbroadcastd         m9, [z_filter_k+4*2+12*1]
   3961    pshufb               m7, m11, m15
   3962    pmaddubsw            m7, m9
   3963    paddw                m0, m7
   3964    pshufb               m7, m12, m15
   3965    pmaddubsw            m7, m9
   3966    paddw                m2, m7
   3967    pshufb               m7, m13, m15
   3968    pmaddubsw            m7, m9
   3969    paddw                m1, m7
   3970    pshufb               m7, m14, m10
   3971    pmaddubsw            m7, m9
   3972    paddw                m6, m7
   3973    vpbroadcastd         m9, [z_filter_k+4*2+12*0]
   3974    pshufb              m11, m8
   3975    pmaddubsw           m11, m9
   3976    pshufb              m12, m8
   3977    pmaddubsw           m12, m9
   3978    pshufb              m13, m8
   3979    pmaddubsw           m13, m9
   3980    pshufb              m14, m15
   3981    pmaddubsw           m14, m9
   3982    paddw                m0, m11
   3983    paddw                m2, m12
   3984    paddw                m1, m13
   3985    paddw                m6, m14
   3986    pmulhrsw             m0, m3
   3987    pmulhrsw             m2, m3
   3988    pmulhrsw             m1, m3
   3989    pmulhrsw             m6, m3
   3990    packuswb             m0, m2
   3991    packuswb             m1, m6
   3992    mova           [tlq-63], m0
   3993    mova           [tlq-31], m1
   3994 .h64_main:
   3995    movd               xm12, dyd
   3996    neg            maxbaseq
   3997    vbroadcasti128       m8, [z3_shuf]
   3998    vpbroadcastb         m7, [tlq+maxbaseq]
   3999    shl            maxbased, 6
   4000    vpbroadcastw        m12, xm12
   4001    lea                 r5d, [dyq+maxbaseq-64]
   4002    neg                 dyq
   4003    or             maxbased, 63
   4004    lea                  r4, [dyq+63]
   4005    movd                xm6, r5d
   4006    mova               xm10, [pb_1to32+16]
   4007    vinserti128         m10, [pb_1to32], 1
   4008    vpbroadcastd        m11, [pb_32]
   4009    vpbroadcastw         m6, xm6
   4010 .h64_loop:
   4011    mov                  r5, r4
   4012    sar                  r5, 6
   4013    movu                 m0, [tlq+r5-24]
   4014    movu                 m1, [tlq+r5-32]
   4015    pand                 m2, m4, m6
   4016    psubw                m9, m5, m2
   4017    psllw                m2, 8
   4018    por                  m9, m2
   4019    pshufb               m0, m8
   4020    pshufb               m1, m8
   4021    pmaddubsw            m0, m9
   4022    pmaddubsw            m1, m9
   4023    psraw                m2, m6, 6
   4024    sub                 rsp, 64
   4025    pmulhrsw             m0, m3
   4026    pmulhrsw             m1, m3
   4027    packsswb             m2, m2
   4028    paddb                m2, m10
   4029    packuswb             m0, m1
   4030    vpblendvb            m0, m7, m0, m2
   4031    mova           [rsp+32], m0
   4032    movu                 m0, [tlq+r5-56]
   4033    movu                 m1, [tlq+r5-64]
   4034    add                  r4, dyq
   4035    pshufb               m0, m8
   4036    pshufb               m1, m8
   4037    pmaddubsw            m0, m9
   4038    pmaddubsw            m1, m9
   4039    paddb                m2, m11
   4040    pmulhrsw             m0, m3
   4041    pmulhrsw             m1, m3
   4042    paddw                m6, m12
   4043    packuswb             m0, m1
   4044    vpblendvb            m0, m7, m0, m2
   4045    mova              [rsp], m0
   4046    dec                  wd
   4047    jz .h64_transpose
   4048    cmp                 r4d, maxbased
   4049    jg .h64_loop
   4050 .h64_end_loop:
   4051    sub                 rsp, 64
   4052    mova           [rsp+32], m7
   4053    mova           [rsp+ 0], m7
   4054    dec                  wd
   4055    jg .h64_end_loop
   4056 .h64_transpose:
   4057    lea                  r2, [strideq*3]
   4058    lea                  r3, [strideq*5]
   4059    imul                 r5, strideq, -8
   4060    lea                dstq, [dstq+org_wq-16]
   4061    lea                  r4, [strideq+r2*2] ; stride*7
   4062 .h64_transpose_loop0:
   4063    lea                  r6, [rsp+16*3]
   4064 .h64_transpose_loop:
   4065    mova                xm0, [r6+64*15]
   4066    vinserti128          m0, [r6+64* 7], 1
   4067    mova                xm1, [r6+64*14]
   4068    vinserti128          m1, [r6+64* 6], 1
   4069    mova                xm2, [r6+64*13]
   4070    vinserti128          m2, [r6+64* 5], 1
   4071    mova                xm3, [r6+64*12]
   4072    vinserti128          m3, [r6+64* 4], 1
   4073    mova                xm4, [r6+64*11]
   4074    vinserti128          m4, [r6+64* 3], 1
   4075    mova                xm5, [r6+64*10]
   4076    vinserti128          m5, [r6+64* 2], 1
   4077    mova                xm6, [r6+64* 9]
   4078    vinserti128          m6, [r6+64* 1], 1
   4079    mova                xm7, [r6+64* 8]
   4080    vinserti128          m7, [r6+64* 0], 1
   4081    sub                  r6, 16
   4082    punpcklbw            m8, m0, m1
   4083    punpckhbw            m0, m1
   4084    punpcklbw            m1, m2, m3
   4085    punpckhbw            m2, m3
   4086    punpcklbw            m3, m4, m5
   4087    punpckhbw            m4, m5
   4088    punpcklbw            m5, m6, m7
   4089    punpckhbw            m6, m7
   4090    punpcklwd            m7, m8, m1
   4091    punpckhwd            m8, m1
   4092    punpcklwd            m1, m0, m2
   4093    punpckhwd            m0, m2
   4094    punpcklwd            m2, m3, m5
   4095    punpckhwd            m3, m5
   4096    punpcklwd            m5, m4, m6
   4097    punpckhwd            m4, m6
   4098    punpckldq            m6, m7, m2
   4099    punpckhdq            m7, m2
   4100    punpckldq            m2, m8, m3
   4101    punpckhdq            m8, m3
   4102    punpckldq            m3, m1, m5
   4103    punpckhdq            m1, m5
   4104    punpckldq            m5, m0, m4
   4105    punpckhdq            m0, m4
   4106    vpermq               m6, m6, q3120
   4107    vpermq               m7, m7, q3120
   4108    vpermq               m2, m2, q3120
   4109    vpermq               m8, m8, q3120
   4110    vpermq               m3, m3, q3120
   4111    vpermq               m1, m1, q3120
   4112    vpermq               m5, m5, q3120
   4113    vpermq               m0, m0, q3120
   4114    mova         [dstq+strideq*0], xm6
   4115    vextracti128 [dstq+strideq*1], m6, 1
   4116    mova         [dstq+strideq*2], xm7
   4117    vextracti128 [dstq+r2       ], m7, 1
   4118    mova         [dstq+strideq*4], xm2
   4119    vextracti128 [dstq+r3       ], m2, 1
   4120    mova         [dstq+r2*2     ], xm8
   4121    vextracti128 [dstq+r4       ], m8, 1
   4122    sub               dstq, r5
   4123    mova         [dstq+strideq*0], xm3
   4124    vextracti128 [dstq+strideq*1], m3, 1
   4125    mova         [dstq+strideq*2], xm1
   4126    vextracti128 [dstq+r2       ], m1, 1
   4127    mova         [dstq+strideq*4], xm5
   4128    vextracti128 [dstq+r3       ], m5, 1
   4129    mova         [dstq+r2*2     ], xm0
   4130    vextracti128 [dstq+r4       ], m0, 1
   4131    sub                dstq, r5
   4132    cmp                  r6, rsp
   4133    jae .h64_transpose_loop
   4134    add                 rsp, 64*16
   4135    lea                dstq, [dstq+r5*8-16]
   4136    sub              org_wd, 16
   4137    jg .h64_transpose_loop0
   4138 .h64_end:
   4139    RET
   4140 
%macro FILTER_XMM 4 ; dst, src, tmp, shuf
; Apply the filter-intra tap kernel to one 4x2 block held in xm%2.
; The seven neighbor pixels (p0..p6) are gathered by the %4 shuffle, then
; each pair of taps is applied with pmaddubsw and accumulated into xm%1.
; Caller-provided register contract:
;   xm1      = pw_8 rounding bias
;   xm2-xm5  = four 16-byte tap groups from filter_intra_taps
; Result: filtered pixels packed to bytes, duplicated in both halves of xm%1.
%ifnum %4
   pshufb             xm%2, xm%4
%else
   pshufb             xm%2, %4          ; %4 may be a memory operand
%endif
   pshufd             xm%1, xm%2, q0000 ; p0 p1
   pmaddubsw          xm%1, xm2
   pshufd             xm%3, xm%2, q1111 ; p2 p3
   pmaddubsw          xm%3, xm3
   paddw              xm%1, xm1         ; + rounding bias (pw_8)
   paddw              xm%1, xm%3
   pshufd             xm%3, xm%2, q2222 ; p4 p5
   pmaddubsw          xm%3, xm4
   paddw              xm%1, xm%3
   pshufd             xm%3, xm%2, q3333 ; p6 __
   pmaddubsw          xm%3, xm5
   paddw              xm%1, xm%3
   psraw              xm%1, 4           ; >> 4 completes round-to-nearest
   packuswb           xm%1, xm%1       ; clamp/pack to u8
%endmacro
   4162 
%macro FILTER_YMM 4 ; dst, src, tmp, shuf
; 256-bit counterpart of FILTER_XMM: filters two 4x2 blocks at once, one per
; 128-bit lane, using the same m1-m5 register contract (rounding bias + taps
; broadcast to both lanes). The trailing vperm2i128 swaps the lanes so that
; packuswb interleaves both lane results across m%1 in the order the
; ipred_filter callers expect.
   pshufb              m%2, m%4
   pshufd              m%1, m%2, q0000   ; p0 p1
   pmaddubsw           m%1, m2
   pshufd              m%3, m%2, q1111   ; p2 p3
   pmaddubsw           m%3, m3
   paddw               m%1, m1           ; + rounding bias (pw_8)
   paddw               m%1, m%3
   pshufd              m%3, m%2, q2222   ; p4 p5
   pmaddubsw           m%3, m4
   paddw               m%1, m%3
   pshufd              m%3, m%2, q3333   ; p6 __
   pmaddubsw           m%3, m5
   paddw               m%1, m%3
   psraw               m%1, 4            ; >> 4 completes round-to-nearest
   vperm2i128          m%3, m%1, m%1, 0x01 ; swap 128-bit lanes
   packuswb            m%1, m%3            ; pack, interleaving both lanes
%endmacro
   4181 
   4182 ; The ipred_filter SIMD processes 4x2 blocks in the following order which
   4183 ; increases parallelism compared to doing things row by row. One redundant
   4184 ; block is calculated for w8 and w16, two for w32.
   4185 ;     w4     w8       w16             w32
   4186 ;     1     1 2     1 2 3 5     1 2 3 5 b c d f
   4187 ;     2     2 3     2 4 5 7     2 4 5 7 c e f h
   4188 ;     3     3 4     4 6 7 9     4 6 7 9 e g h j
   4189 ; ___ 4 ___ 4 5 ___ 6 8 9 a ___ 6 8 9 a g i j k ___
   4190 ;           5       8           8       i
   4191 
; void ipred_filter_8bpc(pixel *dst, ptrdiff_t stride, const pixel *tl,
;                        int w, int h, int filter_idx)
; AV1 FILTER_INTRA prediction. `tl` points at the top-left edge pixels
; (see the [tlq-3] load layout comment below); `filter` selects one of the
; coefficient sets in filter_intra_taps (64 bytes each). Blocks are
; processed 4x2 at a time in the interleaved order shown in the diagram
; above this function, with each 4x2 result fed back as neighbor input
; for the next one.
cglobal ipred_filter_8bpc, 3, 7, 0, dst, stride, tl, w, h, filter
%define base r6-ipred_filter_avx2_table
   lea                  r6, [filter_intra_taps]
   tzcnt                wd, wm               ; wd = log2(width), jump-table index
%ifidn filterd, filterm
   movzx           filterd, filterb
%else
   movzx           filterd, byte filterm
%endif
   shl             filterd, 6                ; 64 bytes of taps per filter mode
   WIN64_SPILL_XMM       9, 15
   add             filterq, r6
   lea                  r6, [ipred_filter_avx2_table]
   movq                xm0, [tlq-3] ; _ 6 5 0 1 2 3 4
   movsxd               wq, [r6+wq*4]
   vpbroadcastd         m1, [base+pw_8]      ; rounding bias for FILTER_*
   ; Broadcast the four tap groups to both lanes so FILTER_YMM can filter
   ; two 4x2 blocks per call.
   vbroadcasti128       m2, [filterq+16*0]
   vbroadcasti128       m3, [filterq+16*1]
   vbroadcasti128       m4, [filterq+16*2]
   vbroadcasti128       m5, [filterq+16*3]
   add                  wq, r6
   mov                  hd, hm
   jmp                  wq
.w4:
   mova                xm8, [base+filter_shuf2]
   sub                 tlq, 3
   sub                 tlq, hq                ; tlq+hq now addresses the left edge
   jmp .w4_loop_start
.w4_loop:
   ; Reuse the previous 4x2 output (xm6) as neighbor input; only the new
   ; left-edge pixel needs to be loaded.
   pinsrd              xm0, xm6, [tlq+hq], 0
   lea                dstq, [dstq+strideq*2]
.w4_loop_start:
   FILTER_XMM            6, 0, 7, 8
   movd   [dstq+strideq*0], xm6
   pextrd [dstq+strideq*1], xm6, 1
   sub                  hd, 2
   jg .w4_loop
   RET
ALIGN function_align
.w8:
   WIN64_PUSH_XMM       10
   mova                 m8, [base+filter_shuf1]
   ; First 4x2 block (block 1 in the diagram) via the xmm path.
   FILTER_XMM            7, 0, 6, [base+filter_shuf2]
   vpbroadcastd         m0, [tlq+4]
   vpbroadcastd         m6, [tlq+5]
   sub                 tlq, 4
   sub                 tlq, hq
   vpbroadcastq         m7, xm7
   vpblendd             m7, m6, 0x20
.w8_loop:
   vpbroadcastd        xm6, [tlq+hq]          ; next left-edge pixel
   palignr              m6, m0, 12
   vpblendd             m0, m6, m7, 0xeb     ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _
                                             ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
   mova                xm6, xm7
   call .main                                ; filter two 4x2 blocks at once
   vpblendd            xm6, xm7, 0x0c
   pshufd              xm6, xm6, q3120
   movq   [dstq+strideq*0], xm6
   movhps [dstq+strideq*1], xm6
   lea                dstq, [dstq+strideq*2]
   sub                  hd, 2
   jg .w8_loop
   RET
ALIGN function_align
.w16:
   sub                  hd, 2
   call .w16_main
%if WIN64
   jmp .end
%else
   RET
%endif
.w16_main:
   ; Shared by .w16 and .w32 (which calls it for the left 16 columns).
   ; The spills are into the callers stack frame
   %assign stack_size stack_size + gprsize
   WIN64_PUSH_XMM       15, 9
   %assign stack_size stack_size - gprsize
   FILTER_XMM           12, 0, 7, [base+filter_shuf2]
   vpbroadcastd         m0, [tlq+5]
   vpblendd             m0, [tlq-12], 0x14
   mova                 m8, [base+filter_shuf1]
   vpbroadcastq         m7, xm12
   vpblendd             m0, m7, 0xc2         ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _
                                             ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
   call .main                                ; c0 d0 a1 b1   a1 b1 c0 d0
   movlps              xm9, xm7, [tlq+5]     ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
   vinserti128         m14, m8, [base+filter_shuf3], 0
   vpblendd           xm12, xm7, 0x0c        ; a0 b0 a1 b1
   FILTER_XMM            6, 9, 10, 14
   vpbroadcastq         m6, xm6              ; a2 b2 __ __ __ __ a2 b2
   vpbroadcastd         m9, [tlq+13]
   vpbroadcastd        m10, [tlq+12]
   psrld               m11, m8, 4            ; vpermd control for column gather
   vpblendd             m6, m9, 0x20         ; top
   sub                 tlq, 6
   sub                 tlq, hq
.w16_loop:
   vpbroadcastd        xm9, [tlq+hq]
   palignr              m9, m0, 12
   vpblendd             m0, m9, m7, 0xe2     ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _
                                             ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
   mova               xm13, xm7
   call .main                                ; e0 f0 c1 d1   c1 d1 e0 f0
   vpblendd             m9, m12, m10, 0xf0
   vpblendd            m12, m6, 0xc0
   pshufd               m9, m9, q3333
   vpblendd             m9, m6, 0xee
   vpblendd            m10, m9, m7, 0x0c     ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
                                             ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
   FILTER_YMM            6, 10, 9, 14        ; c2 d2 a3 b3   a3 b3 c2 d2
   vpblendd            m12, m6, 0x30         ; a0 b0 a1 b1   a3 b3 a2 b2
   vpermd               m9, m11, m12         ; a0 a1 a2 a3   b0 b1 b2 b3
   vpblendd           xm12, xm13, xm7, 0x0c  ; c0 d0 c1 d1
   mova         [dstq+strideq*0], xm9
   vextracti128 [dstq+strideq*1], m9, 1
   lea                dstq, [dstq+strideq*2]
   sub                  hd, 2
   jg .w16_loop
   ; Epilogue: the final two rows (columns c/d) still need one more filter
   ; pass before they can be stored.
   vpblendd            xm7, xm6, xm10, 0x04  ; _ _ _ 5 _ _ _ 6 0 _ _ _ 1 2 3 4
   pshufd              xm7, xm7, q1032       ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
   FILTER_XMM            0, 7, 9, [base+filter_shuf1+16]
   vpblendd            xm6, xm0, 0x0c        ; c2 d2 c3 d3
   shufps              xm0, xm12, xm6, q2020 ; c0 c1 c2 c3
   shufps              xm6, xm12, xm6, q3131 ; d0 d1 d2 d3
   mova   [dstq+strideq*0], xm0
   mova   [dstq+strideq*1], xm6
   ret
ALIGN function_align
.w32:
   ; Left half via .w16_main, then the right 16 columns here, using the
   ; just-written output of the left half (read back from dst) as the
   ; left-neighbor input.
   sub                  hd, 2
   lea                  r3, [dstq+16]
   lea                 r5d, [hq-2]
   call .w16_main
   add                 tlq, r5
   mov                dstq, r3
   lea                  r3, [strideq-4]
   lea                  r4, [r3+strideq*2]
   movq                xm0, [tlq+21]
   pinsrd              xm0, [dstq-4], 2       ; neighbor pixels from left half
   pinsrd              xm0, [dstq+r3*1], 3
   FILTER_XMM           12, 0, 7, 14         ; a0 b0 a0 b0
   movq                xm7, [dstq+r3*2]
   pinsrd              xm7, [dstq+r4], 2
   palignr             xm7, xm0, 12          ; 0 _ _ _ _ _ _ _ _ _ _ 5 _ _ _ 6
   vpbroadcastd         m0, [tlq+28]
   vpbroadcastd         m9, [tlq+29]
   vbroadcasti128       m8, [base+filter_shuf1+16]
   vpblendd             m0, m9, 0x20
   vpblendd             m0, m7, 0x0f
   vpbroadcastq         m7, xm12
   vpblendd             m0, m7, 0xc2         ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
   call .main                                ; c0 d0 a1 b1   a1 b1 c0 d0
   add                  r3, 2
   lea                  r4, [r4+strideq*2]
   movlps              xm9, xm7, [tlq+29]    ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
   vpblendd           xm12, xm7, 0x0c        ; a0 b0 a1 b1
   FILTER_XMM            6, 9, 10, 14
   vpbroadcastq         m6, xm6              ; a2 b2 __ __ __ __ a2 b2
   vpbroadcastd         m9, [tlq+37]
   vpbroadcastd        m10, [tlq+36]
   vpblendd             m6, m9, 0x20         ; top
.w32_loop:
   movq                xm9, [dstq+r3*4]      ; left neighbors from left half
   pinsrd              xm9, [dstq+r4], 2
.w32_loop_last:
   palignr              m9, m0, 12
   vpblendd             m0, m9, m7, 0xe2     ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
   mova               xm13, xm7              ; c0 d0
   call .main                                ; e0 f0 c1 d1   c1 d1 e0 f0
   vpblendd             m9, m12, m10, 0xf0
   vpblendd            m12, m6, 0xc0
   pshufd               m9, m9, q3333
   vpblendd             m9, m6, 0xee
   vpblendd            m10, m9, m7, 0x0c     ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
                                             ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
   FILTER_YMM            6, 10, 9, 14        ; c2 d2 a3 b3   a3 b3 c2 d2
   vpblendd            m12, m6, 0x30         ; a0 b0 a1 b1   a3 b3 a2 b2
   vpermd               m9, m11, m12         ; a0 a1 a2 a3   b0 b1 b2 b3
   vpblendd           xm12, xm13, xm7, 0x0c  ; c0 d0 c1 d1
   mova         [dstq+strideq*0], xm9
   vextracti128 [dstq+strideq*1], m9, 1
   lea                dstq, [dstq+strideq*2]
   sub                 r5d, 2
   jg .w32_loop
   jz .w32_loop_last                         ; final iteration: skip reload
   vpblendd            xm7, xm6, xm10, 0x04  ; _ _ _ 5 _ _ _ 6 0 _ _ _ 1 2 3 4
   pshufd              xm7, xm7, q1032       ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
   FILTER_XMM            0, 7, 9, [base+filter_shuf1+16]
   vpblendd            xm6, xm0, 0x0c        ; c2 d2 c3 d3
   shufps              xm0, xm12, xm6, q2020 ; c0 c1 c2 c3
   shufps              xm6, xm12, xm6, q3131 ; d0 d1 d2 d3
   mova   [dstq+strideq*0], xm0
   mova   [dstq+strideq*1], xm6
.end:
   RET
ALIGN function_align
.main:
   ; Shared inner kernel: filter two 4x2 blocks from m0 into m7,
   ; using m9 as scratch and m8 as the shuffle control.
   FILTER_YMM            7, 0, 9, 8
   ret
   4392 
; Pick the scratch register aliased as t0 by the ipred_cfl_* functions
; below. NOTE(review): the register number differs per ABI, presumably to
; use a register that is free under each calling convention's x86inc
; mapping — confirm against x86inc.asm's register assignment.
%if WIN64
DECLARE_REG_TMP 5
%else
DECLARE_REG_TMP 7
%endif
   4398 
%macro IPRED_CFL 1 ; ac in, unpacked pixels out
; Chroma-from-luma prediction for one vector of ac coefficients:
;   dst = dc + ((alpha * ac + 32) >> 6)
; Register contract (set up by the .s* callers, e.g. .s4):
;   m0 = dc (words), m1 = alpha (words), m2 = |alpha| << 9
; pmulhrsw is a signed multiply, so the product is formed on |ac| and the
; combined sign restored afterwards with psignw:
;   pmulhrsw(|ac|, |alpha| << 9) == (|ac| * |alpha| + 32) >> 6
   psignw               m3, m%1, m1     ; m3 = ac with alpha's sign applied
   pabsw               m%1, m%1         ; |ac|
   pmulhrsw            m%1, m2          ; (|ac| * |alpha| + 32) >> 6
   psignw              m%1, m3          ; restore sign(ac) * sign(alpha)
   paddw               m%1, m0          ; + dc
%endmacro
   4406 
; void ipred_cfl_top_8bpc(pixel *dst, ptrdiff_t stride, const pixel *tl,
;                         int w, int h, const int16_t *ac, int alpha)
; CfL prediction with DC derived from the top edge only. Sums the `w` top
; pixels, then tail-calls into ipred_cfl_left's reduction code (indexed by
; log2(w), since the edge length here is the width) which averages and
; jumps to the width-specific splat handler.
cglobal ipred_cfl_top_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
   lea                  t0, [ipred_cfl_left_avx2_table]
   tzcnt                wd, wm           ; wd = log2(width)
   inc                 tlq               ; tlq now points at the top edge
   movu                 m0, [tlq]
   movifnidn            hd, hm
   mov                 r6d, 0x8000
   shrx                r6d, r6d, wd      ; xm3 = 0x8000 >> log2(w), the
   movd                xm3, r6d          ; pmulhrsw factor for a rounded /w
   movsxd               r6, [t0+wq*4]    ; reduction handler (shared w/ left)
   pcmpeqd              m2, m2           ; all-ones: -1 per signed byte/word
   pmaddubsw            m0, m2           ; negated pairwise pixel sums
   add                  r6, t0
   add                  t0, ipred_cfl_splat_avx2_table-ipred_cfl_left_avx2_table
   movsxd               wq, [t0+wq*4]    ; wq = splat handler for this width
   add                  wq, t0
   movifnidn           acq, acmp
   jmp                  r6
   4425 
; void ipred_cfl_left_8bpc(pixel *dst, ptrdiff_t stride, const pixel *tl,
;                          int w, int h, const int16_t *ac, int alpha)
; CfL prediction with DC derived from the left edge only. Sums the `h` left
; pixels via a reduction tree (.h32 -> .h16 -> .h8 -> .h4, entered at the
; level matching log2(h)), averages with a rounded divide-by-h, then jumps
; to the width-specific splat handler. ipred_cfl_top reuses the reduction
; labels with its own scale factor preloaded in xm3.
cglobal ipred_cfl_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
   mov                  hd, hm ; zero upper half
   tzcnt               r6d, hd            ; r6d = log2(height)
   sub                 tlq, hq             ; tlq points at the left edge
   tzcnt                wd, wm
   movu                 m0, [tlq]
   mov                 t0d, 0x8000
   shrx                t0d, t0d, r6d       ; xm3 = 0x8000 >> log2(h), the
   movd                xm3, t0d            ; pmulhrsw factor for a rounded /h
   lea                  t0, [ipred_cfl_left_avx2_table]
   movsxd               r6, [t0+r6*4]      ; reduction entry point for h
   pcmpeqd              m2, m2             ; all-ones: -1 per signed byte/word
   pmaddubsw            m0, m2             ; negated pairwise pixel sums
   add                  r6, t0
   add                  t0, ipred_cfl_splat_avx2_table-ipred_cfl_left_avx2_table
   movsxd               wq, [t0+wq*4]      ; wq = splat handler for this width
   add                  wq, t0
   movifnidn           acq, acmp
   jmp                  r6
; Reduction tree: each level folds the upper half of the partial sums into
; the lower half; only the low words are meaningful at the entry level used.
.h32:
   vextracti128        xm1, m0, 1
   paddw               xm0, xm1
.h16:
   punpckhqdq          xm1, xm0, xm0
   paddw               xm0, xm1
.h8:
   psrlq               xm1, xm0, 32
   paddw               xm0, xm1
.h4:
   pmaddwd             xm0, xm2            ; * -1 again -> positive total sum
   pmulhrsw            xm0, xm3            ; rounded sum / (1 << log2(n))
   vpbroadcastw         m0, xm0            ; broadcast DC to all words
   jmp                  wq
   4459 
; Chroma-from-luma prediction; the DC is the average of the top AND left
; edges combined (w+h pixels). Dispatch is two-level: r6 jumps to the
; height-specific .h* loader (sums the left edge), which falls through via
; jmp wq to the width-specific .w* reducer (adds the top edge, divides) and
; then into the .s* store loop that applies alpha to the AC coefficients.
; xm4 = (w+h)/2 rounding bias; xm5 = log2(w+h) shift; m3 = all -1, used as
; the pmaddubsw/pmaddwd multiplier (edge sums are accumulated negated, the
; final pmaddwd by -1 restores the sign while adding word pairs).
; When w != h the power-of-two shift is corrected with a Q16 reciprocal
; (0x5556 ~= 1/3, 0x3334 ~= 1/5) applied via pmulhuw.
cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
    movifnidn            hd, hm
    movifnidn            wd, wm
    tzcnt               r6d, hd
    lea                 t0d, [wq+hq]           ; t0d = w + h = total edge pixels
    movd                xm4, t0d
    tzcnt               t0d, t0d
    movd                xm5, t0d               ; xm5 = log2(w+h)
    lea                  t0, [ipred_cfl_avx2_table]
    tzcnt                wd, wd
    movsxd               r6, [t0+r6*4]          ; height entry (first 4 slots)
    movsxd               wq, [t0+wq*4+4*4]      ; width entry (next 4 slots)
    pcmpeqd              m3, m3                 ; m3 = all -1
    psrlw               xm4, 1                  ; (w+h)/2 rounding bias
    add                  r6, t0
    add                  wq, t0
    movifnidn           acq, acmp
    jmp                  r6
.h4:
    movd                xm0, [tlq-4]            ; 4 left pixels
    pmaddubsw           xm0, xm3                ; negated pair sums
    jmp                  wq
.w4:
    movd                xm1, [tlq+1]            ; 4 top pixels
    pmaddubsw           xm1, xm3
    psubw               xm0, xm4                ; fold in rounding bias (sum is negated)
    paddw               xm0, xm1
    pmaddwd             xm0, xm3                ; * -1: positive sum + bias
    cmp                  hd, 4
    jg .w4_mul
    psrlw               xm0, 3                  ; w==h==4: /8
    jmp .w4_end
.w4_mul:
    punpckhqdq          xm1, xm0, xm0
    lea                 r2d, [hq*2]
    mov                 r6d, 0x55563334         ; packed reciprocals, selected by 2*h shift
    paddw               xm0, xm1
    shrx                r6d, r6d, r2d           ; h=8 -> 0x5556 (/3), h=16 -> low word 0x3334 (/5)
    psrlq               xm1, xm0, 32
    paddw               xm0, xm1
    movd                xm1, r6d
    psrlw               xm0, 2                  ; power-of-two part of the divide
    pmulhuw             xm0, xm1                ; reciprocal part -> DC
.w4_end:
    vpbroadcastw         m0, xm0
.s4:
    vpbroadcastw         m1, alpham             ; m1 = alpha in every word
    lea                  r6, [strideq*3]
    pabsw                m2, m1
    psllw                m2, 9                  ; |alpha|<<9: rounding term for IPRED_CFL
.s4_loop:
    mova                 m4, [acq]
    IPRED_CFL             4                     ; m4 = DC + alpha*ac (macro defined above)
    packuswb             m4, m4
    vextracti128        xm5, m4, 1
    movd   [dstq+strideq*0], xm4
    pextrd [dstq+strideq*1], xm4, 1
    movd   [dstq+strideq*2], xm5
    pextrd [dstq+r6       ], xm5, 1
    lea                dstq, [dstq+strideq*4]
    add                 acq, 32
    sub                  hd, 4
    jg .s4_loop
    RET
ALIGN function_align
.h8:
    movq                xm0, [tlq-8]            ; 8 left pixels
    pmaddubsw           xm0, xm3
    jmp                  wq
.w8:
    movq                xm1, [tlq+1]            ; 8 top pixels
    vextracti128        xm2, m0, 1
    pmaddubsw           xm1, xm3
    psubw               xm0, xm4                ; rounding bias
    paddw               xm0, xm2
    punpckhqdq          xm2, xm0, xm0
    paddw               xm0, xm2
    paddw               xm0, xm1
    psrlq               xm1, xm0, 32
    paddw               xm0, xm1
    pmaddwd             xm0, xm3
    psrlw               xm0, xm5                ; >> log2(w+h)
    cmp                  hd, 8
    je .w8_end                                  ; w==h: shift alone is exact
    mov                 r6d, 0x5556             ; ~1/3 in Q16 (h=4 or h=16 cases)
    mov                 r2d, 0x3334             ; ~1/5 in Q16 (h=32 case)
    cmp                  hd, 32
    cmove               r6d, r2d
    movd                xm1, r6d
    pmulhuw             xm0, xm1
.w8_end:
    vpbroadcastw         m0, xm0
.s8:
    vpbroadcastw         m1, alpham
    lea                  r6, [strideq*3]
    pabsw                m2, m1
    psllw                m2, 9
.s8_loop:
    mova                 m4, [acq]
    mova                 m5, [acq+32]
    IPRED_CFL             4
    IPRED_CFL             5
    packuswb             m4, m5
    vextracti128        xm5, m4, 1
    movq   [dstq+strideq*0], xm4
    movq   [dstq+strideq*1], xm5
    movhps [dstq+strideq*2], xm4
    movhps [dstq+r6       ], xm5
    lea                dstq, [dstq+strideq*4]
    add                 acq, 64
    sub                  hd, 4
    jg .s8_loop
    RET
ALIGN function_align
.h16:
    mova                xm0, [tlq-16]           ; 16 left pixels
    pmaddubsw           xm0, xm3
    jmp                  wq
.w16:
    movu                xm1, [tlq+1]            ; 16 top pixels
    vextracti128        xm2, m0, 1
    pmaddubsw           xm1, xm3
    psubw               xm0, xm4
    paddw               xm0, xm2
    paddw               xm0, xm1
    punpckhqdq          xm1, xm0, xm0
    paddw               xm0, xm1
    psrlq               xm1, xm0, 32
    paddw               xm0, xm1
    pmaddwd             xm0, xm3
    psrlw               xm0, xm5
    cmp                  hd, 16
    je .w16_end
    mov                 r6d, 0x5556
    mov                 r2d, 0x3334
    test                 hb, 8|32                ; h in {8,32} -> /3 factor, else /5
    cmovz               r6d, r2d
    movd                xm1, r6d
    pmulhuw             xm0, xm1
.w16_end:
    vpbroadcastw         m0, xm0
.s16:
    vpbroadcastw         m1, alpham
    pabsw                m2, m1
    psllw                m2, 9
.s16_loop:
    mova                 m4, [acq]
    mova                 m5, [acq+32]
    IPRED_CFL             4
    IPRED_CFL             5
    packuswb             m4, m5
    vpermq               m4, m4, q3120           ; fix 128-bit lane interleave from packuswb
    mova         [dstq+strideq*0], xm4
    vextracti128 [dstq+strideq*1], m4, 1
    lea                dstq, [dstq+strideq*2]
    add                 acq, 64
    sub                  hd, 2
    jg .s16_loop
    RET
ALIGN function_align
.h32:
    mova                 m0, [tlq-32]            ; 32 left pixels
    pmaddubsw            m0, m3
    jmp                  wq
.w32:
    movu                 m1, [tlq+1]             ; 32 top pixels
    pmaddubsw            m1, m3
    paddw                m0, m1
    vextracti128        xm1, m0, 1
    psubw               xm0, xm4
    paddw               xm0, xm1
    punpckhqdq          xm1, xm0, xm0
    paddw               xm0, xm1
    psrlq               xm1, xm0, 32
    paddw               xm0, xm1
    pmaddwd             xm0, xm3
    psrlw               xm0, xm5
    cmp                  hd, 32
    je .w32_end
    lea                 r2d, [hq*2]
    mov                 r6d, 0x33345556          ; packed reciprocals, selected by 2*h shift
    shrx                r6d, r6d, r2d
    movd                xm1, r6d
    pmulhuw             xm0, xm1
.w32_end:
    vpbroadcastw         m0, xm0
.s32:
    vpbroadcastw         m1, alpham
    pabsw                m2, m1
    psllw                m2, 9
.s32_loop:
    mova                 m4, [acq]
    mova                 m5, [acq+32]
    IPRED_CFL             4
    IPRED_CFL             5
    packuswb             m4, m5
    vpermq               m4, m4, q3120
    mova             [dstq], m4
    add                dstq, strideq
    add                 acq, 64
    dec                  hd
    jg .s32_loop
    RET
   4663 
; Chroma-from-luma prediction with a fixed DC of 128 (used when no
; neighbouring edge pixels are available). Simply broadcasts pw_128 and
; tail-calls the shared width-specific splat/store loop.
cglobal ipred_cfl_128_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
    lea                  t0, [ipred_cfl_splat_avx2_table]
    tzcnt                wd, wm
    movifnidn            hd, hm
    movsxd               wq, [t0+wq*4]
    vpbroadcastd         m0, [t0-ipred_cfl_splat_avx2_table+pw_128] ; DC = 128 in every word
    add                  wq, t0
    movifnidn           acq, acmp
    jmp                  wq                     ; width-specific .s* loop
   4673 
; Compute the CfL AC buffer from 4:2:0 luma: each output word is the sum of
; a 2x2 luma block scaled by 2 (pmaddubsw with pb_2 sums horizontal pairs
; *2, the row paddw adds the vertical pair), after which the block average
; is subtracted from every entry. wpad/hpad give the number of padded
; 4-pixel units on the right/bottom; padded regions replicate the last
; valid column/row. m4 accumulates the running sum for the average.
cglobal ipred_cfl_ac_420_8bpc, 4, 9, 5, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
    movifnidn         hpadd, hpadm
    movifnidn            wd, wm
    mov                  hd, hm
    mov                 szd, wd
    mov             ac_bakq, acq               ; keep start of ac for the subtract pass
    imul                szd, hd                ; sz = w*h = number of AC entries
    shl               hpadd, 2                 ; hpad in rows
    sub                  hd, hpadd              ; h = unpadded rows
    vpbroadcastd         m2, [pb_2]
    pxor                 m4, m4                ; sum accumulator
    cmp                  wd, 8
    jg .w16
    je .w8
    ; fall-through

    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak
.w4:
    lea            stride3q, [strideq*3]
.w4_loop:
    movq                xm0, [yq]
    movq                xm1, [yq+strideq]
    movhps              xm0, [yq+strideq*2]
    movhps              xm1, [yq+stride3q]
    pmaddubsw           xm0, xm2               ; 2*(horizontal pair sums)
    pmaddubsw           xm1, xm2
    paddw               xm0, xm1               ; + vertical neighbour row
    mova              [acq], xm0
    paddw               xm4, xm0
    lea                  yq, [yq+strideq*4]
    add                 acq, 16
    sub                  hd, 2
    jg .w4_loop
    test              hpadd, hpadd
    jz .calc_avg
    vpermq               m0, m0, q1111          ; replicate last output row
.w4_hpad_loop:
    mova              [acq], m0
    paddw                m4, m0
    add                 acq, 32
    sub               hpadd, 4
    jg .w4_hpad_loop
    jmp .calc_avg

.w8:
    lea            stride3q, [strideq*3]
    test              wpadd, wpadd
    jnz .w8_wpad
.w8_loop:
    mova                xm0, [yq]
    mova                xm1, [yq+strideq]
    vinserti128          m0, [yq+strideq*2], 1
    vinserti128          m1, [yq+stride3q], 1
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    paddw                m0, m1
    mova              [acq], m0
    paddw                m4, m0
    lea                  yq, [yq+strideq*4]
    add                 acq, 32
    sub                  hd, 2
    jg .w8_loop
    test              hpadd, hpadd
    jz .calc_avg
    jmp .w8_hpad
.w8_wpad:
    vbroadcasti128       m3, [cfl_ac_w8_pad1_shuffle] ; replicates last valid column
.w8_wpad_loop:
    movq                xm0, [yq]
    movq                xm1, [yq+strideq]
    vinserti128          m0, [yq+strideq*2], 1
    vinserti128          m1, [yq+stride3q], 1
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    paddw                m0, m1
    pshufb               m0, m3                 ; right-edge padding
    mova              [acq], m0
    paddw                m4, m0
    lea                  yq, [yq+strideq*4]
    add                 acq, 32
    sub                  hd, 2
    jg .w8_wpad_loop
    test              hpadd, hpadd
    jz .calc_avg
.w8_hpad:
    vpermq               m0, m0, q3232          ; replicate last output row
.w8_hpad_loop:
    mova              [acq], m0
    paddw                m4, m0
    add                 acq, 32
    sub               hpadd, 2
    jg .w8_hpad_loop
    jmp .calc_avg

.w16:
    test              wpadd, wpadd
    jnz .w16_wpad
.w16_loop:
    mova                 m0, [yq]
    mova                 m1, [yq+strideq]
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    paddw                m0, m1
    mova              [acq], m0
    paddw                m4, m0
    lea                  yq, [yq+strideq*2]
    add                 acq, 32
    dec                  hd
    jg .w16_loop
    test              hpadd, hpadd
    jz .calc_avg
    jmp .w16_hpad_loop
.w16_wpad:
    ; Dispatch on wpad via a jump table; m3 gets the matching pad shuffle.
    DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak
    lea               iptrq, [ipred_cfl_ac_420_avx2_table]
    shl               wpadd, 2
    mova                 m3, [iptrq+cfl_ac_w16_pad_shuffle- \
                             ipred_cfl_ac_420_avx2_table+wpadq*8-32]
    movsxd            wpadq, [iptrq+wpadq+4]
    add               iptrq, wpadq
    jmp iptrq
.w16_pad3:
    vpbroadcastq         m0, [yq]               ; only 4 valid output units
    vpbroadcastq         m1, [yq+strideq]
    jmp .w16_wpad_end
.w16_pad2:
    vbroadcasti128       m0, [yq]               ; only 8 valid output units
    vbroadcasti128       m1, [yq+strideq]
    jmp .w16_wpad_end
.w16_pad1:
    mova                 m0, [yq]               ; 12 valid output units
    mova                 m1, [yq+strideq]
    ; fall-through
.w16_wpad_end:
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    paddw                m0, m1
    pshufb               m0, m3                 ; spread/replicate into padded lanes
    mova              [acq], m0
    paddw                m4, m0
    lea                  yq, [yq+strideq*2]
    add                 acq, 32
    dec                  hd
    jz .w16_wpad_done
    jmp iptrq                                   ; loop via the pad-specific entry
.w16_wpad_done:
    test              hpadd, hpadd
    jz .calc_avg
.w16_hpad_loop:
    mova              [acq], m0
    paddw                m4, m0
    add                 acq, 32
    dec               hpadd
    jg .w16_hpad_loop
    ; fall-through

.calc_avg:
    ; average = (sum + sz/2) >> log2(sz); then subtract it from all entries.
    vpbroadcastd         m2, [pw_1]
    pmaddwd              m0, m4, m2             ; widen word sums to dwords
    vextracti128        xm1, m0, 1
    tzcnt               r1d, szd                ; log2(sz)
    paddd               xm0, xm1
    movd                xm2, r1d
    movd                xm3, szd
    punpckhqdq          xm1, xm0, xm0
    paddd               xm0, xm1
    psrad               xm3, 1                  ; sz/2 rounding bias
    psrlq               xm1, xm0, 32
    paddd               xm0, xm3
    paddd               xm0, xm1
    psrad               xm0, xm2
    vpbroadcastw         m0, xm0                ; average in every word
.sub_loop:
    mova                 m1, [ac_bakq]
    psubw                m1, m0
    mova          [ac_bakq], m1
    add             ac_bakq, 32
    sub                 szd, 16
    jg .sub_loop
    RET
   4854 
; Compute the CfL AC buffer from 4:2:2 luma: only horizontal subsampling,
; so each output word is a horizontal pixel pair scaled by 4 (pmaddubsw
; with pb_4); no vertical pair is added, hence twice as many output rows as
; the 420 variant. Two accumulators (m4/m5) split the running sum to avoid
; 16-bit overflow; they are widened and merged in .calc_avg.
cglobal ipred_cfl_ac_422_8bpc, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
    movifnidn         hpadd, hpadm
    movifnidn            wd, wm
    mov                  hd, hm
    mov                 szd, wd
    mov             ac_bakq, acq               ; keep start of ac for the subtract pass
    imul                szd, hd                ; sz = w*h
    shl               hpadd, 2
    sub                  hd, hpadd
    vpbroadcastd         m2, [pb_4]
    pxor                 m4, m4                ; sum accumulator A
    pxor                 m5, m5                ; sum accumulator B
    cmp                  wd, 8
    jg .w16
    je .w8
    ; fall-through

    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak
.w4:
    lea            stride3q, [strideq*3]
.w4_loop:
    movq                xm1, [yq]
    movhps              xm1, [yq+strideq]
    movq                xm0, [yq+strideq*2]
    movhps              xm0, [yq+stride3q]
    pmaddubsw           xm0, xm2               ; 4*(horizontal pair sums)
    pmaddubsw           xm1, xm2
    mova              [acq], xm1
    mova           [acq+16], xm0
    paddw               xm4, xm0
    paddw               xm5, xm1
    lea                  yq, [yq+strideq*4]
    add                 acq, 32
    sub                  hd, 4
    jg .w4_loop
    test              hpadd, hpadd
    jz .calc_avg
    vpermq               m0, m0, q1111          ; replicate last output row
.w4_hpad_loop:
    mova              [acq], m0
    paddw                m4, m0
    add                 acq, 32
    sub               hpadd, 4
    jg .w4_hpad_loop
    jmp .calc_avg

.w8:
    lea            stride3q, [strideq*3]
    test              wpadd, wpadd
    jnz .w8_wpad
.w8_loop:
    mova                xm1, [yq]
    vinserti128          m1, [yq+strideq], 1
    mova                xm0, [yq+strideq*2]
    vinserti128          m0, [yq+stride3q], 1
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    mova              [acq], m1
    mova           [acq+32], m0
    paddw                m4, m0
    paddw                m5, m1
    lea                  yq, [yq+strideq*4]
    add                 acq, 64
    sub                  hd, 4
    jg .w8_loop
    test              hpadd, hpadd
    jz .calc_avg
    jmp .w8_hpad
.w8_wpad:
    vbroadcasti128       m3, [cfl_ac_w8_pad1_shuffle] ; replicates last valid column
.w8_wpad_loop:
    movq                xm1, [yq]
    vinserti128          m1, [yq+strideq], 1
    movq                xm0, [yq+strideq*2]
    vinserti128          m0, [yq+stride3q], 1
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    pshufb               m0, m3                 ; right-edge padding
    pshufb               m1, m3
    mova              [acq], m1
    mova           [acq+32], m0
    paddw                m4, m0
    paddw                m5, m1
    lea                  yq, [yq+strideq*4]
    add                 acq, 64
    sub                  hd, 4
    jg .w8_wpad_loop
    test              hpadd, hpadd
    jz .calc_avg
.w8_hpad:
    vpermq               m0, m0, q3232          ; replicate last output row
.w8_hpad_loop:
    mova              [acq], m0
    paddw                m4, m0
    add                 acq, 32
    sub               hpadd, 2
    jg .w8_hpad_loop
    jmp .calc_avg

.w16:
    test              wpadd, wpadd
    jnz .w16_wpad
.w16_loop:
    mova                 m1, [yq]
    mova                 m0, [yq+strideq]
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    mova              [acq], m1
    mova           [acq+32], m0
    paddw                m4, m0
    paddw                m5, m1
    lea                  yq, [yq+strideq*2]
    add                 acq, 64
    sub                  hd, 2
    jg .w16_loop
    test              hpadd, hpadd
    jz .calc_avg
    jmp .w16_hpad_loop
.w16_wpad:
    ; Dispatch on wpad via a jump table; m3 gets the matching pad shuffle.
    DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak
    lea               iptrq, [ipred_cfl_ac_422_avx2_table]
    shl               wpadd, 2
    mova                 m3, [iptrq+cfl_ac_w16_pad_shuffle- \
                             ipred_cfl_ac_422_avx2_table+wpadq*8-32]
    movsxd            wpadq, [iptrq+wpadq+4]
    add               iptrq, wpadq
    jmp iptrq
.w16_pad3:
    vpbroadcastq         m1, [yq]               ; only 4 valid output units
    vpbroadcastq         m0, [yq+strideq]
    jmp .w16_wpad_end
.w16_pad2:
    vbroadcasti128       m1, [yq]               ; only 8 valid output units
    vbroadcasti128       m0, [yq+strideq]
    jmp .w16_wpad_end
.w16_pad1:
    mova                 m1, [yq]               ; 12 valid output units
    mova                 m0, [yq+strideq]
    ; fall-through
.w16_wpad_end:
    pmaddubsw            m0, m2
    pmaddubsw            m1, m2
    pshufb               m0, m3                 ; spread/replicate into padded lanes
    pshufb               m1, m3
    mova              [acq], m1
    mova           [acq+32], m0
    paddw                m4, m0
    paddw                m5, m1
    lea                  yq, [yq+strideq*2]
    add                 acq, 64
    sub                  hd, 2
    jz .w16_wpad_done
    jmp iptrq                                   ; loop via the pad-specific entry
.w16_wpad_done:
    test              hpadd, hpadd
    jz .calc_avg
.w16_hpad_loop:
    mova              [acq], m0
    mova           [acq+32], m0
    paddw                m4, m0
    paddw                m5, m0
    add                 acq, 64
    sub               hpadd, 2
    jg .w16_hpad_loop
    ; fall-through

.calc_avg:
    ; Merge both accumulators, average = (sum + sz/2) >> log2(sz), subtract.
    vpbroadcastd         m2, [pw_1]
    pmaddwd              m5, m5, m2             ; widen word sums to dwords
    pmaddwd              m0, m4, m2
    paddd                m0, m5
    vextracti128        xm1, m0, 1
    tzcnt               r1d, szd                ; log2(sz)
    paddd               xm0, xm1
    movd                xm2, r1d
    movd                xm3, szd
    punpckhqdq          xm1, xm0, xm0
    paddd               xm0, xm1
    psrad               xm3, 1                  ; sz/2 rounding bias
    psrlq               xm1, xm0, 32
    paddd               xm0, xm3
    paddd               xm0, xm1
    psrad               xm0, xm2
    vpbroadcastw         m0, xm0                ; average in every word
.sub_loop:
    mova                 m1, [ac_bakq]
    psubw                m1, m0
    mova          [ac_bakq], m1
    add             ac_bakq, 32
    sub                 szd, 16
    jg .sub_loop
    RET
   5047 
; Compute the CfL AC buffer from 4:4:4 luma: no subsampling, each output
; word is the luma pixel scaled by 8 (psllw 3). Width dispatch goes through
; ipred_cfl_ac_444_avx2_table. For w>=16 the running sum is widened to
; dwords as it accumulates (pmaddwd with pw_1); for w<=8 it stays in words
; and is widened once in .calc_avg_mul.
cglobal ipred_cfl_ac_444_8bpc, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
    movifnidn         hpadd, hpadm
    movifnidn            wd, wm
    mov                  hd, hm
    mov                 szd, wd
    imul                szd, hd                ; sz = w*h
    shl               hpadd, 2
    sub                  hd, hpadd
    pxor                 m4, m4                ; sum accumulator
    vpbroadcastd         m5, [pw_1]
    tzcnt               r8d, wd
    lea                  r5, [ipred_cfl_ac_444_avx2_table]
    movsxd               r8, [r5+r8*4+12]
    add                  r5, r8

    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak
    mov             ac_bakq, acq               ; keep start of ac for the subtract pass
    jmp                  r5

.w4:
    lea            stride3q, [strideq*3]
    pxor                xm2, xm2
.w4_loop:
    movd                xm1, [yq]
    movd                xm0, [yq+strideq*2]
    pinsrd              xm1, [yq+strideq], 1
    pinsrd              xm0, [yq+stride3q], 1
    punpcklbw           xm1, xm2               ; zero-extend bytes to words
    punpcklbw           xm0, xm2
    psllw               xm1, 3                 ; pixel * 8
    psllw               xm0, 3
    mova              [acq], xm1
    mova           [acq+16], xm0
    paddw               xm1, xm0
    paddw               xm4, xm1
    lea                  yq, [yq+strideq*4]
    add                 acq, 32
    sub                  hd, 4
    jg .w4_loop
    test              hpadd, hpadd
    jz .calc_avg_mul
    pshufd              xm0, xm0, q3232         ; replicate last output row
    paddw               xm1, xm0, xm0
.w4_hpad_loop:
    mova              [acq], xm0
    mova           [acq+16], xm0
    paddw               xm4, xm1
    add                 acq, 32
    sub               hpadd, 4
    jg .w4_hpad_loop
    jmp .calc_avg_mul

.w8:
    lea            stride3q, [strideq*3]
    pxor                 m2, m2
.w8_loop:
    movq                xm1, [yq]
    movq                xm0, [yq+strideq*2]
    vinserti128          m1, [yq+strideq], 1
    vinserti128          m0, [yq+stride3q], 1
    punpcklbw            m1, m2
    punpcklbw            m0, m2
    psllw                m1, 3
    psllw                m0, 3
    mova              [acq], m1
    mova           [acq+32], m0
    paddw                m1, m0
    paddw                m4, m1
    lea                  yq, [yq+strideq*4]
    add                 acq, 64
    sub                  hd, 4
    jg .w8_loop
    test              hpadd, hpadd
    jz .calc_avg_mul
    vpermq               m0, m0, q3232          ; replicate last output row
    paddw                m1, m0, m0
.w8_hpad_loop:
    mova              [acq], m0
    mova           [acq+32], m0
    paddw                m4, m1
    add                 acq, 64
    sub               hpadd, 4
    jg .w8_hpad_loop
    jmp .calc_avg_mul

.w16:
    test              wpadd, wpadd
    jnz .w16_wpad
.w16_loop:
    pmovzxbw             m1, [yq]
    pmovzxbw             m0, [yq+strideq]
    psllw                m1, 3
    psllw                m0, 3
    mova              [acq], m1
    mova           [acq+32], m0
    paddw                m1, m0
    pmaddwd              m1, m5                 ; widen to dwords before accumulating
    paddd                m4, m1
    lea                  yq, [yq+strideq*2]
    add                 acq, 64
    sub                  hd, 2
    jg .w16_loop
    test              hpadd, hpadd
    jz .calc_avg
    jmp .w16_hpad
.w16_wpad:
    mova                 m3, [cfl_ac_444_w16_pad1_shuffle] ; replicates last valid column
.w16_wpad_loop:
    vpbroadcastq         m1, [yq]
    vpbroadcastq         m0, [yq+strideq]
    pshufb               m1, m3
    pshufb               m0, m3
    psllw                m1, 3
    psllw                m0, 3
    mova              [acq], m1
    mova           [acq+32], m0
    paddw                m1, m0
    pmaddwd              m1, m5
    paddd                m4, m1
    lea                  yq, [yq+strideq*2]
    add                 acq, 64
    sub                  hd, 2
    jg .w16_wpad_loop
    test              hpadd, hpadd
    jz .calc_avg
.w16_hpad:
    paddw                m1, m0, m0             ; last row counted twice per padded pair
    pmaddwd              m1, m5
.w16_hpad_loop:
    mova              [acq], m0
    mova           [acq+32], m0
    paddd                m4, m1
    add                 acq, 64
    sub               hpadd, 2
    jg .w16_hpad_loop
    jmp .calc_avg

.w32:
    test              wpadd, wpadd
    jnz .w32_wpad
.w32_loop:
    pmovzxbw             m1, [yq]
    pmovzxbw             m0, [yq+16]
    psllw                m1, 3
    psllw                m0, 3
    mova              [acq], m1
    mova           [acq+32], m0
    paddw                m2, m1, m0
    pmaddwd              m2, m5
    paddd                m4, m2
    add                  yq, strideq
    add                 acq, 64
    dec                  hd
    jg .w32_loop
    test              hpadd, hpadd
    jz .calc_avg
    jmp .w32_hpad_loop
.w32_wpad:
    ; Dispatch on wpad via a jump table; m3 is the column-replication shuffle.
    DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak
    lea               iptrq, [ipred_cfl_ac_444_avx2_table]
    add               wpadd, wpadd
    mova                 m3, [iptrq+cfl_ac_444_w16_pad1_shuffle-ipred_cfl_ac_444_avx2_table]
    movsxd            wpadq, [iptrq+wpadq+4]
    add               iptrq, wpadq
    jmp iptrq
.w32_pad3:
    vpbroadcastq         m1, [yq]               ; only 8 valid pixels
    pshufb               m1, m3
    vpermq               m0, m1, q3232
    jmp .w32_wpad_end
.w32_pad2:
    pmovzxbw             m1, [yq]               ; only 16 valid pixels
    pshufhw              m0, m1, q3333
    vpermq               m0, m0, q3333
    jmp .w32_wpad_end
.w32_pad1:
    pmovzxbw             m1, [yq]               ; 24 valid pixels
    vpbroadcastq         m0, [yq+16]
    pshufb               m0, m3
    ; fall-through
.w32_wpad_end:
    psllw                m1, 3
    psllw                m0, 3
    mova              [acq], m1
    mova           [acq+32], m0
    paddw                m2, m1, m0
    pmaddwd              m2, m5
    paddd                m4, m2
    add                  yq, strideq
    add                 acq, 64
    dec                  hd
    jz .w32_wpad_done
    jmp iptrq                                   ; loop via the pad-specific entry
.w32_wpad_done:
    test              hpadd, hpadd
    jz .calc_avg
.w32_hpad_loop:
    mova              [acq], m1
    mova           [acq+32], m0
    paddd                m4, m2
    add                 acq, 64
    dec               hpadd
    jg .w32_hpad_loop
    jmp .calc_avg

.calc_avg_mul:
    pmaddwd              m4, m5                 ; widen word sums (w<=8 paths)
.calc_avg:
    ; average = (sum + sz/2) >> log2(sz); then subtract it from all entries.
    vextracti128        xm1, m4, 1
    tzcnt               r1d, szd                ; log2(sz)
    paddd               xm0, xm4, xm1
    movd                xm2, r1d
    movd                xm3, szd
    punpckhqdq          xm1, xm0, xm0
    paddd               xm0, xm1
    psrad               xm3, 1                  ; sz/2 rounding bias
    psrlq               xm1, xm0, 32
    paddd               xm0, xm3
    paddd               xm0, xm1
    psrad               xm0, xm2
    vpbroadcastw         m0, xm0                ; average in every word
.sub_loop:
    mova                 m1, [ac_bakq]
    psubw                m1, m0
    mova          [ac_bakq], m1
    add             ac_bakq, 32
    sub                 szd, 16
    jg .sub_loop
    RET
   5277 
; Palette prediction: expand packed 4-bit palette indices (two per byte)
; into 8-bit pixels via a pshufb lookup of the 8-entry palette. pshufb uses
; only the low 4 bits of each control byte, so the shifted high-nibble
; bytes need no masking. Low- and high-nibble lookups are re-interleaved
; with punpcklbw/punpckhbw to restore pixel order.
cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
    vpbroadcastq         m4, [palq]             ; 8-byte palette in every qword
    lea                  r2, [pal_pred_avx2_table]
    tzcnt                wd, wm
    movifnidn            hd, hm
    movsxd               wq, [r2+wq*4]
    add                  wq, r2
    lea                  r2, [strideq*3]
    jmp                  wq
.w4:
    movq                xm0, [idxq]             ; 16 indices (4 rows of 4)
    add                idxq, 8
    psrlw               xm1, xm0, 4             ; high nibbles
    punpcklbw           xm0, xm1                ; interleave low/high index bytes
    pshufb              xm0, xm4, xm0           ; palette lookup
    movd   [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    pextrd [dstq+strideq*2], xm0, 2
    pextrd [dstq+r2       ], xm0, 3
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w4
    RET
.w8:
    movu                xm2, [idxq]             ; 32 indices (4 rows of 8)
    add                idxq, 16
    pshufb              xm1, xm4, xm2           ; lookup of low nibbles
    psrlw               xm2, 4
    pshufb              xm2, xm4, xm2           ; lookup of high nibbles
    punpcklbw           xm0, xm1, xm2           ; restore pixel order
    punpckhbw           xm1, xm2
    movq   [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    movq   [dstq+strideq*2], xm1
    movhps [dstq+r2       ], xm1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w8
    RET
.w16:
    movu                 m2, [idxq]             ; 64 indices (4 rows of 16)
    add                idxq, 32
    pshufb               m1, m4, m2
    psrlw                m2, 4
    pshufb               m2, m4, m2
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova         [dstq+strideq*0], xm0
    mova         [dstq+strideq*1], xm1
    vextracti128 [dstq+strideq*2], m0, 1
    vextracti128 [dstq+r2       ], m1, 1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w16
    RET
.w32:
    vpermq               m2, [idxq], q3120      ; pre-permute so punpck lanes line up
    add                idxq, 32
    pshufb               m1, m4, m2
    psrlw                m2, 4
    pshufb               m2, m4, m2
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w32
    RET
.w64:
    vpermq               m2, [idxq], q3120
    add                idxq, 32
    pshufb               m1, m4, m2
    psrlw                m2, 4
    pshufb               m2, m4, m2
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova        [dstq+32*0], m0                 ; one full 64-pixel row per iteration
    mova        [dstq+32*1], m1
    add                dstq, strideq
    dec                  hd
    jg .w64
    RET
   5361 
   5362 %endif