tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

vp9intrapred_16bpp.asm (94365B)


      1 ;******************************************************************************
      2 ;* VP9 Intra prediction SIMD optimizations
      3 ;*
      4 ;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com>
      5 ;* Copyright (c) 2015 Henrik Gramner <henrik gramner com>
      6 ;*
      7 ;* This file is part of FFmpeg.
      8 ;*
      9 ;* FFmpeg is free software; you can redistribute it and/or
     10 ;* modify it under the terms of the GNU Lesser General Public
     11 ;* License as published by the Free Software Foundation; either
     12 ;* version 2.1 of the License, or (at your option) any later version.
     13 ;*
     14 ;* FFmpeg is distributed in the hope that it will be useful,
     15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
     16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     17 ;* Lesser General Public License for more details.
     18 ;*
     19 ;* You should have received a copy of the GNU Lesser General Public
     20 ;* License along with FFmpeg; if not, write to the Free Software
     21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     22 ;******************************************************************************
     23 
     24 %include "libavutil/x86/x86util.asm"
     25 
     26 SECTION_RODATA 32
     27 
     28 pd_2: times 8 dd 2
     29 pd_4: times 8 dd 4
     30 pd_8: times 8 dd 8
     31 
     32 pb_2to15_14_15: db 2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15
     33 pb_4_5_8to13_8x0: db 4,5,8,9,10,11,12,13,0,0,0,0,0,0,0,0
     34 pb_0to7_67x4: db 0,1,2,3,4,5,6,7,6,7,6,7,6,7,6,7
     35 
     36 cextern pw_1
     37 cextern pw_1023
     38 cextern pw_4095
     39 cextern pd_16
     40 cextern pd_32
     41 cextern pd_65535;
     42 
     43 ; FIXME most top-only functions (ddl, vl, v, dc_top) can be modified to take
     44 ; only 3 registers on x86-32, which would make it one cycle faster, but that
     45 ; would make the code quite a bit uglier...
     46 
     47 SECTION .text
     48 
     49 %macro SCRATCH 3-4                          ; spill m%1: x86-64 swaps it into spare reg m%2, x86-32 stores it at [%3]
     50 %if ARCH_X86_64
     51    SWAP                %1, %2
     52 %if %0 == 4
     53 %define reg_%4 m%2                          ; optional 4th arg names the spilled value as reg_%4 (a register here)
     54 %endif
     55 %else
     56    mova              [%3], m%1               ; x86-32 has only 8 xmm regs, so spill to the given memory slot
     57 %if %0 == 4
     58 %define reg_%4 [%3]                         ; reg_%4 is a memory operand on x86-32
     59 %endif
     60 %endif
     61 %endmacro
     62 
     63 %macro UNSCRATCH 3-4                        ; inverse of SCRATCH: restore m%1 from reg m%2 (x86-64) or memory [%3] (x86-32)
     64 %if ARCH_X86_64
     65    SWAP                %1, %2
     66 %else
     67    mova               m%1, [%3]
     68 %endif
     69 %if %0 == 4
     70 %undef reg_%4                               ; drop the alias created by the matching SCRATCH
     71 %endif
     72 %endmacro
     73 
     74 %macro PRELOAD 2-3                          ; load [%2] into m%1 only when regs are plentiful (x86-64); else alias reg_%3 to memory
     75 %if ARCH_X86_64
     76    mova               m%1, [%2]
     77 %if %0 == 3
     78 %define reg_%3 m%1                          ; reg_%3 resolves to a register on x86-64
     79 %endif
     80 %elif %0 == 3
     81 %define reg_%3 [%2]                         ; on x86-32 leave the value in memory and reference it directly
     82 %endif
     83 %endmacro
     84 
     85 INIT_MMX mmx
     86 cglobal vp9_ipred_v_4x4_16, 2, 4, 1, dst, stride, l, a  ; vertical pred, 4x4, 16bpp: copy the above row into all 4 rows
     87    movifnidn               aq, amp          ; fetch "above" pointer from the stack on x86-32
     88    mova                    m0, [aq]         ; m0 = 4 above pixels (8 bytes at 16bpp)
     89    DEFINE_ARGS dst, stride, stride3
     90    lea               stride3q, [strideq*3]
     91    mova      [dstq+strideq*0], m0           ; replicate the top row into every output row
     92    mova      [dstq+strideq*1], m0
     93    mova      [dstq+strideq*2], m0
     94    mova      [dstq+stride3q ], m0
     95    RET
     96 
     97 INIT_XMM sse
     98 cglobal vp9_ipred_v_8x8_16, 2, 4, 1, dst, stride, l, a  ; vertical pred, 8x8, 16bpp: copy the above row into all 8 rows
     99    movifnidn               aq, amp          ; fetch "above" pointer from the stack on x86-32
    100    mova                    m0, [aq]         ; m0 = 8 above pixels (16 bytes at 16bpp)
    101    DEFINE_ARGS dst, stride, stride3
    102    lea               stride3q, [strideq*3]
    103    mova      [dstq+strideq*0], m0           ; first 4 rows
    104    mova      [dstq+strideq*1], m0
    105    mova      [dstq+strideq*2], m0
    106    mova      [dstq+stride3q ], m0
    107    lea                   dstq, [dstq+strideq*4]
    108    mova      [dstq+strideq*0], m0           ; last 4 rows
    109    mova      [dstq+strideq*1], m0
    110    mova      [dstq+strideq*2], m0
    111    mova      [dstq+stride3q ], m0
    112    RET
    113 
    114 INIT_XMM sse
    115 cglobal vp9_ipred_v_16x16_16, 2, 4, 2, dst, stride, l, a  ; vertical pred, 16x16, 16bpp: above row = two xmm regs
    116    movifnidn               aq, amp          ; fetch "above" pointer from the stack on x86-32
    117    mova                    m0, [aq]         ; above pixels 0-7
    118    mova                    m1, [aq+mmsize]  ; above pixels 8-15
    119    DEFINE_ARGS dst, stride, stride3, cnt
    120    lea               stride3q, [strideq*3]
    121    mov                   cntd, 4            ; 4 iterations x 4 rows = 16 rows
    122 .loop:
    123    mova   [dstq+strideq*0+ 0], m0
    124    mova   [dstq+strideq*0+16], m1
    125    mova   [dstq+strideq*1+ 0], m0
    126    mova   [dstq+strideq*1+16], m1
    127    mova   [dstq+strideq*2+ 0], m0
    128    mova   [dstq+strideq*2+16], m1
    129    mova   [dstq+stride3q + 0], m0
    130    mova   [dstq+stride3q +16], m1
    131    lea                   dstq, [dstq+strideq*4]
    132    dec               cntd
    133    jg .loop
    134    RET
    135 
    136 INIT_XMM sse
    137 cglobal vp9_ipred_v_32x32_16, 2, 4, 4, dst, stride, l, a  ; vertical pred, 32x32, 16bpp: above row = four xmm regs
    138    movifnidn               aq, amp          ; fetch "above" pointer from the stack on x86-32
    139    mova                    m0, [aq+mmsize*0]  ; above pixels 0-7
    140    mova                    m1, [aq+mmsize*1]  ; above pixels 8-15
    141    mova                    m2, [aq+mmsize*2]  ; above pixels 16-23
    142    mova                    m3, [aq+mmsize*3]  ; above pixels 24-31
    143    DEFINE_ARGS dst, stride, cnt
    144    mov                   cntd, 16           ; 16 iterations x 2 rows = 32 rows
    145 .loop:
    146    mova   [dstq+strideq*0+ 0], m0
    147    mova   [dstq+strideq*0+16], m1
    148    mova   [dstq+strideq*0+32], m2
    149    mova   [dstq+strideq*0+48], m3
    150    mova   [dstq+strideq*1+ 0], m0
    151    mova   [dstq+strideq*1+16], m1
    152    mova   [dstq+strideq*1+32], m2
    153    mova   [dstq+strideq*1+48], m3
    154    lea                   dstq, [dstq+strideq*2]
    155    dec               cntd
    156    jg .loop
    157    RET
    158 
    159 INIT_MMX mmxext
    160 cglobal vp9_ipred_h_4x4_16, 3, 3, 4, dst, stride, l, a  ; horizontal pred, 4x4, 16bpp: each row filled with one left pixel
    161    mova                    m3, [lq]         ; m3 = 4 left pixels
    162    DEFINE_ARGS dst, stride, stride3
    163    lea               stride3q, [strideq*3]
    164    pshufw                  m0, m3, q3333    ; broadcast l[3] -> row 0 (highest-indexed left pixel is the top row)
    165    pshufw                  m1, m3, q2222    ; broadcast l[2] -> row 1
    166    pshufw                  m2, m3, q1111    ; broadcast l[1] -> row 2
    167    pshufw                  m3, m3, q0000    ; broadcast l[0] -> row 3
    168    mova      [dstq+strideq*0], m0
    169    mova      [dstq+strideq*1], m1
    170    mova      [dstq+strideq*2], m2
    171    mova      [dstq+stride3q ], m3
    172    RET
    173 
    174 INIT_XMM sse2
    175 cglobal vp9_ipred_h_8x8_16, 3, 3, 4, dst, stride, l, a  ; horizontal pred, 8x8, 16bpp
    176    mova                    m2, [lq]         ; m2 = 8 left pixels
    177    DEFINE_ARGS dst, stride, stride3
    178    lea               stride3q, [strideq*3]
    179    punpckhwd               m3, m2, m2       ; duplicate words l[4..7] -> dword pairs, ready for pshufd broadcast
    180    pshufd                  m0, m3, q3333    ; broadcast l[7] -> row 0
    181    pshufd                  m1, m3, q2222    ; broadcast l[6] -> row 1
    182    mova      [dstq+strideq*0], m0
    183    mova      [dstq+strideq*1], m1
    184    pshufd                  m0, m3, q1111    ; broadcast l[5] -> row 2
    185    pshufd                  m1, m3, q0000    ; broadcast l[4] -> row 3
    186    mova      [dstq+strideq*2], m0
    187    mova      [dstq+stride3q ], m1
    188    lea                   dstq, [dstq+strideq*4]
    189    punpcklwd               m2, m2           ; now duplicate the low half l[0..3] for rows 4-7
    190    pshufd                  m0, m2, q3333    ; broadcast l[3] -> row 4
    191    pshufd                  m1, m2, q2222    ; broadcast l[2] -> row 5
    192    mova      [dstq+strideq*0], m0
    193    mova      [dstq+strideq*1], m1
    194    pshufd                  m0, m2, q1111    ; broadcast l[1] -> row 6
    195    pshufd                  m1, m2, q0000    ; broadcast l[0] -> row 7
    196    mova      [dstq+strideq*2], m0
    197    mova      [dstq+stride3q ], m1
    198    RET
    199 
    200 INIT_XMM sse2
    201 cglobal vp9_ipred_h_16x16_16, 3, 5, 4, dst, stride, l, stride3, cnt  ; horizontal pred, 16x16, 16bpp
    202    mov                   cntd, 3            ; counts 3..0: highest-indexed left pixels fill the top rows
    203    lea               stride3q, [strideq*3]
    204 .loop:
    205    movh                    m3, [lq+cntq*8]  ; load 4 left pixels for this group of 4 rows
    206    punpcklwd               m3, m3           ; duplicate each word so pshufd can broadcast it
    207    pshufd                  m0, m3, q3333    ; 4 per-row broadcasts, highest index first
    208    pshufd                  m1, m3, q2222
    209    pshufd                  m2, m3, q1111
    210    pshufd                  m3, m3, q0000
    211    mova    [dstq+strideq*0+ 0], m0          ; each row is two 16-byte stores wide
    212    mova    [dstq+strideq*0+16], m0
    213    mova    [dstq+strideq*1+ 0], m1
    214    mova    [dstq+strideq*1+16], m1
    215    mova    [dstq+strideq*2+ 0], m2
    216    mova    [dstq+strideq*2+16], m2
    217    mova    [dstq+stride3q + 0], m3
    218    mova    [dstq+stride3q +16], m3
    219    lea                   dstq, [dstq+strideq*4]
    220    dec                   cntd
    221    jge .loop                               ; jge: run the cnt==0 iteration too (4 iterations total)
    222    RET
    223 
    224 INIT_XMM sse2
    225 cglobal vp9_ipred_h_32x32_16, 3, 5, 4, dst, stride, l, stride3, cnt  ; horizontal pred, 32x32, 16bpp
    226    mov                   cntd, 7            ; counts 7..0: 8 groups of 4 rows
    227    lea               stride3q, [strideq*3]
    228 .loop:
    229    movh                    m3, [lq+cntq*8]  ; 4 left pixels for this group of 4 rows
    230    punpcklwd               m3, m3           ; duplicate words for dword broadcast
    231    pshufd                  m0, m3, q3333    ; per-row broadcasts, highest index first
    232    pshufd                  m1, m3, q2222
    233    pshufd                  m2, m3, q1111
    234    pshufd                  m3, m3, q0000
    235    mova   [dstq+strideq*0+ 0], m0           ; each row is four 16-byte stores wide
    236    mova   [dstq+strideq*0+16], m0
    237    mova   [dstq+strideq*0+32], m0
    238    mova   [dstq+strideq*0+48], m0
    239    mova   [dstq+strideq*1+ 0], m1
    240    mova   [dstq+strideq*1+16], m1
    241    mova   [dstq+strideq*1+32], m1
    242    mova   [dstq+strideq*1+48], m1
    243    mova   [dstq+strideq*2+ 0], m2
    244    mova   [dstq+strideq*2+16], m2
    245    mova   [dstq+strideq*2+32], m2
    246    mova   [dstq+strideq*2+48], m2
    247    mova   [dstq+stride3q + 0], m3
    248    mova   [dstq+stride3q +16], m3
    249    mova   [dstq+stride3q +32], m3
    250    mova   [dstq+stride3q +48], m3
    251    lea                   dstq, [dstq+strideq*4]
    252    dec                   cntd
    253    jge .loop                               ; jge: include the cnt==0 iteration (8 total)
    254    RET
    255 
    256 INIT_MMX mmxext
    257 cglobal vp9_ipred_dc_4x4_16, 4, 4, 2, dst, stride, l, a  ; DC pred, 4x4, 16bpp: dc = (sum(l[0..3]) + sum(a[0..3]) + 4) >> 3
    258    mova                    m0, [lq]         ; 4 left pixels
    259    paddw                   m0, [aq]         ; + 4 above pixels, per-lane
    260    DEFINE_ARGS dst, stride, stride3
    261    lea               stride3q, [strideq*3]
    262    pmaddwd                 m0, [pw_1]       ; horizontal add word pairs -> 2 dword partial sums
    263    pshufw                  m1, m0, q3232    ; bring the high partial sum down
    264    paddd                   m0, [pd_4]       ; rounding bias
    265    paddd                   m0, m1           ; total sum in low dword
    266    psrad                   m0, 3            ; /8 (8 edge pixels)
    267    pshufw                  m0, m0, q0000    ; broadcast dc to all 4 words
    268    mova      [dstq+strideq*0], m0
    269    mova      [dstq+strideq*1], m0
    270    mova      [dstq+strideq*2], m0
    271    mova      [dstq+stride3q ], m0
    272    RET
    273 
    274 INIT_XMM sse2
    275 cglobal vp9_ipred_dc_8x8_16, 4, 4, 2, dst, stride, l, a  ; DC pred, 8x8, 16bpp: dc = (sum(left)+sum(above)+8) >> 4
    276    mova                    m0, [lq]         ; 8 left pixels
    277    paddw                   m0, [aq]         ; + 8 above pixels, per-lane
    278    DEFINE_ARGS dst, stride, stride3
    279    lea               stride3q, [strideq*3]
    280    pmaddwd                 m0, [pw_1]       ; horizontal add word pairs -> 4 dword partial sums
    281    pshufd                  m1, m0, q3232    ; reduce 4 dwords -> 2
    282    paddd                   m0, m1
    283    pshufd                  m1, m0, q1111    ; reduce 2 dwords -> 1
    284    paddd                   m0, [pd_8]       ; rounding bias
    285    paddd                   m0, m1
    286    psrad                   m0, 4            ; /16 (16 edge pixels)
    287    pshuflw                 m0, m0, q0000    ; broadcast dc word across low half...
    288    punpcklqdq              m0, m0           ; ...and into the high half
    289    mova      [dstq+strideq*0], m0
    290    mova      [dstq+strideq*1], m0
    291    mova      [dstq+strideq*2], m0
    292    mova      [dstq+stride3q ], m0
    293    lea                   dstq, [dstq+strideq*4]
    294    mova      [dstq+strideq*0], m0
    295    mova      [dstq+strideq*1], m0
    296    mova      [dstq+strideq*2], m0
    297    mova      [dstq+stride3q ], m0
    298    RET
    299 
    300 INIT_XMM sse2
    301 cglobal vp9_ipred_dc_16x16_16, 4, 4, 2, dst, stride, l, a  ; DC pred, 16x16, 16bpp: dc = (sum(left)+sum(above)+16) >> 5
    302    mova                    m0, [lq]         ; accumulate all 32 edge pixels lane-wise (max 8 per lane, fits in 16 bits)
    303    paddw                   m0, [lq+mmsize]
    304    paddw                   m0, [aq]
    305    paddw                   m0, [aq+mmsize]
    306    DEFINE_ARGS dst, stride, stride3, cnt
    307    lea               stride3q, [strideq*3]
    308    mov                   cntd, 4            ; 4 iterations x 4 rows
    309    pmaddwd                 m0, [pw_1]       ; widen + horizontal add -> 4 dword partial sums
    310    pshufd                  m1, m0, q3232    ; dword reduction 4 -> 2
    311    paddd                   m0, m1
    312    pshufd                  m1, m0, q1111    ; dword reduction 2 -> 1
    313    paddd                   m0, [pd_16]      ; rounding bias
    314    paddd                   m0, m1
    315    psrad                   m0, 5            ; /32 (32 edge pixels)
    316    pshuflw                 m0, m0, q0000    ; broadcast dc to all 8 words
    317    punpcklqdq              m0, m0
    318 .loop:
    319    mova   [dstq+strideq*0+ 0], m0
    320    mova   [dstq+strideq*0+16], m0
    321    mova   [dstq+strideq*1+ 0], m0
    322    mova   [dstq+strideq*1+16], m0
    323    mova   [dstq+strideq*2+ 0], m0
    324    mova   [dstq+strideq*2+16], m0
    325    mova   [dstq+stride3q + 0], m0
    326    mova   [dstq+stride3q +16], m0
    327    lea                   dstq, [dstq+strideq*4]
    328    dec                   cntd
    329    jg .loop
    330    RET
    331 
    332 INIT_XMM sse2
    333 cglobal vp9_ipred_dc_32x32_16, 4, 4, 2, dst, stride, l, a  ; DC pred, 32x32, 16bpp: dc = (sum(left)+sum(above)+32) >> 6
    334    mova                    m0, [lq+mmsize*0]  ; accumulate 64 edge pixels lane-wise (8 values/lane, <= 8*4095 so no overflow)
    335    paddw                   m0, [lq+mmsize*1]
    336    paddw                   m0, [lq+mmsize*2]
    337    paddw                   m0, [lq+mmsize*3]
    338    paddw                   m0, [aq+mmsize*0]
    339    paddw                   m0, [aq+mmsize*1]
    340    paddw                   m0, [aq+mmsize*2]
    341    paddw                   m0, [aq+mmsize*3]
    342    DEFINE_ARGS dst, stride, stride3, cnt
    343    lea               stride3q, [strideq*3]
    344    mov                   cntd, 16           ; 16 iterations x 2 rows
    345    pmaddwd                 m0, [pw_1]       ; widen + horizontal add -> 4 dword partial sums
    346    pshufd                  m1, m0, q3232    ; dword reduction 4 -> 2
    347    paddd                   m0, m1
    348    pshufd                  m1, m0, q1111    ; dword reduction 2 -> 1
    349    paddd                   m0, [pd_32]      ; rounding bias
    350    paddd                   m0, m1
    351    psrad                   m0, 6            ; /64 (64 edge pixels)
    352    pshuflw                 m0, m0, q0000    ; broadcast dc to all 8 words
    353    punpcklqdq              m0, m0
    354 .loop:
    355    mova   [dstq+strideq*0+ 0], m0
    356    mova   [dstq+strideq*0+16], m0
    357    mova   [dstq+strideq*0+32], m0
    358    mova   [dstq+strideq*0+48], m0
    359    mova   [dstq+strideq*1+ 0], m0
    360    mova   [dstq+strideq*1+16], m0
    361    mova   [dstq+strideq*1+32], m0
    362    mova   [dstq+strideq*1+48], m0
    363    lea                   dstq, [dstq+strideq*2]
    364    dec                   cntd
    365    jg .loop
    366    RET
    367 
    368 %macro DC_1D_FNS 2                          ; emit dc_%1 (top/left) preds for all sizes; %2 = edge pointer (aq or lq)
    369 INIT_MMX mmxext
    370 cglobal vp9_ipred_dc_%1_4x4_16, 4, 4, 2, dst, stride, l, a  ; one-edge DC, 4x4: dc = (sum(edge[0..3])+2) >> 2
    371    mova                    m0, [%2]         ; 4 edge pixels
    372    DEFINE_ARGS dst, stride, stride3
    373    lea               stride3q, [strideq*3]
    374    pmaddwd                 m0, [pw_1]       ; horizontal add word pairs -> 2 dword partial sums
    375    pshufw                  m1, m0, q3232    ; bring high partial sum down
    376    paddd                   m0, [pd_2]       ; rounding bias
    377    paddd                   m0, m1
    378    psrad                   m0, 2            ; /4 (4 edge pixels)
    379    pshufw                  m0, m0, q0000    ; broadcast dc
    380    mova      [dstq+strideq*0], m0
    381    mova      [dstq+strideq*1], m0
    382    mova      [dstq+strideq*2], m0
    383    mova      [dstq+stride3q ], m0
    384    RET
    385 
    386 INIT_XMM sse2
    387 cglobal vp9_ipred_dc_%1_8x8_16, 4, 4, 2, dst, stride, l, a  ; one-edge DC, 8x8: dc = (sum(edge[0..7])+4) >> 3
    388    mova                    m0, [%2]
    389    DEFINE_ARGS dst, stride, stride3
    390    lea               stride3q, [strideq*3]
    391    pmaddwd                 m0, [pw_1]       ; -> 4 dword partial sums
    392    pshufd                  m1, m0, q3232    ; dword reduction 4 -> 2
    393    paddd                   m0, m1
    394    pshufd                  m1, m0, q1111    ; dword reduction 2 -> 1
    395    paddd                   m0, [pd_4]       ; rounding bias
    396    paddd                   m0, m1
    397    psrad                   m0, 3            ; /8
    398    pshuflw                 m0, m0, q0000    ; broadcast dc to all 8 words
    399    punpcklqdq              m0, m0
    400    mova      [dstq+strideq*0], m0
    401    mova      [dstq+strideq*1], m0
    402    mova      [dstq+strideq*2], m0
    403    mova      [dstq+stride3q ], m0
    404    lea                   dstq, [dstq+strideq*4]
    405    mova      [dstq+strideq*0], m0
    406    mova      [dstq+strideq*1], m0
    407    mova      [dstq+strideq*2], m0
    408    mova      [dstq+stride3q ], m0
    409    RET
    410 
    411 INIT_XMM sse2
    412 cglobal vp9_ipred_dc_%1_16x16_16, 4, 4, 2, dst, stride, l, a  ; one-edge DC, 16x16: dc = (sum(edge[0..15])+8) >> 4
    413    mova                    m0, [%2]
    414    paddw                   m0, [%2+mmsize]  ; lane-wise sum of all 16 edge pixels
    415    DEFINE_ARGS dst, stride, stride3, cnt
    416    lea               stride3q, [strideq*3]
    417    mov                   cntd, 4            ; 4 iterations x 4 rows
    418    pmaddwd                 m0, [pw_1]
    419    pshufd                  m1, m0, q3232
    420    paddd                   m0, m1
    421    pshufd                  m1, m0, q1111
    422    paddd                   m0, [pd_8]       ; rounding bias
    423    paddd                   m0, m1
    424    psrad                   m0, 4            ; /16
    425    pshuflw                 m0, m0, q0000    ; broadcast dc
    426    punpcklqdq              m0, m0
    427 .loop:
    428    mova   [dstq+strideq*0+ 0], m0
    429    mova   [dstq+strideq*0+16], m0
    430    mova   [dstq+strideq*1+ 0], m0
    431    mova   [dstq+strideq*1+16], m0
    432    mova   [dstq+strideq*2+ 0], m0
    433    mova   [dstq+strideq*2+16], m0
    434    mova   [dstq+stride3q + 0], m0
    435    mova   [dstq+stride3q +16], m0
    436    lea                   dstq, [dstq+strideq*4]
    437    dec                   cntd
    438    jg .loop
    439    RET
    440 
    441 INIT_XMM sse2
    442 cglobal vp9_ipred_dc_%1_32x32_16, 4, 4, 2, dst, stride, l, a  ; one-edge DC, 32x32: dc = (sum(edge[0..31])+16) >> 5
    443    mova                    m0, [%2+mmsize*0]
    444    paddw                   m0, [%2+mmsize*1]  ; lane-wise sum of all 32 edge pixels
    445    paddw                   m0, [%2+mmsize*2]
    446    paddw                   m0, [%2+mmsize*3]
    447    DEFINE_ARGS dst, stride, cnt
    448    mov                   cntd, 16           ; 16 iterations x 2 rows
    449    pmaddwd                 m0, [pw_1]
    450    pshufd                  m1, m0, q3232
    451    paddd                   m0, m1
    452    pshufd                  m1, m0, q1111
    453    paddd                   m0, [pd_16]      ; rounding bias
    454    paddd                   m0, m1
    455    psrad                   m0, 5            ; /32
    456    pshuflw                 m0, m0, q0000    ; broadcast dc
    457    punpcklqdq              m0, m0
    458 .loop:
    459    mova   [dstq+strideq*0+ 0], m0
    460    mova   [dstq+strideq*0+16], m0
    461    mova   [dstq+strideq*0+32], m0
    462    mova   [dstq+strideq*0+48], m0
    463    mova   [dstq+strideq*1+ 0], m0
    464    mova   [dstq+strideq*1+16], m0
    465    mova   [dstq+strideq*1+32], m0
    466    mova   [dstq+strideq*1+48], m0
    467    lea                   dstq, [dstq+strideq*2]
    468    dec                   cntd
    469    jg .loop
    470    RET
    471 %endmacro
    472 
    473 DC_1D_FNS top,  aq                          ; dc_top_*: average the above edge only
    474 DC_1D_FNS left, lq                          ; dc_left_*: average the left edge only
    475 
    476 INIT_MMX mmxext
    477 cglobal vp9_ipred_tm_4x4_10, 4, 4, 6, dst, stride, l, a  ; TrueMotion 4x4: pred = clip(left[y] + above[x] - topleft, 0, max)
    478    mova                    m5, [pw_1023]    ; m5 = pixel max for 10-bit
    479 .body:                                      ; shared tail; the 12-bit entry jumps here with m5 = pw_4095
    480    mova                    m4, [aq]         ; 4 above pixels
    481    mova                    m3, [lq]         ; 4 left pixels
    482    movd                    m0, [aq-4]       ; top-left pixel lives just before the above array
    483    pshufw                  m0, m0, q1111    ; broadcast topleft
    484    psubw                   m4, m0           ; m4 = above - topleft (shared across rows)
    485    DEFINE_ARGS dst, stride, stride3
    486    lea               stride3q, [strideq*3]
    487    pshufw                  m0, m3, q3333    ; broadcast l[3] -> row 0
    488    pshufw                  m1, m3, q2222    ; l[2] -> row 1
    489    pshufw                  m2, m3, q1111    ; l[1] -> row 2
    490    pshufw                  m3, m3, q0000    ; l[0] -> row 3
    491    paddw                   m0, m4           ; left + (above - topleft) per row
    492    paddw                   m1, m4
    493    paddw                   m2, m4
    494    paddw                   m3, m4
    495    pxor                    m4, m4           ; m4 = 0 for the lower clamp
    496    pmaxsw                  m0, m4           ; clamp below at 0
    497    pmaxsw                  m1, m4
    498    pmaxsw                  m2, m4
    499    pmaxsw                  m3, m4
    500    pminsw                  m0, m5           ; clamp above at pixel max
    501    pminsw                  m1, m5
    502    pminsw                  m2, m5
    503    pminsw                  m3, m5
    504    mova      [dstq+strideq*0], m0
    505    mova      [dstq+strideq*1], m1
    506    mova      [dstq+strideq*2], m2
    507    mova      [dstq+stride3q ], m3
    508    RET
    509 
    510 cglobal vp9_ipred_tm_4x4_12, 4, 4, 6, dst, stride, l, a  ; 12-bit variant: only the clamp constant differs
    511    mova                    m5, [pw_4095]
    512    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_4x4_10 %+ SUFFIX).body
    513 
    514 INIT_XMM sse2
    515 cglobal vp9_ipred_tm_8x8_10, 4, 5, 7, dst, stride, l, a  ; TrueMotion 8x8: pred = clip(left[y] + above[x] - topleft, 0, max)
    516    mova                    m4, [pw_1023]    ; m4 = pixel max for 10-bit
    517 .body:                                      ; shared tail; 12-bit entry jumps here with m4 = pw_4095
    518    pxor                    m6, m6           ; m6 = 0 for the lower clamp
    519    mova                    m5, [aq]         ; 8 above pixels
    520    movd                    m0, [aq-4]       ; top-left pixel
    521    pshuflw                 m0, m0, q1111    ; broadcast topleft...
    522    punpcklqdq              m0, m0           ; ...across the whole register
    523    psubw                   m5, m0           ; m5 = above - topleft (row-invariant)
    524    DEFINE_ARGS dst, stride, l, stride3, cnt
    525    lea               stride3q, [strideq*3]
    526    mov                   cntd, 1            ; counts 1..0: two groups of 4 rows, top group first
    527 .loop:
    528    movh                    m3, [lq+cntq*8]  ; 4 left pixels for this group
    529    punpcklwd               m3, m3           ; duplicate words for dword broadcast
    530    pshufd                  m0, m3, q3333    ; per-row left broadcasts, highest index first
    531    pshufd                  m1, m3, q2222
    532    pshufd                  m2, m3, q1111
    533    pshufd                  m3, m3, q0000
    534    paddw                   m0, m5           ; left + (above - topleft)
    535    paddw                   m1, m5
    536    paddw                   m2, m5
    537    paddw                   m3, m5
    538    pmaxsw                  m0, m6           ; clamp below at 0
    539    pmaxsw                  m1, m6
    540    pmaxsw                  m2, m6
    541    pmaxsw                  m3, m6
    542    pminsw                  m0, m4           ; clamp above at pixel max
    543    pminsw                  m1, m4
    544    pminsw                  m2, m4
    545    pminsw                  m3, m4
    546    mova      [dstq+strideq*0], m0
    547    mova      [dstq+strideq*1], m1
    548    mova      [dstq+strideq*2], m2
    549    mova      [dstq+stride3q ], m3
    550    lea                   dstq, [dstq+strideq*4]
    551    dec                   cntd
    552    jge .loop                               ; jge: run the cnt==0 group too
    553    RET
    554 
    555 cglobal vp9_ipred_tm_8x8_12, 4, 5, 7, dst, stride, l, a  ; 12-bit variant: only the clamp constant differs
    556    mova                    m4, [pw_4095]
    557    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_8x8_10 %+ SUFFIX).body
    558 
    559 INIT_XMM sse2
    560 cglobal vp9_ipred_tm_16x16_10, 4, 4, 8, dst, stride, l, a  ; TrueMotion 16x16: pred = clip(left[y] + above[x] - topleft, 0, max)
    561    mova                    m7, [pw_1023]    ; m7 = pixel max for 10-bit
    562 .body:                                      ; shared tail; 12-bit entry jumps here with m7 = pw_4095
    563    pxor                    m6, m6           ; m6 = 0 for the lower clamp
    564    mova                    m4, [aq]         ; above pixels 0-7
    565    mova                    m5, [aq+mmsize]  ; above pixels 8-15
    566    movd                    m0, [aq-4]       ; top-left pixel
    567    pshuflw                 m0, m0, q1111    ; broadcast topleft
    568    punpcklqdq              m0, m0
    569    psubw                   m4, m0           ; m4/m5 = above - topleft (row-invariant)
    570    psubw                   m5, m0
    571    DEFINE_ARGS dst, stride, l, cnt
    572    mov                   cntd, 7            ; counts 7..0: 8 pairs of rows
    573 .loop:
    574    movd                    m3, [lq+cntq*4]  ; 2 left pixels for this row pair
    575    punpcklwd               m3, m3           ; duplicate words for dword broadcast
    576    pshufd                  m2, m3, q1111    ; broadcast higher-indexed left pixel (upper row)
    577    pshufd                  m3, m3, q0000    ; broadcast lower-indexed left pixel (lower row)
    578    paddw                   m0, m2, m4       ; 3-operand paddw via x86util: row0 left halves
    579    paddw                   m2, m5           ; row0 right half
    580    paddw                   m1, m3, m4       ; row1 left half
    581    paddw                   m3, m5           ; row1 right half
    582    pmaxsw                  m0, m6           ; clamp below at 0
    583    pmaxsw                  m2, m6
    584    pmaxsw                  m1, m6
    585    pmaxsw                  m3, m6
    586    pminsw                  m0, m7           ; clamp above at pixel max
    587    pminsw                  m2, m7
    588    pminsw                  m1, m7
    589    pminsw                  m3, m7
    590    mova   [dstq+strideq*0+ 0], m0
    591    mova   [dstq+strideq*0+16], m2
    592    mova   [dstq+strideq*1+ 0], m1
    593    mova   [dstq+strideq*1+16], m3
    594    lea                   dstq, [dstq+strideq*2]
    595    dec                   cntd
    596    jge .loop                               ; jge: include the cnt==0 pair
    597    RET
    598 
    599 cglobal vp9_ipred_tm_16x16_12, 4, 4, 8, dst, stride, l, a  ; 12-bit variant: only the clamp constant differs
    600    mova                    m7, [pw_4095]
    601    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_16x16_10 %+ SUFFIX).body
    602 
    603 INIT_XMM sse2
    604 cglobal vp9_ipred_tm_32x32_10, 4, 4, 10, 32 * -ARCH_X86_32, dst, stride, l, a  ; TrueMotion 32x32; 32B stack only on x86-32
    605    mova                    m0, [pw_1023]    ; pixel max for 10-bit
    606 .body:                                      ; shared tail; 12-bit entry jumps here with m0 = pw_4095
    607    pxor                    m1, m1           ; zero for the lower clamp
    608 %if ARCH_X86_64
    609    SWAP                     0, 8            ; park max in m8, zero in m9 (extra regs exist on x86-64)
    610    SWAP                     1, 9
    611 %define reg_min m9
    612 %define reg_max m8
    613 %else
    614    mova              [rsp+ 0], m0           ; x86-32: keep the clamp constants on the stack
    615    mova              [rsp+16], m1
    616 %define reg_min [rsp+16]
    617 %define reg_max [rsp+ 0]
    618 %endif
    619 
    620    mova                    m4, [aq+mmsize*0]  ; above pixels 0-7
    621    mova                    m5, [aq+mmsize*1]  ; above pixels 8-15
    622    mova                    m6, [aq+mmsize*2]  ; above pixels 16-23
    623    mova                    m7, [aq+mmsize*3]  ; above pixels 24-31
    624    movd                    m0, [aq-4]       ; top-left pixel
    625    pshuflw                 m0, m0, q1111    ; broadcast topleft
    626    punpcklqdq              m0, m0
    627    psubw                   m4, m0           ; m4-m7 = above - topleft (row-invariant)
    628    psubw                   m5, m0
    629    psubw                   m6, m0
    630    psubw                   m7, m0
    631    DEFINE_ARGS dst, stride, l, cnt
    632    mov                   cntd, 31           ; counts 31..0: one row per iteration
    633 .loop:
    634    pinsrw                  m3, [lq+cntq*2], 0  ; fetch this row's left pixel into word 0
    635    punpcklwd               m3, m3           ; broadcast it...
    636    pshufd                  m3, m3, q0000    ; ...to all 8 words
    637    paddw                   m0, m3, m4       ; left + (above - topleft), 4 x 8 pixels
    638    paddw                   m1, m3, m5
    639    paddw                   m2, m3, m6
    640    paddw                   m3, m7
    641    pmaxsw                  m0, reg_min      ; clamp below at 0
    642    pmaxsw                  m1, reg_min
    643    pmaxsw                  m2, reg_min
    644    pmaxsw                  m3, reg_min
    645    pminsw                  m0, reg_max      ; clamp above at pixel max
    646    pminsw                  m1, reg_max
    647    pminsw                  m2, reg_max
    648    pminsw                  m3, reg_max
    649    mova   [dstq+strideq*0+ 0], m0
    650    mova   [dstq+strideq*0+16], m1
    651    mova   [dstq+strideq*0+32], m2
    652    mova   [dstq+strideq*0+48], m3
    653    add                   dstq, strideq
    654    dec                   cntd
    655    jge .loop                               ; jge: include the cnt==0 row (32 rows total)
    656    RET
    657 
    658 cglobal vp9_ipred_tm_32x32_12, 4, 4, 10, 32 * -ARCH_X86_32, dst, stride, l, a  ; 12-bit variant: only the clamp constant differs
    659    mova                    m0, [pw_4095]
    660    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_32x32_10 %+ SUFFIX).body
    661 
    662 ; Directional intra prediction functions
    663 ;
    664 ; in the functions below, 'abcdefgh' refers to above data (sometimes simply
    665 ; abbreviated as a[N-M]). 'stuvwxyz' refers to left data (sometimes simply
    666 ; abbreviated as l[N-M]). * is top-left data. ABCDEFG or A[N-M] is filtered
    667 ; above data, STUVWXYZ or L[N-M] is filtered left data, and # is filtered
    668 ; top-left data.
    669 
    670 ; left=(left+2*center+right+2)>>2
    671 %macro LOWPASS 3 ; left [dst], center, right ; 3-tap filter: m%1 = (left + 2*center + right + 2) >> 2
    672    paddw                  m%1, m%3            ; left + right
    673    psraw                  m%1, 1              ; (left + right) >> 1
    674    pavgw                  m%1, m%2            ; pavgw rounds up: ((l+r)>>1 + c + 1) >> 1 == (l + 2c + r + 2) >> 2
    675 %endmacro
    676 
    677 ; abcdefgh (src) -> bcdefghh (dst)
    678 ; dst/src can be the same register
    679 %macro SHIFT_RIGHT 2-3 [pb_2to15_14_15] ; dst, src, [ssse3_shift_reg] ; shift words left by one, repeating the last word
    680 %if cpuflag(ssse3)
    681    pshufb                  %1, %2, %3              ; abcdefgh -> bcdefghh (one byte-shuffle does shift + edge repeat)
    682 %else
    683    psrldq                  %1, %2, 2               ; abcdefgh -> bcdefgh.
    684    pshufhw                 %1, %1, q2210           ; bcdefgh. -> bcdefghh (refill the vacated top word)
    685 %endif
    686 %endmacro
    687 
    688 ; abcdefgh (src) -> bcdefghh (dst1) and cdefghhh (dst2)
    689 %macro SHIFT_RIGHTx2 3-4 [pb_2to15_14_15] ; dst1, dst2, src, [ssse3_shift_reg] ; both 1- and 2-word shifts with edge repeat
    690 %if cpuflag(ssse3)
    691    pshufb                  %1, %3, %4              ; abcdefgh -> bcdefghh
    692    pshufb                  %2, %1, %4              ; bcdefghh -> cdefghhh (reapply the same shuffle)
    693 %else
    694    psrldq                  %1, %3, 2               ; abcdefgh -> bcdefgh.
    695    psrldq                  %2, %3, 4               ; abcdefgh -> cdefgh..
    696    pshufhw                 %1, %1, q2210           ; bcdefgh. -> bcdefghh
    697    pshufhw                 %2, %2, q1110           ; cdefgh.. -> cdefghhh
    698 %endif
    699 %endmacro
    700 
    701 %macro DL_FUNCS 0
    702 cglobal vp9_ipred_dl_4x4_16, 2, 4, 3, dst, stride, l, a
    703    movifnidn               aq, amp
    704    movu                    m1, [aq]                ; abcdefgh
    705    pshufhw                 m0, m1, q3310           ; abcdefhh
    706    SHIFT_RIGHT             m1, m1                  ; bcdefghh
    707    psrldq                  m2, m1, 2               ; cdefghh.
    708    LOWPASS                  0,  1,  2              ; BCDEFGh.
    709    pshufd                  m1, m0, q3321           ; DEFGh...
    710    movh      [dstq+strideq*0], m0
    711    movh      [dstq+strideq*2], m1
    712    add                   dstq, strideq
    713    psrldq                  m0, 2                   ; CDEFGh..
    714    psrldq                  m1, 2                   ; EFGh....
    715    movh      [dstq+strideq*0], m0
    716    movh      [dstq+strideq*2], m1
    717    RET
    718 
    719 cglobal vp9_ipred_dl_8x8_16, 2, 4, 5, dst, stride, l, a
    720    movifnidn               aq, amp
    721    mova                    m0, [aq]                ; abcdefgh
    722 %if cpuflag(ssse3)
    723    mova                    m4, [pb_2to15_14_15]
    724 %endif
    725    SHIFT_RIGHTx2           m1, m2, m0, m4          ; bcdefghh/cdefghhh
    726    LOWPASS                  0,  1,  2              ; BCDEFGHh
    727    shufps                  m1, m0, m2, q3332       ; FGHhhhhh
    728    shufps                  m3, m0, m1, q2121       ; DEFGHhhh
    729    DEFINE_ARGS dst, stride, stride5
    730    lea               stride5q, [strideq*5]
    731 
    732    mova      [dstq+strideq*0], m0
    733    mova      [dstq+strideq*4], m1
    734    SHIFT_RIGHT             m0, m0, m4              ; CDEFGHhh
    735    pshuflw                 m1, m1, q3321           ; GHhhhhhh
    736    pshufd                  m2, m0, q3321           ; EFGHhhhh
    737    mova      [dstq+strideq*1], m0
    738    mova      [dstq+stride5q ], m1
    739    lea                   dstq, [dstq+strideq*2]
    740    pshuflw                 m1, m1, q3321           ; Hhhhhhhh
    741    mova      [dstq+strideq*0], m3
    742    mova      [dstq+strideq*4], m1
    743    pshuflw                 m1, m1, q3321           ; hhhhhhhh
    744    mova      [dstq+strideq*1], m2
    745    mova      [dstq+stride5q ], m1
    746    RET
    747 
    748 cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
; Diagonal down-left intra prediction, 16x16 block, 16 bits per pixel (XMM).
; Top-only mode: only the "above" edge (aq) is read; the left-edge argument
; is never dereferenced.  Lowercase letters in the comments are raw edge
; pixels, uppercase are low-pass filtered ones (LOWPASS macro, defined
; earlier in this file).
    749    movifnidn               aq, amp
    750    mova                    m0, [aq]                ; abcdefgh
    751    mova                    m3, [aq+mmsize]         ; ijklmnop
    752    PALIGNR                 m1, m3, m0, 2, m4       ; bcdefghi
    753    PALIGNR                 m2, m3, m0, 4, m4       ; cdefghij
    754    LOWPASS                  0,  1,  2              ; BCDEFGHI
    755 %if cpuflag(ssse3)
; SHIFT_RIGHTx2/SHIFT_RIGHT use this pshufb mask on SSSE3+ to shift words
; right while replicating the last pixel into the vacated lane.
    756    mova                    m4, [pb_2to15_14_15]
    757 %endif
    758    SHIFT_RIGHTx2           m2, m1, m3, m4          ; jklmnopp/klmnoppp
    759    LOWPASS                  1,  2,  3              ; JKLMNOPp
    760    pshufd                  m2, m2, q3333           ; pppppppp
    761    DEFINE_ARGS dst, stride, cnt
    762    mov                   cntd, 8
    763 
; 8 iterations, each storing two 16-pixel rows (row n and row n+8); the
; prediction is shifted left by one pixel per iteration.
    764 .loop:
    765    mova   [dstq+strideq*0+ 0], m0
    766    mova   [dstq+strideq*0+16], m1
    767    mova   [dstq+strideq*8+ 0], m1
    768    mova   [dstq+strideq*8+16], m2
    769    add                   dstq, strideq
    770 %if cpuflag(avx)
    771    vpalignr                m0, m1, m0, 2
    772 %else
    773    PALIGNR                 m3, m1, m0, 2, m4
    774    mova                    m0, m3
    775 %endif
    776    SHIFT_RIGHT             m1, m1, m4
    777    dec                   cntd
    778    jg .loop
    779    RET
    780 
    781 cglobal vp9_ipred_dl_32x32_16, 2, 5, 7, dst, stride, l, a
; Diagonal down-left intra prediction, 32x32 block, 16bpp (XMM).
; Top-only: reads 32 above pixels (four 8-word registers); the left-edge
; argument is unused.  m0..m3 hold the filtered edge, m4 replicates the
; last pixel for the run-out on the right/bottom.
    782    movifnidn               aq, amp
    783    mova                    m0, [aq+mmsize*0]       ; abcdefgh
    784    mova                    m1, [aq+mmsize*1]       ; ijklmnop
    785    mova                    m2, [aq+mmsize*2]       ; qrstuvwx
    786    mova                    m3, [aq+mmsize*3]       ; yz012345
    787    PALIGNR                 m4, m1, m0, 2, m6
    788    PALIGNR                 m5, m1, m0, 4, m6
    789    LOWPASS                  0,  4,  5              ; BCDEFGHI
    790    PALIGNR                 m4, m2, m1, 2, m6
    791    PALIGNR                 m5, m2, m1, 4, m6
    792    LOWPASS                  1,  4,  5              ; JKLMNOPQ
    793    PALIGNR                 m4, m3, m2, 2, m6
    794    PALIGNR                 m5, m3, m2, 4, m6
    795    LOWPASS                  2,  4,  5              ; RSTUVWXY
    796 %if cpuflag(ssse3)
    797    mova                    m6, [pb_2to15_14_15]
    798 %endif
    799    SHIFT_RIGHTx2           m4, m5, m3, m6
    800    LOWPASS                  3,  4,  5              ; Z0123455
    801    pshufd                  m4, m4, q3333           ; 55555555
    802    DEFINE_ARGS dst, stride, stride8, stride24, cnt
    803    mov                   cntd, 8
    804    lea               stride8q, [strideq*8]
    805    lea              stride24q, [stride8q*3]
    806 
; 8 iterations, each storing four 32-pixel rows (rows n, n+8, n+16, n+24),
; then shifting the whole prediction one pixel left.
    807 .loop:
    808    mova  [dstq+stride8q*0+ 0], m0
    809    mova  [dstq+stride8q*0+16], m1
    810    mova  [dstq+stride8q*0+32], m2
    811    mova  [dstq+stride8q*0+48], m3
    812    mova  [dstq+stride8q*1+ 0], m1
    813    mova  [dstq+stride8q*1+16], m2
    814    mova  [dstq+stride8q*1+32], m3
    815    mova  [dstq+stride8q*1+48], m4
    816    mova  [dstq+stride8q*2+ 0], m2
    817    mova  [dstq+stride8q*2+16], m3
    818    mova  [dstq+stride8q*2+32], m4
    819    mova  [dstq+stride8q*2+48], m4
    820    mova  [dstq+stride24q + 0], m3
    821    mova  [dstq+stride24q +16], m4
    822    mova  [dstq+stride24q +32], m4
    823    mova  [dstq+stride24q +48], m4
    824    add                   dstq, strideq
    825 %if cpuflag(avx)
    826    vpalignr                m0, m1, m0, 2
    827    vpalignr                m1, m2, m1, 2
    828    vpalignr                m2, m3, m2, 2
    829 %else
; Pre-AVX PALIGNR cannot write a third register, so shift via m5.
    830    PALIGNR                 m5, m1, m0, 2, m6
    831    mova                    m0, m5
    832    PALIGNR                 m5, m2, m1, 2, m6
    833    mova                    m1, m5
    834    PALIGNR                 m5, m3, m2, 2, m6
    835    mova                    m2, m5
    836 %endif
    837    SHIFT_RIGHT             m3, m3, m6
    838    dec                   cntd
    839    jg .loop
    840    RET
    841 %endmacro
    842 
    843 INIT_XMM sse2
; Instantiate the DL_FUNCS macro (down-left prediction functions defined
; above) once per instruction set, all on XMM registers.
    844 DL_FUNCS
    845 INIT_XMM ssse3
    846 DL_FUNCS
    847 INIT_XMM avx
    848 DL_FUNCS
    849 
    850 %if HAVE_AVX2_EXTERNAL
    851 INIT_YMM avx2
; AVX2 variant of diagonal down-left 16x16/16bpp: the full 16-pixel above
; edge fits in one YMM register, so the filtered edge is built once and
; each row is produced by cross-lane vpalignr shifts.
    852 cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
    853    movifnidn               aq, amp
    854    mova                    m0, [aq]                   ; abcdefghijklmnop
    855    vpbroadcastw           xm1, [aq+30]                ; pppppppp
    856    vperm2i128              m2, m0, m1, q0201          ; ijklmnoppppppppp
    857    vpalignr                m3, m2, m0, 2              ; bcdefghijklmnopp
    858    vpalignr                m4, m2, m0, 4              ; cdefghijklmnoppp
    859    LOWPASS                  0,  3,  4                 ; BCDEFGHIJKLMNOPp
    860    vperm2i128              m2, m0, m1, q0201          ; JKLMNOPppppppppp
    861    DEFINE_ARGS dst, stride, stride3, cnt
    862    mov                   cntd, 2
    863    lea               stride3q, [strideq*3]
    864 
; Two iterations of 8 rows each; m0 holds the current row, m2 the next
; 16 pixels to shift in (saturating to the replicated last pixel).
    865 .loop:
    866    mova      [dstq+strideq*0], m0
    867    vpalignr                m3, m2, m0, 2
    868    vpalignr                m4, m2, m0, 4
    869    mova      [dstq+strideq*1], m3
    870    mova      [dstq+strideq*2], m4
    871    vpalignr                m3, m2, m0, 6
    872    vpalignr                m4, m2, m0, 8
    873    mova      [dstq+stride3q ], m3
    874    lea                   dstq, [dstq+strideq*4]
    875    mova      [dstq+strideq*0], m4
    876    vpalignr                m3, m2, m0, 10
    877    vpalignr                m4, m2, m0, 12
    878    mova      [dstq+strideq*1], m3
    879    mova      [dstq+strideq*2], m4
    880    vpalignr                m3, m2, m0, 14
    881    mova      [dstq+stride3q ], m3
    882    lea                   dstq, [dstq+strideq*4]
    883    mova                    m0, m2
    884    vperm2i128              m2, m2, m2, q0101          ; pppppppppppppppp
    885    dec                   cntd
    886    jg .loop
    887    RET
    888 
    889 cglobal vp9_ipred_dl_32x32_16, 2, 6, 7, dst, stride, l, a
; AVX2 diagonal down-left, 32x32, 16bpp.  The 32-pixel above edge lives in
; two YMM registers (m0/m1); m5 and m2 hold the cross-lane "next 16 pixels"
; for each half so every row is two vpalignr shifts.
    890    movifnidn               aq, amp
    891    mova                    m0, [aq+mmsize*0+ 0]       ; abcdefghijklmnop
    892    mova                    m1, [aq+mmsize*1+ 0]       ; qrstuvwxyz012345
    893    vpbroadcastw           xm4, [aq+mmsize*1+30]       ; 55555555
    894    vperm2i128              m5, m0, m1, q0201          ; ijklmnopqrstuvwx
    895    vpalignr                m2, m5, m0, 2              ; bcdefghijklmnopq
    896    vpalignr                m3, m5, m0, 4              ; cdefghijklmnopqr
    897    LOWPASS                  0,  2,  3                 ; BCDEFGHIJKLMNOPQ
    898    vperm2i128              m5, m1, m4, q0201          ; yz01234555555555
    899    vpalignr                m2, m5, m1, 2              ; rstuvwxyz0123455
    900    vpalignr                m3, m5, m1, 4              ; stuvwxyz01234555
    901    LOWPASS                  1,  2,  3                 ; RSTUVWXYZ......5
    902    vperm2i128              m2, m1, m4, q0201          ; Z......555555555
    903    vperm2i128              m5, m0, m1, q0201          ; JKLMNOPQRSTUVWXY
    904    DEFINE_ARGS dst, stride, stride3, cnt
    905    lea               stride3q, [strideq*3]
    906    mov                   cntd, 4
    907 
; 4 iterations of 8 rows each; after 8 rows the working registers advance
; by a full 16 pixels (vpalignr by 16 + lane rotation).
    908 .loop:
    909    mova   [dstq+strideq*0 + 0], m0
    910    mova   [dstq+strideq*0 +32], m1
    911    vpalignr                 m3, m5, m0, 2
    912    vpalignr                 m4, m2, m1, 2
    913    mova   [dstq+strideq*1 + 0], m3
    914    mova   [dstq+strideq*1 +32], m4
    915    vpalignr                 m3, m5, m0, 4
    916    vpalignr                 m4, m2, m1, 4
    917    mova   [dstq+strideq*2 + 0], m3
    918    mova   [dstq+strideq*2 +32], m4
    919    vpalignr                 m3, m5, m0, 6
    920    vpalignr                 m4, m2, m1, 6
    921    mova   [dstq+stride3q*1+ 0], m3
    922    mova   [dstq+stride3q*1+32], m4
    923    lea                    dstq, [dstq+strideq*4]
    924    vpalignr                 m3, m5, m0, 8
    925    vpalignr                 m4, m2, m1, 8
    926    mova   [dstq+strideq*0 + 0], m3
    927    mova   [dstq+strideq*0 +32], m4
    928    vpalignr                 m3, m5, m0, 10
    929    vpalignr                 m4, m2, m1, 10
    930    mova   [dstq+strideq*1 + 0], m3
    931    mova   [dstq+strideq*1 +32], m4
    932    vpalignr                 m3, m5, m0, 12
    933    vpalignr                 m4, m2, m1, 12
    934    mova   [dstq+strideq*2+ 0], m3
    935    mova   [dstq+strideq*2+32], m4
    936    vpalignr                 m3, m5, m0, 14
    937    vpalignr                 m4, m2, m1, 14
    938    mova   [dstq+stride3q+  0], m3
    939    mova   [dstq+stride3q+ 32], m4
    940    vpalignr                 m3, m5, m0, 16
    941    vpalignr                 m4, m2, m1, 16
    942    vperm2i128               m5, m3, m4, q0201
    943    vperm2i128               m2, m4, m4, q0101
    944    mova                     m0, m3
    945    mova                     m1, m4
    946    lea                    dstq, [dstq+strideq*4]
    947    dec                    cntd
    948    jg .loop
    949    RET
    950 %endif
    951 
    952 %macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function
; Diagonal down-right intra prediction, 4x4, 16bpp.  Uses both edges:
; 4 left pixels (wxyz), the top-left corner (* at aq-2) and 4 above
; pixels (abcd).  The single filtered diagonal XYZ#ABC is written shifted
; by one pixel per row, bottom row first.
    953 cglobal vp9_ipred_dr_4x4_16, 4, 4, 3, dst, stride, l, a
    954    movh                    m0, [lq]                ; wxyz....
    955    movhps                  m0, [aq-2]              ; wxyz*abc
    956    movd                    m1, [aq+6]              ; d.......
    957    PALIGNR                 m1, m0, 2, m2           ; xyz*abcd
    958    psrldq                  m2, m1, 2               ; yz*abcd.
    959    LOWPASS                  0, 1, 2                ; XYZ#ABC.
    960    DEFINE_ARGS dst, stride, stride3
    961    lea               stride3q, [strideq*3]
    962 
    963    movh      [dstq+stride3q ], m0
    964    psrldq                  m0, 2                   ; YZ#ABC..
    965    movh      [dstq+strideq*2], m0
    966    psrldq                  m0, 2                   ; Z#ABC...
    967    movh      [dstq+strideq*1], m0
    968    psrldq                  m0, 2                   ; #ABC....
    969    movh      [dstq+strideq*0], m0
    970    RET
    971 
; Diagonal down-right intra prediction, 8x8, 16bpp.  m2 holds the filtered
; left+corner diagonal (TUVWXYZ#), m3 the filtered above pixels (ABCDEFG);
; each row uses m2's high half plus the low words of m3, shifting one pixel
; per row.  Rows are written bottom-up, upper and lower 4-row halves
; together (dstq / dst4q).
    972 cglobal vp9_ipred_dr_8x8_16, 4, 4, 5, dst, stride, l, a
    973    mova                    m0, [lq]                ; stuvwxyz
    974    movu                    m1, [aq-2]              ; *abcdefg
    975    mova                    m2, [aq]                ; abcdefgh
    976    psrldq                  m3, m2, 2               ; bcdefgh.
    977    LOWPASS                  3,  2, 1               ; ABCDEFG.
    978    PALIGNR                 m1, m0, 2, m4           ; tuvwxyz*
    979    PALIGNR                 m2, m1, 2, m4           ; uvwxyz*a
    980    LOWPASS                  2,  1, 0               ; TUVWXYZ#
    981    DEFINE_ARGS dst, stride, dst4, stride3
    982    lea               stride3q, [strideq*3]
    983    lea                  dst4q, [dstq+strideq*4]
    984 
    985    movhps [dstq +stride3q +0], m2
    986    movh   [dstq+ stride3q +8], m3
    987    mova   [dst4q+stride3q +0], m2
    988    PALIGNR                 m1, m3, m2, 2, m0
    989    psrldq                  m3, 2
    990    movhps [dstq +strideq*2+0], m1
    991    movh   [dstq+ strideq*2+8], m3
    992    mova   [dst4q+strideq*2+0], m1
    993    PALIGNR                 m2, m3, m1, 2, m0
    994    psrldq                  m3, 2
    995    movhps [dstq +strideq*1+0], m2
    996    movh   [dstq+ strideq*1+8], m3
    997    mova   [dst4q+strideq*1+0], m2
    998    PALIGNR                 m1, m3, m2, 2, m0
    999    psrldq                  m3, 2
   1000    movhps [dstq +strideq*0+0], m1
   1001    movh   [dstq+ strideq*0+8], m3
   1002    mova   [dst4q+strideq*0+0], m1
   1003    RET
   1004 
; Diagonal down-right intra prediction, 16x16, 16bpp (XMM).  Filters the
; 16 left pixels, the corner and the 16 above pixels into m2/m4/m5/m6
; (left-low, left-high+corner, above-low, above-high), then walks upward
; (sub dst8q, strideq) writing row pairs n and n+8 while shifting the
; whole diagonal one pixel left per iteration.
   1005 cglobal vp9_ipred_dr_16x16_16, 4, 4, 7, dst, stride, l, a
   1006    mova                    m0, [lq]                ; klmnopqr
   1007    mova                    m1, [lq+mmsize]         ; stuvwxyz
   1008    movu                    m2, [aq-2]              ; *abcdefg
   1009    movu                    m3, [aq+mmsize-2]       ; hijklmno
   1010    mova                    m4, [aq]                ; abcdefgh
   1011    mova                    m5, [aq+mmsize]         ; ijklmnop
   1012    psrldq                  m6, m5, 2               ; jklmnop.
   1013    LOWPASS                  6,  5, 3               ; IJKLMNO.
   1014    PALIGNR                 m5, m4, 2, m3           ; bcdefghi
   1015    LOWPASS                  5,  4, 2               ; ABCDEFGH
   1016    PALIGNR                 m2, m1, 2, m3           ; tuvwxyz*
   1017    PALIGNR                 m4, m2, 2, m3           ; uvwxyz*a
   1018    LOWPASS                  4,  2, 1               ; TUVWXYZ#
   1019    PALIGNR                 m1, m0, 2, m3           ; lmnopqrs
   1020    PALIGNR                 m2, m1, 2, m3           ; mnopqrst
   1021    LOWPASS                  2, 1, 0                ; LMNOPQRS
   1022    DEFINE_ARGS dst, stride, dst8, cnt
   1023    lea                  dst8q, [dstq+strideq*8]
   1024    mov                   cntd, 8
   1025 
   1026 .loop:
   1027    sub                  dst8q, strideq
   1028    mova  [dst8q+strideq*0+ 0], m4
   1029    mova  [dst8q+strideq*0+16], m5
   1030    mova  [dst8q+strideq*8+ 0], m2
   1031    mova  [dst8q+strideq*8+16], m4
   1032 %if cpuflag(avx)
   1033    vpalignr                m2, m4, m2, 2
   1034    vpalignr                m4, m5, m4, 2
   1035    vpalignr                m5, m6, m5, 2
   1036 %else
; Pre-AVX: shift the three live registers via scratch m0.
   1037    PALIGNR                 m0, m4, m2, 2, m1
   1038    mova                    m2, m0
   1039    PALIGNR                 m0, m5, m4, 2, m1
   1040    mova                    m4, m0
   1041    PALIGNR                 m0, m6, m5, 2, m1
   1042    mova                    m5, m0
   1043 %endif
   1044    psrldq                  m6, 2
   1045    dec                   cntd
   1046    jg .loop
   1047    RET
   1048 
; Diagonal down-right intra prediction, 32x32, 16bpp (XMM).  Needs more
; registers than x86-32 provides, so on 32-bit the macro parameter %1
; reserves stack slots and SCRATCH/UNSCRATCH spill/reload registers
; m8+ there (on x86-64 they are real registers).  The filtered above
; pixels live in m0..m3 (+spills), the filtered left+corner pixels in
; m4..m7; each loop iteration writes four rows (n, n+8, n+16, n+24,
; walking upward) and shifts the whole 64-pixel diagonal one pixel left.
   1049 cglobal vp9_ipred_dr_32x32_16, 4, 5, 10 + notcpuflag(ssse3), \
   1050                               %1 * ARCH_X86_32 * -mmsize, dst, stride, l, a
   1051    mova                    m0, [aq+mmsize*3]       ; a[24-31]
   1052    movu                    m1, [aq+mmsize*3-2]     ; a[23-30]
   1053    psrldq                  m2, m0, 2               ; a[25-31].
   1054    LOWPASS                  2,  0, 1               ; A[24-30].
   1055    mova                    m1, [aq+mmsize*2]       ; a[16-23]
   1056    movu                    m3, [aq+mmsize*2-2]     ; a[15-22]
   1057    PALIGNR                 m0, m1, 2, m4           ; a[17-24]
   1058    LOWPASS                  0,  1, 3               ; A[16-23]
   1059    mova                    m3, [aq+mmsize*1]       ; a[8-15]
   1060    movu                    m4, [aq+mmsize*1-2]     ; a[7-14]
   1061    PALIGNR                 m1, m3, 2, m5           ; a[9-16]
   1062    LOWPASS                  1,  3, 4               ; A[8-15]
   1063    mova                    m4, [aq+mmsize*0]       ; a[0-7]
   1064    movu                    m5, [aq+mmsize*0-2]     ; *a[0-6]
   1065    PALIGNR                 m3, m4, 2, m6           ; a[1-8]
   1066    LOWPASS                  3,  4, 5               ; A[0-7]
   1067    SCRATCH                  1,  8, rsp+0*mmsize
   1068    SCRATCH                  3,  9, rsp+1*mmsize
   1069 %if notcpuflag(ssse3)
   1070    SCRATCH                  0, 10, rsp+2*mmsize
   1071 %endif
   1072    mova                    m6, [lq+mmsize*3]       ; l[24-31]
   1073    PALIGNR                 m5, m6, 2, m0           ; l[25-31]*
   1074    PALIGNR                 m4, m5, 2, m0           ; l[26-31]*a
   1075    LOWPASS                  4,  5, 6               ; L[25-31]#
   1076    mova                    m7, [lq+mmsize*2]       ; l[16-23]
   1077    PALIGNR                 m6, m7, 2, m0           ; l[17-24]
   1078    PALIGNR                 m5, m6, 2, m0           ; l[18-25]
   1079    LOWPASS                  5,  6, 7               ; L[17-24]
   1080    mova                    m1, [lq+mmsize*1]       ; l[8-15]
   1081    PALIGNR                 m7, m1, 2, m0           ; l[9-16]
   1082    PALIGNR                 m6, m7, 2, m0           ; l[10-17]
   1083    LOWPASS                  6,  7, 1               ; L[9-16]
   1084    mova                    m3, [lq+mmsize*0]       ; l[0-7]
   1085    PALIGNR                 m1, m3, 2, m0           ; l[1-8]
   1086    PALIGNR                 m7, m1, 2, m0           ; l[2-9]
   1087    LOWPASS                  7,  1, 3               ; L[1-8]
   1088 %if cpuflag(ssse3)
   1089 %if cpuflag(avx)
   1090    UNSCRATCH                1,  8, rsp+0*mmsize
   1091 %endif
   1092    UNSCRATCH                3,  9, rsp+1*mmsize
   1093 %else
   1094    UNSCRATCH                0, 10, rsp+2*mmsize
   1095 %endif
   1096    DEFINE_ARGS dst8, stride, stride8, stride24, cnt
   1097    lea               stride8q, [strideq*8]
   1098    lea              stride24q, [stride8q*3]
   1099    lea                  dst8q, [dst8q+strideq*8]
   1100    mov                   cntd, 8
   1101 
   1102 .loop:
   1103    sub                  dst8q, strideq
; Reload any spilled registers needed for this iteration's stores.
   1104 %if notcpuflag(avx)
   1105    UNSCRATCH                1,  8, rsp+0*mmsize
   1106 %if notcpuflag(ssse3)
   1107    UNSCRATCH                3,  9, rsp+1*mmsize
   1108 %endif
   1109 %endif
   1110    mova [dst8q+stride8q*0+ 0], m4
   1111    mova [dst8q+stride8q*0+16], m3
   1112    mova [dst8q+stride8q*0+32], m1
   1113    mova [dst8q+stride8q*0+48], m0
   1114    mova [dst8q+stride8q*1+ 0], m5
   1115    mova [dst8q+stride8q*1+16], m4
   1116    mova [dst8q+stride8q*1+32], m3
   1117    mova [dst8q+stride8q*1+48], m1
   1118    mova [dst8q+stride8q*2+ 0], m6
   1119    mova [dst8q+stride8q*2+16], m5
   1120    mova [dst8q+stride8q*2+32], m4
   1121    mova [dst8q+stride8q*2+48], m3
   1122    mova [dst8q+stride24q + 0], m7
   1123    mova [dst8q+stride24q +16], m6
   1124    mova [dst8q+stride24q +32], m5
   1125    mova [dst8q+stride24q +48], m4
; Shift the full m7..m2 chain left by one pixel for the next row set.
   1126 %if cpuflag(avx)
   1127    vpalignr                m7, m6, m7, 2
   1128    vpalignr                m6, m5, m6, 2
   1129    vpalignr                m5, m4, m5, 2
   1130    vpalignr                m4, m3, m4, 2
   1131    vpalignr                m3, m1, m3, 2
   1132    vpalignr                m1, m0, m1, 2
   1133    vpalignr                m0, m2, m0, 2
   1134 %else
; Pre-AVX: juggle via m2 and the stack spill slots, since 2-operand
; PALIGNR destroys its destination.
   1135    SCRATCH                  2,  8, rsp+0*mmsize
   1136 %if notcpuflag(ssse3)
   1137    SCRATCH                  0,  9, rsp+1*mmsize
   1138 %endif
   1139    PALIGNR                 m2, m6, m7, 2, m0
   1140    mova                    m7, m2
   1141    PALIGNR                 m2, m5, m6, 2, m0
   1142    mova                    m6, m2
   1143    PALIGNR                 m2, m4, m5, 2, m0
   1144    mova                    m5, m2
   1145    PALIGNR                 m2, m3, m4, 2, m0
   1146    mova                    m4, m2
   1147    PALIGNR                 m2, m1, m3, 2, m0
   1148    mova                    m3, m2
   1149 %if notcpuflag(ssse3)
   1150    UNSCRATCH                0,  9, rsp+1*mmsize
   1151    SCRATCH                  3,  9, rsp+1*mmsize
   1152 %endif
   1153    PALIGNR                 m2, m0, m1, 2, m3
   1154    mova                    m1, m2
   1155    UNSCRATCH                2,  8, rsp+0*mmsize
   1156    SCRATCH                  1,  8, rsp+0*mmsize
   1157    PALIGNR                 m1, m2, m0, 2, m3
   1158    mova                    m0, m1
   1159 %endif
   1160    psrldq                  m2, 2
   1161    dec                   cntd
   1162    jg .loop
   1163    RET
   1164 %endmacro
   1165 
   1166 INIT_XMM sse2
; Instantiate DR_FUNCS per instruction set; the argument is the number of
; stack spill slots the 32x32 function needs on x86-32 (SSE2 needs one
; extra slot because it lacks pshufb).
   1167 DR_FUNCS 3
   1168 INIT_XMM ssse3
   1169 DR_FUNCS 2
   1170 INIT_XMM avx
   1171 DR_FUNCS 2
   1172 
   1173 %if HAVE_AVX2_EXTERNAL
   1174 INIT_YMM avx2
; AVX2 diagonal down-right, 16x16, 16bpp.  The filtered left diagonal
; (m0), the filtered above row (m1) and the bridging register m5 cover
; the full 32-pixel diagonal; all 16 rows are emitted fully unrolled via
; vpalignr shifts.  The trailing "; N" comments give the destination row.
   1175 cglobal vp9_ipred_dr_16x16_16, 4, 5, 6, dst, stride, l, a
   1176    mova                    m0, [lq]                   ; klmnopqrstuvwxyz
   1177    movu                    m1, [aq-2]                 ; *abcdefghijklmno
   1178    mova                    m2, [aq]                   ; abcdefghijklmnop
   1179    vperm2i128              m4, m2, m2, q2001          ; ijklmnop........
   1180    vpalignr                m5, m4, m2, 2              ; bcdefghijklmnop.
   1181    vperm2i128              m3, m0, m1, q0201          ; stuvwxyz*abcdefg
   1182    LOWPASS                  1,  2,  5                 ; ABCDEFGHIJKLMNO.
   1183    vpalignr                m4, m3, m0, 2              ; lmnopqrstuvwxyz*
   1184    vpalignr                m5, m3, m0, 4              ; mnopqrstuvwxyz*a
   1185    LOWPASS                  0,  4,  5                 ; LMNOPQRSTUVWXYZ#
   1186    vperm2i128              m5, m0, m1, q0201          ; TUVWXYZ#ABCDEFGH
   1187    DEFINE_ARGS dst, stride, stride3, stride5, dst3
   1188    lea                  dst3q, [dstq+strideq*4]
   1189    lea               stride3q, [strideq*3]
   1190    lea               stride5q, [stride3q+strideq*2]
   1191 
   1192    vpalignr                m3, m5, m0, 2
   1193    vpalignr                m4, m1, m5, 2
   1194    mova    [dst3q+stride5q*2], m3                     ; 14
   1195    mova    [ dstq+stride3q*2], m4                     ; 6
   1196    vpalignr                m3, m5, m0, 4
   1197    vpalignr                m4, m1, m5, 4
   1198    sub                  dst3q, strideq
   1199    mova    [dst3q+stride5q*2], m3                     ; 13
   1200    mova    [dst3q+strideq*2 ], m4                     ; 5
   1201    mova    [dst3q+stride3q*4], m0                     ; 15
   1202    vpalignr                m3, m5, m0, 6
   1203    vpalignr                m4, m1, m5, 6
   1204    mova     [dstq+stride3q*4], m3                     ; 12
   1205    mova     [dst3q+strideq*1], m4                     ; 4
   1206    vpalignr                m3, m5, m0, 8
   1207    vpalignr                m4, m1, m5, 8
   1208    mova     [dst3q+strideq*8], m3                     ; 11
   1209    mova     [dst3q+strideq*0], m4                     ; 3
   1210    vpalignr                m3, m5, m0, 10
   1211    vpalignr                m4, m1, m5, 10
   1212    mova     [dstq+stride5q*2], m3                     ; 10
   1213    mova     [dstq+strideq*2 ], m4                     ; 2
   1214    vpalignr                m3, m5, m0, 12
   1215    vpalignr                m4, m1, m5, 12
   1216    mova    [dst3q+stride3q*2], m3                     ; 9
   1217    mova     [dstq+strideq*1 ], m4                     ; 1
   1218    vpalignr                m3, m5, m0, 14
   1219    vpalignr                m4, m1, m5, 14
   1220    mova      [dstq+strideq*8], m3                     ; 8
   1221    mova      [dstq+strideq*0], m4                     ; 0
   1222    mova     [dst3q+strideq*4], m5                     ; 7
   1223    RET
   1224 
; AVX2 vertical-left intra prediction, 16x16, 16bpp.  Top-only: reads the
; above edge only.  Even output rows come from the averaged edge pairs
; (pavgw, m3/m4), odd rows from the low-pass filtered edge (m5/m6); each
; further row pair shifts one pixel left via vpalignr.  Trailing "; N"
; comments give the destination row.
   1226 cglobal vp9_ipred_vl_16x16_16, 4, 5, 7, dst, stride, l, a
   1227    movifnidn               aq, amp
   1228    mova                    m0, [aq]                   ; abcdefghijklmnop
   1229    vpbroadcastw           xm1, [aq+30]                ; pppppppp
   1230    vperm2i128              m2, m0, m1, q0201          ; ijklmnoppppppppp
   1231    vpalignr                m3, m2, m0, 2              ; bcdefghijklmnopp
   1232    vperm2i128              m4, m3, m1, q0201          ; jklmnopppppppppp
   1233    vpalignr                m5, m2, m0, 4              ; cdefghijklmnoppp
   1234    vperm2i128              m6, m5, m1, q0201          ; klmnoppppppppppp
   1235    LOWPASS                  5,  3,  0                 ; BCDEFGHIJKLMNOPP
   1236    LOWPASS                  6,  4,  2                 ; JKLMNOPPPPPPPPPP
   1237    pavgw                   m3, m0                     ; abcdefghijklmnop
   1238    pavgw                   m4, m2                     ; ijklmnoppppppppp
   1239    DEFINE_ARGS dst, stride, stride3, stride5, dst4
   1240    lea                  dst4q, [dstq+strideq*4]
   1241    lea               stride3q, [strideq*3]
   1242    lea               stride5q, [stride3q+strideq*2]
   1243 
   1244    mova      [dstq+strideq*0], m3                     ; 0  abcdefghijklmnop
   1245    mova      [dstq+strideq*1], m5                     ; 1  BCDEFGHIJKLMNOPP
   1245    vpalignr                m0, m4, m3, 2
   1246    vpalignr                m1, m6, m5, 2
   1247    mova     [dstq+strideq*2 ], m0                     ; 2  bcdefghijklmnopp
   1248    mova     [dstq+stride3q*1], m1                     ; 3  CDEFGHIJKLMNOPPP
   1249    vpalignr                m0, m4, m3, 4
   1250    vpalignr                m1, m6, m5, 4
   1251    mova     [dst4q+strideq*0], m0                     ; 4  cdefghijklmnoppp
   1252    mova     [dstq+stride5q*1], m1                     ; 5  DEFGHIJKLMNOPPPP
   1253    vpalignr                m0, m4, m3, 6
   1254    vpalignr                m1, m6, m5, 6
   1255    mova    [ dstq+stride3q*2], m0                     ; 6  defghijklmnopppp
   1256    mova    [dst4q+stride3q*1], m1                     ; 7  EFGHIJKLMNOPPPPP
   1257    vpalignr                m0, m4, m3, 8
   1258    vpalignr                m1, m6, m5, 8
   1259    mova    [  dstq+strideq*8], m0                     ; 8  efghijklmnoppppp
   1260    mova    [dst4q+stride5q*1], m1                     ; 9  FGHIJKLMNOPPPPPP
   1261    vpalignr                m0, m4, m3, 10
   1262    mova     [dstq+stride5q*2], m0                     ; 10 fghijklmnopppppp
   1263    vpalignr                m0, m4, m3, 12
   1264    mova     [dst4q+strideq*8], m0                     ; 12 ghijklmnoppppppp
   1265    vpalignr                m0, m4, m3, 14
   1266    mova    [dst4q+stride5q*2], m0                     ; 14 hijklmnopppppppp
   1267    sub                  dst4q, strideq
   1268    vpalignr                m1, m6, m5, 10
   1269    mova     [dst4q+strideq*8], m1                     ; 11 GHIJKLMNOPPPPPPP
   1270    vpalignr                m1, m6, m5, 12
   1271    mova    [dst4q+stride5q*2], m1                     ; 13 HIJKLMNOPPPPPPPP
   1272    vpalignr                m1, m6, m5, 14
   1273    mova    [dst4q+stride3q*4], m1                     ; 15 IJKLMNOPPPPPPPPP
   1274    RET
   1275 
; AVX2 horizontal-down intra prediction, 16x16, 16bpp.  Interleaves the
; averaged left pixels (pavgw) with the low-pass filtered diagonal via
; punpcklwd/punpckhwd (mixed-case pairs like "kL" in the comments), then
; emits all 16 rows unrolled with vpalignr shifts.  Trailing "; N"
; comments give the destination row.
   1276 cglobal vp9_ipred_hd_16x16_16, 4, 5, 7, dst, stride, l, a
   1277    movu                    m0, [aq-2]                 ; *abcdefghijklmno
   1278    mova                    m1, [lq]                   ; klmnopqrstuvwxyz
   1279    vperm2i128              m2, m1, m0, q0201          ; stuvwxyz*abcdefg
   1280    vpalignr                m3, m2, m1, 2              ; lmnopqrstuvwxyz*
   1281    vpalignr                m4, m2, m1, 4              ; mnopqrstuvwxyz*a
   1282    LOWPASS                  4,  3,  1                 ; LMNOPQRSTUVWXYZ#
   1283    pavgw                   m3, m1                     ; klmnopqrstuvwxyz
   1284    mova                    m1, [aq]                   ; abcdefghijklmnop
   1285    movu                    m2, [aq+2]                 ; bcdefghijklmnop.
   1286    LOWPASS                  2,  1,  0                 ; ABCDEFGHIJKLMNO.
   1287    vpunpcklwd              m0, m3, m4                 ; kLlMmNnOsTtUuVvW
   1288    vpunpckhwd              m1, m3, m4                 ; oPpQqRrSwXxYyZz#
   1289    vperm2i128              m3, m1, m0, q0002          ; kLlMmNnOoPpQqRrS
   1290    vperm2i128              m4, m0, m1, q0301          ; sTtUuVvWwXxYyZz#
   1291    vperm2i128              m0, m4, m2, q0201          ; wXxYyZz#ABCDEFGH
   1292    vperm2i128              m1, m3, m4, q0201          ; oPpQqRrSsTtUuVvW
   1293    DEFINE_ARGS dst, stride, stride3, stride5, dst5
   1294    lea               stride3q, [strideq*3]
   1295    lea               stride5q, [stride3q+strideq*2]
   1296    lea                  dst5q, [dstq+stride5q]
   1297 
   1298    mova    [dst5q+stride5q*2], m3                     ; 15 kLlMmNnOoPpQqRrS
   1299    mova    [dst5q+stride3q*2], m1                     ; 11 oPpQqRrSsTtUuVvW
   1300    mova     [dst5q+strideq*2], m4                     ; 7  sTtUuVvWwXxYyZz#
   1301    mova     [dstq+stride3q*1], m0                     ; 3  wXxYyZz#ABCDEFGH
   1302    vpalignr                m5, m4, m1, 4
   1303    mova     [dstq+stride5q*2], m5                     ; 10 pQqRrSsTtUuVvWwX
   1304    vpalignr                m5, m0, m4, 4
   1305    vpalignr                m6, m2, m0, 4
   1306    mova     [dstq+stride3q*2], m5                     ; 6  tUuVvWwXxYyZz#AB
   1307    mova      [dstq+strideq*2], m6                     ; 2  xYyZz#ABCDEFGHIJ
   1308    vpalignr                m5, m4, m1, 8
   1309    mova     [dst5q+strideq*4], m5                     ; 9  qRrSsTtUuVvWwXxY
   1310    vpalignr                m5, m0, m4, 8
   1311    vpalignr                m6, m2, m0, 8
   1312    mova     [dstq+stride5q*1], m5                     ; 5  uVvWwXxYyZz#ABCD
   1313    mova      [dstq+strideq*1], m6                     ; 1  yZz#ABCDEFGHIJKL
   1314    vpalignr                m5, m1, m3, 12
   1315    vpalignr                m6, m4, m1, 12
   1316    mova     [dstq+stride3q*4], m5                     ; 12 nOoPpQqRrSsTtUuV
   1317    mova      [dst5q+stride3q], m6                     ; 8  rSsTtUuVvWwXxYyZ
   1318    vpalignr                m5, m0, m4, 12
   1319    vpalignr                m6, m2, m0, 12
   1320    mova      [dstq+strideq*4], m5                     ; 4  nOoPpQqRrSsTtUuV
   1321    mova      [dstq+strideq*0], m6                     ; 0  z#ABCDEFGHIJKLMN
   1322    sub                  dst5q, strideq
   1323    vpalignr                m5, m1, m3, 4
   1324    mova    [dst5q+stride5q*2], m5                     ; 14 lMmNnOoPpQqRrSsT
   1325    sub                  dst5q, strideq
   1326    vpalignr                m5, m1, m3, 8
   1327    mova    [dst5q+stride5q*2], m5                    ; 13 mNnOoPpQqRrSsTtU
   1328    RET
   1329 
   1330 %if ARCH_X86_64
   1331 cglobal vp9_ipred_dr_32x32_16, 4, 7, 10, dst, stride, l, a
   1332    mova                    m0, [lq+mmsize*0+0]        ; l[0-15]
   1333    mova                    m1, [lq+mmsize*1+0]        ; l[16-31]
   1334    movu                    m2, [aq+mmsize*0-2]        ; *abcdefghijklmno
   1335    mova                    m3, [aq+mmsize*0+0]        ; abcdefghijklmnop
   1336    mova                    m4, [aq+mmsize*1+0]        ; qrstuvwxyz012345
   1337    vperm2i128              m5, m0, m1, q0201          ; lmnopqrstuvwxyz0
   1338    vpalignr                m6, m5, m0, 2              ; mnopqrstuvwxyz01
   1339    vpalignr                m7, m5, m0, 4              ; nopqrstuvwxyz012
   1340    LOWPASS                  0,  6,  7                 ; L[0-15]
   1341    vperm2i128              m7, m1, m2, q0201          ; stuvwxyz*abcdefg
   1342    vpalignr                m5, m7, m1, 2              ; lmnopqrstuvwxyz*
   1343    vpalignr                m6, m7, m1, 4              ; mnopqrstuvwxyz*a
   1344    LOWPASS                  1,  5,  6                 ; L[16-31]#
   1345    vperm2i128              m5, m3, m4, q0201          ; ijklmnopqrstuvwx
   1346    vpalignr                m6, m5, m3, 2              ; bcdefghijklmnopq
   1347    LOWPASS                  2,  3,  6                 ; A[0-15]
   1348    movu                    m3, [aq+mmsize*1-2]        ; pqrstuvwxyz01234
   1349    vperm2i128              m6, m4, m4, q2001          ; yz012345........
   1350    vpalignr                m7, m6, m4, 2              ; rstuvwxyz012345.
   1351    LOWPASS                  3,  4,  7                 ; A[16-31].
   1352    vperm2i128              m4, m1, m2, q0201          ; TUVWXYZ#ABCDEFGH
   1353    vperm2i128              m5, m0, m1, q0201          ; L[7-15]L[16-23]
   1354    vperm2i128              m8, m2, m3, q0201          ; IJKLMNOPQRSTUVWX
   1355    DEFINE_ARGS dst8, stride, stride3, stride7, stride5, dst24, cnt
   1356    lea               stride3q, [strideq*3]
   1357    lea               stride5q, [stride3q+strideq*2]
   1358    lea               stride7q, [strideq*4+stride3q]
   1359    lea                 dst24q, [dst8q+stride3q*8]
   1360    lea                  dst8q, [dst8q+strideq*8]
   1361    mov                   cntd, 2
   1362 
   1363 .loop:
   1364    mova  [dst24q+stride7q+0 ], m0                     ; 31 23 15 7
   1365    mova  [dst24q+stride7q+32], m1
   1366    mova    [dst8q+stride7q+0], m1
   1367    mova   [dst8q+stride7q+32], m2
   1368    vpalignr                m6, m4, m1, 2
   1369    vpalignr                m7, m5, m0, 2
   1370    vpalignr                m9, m8, m2, 2
   1371    mova [dst24q+stride3q*2+0], m7                     ; 30 22 14 6
   1372    mova [dst24q+stride3q*2+32], m6
   1373    mova  [dst8q+stride3q*2+0], m6
   1374    mova [dst8q+stride3q*2+32], m9
   1375    vpalignr                m6, m4, m1, 4
   1376    vpalignr                m7, m5, m0, 4
   1377    vpalignr                m9, m8, m2, 4
   1378    mova   [dst24q+stride5q+0], m7                     ; 29 21 13 5
   1379    mova  [dst24q+stride5q+32], m6
   1380    mova    [dst8q+stride5q+0], m6
   1381    mova   [dst8q+stride5q+32], m9
   1382    vpalignr                m6, m4, m1, 6
   1383    vpalignr                m7, m5, m0, 6
   1384    vpalignr                m9, m8, m2, 6
   1385    mova [dst24q+strideq*4+0 ], m7                     ; 28 20 12 4
   1386    mova [dst24q+strideq*4+32], m6
   1387    mova   [dst8q+strideq*4+0], m6
   1388    mova  [dst8q+strideq*4+32], m9
   1389    vpalignr                m6, m4, m1, 8
   1390    vpalignr                m7, m5, m0, 8
   1391    vpalignr                m9, m8, m2, 8
   1392    mova  [dst24q+stride3q+0 ], m7                     ; 27 19 11 3
   1393    mova  [dst24q+stride3q+32], m6
   1394    mova    [dst8q+stride3q+0], m6
   1395    mova   [dst8q+stride3q+32], m9
   1396    vpalignr                m6, m4, m1, 10
   1397    vpalignr                m7, m5, m0, 10
   1398    vpalignr                m9, m8, m2, 10
   1399    mova [dst24q+strideq*2+0 ], m7                     ; 26 18 10 2
   1400    mova [dst24q+strideq*2+32], m6
   1401    mova   [dst8q+strideq*2+0], m6
   1402    mova  [dst8q+strideq*2+32], m9
   1403    vpalignr                m6, m4, m1, 12
   1404    vpalignr                m7, m5, m0, 12
   1405    vpalignr                m9, m8, m2, 12
   1406    mova   [dst24q+strideq+0 ], m7                     ; 25 17 9 1
   1407    mova   [dst24q+strideq+32], m6
   1408    mova     [dst8q+strideq+0], m6
   1409    mova    [dst8q+strideq+32], m9
   1410    vpalignr                m6, m4, m1, 14
   1411    vpalignr                m7, m5, m0, 14
   1412    vpalignr                m9, m8, m2, 14
   1413    mova [dst24q+strideq*0+0 ], m7                     ; 24 16 8 0
   1414    mova [dst24q+strideq*0+32], m6
   1415    mova   [dst8q+strideq*0+0], m6
   1416    mova  [dst8q+strideq*0+32], m9
   1417    mova                    m0, m5
   1418    mova                    m5, m1
   1419    mova                    m1, m4
   1420    mova                    m4, m2
   1421    mova                    m2, m8
   1422    mova                    m8, m3
   1423    sub                 dst24q, stride7q
   1424    sub                 dst24q, strideq
   1425    sub                  dst8q, stride7q
   1426    sub                  dst8q, strideq
   1427    dec                   cntd
   1428    jg .loop
   1429    RET
   1430 %endif
   1431 %endif
   1432 
   1433 %macro VL_FUNCS 1 ; stack_mem_for_32x32_32bit_function
   1434 cglobal vp9_ipred_vl_4x4_16, 2, 4, 3, dst, stride, l, a
   1435    movifnidn               aq, amp
   1436    movu                    m0, [aq]                ; abcdefgh
   1437    psrldq                  m1, m0, 2               ; bcdefgh.
   1438    psrldq                  m2, m0, 4               ; cdefgh..
   1439    LOWPASS                  2,  1, 0               ; BCDEFGH.
   1440    pavgw                   m1, m0                  ; ABCDEFG.
   1441    DEFINE_ARGS dst, stride, stride3
   1442    lea               stride3q, [strideq*3]
   1443 
   1444    movh      [dstq+strideq*0], m1
   1445    movh      [dstq+strideq*1], m2
   1446    psrldq                  m1, 2
   1447    psrldq                  m2, 2
   1448    movh      [dstq+strideq*2], m1
   1449    movh      [dstq+stride3q ], m2
   1450    RET
   1451 
   1452 cglobal vp9_ipred_vl_8x8_16, 2, 4, 4, dst, stride, l, a
   1453    movifnidn               aq, amp
   1454    mova                    m0, [aq]                ; abcdefgh
   1455 %if cpuflag(ssse3)
   1456    mova                    m3, [pb_2to15_14_15]
   1457 %endif
   1458    SHIFT_RIGHTx2           m1, m2, m0, m3          ; bcdefghh/cdefghhh
   1459    LOWPASS                  2,  1, 0               ; BCDEFGHh
   1460    pavgw                   m1, m0                  ; ABCDEFGh
   1461    DEFINE_ARGS dst, stride, stride3
   1462    lea               stride3q, [strideq*3]
   1463 
   1464    mova      [dstq+strideq*0], m1
   1465    mova      [dstq+strideq*1], m2
   1466    SHIFT_RIGHT             m1, m1, m3
   1467    SHIFT_RIGHT             m2, m2, m3
   1468    mova      [dstq+strideq*2], m1
   1469    mova      [dstq+stride3q ], m2
   1470    lea                   dstq, [dstq+strideq*4]
   1471    SHIFT_RIGHT             m1, m1, m3
   1472    SHIFT_RIGHT             m2, m2, m3
   1473    mova      [dstq+strideq*0], m1
   1474    mova      [dstq+strideq*1], m2
   1475    SHIFT_RIGHT             m1, m1, m3
   1476    SHIFT_RIGHT             m2, m2, m3
   1477    mova      [dstq+strideq*2], m1
   1478    mova      [dstq+stride3q ], m2
   1479    RET
   1480 
   1481 cglobal vp9_ipred_vl_16x16_16, 2, 4, 6, dst, stride, l, a
   1482    movifnidn               aq, amp
   1483    mova                    m0, [aq]
   1484    mova                    m1, [aq+mmsize]
   1485    PALIGNR                 m2, m1, m0, 2, m3
   1486    PALIGNR                 m3, m1, m0, 4, m4
   1487    LOWPASS                  3,  2,  0
   1488    pavgw                   m2, m0
   1489 %if cpuflag(ssse3)
   1490    mova                    m4, [pb_2to15_14_15]
   1491 %endif
   1492    SHIFT_RIGHTx2           m5, m0, m1, m4
   1493    LOWPASS                  0,  5,  1
   1494    pavgw                   m1, m5
   1495    DEFINE_ARGS dst, stride, cnt
   1496    mov                   cntd, 8
   1497 
   1498 .loop:
   1499    mova   [dstq+strideq*0+ 0], m2
   1500    mova   [dstq+strideq*0+16], m1
   1501    mova   [dstq+strideq*1+ 0], m3
   1502    mova   [dstq+strideq*1+16], m0
   1503    lea                   dstq, [dstq+strideq*2]
   1504 %if cpuflag(avx)
   1505    vpalignr                m2, m1, m2, 2
   1506    vpalignr                m3, m0, m3, 2
   1507 %else
   1508    PALIGNR                 m5, m1, m2, 2, m4
   1509    mova                    m2, m5
   1510    PALIGNR                 m5, m0, m3, 2, m4
   1511    mova                    m3, m5
   1512 %endif
   1513    SHIFT_RIGHT             m1, m1, m4
   1514    SHIFT_RIGHT             m0, m0, m4
   1515    dec                   cntd
   1516    jg .loop
   1517    RET
   1518 
   1519 cglobal vp9_ipred_vl_32x32_16, 2, 5, 11, %1 * mmsize * ARCH_X86_32, dst, stride, l, a
   1520    movifnidn               aq, amp
   1521    mova                    m0, [aq+mmsize*0]
   1522    mova                    m1, [aq+mmsize*1]
   1523    mova                    m2, [aq+mmsize*2]
   1524    PALIGNR                 m6, m1, m0, 2, m5
   1525    PALIGNR                 m7, m1, m0, 4, m5
   1526    LOWPASS                  7,  6,  0
   1527    pavgw                   m6, m0
   1528    SCRATCH                  6,  8, rsp+0*mmsize
   1529    PALIGNR                 m4, m2, m1, 2, m0
   1530    PALIGNR                 m5, m2, m1, 4, m0
   1531    LOWPASS                  5,  4,  1
   1532    pavgw                   m4, m1
   1533    mova                    m0, [aq+mmsize*3]
   1534    PALIGNR                 m1, m0, m2, 2, m6
   1535    PALIGNR                 m3, m0, m2, 4, m6
   1536    LOWPASS                  3,  1,  2
   1537    pavgw                   m2, m1
   1538 %if cpuflag(ssse3)
   1539    PRELOAD                 10, pb_2to15_14_15, shuf
   1540 %endif
   1541    SHIFT_RIGHTx2           m6, m1, m0, reg_shuf
   1542    LOWPASS                  1,  6,  0
   1543    pavgw                   m0, m6
   1544 %if ARCH_X86_64
   1545    pshufd                  m9, m6, q3333
   1546 %endif
   1547 %if cpuflag(avx)
   1548    UNSCRATCH                6,  8, rsp+0*mmsize
   1549 %endif
   1550    DEFINE_ARGS dst, stride, cnt, stride16, stride17
   1551    mov              stride16q, strideq
   1552    mov                   cntd, 8
   1553    shl              stride16q, 4
   1554    lea              stride17q, [stride16q+strideq]
   1555 
   1556    ; FIXME m8 is unused for avx, so we could save one register here for win64
   1557 .loop:
   1558 %if notcpuflag(avx)
   1559    UNSCRATCH                6,  8, rsp+0*mmsize
   1560 %endif
   1561    mova   [dstq+strideq*0+ 0], m6
   1562    mova   [dstq+strideq*0+16], m4
   1563    mova   [dstq+strideq*0+32], m2
   1564    mova   [dstq+strideq*0+48], m0
   1565    mova   [dstq+strideq*1+ 0], m7
   1566    mova   [dstq+strideq*1+16], m5
   1567    mova   [dstq+strideq*1+32], m3
   1568    mova   [dstq+strideq*1+48], m1
   1569    mova   [dstq+stride16q+ 0], m4
   1570    mova   [dstq+stride16q+16], m2
   1571    mova   [dstq+stride16q+32], m0
   1572 %if ARCH_X86_64
   1573    mova   [dstq+stride16q+48], m9
   1574 %endif
   1575    mova   [dstq+stride17q+ 0], m5
   1576    mova   [dstq+stride17q+16], m3
   1577    mova   [dstq+stride17q+32], m1
   1578 %if ARCH_X86_64
   1579    mova   [dstq+stride17q+48], m9
   1580 %endif
   1581    lea                   dstq, [dstq+strideq*2]
   1582 %if cpuflag(avx)
   1583    vpalignr                m6, m4, m6, 2
   1584    vpalignr                m4, m2, m4, 2
   1585    vpalignr                m2, m0, m2, 2
   1586    vpalignr                m7, m5, m7, 2
   1587    vpalignr                m5, m3, m5, 2
   1588    vpalignr                m3, m1, m3, 2
   1589 %else
   1590    SCRATCH                  3,  8, rsp+0*mmsize
   1591 %if notcpuflag(ssse3)
   1592    SCRATCH                  1, 10, rsp+1*mmsize
   1593 %endif
   1594    PALIGNR                 m3, m4, m6, 2, m1
   1595    mova                    m6, m3
   1596    PALIGNR                 m3, m2, m4, 2, m1
   1597    mova                    m4, m3
   1598    PALIGNR                 m3, m0, m2, 2, m1
   1599    mova                    m2, m3
   1600    PALIGNR                 m3, m5, m7, 2, m1
   1601    mova                    m7, m3
   1602    UNSCRATCH                3,  8, rsp+0*mmsize
   1603    SCRATCH                  6,  8, rsp+0*mmsize
   1604 %if notcpuflag(ssse3)
   1605    UNSCRATCH                1, 10, rsp+1*mmsize
   1606    SCRATCH                  7, 10, rsp+1*mmsize
   1607 %endif
   1608    PALIGNR                 m6, m3, m5, 2, m7
   1609    mova                    m5, m6
   1610    PALIGNR                 m6, m1, m3, 2, m7
   1611    mova                    m3, m6
   1612 %if notcpuflag(ssse3)
   1613    UNSCRATCH                7, 10, rsp+1*mmsize
   1614 %endif
   1615 %endif
   1616    SHIFT_RIGHT             m1, m1, reg_shuf
   1617    SHIFT_RIGHT             m0, m0, reg_shuf
   1618    dec                   cntd
   1619    jg .loop
   1620 
   1621 %if ARCH_X86_32
   1622    DEFINE_ARGS dst, stride, stride3
   1623    lea               stride3q, [strideq*3]
   1624 %assign %%n 0
   1625 %rep 4
   1626    mova   [dstq+strideq*0+48], m0
   1627    mova   [dstq+strideq*1+48], m0
   1628    mova   [dstq+strideq*2+48], m0
   1629    mova   [dstq+stride3q +48], m0
   1630 %if %%n < 3
   1631    lea                   dstq, [dstq+strideq*4]
   1632 %endif
   1633 %assign %%n (%%n+1)
   1634 %endrep
   1635 %endif
   1636    RET
   1637 %endmacro
   1638 
   1639 INIT_XMM sse2
   1640 VL_FUNCS 2
   1641 INIT_XMM ssse3
   1642 VL_FUNCS 1
   1643 INIT_XMM avx
   1644 VL_FUNCS 1
   1645 
   1646 %macro VR_FUNCS 0
   1647 cglobal vp9_ipred_vr_4x4_16, 4, 4, 3, dst, stride, l, a
   1648    movu                    m0, [aq-2]
   1649    movhps                  m1, [lq]
   1650    PALIGNR                 m0, m1, 10, m2          ; xyz*abcd
   1651    pslldq                  m1, m0, 2               ; .xyz*abc
   1652    pslldq                  m2, m0, 4               ; ..xyz*ab
   1653    LOWPASS                  2,  1, 0               ; ..YZ#ABC
   1654    pavgw                   m1, m0                  ; ....#ABC
   1655    DEFINE_ARGS dst, stride, stride3
   1656    lea               stride3q, [strideq*3]
   1657 
   1658    movhps    [dstq+strideq*0], m1
   1659    movhps    [dstq+strideq*1], m2
   1660    shufps                  m0, m2, m1, q3210
   1661 %if cpuflag(ssse3)
   1662    pshufb                  m2, [pb_4_5_8to13_8x0]
   1663 %else
   1664    pshuflw                 m2, m2, q2222
   1665    psrldq                  m2, 6
   1666 %endif
   1667    psrldq                  m0, 6
   1668    movh      [dstq+strideq*2], m0
   1669    movh      [dstq+stride3q ], m2
   1670    RET
   1671 
   1672 cglobal vp9_ipred_vr_8x8_16, 4, 4, 5, dst, stride, l, a
   1673    movu                    m1, [aq-2]              ; *abcdefg
   1674    movu                    m2, [lq]                ; stuvwxyz
   1675    mova                    m0, [aq]                ; abcdefgh
   1676    PALIGNR                 m3, m1, m2, 14, m4      ; z*abcdef
   1677    LOWPASS                  3,  1,  0
   1678    pavgw                   m0, m1
   1679    PALIGNR                 m1, m2,  2, m4          ; tuvwxyz*
   1680    pslldq                  m4, m2,  2              ; .stuvwxy
   1681    LOWPASS                  4,  2,  1
   1682    DEFINE_ARGS dst, stride, stride3
   1683    lea               stride3q, [strideq*3]
   1684 
   1685    mova      [dstq+strideq*0], m0
   1686    mova      [dstq+strideq*1], m3
   1687    PALIGNR                 m0, m4, 14, m1
   1688    pslldq                  m4, 2
   1689    PALIGNR                 m3, m4, 14, m1
   1690    pslldq                  m4, 2
   1691    mova      [dstq+strideq*2], m0
   1692    mova      [dstq+stride3q ], m3
   1693    lea                   dstq, [dstq+strideq*4]
   1694    PALIGNR                 m0, m4, 14, m1
   1695    pslldq                  m4, 2
   1696    PALIGNR                 m3, m4, 14, m1
   1697    pslldq                  m4, 2
   1698    mova      [dstq+strideq*0], m0
   1699    mova      [dstq+strideq*1], m3
   1700    PALIGNR                 m0, m4, 14, m1
   1701    pslldq                  m4, 2
   1702    PALIGNR                 m3, m4, 14, m4
   1703    mova      [dstq+strideq*2], m0
   1704    mova      [dstq+stride3q ], m3
   1705    RET
   1706 
   1707 cglobal vp9_ipred_vr_16x16_16, 4, 4, 8, dst, stride, l, a
   1708    movu                    m1, [aq-2]              ; *abcdefg
   1709    movu                    m2, [aq+mmsize-2]       ; hijklmno
   1710    mova                    m3, [aq]                ; abcdefgh
   1711    mova                    m4, [aq+mmsize]         ; ijklmnop
   1712    mova                    m5, [lq+mmsize]         ; stuvwxyz
   1713    PALIGNR                 m0, m1, m5, 14, m6      ; z*abcdef
   1714    movu                    m6, [aq+mmsize-4]       ; ghijklmn
   1715    LOWPASS                  6,  2,  4
   1716    pavgw                   m2, m4
   1717    LOWPASS                  0,  1,  3
   1718    pavgw                   m3, m1
   1719    PALIGNR                 m1, m5,  2, m7          ; tuvwxyz*
   1720    movu                    m7, [lq+mmsize-2]       ; rstuvwxy
   1721    LOWPASS                  1,  5,  7
   1722    movu                    m5, [lq+2]              ; lmnopqrs
   1723    pslldq                  m4, m5,  2              ; .lmnopqr
   1724    pslldq                  m7, m5,  4              ; ..lmnopq
   1725    LOWPASS                  5,  4,  7
   1726    psrld                   m4, m1, 16
   1727    psrld                   m7, m5, 16
   1728    pand                    m1, [pd_65535]
   1729    pand                    m5, [pd_65535]
   1730    packssdw                m7, m4
   1731    packssdw                m5, m1
   1732    DEFINE_ARGS dst, stride, cnt
   1733    mov                   cntd, 8
   1734 
   1735 .loop:
   1736    mova   [dstq+strideq*0+ 0], m3
   1737    mova   [dstq+strideq*0+16], m2
   1738    mova   [dstq+strideq*1+ 0], m0
   1739    mova   [dstq+strideq*1+16], m6
   1740    lea                   dstq, [dstq+strideq*2]
   1741    PALIGNR                 m2, m3, 14, m4
   1742    PALIGNR                 m3, m7, 14, m4
   1743    pslldq                  m7, 2
   1744    PALIGNR                 m6, m0, 14, m4
   1745    PALIGNR                 m0, m5, 14, m4
   1746    pslldq                  m5, 2
   1747    dec                   cntd
   1748    jg .loop
   1749    RET
   1750 
   1751 cglobal vp9_ipred_vr_32x32_16, 4, 5, 14, 6 * mmsize * ARCH_X86_32, dst, stride, l, a
   1752    movu                    m0, [aq+mmsize*0-2]     ; *a[0-6]
   1753    movu                    m1, [aq+mmsize*1-2]     ; a[7-14]
   1754    movu                    m2, [aq+mmsize*2-2]     ; a[15-22]
   1755    movu                    m3, [aq+mmsize*3-2]     ; a[23-30]
   1756    mova                    m4, [aq+mmsize*3+0]     ; a[24-31]
   1757    movu                    m5, [aq+mmsize*3-4]     ; a[22-29]
   1758    LOWPASS                  5,  3,  4              ; A[23-30]
   1759    SCRATCH                  5,  8, rsp+0*mmsize
   1760    pavgw                   m3, m4
   1761    mova                    m4, [aq+mmsize*2+0]     ; a[16-23]
   1762    movu                    m6, [aq+mmsize*2-4]     ; a[14-21]
   1763    LOWPASS                  6,  2,  4              ; A[15-22]
   1764    SCRATCH                  6,  9, rsp+1*mmsize
   1765    pavgw                   m2, m4
   1766    mova                    m4, [aq+mmsize*1+0]     ; a[8-15]
   1767    movu                    m7, [aq+mmsize*1-4]     ; a[6-13]
   1768    LOWPASS                  7,  1,  4              ; A[7-14]
   1769    SCRATCH                  7, 10, rsp+2*mmsize
   1770    pavgw                   m1, m4
   1771    mova                    m4, [aq+mmsize*0+0]     ; a[0-7]
   1772    mova                    m5, [lq+mmsize*3+0]     ; l[24-31]
   1773    PALIGNR                 m6, m0, m5, 14, m7      ; l[31]*a[0-5]
   1774    LOWPASS                  6,  0,  4              ; #A[0-6]
   1775    SCRATCH                  6, 11, rsp+3*mmsize
   1776    pavgw                   m4, m0
   1777    PALIGNR                 m0, m5,  2, m7          ; l[25-31]*
   1778    movu                    m7, [lq+mmsize*3-2]     ; l[23-30]
   1779    LOWPASS                  0,  5,  7              ; L[24-31]
   1780    movu                    m5, [lq+mmsize*2-2]     ; l[15-22]
   1781    mova                    m7, [lq+mmsize*2+0]     ; l[16-23]
   1782    movu                    m6, [lq+mmsize*2+2]     ; l[17-24]
   1783    LOWPASS                  5,  7,  6              ; L[16-23]
   1784    psrld                   m7, m0, 16
   1785    psrld                   m6, m5, 16
   1786    pand                    m0, [pd_65535]
   1787    pand                    m5, [pd_65535]
   1788    packssdw                m6, m7
   1789    packssdw                m5, m0
   1790    SCRATCH                  5, 12, rsp+4*mmsize
   1791    SCRATCH                  6, 13, rsp+5*mmsize
   1792    movu                    m6, [lq+mmsize*1-2]     ; l[7-14]
   1793    mova                    m0, [lq+mmsize*1+0]     ; l[8-15]
   1794    movu                    m5, [lq+mmsize*1+2]     ; l[9-16]
   1795    LOWPASS                  6,  0,  5              ; L[8-15]
   1796    movu                    m0, [lq+mmsize*0+2]     ; l[1-8]
   1797    pslldq                  m5, m0,  2              ; .l[1-7]
   1798    pslldq                  m7, m0,  4              ; ..l[1-6]
   1799    LOWPASS                  0,  5,  7
   1800    psrld                   m5, m6, 16
   1801    psrld                   m7, m0, 16
   1802    pand                    m6, [pd_65535]
   1803    pand                    m0, [pd_65535]
   1804    packssdw                m7, m5
   1805    packssdw                m0, m6
   1806    UNSCRATCH                6, 13, rsp+5*mmsize
   1807    DEFINE_ARGS dst, stride, stride16, cnt, stride17
   1808    mov              stride16q, strideq
   1809    mov                   cntd, 8
   1810    shl              stride16q, 4
   1811 %if ARCH_X86_64
   1812    lea              stride17q, [stride16q+strideq]
   1813 %endif
   1814 
   1815 .loop:
   1816    mova   [dstq+strideq*0+ 0], m4
   1817    mova   [dstq+strideq*0+16], m1
   1818    mova   [dstq+strideq*0+32], m2
   1819    mova   [dstq+strideq*0+48], m3
   1820 %if ARCH_X86_64
   1821    mova   [dstq+strideq*1+ 0], m11
   1822    mova   [dstq+strideq*1+16], m10
   1823    mova   [dstq+strideq*1+32], m9
   1824    mova   [dstq+strideq*1+48], m8
   1825 %endif
   1826    mova   [dstq+stride16q+ 0], m6
   1827    mova   [dstq+stride16q+16], m4
   1828    mova   [dstq+stride16q+32], m1
   1829    mova   [dstq+stride16q+48], m2
   1830 %if ARCH_X86_64
   1831    mova   [dstq+stride17q+ 0], m12
   1832    mova   [dstq+stride17q+16], m11
   1833    mova   [dstq+stride17q+32], m10
   1834    mova   [dstq+stride17q+48], m9
   1835 %endif
   1836    lea                   dstq, [dstq+strideq*2]
   1837    PALIGNR                 m3, m2,  14, m5
   1838    PALIGNR                 m2, m1,  14, m5
   1839    PALIGNR                 m1, m4,  14, m5
   1840    PALIGNR                 m4, m6,  14, m5
   1841    PALIGNR                 m6, m7,  14, m5
   1842    pslldq                  m7, 2
   1843 %if ARCH_X86_64
   1844    PALIGNR                 m8, m9,  14, m5
   1845    PALIGNR                 m9, m10, 14, m5
   1846    PALIGNR                m10, m11, 14, m5
   1847    PALIGNR                m11, m12, 14, m5
   1848    PALIGNR                m12, m0,  14, m5
   1849    pslldq                  m0, 2
   1850 %endif
   1851    dec                   cntd
   1852    jg .loop
   1853 
   1854 %if ARCH_X86_32
   1855    UNSCRATCH                5, 12, rsp+4*mmsize
   1856    UNSCRATCH                4, 11, rsp+3*mmsize
   1857    UNSCRATCH                3, 10, rsp+2*mmsize
   1858    UNSCRATCH                2,  9, rsp+1*mmsize
   1859    UNSCRATCH                1,  8, rsp+0*mmsize
   1860    mov                   dstq, dstm
   1861    mov                   cntd, 8
   1862    add                   dstq, strideq
   1863 .loop2:
   1864    mova   [dstq+strideq*0+ 0], m4
   1865    mova   [dstq+strideq*0+16], m3
   1866    mova   [dstq+strideq*0+32], m2
   1867    mova   [dstq+strideq*0+48], m1
   1868    mova   [dstq+stride16q+ 0], m5
   1869    mova   [dstq+stride16q+16], m4
   1870    mova   [dstq+stride16q+32], m3
   1871    mova   [dstq+stride16q+48], m2
   1872    lea                   dstq, [dstq+strideq*2]
   1873    PALIGNR                 m1, m2,  14, m6
   1874    PALIGNR                 m2, m3,  14, m6
   1875    PALIGNR                 m3, m4,  14, m6
   1876    PALIGNR                 m4, m5,  14, m6
   1877    PALIGNR                 m5, m0,  14, m6
   1878    pslldq                  m0, 2
   1879    dec                   cntd
   1880    jg .loop2
   1881 %endif
   1882    RET
   1883 %endmacro
   1884 
   1885 INIT_XMM sse2
   1886 VR_FUNCS
   1887 INIT_XMM ssse3
   1888 VR_FUNCS
   1889 INIT_XMM avx
   1890 VR_FUNCS
   1891 
   1892 %macro HU_FUNCS 1 ; stack_mem_for_32x32_32bit_function
   1893 cglobal vp9_ipred_hu_4x4_16, 3, 3, 3, dst, stride, l, a
   1894    movh                    m0, [lq]                ; abcd
   1895 %if cpuflag(ssse3)
   1896    pshufb                  m0, [pb_0to7_67x4]      ; abcddddd
   1897 %else
   1898    punpcklqdq              m0, m0
   1899    pshufhw                 m0, m0, q3333           ; abcddddd
   1900 %endif
   1901    psrldq                  m1, m0,  2              ; bcddddd.
   1902    psrldq                  m2, m0,  4              ; cddddd..
   1903    LOWPASS                  2,  1,  0              ; BCDddd..
   1904    pavgw                   m1, m0                  ; abcddddd
   1905    SBUTTERFLY          wd,  1,  2,  0              ; aBbCcDdd, dddddddd
   1906    PALIGNR                 m2, m1,  4, m0          ; bCcDdddd
   1907    DEFINE_ARGS dst, stride, stride3
   1908    lea               stride3q, [strideq*3]
   1909 
   1910    movh      [dstq+strideq*0], m1                  ; aBbC
   1911    movh      [dstq+strideq*1], m2                  ; bCcD
   1912    movhps    [dstq+strideq*2], m1                  ; cDdd
   1913    movhps    [dstq+stride3q ], m2                  ; dddd
   1914    RET
   1915 
   1916 cglobal vp9_ipred_hu_8x8_16, 3, 3, 4, dst, stride, l, a
   1917    mova                    m0, [lq]
   1918 %if cpuflag(ssse3)
   1919    mova                    m3, [pb_2to15_14_15]
   1920 %endif
   1921    SHIFT_RIGHTx2           m1, m2, m0, m3
   1922    LOWPASS                  2,  1,  0
   1923    pavgw                   m1, m0
   1924    SBUTTERFLY          wd,  1,  2,  0
   1925    shufps                  m0, m1, m2, q1032
   1926    pshufd                  m3, m2, q3332
   1927    DEFINE_ARGS dst, stride, stride3
   1928    lea               stride3q, [strideq*3]
   1929 
   1930    mova     [dstq+strideq *0], m1
   1931    mova     [dstq+strideq *2], m0
   1932    mova     [dstq+strideq *4], m2
   1933    mova     [dstq+stride3q*2], m3
   1934    add                   dstq, strideq
   1935 %if cpuflag(avx)
   1936    vpalignr                m1, m2, m1, 4
   1937 %else
   1938    PALIGNR                 m0, m2, m1, 4, m3
   1939    mova                    m1, m0
   1940 %endif
   1941    pshufd                  m2, m2, q3321
   1942    shufps                  m0, m1, m2, q1032
   1943    pshufd                  m3, m2, q3332
   1944    mova     [dstq+strideq *0], m1
   1945    mova     [dstq+strideq *2], m0
   1946    mova     [dstq+strideq *4], m2
   1947    mova     [dstq+stride3q*2], m3
   1948    RET
   1949 
   1950 cglobal vp9_ipred_hu_16x16_16, 3, 4, 6 + notcpuflag(ssse3), dst, stride, l, a
   1951    mova                    m0, [lq]
   1952    mova                    m3, [lq+mmsize]
   1953    movu                    m1, [lq+2]
   1954    movu                    m2, [lq+4]
   1955    LOWPASS                  2,  1,  0
   1956    pavgw                   m1, m0
   1957    SBUTTERFLY           wd, 1,  2,  0
   1958 %if cpuflag(ssse3)
   1959    mova                    m5, [pb_2to15_14_15]
   1960 %endif
   1961    SHIFT_RIGHTx2           m0, m4, m3, m5
   1962    LOWPASS                  4,  0,  3
   1963    pavgw                   m3, m0
   1964    SBUTTERFLY           wd, 3,  4,  5
   1965    pshufd                  m0, m0, q3333
   1966    DEFINE_ARGS dst, stride, stride3, cnt
   1967    lea               stride3q, [strideq*3]
   1968    mov                   cntd, 4
   1969 
   1970 .loop:
   1971    mova  [dstq+strideq *0+ 0], m1
   1972    mova  [dstq+strideq *0+16], m2
   1973    mova  [dstq+strideq *4+ 0], m2
   1974    mova  [dstq+strideq *4+16], m3
   1975    mova  [dstq+strideq *8+ 0], m3
   1976    mova  [dstq+strideq *8+16], m4
   1977    mova  [dstq+stride3q*4+ 0], m4
   1978    mova  [dstq+stride3q*4+16], m0
   1979    add                   dstq, strideq
   1980 %if cpuflag(avx)
   1981    vpalignr                m1, m2, m1, 4
   1982    vpalignr                m2, m3, m2, 4
   1983    vpalignr                m3, m4, m3, 4
   1984    vpalignr                m4, m0, m4, 4
   1985 %else
   1986    PALIGNR                 m5, m2, m1, 4, m6
   1987    mova                    m1, m5
   1988    PALIGNR                 m5, m3, m2, 4, m6
   1989    mova                    m2, m5
   1990    PALIGNR                 m5, m4, m3, 4, m6
   1991    mova                    m3, m5
   1992    PALIGNR                 m5, m0, m4, 4, m6
   1993    mova                    m4, m5
   1994 %endif
   1995    dec                   cntd
   1996    jg .loop
   1997    RET
   1998 
   1999 cglobal vp9_ipred_hu_32x32_16, 3, 7, 10 + notcpuflag(ssse3), \
   2000                               %1 * -mmsize * ARCH_X86_32, dst, stride, l, a
   2001    mova                    m2, [lq+mmsize*0+0]
   2002    movu                    m1, [lq+mmsize*0+2]
   2003    movu                    m0, [lq+mmsize*0+4]
   2004    LOWPASS                  0,  1,  2
   2005    pavgw                   m1, m2
   2006    SBUTTERFLY           wd, 1,  0,  2
   2007    SCRATCH                  1,  8, rsp+0*mmsize
   2008    mova                    m4, [lq+mmsize*1+0]
   2009    movu                    m3, [lq+mmsize*1+2]
   2010    movu                    m2, [lq+mmsize*1+4]
   2011    LOWPASS                  2,  3,  4
   2012    pavgw                   m3, m4
   2013    SBUTTERFLY           wd, 3,  2,  4
   2014    mova                    m6, [lq+mmsize*2+0]
   2015    movu                    m5, [lq+mmsize*2+2]
   2016    movu                    m4, [lq+mmsize*2+4]
   2017    LOWPASS                  4,  5,  6
   2018    pavgw                   m5, m6
   2019    SBUTTERFLY           wd, 5,  4,  6
   2020    mova                    m7, [lq+mmsize*3+0]
   2021    SCRATCH                  0,  9, rsp+1*mmsize
   2022 %if cpuflag(ssse3)
   2023    mova                    m0, [pb_2to15_14_15]
   2024 %endif
   2025    SHIFT_RIGHTx2           m1, m6, m7, m0
   2026    LOWPASS                  6,  1,  7
   2027    pavgw                   m7, m1
   2028    SBUTTERFLY           wd, 7,  6,  0
   2029    pshufd                  m1, m1, q3333
   2030    UNSCRATCH                0,  9, rsp+1*mmsize
   2031    DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28
   2032    lea               stride3q, [strideq*3]
   2033    lea               stride4q, [strideq*4]
   2034    lea              stride28q, [stride4q*8]
   2035    lea              stride20q, [stride4q*5]
   2036    sub              stride28q, stride4q
   2037    mov                   cntd, 4
   2038 
   2039 .loop:
   2040 %if ARCH_X86_64
   2041    SWAP                     1,  8
   2042 %else
   2043    mova        [rsp+1*mmsize], m1
   2044    mova                    m1, [rsp+0*mmsize]
   2045 %endif
   2046    mova  [dstq+strideq *0+ 0], m1
   2047    mova  [dstq+strideq *0+16], m0
   2048    mova  [dstq+strideq *0+32], m3
   2049    mova  [dstq+strideq *0+48], m2
   2050    mova  [dstq+stride4q*1+ 0], m0
   2051    mova  [dstq+stride4q*1+16], m3
   2052    mova  [dstq+stride4q*1+32], m2
   2053    mova  [dstq+stride4q*1+48], m5
   2054    mova  [dstq+stride4q*2+ 0], m3
   2055    mova  [dstq+stride4q*2+16], m2
   2056    mova  [dstq+stride4q*2+32], m5
   2057    mova  [dstq+stride4q*2+48], m4
   2058 %if cpuflag(avx)
   2059    vpalignr                m1, m0, m1, 4
   2060    vpalignr                m0, m3, m0, 4
   2061    vpalignr                m3, m2, m3, 4
   2062 %else
   2063    SCRATCH                  6,  9, rsp+2*mmsize
   2064 %if notcpuflag(ssse3)
   2065    SCRATCH                  7, 10, rsp+3*mmsize
   2066 %endif
   2067    PALIGNR                 m6, m0, m1, 4, m7
   2068    mova                    m1, m6
   2069    PALIGNR                 m6, m3, m0, 4, m7
   2070    mova                    m0, m6
   2071    PALIGNR                 m6, m2, m3, 4, m7
   2072    mova                    m3, m6
   2073    UNSCRATCH                6,  9, rsp+2*mmsize
   2074    SCRATCH                  0,  9, rsp+2*mmsize
   2075 %if notcpuflag(ssse3)
   2076    UNSCRATCH                7, 10, rsp+3*mmsize
   2077    SCRATCH                  3, 10, rsp+3*mmsize
   2078 %endif
   2079 %endif
   2080 %if ARCH_X86_64
   2081    SWAP                     1,  8
   2082 %else
   2083    mova        [rsp+0*mmsize], m1
   2084    mova                    m1, [rsp+1*mmsize]
   2085 %endif
   2086    mova  [dstq+stride3q*4+ 0], m2
   2087    mova  [dstq+stride3q*4+16], m5
   2088    mova  [dstq+stride3q*4+32], m4
   2089    mova  [dstq+stride3q*4+48], m7
   2090    mova  [dstq+stride4q*4+ 0], m5
   2091    mova  [dstq+stride4q*4+16], m4
   2092    mova  [dstq+stride4q*4+32], m7
   2093    mova  [dstq+stride4q*4+48], m6
   2094    mova  [dstq+stride20q + 0], m4
   2095    mova  [dstq+stride20q +16], m7
   2096    mova  [dstq+stride20q +32], m6
   2097    mova  [dstq+stride20q +48], m1
   2098    mova  [dstq+stride3q*8+ 0], m7
   2099    mova  [dstq+stride3q*8+16], m6
   2100    mova  [dstq+stride3q*8+32], m1
   2101    mova  [dstq+stride3q*8+48], m1
   2102    mova  [dstq+stride28q + 0], m6
   2103    mova  [dstq+stride28q +16], m1
   2104    mova  [dstq+stride28q +32], m1
   2105    mova  [dstq+stride28q +48], m1
   2106 %if cpuflag(avx)
   2107    vpalignr                m2, m5, m2, 4
   2108    vpalignr                m5, m4, m5, 4
   2109    vpalignr                m4, m7, m4, 4
   2110    vpalignr                m7, m6, m7, 4
   2111    vpalignr                m6, m1, m6, 4
   2112 %else
   2113    PALIGNR                 m0, m5, m2, 4, m3
   2114    mova                    m2, m0
   2115    PALIGNR                 m0, m4, m5, 4, m3
   2116    mova                    m5, m0
   2117    PALIGNR                 m0, m7, m4, 4, m3
   2118    mova                    m4, m0
   2119    PALIGNR                 m0, m6, m7, 4, m3
   2120    mova                    m7, m0
   2121    PALIGNR                 m0, m1, m6, 4, m3
   2122    mova                    m6, m0
   2123    UNSCRATCH                0,  9, rsp+2*mmsize
   2124 %if notcpuflag(ssse3)
   2125    UNSCRATCH                3, 10, rsp+3*mmsize
   2126 %endif
   2127 %endif
   2128    add                   dstq, strideq
   2129    dec                   cntd
   2130    jg .loop
   2131    RET
   2132 %endmacro
   2133 
; Instantiate the "hu" (horizontal-up) predictors once per SIMD ISA.
; The numeric argument is consumed by the HU_FUNCS macro defined earlier in
; this file (not visible here) -- presumably an ISA-dependent tuning/scratch
; parameter; confirm against the macro definition.
   2134 INIT_XMM sse2
   2135 HU_FUNCS 4
   2136 INIT_XMM ssse3
   2137 HU_FUNCS 3
   2138 INIT_XMM avx
   2139 HU_FUNCS 2
   2140 
;-----------------------------------------------------------------------------
; HD_FUNCS: emits the VP9 "hd" (horizontal-down) diagonal intra predictors
; for 16 bpp content, one function per block size (4x4 .. 32x32), specialized
; for the SIMD ISA selected by the preceding INIT_XMM.
; cglobal args: dst = destination pixels, stride = dst stride in bytes,
; l = left-edge pixels (16-bit each), a = above-edge pixels, with the
; top-left corner pixel readable at [aq-2].
; Helper macros (LOWPASS, PALIGNR, SCRATCH/UNSCRATCH, SBUTTERFLY) are defined
; earlier in this file / x86util.asm.  LOWPASS looks like the usual 3-tap
; smoothing filter, presumably (a + 2*b + c + 2) >> 2 -- TODO(review):
; confirm against the macro definition.
;-----------------------------------------------------------------------------
   2141 %macro HD_FUNCS 0
; 4x4: build an 8-entry diagonal pixel stream from the 4 left pixels plus the
; top-left and 3 above pixels, then write the 4 rows bottom-up, each row up
; being the same stream advanced by one avg/filter pixel pair (4 bytes).
   2142 cglobal vp9_ipred_hd_4x4_16, 4, 4, 4, dst, stride, l, a
   2143    movh                    m0, [lq]          ; m0 low half  = l[0..3]
   2144    movhps                  m0, [aq-2]        ; m0 high half = a[-1..2] (incl. top-left)
   2145    psrldq                  m1, m0, 2         ; stream shifted by 1 pixel
   2146    psrldq                  m2, m0, 4         ; stream shifted by 2 pixels
   2147    LOWPASS                  2,  1,  0        ; m2 = 3-tap filtered pixels
   2148    pavgw                   m1, m0            ; m1 = rounded avg of adjacent pixels
   2149    punpcklwd               m1, m2            ; interleave avg/filter -> diagonal stream
   2150    DEFINE_ARGS dst, stride, stride3
   2151    lea               stride3q, [strideq*3]
   2152 
   2153    movh      [dstq+stride3q ], m1            ; bottom row: stream words 0..3
   2154    movhps    [dstq+strideq*1], m1            ; row 1: stream words 4..7
   2155    movhlps                 m2, m2            ; bring filtered high half down
   2156    PALIGNR                 m2, m1, 4, m0     ; advance stream by one pixel pair
   2157    movh      [dstq+strideq*2], m2
   2158    movhps    [dstq+strideq*0], m2
   2159    RET
   2160 
; 8x8: same scheme; m2/m3 hold the interleaved left-edge stream, m1 holds the
; filtered top-edge pixels that feed in as rows move upward.
   2161 cglobal vp9_ipred_hd_8x8_16, 4, 4, 5, dst, stride, l, a
   2162    mova                    m0, [lq]          ; l[0..7]
   2163    movu                    m1, [aq-2]        ; a[-1..6] (incl. top-left)
   2164    PALIGNR                 m2, m1, m0, 2, m3 ; stream shifted by 1 pixel
   2165    PALIGNR                 m3, m1, m0, 4, m4 ; stream shifted by 2 pixels
   2166    LOWPASS                  3,  2,  0        ; m3 = filtered left-edge pixels
   2167    pavgw                   m2, m0            ; m2 = averaged left-edge pixels
   2168    SBUTTERFLY           wd, 2,  3,  0        ; interleave -> m2/m3 = diagonal stream
   2169    psrldq                  m0, m1,  2
   2170    psrldq                  m4, m1,  4
   2171    LOWPASS                  1,  0,  4        ; m1 = filtered top-edge pixels
   2172    DEFINE_ARGS dst8, mstride, cnt
   2173    lea                  dst8q, [dst8q+mstrideq*8]   ; point 8 rows down...
   2174    neg               mstrideq                       ; ...then walk back up
   2175    mov                   cntd, 4
   2176 
   2177 .loop:
   2178    add                  dst8q, mstrideq
   2179    mova    [dst8q+mstrideq*0], m2            ; two rows 4 apart share the regs
   2180    mova    [dst8q+mstrideq*4], m3
; rotate the stream by one avg/filter pair (4 bytes) for the next row up
   2181 %if cpuflag(avx)
   2182    vpalignr                m2, m3, m2, 4
   2183    vpalignr                m3, m1, m3, 4
   2184 %else
   2185    PALIGNR                 m0, m3, m2, 4, m4
   2186    mova                    m2, m0
   2187    PALIGNR                 m0, m1, m3, 4, m4
   2188    mova                    m3, m0
   2189 %endif
   2190    psrldq                  m1, 4             ; consume one pair of top-edge pixels
   2191    dec                   cntd
   2192    jg .loop
   2193    RET
   2194 
; 16x16: m1/m0/m3/m2 = interleaved avg/filter stream from the 16 left pixels
; (plus top-left/above via PALIGNR); m4/m5 = filtered above pixels only.
   2195 cglobal vp9_ipred_hd_16x16_16, 4, 4, 8, dst, stride, l, a
   2196    mova                    m2, [lq]
   2197    movu                    m1, [lq+2]
   2198    movu                    m0, [lq+4]
   2199    LOWPASS                  0,  1,  2
   2200    pavgw                   m1, m2
   2201    mova                    m4, [lq+mmsize]
   2202    movu                    m5, [aq-2]
   2203    PALIGNR                 m3, m5, m4, 2, m6
   2204    PALIGNR                 m2, m5, m4, 4, m6
   2205    LOWPASS                  2,  3,  4
   2206    pavgw                   m3, m4
   2207    SBUTTERFLY           wd, 1,  0,  4
   2208    SBUTTERFLY           wd, 3,  2,  4
; filter the 16 above pixels (no averaging for the top part of the stream)
   2209    mova                    m6, [aq]
   2210    movu                    m4, [aq+2]
   2211    LOWPASS                  4,  6,  5
   2212    movu                    m5, [aq+mmsize-2]
   2213    psrldq                  m6, m5,  2
   2214    psrldq                  m7, m5,  4
   2215    LOWPASS                  5,  6,  7
   2216    DEFINE_ARGS dst, mstride, mstride3, cnt
   2217    lea                   dstq, [dstq+mstrideq*8]
   2218    lea                   dstq, [dstq+mstrideq*8]    ; dst += 16 rows...
   2219    neg               mstrideq                       ; ...then walk back up
   2220    lea              mstride3q, [mstrideq*3]
   2221    mov                   cntd, 4
   2222 
; store 4 rows per iteration, bottom-up; consecutive rows reuse the same
; stream registers offset by one, then the whole stream rotates 4 bytes
   2223 .loop:
   2224    add                  dstq, mstrideq
   2225    mova [dstq+mstride3q*4+ 0], m2
   2226    mova [dstq+mstride3q*4+16], m4
   2227    mova [dstq+mstrideq *8+ 0], m3
   2228    mova [dstq+mstrideq *8+16], m2
   2229    mova [dstq+mstrideq *4+ 0], m0
   2230    mova [dstq+mstrideq *4+16], m3
   2231    mova [dstq+mstrideq *0+ 0], m1
   2232    mova [dstq+mstrideq *0+16], m0
   2233 %if cpuflag(avx)
   2234    vpalignr                m1, m0, m1, 4
   2235    vpalignr                m0, m3, m0, 4
   2236    vpalignr                m3, m2, m3, 4
   2237    vpalignr                m2, m4, m2, 4
   2238    vpalignr                m4, m5, m4, 4
   2239 %else
   2240    PALIGNR                 m6, m0, m1, 4, m7
   2241    mova                    m1, m6
   2242    PALIGNR                 m6, m3, m0, 4, m7
   2243    mova                    m0, m6
   2244    PALIGNR                 m6, m2, m3, 4, m7
   2245    mova                    m2, m6
   2246    PALIGNR                 m6, m4, m2, 4, m7
   2247    mova                    m2, m6
   2248    PALIGNR                 m6, m5, m4, 4, m7
   2249    mova                    m4, m6
   2250 %endif
   2251    psrldq                  m5, 4             ; consume one pair of top-edge pixels
   2252    dec                   cntd
   2253    jg .loop
   2254    RET
   2255 
; 32x32: needs 14 xmm regs on x86-64 (3 extra gprs for precomputed strides)
; or 10*mmsize of stack scratch on x86-32.
   2256 cglobal vp9_ipred_hd_32x32_16, 4, 4 + 3 * ARCH_X86_64, 14, \
   2257                               10 * -mmsize * ARCH_X86_32, dst, stride, l, a
; filter+average the 32 left-edge pixels, two interleaved stream registers
; per 8 input pixels; the first four results are stashed away immediately
   2258    mova                    m2, [lq+mmsize*0+0]
   2259    movu                    m1, [lq+mmsize*0+2]
   2260    movu                    m0, [lq+mmsize*0+4]
   2261    LOWPASS                  0,  1,  2
   2262    pavgw                   m1, m2
   2263    SBUTTERFLY           wd, 1,  0,  2
   2264    mova                    m4, [lq+mmsize*1+0]
   2265    movu                    m3, [lq+mmsize*1+2]
   2266    movu                    m2, [lq+mmsize*1+4]
   2267    LOWPASS                  2,  3,  4
   2268    pavgw                   m3, m4
   2269    SBUTTERFLY           wd, 3,  2,  4
   2270    SCRATCH                  0,  8, rsp+0*mmsize   ; stash stream regs (to stack on
   2271    SCRATCH                  1,  9, rsp+1*mmsize   ; x86-32, xmm8-11 on x86-64)
   2272    SCRATCH                  2, 10, rsp+2*mmsize
   2273    SCRATCH                  3, 11, rsp+3*mmsize
   2274    mova                    m6, [lq+mmsize*2+0]
   2275    movu                    m5, [lq+mmsize*2+2]
   2276    movu                    m4, [lq+mmsize*2+4]
   2277    LOWPASS                  4,  5,  6
   2278    pavgw                   m5, m6
   2279    SBUTTERFLY           wd, 5,  4,  6
; the last 8 left pixels join up with the top-left and above pixels
   2280    mova                    m0, [lq+mmsize*3+0]
   2281    movu                    m1, [aq+mmsize*0-2]
   2282    PALIGNR                 m7, m1, m0, 2, m2
   2283    PALIGNR                 m6, m1, m0, 4, m2
   2284    LOWPASS                  6,  7,  0
   2285    pavgw                   m7, m0
   2286    SBUTTERFLY           wd, 7,  6,  0
; filter the 32 above pixels (no averaging for the top part of the stream)
   2287    mova                    m2, [aq+mmsize*0+0]
   2288    movu                    m0, [aq+mmsize*0+2]
   2289    LOWPASS                  0,  2,  1
   2290    movu                    m1, [aq+mmsize*1-2]
   2291    mova                    m2, [aq+mmsize*1+0]
   2292    movu                    m3, [aq+mmsize*1+2]
   2293    LOWPASS                  1,  2,  3
   2294    SCRATCH                  6, 12, rsp+6*mmsize
   2295    SCRATCH                  7, 13, rsp+7*mmsize
   2296    movu                    m2, [aq+mmsize*2-2]
   2297    mova                    m3, [aq+mmsize*2+0]
   2298    movu                    m6, [aq+mmsize*2+2]
   2299    LOWPASS                  2,  3,  6
   2300    movu                    m3, [aq+mmsize*3-2]
   2301    psrldq                  m6, m3,  2
   2302    psrldq                  m7, m3,  4
   2303    LOWPASS                  3,  6,  7
   2304    UNSCRATCH                6, 12, rsp+6*mmsize
   2305    UNSCRATCH                7, 13, rsp+7*mmsize
   2306 %if ARCH_X86_32
   2307    mova        [rsp+4*mmsize], m4
   2308    mova        [rsp+5*mmsize], m5
   2309    ; we already backed up m6/m7 earlier on x86-32 in SCRATCH, so we don't need
   2310    ; to do it again here
   2311 %endif
   2312    DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28
   2313    mov                   cntd, 4
   2314    lea               stride3q, [strideq*3]
   2315 %if ARCH_X86_64
   2316    lea               stride4q, [strideq*4]
   2317    lea              stride28q, [stride4q*8]
   2318    lea              stride20q, [stride4q*5]
   2319    sub              stride28q, stride4q      ; stride28 = 28 rows, stride20 = 20 rows
   2320 %endif
   2321    add                   dstq, stride3q
   2322 
   2323    ; x86-32 doesn't have enough registers, so on that platform, we split
   2324    ; the loop in 2... Otherwise you spend most of the loop (un)scratching
   2325 .loop:
   2326 %if ARCH_X86_64
   2327    mova  [dstq+stride28q + 0], m9
   2328    mova  [dstq+stride28q +16], m8
   2329    mova  [dstq+stride28q +32], m11
   2330    mova  [dstq+stride28q +48], m10
   2331    mova  [dstq+stride3q*8+ 0], m8
   2332    mova  [dstq+stride3q*8+16], m11
   2333    mova  [dstq+stride3q*8+32], m10
   2334    mova  [dstq+stride3q*8+48], m5
   2335    mova  [dstq+stride20q + 0], m11
   2336    mova  [dstq+stride20q +16], m10
   2337    mova  [dstq+stride20q +32], m5
   2338    mova  [dstq+stride20q +48], m4
   2339    mova  [dstq+stride4q*4+ 0], m10
   2340    mova  [dstq+stride4q*4+16], m5
   2341    mova  [dstq+stride4q*4+32], m4
   2342    mova  [dstq+stride4q*4+48], m7
   2343 %endif
   2344    mova  [dstq+stride3q*4+ 0], m5
   2345    mova  [dstq+stride3q*4+16], m4
   2346    mova  [dstq+stride3q*4+32], m7
   2347    mova  [dstq+stride3q*4+48], m6
   2348    mova  [dstq+strideq* 8+ 0], m4
   2349    mova  [dstq+strideq* 8+16], m7
   2350    mova  [dstq+strideq* 8+32], m6
   2351    mova  [dstq+strideq* 8+48], m0
   2352    mova  [dstq+strideq* 4+ 0], m7
   2353    mova  [dstq+strideq* 4+16], m6
   2354    mova  [dstq+strideq* 4+32], m0
   2355    mova  [dstq+strideq* 4+48], m1
   2356    mova  [dstq+strideq* 0+ 0], m6
   2357    mova  [dstq+strideq* 0+16], m0
   2358    mova  [dstq+strideq* 0+32], m1
   2359    mova  [dstq+strideq* 0+48], m2
   2360    sub                   dstq, strideq
; rotate the whole stream one pixel pair (4 bytes) for the next set of rows
   2361 %if cpuflag(avx)
   2362 %if ARCH_X86_64
   2363    vpalignr                m9, m8,  m9,  4
   2364    vpalignr                m8, m11, m8,  4
   2365    vpalignr               m11, m10, m11, 4
   2366    vpalignr               m10, m5,  m10, 4
   2367 %endif
   2368    vpalignr                m5, m4,  m5,  4
   2369    vpalignr                m4, m7,  m4,  4
   2370    vpalignr                m7, m6,  m7,  4
   2371    vpalignr                m6, m0,  m6,  4
   2372    vpalignr                m0, m1,  m0,  4
   2373    vpalignr                m1, m2,  m1,  4
   2374    vpalignr                m2, m3,  m2,  4
   2375 %else
   2376 %if ARCH_X86_64
   2377    PALIGNR                m12, m8,  m9,  4, m13
   2378    mova                    m9, m12
   2379    PALIGNR                m12, m11, m8,  4, m13
   2380    mova                    m8, m12
   2381    PALIGNR                m12, m10, m11, 4, m13
   2382    mova                   m11, m12
   2383    PALIGNR                m12, m5,  m10, 4, m13
   2384    mova                   m10, m12
   2385 %endif
   2386    SCRATCH                  3, 12, rsp+8*mmsize, sh
   2387 %if notcpuflag(ssse3)
   2388    SCRATCH                  2, 13, rsp+9*mmsize
   2389 %endif
   2390    PALIGNR                 m3, m4,  m5,  4, m2
   2391    mova                    m5, m3
   2392    PALIGNR                 m3, m7,  m4,  4, m2
   2393    mova                    m4, m3
   2394    PALIGNR                 m3, m6,  m7,  4, m2
   2395    mova                    m7, m3
   2396    PALIGNR                 m3, m0,  m6,  4, m2
   2397    mova                    m6, m3
   2398    PALIGNR                 m3, m1,  m0,  4, m2
   2399    mova                    m0, m3
   2400 %if notcpuflag(ssse3)
   2401    UNSCRATCH                2, 13, rsp+9*mmsize
   2402    SCRATCH                  0, 13, rsp+9*mmsize
   2403 %endif
   2404    PALIGNR                 m3, m2,  m1,  4, m0
   2405    mova                    m1, m3
   2406    PALIGNR                 m3, reg_sh,  m2,  4, m0
   2407    mova                    m2, m3
   2408 %if notcpuflag(ssse3)
   2409    UNSCRATCH                0, 13, rsp+9*mmsize
   2410 %endif
   2411    UNSCRATCH                3, 12, rsp+8*mmsize, sh
   2412 %endif
   2413    psrldq                  m3, 4             ; consume one pair of top-edge pixels
   2414    dec                   cntd
   2415    jg .loop
   2416 
   2417 %if ARCH_X86_32
; second half for x86-32: reload the stashed left-edge stream registers and
; store the remaining 16 rows of the block in a second, identical loop
   2418    UNSCRATCH                0,  8, rsp+0*mmsize
   2419    UNSCRATCH                1,  9, rsp+1*mmsize
   2420    UNSCRATCH                2, 10, rsp+2*mmsize
   2421    UNSCRATCH                3, 11, rsp+3*mmsize
   2422    mova                    m4, [rsp+4*mmsize]
   2423    mova                    m5, [rsp+5*mmsize]
   2424    mova                    m6, [rsp+6*mmsize]
   2425    mova                    m7, [rsp+7*mmsize]
   2426    DEFINE_ARGS dst, stride, stride5, stride3
   2427    lea               stride5q, [strideq*5]
   2428    lea                   dstq, [dstq+stride5q*4]     ; dst += 20 rows
   2429    DEFINE_ARGS dst, stride, cnt, stride3
   2430    mov                   cntd, 4
   2431 .loop_2:
   2432    mova  [dstq+stride3q*4+ 0], m1
   2433    mova  [dstq+stride3q*4+16], m0
   2434    mova  [dstq+stride3q*4+32], m3
   2435    mova  [dstq+stride3q*4+48], m2
   2436    mova  [dstq+strideq* 8+ 0], m0
   2437    mova  [dstq+strideq* 8+16], m3
   2438    mova  [dstq+strideq* 8+32], m2
   2439    mova  [dstq+strideq* 8+48], m5
   2440    mova  [dstq+strideq* 4+ 0], m3
   2441    mova  [dstq+strideq* 4+16], m2
   2442    mova  [dstq+strideq* 4+32], m5
   2443    mova  [dstq+strideq* 4+48], m4
   2444    mova  [dstq+strideq* 0+ 0], m2
   2445    mova  [dstq+strideq* 0+16], m5
   2446    mova  [dstq+strideq* 0+32], m4
   2447    mova  [dstq+strideq* 0+48], m7
   2448    sub                   dstq, strideq
; same 4-byte stream rotation as the first loop
   2449 %if cpuflag(avx)
   2450    vpalignr                m1, m0,  m1,  4
   2451    vpalignr                m0, m3,  m0,  4
   2452    vpalignr                m3, m2,  m3,  4
   2453    vpalignr                m2, m5,  m2,  4
   2454    vpalignr                m5, m4,  m5,  4
   2455    vpalignr                m4, m7,  m4,  4
   2456    vpalignr                m7, m6,  m7,  4
   2457 %else
   2458    SCRATCH                  6, 12, rsp+8*mmsize, sh
   2459 %if notcpuflag(ssse3)
   2460    SCRATCH                  7, 13, rsp+9*mmsize
   2461 %endif
   2462    PALIGNR                 m6, m0,  m1,  4, m7
   2463    mova                    m1, m6
   2464    PALIGNR                 m6, m3,  m0,  4, m7
   2465    mova                    m0, m6
   2466    PALIGNR                 m6, m2,  m3,  4, m7
   2467    mova                    m3, m6
   2468    PALIGNR                 m6, m5,  m2,  4, m7
   2469    mova                    m2, m6
   2470    PALIGNR                 m6, m4,  m5,  4, m7
   2471    mova                    m5, m6
   2472 %if notcpuflag(ssse3)
   2473    UNSCRATCH                7, 13, rsp+9*mmsize
   2474    SCRATCH                  5, 13, rsp+9*mmsize
   2475 %endif
   2476    PALIGNR                 m6, m7,  m4,  4, m5
   2477    mova                    m4, m6
   2478    PALIGNR                 m6, reg_sh,  m7,  4, m5
   2479    mova                    m7, m6
   2480 %if notcpuflag(ssse3)
   2481    UNSCRATCH                5, 13, rsp+9*mmsize
   2482 %endif
   2483    UNSCRATCH                6, 12, rsp+8*mmsize, sh
   2484 %endif
   2485    psrldq                  m6, 4
   2486    dec                   cntd
   2487    jg .loop_2
   2488 %endif
   2489    RET
   2490 %endmacro
   2491 
; Instantiate the "hd" (horizontal-down) predictors once per SIMD ISA;
; HD_FUNCS itself selects sse2/ssse3/avx code paths via cpuflag().
   2492 INIT_XMM sse2
   2493 HD_FUNCS
   2494 INIT_XMM ssse3
   2495 HD_FUNCS
   2496 INIT_XMM avx
   2497 HD_FUNCS