tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

vp9intrapred.asm (63665B)


      1 ;******************************************************************************
      2 ;* VP9 Intra prediction SIMD optimizations
      3 ;*
      4 ;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
      5 ;*
      6 ;* Parts based on:
      7 ;* H.264 intra prediction asm optimizations
      8 ;* Copyright (c) 2010 Fiona Glaser
      9 ;* Copyright (c) 2010 Holger Lubitz
     10 ;* Copyright (c) 2010 Loren Merritt
     11 ;* Copyright (c) 2010 Ronald S. Bultje
     12 ;*
     13 ;* This file is part of FFmpeg.
     14 ;*
     15 ;* FFmpeg is free software; you can redistribute it and/or
     16 ;* modify it under the terms of the GNU Lesser General Public
     17 ;* License as published by the Free Software Foundation; either
     18 ;* version 2.1 of the License, or (at your option) any later version.
     19 ;*
     20 ;* FFmpeg is distributed in the hope that it will be useful,
     21 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
     22 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     23 ;* Lesser General Public License for more details.
     24 ;*
     25 ;* You should have received a copy of the GNU Lesser General Public
     26 ;* License along with FFmpeg; if not, write to the Free Software
     27 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     28 ;******************************************************************************
     29 
     30 %include "libavutil/x86/x86util.asm"
     31 
; Read-only data, 32-byte aligned for ymm loads.
; pw_* = packed words (rounding constants for pmulhrsw averaging),
; pb_* = packed bytes (pshufb shuffle masks; -1 lanes zero the output byte).
; Mask names encode their contents, e.g. pb_8x1_8x0 = eight 1s then eight 0s.
     32 SECTION_RODATA 32
     33 
     34 pw_m256: times 16 dw -256
     35 pw_m255: times 16 dw -255
     36 pw_4096: times 8 dw 4096
     37 
     38 pb_4x3_4x2_4x1_4x0: times 4 db 3
     39                    times 4 db 2
     40                    times 4 db 1
     41                    times 4 db 0
     42 pb_8x1_8x0:   times 8 db 1
     43              times 8 db 0
     44 pb_8x3_8x2:   times 8 db 3
     45              times 8 db 2
     46 pb_0to5_2x7:  db 0, 1, 2, 3, 4, 5, 7, 7
     47              times 8 db -1
     48 pb_0to6_9x7:  db 0, 1, 2, 3, 4, 5, 6
     49              times 9 db 7
     50 pb_1to6_10x7: db 1, 2, 3, 4, 5, 6
     51              times 10 db 7
; pb_2to6_3x7 aliases the first 8 bytes of pb_2to6_11x7 (same label address).
     52 pb_2to6_3x7:
     53 pb_2to6_11x7: db 2, 3, 4, 5, 6
     54              times 11 db 7
     55 pb_1toE_2xF:  db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15
     56 pb_2toE_3xF:  db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15
     57 pb_13456_3xm1: db 1, 3, 4, 5, 6
     58               times 3 db -1
     59 pb_6012_4xm1: db 6, 0, 1, 2
     60              times 4 db -1
     61 pb_6xm1_246_8toE: times 6 db -1
     62                  db 2, 4, 6, 8, 9, 10, 11, 12, 13, 14
     63 pb_6xm1_BDF_0to6: times 6 db -1
     64                  db 11, 13, 15, 0, 1, 2, 3, 4, 5, 6
     65 pb_02468ACE_13579BDF: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
     66 
     67 pb_15x0_1xm1: times 15 db 0
     68              db -1
     69 pb_0to2_5x3: db 0, 1, 2
     70             times 5 db 3
     71 pb_6xm1_2x0: times 6 db -1
     72             times 2 db 0
     73 pb_6x0_2xm1: times 6 db 0
     74             times 2 db -1
     75 
; Shared constants defined once elsewhere in FFmpeg's x86 constant pool.
     76 cextern pb_1
     77 cextern pb_2
     78 cextern pb_3
     79 cextern pb_15
     80 cextern pw_2
     81 cextern pw_4
     82 cextern pw_8
     83 cextern pw_16
     84 cextern pw_32
     85 cextern pw_255
     86 cextern pw_512
     87 cextern pw_1024
     88 cextern pw_2048
     89 cextern pw_8192
     90 
     91 SECTION .text
     92 
     93 ; dc_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
     94 
; DC prediction, 4x4 and 8x8 (MMX register width).
; dc = round((sum(left) + sum(above)) / (2*N)) splatted over the block.
; psadbw against a zeroed register horizontally sums the packed bytes.
     95 %macro DC_4to8_FUNCS 0
     96 cglobal vp9_ipred_dc_4x4, 4, 4, 0, dst, stride, l, a
     97    movd                    m0, [lq]
     98    punpckldq               m0, [aq]              ; m0 = 4 left | 4 above bytes
     99    pxor                    m1, m1
    100    psadbw                  m0, m1                ; word sum of the 8 bytes
    101 %if cpuflag(ssse3)
    102    pmulhrsw                m0, [pw_4096]         ; (sum+4)>>3, i.e. round(sum/8)
    103    pshufb                  m0, m1                ; broadcast low byte
    104 %else
    105    paddw                   m0, [pw_4]
    106    psraw                   m0, 3                 ; (sum+4)>>3
    107    punpcklbw               m0, m0
    108    pshufw                  m0, m0, q0000         ; broadcast dc to all bytes
    109 %endif
    110    movd      [dstq+strideq*0], m0
    111    movd      [dstq+strideq*1], m0
    112    lea                   dstq, [dstq+strideq*2]
    113    movd      [dstq+strideq*0], m0
    114    movd      [dstq+strideq*1], m0
    115    RET
    116 
    117 cglobal vp9_ipred_dc_8x8, 4, 4, 0, dst, stride, l, a
    118    movq                    m0, [lq]
    119    movq                    m1, [aq]
    120    DEFINE_ARGS dst, stride, stride3
    121    lea               stride3q, [strideq*3]
    122    pxor                    m2, m2
    123    psadbw                  m0, m2                ; sum of 8 left pixels
    124    psadbw                  m1, m2                ; sum of 8 above pixels
    125    paddw                   m0, m1
    126 %if cpuflag(ssse3)
    127    pmulhrsw                m0, [pw_2048]         ; (sum+8)>>4
    128    pshufb                  m0, m2
    129 %else
    130    paddw                   m0, [pw_8]
    131    psraw                   m0, 4                 ; (sum+8)>>4
    132    punpcklbw               m0, m0
    133    pshufw                  m0, m0, q0000
    134 %endif
    135    movq      [dstq+strideq*0], m0
    136    movq      [dstq+strideq*1], m0
    137    movq      [dstq+strideq*2], m0
    138    movq      [dstq+stride3q ], m0
    139    lea                   dstq, [dstq+strideq*4]
    140    movq      [dstq+strideq*0], m0
    141    movq      [dstq+strideq*1], m0
    142    movq      [dstq+strideq*2], m0
    143    movq      [dstq+stride3q ], m0
    144    RET
    145 %endmacro
    146 
; Instantiate once per instruction set; cpuflag() selects the code paths above.
    147 INIT_MMX mmxext
    148 DC_4to8_FUNCS
    149 INIT_MMX ssse3
    150 DC_4to8_FUNCS
    151 
; DC prediction, 16x16 and 32x32 (XMM width).
; Same scheme as DC_4to8_FUNCS, plus a movhlps+paddw step to fold the two
; 64-bit psadbw partial sums of an xmm register into one total.
    152 %macro DC_16to32_FUNCS 0
    153 cglobal vp9_ipred_dc_16x16, 4, 4, 3, dst, stride, l, a
    154    mova                    m0, [lq]
    155    mova                    m1, [aq]
    156    DEFINE_ARGS dst, stride, stride3, cnt
    157    lea               stride3q, [strideq*3]
    158    pxor                    m2, m2
    159    psadbw                  m0, m2
    160    psadbw                  m1, m2
    161    paddw                   m0, m1
    162    movhlps                 m1, m0                ; fold high qword sum into low
    163    paddw                   m0, m1
    164 %if cpuflag(ssse3)
    165    pmulhrsw                m0, [pw_1024]         ; (sum+16)>>5
    166    pshufb                  m0, m2
    167 %else
    168    paddw                   m0, [pw_16]
    169    psraw                   m0, 5                 ; (sum+16)>>5
    170    punpcklbw               m0, m0
    171    pshuflw                 m0, m0, q0000
    172    punpcklqdq              m0, m0                ; broadcast dc to 16 bytes
    173 %endif
    174    mov                   cntd, 4
    175 .loop:
    176    mova      [dstq+strideq*0], m0
    177    mova      [dstq+strideq*1], m0
    178    mova      [dstq+strideq*2], m0
    179    mova      [dstq+stride3q ], m0
    180    lea                   dstq, [dstq+strideq*4]
    181    dec                   cntd
    182    jg .loop                                      ; 4 iterations x 4 rows
    183    RET
    184 
    185 cglobal vp9_ipred_dc_32x32, 4, 4, 5, dst, stride, l, a
    186    mova                    m0, [lq]
    187    mova                    m1, [lq+16]
    188    mova                    m2, [aq]
    189    mova                    m3, [aq+16]
    190    DEFINE_ARGS dst, stride, stride3, cnt
    191    lea               stride3q, [strideq*3]
    192    pxor                    m4, m4
    193    psadbw                  m0, m4
    194    psadbw                  m1, m4
    195    psadbw                  m2, m4
    196    psadbw                  m3, m4
    197    paddw                   m0, m1
    198    paddw                   m2, m3
    199    paddw                   m0, m2                ; sum of all 64 edge pixels
    200    movhlps                 m1, m0
    201    paddw                   m0, m1
    202 %if cpuflag(ssse3)
    203    pmulhrsw                m0, [pw_512]          ; (sum+32)>>6
    204    pshufb                  m0, m4
    205 %else
    206    paddw                   m0, [pw_32]
    207    psraw                   m0, 6                 ; (sum+32)>>6
    208    punpcklbw               m0, m0
    209    pshuflw                 m0, m0, q0000
    210    punpcklqdq              m0, m0
    211 %endif
    212    mov                   cntd, 8
    213 .loop:
    214    mova   [dstq+strideq*0+ 0], m0
    215    mova   [dstq+strideq*0+16], m0
    216    mova   [dstq+strideq*1+ 0], m0
    217    mova   [dstq+strideq*1+16], m0
    218    mova   [dstq+strideq*2+ 0], m0
    219    mova   [dstq+strideq*2+16], m0
    220    mova   [dstq+stride3q + 0], m0
    221    mova   [dstq+stride3q +16], m0
    222    lea                   dstq, [dstq+strideq*4]
    223    dec                   cntd
    224    jg .loop                                      ; 8 iterations x 4 rows
    225    RET
    226 %endmacro
    227 
    228 INIT_XMM sse2
    229 DC_16to32_FUNCS
    230 INIT_XMM ssse3
    231 DC_16to32_FUNCS
    232 
; AVX2 DC prediction 32x32: whole 32-byte edges loaded in one ymm each;
; vextracti128 + movhlps fold the four psadbw partial sums, vpbroadcastb
; splats the resulting dc byte across the full ymm register.
    233 %if HAVE_AVX2_EXTERNAL
    234 INIT_YMM avx2
    235 cglobal vp9_ipred_dc_32x32, 4, 4, 3, dst, stride, l, a
    236    mova                    m0, [lq]
    237    mova                    m1, [aq]
    238    DEFINE_ARGS dst, stride, stride3, cnt
    239    lea               stride3q, [strideq*3]
    240    pxor                    m2, m2
    241    psadbw                  m0, m2
    242    psadbw                  m1, m2
    243    paddw                   m0, m1
    244    vextracti128           xm1, m0, 1             ; fold upper lane sums
    245    paddw                  xm0, xm1
    246    movhlps                xm1, xm0               ; fold high qword
    247    paddw                  xm0, xm1
    248    pmulhrsw               xm0, [pw_512]          ; (sum+32)>>6
    249    vpbroadcastb            m0, xm0
    250    mov                   cntd, 4
    251 .loop:
    252    mova      [dstq+strideq*0], m0
    253    mova      [dstq+strideq*1], m0
    254    mova      [dstq+strideq*2], m0
    255    mova      [dstq+stride3q ], m0
    256    lea                   dstq, [dstq+strideq*4]
    257    mova      [dstq+strideq*0], m0
    258    mova      [dstq+strideq*1], m0
    259    mova      [dstq+strideq*2], m0
    260    mova      [dstq+stride3q ], m0
    261    lea                   dstq, [dstq+strideq*4]
    262    dec                   cntd
    263    jg .loop                                      ; 4 iterations x 8 rows
    264    RET
    265 %endif
    266 
    267 ; dc_top/left_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
    268 
; One-sided DC (dc_top / dc_left), 4x4 and 8x8: same as the two-sided DC
; but averaging only N edge pixels, so the rounding shift is one smaller.
; %1 = function-name suffix (top/left), %2 = source pointer arg (a/l).
    269 %macro DC_1D_4to8_FUNCS 2 ; dir (top or left), arg (a or l)
    270 cglobal vp9_ipred_dc_%1_4x4, 4, 4, 0, dst, stride, l, a
    271    movd                    m0, [%2q]
    272    pxor                    m1, m1
    273    psadbw                  m0, m1                ; sum of 4 edge pixels
    274 %if cpuflag(ssse3)
    275    pmulhrsw                m0, [pw_8192]         ; (sum+2)>>2
    276    pshufb                  m0, m1
    277 %else
    278    paddw                   m0, [pw_2]
    279    psraw                   m0, 2                 ; (sum+2)>>2
    280    punpcklbw               m0, m0
    281    pshufw                  m0, m0, q0000
    282 %endif
    283    movd      [dstq+strideq*0], m0
    284    movd      [dstq+strideq*1], m0
    285    lea                   dstq, [dstq+strideq*2]
    286    movd      [dstq+strideq*0], m0
    287    movd      [dstq+strideq*1], m0
    288    RET
    289 
    290 cglobal vp9_ipred_dc_%1_8x8, 4, 4, 0, dst, stride, l, a
    291    movq                    m0, [%2q]
    292    DEFINE_ARGS dst, stride, stride3
    293    lea               stride3q, [strideq*3]
    294    pxor                    m1, m1
    295    psadbw                  m0, m1                ; sum of 8 edge pixels
    296 %if cpuflag(ssse3)
    297    pmulhrsw                m0, [pw_4096]         ; (sum+4)>>3
    298    pshufb                  m0, m1
    299 %else
    300    paddw                   m0, [pw_4]
    301    psraw                   m0, 3                 ; (sum+4)>>3
    302    punpcklbw               m0, m0
    303    pshufw                  m0, m0, q0000
    304 %endif
    305    movq      [dstq+strideq*0], m0
    306    movq      [dstq+strideq*1], m0
    307    movq      [dstq+strideq*2], m0
    308    movq      [dstq+stride3q ], m0
    309    lea                   dstq, [dstq+strideq*4]
    310    movq      [dstq+strideq*0], m0
    311    movq      [dstq+strideq*1], m0
    312    movq      [dstq+strideq*2], m0
    313    movq      [dstq+stride3q ], m0
    314    RET
    315 %endmacro
    316 
    317 INIT_MMX mmxext
    318 DC_1D_4to8_FUNCS top,  a
    319 DC_1D_4to8_FUNCS left, l
    320 INIT_MMX ssse3
    321 DC_1D_4to8_FUNCS top,  a
    322 DC_1D_4to8_FUNCS left, l
    323 
; One-sided DC (dc_top / dc_left), 16x16 and 32x32 (XMM width).
; %1 = function-name suffix (top/left), %2 = source pointer arg (a/l).
    324 %macro DC_1D_16to32_FUNCS 2; dir (top or left), arg (a or l)
    325 cglobal vp9_ipred_dc_%1_16x16, 4, 4, 3, dst, stride, l, a
    326    mova                    m0, [%2q]
    327    DEFINE_ARGS dst, stride, stride3, cnt
    328    lea               stride3q, [strideq*3]
    329    pxor                    m2, m2
    330    psadbw                  m0, m2                ; per-qword byte sums
    331    movhlps                 m1, m0
    332    paddw                   m0, m1                ; total of 16 edge pixels
    333 %if cpuflag(ssse3)
    334    pmulhrsw                m0, [pw_2048]         ; (sum+8)>>4
    335    pshufb                  m0, m2
    336 %else
    337    paddw                   m0, [pw_8]
    338    psraw                   m0, 4                 ; (sum+8)>>4
    339    punpcklbw               m0, m0
    340    pshuflw                 m0, m0, q0000
    341    punpcklqdq              m0, m0
    342 %endif
    343    mov                   cntd, 4
    344 .loop:
    345    mova      [dstq+strideq*0], m0
    346    mova      [dstq+strideq*1], m0
    347    mova      [dstq+strideq*2], m0
    348    mova      [dstq+stride3q ], m0
    349    lea                   dstq, [dstq+strideq*4]
    350    dec                   cntd
    351    jg .loop
    352    RET
    353 
    354 cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
    355    mova                    m0, [%2q]
    356    mova                    m1, [%2q+16]
    357    DEFINE_ARGS dst, stride, stride3, cnt
    358    lea               stride3q, [strideq*3]
    359    pxor                    m2, m2
    360    psadbw                  m0, m2
    361    psadbw                  m1, m2
    362    paddw                   m0, m1
    363    movhlps                 m1, m0
    364    paddw                   m0, m1                ; total of 32 edge pixels
    365 %if cpuflag(ssse3)
    366    pmulhrsw                m0, [pw_1024]         ; (sum+16)>>5
    367    pshufb                  m0, m2
    368 %else
    369    paddw                   m0, [pw_16]
    370    psraw                   m0, 5                 ; (sum+16)>>5
    371    punpcklbw               m0, m0
    372    pshuflw                 m0, m0, q0000
    373    punpcklqdq              m0, m0
    374 %endif
    375    mov                   cntd, 8
    376 .loop:
    377    mova   [dstq+strideq*0+ 0], m0
    378    mova   [dstq+strideq*0+16], m0
    379    mova   [dstq+strideq*1+ 0], m0
    380    mova   [dstq+strideq*1+16], m0
    381    mova   [dstq+strideq*2+ 0], m0
    382    mova   [dstq+strideq*2+16], m0
    383    mova   [dstq+stride3q + 0], m0
    384    mova   [dstq+stride3q +16], m0
    385    lea                   dstq, [dstq+strideq*4]
    386    dec                   cntd
    387    jg .loop
    388    RET
    389 %endmacro
    390 
    391 INIT_XMM sse2
    392 DC_1D_16to32_FUNCS top,  a
    393 DC_1D_16to32_FUNCS left, l
    394 INIT_XMM ssse3
    395 DC_1D_16to32_FUNCS top,  a
    396 DC_1D_16to32_FUNCS left, l
    397 
; One-sided DC 32x32, AVX2: full 32-byte edge in one ymm; fold lane/qword
; partial sums, round, and broadcast the dc byte with vpbroadcastb.
; %1 = function-name suffix (top/left), %2 = source pointer arg (a/l).
    398 %macro DC_1D_AVX2_FUNCS 2 ; dir (top or left), arg (a or l)
    399 %if HAVE_AVX2_EXTERNAL
    400 cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
    401    mova                    m0, [%2q]
    402    DEFINE_ARGS dst, stride, stride3, cnt
    403    lea               stride3q, [strideq*3]
    404    pxor                    m2, m2
    405    psadbw                  m0, m2
    406    vextracti128           xm1, m0, 1             ; fold upper lane
    407    paddw                  xm0, xm1
    408    movhlps                xm1, xm0               ; fold high qword
    409    paddw                  xm0, xm1
    410    pmulhrsw               xm0, [pw_1024]         ; (sum+16)>>5
    411    vpbroadcastb            m0, xm0
    412    mov                   cntd, 4
    413 .loop:
    414    mova      [dstq+strideq*0], m0
    415    mova      [dstq+strideq*1], m0
    416    mova      [dstq+strideq*2], m0
    417    mova      [dstq+stride3q ], m0
    418    lea                   dstq, [dstq+strideq*4]
    419    mova      [dstq+strideq*0], m0
    420    mova      [dstq+strideq*1], m0
    421    mova      [dstq+strideq*2], m0
    422    mova      [dstq+stride3q ], m0
    423    lea                   dstq, [dstq+strideq*4]
    424    dec                   cntd
    425    jg .loop                                      ; 4 iterations x 8 rows
    426    RET
    427 %endif
    428 %endmacro
    429 
    430 INIT_YMM avx2
    431 DC_1D_AVX2_FUNCS top,  a
    432 DC_1D_AVX2_FUNCS left, l
    433 
    434 ; v
    435 
; Vertical prediction 8x8: every row is a copy of the 8 above-pixels.
    436 INIT_MMX mmx
    437 cglobal vp9_ipred_v_8x8, 4, 4, 0, dst, stride, l, a
    438    movq                    m0, [aq]              ; the 8 above pixels
    439    DEFINE_ARGS dst, stride, stride3
    440    lea               stride3q, [strideq*3]
    441    movq      [dstq+strideq*0], m0
    442    movq      [dstq+strideq*1], m0
    443    movq      [dstq+strideq*2], m0
    444    movq      [dstq+stride3q ], m0
    445    lea                   dstq, [dstq+strideq*4]
    446    movq      [dstq+strideq*0], m0
    447    movq      [dstq+strideq*1], m0
    448    movq      [dstq+strideq*2], m0
    449    movq      [dstq+stride3q ], m0
    450    RET
    451 
; Vertical prediction 16x16: replicate the 16 above-pixels down 16 rows.
    452 INIT_XMM sse
    453 cglobal vp9_ipred_v_16x16, 4, 4, 1, dst, stride, l, a
    454    mova                    m0, [aq]              ; the 16 above pixels
    455    DEFINE_ARGS dst, stride, stride3, cnt
    456    lea               stride3q, [strideq*3]
    457    mov                   cntd, 4
    458 .loop:
    459    mova      [dstq+strideq*0], m0
    460    mova      [dstq+strideq*1], m0
    461    mova      [dstq+strideq*2], m0
    462    mova      [dstq+stride3q ], m0
    463    lea                   dstq, [dstq+strideq*4]
    464    dec                   cntd
    465    jg .loop                                      ; 4 iterations x 4 rows
    466    RET
    467 
; Vertical prediction 32x32 (SSE): two xmm registers hold the 32 above-pixels.
    468 INIT_XMM sse
    469 cglobal vp9_ipred_v_32x32, 4, 4, 2, dst, stride, l, a
    470    mova                    m0, [aq]              ; above pixels 0..15
    471    mova                    m1, [aq+16]           ; above pixels 16..31
    472    DEFINE_ARGS dst, stride, stride3, cnt
    473    lea               stride3q, [strideq*3]
    474    mov                   cntd, 8
    475 .loop:
    476    mova   [dstq+strideq*0+ 0], m0
    477    mova   [dstq+strideq*0+16], m1
    478    mova   [dstq+strideq*1+ 0], m0
    479    mova   [dstq+strideq*1+16], m1
    480    mova   [dstq+strideq*2+ 0], m0
    481    mova   [dstq+strideq*2+16], m1
    482    mova   [dstq+stride3q + 0], m0
    483    mova   [dstq+stride3q +16], m1
    484    lea                   dstq, [dstq+strideq*4]
    485    dec                   cntd
    486    jg .loop                                      ; 8 iterations x 4 rows
    487    RET
    488 
; Vertical prediction 32x32 (AVX): one ymm load covers the whole 32-pixel row.
    489 INIT_YMM avx
    490 cglobal vp9_ipred_v_32x32, 4, 4, 1, dst, stride, l, a
    491    mova                    m0, [aq]              ; the 32 above pixels
    492    DEFINE_ARGS dst, stride, stride3, cnt
    493    lea               stride3q, [strideq*3]
    494    mov                   cntd, 4
    495 .loop:
    496    mova      [dstq+strideq*0], m0
    497    mova      [dstq+strideq*1], m0
    498    mova      [dstq+strideq*2], m0
    499    mova      [dstq+stride3q ], m0
    500    lea                   dstq, [dstq+strideq*4]
    501    mova      [dstq+strideq*0], m0
    502    mova      [dstq+strideq*1], m0
    503    mova      [dstq+strideq*2], m0
    504    mova      [dstq+stride3q ], m0
    505    lea                   dstq, [dstq+strideq*4]
    506    dec                   cntd
    507    jg .loop                                      ; 4 iterations x 8 rows
    508    RET
    509 
    510 ; h
    511 
; Horizontal prediction: each output row is its left-edge pixel broadcast
; across the row. Rows are produced 4 at a time, walking the left edge
; bottom-up (cnt counts groups of 4 rows, down to 0; jge runs cnt+1 times).
; %1/%2 = xmm register counts declared to cglobal for the 8x8 / 16x16+32x32
; variants (differ per instruction set at the instantiations below).
    512 %macro H_XMM_FUNCS 2
    513 %if notcpuflag(avx)
    514 cglobal vp9_ipred_h_4x4, 3, 4, 1, dst, stride, l, stride3
    515    movd                    m0, [lq]              ; 4 left pixels
    516 %if cpuflag(ssse3)
    517    pshufb                  m0, [pb_4x3_4x2_4x1_4x0] ; byte i -> 4 copies, reversed
    518 %else
    519    punpcklbw               m0, m0
    520    pshuflw                 m0, m0, q0123
    521    punpcklwd               m0, m0
    522 %endif
    523    lea               stride3q, [strideq*3]
    524    movd      [dstq+strideq*0], m0
    525    psrldq                  m0, 4                 ; shift next row into low dword
    526    movd      [dstq+strideq*1], m0
    527    psrldq                  m0, 4
    528    movd      [dstq+strideq*2], m0
    529    psrldq                  m0, 4
    530    movd      [dstq+stride3q ], m0
    531    RET
    532 %endif
    533 
    534 cglobal vp9_ipred_h_8x8, 3, 5, %1, dst, stride, l, stride3, cnt
    535 %if cpuflag(ssse3)
    536    mova                    m2, [pb_8x1_8x0]
    537    mova                    m3, [pb_8x3_8x2]
    538 %endif
    539    lea               stride3q, [strideq*3]
    540    mov                   cntq, 1
    541 .loop:
    542    movd                    m0, [lq+cntq*4]       ; 4 left pixels for this group
    543 %if cpuflag(ssse3)
    544    pshufb                  m1, m0, m3            ; rows 3,2 broadcast
    545    pshufb                  m0, m2                ; rows 1,0 broadcast
    546 %else
    547    punpcklbw               m0, m0
    548    punpcklwd               m0, m0
    549    pshufd                  m1, m0, q2233
    550    pshufd                  m0, m0, q0011
    551 %endif
    552    movq      [dstq+strideq*0], m1
    553    movhps    [dstq+strideq*1], m1
    554    movq      [dstq+strideq*2], m0
    555    movhps    [dstq+stride3q ], m0
    556    lea                   dstq, [dstq+strideq*4]
    557    dec                   cntq
    558    jge .loop
    559    RET
    560 
    561 cglobal vp9_ipred_h_16x16, 3, 5, %2, dst, stride, l, stride3, cnt
    562 %if cpuflag(ssse3)
    563    mova                    m5, [pb_1]
    564    mova                    m6, [pb_2]
    565    mova                    m7, [pb_3]
    566    pxor                    m4, m4                ; pshufb mask 0 = broadcast byte 0
    567 %endif
    568    lea               stride3q, [strideq*3]
    569    mov                   cntq, 3
    570 .loop:
    571    movd                    m3, [lq+cntq*4]
    572 %if cpuflag(ssse3)
    573    pshufb                  m0, m3, m7            ; broadcast byte 3
    574    pshufb                  m1, m3, m6            ; broadcast byte 2
    575 %else
    576    punpcklbw               m3, m3
    577    punpcklwd               m3, m3
    578    pshufd                  m0, m3, q3333
    579    pshufd                  m1, m3, q2222
    580 %endif
    581    mova      [dstq+strideq*0], m0
    582    mova      [dstq+strideq*1], m1
    583 %if cpuflag(ssse3)
    584    pshufb                  m2, m3, m5            ; broadcast byte 1
    585    pshufb                  m3, m4                ; broadcast byte 0
    586 %else
    587    pshufd                  m2, m3, q1111
    588    pshufd                  m3, m3, q0000
    589 %endif
    590    mova      [dstq+strideq*2], m2
    591    mova      [dstq+stride3q ], m3
    592    lea                   dstq, [dstq+strideq*4]
    593    dec                   cntq
    594    jge .loop
    595    RET
    596 
    597 cglobal vp9_ipred_h_32x32, 3, 5, %2, dst, stride, l, stride3, cnt
    598 %if cpuflag(ssse3)
    599    mova                    m5, [pb_1]
    600    mova                    m6, [pb_2]
    601    mova                    m7, [pb_3]
    602    pxor                    m4, m4
    603 %endif
    604    lea               stride3q, [strideq*3]
    605    mov                   cntq, 7
    606 .loop:
    607    movd                    m3, [lq+cntq*4]
    608 %if cpuflag(ssse3)
    609    pshufb                  m0, m3, m7
    610    pshufb                  m1, m3, m6
    611 %else
    612    punpcklbw               m3, m3
    613    punpcklwd               m3, m3
    614    pshufd                  m0, m3, q3333
    615    pshufd                  m1, m3, q2222
    616 %endif
    617    mova   [dstq+strideq*0+ 0], m0                 ; 32-wide row = 2 stores
    618    mova   [dstq+strideq*0+16], m0
    619    mova   [dstq+strideq*1+ 0], m1
    620    mova   [dstq+strideq*1+16], m1
    621 %if cpuflag(ssse3)
    622    pshufb                  m2, m3, m5
    623    pshufb                  m3, m4
    624 %else
    625    pshufd                  m2, m3, q1111
    626    pshufd                  m3, m3, q0000
    627 %endif
    628    mova   [dstq+strideq*2+ 0], m2
    629    mova   [dstq+strideq*2+16], m2
    630    mova   [dstq+stride3q + 0], m3
    631    mova   [dstq+stride3q +16], m3
    632    lea                   dstq, [dstq+strideq*4]
    633    dec                   cntq
    634    jge .loop
    635    RET
    636 %endmacro
    637 
    638 INIT_XMM sse2
    639 H_XMM_FUNCS 2, 4
    640 INIT_XMM ssse3
    641 H_XMM_FUNCS 4, 8
    642 INIT_XMM avx
    643 H_XMM_FUNCS 4, 8
    644 
; AVX2 horizontal 32x32: vinserti128 duplicates the 4 left pixels into both
; ymm lanes so each pshufb broadcast fills a full 32-byte row in one store.
; Left edge is walked bottom-up, 4 rows per iteration (cnt 7..0, jge).
    645 %if HAVE_AVX2_EXTERNAL
    646 INIT_YMM avx2
    647 cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt
    648    mova                    m5, [pb_1]
    649    mova                    m6, [pb_2]
    650    mova                    m7, [pb_3]
    651    pxor                    m4, m4                ; mask 0 = broadcast byte 0
    652    lea               stride3q, [strideq*3]
    653    mov                   cntq, 7
    654 .loop:
    655    movd                   xm3, [lq+cntq*4]
    656    vinserti128             m3, m3, xm3, 1        ; mirror into upper lane
    657    pshufb                  m0, m3, m7            ; broadcast byte 3
    658    pshufb                  m1, m3, m6            ; broadcast byte 2
    659    mova      [dstq+strideq*0], m0
    660    mova      [dstq+strideq*1], m1
    661    pshufb                  m2, m3, m5            ; broadcast byte 1
    662    pshufb                  m3, m4                ; broadcast byte 0
    663    mova      [dstq+strideq*2], m2
    664    mova      [dstq+stride3q ], m3
    665    lea                   dstq, [dstq+strideq*4]
    666    dec                   cntq
    667    jge .loop
    668    RET
    669 %endif
    670 
    671 ; tm
    672 
; TrueMotion prediction 4x4: dst[y][x] = clip(above[x] + left[y] - topleft).
; above - topleft is precomputed in words; each iteration adds two left
; pixels and packuswb saturates back to bytes. pw_m256/pw_m255 double as
; pshufb masks that broadcast a loaded word's low byte into each word lane
; (high byte of -256/-255 is negative -> zero/byte0 per pshufb semantics).
    673 %macro TM_MMX_FUNCS 0
    674 cglobal vp9_ipred_tm_4x4, 4, 4, 0, dst, stride, l, a
    675    pxor                    m1, m1
    676    movd                    m0, [aq]              ; 4 above pixels
    677    pinsrw                  m2, [aq-1], 0         ; topleft in low word
    678    punpcklbw               m0, m1                ; above -> words
    679    DEFINE_ARGS dst, stride, l, cnt
    680 %if cpuflag(ssse3)
    681    mova                    m3, [pw_m256]
    682    mova                    m1, [pw_m255]
    683    pshufb                  m2, m3                ; broadcast topleft as words
    684 %else
    685    punpcklbw               m2, m1
    686    pshufw                  m2, m2, q0000
    687 %endif
    688    psubw                   m0, m2                ; above - topleft
    689    mov                   cntq, 1
    690 .loop:
    691    pinsrw                  m2, [lq+cntq*2], 0    ; 2 left pixels (bottom-up)
    692 %if cpuflag(ssse3)
    693    pshufb                  m4, m2, m1            ; broadcast left[2*cnt+1]
    694    pshufb                  m2, m3                ; broadcast left[2*cnt]
    695 %else
    696    punpcklbw               m2, m1
    697    pshufw                  m4, m2, q1111
    698    pshufw                  m2, m2, q0000
    699 %endif
    700    paddw                   m4, m0
    701    paddw                   m2, m0
    702    packuswb                m4, m4                ; saturating clip to [0,255]
    703    packuswb                m2, m2
    704    movd      [dstq+strideq*0], m4
    705    movd      [dstq+strideq*1], m2
    706    lea                   dstq, [dstq+strideq*2]
    707    dec                   cntq
    708    jge .loop                                     ; 2 iterations x 2 rows
    709    RET
    710 %endmacro
    711 
    712 INIT_MMX mmxext
    713 TM_MMX_FUNCS
    714 INIT_MMX ssse3
    715 TM_MMX_FUNCS
    716 
    717 %macro TM_XMM_FUNCS 0
    718 cglobal vp9_ipred_tm_8x8, 4, 4, 5, dst, stride, l, a
    719    pxor                    m1, m1
    720    movh                    m0, [aq]
    721    pinsrw                  m2, [aq-1], 0
    722    punpcklbw               m0, m1
    723    DEFINE_ARGS dst, stride, l, cnt
    724 %if cpuflag(ssse3)
    725    mova                    m3, [pw_m256]
    726    mova                    m1, [pw_m255]
    727    pshufb                  m2, m3
    728 %else
    729    punpcklbw               m2, m1
    730    punpcklwd               m2, m2
    731    pshufd                  m2, m2, q0000
    732 %endif
    733    psubw                   m0, m2
    734    mov                   cntq, 3
    735 .loop:
    736    pinsrw                  m2, [lq+cntq*2], 0
    737 %if cpuflag(ssse3)
    738    pshufb                  m4, m2, m1
    739    pshufb                  m2, m3
    740 %else
    741    punpcklbw               m2, m1
    742    punpcklwd               m2, m2
    743    pshufd                  m4, m2, q1111
    744    pshufd                  m2, m2, q0000
    745 %endif
    746    paddw                   m4, m0
    747    paddw                   m2, m0
    748    packuswb                m4, m2
    749    movh      [dstq+strideq*0], m4
    750    movhps    [dstq+strideq*1], m4
    751    lea                   dstq, [dstq+strideq*2]
    752    dec                   cntq
    753    jge .loop
    754    RET
    755 
    756 cglobal vp9_ipred_tm_16x16, 4, 4, 8, dst, stride, l, a
    757    pxor                    m3, m3
    758    mova                    m0, [aq]
    759    pinsrw                  m2, [aq-1], 0
    760    punpckhbw               m1, m0, m3
    761    punpcklbw               m0, m3
    762    DEFINE_ARGS dst, stride, l, cnt
    763 %if cpuflag(ssse3)
    764    mova                    m4, [pw_m256]
    765    mova                    m3, [pw_m255]
    766    pshufb                  m2, m4
    767 %else
    768    punpcklbw               m2, m3
    769    punpcklwd               m2, m2
    770    pshufd                  m2, m2, q0000
    771 %endif
    772    psubw                   m1, m2
    773    psubw                   m0, m2
    774    mov                   cntq, 7
    775 .loop:
    776    pinsrw                  m7, [lq+cntq*2], 0
    777 %if cpuflag(ssse3)
    778    pshufb                  m5, m7, m3
    779    pshufb                  m7, m4
    780 %else
    781    punpcklbw               m7, m3
    782    punpcklwd               m7, m7
    783    pshufd                  m5, m7, q1111
    784    pshufd                  m7, m7, q0000
    785 %endif
    786    paddw                   m2, m5, m0
    787    paddw                   m5, m1
    788    paddw                   m6, m7, m0
    789    paddw                   m7, m1
    790    packuswb                m2, m5
    791    packuswb                m6, m7
    792    mova      [dstq+strideq*0], m2
    793    mova      [dstq+strideq*1], m6
    794    lea                   dstq, [dstq+strideq*2]
    795    dec                   cntq
    796    jge .loop
    797    RET
    798 
%if ARCH_X86_64
%define mem 0
%else
; x86-32 lacks xmm8-13, so reserve 64 bytes of stack scratch to hold the
; four precomputed 8-word (top - topleft) vectors instead.
%define mem 64
%endif
; TrueMotion 32x32: dst[y][x] = clip_u8(left[y] + top[x] - topleft).
; (top[x] - topleft) is widened to words and computed once up front; each
; loop iteration broadcasts two left pixels and adds them back in, then
; packuswb performs the clip by unsigned saturation.
cglobal vp9_ipred_tm_32x32, 4, 4, 14, mem, dst, stride, l, a
    pxor                    m5, m5
    pinsrw                  m4, [aq-1], 0           ; word 0 = topleft byte (plus the byte after it)
    mova                    m0, [aq]                ; top[0..15]
    mova                    m2, [aq+16]             ; top[16..31]
    DEFINE_ARGS dst, stride, l, cnt
%if cpuflag(ssse3)
%if ARCH_X86_64
    mova                   m12, [pw_m256]
    mova                   m13, [pw_m255]
%define pw_m256_reg m12
%define pw_m255_reg m13
%else
%define pw_m256_reg [pw_m256]
%define pw_m255_reg [pw_m255]
%endif
    ; pw_m256 words are 0xFF00 -> pshufb indices {0, -1}: replicate source
    ; byte 0 into every word lane (high byte zeroed). pw_m255 (0xFF01)
    ; replicates byte 1 the same way.
    pshufb                  m4, pw_m256_reg
%else
    punpcklbw               m4, m5
    punpcklwd               m4, m4
    pshufd                  m4, m4, q0000           ; broadcast topleft as words
%endif
    punpckhbw               m1, m0,  m5
    punpckhbw               m3, m2,  m5
    punpcklbw               m0, m5
    punpcklbw               m2, m5
    psubw                   m1, m4                  ; m0..m3 = top[x] - topleft (words)
    psubw                   m0, m4
    psubw                   m3, m4
    psubw                   m2, m4
%if ARCH_X86_64
    SWAP                     0, 8
    SWAP                     1, 9
    SWAP                     2, 10
    SWAP                     3, 11
%else
    mova            [rsp+0*16], m0                  ; spill to the 64-byte scratch
    mova            [rsp+1*16], m1
    mova            [rsp+2*16], m2
    mova            [rsp+3*16], m3
%endif
    mov                   cntq, 15
.loop:
    ; Two rows per iteration: the word at l[cnt*2] carries two left pixels,
    ; one broadcast via pw_m255 (byte 1), one via pw_m256 (byte 0).
    ; NOTE(review): l is consumed from the end (cnt*2 counts down), so the
    ; caller appears to store the left column in reverse order -- confirm
    ; against the C-level wrapper before relying on this.
    pinsrw                  m3, [lq+cntq*2], 0
%if cpuflag(ssse3)
    pshufb                  m7, m3, pw_m255_reg     ; broadcast left byte 1
    pshufb                  m3, pw_m256_reg         ; broadcast left byte 0
%else
    pxor                    m7, m7
    punpcklbw               m3, m7
    punpcklwd               m3, m3
    pshufd                  m7, m3, q1111           ; broadcast left byte 1
    pshufd                  m3, m3, q0000           ; broadcast left byte 0
%endif
%if ARCH_X86_64
    paddw                   m4, m7, m8
    paddw                   m5, m7, m9
    paddw                   m6, m7, m10
    paddw                   m7, m11
    paddw                   m0, m3, m8
    paddw                   m1, m3, m9
    paddw                   m2, m3, m10
    paddw                   m3, m11
%else
    paddw                   m4, m7, [rsp+0*16]
    paddw                   m5, m7, [rsp+1*16]
    paddw                   m6, m7, [rsp+2*16]
    paddw                   m7, [rsp+3*16]
    paddw                   m0, m3, [rsp+0*16]
    paddw                   m1, m3, [rsp+1*16]
    paddw                   m2, m3, [rsp+2*16]
    paddw                   m3, [rsp+3*16]
%endif
    packuswb                m4, m5                  ; unsigned saturation = the clip to [0,255]
    packuswb                m6, m7
    packuswb                m0, m1
    packuswb                m2, m3
    mova   [dstq+strideq*0+ 0], m4
    mova   [dstq+strideq*0+16], m6
    mova   [dstq+strideq*1+ 0], m0
    mova   [dstq+strideq*1+16], m2
    lea                   dstq, [dstq+strideq*2]
    dec                   cntq
    jge .loop
    RET
%undef pw_m256_reg
%undef pw_m255_reg
%undef mem
%endmacro
    893 
; Instantiate the TrueMotion XMM predictors once per SIMD generation.
INIT_XMM sse2
TM_XMM_FUNCS
INIT_XMM ssse3
TM_XMM_FUNCS
INIT_XMM avx
TM_XMM_FUNCS
    900 
%if HAVE_AVX2_EXTERNAL
; TrueMotion 32x32, AVX2: same scheme as the XMM version above, but a full
; 32-pixel row fits in one ymm register, so nothing needs spilling.
INIT_YMM avx2
cglobal vp9_ipred_tm_32x32, 4, 4, 8, dst, stride, l, a
    pxor                    m3, m3
    pinsrw                 xm2, [aq-1], 0           ; topleft into word 0
    vinserti128             m2, m2, xm2, 1          ; replicate into both 128-bit lanes
    mova                    m0, [aq]                ; top[0..31]
    DEFINE_ARGS dst, stride, l, cnt
    mova                    m4, [pw_m256]           ; pshufb mask: broadcast byte 0 per word
    mova                    m5, [pw_m255]           ; pshufb mask: broadcast byte 1 per word
    pshufb                  m2, m4                  ; topleft broadcast as words
    punpckhbw               m1, m0, m3
    punpcklbw               m0, m3
    psubw                   m1, m2                  ; top[x] - topleft (words)
    psubw                   m0, m2
    mov                   cntq, 15
.loop:
    ; Two rows per iteration; l[cnt*2] carries two left pixels.
    ; NOTE(review): l is consumed from the end downwards -- the caller
    ; appears to store the left column reversed; confirm in the C wrapper.
    pinsrw                 xm7, [lq+cntq*2], 0
    vinserti128             m7, m7, xm7, 1
    pshufb                  m3, m7, m5              ; broadcast left byte 1
    pshufb                  m7, m4                  ; broadcast left byte 0
    paddw                   m2, m3, m0
    paddw                   m3, m1
    paddw                   m6, m7, m0
    paddw                   m7, m1
    packuswb                m2, m3                  ; saturating pack = clip to u8
    packuswb                m6, m7
    mova      [dstq+strideq*0], m2
    mova      [dstq+strideq*1], m6
    lea                   dstq, [dstq+strideq*2]
    dec                   cntq
    jge .loop
    RET
%endif
    935 
    936 ; dl
    937 
; 3-tap lowpass on unsigned bytes: %1 = (%1 + 2*%2 + %3 + 2) >> 2.
; pavgb(l, r) gives (l+r+1)>>1; subtracting the odd-sum carry (l^r)&1
; first turns that into floor((l+r)/2), and a second pavgb against the
; centre yields the correctly rounded 1-2-1 filter without widening to
; words.
%macro LOWPASS 4 ; left [dst], center, right, tmp
    pxor                   m%4, m%1, m%3           ; (l^r) -- bit 0 is the carry of l+r
    pand                   m%4, [pb_1]
    pavgb                  m%1, m%3                ; (l+r+1)>>1
    psubusb                m%1, m%4                ; -> floor((l+r)/2)
    pavgb                  m%1, m%2                ; (floor((l+r)/2)+c+1)>>1 = (l+2c+r+2)>>2
%endmacro
    945 
%macro DL_MMX_FUNCS 0
; Diagonal-down-left 4x4: uses only the 8 above pixels. The filtered
; diagonal is built once; each subsequent row is the previous one shifted
; left by a pixel, with the last above pixel replicated past the end.
cglobal vp9_ipred_dl_4x4, 4, 4, 0, dst, stride, l, a
    movq                    m1, [aq]
%if cpuflag(ssse3)
    pshufb                  m0, m1, [pb_0to5_2x7]   ; 01234577
    pshufb                  m2, m1, [pb_2to6_3x7]   ; 23456777
%else
    punpckhbw               m3, m1, m1              ; 44556677
    pand                    m0, m1, [pb_6xm1_2x0]   ; 012345__
    pand                    m3, [pb_6x0_2xm1]       ; ______77
    psrlq                   m2, m1, 16              ; 234567__
    por                     m0, m3                  ; 01234577
    por                     m2, m3                  ; 23456777
%endif
    psrlq                   m1, 8                   ; 1234567_ (centre taps)
    LOWPASS                  0, 1, 2, 3             ; m0 = filtered diagonal

    pshufw                  m1, m0, q3321           ; m1 = diagonal advanced 2 pixels, last word held
    movd      [dstq+strideq*0], m0
    movd      [dstq+strideq*2], m1
    psrlq                   m0, 8                   ; advance one pixel for rows 1/3
    psrlq                   m1, 8
    add                   dstq, strideq
    movd      [dstq+strideq*0], m0
    movd      [dstq+strideq*2], m1
    RET
%endmacro
    973 
; Instantiate the 4x4 dl predictor for MMXEXT and SSSE3.
INIT_MMX mmxext
DL_MMX_FUNCS
INIT_MMX ssse3
DL_MMX_FUNCS
    978 
%macro DL_XMM_FUNCS 0
; Diagonal-down-left 8x8: the 8 above pixels are extended by replicating
; the last one, lowpass-filtered, then each output row is the filtered
; vector advanced one byte.
cglobal vp9_ipred_dl_8x8, 4, 4, 4, dst, stride, stride5, a
    movq                    m0, [aq]
    lea               stride5q, [strideq*5]
%if cpuflag(ssse3)
    pshufb                  m1, m0, [pb_1to6_10x7]  ; a[1..6] then a[7] replicated
%else
    punpcklbw               m1, m0, m0              ; 0011223344556677
    punpckhwd               m1, m1                  ; 4x4,4x5,4x6,4x7
%endif
    shufps                  m0, m1, q3310
%if notcpuflag(ssse3)
    psrldq                  m1, m0, 1
    shufps                  m1, m0, q3210
%endif
    psrldq                  m2, m1, 1               ; right taps: shifted one more byte
    LOWPASS                  0, 1, 2, 3             ; m0 = filtered, edge-replicated diagonal

    pshufd                  m1, m0, q3321           ; m1 = diagonal advanced 4 pixels (rows 4..7)
    movq      [dstq+strideq*0], m0
    movq      [dstq+strideq*4], m1
    psrldq                  m0, 1                   ; advance one pixel per row
    psrldq                  m1, 1
    movq      [dstq+strideq*1], m0
    movq      [dstq+stride5q ], m1
    lea                   dstq, [dstq+strideq*2]
    psrldq                  m0, 1
    psrldq                  m1, 1
    movq      [dstq+strideq*0], m0
    movq      [dstq+strideq*4], m1
    psrldq                  m0, 1
    psrldq                  m1, 1
    movq      [dstq+strideq*1], m0
    movq      [dstq+stride5q ], m1
    RET

; Diagonal-down-left 16x16: rows n and n+8 are written together -- row n+8
; is row n advanced 8 pixels, i.e. the high half of the row-n register
; (movhlps) padded with the replicated last pixel held in m4's high half.
cglobal vp9_ipred_dl_16x16, 4, 4, 6, dst, stride, l, a
    mova                    m0, [aq]
%if cpuflag(ssse3)
    mova                    m5, [pb_1toE_2xF]       ; shift-left-one-with-last-pixel-hold mask
    pshufb                  m1, m0, m5
    pshufb                  m2, m1, m5
    pshufb                  m4, m0, [pb_15]         ; all lanes = last above pixel
%else
    pand                    m5, m0, [pb_15x0_1xm1]      ; _______________F
    psrldq                  m1, m0, 1                   ; 123456789ABCDEF_
    por                     m1, m5                      ; 123456789ABCDEFF
    psrldq                  m2, m1, 1                   ; 23456789ABCDEFF_
    por                     m2, m5                      ; 23456789ABCDEFFF
    pshufhw                 m4, m1, q3333               ; xxxxxxxxFFFFFFFF
%endif
    LOWPASS                  0, 1, 2, 3             ; m0 = filtered diagonal
    DEFINE_ARGS dst, stride, cnt, stride9
    lea               stride9q, [strideq+strideq*8] ; 9 * stride
    mov                   cntd, 4

.loop:
    ; movhlps refreshes only m4's low half; its high half keeps the
    ; replicated-last-pixel padding set up before the loop.
    movhlps                 m4, m0
    mova      [dstq+strideq*0], m0
%if cpuflag(ssse3)
    pshufb                  m0, m5                  ; advance one pixel, hold the edge
%else
    psrldq                  m0, 1
    por                     m0, m5
%endif
    mova      [dstq+strideq*8], m4
    movhlps                 m4, m0
    mova      [dstq+strideq*1], m0
%if cpuflag(ssse3)
    pshufb                  m0, m5
%else
    psrldq                  m0, 1
    por                     m0, m5
%endif
    mova      [dstq+stride9q ], m4
    lea                   dstq, [dstq+strideq*2]
    dec                   cntd
    jg .loop
    RET

; Diagonal-down-left 32x32: m0/m1 hold the filtered 32-pixel diagonal,
; m6/m7 the replicated last pixel. Four rows (n, n+8, n+16, n+24) are
; emitted per iteration from progressively shifted combinations.
cglobal vp9_ipred_dl_32x32, 4, 5, 8, dst, stride, cnt, a, dst16
    mova                    m0, [aq]
    mova                    m1, [aq+16]
    PALIGNR                 m2, m1, m0, 1, m4       ; diagonal shifted 1 across the m0/m1 seam
    PALIGNR                 m3, m1, m0, 2, m4       ; shifted 2
    LOWPASS                  0, 2, 3, 4             ; filter low 16 pixels
%if cpuflag(ssse3)
    mova                    m5, [pb_1toE_2xF]
    pshufb                  m2, m1, m5
    pshufb                  m3, m2, m5
    pshufb                  m6, m1, [pb_15]         ; last pixel replicated
    mova                    m7, m6
%else
    pand                    m5, m1, [pb_15x0_1xm1]      ; _______________F
    psrldq                  m2, m1, 1                   ; 123456789ABCDEF_
    por                     m2, m5                      ; 123456789ABCDEFF
    psrldq                  m3, m2, 1                   ; 23456789ABCDEFF_
    por                     m3, m5                      ; 23456789ABCDEFFF
    pshufhw                 m7, m2, q3333               ; xxxxxxxxFFFFFFFF
    pshufd                  m6, m7, q3333
%endif
    LOWPASS                  1, 2, 3, 4             ; filter high 16 pixels
    lea                 dst16q, [dstq  +strideq*8]
    mov                   cntd, 8
    lea                 dst16q, [dst16q+strideq*8]  ; dst16 = dst + 16 rows
.loop:
    movhlps                 m7, m1                  ; low half <- m1 high; high half keeps padding
    mova [dstq  +strideq*0+ 0], m0
    mova [dstq  +strideq*0+16], m1
    movhps [dstq+strideq*8+ 0], m0                  ; row n+8 = diagonal advanced 8
    movq [dstq  +strideq*8+ 8], m1
    mova [dstq  +strideq*8+16], m7
    mova [dst16q+strideq*0+ 0], m1                  ; row n+16 = advanced 16
    mova [dst16q+strideq*0+16], m6
    mova [dst16q+strideq*8+ 0], m7                  ; row n+24 = advanced 24
    mova [dst16q+strideq*8+16], m6
%if cpuflag(avx)
    vpalignr                m0, m1, m0, 1           ; advance the 32-pixel diagonal one byte
    pshufb                  m1, m5
%elif cpuflag(ssse3)
    palignr                 m2, m1, m0, 1
    pshufb                  m1, m5
    mova                    m0, m2
%else
    mova                    m4, m1
    psrldq                  m0, 1
    pslldq                  m4, 15
    psrldq                  m1, 1
    por                     m0, m4                  ; carry m1's low byte into m0's top
    por                     m1, m5                  ; hold the replicated edge pixel
%endif
    add                   dstq, strideq
    add                 dst16q, strideq
    dec                   cntd
    jg .loop
    RET
%endmacro
   1116 
; Instantiate the dl XMM predictors once per SIMD generation.
INIT_XMM sse2
DL_XMM_FUNCS
INIT_XMM ssse3
DL_XMM_FUNCS
INIT_XMM avx
DL_XMM_FUNCS
   1123 
   1124 ; dr
   1125 
%macro DR_MMX_FUNCS 0
; Diagonal-down-right 4x4: lowpass-filter the combined edge
; (left column, topleft, top row); each row up reads the edge advanced
; one pixel, so rows are emitted bottom-to-top with 8-bit shifts.
cglobal vp9_ipred_dr_4x4, 4, 4, 0, dst, stride, l, a
    movd                    m0, [lq]
    punpckldq               m0, [aq-1]              ; edge = l[0..3], a[-1..2]
    movd                    m1, [aq+3]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    PALIGNR                 m1, m0, 1, m3           ; edge advanced 1 (centre taps)
    psrlq                   m2, m1, 8               ; edge advanced 2 (right taps)
    LOWPASS                  0, 1, 2, 3             ; m0 = filtered edge

    movd      [dstq+stride3q ], m0                  ; bottom row first
    psrlq                   m0, 8
    movd      [dstq+strideq*2], m0
    psrlq                   m0, 8
    movd      [dstq+strideq*1], m0
    psrlq                   m0, 8
    movd      [dstq+strideq*0], m0
    RET
%endmacro
   1146 
; Instantiate the 4x4 dr predictor for MMXEXT and SSSE3.
INIT_MMX mmxext
DR_MMX_FUNCS
INIT_MMX ssse3
DR_MMX_FUNCS
   1151 
%macro DR_XMM_FUNCS 0
; Diagonal-down-right 8x8: filtered edge (left, topleft, top) lives in
; m0's high half; each successive row shifts the edge one byte left so
; movhps always stores the current 8 output pixels.
cglobal vp9_ipred_dr_8x8, 4, 4, 4, dst, stride, l, a
    movq                    m1, [lq]
    movhps                  m1, [aq-1]              ; m1 = l[0..7], a[-1..6]
    movd                    m2, [aq+7]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pslldq                  m0, m1, 1               ; edge retarded 1 (left taps)
    PALIGNR                 m2, m1, 1, m3           ; edge advanced 1 (right taps)
    LOWPASS                  0, 1, 2, 3             ; m0 = filtered edge

    movhps    [dstq+strideq*0], m0
    pslldq                  m0, 1                   ; advance one edge pixel per row
    movhps    [dstq+strideq*1], m0
    pslldq                  m0, 1
    movhps    [dstq+strideq*2], m0
    pslldq                  m0, 1
    movhps    [dstq+stride3q ], m0
    pslldq                  m0, 1
    lea                   dstq, [dstq+strideq*4]
    movhps    [dstq+strideq*0], m0
    pslldq                  m0, 1
    movhps    [dstq+strideq*1], m0
    pslldq                  m0, 1
    movhps    [dstq+strideq*2], m0
    pslldq                  m0, 1
    movhps    [dstq+stride3q ], m0
    RET

; Diagonal-down-right 16x16: m3 holds the filtered top part of the edge,
; m0 the filtered left part; rows n and n+8 are written per step while
; the m0:m3 pair slides one byte per row.
cglobal vp9_ipred_dr_16x16, 4, 4, 6, dst, stride, l, a
    mova                    m1, [lq]
    movu                    m2, [aq-1]
    movd                    m4, [aq+15]
    DEFINE_ARGS dst, stride, stride9, cnt
    lea               stride9q, [strideq *3]
    mov                   cntd, 4
    lea               stride9q, [stride9q*3]        ; 3*(3*stride) = 9*stride
    PALIGNR                 m4, m2, 1, m5
    PALIGNR                 m3, m2, m1, 15, m5
    LOWPASS                  3,  2, 4, 5            ; m3 = filtered topleft+top edge
    pslldq                  m0, m1, 1
    PALIGNR                 m2, m1, 1, m4
    LOWPASS                  0,  1, 2, 4            ; m0 = filtered left edge

.loop:
    mova    [dstq+strideq*0  ], m3
    movhps  [dstq+strideq*8+0], m0                  ; row n+8: left part from m0's high half
    movq    [dstq+strideq*8+8], m3
    PALIGNR                 m3, m0, 15, m1          ; slide edge pair one pixel
    pslldq                  m0, 1
    mova    [dstq+strideq*1  ], m3
    movhps  [dstq+stride9q +0], m0
    movq    [dstq+stride9q +8], m3
    PALIGNR                 m3, m0, 15, m1
    pslldq                  m0, 1
    lea                   dstq, [dstq+strideq*2]
    dec                   cntd
    jg .loop
    RET

; Diagonal-down-right 32x32: the 64-pixel edge (left[0..31], topleft,
; top[0..31]) is filtered into m2..m5 (oldest-to-newest); one row per
; iteration while the whole 4-register window slides one byte.
cglobal vp9_ipred_dr_32x32, 4, 4, 8, dst, stride, l, a
    mova                    m1, [lq]
    mova                    m2, [lq+16]
    movu                    m3, [aq-1]
    movu                    m4, [aq+15]
    movd                    m5, [aq+31]
    DEFINE_ARGS dst, stride, stride8, cnt
    lea               stride8q, [strideq*8]
    PALIGNR                 m5, m4, 1, m7
    PALIGNR                 m6, m4, m3, 15, m7
    LOWPASS                  5,  4,  6,  7          ; filter top[16..31]
    PALIGNR                 m4, m3, 1, m7
    PALIGNR                 m6, m3, m2, 15, m7
    LOWPASS                  4,  3,  6,  7          ; filter topleft + top[0..15]
    PALIGNR                 m3, m2, 1, m7
    PALIGNR                 m6, m2, m1, 15, m7
    LOWPASS                  3,  2,  6,  7          ; filter left[16..31]
    PALIGNR                 m2, m1, 1, m6
    pslldq                  m0, m1, 1
    LOWPASS                  2,  1,  0,  6          ; filter left[0..15]
    mov                   cntd, 16

    ; out=m2/m3/m4/m5
.loop:
    mova  [dstq+stride8q*0+ 0], m4
    mova  [dstq+stride8q*0+16], m5
    mova  [dstq+stride8q*2+ 0], m3                  ; row n+16 = edge retarded 16 pixels
    mova  [dstq+stride8q*2+16], m4
    PALIGNR                 m5, m4, 15, m6          ; slide the whole window one byte
    PALIGNR                 m4, m3, 15, m6
    PALIGNR                 m3, m2, 15, m6
    pslldq                  m2, 1
    add                   dstq, strideq
    dec                   cntd
    jg .loop
    RET
%endmacro
   1249 
; Instantiate the dr XMM predictors once per SIMD generation.
INIT_XMM sse2
DR_XMM_FUNCS
INIT_XMM ssse3
DR_XMM_FUNCS
INIT_XMM avx
DR_XMM_FUNCS
   1256 
   1257 ; vl
   1258 
; Vertical-left 4x4 (MMXEXT only): even rows are the 2-tap average of
; adjacent above pixels, odd rows the 3-tap lowpass; both advance one
; pixel every two rows.
INIT_MMX mmxext
cglobal vp9_ipred_vl_4x4, 4, 4, 0, dst, stride, l, a
    movq                    m0, [aq]
    psrlq                   m1, m0, 8
    psrlq                   m2, m1, 8
    LOWPASS                  2,  1, 0, 3            ; m2 = 3-tap rows
    pavgb                   m1, m0                  ; m1 = 2-tap rows
    movd      [dstq+strideq*0], m1
    movd      [dstq+strideq*1], m2
    lea                   dstq, [dstq+strideq*2]
    psrlq                   m1, 8                   ; advance one pixel for rows 2/3
    psrlq                   m2, 8
    movd      [dstq+strideq*0], m1
    movd      [dstq+strideq*1], m2
    RET
   1274 
%macro VL_XMM_FUNCS 0
; Vertical-left 8x8: above pixels extended by replicating a[7]; even rows
; from pavgb, odd rows from LOWPASS, both advancing one pixel per row pair.
cglobal vp9_ipred_vl_8x8, 4, 4, 4, dst, stride, l, a
    movq                    m0, [aq]
%if cpuflag(ssse3)
    pshufb                  m0, [pb_0to6_9x7]       ; a[0..6] then a[7] replicated
%else
    punpcklbw               m1, m0, m0
    punpckhwd               m1, m1
    shufps                  m0, m1, q3310
%endif
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    psrldq                  m1, m0, 1
    psrldq                  m2, m0, 2
    LOWPASS                  2,  1,  0,  3          ; m2 = 3-tap rows
    pavgb                   m1, m0                  ; m1 = 2-tap rows

    movq      [dstq+strideq*0], m1
    movq      [dstq+strideq*1], m2
    psrldq                  m1, 1                   ; advance one pixel per row pair
    psrldq                  m2, 1
    movq      [dstq+strideq*2], m1
    movq      [dstq+stride3q ], m2
    lea                   dstq, [dstq+strideq*4]
    psrldq                  m1, 1
    psrldq                  m2, 1
    movq      [dstq+strideq*0], m1
    movq      [dstq+strideq*1], m2
    psrldq                  m1, 1
    psrldq                  m2, 1
    movq      [dstq+strideq*2], m1
    movq      [dstq+stride3q ], m2
    RET

; Vertical-left 16x16: same 2-tap/3-tap row pair scheme; the shift that
; advances the rows holds (replicates) the last above pixel at the edge.
cglobal vp9_ipred_vl_16x16, 4, 4, 5, dst, stride, l, a
    mova                    m0, [aq]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
%if cpuflag(ssse3)
    mova                    m4, [pb_1toE_2xF]       ; shift-left-one, hold last pixel
    pshufb                  m1, m0, m4
    pshufb                  m2, m1, m4
%else
    pand                    m4, m0, [pb_15x0_1xm1]  ; _______________F
    psrldq                  m1, m0, 1               ; 123456789ABCDEF_
    por                     m1, m4                  ; 123456789ABCDEFF
    psrldq                  m2, m1, 1               ; 23456789ABCDEFF_
    por                     m2, m4                  ; 23456789ABCDEFFF
%endif
    LOWPASS                  2,  1,  0, 3           ; m2 = 3-tap rows
    pavgb                   m1, m0                  ; m1 = 2-tap rows
    mov                   cntd, 4
.loop:
    mova      [dstq+strideq*0], m1
    mova      [dstq+strideq*1], m2
%if cpuflag(ssse3)
    pshufb                  m1, m4                  ; advance, edge pixel held
    pshufb                  m2, m4
%else
    psrldq                  m1, 1
    psrldq                  m2, 1
    por                     m1, m4
    por                     m2, m4
%endif
    mova      [dstq+strideq*2], m1
    mova      [dstq+stride3q ], m2
%if cpuflag(ssse3)
    pshufb                  m1, m4
    pshufb                  m2, m4
%else
    psrldq                  m1, 1
    psrldq                  m2, 1
    por                     m1, m4
    por                     m2, m4
%endif
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET

; Vertical-left 32x32: m2/m0 hold the 2-tap row across both 16-pixel
; halves, m3/m1 the 3-tap row; m5 is the replicated last above pixel used
; to pad the bottom-right. Rows n and n+16 are emitted together.
cglobal vp9_ipred_vl_32x32, 4, 4, 7, dst, stride, l, a
    mova                    m0, [aq]
    mova                    m5, [aq+16]
    DEFINE_ARGS dst, stride, dst16, cnt
    PALIGNR                 m2, m5, m0, 1, m4
    PALIGNR                 m3, m5, m0, 2, m4
    lea                 dst16q, [dstq  +strideq*8]
    LOWPASS                  3,  2,  0, 6           ; 3-tap, low 16 pixels
    pavgb                   m2, m0                  ; 2-tap, low 16 pixels
%if cpuflag(ssse3)
    mova                    m4, [pb_1toE_2xF]
    pshufb                  m0, m5, m4
    pshufb                  m1, m0, m4
%else
    pand                    m4, m5, [pb_15x0_1xm1]  ; _______________F
    psrldq                  m0, m5, 1               ; 123456789ABCDEF_
    por                     m0, m4                  ; 123456789ABCDEFF
    psrldq                  m1, m0, 1               ; 23456789ABCDEFF_
    por                     m1, m4                  ; 23456789ABCDEFFF
%endif
    lea                 dst16q, [dst16q+strideq*8]  ; dst16 = dst + 16 rows
    LOWPASS                  1,  0,  5, 6           ; 3-tap, high 16 pixels
    pavgb                   m0, m5                  ; 2-tap, high 16 pixels
%if cpuflag(ssse3)
    pshufb                  m5, [pb_15]             ; m5 = last above pixel replicated
%else
    punpckhbw               m5, m4, m4
    pshufhw                 m5, m5, q3333
    punpckhqdq              m5, m5
%endif
    mov                   cntd, 8

.loop:
; Store one row at dstq and its 16-pixels-advanced twin at dst16q, then
; slide the %2:%3 register pair one byte. %1 is a textual stride suffix:
; passing `q*0` turns `stride%1` into `strideq*0`.
%macro %%write 3
    mova    [dstq+stride%1+ 0], %2
    mova    [dstq+stride%1+16], %3
    movhps  [dst16q+stride%1 ], %2
    movu  [dst16q+stride%1+ 8], %3
    movq  [dst16q+stride%1+24], m5                  ; pad with replicated edge pixel
%if cpuflag(avx)
    palignr                 %2, %3, %2, 1
    pshufb                  %3, m4
%elif cpuflag(ssse3)
    palignr                 m6, %3, %2, 1
    pshufb                  %3, m4
    mova                    %2, m6
%else
    pslldq                  m6, %3, 15
    psrldq                  %3, 1
    psrldq                  %2, 1
    por                     %3, m4                  ; hold the edge pixel
    por                     %2, m6                  ; carry %3's low byte into %2
%endif
%endmacro

    %%write                q*0, m2, m0              ; 2-tap rows
    %%write                q*1, m3, m1              ; 3-tap rows
    lea                   dstq, [dstq  +strideq*2]
    lea                 dst16q, [dst16q+strideq*2]
    dec                   cntd
    jg .loop
    RET
%endmacro
   1418 
; Instantiate the vl XMM predictors once per SIMD generation.
INIT_XMM sse2
VL_XMM_FUNCS
INIT_XMM ssse3
VL_XMM_FUNCS
INIT_XMM avx
VL_XMM_FUNCS
   1425 
   1426 ; vr
   1427 
%macro VR_MMX_FUNCS 0
; Vertical-right 4x4: even rows average adjacent top/topleft pixels
; (pavgb), odd rows come from the lowpass-filtered left+topleft+top edge;
; lower rows prepend edge pixels.
cglobal vp9_ipred_vr_4x4, 4, 4, 0, dst, stride, l, a
    movq                    m1, [aq-1]              ; a[-1..6]
    punpckldq               m2, [lq]                ; left pixels land in m2's high dword;
                                                    ; the undefined low dword is shifted
                                                    ; out by the PALIGNR below
    movd                    m0, [aq]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pavgb                   m0, m1                  ; 2-tap top row(s)
    PALIGNR                 m1, m2, 5, m3
    psrlq                   m2, m1, 8
    psllq                   m3, m1, 8
    LOWPASS                  2,  1, 3, 4            ; 3-tap edge

    ; ABCD <- for the following predictor:
    ; EFGH
    ; IABC  | m0 contains ABCDxxxx
    ; JEFG  | m2 contains xJIEFGHx

%if cpuflag(ssse3)
    punpckldq               m0, m2
    pshufb                  m2, [pb_13456_3xm1]
    movd      [dstq+strideq*0], m0
    pshufb                  m0, [pb_6012_4xm1]
    movd      [dstq+stride3q ], m2
    psrlq                   m2, 8
    movd      [dstq+strideq*2], m0
    movd      [dstq+strideq*1], m2
%else
    psllq                   m1, m2, 40
    psrlq                   m2, 24
    movd      [dstq+strideq*0], m0
    movd      [dstq+strideq*1], m2
    PALIGNR                 m0, m1, 7, m3           ; prepend I / J for rows 2/3
    psllq                   m1, 8
    PALIGNR                 m2, m1, 7, m3
    movd      [dstq+strideq*2], m0
    movd      [dstq+stride3q ], m2
%endif
    RET
%endmacro
   1468 
; Instantiate the 4x4 vr predictor for MMXEXT and SSSE3.
INIT_MMX mmxext
VR_MMX_FUNCS
INIT_MMX ssse3
VR_MMX_FUNCS
   1473 
; %1 = register count passed to cglobal for the 16x16 variant: the SSE2
; path needs one more scratch register than the SSSE3/AVX paths.
%macro VR_XMM_FUNCS 1 ; n_xmm_regs for 16x16
; Vertical-right 8x8: even rows from pavgb of top/topleft, odd rows from
; the lowpass-filtered edge; lower rows prepend interleaved left pixels.
cglobal vp9_ipred_vr_8x8, 4, 4, 5, dst, stride, l, a
    movu                    m1, [aq-1]
    movhps                  m2, [lq]
    movq                    m0, [aq]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pavgb                   m0, m1                  ; 2-tap rows
    PALIGNR                 m1, m2, 9, m3
    pslldq                  m2, m1, 1
    pslldq                  m3, m1, 2
    LOWPASS                  1,  2, 3, 4            ; 3-tap edge

    ; ABCDEFGH <- for the following predictor:
    ; IJKLMNOP
    ; QABCDEFG  | m0 contains ABCDEFGHxxxxxxxx
    ; RIJKLMNO  | m1 contains xxVUTSRQIJKLMNOP
    ; SQABCDEF
    ; TRIJKLMN
    ; USQABCDE
    ; VTRIJKLM

%if cpuflag(ssse3)
    punpcklqdq              m0, m1 ; ABCDEFGHxxVUTSRQ
%endif
    movq      [dstq+strideq*0], m0
    movhps    [dstq+strideq*1], m1
%if cpuflag(ssse3)
    pshufb                  m0, [pb_6xm1_BDF_0to6]  ; xxxxxxUSQABCDEFG
    pshufb                  m1, [pb_6xm1_246_8toE]  ; xxxxxxVTRIJKLMNO
%else
    psrlw                   m2, m1, 8               ; x_U_S_Q_xxxxxxxx
    pand                    m3, m1, [pw_255]        ; x_V_T_R_xxxxxxxx
    packuswb                m3, m2                  ; xVTRxxxxxUSQxxxx
    pslldq                  m3, 4                   ; xxxxxVTRxxxxxUSQ
    PALIGNR                 m0, m3, 7, m4           ; xxxxxxUSQABCDEFG
    psrldq                  m1, 8
    pslldq                  m3, 8
    PALIGNR                 m1, m3, 7, m4           ; xxxxxxVTRIJKLMNO
%endif
    movhps    [dstq+strideq*2], m0
    movhps    [dstq+stride3q ], m1
    lea                   dstq, [dstq+strideq*4]
    pslldq                  m0, 1                   ; prepend one more left pixel per row pair
    pslldq                  m1, 1
    movhps    [dstq+strideq*0], m0
    movhps    [dstq+strideq*1], m1
    pslldq                  m0, 1
    pslldq                  m1, 1
    movhps    [dstq+strideq*2], m0
    movhps    [dstq+stride3q ], m1
    RET

; Vertical-right 16x16: m0 = 2-tap top row, m3 = 3-tap top row; m1 holds
; the filtered left column de-interleaved (even bytes then odd bytes) so
; the loop can feed alternating left pixels into the two row registers.
cglobal vp9_ipred_vr_16x16, 4, 4, %1, dst, stride, l, a
    mova                    m0, [aq]
    movu                    m1, [aq-1]
    mova                    m2, [lq]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    PALIGNR                 m3, m1, m2, 15, m6
    LOWPASS                  3,  1,  0,  4          ; 3-tap top row
    pavgb                   m0, m1                  ; 2-tap top row
    PALIGNR                 m1, m2,  1, m6
    pslldq                  m4, m2,  1
    LOWPASS                  1,  2,  4,  5          ; 3-tap left column
%if cpuflag(ssse3)
    pshufb                  m1, [pb_02468ACE_13579BDF] ; de-interleave even/odd bytes
%else
    psrlw                   m5, m1, 8
    pand                    m1, [pw_255]
    packuswb                m1, m5
%endif
    mov                   cntd, 4

.loop:
    movlhps                 m2, m1                  ; odd-row left pixels into m2's high half
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m3
    PALIGNR                 m4, m0, m1, 15, m6      ; prepend one left pixel
    PALIGNR                 m5, m3, m2, 15, m6
    mova      [dstq+strideq*2], m4
    mova      [dstq+stride3q ], m5
    lea                   dstq, [dstq+strideq*4]
    PALIGNR                 m0, m1, 14, m6          ; prepend two for the next group
    PALIGNR                 m3, m2, 14, m6
    pslldq                  m1, 2                   ; consume two left pixels
    dec                   cntd
    jg .loop
    RET

; Vertical-right 32x32: 2-tap rows in m0/m3, 3-tap rows in m5/m4; the two
; filtered left columns are de-interleaved into m1/m2 and fed in one pixel
; per row. On x86-32 (8 xmm regs) m0 is parked in the first dst row, which
; is rewritten by the first loop iteration before anyone reads it.
cglobal vp9_ipred_vr_32x32, 4, 4, 9, dst, stride, l, a
    mova                    m0, [aq]
    mova                    m2, [aq+16]
    movu                    m1, [aq-1]
    PALIGNR                 m3, m2, m0, 15, m6
    PALIGNR                 m4, m2, m0, 14, m6
    LOWPASS                  4,  3,  2,  5          ; 3-tap, top[16..31]
    pavgb                   m3, m2                  ; 2-tap, top[16..31]
    mova                    m2, [lq+16]
    PALIGNR                 m5, m1, m2, 15, m6
    LOWPASS                  5,  1,  0,  6          ; 3-tap, topleft+top[0..15]
    pavgb                   m0, m1                  ; 2-tap, top[0..15]
    mova                    m6, [lq]
%if ARCH_X86_64
    SWAP                     0, 8
%else
    mova                [dstq], m0                  ; spill: no m8 on x86-32
%endif
    PALIGNR                 m1, m2,  1, m0
    PALIGNR                 m7, m2, m6, 15, m0
    LOWPASS                  1,  2,  7,  0          ; 3-tap, left[16..31]
    PALIGNR                 m2, m6,  1, m0
    pslldq                  m7, m6,  1
    LOWPASS                  2,  6,  7,  0          ; 3-tap, left[0..15]
%if cpuflag(ssse3)
    pshufb                  m1, [pb_02468ACE_13579BDF] ; de-interleave even/odd bytes
    pshufb                  m2, [pb_02468ACE_13579BDF]
%else
    psrlw                   m0, m1, 8
    psrlw                   m6, m2, 8
    pand                    m1, [pw_255]
    pand                    m2, [pw_255]
    packuswb                m1, m0
    packuswb                m2, m6
%endif
    DEFINE_ARGS dst, stride, dst16, cnt
    lea                 dst16q, [dstq  +strideq*8]
    lea                 dst16q, [dst16q+strideq*8]  ; dst16 = dst + 16 rows
    SBUTTERFLY             qdq,  2,  1,  6          ; regroup even/odd left-pixel streams
%if ARCH_X86_64
    SWAP                     0, 8
%else
    mova                    m0, [dstq]              ; reload the spilled row
%endif
    mov                   cntd, 8

.loop:
    ; even lines (0, 2, 4, ...): m1 | m0, m3
    ;  odd lines (1, 3, 5, ...): m2 | m5, m4
; Store one row at dstq and the same data retarded 8 pixels at dst16q,
; then slide the %2:%3:%4 triple one byte. %1 is a textual stride suffix
; (`q*0` -> `strideq*0`).
%macro %%write 4
    mova    [dstq+stride%1+ 0], %3
    mova    [dstq+stride%1+16], %4
    movhps  [dst16q+stride%1 ], %2
    movu  [dst16q+stride%1+ 8], %3
    movq  [dst16q+stride%1+24], %4
    PALIGNR                 %4, %3, 15, m6
    PALIGNR                 %3, %2, 15, m6
    pslldq                  %2,  1                  ; consume one left pixel
%endmacro

    %%write                q*0, m1, m0, m3
    %%write                q*1, m2, m5, m4
    lea                   dstq, [dstq  +strideq*2]
    lea                 dst16q, [dst16q+strideq*2]
    dec                   cntd
    jg .loop
    RET
%endmacro
   1632 
; Instantiate the vr XMM predictors; the argument is the xmm register
; count for the 16x16 variant (SSE2 needs one extra scratch register).
INIT_XMM sse2
VR_XMM_FUNCS 7
INIT_XMM ssse3
VR_XMM_FUNCS 6
INIT_XMM avx
VR_XMM_FUNCS 6
   1639 
   1640 ; hd
   1641 
; Horizontal-down 4x4 (MMXEXT only): interleave the 2-tap average and the
; 3-tap lowpass of the left+topleft+top edge; each row up advances two
; interleaved pixels.
INIT_MMX mmxext
cglobal vp9_ipred_hd_4x4, 4, 4, 0, dst, stride, l, a
    movd                    m0, [lq]
    punpckldq               m0, [aq-1]              ; edge = l[0..3], a[-1..2]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    psrlq                   m1, m0, 8
    psrlq                   m2, m1, 8
    LOWPASS                  2,  1, 0,  3           ; 3-tap edge
    pavgb                   m1, m0                  ; 2-tap edge

    ; DHIJ <- for the following predictor:
    ; CGDH
    ; BFCG  | m1 contains ABCDxxxx
    ; AEBF  | m2 contains EFGHIJxx

    punpcklbw               m1, m2                  ; interleave 2-tap/3-tap pixels
    punpckhdq               m0, m1, m2

    ; m1 contains AEBFCGDH
    ; m0 contains CGDHIJxx

    movd      [dstq+stride3q ], m1                  ; bottom row first
    movd      [dstq+strideq*1], m0
    psrlq                   m1, 16                  ; advance two pixels per row up
    psrlq                   m0, 16
    movd      [dstq+strideq*2], m1
    movd      [dstq+strideq*0], m0
    RET
   1671 
   1672 %macro HD_XMM_FUNCS 0
; Horizontal-down 8x8: interleave the 2-tap and 3-tap filtered edge
; (left column + topleft + top row); each row up reads the interleaved
; stream advanced two bytes. Rows n and n+4 share a register (movhps/movq).
cglobal vp9_ipred_hd_8x8, 4, 4, 5, dst, stride, l, a
    movq                    m0, [lq]
    movhps                  m0, [aq-1]              ; edge = l[0..7], a[-1..6]
    DEFINE_ARGS dst, stride, stride3, dst4
    lea               stride3q, [strideq*3]
    lea                  dst4q, [dstq+strideq*4]
    psrldq                  m1, m0, 1
    psrldq                  m2, m1, 1
    LOWPASS                  2,  1,  0,  3          ; 3-tap edge
    pavgb                   m1, m0                  ; 2-tap edge

    ; HPQRSTUV <- for the following predictor
    ; GOHPQRST
    ; FNGOHPQR  | m1 contains ABCDEFGHxxxxxxxx
    ; EMFNGOHP  | m2 contains IJKLMNOPQRSTUVxx
    ; DLEMFNGO
    ; CKDLEMFN
    ; BJCKDLEM
    ; AIBJCKDL

    punpcklbw               m1, m2                  ; interleave 2-tap/3-tap pixels
    movhlps                 m2, m2

    ; m1 contains AIBJCKDLEMFNGOHP
    ; m2 contains QRSTUVxxxxxxxxxx

    movhps   [dstq +stride3q ], m1                  ; bottom-most pair of rows first
    movq     [dst4q+stride3q ], m1
    PALIGNR                 m3, m2, m1, 2, m4       ; advance two interleaved pixels
    movhps   [dstq +strideq*2], m3
    movq     [dst4q+strideq*2], m3
    PALIGNR                 m3, m2, m1, 4, m4
    movhps   [dstq +strideq*1], m3
    movq     [dst4q+strideq*1], m3
    PALIGNR                 m2, m1, 6, m4
    movhps   [dstq +strideq*0], m2
    movq     [dst4q+strideq*0], m2
    RET
   1711 
   1712 cglobal vp9_ipred_hd_16x16, 4, 6, 7, dst, stride, l, a
   1713    mova                    m0, [lq]
   1714    movu                    m3, [aq-1]
   1715    DEFINE_ARGS dst, stride, stride4, dst4, dst8, dst12
   1716    lea               stride4q, [strideq*4]
   1717    lea                  dst4q, [dstq +stride4q]
   1718    lea                  dst8q, [dst4q+stride4q]
   1719    lea                 dst12q, [dst8q+stride4q]
   1720    psrldq                  m4, m3,  1
   1721    psrldq                  m5, m3,  2
   1722    LOWPASS                  5,  4,  3,  6
   1723    PALIGNR                 m1, m3, m0,  1, m6
   1724    PALIGNR                 m2, m3, m0,  2, m6
   1725    LOWPASS                  2,  1,  0,  6
   1726    pavgb                   m1, m0
   1727    SBUTTERFLY              bw,  1,  2,  6
   1728 
   1729    ; I PROBABLY INVERTED L0 ad L16 here
   1730    ; m1, m2, m5
   1731 .loop:
   1732    sub               stride4q, strideq
   1733    movhps [dstq +stride4q +0], m2
   1734    movq   [dstq +stride4q +8], m5
   1735    mova   [dst4q+stride4q   ], m2
   1736    movhps [dst8q+stride4q +0], m1
   1737    movq   [dst8q+stride4q +8], m2
   1738    mova  [dst12q+stride4q   ], m1
   1739 %if cpuflag(avx)
   1740    palignr                 m1, m2, m1, 2
   1741    palignr                 m2, m5, m2, 2
   1742 %elif cpuflag(ssse3)
   1743    palignr                 m3, m2, m1, 2
   1744    palignr                 m0, m5, m2, 2
   1745    mova                    m1, m3
   1746    mova                    m2, m0
   1747 %else
   1748    ; slightly modified version of PALIGNR
   1749    mova                    m6, m2
   1750    mova                    m4, m5
   1751    pslldq                  m6, 14
   1752    pslldq                  m4, 14
   1753    psrldq                  m1, 2
   1754    psrldq                  m2, 2
   1755    por                     m1, m6
   1756    por                     m2, m4
   1757 %endif
   1758    psrldq                  m5, 2
   1759    jg .loop
   1760    RET
   1761 
   1762 cglobal vp9_ipred_hd_32x32, 4, 6, 8, dst, stride, l, a
   1763    mova                    m0, [lq]
   1764    mova                    m1, [lq+16]
   1765    movu                    m2, [aq-1]
   1766    movu                    m3, [aq+15]
   1767    DEFINE_ARGS dst, stride, stride8, dst8, dst16, dst24
   1768    lea               stride8q, [strideq*8]
   1769    lea                  dst8q, [dstq  +stride8q]
   1770    lea                 dst16q, [dst8q +stride8q]
   1771    lea                 dst24q, [dst16q+stride8q]
   1772    psrldq                  m4, m3,  1
   1773    psrldq                  m5, m3,  2
   1774    LOWPASS                  5,  4,  3,  6
   1775    PALIGNR                 m4, m3, m2,  2, m6
   1776    PALIGNR                 m3, m2,  1, m6
   1777    LOWPASS                  4,  3,  2,  6
   1778    PALIGNR                 m3, m2, m1,  2, m6
   1779    PALIGNR                 m2, m1,  1, m6
   1780    LOWPASS                  3,  2,  1,  6
   1781    pavgb                   m2, m1
   1782    PALIGNR                 m6, m1, m0,  1, m7
   1783    PALIGNR                 m1, m0,  2, m7
   1784    LOWPASS                  1,  6,  0,  7
   1785    pavgb                   m0, m6
   1786    SBUTTERFLY              bw,  2,  3,  6
   1787    SBUTTERFLY              bw,  0,  1,  6
   1788 
   1789    ; m0, m1, m2, m3, m4, m5
   1790 .loop:
   1791    sub               stride8q, strideq
   1792    mova  [dstq  +stride8q+ 0], m3
   1793    mova  [dstq  +stride8q+16], m4
   1794    mova  [dst8q +stride8q+ 0], m2
   1795    mova  [dst8q +stride8q+16], m3
   1796    mova  [dst16q+stride8q+ 0], m1
   1797    mova  [dst16q+stride8q+16], m2
   1798    mova  [dst24q+stride8q+ 0], m0
   1799    mova  [dst24q+stride8q+16], m1
   1800 %if cpuflag(avx)
   1801    palignr                 m0, m1, m0, 2
   1802    palignr                 m1, m2, m1, 2
   1803    palignr                 m2, m3, m2, 2
   1804    palignr                 m3, m4, m3, 2
   1805    palignr                 m4, m5, m4, 2
   1806    psrldq                  m5, 2
   1807 %elif cpuflag(ssse3)
   1808    psrldq                  m6, m5, 2
   1809    palignr                 m5, m4, 2
   1810    palignr                 m4, m3, 2
   1811    palignr                 m3, m2, 2
   1812    palignr                 m2, m1, 2
   1813    palignr                 m1, m0, 2
   1814    mova                    m0, m1
   1815    mova                    m1, m2
   1816    mova                    m2, m3
   1817    mova                    m3, m4
   1818    mova                    m4, m5
   1819    mova                    m5, m6
   1820 %else
   1821    ; sort of a half-integrated version of PALIGNR
   1822    pslldq                  m7, m4, 14
   1823    pslldq                  m6, m5, 14
   1824    psrldq                  m4, 2
   1825    psrldq                  m5, 2
   1826    por                     m4, m6
   1827    pslldq                  m6, m3, 14
   1828    psrldq                  m3, 2
   1829    por                     m3, m7
   1830    pslldq                  m7, m2, 14
   1831    psrldq                  m2, 2
   1832    por                     m2, m6
   1833    pslldq                  m6, m1, 14
   1834    psrldq                  m1, 2
   1835    por                     m1, m7
   1836    psrldq                  m0, 2
   1837    por                     m0, m6
   1838 %endif
   1839    jg .loop
   1840    RET
   1841 %endmacro
   1842 
; Instantiate the HD (horizontal-down) xmm predictors for each SIMD level.
INIT_XMM sse2
HD_XMM_FUNCS
INIT_XMM ssse3
HD_XMM_FUNCS
INIT_XMM avx
HD_XMM_FUNCS
   1849 
; HU (horizontal-up) 4x4 intra predictor, instantiated for mmxext and
; ssse3. Takes (dst, stride, l): only the left edge is used by HU.
%macro HU_MMX_FUNCS 0
cglobal vp9_ipred_hu_4x4, 3, 3, 0, dst, stride, l
    movd                    m0, [lq]            ; m0 = l[0..3]
%if cpuflag(ssse3)
    ; replicate the last pixel: 0123 -> 01233333
    pshufb                  m0, [pb_0to2_5x3]
%else
    punpcklbw               m1, m0, m0          ; 00112233
    pshufw                  m1, m1, q3333       ; 33333333
    punpckldq               m0, m1              ; 01233333
%endif
    psrlq                   m1, m0, 8
    psrlq                   m2, m1, 8
    LOWPASS                  2,  1, 0, 3        ; m2 = 3-tap lowpass of edge
    pavgb                   m1, m0              ; m1 = rounded 2-tap averages
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    SBUTTERFLY              bw,  1, 2, 0        ; interleave avg/lowpass bytes
    PALIGNR                 m2, m1, 2, m0       ; rows advance 2 bytes each
    movd      [dstq+strideq*0], m1
    movd      [dstq+strideq*1], m2
    punpckhdq               m1, m1
    punpckhdq               m2, m2
    movd      [dstq+strideq*2], m1
    movd      [dstq+stride3q ], m2
    RET
%endmacro
   1876 
; Instantiate the 4x4 HU (horizontal-up) mmx predictor. Note: ssse3 here
; deliberately uses mmx registers (INIT_MMX), matching the mmxext variant.
INIT_MMX mmxext
HU_MMX_FUNCS
INIT_MMX ssse3
HU_MMX_FUNCS
   1881 
; HU (horizontal-up) intra predictors for 8x8/16x16/32x32, instantiated
; once per SIMD level. Each function takes (dst, stride, l); HU only uses
; the left edge, padding past its end with the last left pixel. The pb_*
; constants referenced below are shuffle masks / pad patterns defined in
; this file's RODATA section (outside this excerpt).
%macro HU_XMM_FUNCS 1 ; n_xmm_regs in hu_32x32
cglobal vp9_ipred_hu_8x8, 3, 4, 4, dst, stride, l
    movq                    m0, [lq]            ; m0 = l[0..7]
%if cpuflag(ssse3)
    ; replicate the last pixel: 01234567 -> 0123456777777777
    pshufb                  m0, [pb_0to6_9x7]
%else
    punpcklbw               m1, m0, m0          ; 0011223344556677
    punpckhwd               m1, m1              ; 4444555566667777
    shufps                  m0, m1, q3310       ; 0123456777777777
%endif
    psrldq                  m1, m0, 1
    psrldq                  m2, m1, 1
    LOWPASS                  2,  1, 0, 3        ; m2 = 3-tap lowpass of edge
    pavgb                   m1, m0              ; m1 = rounded 2-tap averages
    DEFINE_ARGS dst, stride, stride3, dst4
    lea               stride3q, [strideq*3]
    lea                  dst4q, [dstq+strideq*4]
    SBUTTERFLY              bw,  1, 2, 0        ; interleave avg/lowpass bytes
    ; rows n and n+4 come from the low/high halves of the same register,
    ; shifting the pattern 2 bytes per row
    movq     [dstq +strideq*0], m1
    movhps   [dst4q+strideq*0], m1
    PALIGNR                 m0, m2, m1, 2, m3
    movq     [dstq +strideq*1], m0
    movhps   [dst4q+strideq*1], m0
    PALIGNR                 m0, m2, m1, 4, m3
    movq     [dstq +strideq*2], m0
    movhps   [dst4q+strideq*2], m0
    PALIGNR                 m2, m1, 6, m3
    movq     [dstq +stride3q ], m2
    movhps   [dst4q+stride3q ], m2
    RET

cglobal vp9_ipred_hu_16x16, 3, 4, 5, dst, stride, l
    mova                    m0, [lq]            ; m0 = l[0..15]
%if cpuflag(ssse3)
    ; m1/m2 = edge shifted by 1/2 bytes, padded with the last pixel
    mova                    m3, [pb_2toE_3xF]
    pshufb                  m1, m0, [pb_1toE_2xF]
    pshufb                  m2, m0, m3
%else
    pand                    m3, m0, [pb_15x0_1xm1] ; keep only the top byte
    psrldq                  m1, m0, 1
    por                     m1, m3
    punpckhbw               m3, m3
    psrldq                  m2, m0, 2
    por                     m2, m3
%endif
    LOWPASS                  2,  1,  0,  4      ; m2 = 3-tap lowpass of edge
    pavgb                   m1, m0              ; m1 = rounded 2-tap averages
    DEFINE_ARGS dst, stride, stride9, cnt
    lea                stride9q, [strideq*8+strideq]
    mov                   cntd,  4
    SBUTTERFLY              bw,  1,  2,  0      ; interleave avg/lowpass bytes

.loop:
    ; 4 iterations x 4 rows: rows n and n+8 share register content
    mova      [dstq+strideq*0], m1
    mova      [dstq+strideq*8], m2
    PALIGNR                 m0, m2, m1, 2, m4
%if cpuflag(ssse3)
    pshufb                  m2, m3              ; shift m2, padding with last px
%else
    psrldq                  m2, 2
    por                     m2, m3
%endif
    mova      [dstq+strideq*1], m0
    mova      [dstq+stride9q ], m2
    PALIGNR                 m1, m2, m0, 2, m4
%if cpuflag(ssse3)
    pshufb                  m2, m3
%else
    psrldq                  m2, 2
    por                     m2, m3
%endif
    lea                   dstq, [dstq+strideq*2]
    dec                   cntd
    jg .loop
    RET

cglobal vp9_ipred_hu_32x32, 3, 7, %1, dst, stride, l
    mova                    m1, [lq]            ; l[0..15]
    mova                    m0, [lq+16]         ; l[16..31]
    ; lowpass + average the low half ...
    PALIGNR                 m2, m0, m1,  1, m5
    PALIGNR                 m3, m0, m1,  2, m5
    LOWPASS                  3,  2,  1,  5
    pavgb                   m2, m1
    ; ... and the high half, padded with the last pixel
%if cpuflag(ssse3)
    mova                    m4, [pb_2toE_3xF]
    pshufb                  m5, m0, [pb_1toE_2xF]
    pshufb                  m1, m0, m4
%else
    pand                    m4, m0, [pb_15x0_1xm1]
    psrldq                  m5, m0, 1
    por                     m5, m4
    punpckhbw               m4, m4
    psrldq                  m1, m0, 2
    por                     m1, m4
%endif
    LOWPASS                  1,  5,  0,  6
    pavgb                   m0, m5
    DEFINE_ARGS dst, stride, cnt, stride0, dst8, dst16, dst24
    mov                   cntd,  8
    xor               stride0q, stride0q
    lea                  dst8q, [dstq  +strideq*8]
    lea                 dst16q, [dst8q +strideq*8]
    lea                 dst24q, [dst16q+strideq*8]
    SBUTTERFLY              bw,  0,  1,  5      ; interleave avg/lowpass bytes
    SBUTTERFLY              bw,  2,  3,  5
    ; m6 = broadcast of the final (bottom-left) pixel, used to pad
%if cpuflag(ssse3)
    pshufb                  m6, m1, [pb_15]
%else
    pshufhw                 m6, m4, q3333
    punpckhqdq              m6, m6
%endif

.loop:
    ; 8 iterations x 4 rows, one row per 8-row quadrant each iteration
    mova  [dstq  +stride0q+ 0], m2
    mova  [dstq  +stride0q+16], m3
    mova  [dst8q +stride0q+ 0], m3
    mova  [dst8q +stride0q+16], m0
    mova  [dst16q+stride0q+ 0], m0
    mova  [dst16q+stride0q+16], m1
    mova  [dst24q+stride0q+ 0], m1
    mova  [dst24q+stride0q+16], m6
%if cpuflag(avx)
    ; shift the m2..m1 byte chain right by 2, padding m1 with the last px
    palignr                 m2, m3, m2, 2
    palignr                 m3, m0, m3, 2
    palignr                 m0, m1, m0, 2
    pshufb                  m1, m4
%elif cpuflag(ssse3)
    pshufb                  m5, m1, m4
    palignr                 m1, m0, 2
    palignr                 m0, m3, 2
    palignr                 m3, m2, 2
    mova                    m2, m3
    mova                    m3, m0
    mova                    m0, m1
    mova                    m1, m5
%else
    ; half-integrated version of PALIGNR
    pslldq                  m5, m1, 14
    pslldq                  m7, m0, 14
    psrldq                  m1, 2
    psrldq                  m0, 2
    por                     m1, m4
    por                     m0, m5
    pslldq                  m5, m3, 14
    psrldq                  m3, 2
    por                     m3, m7
    psrldq                  m2, 2
    por                     m2, m5
%endif
    add               stride0q, strideq
    dec                   cntd
    jg .loop
    RET
%endmacro
   2036 
; Instantiate the HU (horizontal-up) xmm predictors; the argument is the
; number of xmm registers hu_32x32 needs (sse2 uses one extra scratch for
; its PALIGNR emulation).
INIT_XMM sse2
HU_XMM_FUNCS 8
INIT_XMM ssse3
HU_XMM_FUNCS 7
INIT_XMM avx
HU_XMM_FUNCS 7
   2043 
   2044 ; FIXME 127, 128, 129 ?