tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

vp9mc.asm (18733B)


;******************************************************************************
;* VP9 motion compensation SIMD optimizations
;*
;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

cextern pw_256
cextern pw_64

%macro F8_SSSE3_TAPS 8
times 16 db %1, %2
times 16 db %3, %4
times 16 db %5, %6
times 16 db %7, %8
%endmacro

%macro F8_SSE2_TAPS 8
times 8 dw %1
times 8 dw %2
times 8 dw %3
times 8 dw %4
times 8 dw %5
times 8 dw %6
times 8 dw %7
times 8 dw %8
%endmacro

%macro F8_16BPP_TAPS 8
times 8 dw %1, %2
times 8 dw %3, %4
times 8 dw %5, %6
times 8 dw %7, %8
%endmacro

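; The three F8_*_TAPS variants lay out the same eight taps in the shape
; their consumer expects. F8_SSSE3_TAPS interleaves the taps as signed byte
; pairs, 32 bytes per pair so one table serves both XMM loads (first half)
; and full YMM loads, ready for pmaddubsw. F8_SSE2_TAPS splats each tap
; across a register of words for pmullw. F8_16BPP_TAPS pairs the taps as
; words for the 16 bpp filters, which live in a separate file (and
; presumably combine them with pmaddwd).
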
%macro FILTER 1
const filters_%1 ; smooth
                   F8_TAPS -3, -1,  32,  64,  38,   1, -3,  0
                   F8_TAPS -2, -2,  29,  63,  41,   2, -3,  0
                   F8_TAPS -2, -2,  26,  63,  43,   4, -4,  0
                   F8_TAPS -2, -3,  24,  62,  46,   5, -4,  0
                   F8_TAPS -2, -3,  21,  60,  49,   7, -4,  0
                   F8_TAPS -1, -4,  18,  59,  51,   9, -4,  0
                   F8_TAPS -1, -4,  16,  57,  53,  12, -4, -1
                   F8_TAPS -1, -4,  14,  55,  55,  14, -4, -1
                   F8_TAPS -1, -4,  12,  53,  57,  16, -4, -1
                   F8_TAPS  0, -4,   9,  51,  59,  18, -4, -1
                   F8_TAPS  0, -4,   7,  49,  60,  21, -3, -2
                   F8_TAPS  0, -4,   5,  46,  62,  24, -3, -2
                   F8_TAPS  0, -4,   4,  43,  63,  26, -2, -2
                   F8_TAPS  0, -3,   2,  41,  63,  29, -2, -2
                   F8_TAPS  0, -3,   1,  38,  64,  32, -1, -3
                   ; regular
                   F8_TAPS  0,  1,  -5, 126,   8,  -3,  1,  0
                   F8_TAPS -1,  3, -10, 122,  18,  -6,  2,  0
                   F8_TAPS -1,  4, -13, 118,  27,  -9,  3, -1
                   F8_TAPS -1,  4, -16, 112,  37, -11,  4, -1
                   F8_TAPS -1,  5, -18, 105,  48, -14,  4, -1
                   F8_TAPS -1,  5, -19,  97,  58, -16,  5, -1
                   F8_TAPS -1,  6, -19,  88,  68, -18,  5, -1
                   F8_TAPS -1,  6, -19,  78,  78, -19,  6, -1
                   F8_TAPS -1,  5, -18,  68,  88, -19,  6, -1
                   F8_TAPS -1,  5, -16,  58,  97, -19,  5, -1
                   F8_TAPS -1,  4, -14,  48, 105, -18,  5, -1
                   F8_TAPS -1,  4, -11,  37, 112, -16,  4, -1
                   F8_TAPS -1,  3,  -9,  27, 118, -13,  4, -1
                   F8_TAPS  0,  2,  -6,  18, 122, -10,  3, -1
                   F8_TAPS  0,  1,  -3,   8, 126,  -5,  1,  0
                   ; sharp
                   F8_TAPS -1,  3,  -7, 127,   8,  -3,  1,  0
                   F8_TAPS -2,  5, -13, 125,  17,  -6,  3, -1
                   F8_TAPS -3,  7, -17, 121,  27, -10,  5, -2
                   F8_TAPS -4,  9, -20, 115,  37, -13,  6, -2
                   F8_TAPS -4, 10, -23, 108,  48, -16,  8, -3
                   F8_TAPS -4, 10, -24, 100,  59, -19,  9, -3
                   F8_TAPS -4, 11, -24,  90,  70, -21, 10, -4
                   F8_TAPS -4, 11, -23,  80,  80, -23, 11, -4
                   F8_TAPS -4, 10, -21,  70,  90, -24, 11, -4
                   F8_TAPS -3,  9, -19,  59, 100, -24, 10, -4
                   F8_TAPS -3,  8, -16,  48, 108, -23, 10, -4
                   F8_TAPS -2,  6, -13,  37, 115, -20,  9, -4
                   F8_TAPS -2,  5, -10,  27, 121, -17,  7, -3
                   F8_TAPS -1,  3,  -6,  17, 125, -13,  5, -2
                   F8_TAPS  0,  1,  -3,   8, 127,  -7,  3, -1
%endmacro

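; Each instantiation emits three banks: smooth, regular and sharp, VP9's
; three switchable 8-tap interpolation filters. A bank holds taps for the
; 15 subpel phases 1-15 only; phase 0 (full-pel) is handled as a plain copy
; by the fpel functions at the end of this file. Every row of taps sums to
; 128, so the filtered sum is renormalized by a round-and-shift of 7 bits.
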
%define F8_TAPS F8_SSSE3_TAPS
; int8_t ff_filters_ssse3[3][15][4][32]
FILTER ssse3
%define F8_TAPS F8_SSE2_TAPS
; int16_t ff_filters_sse2[3][15][8][8]
FILTER sse2
%define F8_TAPS F8_16BPP_TAPS
; int16_t ff_filters_16bpp[3][15][4][16]
FILTER 16bpp

SECTION .text

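; 8-tap horizontal filter for targets without SSSE3 (the name
; notwithstanding, the macro is also instantiated for MMX). Pixels are
; widened to words with punpcklbw against zero, multiplied tap-by-tap with
; pmullw and summed; the last addition uses paddsw so an overflowing sum
; saturates instead of wrapping, and the +64 bias plus psraw by 7 rounds
; the result back to pixel scale.
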
%macro filter_sse2_h_fn 1
%assign %%px mmsize/2
cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 15, dst, dstride, src, sstride, h, filtery
    pxor        m5, m5
    mova        m6, [pw_64]
    mova        m7, [filteryq+  0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+ 16]
    mova        m9, [filteryq+ 32]
    mova       m10, [filteryq+ 48]
    mova       m11, [filteryq+ 64]
    mova       m12, [filteryq+ 80]
    mova       m13, [filteryq+ 96]
    mova       m14, [filteryq+112]
%endif
.loop:
    movh        m0, [srcq-3]
    movh        m1, [srcq-2]
    movh        m2, [srcq-1]
    movh        m3, [srcq+0]
    movh        m4, [srcq+1]
    punpcklbw   m0, m5
    punpcklbw   m1, m5
    punpcklbw   m2, m5
    punpcklbw   m3, m5
    punpcklbw   m4, m5
    pmullw      m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmullw      m1, m8
    pmullw      m2, m9
    pmullw      m3, m10
    pmullw      m4, m11
%else
    pmullw      m1, [filteryq+ 16]
    pmullw      m2, [filteryq+ 32]
    pmullw      m3, [filteryq+ 48]
    pmullw      m4, [filteryq+ 64]
%endif
    paddw       m0, m1
    paddw       m2, m3
    paddw       m0, m4
    movh        m1, [srcq+2]
    movh        m3, [srcq+3]
    movh        m4, [srcq+4]
    add       srcq, sstrideq
    punpcklbw   m1, m5
    punpcklbw   m3, m5
    punpcklbw   m4, m5
%if ARCH_X86_64 && mmsize > 8
    pmullw      m1, m12
    pmullw      m3, m13
    pmullw      m4, m14
%else
    pmullw      m1, [filteryq+ 80]
    pmullw      m3, [filteryq+ 96]
    pmullw      m4, [filteryq+112]
%endif
    paddw       m0, m1
    paddw       m3, m4
    paddw       m0, m6
    paddw       m2, m3
    paddsw      m0, m2
    psraw       m0, 7
%ifidn %1, avg
    movh        m1, [dstq]
%endif
    packuswb    m0, m0
%ifidn %1, avg
    pavgb       m0, m1
%endif
    movh    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro
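
; Worked example of the rounding above: with the centre-phase regular taps
; (-1, 6, -19, 78, 78, -19, 6, -1) on a flat region of pixels p, the
; weighted sum is 128*p, and (128*p + 64) >> 7 == p, so constant areas
; pass through unchanged.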

INIT_MMX mmxext
filter_sse2_h_fn put
filter_sse2_h_fn avg

INIT_XMM sse2
filter_sse2_h_fn put
filter_sse2_h_fn avg

%macro filter_h_fn 1
%assign %%px mmsize/2
cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 11, dst, dstride, src, sstride, h, filtery
    mova        m6, [pw_256]
    mova        m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+32]
    mova        m9, [filteryq+64]
    mova       m10, [filteryq+96]
%endif
.loop:
    movh        m0, [srcq-3]
    movh        m1, [srcq-2]
    movh        m2, [srcq-1]
    movh        m3, [srcq+0]
    movh        m4, [srcq+1]
    movh        m5, [srcq+2]
    punpcklbw   m0, m1
    punpcklbw   m2, m3
    movh        m1, [srcq+3]
    movh        m3, [srcq+4]
    add       srcq, sstrideq
    punpcklbw   m4, m5
    punpcklbw   m1, m3
    pmaddubsw   m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddubsw   m2, m8
    pmaddubsw   m4, m9
    pmaddubsw   m1, m10
%else
    pmaddubsw   m2, [filteryq+32]
    pmaddubsw   m4, [filteryq+64]
    pmaddubsw   m1, [filteryq+96]
%endif
    paddw       m0, m4
    paddw       m2, m1
    paddsw      m0, m2
    pmulhrsw    m0, m6
%ifidn %1, avg
    movh        m1, [dstq]
%endif
    packuswb    m0, m0
%ifidn %1, avg
    pavgb       m0, m1
%endif
    movh    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro
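
; The SSSE3 version stays in the packed-byte domain: pmaddubsw multiplies
; unsigned pixels by the signed byte taps and sums adjacent pairs, and the
; final pmulhrsw against pw_256 computes (x*256 + 0x4000) >> 15, i.e. the
; same (x + 64) >> 7 rounding as the paddw/psraw pair in the SSE2 path.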

INIT_MMX ssse3
filter_h_fn put
filter_h_fn avg

INIT_XMM ssse3
filter_h_fn put
filter_h_fn avg

%if ARCH_X86_64
%macro filter_hx2_fn 1
%assign %%px mmsize
cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 14, dst, dstride, src, sstride, h, filtery
    mova       m13, [pw_256]
    mova        m8, [filteryq+ 0]
    mova        m9, [filteryq+32]
    mova       m10, [filteryq+64]
    mova       m11, [filteryq+96]
.loop:
    movu        m0, [srcq-3]
    movu        m1, [srcq-2]
    movu        m2, [srcq-1]
    movu        m3, [srcq+0]
    movu        m4, [srcq+1]
    movu        m5, [srcq+2]
    movu        m6, [srcq+3]
    movu        m7, [srcq+4]
    add       srcq, sstrideq
    SBUTTERFLY  bw, 0, 1, 12
    SBUTTERFLY  bw, 2, 3, 12
    SBUTTERFLY  bw, 4, 5, 12
    SBUTTERFLY  bw, 6, 7, 12
    pmaddubsw   m0, m8
    pmaddubsw   m1, m8
    pmaddubsw   m2, m9
    pmaddubsw   m3, m9
    pmaddubsw   m4, m10
    pmaddubsw   m5, m10
    pmaddubsw   m6, m11
    pmaddubsw   m7, m11
    paddw       m0, m4
    paddw       m1, m5
    paddw       m2, m6
    paddw       m3, m7
    paddsw      m0, m2
    paddsw      m1, m3
    pmulhrsw    m0, m13
    pmulhrsw    m1, m13
    packuswb    m0, m1
%ifidn %1, avg
    pavgb       m0, [dstq]
%endif
    mova    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro
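
; Full-register-width horizontal variant (16 pixels per row for XMM, 32 for
; YMM): eight unaligned row loads at offsets -3..+4 are interleaved pairwise
; with SBUTTERFLY, filtered as low/high halves, and repacked with a single
; packuswb. x86-64 only, since it needs 14 SIMD registers.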

INIT_XMM ssse3
filter_hx2_fn put
filter_hx2_fn avg

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
filter_hx2_fn put
filter_hx2_fn avg
%endif

%endif ; ARCH_X86_64

%macro filter_sse2_v_fn 1
%assign %%px mmsize/2
%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 15, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 15, dst, dstride, src, sstride, filtery, src4, sstride3
    mov   filteryq, r5mp
%define hd r4mp
%endif
    pxor        m5, m5
    mova        m6, [pw_64]
    lea  sstride3q, [sstrideq*3]
    lea      src4q, [srcq+sstrideq]
    sub       srcq, sstride3q
    mova        m7, [filteryq+  0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+ 16]
    mova        m9, [filteryq+ 32]
    mova       m10, [filteryq+ 48]
    mova       m11, [filteryq+ 64]
    mova       m12, [filteryq+ 80]
    mova       m13, [filteryq+ 96]
    mova       m14, [filteryq+112]
%endif
.loop:
    ; FIXME maybe reuse loads from previous rows, or just
    ; more generally unroll this to prevent multiple loads of
    ; the same data?
    movh        m0, [srcq]
    movh        m1, [srcq+sstrideq]
    movh        m2, [srcq+sstrideq*2]
    movh        m3, [srcq+sstride3q]
    add       srcq, sstrideq
    movh        m4, [src4q]
    punpcklbw   m0, m5
    punpcklbw   m1, m5
    punpcklbw   m2, m5
    punpcklbw   m3, m5
    punpcklbw   m4, m5
    pmullw      m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmullw      m1, m8
    pmullw      m2, m9
    pmullw      m3, m10
    pmullw      m4, m11
%else
    pmullw      m1, [filteryq+ 16]
    pmullw      m2, [filteryq+ 32]
    pmullw      m3, [filteryq+ 48]
    pmullw      m4, [filteryq+ 64]
%endif
    paddw       m0, m1
    paddw       m2, m3
    paddw       m0, m4
    movh        m1, [src4q+sstrideq]
    movh        m3, [src4q+sstrideq*2]
    movh        m4, [src4q+sstride3q]
    add      src4q, sstrideq
    punpcklbw   m1, m5
    punpcklbw   m3, m5
    punpcklbw   m4, m5
%if ARCH_X86_64 && mmsize > 8
    pmullw      m1, m12
    pmullw      m3, m13
    pmullw      m4, m14
%else
    pmullw      m1, [filteryq+ 80]
    pmullw      m3, [filteryq+ 96]
    pmullw      m4, [filteryq+112]
%endif
    paddw       m0, m1
    paddw       m3, m4
    paddw       m0, m6
    paddw       m2, m3
    paddsw      m0, m2
    psraw       m0, 7
%ifidn %1, avg
    movh        m1, [dstq]
%endif
    packuswb    m0, m0
%ifidn %1, avg
    pavgb       m0, m1
%endif
    movh    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro
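
; The vertical filters keep two row pointers four lines apart (srcq and
; src4q): with the precomputed sstride3q, each pointer reaches four rows
; (+0, +1, +2, +3 strides), covering all eight taps without extra pointer
; arithmetic inside the loop.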

INIT_MMX mmxext
filter_sse2_v_fn put
filter_sse2_v_fn avg

INIT_XMM sse2
filter_sse2_v_fn put
filter_sse2_v_fn avg

%macro filter_v_fn 1
%assign %%px mmsize/2
%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 11, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 11, dst, dstride, src, sstride, filtery, src4, sstride3
    mov   filteryq, r5mp
%define hd r4mp
%endif
    mova        m6, [pw_256]
    lea  sstride3q, [sstrideq*3]
    lea      src4q, [srcq+sstrideq]
    sub       srcq, sstride3q
    mova        m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+32]
    mova        m9, [filteryq+64]
    mova       m10, [filteryq+96]
%endif
.loop:
    ; FIXME maybe reuse loads from previous rows, or just more generally
    ; unroll this to prevent multiple loads of the same data?
    movh        m0, [srcq]
    movh        m1, [srcq+sstrideq]
    movh        m2, [srcq+sstrideq*2]
    movh        m3, [srcq+sstride3q]
    movh        m4, [src4q]
    movh        m5, [src4q+sstrideq]
    punpcklbw   m0, m1
    punpcklbw   m2, m3
    movh        m1, [src4q+sstrideq*2]
    movh        m3, [src4q+sstride3q]
    add       srcq, sstrideq
    add      src4q, sstrideq
    punpcklbw   m4, m5
    punpcklbw   m1, m3
    pmaddubsw   m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddubsw   m2, m8
    pmaddubsw   m4, m9
    pmaddubsw   m1, m10
%else
    pmaddubsw   m2, [filteryq+32]
    pmaddubsw   m4, [filteryq+64]
    pmaddubsw   m1, [filteryq+96]
%endif
    paddw       m0, m4
    paddw       m2, m1
    paddsw      m0, m2
    pmulhrsw    m0, m6
%ifidn %1, avg
    movh        m1, [dstq]
%endif
    packuswb    m0, m0
%ifidn %1, avg
    pavgb       m0, m1
%endif
    movh    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro
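
; On 32-bit x86 there are too few general-purpose registers for all six
; arguments, so the vertical functions take only four in registers and
; fetch filtery from its stack slot (r5mp), while h is referenced directly
; in memory through the hd define (r4mp).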

INIT_MMX ssse3
filter_v_fn put
filter_v_fn avg

INIT_XMM ssse3
filter_v_fn put
filter_v_fn avg

%if ARCH_X86_64

%macro filter_vx2_fn 1
%assign %%px mmsize
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 14, dst, dstride, src, sstride, h, filtery, src4, sstride3
    mova       m13, [pw_256]
    lea  sstride3q, [sstrideq*3]
    lea      src4q, [srcq+sstrideq]
    sub       srcq, sstride3q
    mova        m8, [filteryq+ 0]
    mova        m9, [filteryq+32]
    mova       m10, [filteryq+64]
    mova       m11, [filteryq+96]
.loop:
    ; FIXME maybe reuse loads from previous rows, or just
    ; more generally unroll this to prevent multiple loads of
    ; the same data?
    movu        m0, [srcq]
    movu        m1, [srcq+sstrideq]
    movu        m2, [srcq+sstrideq*2]
    movu        m3, [srcq+sstride3q]
    movu        m4, [src4q]
    movu        m5, [src4q+sstrideq]
    movu        m6, [src4q+sstrideq*2]
    movu        m7, [src4q+sstride3q]
    add       srcq, sstrideq
    add      src4q, sstrideq
    SBUTTERFLY  bw, 0, 1, 12
    SBUTTERFLY  bw, 2, 3, 12
    SBUTTERFLY  bw, 4, 5, 12
    SBUTTERFLY  bw, 6, 7, 12
    pmaddubsw   m0, m8
    pmaddubsw   m1, m8
    pmaddubsw   m2, m9
    pmaddubsw   m3, m9
    pmaddubsw   m4, m10
    pmaddubsw   m5, m10
    pmaddubsw   m6, m11
    pmaddubsw   m7, m11
    paddw       m0, m4
    paddw       m1, m5
    paddw       m2, m6
    paddw       m3, m7
    paddsw      m0, m2
    paddsw      m1, m3
    pmulhrsw    m0, m13
    pmulhrsw    m1, m13
    packuswb    m0, m1
%ifidn %1, avg
    pavgb       m0, [dstq]
%endif
    mova    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro

INIT_XMM ssse3
filter_vx2_fn put
filter_vx2_fn avg

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
filter_vx2_fn put
filter_vx2_fn avg
%endif

%endif ; ARCH_X86_64

%macro fpel_fn 6-8 0, 4
%if %2 == 4
%define %%srcfn movh
%define %%dstfn movh
%else
%define %%srcfn movu
%define %%dstfn mova
%endif

%if %7 == 8
%define %%pavg pavgb
%define %%szsuf _8
%elif %7 == 16
%define %%pavg pavgw
%define %%szsuf _16
%else
%define %%szsuf
%endif

%if %2 <= mmsize
cglobal vp9_%1%2 %+ %%szsuf, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3
    lea  sstride3q, [sstrideq*3]
    lea  dstride3q, [dstrideq*3]
%else
cglobal vp9_%1%2 %+ %%szsuf, 5, 5, %8, dst, dstride, src, sstride, h
%endif
.loop:
    %%srcfn     m0, [srcq]
    %%srcfn     m1, [srcq+s%3]
    %%srcfn     m2, [srcq+s%4]
    %%srcfn     m3, [srcq+s%5]
%if %2/mmsize == 8
    %%srcfn     m4, [srcq+mmsize*4]
    %%srcfn     m5, [srcq+mmsize*5]
    %%srcfn     m6, [srcq+mmsize*6]
    %%srcfn     m7, [srcq+mmsize*7]
%endif
    lea       srcq, [srcq+sstrideq*%6]
%ifidn %1, avg
    %%pavg      m0, [dstq]
    %%pavg      m1, [dstq+d%3]
    %%pavg      m2, [dstq+d%4]
%if %2 == 4
    %%srcfn     m4, [dstq+d%5]
    %%pavg      m3, m4
%else
    %%pavg      m3, [dstq+d%5]
%endif
%if %2/mmsize == 8
    %%pavg      m4, [dstq+mmsize*4]
    %%pavg      m5, [dstq+mmsize*5]
    %%pavg      m6, [dstq+mmsize*6]
    %%pavg      m7, [dstq+mmsize*7]
%endif
%endif
    %%dstfn [dstq], m0
    %%dstfn [dstq+d%3], m1
    %%dstfn [dstq+d%4], m2
    %%dstfn [dstq+d%5], m3
%if %2/mmsize == 8
    %%dstfn [dstq+mmsize*4], m4
    %%dstfn [dstq+mmsize*5], m5
    %%dstfn [dstq+mmsize*6], m6
    %%dstfn [dstq+mmsize*7], m7
%endif
    lea       dstq, [dstq+dstrideq*%6]
    sub         hd, %6
    jnz .loop
    RET
%endmacro
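
; fpel_fn emits the full-pel copy/average functions: %1 is put/avg, %2 the
; block width, %3-%5 the offsets of the 2nd-4th load/store per iteration,
; %6 the number of rows per loop, %7 the bit depth for the avg variants
; (8 -> pavgb/_8, 16 -> pavgw/_16, 0 -> plain put), and %8 the XMM register
; count. The d16/s16/d32/s32 defines below appear to make the offset
; parameters dual-use: s%3/d%3 turn strideq into sstrideq/dstrideq, while
; offsets written as mmsize resolve through s16/d16 (or s32/d32 for YMM)
; back to the plain constants 16 and 32.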

%define d16 16
%define s16 16
%define d32 32
%define s32 32
INIT_MMX mmx
fpel_fn put, 4,  strideq, strideq*2, stride3q, 4
fpel_fn put, 8,  strideq, strideq*2, stride3q, 4
INIT_MMX mmxext
fpel_fn avg, 4,  strideq, strideq*2, stride3q, 4, 8
fpel_fn avg, 8,  strideq, strideq*2, stride3q, 4, 8
INIT_XMM sse
fpel_fn put, 16, strideq, strideq*2, stride3q, 4
fpel_fn put, 32, mmsize,  strideq,   strideq+mmsize, 2
fpel_fn put, 64, mmsize,  mmsize*2,  mmsize*3, 1
fpel_fn put, 128, mmsize, mmsize*2,  mmsize*3, 1, 0, 8
INIT_XMM sse2
fpel_fn avg, 16, strideq, strideq*2, stride3q, 4, 8
fpel_fn avg, 32, mmsize,  strideq,   strideq+mmsize, 2, 8
fpel_fn avg, 64, mmsize,  mmsize*2,  mmsize*3, 1, 8
INIT_YMM avx
fpel_fn put, 32, strideq, strideq*2, stride3q, 4
fpel_fn put, 64, mmsize,  strideq,   strideq+mmsize, 2
fpel_fn put, 128, mmsize, mmsize*2,  mmsize*3, 1
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
fpel_fn avg, 32, strideq, strideq*2, stride3q, 4, 8
fpel_fn avg, 64, mmsize,  strideq,   strideq+mmsize, 2, 8
%endif
INIT_MMX mmxext
fpel_fn avg,  8,  strideq, strideq*2, stride3q, 4, 16
INIT_XMM sse2
fpel_fn avg,  16, strideq, strideq*2, stride3q, 4, 16
fpel_fn avg,  32, mmsize,  strideq,   strideq+mmsize, 2, 16
fpel_fn avg,  64, mmsize,  mmsize*2,  mmsize*3, 1, 16
fpel_fn avg, 128, mmsize,  mmsize*2,  mmsize*3, 1, 16, 8
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
fpel_fn avg,  32, strideq, strideq*2, stride3q, 4, 16
fpel_fn avg,  64, mmsize,  strideq,   strideq+mmsize, 2, 16
fpel_fn avg, 128, mmsize,  mmsize*2,  mmsize*3, 1, 16
%endif
%undef s16
%undef d16
%undef s32
%undef d32