tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

vp8dsp.asm (32179B)


      1 ;******************************************************************************
      2 ;* VP8 MMXEXT optimizations
      3 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
      4 ;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
      5 ;*
      6 ;* This file is part of FFmpeg.
      7 ;*
      8 ;* FFmpeg is free software; you can redistribute it and/or
      9 ;* modify it under the terms of the GNU Lesser General Public
     10 ;* License as published by the Free Software Foundation; either
     11 ;* version 2.1 of the License, or (at your option) any later version.
     12 ;*
     13 ;* FFmpeg is distributed in the hope that it will be useful,
     14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
     15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     16 ;* Lesser General Public License for more details.
     17 ;*
     18 ;* You should have received a copy of the GNU Lesser General Public
     19 ;* License along with FFmpeg; if not, write to the Free Software
     20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     21 ;******************************************************************************
     22 
     23 %include "libavutil/x86/x86util.asm"
     24 
     25 SECTION_RODATA
     26 
        ; 4-tap coefficients as 16-bit pairs, two 16-byte rows per filter:
        ; (tap0,tap1) then (tap2,tap3). The mmxext put_vp8_epel4_h4 reads them at
        ; [table+mx*16-16] and [table+mx*16] and multiplies with pmaddwd.
     27 fourtap_filter_hw_m: times 4 dw  -6, 123
     28                     times 4 dw  12,  -1
     29                     times 4 dw  -9,  93
     30                     times 4 dw  50,  -6
     31                     times 4 dw  -6,  50
     32                     times 4 dw  93,  -9
     33                     times 4 dw  -1,  12
     34                     times 4 dw 123,  -6
     35 
        ; 6-tap coefficients as 16-bit pairs, three 16-byte rows per filter:
        ; (tap0,tap1), (tap2,tap3), (tap4,tap5). The mmxext put_vp8_epel4_h6 reads
        ; them at [table+mx*24-48], [..-32] and [..-16].
     36 sixtap_filter_hw_m:  times 4 dw   2, -11
     37                     times 4 dw 108,  36
     38                     times 4 dw  -8,   1
     39                     times 4 dw   3, -16
     40                     times 4 dw  77,  77
     41                     times 4 dw -16,   3
     42                     times 4 dw   1,  -8
     43                     times 4 dw  36, 108
     44                     times 4 dw -11,   2
     45 
        ; Same 4-tap coefficients as byte pairs (for pmaddubsw in the SSSE3 code),
        ; two 16-byte rows per filter, indexed like fourtap_filter_hw_m.
     46 fourtap_filter_hb_m: times 8 db  -6, 123
     47                     times 8 db  12,  -1
     48                     times 8 db  -9,  93
     49                     times 8 db  50,  -6
     50                     times 8 db  -6,  50
     51                     times 8 db  93,  -9
     52                     times 8 db  -1,  12
     53                     times 8 db 123,  -6
     54 
        ; 6-tap coefficients as byte pairs, three 16-byte rows per filter. Note the
        ; pairing differs from the word table: (tap0,tap5), (tap1,tap2), (tap3,tap4),
        ; which matches the pixel pairing produced by filter_h6_shuf1/2/3 below.
     55 sixtap_filter_hb_m:  times 8 db   2,   1
     56                     times 8 db -11, 108
     57                     times 8 db  36,  -8
     58                     times 8 db   3,   3
     59                     times 8 db -16,  77
     60                     times 8 db  77, -16
     61                     times 8 db   1,   2
     62                     times 8 db  -8,  36
     63                     times 8 db 108, -11
     64 
        ; 4-tap coefficients, one tap broadcast to 8 words per 16-byte row, four
        ; rows (64 bytes) per filter; used with pmullw. Readers compute the filter
        ; base as table+idx*32-32 (idx = mx or my).
     65 fourtap_filter_v_m:  times 8 dw  -6
     66                     times 8 dw 123
     67                     times 8 dw  12
     68                     times 8 dw  -1
     69                     times 8 dw  -9
     70                     times 8 dw  93
     71                     times 8 dw  50
     72                     times 8 dw  -6
     73                     times 8 dw  -6
     74                     times 8 dw  50
     75                     times 8 dw  93
     76                     times 8 dw  -9
     77                     times 8 dw  -1
     78                     times 8 dw  12
     79                     times 8 dw 123
     80                     times 8 dw  -6
     81 
        ; 6-tap coefficients, one tap broadcast to 8 words per 16-byte row, six
        ; rows (96 bytes) per filter; used with pmullw. Readers compute the filter
        ; base as table+idx*48-96 (idx = mx or my).
     82 sixtap_filter_v_m:   times 8 dw   2
     83                     times 8 dw -11
     84                     times 8 dw 108
     85                     times 8 dw  36
     86                     times 8 dw  -8
     87                     times 8 dw   1
     88                     times 8 dw   3
     89                     times 8 dw -16
     90                     times 8 dw  77
     91                     times 8 dw  77
     92                     times 8 dw -16
     93                     times 8 dw   3
     94                     times 8 dw   1
     95                     times 8 dw  -8
     96                     times 8 dw  36
     97                     times 8 dw 108
     98                     times 8 dw -11
     99                     times 8 dw   2
    100 
        ; Bilinear weights 1..7 broadcast as words, one 16-byte row per weight.
        ; The non-SSSE3 bilinear code loads weight w from [table+w*16-16] and the
        ; complementary weight 8-w from [table-w*16+7*16].
    101 bilinear_filter_vw_m: times 8 dw 1
    102                      times 8 dw 2
    103                      times 8 dw 3
    104                      times 8 dw 4
    105                      times 8 dw 5
    106                      times 8 dw 6
    107                      times 8 dw 7
    108 
        ; Bilinear weights as byte pairs (8-w, w) for pmaddubsw, one 16-byte row
        ; per w = 1..7; read at [table+w*16-16].
    109 bilinear_filter_vb_m: times 8 db 7, 1
    110                      times 8 db 6, 2
    111                      times 8 db 5, 3
    112                      times 8 db 4, 4
    113                      times 8 db 3, 5
    114                      times 8 db 2, 6
    115                      times 8 db 1, 7
    116 
        ; With PIC, every table name below expands to picregq; each function first
        ; does "lea picregq, [<table>_m]" and then indexes off that register.
        ; npicregs is the number of extra GPRs the functions request for this.
        ; Without PIC the names refer to the tables directly.
    117 %if PIC
    118 %define fourtap_filter_hw  picregq
    119 %define sixtap_filter_hw   picregq
    120 %define fourtap_filter_hb  picregq
    121 %define sixtap_filter_hb   picregq
    122 %define fourtap_filter_v   picregq
    123 %define sixtap_filter_v    picregq
    124 %define bilinear_filter_vw picregq
    125 %define bilinear_filter_vb picregq
    126 %define npicregs 1
    127 %else
    128 %define fourtap_filter_hw  fourtap_filter_hw_m
    129 %define sixtap_filter_hw   sixtap_filter_hw_m
    130 %define fourtap_filter_hb  fourtap_filter_hb_m
    131 %define sixtap_filter_hb   sixtap_filter_hb_m
    132 %define fourtap_filter_v   fourtap_filter_v_m
    133 %define sixtap_filter_v    sixtap_filter_v_m
    134 %define bilinear_filter_vw bilinear_filter_vw_m
    135 %define bilinear_filter_vb bilinear_filter_vb_m
    136 %define npicregs 0
    137 %endif
    138 
        ; pshufb masks that build adjacent-pixel byte pairs for pmaddubsw:
        ; h2 pairs byte k with k+1, h4 pairs byte k+2 with k+3.
    139 filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,  6, 6,  7,  7,  8
    140 filter_h4_shuf:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,  8, 8,  9,  9, 10
    141 
        ; 6-tap pshufb masks: shuf1 pairs byte k with k+5, shuf2 pairs k+1 with k+2,
        ; shuf3 pairs k+3 with k+4 (same pairing as the rows of sixtap_filter_hb_m).
    142 filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11,  7, 12
    143 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,  7, 7,  8,  8,  9
    144 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8,  9, 9, 10, 10, 11
    145 
        ; NOTE(review): not referenced in this part of the file; presumably
        ; multipliers for the full IDCT further down - confirm.
    146 pw_20091: times 4 dw 20091
    147 pw_17734: times 4 dw 17734
    148 
        ; Word constants defined outside this file.
    149 cextern pw_3
    150 cextern pw_4
    151 cextern pw_64
    152 cextern pw_256
    154 SECTION .text
    155 
    156 ;-------------------------------------------------------------------------------
    157 ; subpel MC functions:
    158 ;
    159 ; void ff_put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, ptrdiff_t deststride,
    160 ;                                                 const uint8_t *src, ptrdiff_t srcstride,
    161 ;                                                 int height,   int mx, int my);
    162 ;-------------------------------------------------------------------------------
    163 
    164 %macro FILTER_SSSE3 1
        ; FILTER_SSSE3 <width>: emits put_vp8_epel<width>_{h6,h4,v4,v6} for the
        ; currently selected register set (see the INIT_* lines after %endmacro).
        ; All four use pmaddubsw on interleaved pixel-byte pairs with the byte
        ; coefficient tables (*_filter_hb).

        ; --- horizontal 6-tap ---
        ; mx is the 6th argument; an extra GPR (picreg) is requested only if PIC.
    165 cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg
    166    lea      mxd, [mxq*3]                  ; mx*3, scaled by 8 below => 24 bytes * mx
    167    mova      m3, [filter_h6_shuf2]
    168    mova      m4, [filter_h6_shuf3]
    169 %if PIC
    170    lea  picregq, [sixtap_filter_hb_m]
    171 %endif
    172    mova      m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes
    173    mova      m6, [sixtap_filter_hb+mxq*8-32]
    174    mova      m7, [sixtap_filter_hb+mxq*8-16]
    175 
    176 .nextrow:
    177    movu      m0, [srcq-2]                 ; row starts 2 pixels left of the output position
    178    mova      m1, m0
    179    mova      m2, m0
    180 %if mmsize == 8
    181 ; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
    182 ; shuffle with a memory operand
    183    punpcklbw m0, [srcq+3]
    184 %else
    185    pshufb    m0, [filter_h6_shuf1]
    186 %endif
    187    pshufb    m1, m3
    188    pshufb    m2, m4
    189    pmaddubsw m0, m5                       ; pixel pairs (k, k+5)   * first coefficient row
    190    pmaddubsw m1, m6                       ; pixel pairs (k+1, k+2) * second coefficient row
    191    pmaddubsw m2, m7                       ; pixel pairs (k+3, k+4) * third coefficient row
    192    paddsw    m0, m1
    193    paddsw    m0, m2
    194    pmulhrsw  m0, [pw_256]                 ; rounding scale-down (pw_256 is external; presumably words of 256, i.e. (x+64)>>7)
    195    packuswb  m0, m0                       ; saturate words to unsigned bytes
    196    movh  [dstq], m0        ; store
    197 
    198    ; go to next line
    199    add     dstq, dststrideq
    200    add     srcq, srcstrideq
    201    dec  heightd            ; next row
    202    jg .nextrow
    203    RET
    204 
        ; --- horizontal 4-tap ---
    205 cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
    206    shl      mxd, 4                        ; mx*16: byte offset into the table
    207    mova      m2, [pw_256]
    208    mova      m3, [filter_h2_shuf]
    209    mova      m4, [filter_h4_shuf]
    210 %if PIC
    211    lea  picregq, [fourtap_filter_hb_m]
    212 %endif
    213    mova      m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes
    214    mova      m6, [fourtap_filter_hb+mxq]
    215 
    216 .nextrow:
    217    movu      m0, [srcq-1]                 ; row starts 1 pixel left of the output position
    218    mova      m1, m0
    219    pshufb    m0, m3                       ; pixel pairs (k, k+1)
    220    pshufb    m1, m4                       ; pixel pairs (k+2, k+3)
    221    pmaddubsw m0, m5
    222    pmaddubsw m1, m6
    223    paddsw    m0, m1
    224    pmulhrsw  m0, m2                       ; same rounding scale-down as in h6, constant cached in m2
    225    packuswb  m0, m0
    226    movh  [dstq], m0        ; store
    227 
    228    ; go to next line
    229    add     dstq, dststrideq
    230    add     srcq, srcstrideq
    231    dec  heightd            ; next row
    232    jg .nextrow
    233    RET
    234 
        ; --- vertical 4-tap ---
        ; Seven arguments are loaded; the 6th argument's register is named picreg
        ; here and is overwritten with the table address when PIC is enabled.
    235 cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    236    shl      myd, 4
    237 %if PIC
    238    lea  picregq, [fourtap_filter_hb_m]
    239 %endif
    240    mova      m5, [fourtap_filter_hb+myq-16]
    241    mova      m6, [fourtap_filter_hb+myq]
    242    mova      m7, [pw_256]
    243 
    244    ; read 3 lines
    245    sub     srcq, srcstrideq
    246    movh      m0, [srcq]
    247    movh      m1, [srcq+  srcstrideq]
    248    movh      m2, [srcq+2*srcstrideq]
    249    add     srcq, srcstrideq
    250 
        ; Loop invariant at .nextrow: m0/m1/m2 hold the rows at src-stride, src and
        ; src+stride; the row at src+2*stride is loaded into m3 each iteration.
    251 .nextrow:
    252    movh      m3, [srcq+2*srcstrideq]      ; read new row
    253    mova      m4, m0
    254    mova      m0, m1
    255    punpcklbw m4, m1                       ; interleave the first two rows
    256    mova      m1, m2
    257    punpcklbw m2, m3                       ; interleave the last two rows
    258    pmaddubsw m4, m5
    259    pmaddubsw m2, m6
    260    paddsw    m4, m2
    261    mova      m2, m3                       ; finish rotating the row window down by one
    262    pmulhrsw  m4, m7
    263    packuswb  m4, m4
    264    movh  [dstq], m4
    265 
    266    ; go to next line
    267    add      dstq, dststrideq
    268    add      srcq, srcstrideq
    269    dec   heightd                          ; next row
    270    jg .nextrow
    271    RET
    272 
        ; --- vertical 6-tap ---
        ; As in v4, the 6th argument's register doubles as picreg.
    273 cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    274    lea      myd, [myq*3]
    275 %if PIC
    276    lea  picregq, [sixtap_filter_hb_m]
    277 %endif
    278    lea      myq, [sixtap_filter_hb+myq*8] ; myq = table + my*24; the filter rows sit at myq-48/-32/-16
    279 
    280    ; read 5 lines
    281    sub     srcq, srcstrideq
    282    sub     srcq, srcstrideq
    283    movh      m0, [srcq]
    284    movh      m1, [srcq+srcstrideq]
    285    movh      m2, [srcq+srcstrideq*2]
    286    lea     srcq, [srcq+srcstrideq*2]
    287    add     srcq, srcstrideq
    288    movh      m3, [srcq]
    289    movh      m4, [srcq+srcstrideq]
    290 
        ; Loop invariant at .nextrow: m0..m4 hold the five rows ending at
        ; src+stride (srcq is now one row past the original src).
    291 .nextrow:
    292    movh      m5, [srcq+2*srcstrideq]      ; read new row
    293    mova      m6, m0
    294    punpcklbw m6, m5                       ; first row paired with sixth row
    295    mova      m0, m1
    296    punpcklbw m1, m2                       ; second row paired with third row
    297    mova      m7, m3
    298    punpcklbw m7, m4                       ; fourth row paired with fifth row
    299    pmaddubsw m6, [myq-48]
    300    pmaddubsw m1, [myq-32]
    301    pmaddubsw m7, [myq-16]
    302    paddsw    m6, m1
    303    paddsw    m6, m7
    304    mova      m1, m2                       ; rotate the row window down by one
    305    mova      m2, m3
    306    pmulhrsw  m6, [pw_256]
    307    mova      m3, m4
    308    packuswb  m6, m6
    309    mova      m4, m5
    310    movh  [dstq], m6
    311 
    312    ; go to next line
    313    add      dstq, dststrideq
    314    add      srcq, srcstrideq
    315    dec   heightd                          ; next row
    316    jg .nextrow
    317    RET
    318 %endmacro
    319 
        ; Instantiate for 4-wide (MMX registers) and 8-wide (XMM registers) blocks.
    320 INIT_MMX ssse3
    321 FILTER_SSSE3 4
    322 INIT_XMM ssse3
    323 FILTER_SSSE3 8
    324 
    325 ; 4-pixel-wide block (height rows), H-only 4-tap filter
        ; Uses pmaddwd on 16-bit pixel pairs with the word coefficient table.
        ; Register roles: mm4/mm5 = coefficient pairs, mm6 = zero, mm7 = pw_64.
    326 INIT_MMX mmxext
    327 cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
    328    shl       mxd, 4
    329 %if PIC
    330    lea   picregq, [fourtap_filter_hw_m]
    331 %endif
    332    movq      mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words
    333    movq      mm5, [fourtap_filter_hw+mxq]
    334    movq      mm7, [pw_64]
    335    pxor      mm6, mm6
    336 
    337 .nextrow:
    338    movq      mm1, [srcq-1]                ; (ABCDEFGH) load 8 horizontal pixels
    339 
    340    ; first set of 2 pixels
    341    movq      mm2, mm1                     ; byte ABCD..
    342    punpcklbw mm1, mm6                     ; byte->word ABCD
    343    pshufw    mm0, mm2, 9                  ; byte CDEF..
    344    punpcklbw mm0, mm6                     ; byte->word CDEF
    345    pshufw    mm3, mm1, 0x94               ; word ABBC
    346    pshufw    mm1, mm0, 0x94               ; word CDDE
    347    pmaddwd   mm3, mm4                     ; multiply 2px with F0/F1
    348    movq      mm0, mm1                     ; backup for second set of pixels
    349    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
    350    paddd     mm3, mm1                     ; finish 1st 2px
    351 
    352    ; second set of 2 pixels, use backup of above
    353    punpckhbw mm2, mm6                     ; byte->word EFGH
    354    pmaddwd   mm0, mm4                     ; multiply backed up 2px with F0/F1
    355    pshufw    mm1, mm2, 0x94               ; word EFFG
    356    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
    357    paddd     mm0, mm1                     ; finish 2nd 2px
    358 
    359    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    360    packssdw  mm3, mm0                     ; merge dword->word (4px)
    361    paddsw    mm3, mm7                     ; rounding
    362    psraw     mm3, 7
    363    packuswb  mm3, mm6                     ; clip and word->bytes
    364    movd   [dstq], mm3                     ; store
    365 
    366    ; go to next line
    367    add      dstq, dststrideq
    368    add      srcq, srcstrideq
    369    dec   heightd                          ; next row
    370    jg .nextrow
    371    RET
    372 
    373 ; 4-pixel-wide block (height rows), H-only 6-tap filter
        ; Uses pmaddwd on 16-bit pixel pairs with the word coefficient table.
        ; Register roles: mm4/mm5/mm6 = coefficient pairs, mm7 = pw_64.
        ; mm3 must be zero at .nextrow; it is reused as scratch inside the loop
        ; and cleared again before the end of each iteration.
    374 INIT_MMX mmxext
    375 cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
    376    lea       mxd, [mxq*3]
    377 %if PIC
    378    lea   picregq, [sixtap_filter_hw_m]
    379 %endif
    380    movq      mm4, [sixtap_filter_hw+mxq*8-48] ; set up 6tap filter in words
    381    movq      mm5, [sixtap_filter_hw+mxq*8-32]
    382    movq      mm6, [sixtap_filter_hw+mxq*8-16]
    383    movq      mm7, [pw_64]
    384    pxor      mm3, mm3
    385 
    386 .nextrow:
    387    movq      mm1, [srcq-2]                ; (ABCDEFGH) load 8 horizontal pixels
    388 
    389    ; first set of 2 pixels
    390    movq      mm2, mm1                     ; byte ABCD..
    391    punpcklbw mm1, mm3                     ; byte->word ABCD
    392    pshufw    mm0, mm2, 0x9                ; byte CDEF..
    393    punpckhbw mm2, mm3                     ; byte->word EFGH
    394    punpcklbw mm0, mm3                     ; byte->word CDEF
    395    pshufw    mm1, mm1, 0x94               ; word ABBC
    396    pshufw    mm2, mm2, 0x94               ; word EFFG
    397    pmaddwd   mm1, mm4                     ; multiply 2px with F0/F1
    398    pshufw    mm3, mm0, 0x94               ; word CDDE
    399    movq      mm0, mm3                     ; backup for second set of pixels
    400    pmaddwd   mm3, mm5                     ; multiply 2px with F2/F3
    401    paddd     mm1, mm3                     ; add to 1st 2px cache
    402    movq      mm3, mm2                     ; backup for second set of pixels
    403    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
    404    paddd     mm1, mm2                     ; finish 1st 2px
    405 
    406    ; second set of 2 pixels, use backup of above
    407    movd      mm2, [srcq+3]                ; byte FGHI (prevent overreads)
    408    pmaddwd   mm0, mm4                     ; multiply 1st backed up 2px with F0/F1
    409    pmaddwd   mm3, mm5                     ; multiply 2nd backed up 2px with F2/F3
    410    paddd     mm0, mm3                     ; add to 2nd 2px cache
    411    pxor      mm3, mm3                     ; re-zero mm3 for the unpack below and for the next row
    412    punpcklbw mm2, mm3                     ; byte->word FGHI
    413    pshufw    mm2, mm2, 0xE9               ; word GHHI
    414    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
    415    paddd     mm0, mm2                     ; finish 2nd 2px
    416 
    417    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    418    packssdw  mm1, mm0                     ; merge dword->word (4px)
    419    paddsw    mm1, mm7                     ; rounding
    420    psraw     mm1, 7
    421    packuswb  mm1, mm3                     ; clip and word->bytes
    422    movd   [dstq], mm1                     ; store
    423 
    424    ; go to next line
    425    add      dstq, dststrideq
    426    add      srcq, srcstrideq
    427    dec   heightd                          ; next row
    428    jg .nextrow
    429    RET
    430 
    431 INIT_XMM sse2
        ; 8-pixel-wide block, H-only 4-tap filter (SSE2).
        ; Loads four byte-shifted copies of the row, widens them to words and
        ; multiplies each by one broadcast tap from fourtap_filter_v (the
        ; broadcast-word table is shared with the vertical filters).
        ; Register roles: m4 = pw_64, m5/m6 = taps 0/1, m7 = zero; taps 2/3 are
        ; cached in m8/m9 when those registers exist, otherwise read from memory.
    432 cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg
    433    shl      mxd, 5                        ; mx*32
    434 %if PIC
    435    lea  picregq, [fourtap_filter_v_m]
    436 %endif
    437    lea      mxq, [fourtap_filter_v+mxq-32] ; mxq = address of this filter's first tap row
    438    pxor      m7, m7
    439    mova      m4, [pw_64]
    440    mova      m5, [mxq+ 0]
    441    mova      m6, [mxq+16]
    442 %ifdef m8
    443    mova      m8, [mxq+32]
    444    mova      m9, [mxq+48]
    445 %endif
    446 .nextrow:
    447    movq      m0, [srcq-1]                 ; pixels for tap 0
    448    movq      m1, [srcq-0]                 ; pixels for tap 1
    449    movq      m2, [srcq+1]                 ; pixels for tap 2
    450    movq      m3, [srcq+2]                 ; pixels for tap 3
    451    punpcklbw m0, m7                       ; byte->word (zero-extend)
    452    punpcklbw m1, m7
    453    punpcklbw m2, m7
    454    punpcklbw m3, m7
    455    pmullw    m0, m5
    456    pmullw    m1, m6
    457 %ifdef m8
    458    pmullw    m2, m8
    459    pmullw    m3, m9
    460 %else
    461    pmullw    m2, [mxq+32]
    462    pmullw    m3, [mxq+48]
    463 %endif
    464    paddsw    m0, m1
    465    paddsw    m2, m3
    466    paddsw    m0, m2
    467    paddsw    m0, m4                       ; rounding
    468    psraw     m0, 7
    469    packuswb  m0, m7                       ; clip and word->bytes
    470    movh  [dstq], m0        ; store
    471 
    472    ; go to next line
    473    add     dstq, dststrideq
    474    add     srcq, srcstrideq
    475    dec  heightd            ; next row
    476    jg .nextrow
    477    RET
    478 
    479 INIT_XMM sse2
        ; 8-pixel-wide block, H-only 6-tap filter (SSE2).
        ; Same scheme as put_vp8_epel8_h4: six byte-shifted copies of the row are
        ; widened to words and multiplied by broadcast taps from sixtap_filter_v.
        ; Register roles: m6 = pw_64, m7 = zero; the six taps are cached in
        ; m8..m13 when those registers exist, otherwise read from [mxq+...].
    480 cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, height, mx, picreg
    481    lea      mxd, [mxq*3]
    482    shl      mxd, 4                        ; mx*48
    483 %if PIC
    484    lea  picregq, [sixtap_filter_v_m]
    485 %endif
    486    lea      mxq, [sixtap_filter_v+mxq-96] ; mxq = address of this filter's first tap row
    487    pxor      m7, m7
    488    mova      m6, [pw_64]
    489 %ifdef m8
    490    mova      m8, [mxq+ 0]
    491    mova      m9, [mxq+16]
    492    mova     m10, [mxq+32]
    493    mova     m11, [mxq+48]
    494    mova     m12, [mxq+64]
    495    mova     m13, [mxq+80]
    496 %endif
    497 .nextrow:
    498    movq      m0, [srcq-2]                 ; pixels for tap 0
    499    movq      m1, [srcq-1]                 ; pixels for tap 1
    500    movq      m2, [srcq-0]                 ; pixels for tap 2
    501    movq      m3, [srcq+1]                 ; pixels for tap 3
    502    movq      m4, [srcq+2]                 ; pixels for tap 4
    503    movq      m5, [srcq+3]                 ; pixels for tap 5
    504    punpcklbw m0, m7                       ; byte->word (zero-extend)
    505    punpcklbw m1, m7
    506    punpcklbw m2, m7
    507    punpcklbw m3, m7
    508    punpcklbw m4, m7
    509    punpcklbw m5, m7
    510 %ifdef m8
    511    pmullw    m0, m8
    512    pmullw    m1, m9
    513    pmullw    m2, m10
    514    pmullw    m3, m11
    515    pmullw    m4, m12
    516    pmullw    m5, m13
    517 %else
    518    pmullw    m0, [mxq+ 0]
    519    pmullw    m1, [mxq+16]
    520    pmullw    m2, [mxq+32]
    521    pmullw    m3, [mxq+48]
    522    pmullw    m4, [mxq+64]
    523    pmullw    m5, [mxq+80]
    524 %endif
        ; Saturating adds: taps 1 and 4 (the negative entries in the table) are
        ; combined first, and likewise the small outer taps 0 and 5, before the
        ; large centre taps 2 and 3 are added in.
    525    paddsw    m1, m4
    526    paddsw    m0, m5
    527    paddsw    m1, m2
    528    paddsw    m0, m3
    529    paddsw    m0, m1
    530    paddsw    m0, m6                       ; rounding
    531    psraw     m0, 7
    532    packuswb  m0, m7                       ; clip and word->bytes
    533    movh  [dstq], m0        ; store
    534 
    535    ; go to next line
    536    add     dstq, dststrideq
    537    add     srcq, srcstrideq
    538    dec  heightd            ; next row
    539    jg .nextrow
    540    RET
    541 
    542 %macro FILTER_V 1
        ; FILTER_V <width>: emits put_vp8_epel<width>_{v4,v6} for the currently
        ; selected register set, using pmullw with the broadcast-word tables.
        ; In both functions the 6th argument's register is named picreg and is
        ; overwritten with the table address when PIC is enabled.
    543 ; %1-pixel-wide block, V-only 4-tap filter
    544 cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    545    shl      myd, 5                        ; my*32
    546 %if PIC
    547    lea  picregq, [fourtap_filter_v_m]
    548 %endif
    549    lea      myq, [fourtap_filter_v+myq-32] ; myq = address of this filter's first tap row
    550    mova      m6, [pw_64]
    551    pxor      m7, m7
    552    mova      m5, [myq+48]                 ; tap 3, kept in a register
    553 
    554    ; read 3 lines
    555    sub     srcq, srcstrideq
    556    movh      m0, [srcq]
    557    movh      m1, [srcq+  srcstrideq]
    558    movh      m2, [srcq+2*srcstrideq]
    559    add     srcq, srcstrideq
    560    punpcklbw m0, m7
    561    punpcklbw m1, m7
    562    punpcklbw m2, m7
    563 
        ; Loop invariant at .nextrow: m0/m1/m2 hold the rows at src-stride, src
        ; and src+stride, already widened to words.
    564 .nextrow:
    565    ; first calculate negative taps (to prevent losing positive overflows)
    566    movh      m4, [srcq+2*srcstrideq]      ; read new row
    567    punpcklbw m4, m7
    568    mova      m3, m4                       ; keep the unscaled new row for the window rotation
    569    pmullw    m0, [myq+0]
    570    pmullw    m4, m5
    571    paddsw    m4, m0
    572 
    573    ; then calculate positive taps
    574    mova      m0, m1                       ; rotate window: the row at src becomes the oldest row
    575    pmullw    m1, [myq+16]
    576    paddsw    m4, m1
    577    mova      m1, m2
    578    pmullw    m2, [myq+32]
    579    paddsw    m4, m2
    580    mova      m2, m3
    581 
    582    ; round/clip/store
    583    paddsw    m4, m6
    584    psraw     m4, 7
    585    packuswb  m4, m7
    586    movh  [dstq], m4
    587 
    588    ; go to next line
    589    add     dstq, dststrideq
    590    add     srcq, srcstrideq
    591    dec  heightd                           ; next row
    592    jg .nextrow
    593    RET
    594 
    595 
    596 ; %1-pixel-wide block, V-only 6-tap filter
    597 cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    598    shl      myd, 4
    599    lea      myq, [myq*3]                  ; my*48
    600 %if PIC
    601    lea  picregq, [sixtap_filter_v_m]
    602 %endif
    603    lea      myq, [sixtap_filter_v+myq-96] ; myq = address of this filter's first tap row
    604    pxor      m7, m7
    605 
    606    ; read 5 lines
    607    sub     srcq, srcstrideq
    608    sub     srcq, srcstrideq
    609    movh      m0, [srcq]
    610    movh      m1, [srcq+srcstrideq]
    611    movh      m2, [srcq+srcstrideq*2]
    612    lea     srcq, [srcq+srcstrideq*2]
    613    add     srcq, srcstrideq
    614    movh      m3, [srcq]
    615    movh      m4, [srcq+srcstrideq]
    616    punpcklbw m0, m7
    617    punpcklbw m1, m7
    618    punpcklbw m2, m7
    619    punpcklbw m3, m7
    620    punpcklbw m4, m7
    621 
        ; Loop invariant at .nextrow: m0..m4 hold five consecutive rows as words
        ; (srcq is now one row past the original src); the sixth row is loaded
        ; inside the loop.
    622 .nextrow:
    623    ; first calculate negative taps (to prevent losing positive overflows)
    624    mova      m5, m1
    625    pmullw    m5, [myq+16]
    626    mova      m6, m4
    627    pmullw    m6, [myq+64]
    628    paddsw    m6, m5
    629 
    630    ; then calculate positive taps
    631    movh      m5, [srcq+2*srcstrideq]      ; read new row
    632    punpcklbw m5, m7
    633    pmullw    m0, [myq+0]
    634    paddsw    m6, m0
    635    mova      m0, m1                       ; rotate the row window down by one while accumulating
    636    mova      m1, m2
    637    pmullw    m2, [myq+32]
    638    paddsw    m6, m2
    639    mova      m2, m3
    640    pmullw    m3, [myq+48]
    641    paddsw    m6, m3
    642    mova      m3, m4
    643    mova      m4, m5
    644    pmullw    m5, [myq+80]
    645    paddsw    m6, m5
    646 
    647    ; round/clip/store
    648    paddsw    m6, [pw_64]
    649    psraw     m6, 7
    650    packuswb  m6, m7
    651    movh  [dstq], m6
    652 
    653    ; go to next line
    654    add     dstq, dststrideq
    655    add     srcq, srcstrideq
    656    dec  heightd                           ; next row
    657    jg .nextrow
    658    RET
    659 %endmacro
    660 
        ; Instantiate for 4-wide (MMX registers) and 8-wide (XMM registers) blocks.
    661 INIT_MMX mmxext
    662 FILTER_V 4
    663 INIT_XMM sse2
    664 FILTER_V 8
    665 
    666 %macro FILTER_BILINEAR 1
        ; FILTER_BILINEAR <width>: emits put_vp8_bilinear<width>_v and _h for the
        ; currently selected instruction set. Each function has two alternative
        ; bodies chosen by cpuflag(ssse3): pmaddubsw on byte pairs (SSSE3) or
        ; pmullw on widened words (otherwise). Both bodies fall into a shared loop
        ; tail after the %endif. Every iteration produces two output rows, and
        ; the loop counter is decremented by 2.
    667 %if cpuflag(ssse3)
        ; --- vertical, SSSE3: m3 = (8-my, my) byte pairs, m4 = zero ---
    668 cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, picreg, my
    669    shl      myd, 4
    670 %if PIC
    671    lea  picregq, [bilinear_filter_vb_m]
    672 %endif
    673    pxor      m4, m4
    674    mova      m3, [bilinear_filter_vb+myq-16]
    675 .nextrow:
    676    movh      m0, [srcq+srcstrideq*0]
    677    movh      m1, [srcq+srcstrideq*1]
    678    movh      m2, [srcq+srcstrideq*2]      ; third row is needed for the second output row
    679    punpcklbw m0, m1                       ; pairs (row0, row1)
    680    punpcklbw m1, m2                       ; pairs (row1, row2)
    681    pmaddubsw m0, m3
    682    pmaddubsw m1, m3
    683    psraw     m0, 2
    684    psraw     m1, 2
    685    pavgw     m0, m4                       ; average with zero: (x + 1) >> 1
    686    pavgw     m1, m4
    687 %if mmsize==8
    688    packuswb  m0, m0
    689    packuswb  m1, m1
    690    movh   [dstq+dststrideq*0], m0
    691    movh   [dstq+dststrideq*1], m1
    692 %else
    693    packuswb  m0, m1                       ; low half = first output row, high half = second
    694    movh   [dstq+dststrideq*0], m0
    695    movhps [dstq+dststrideq*1], m0
    696 %endif
    697 %else ; cpuflag(ssse3)
        ; --- vertical, non-SSSE3: m5 = weight my, m4 = weight 8-my, m6 = zero ---
    698 cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my
    699    shl      myd, 4
    700 %if PIC
    701    lea  picregq, [bilinear_filter_vw_m]
    702 %endif
    703    pxor      m6, m6
    704    mova      m5, [bilinear_filter_vw+myq-1*16]
    705    neg      myq                           ; index from the other end of the table for the complementary weight
    706    mova      m4, [bilinear_filter_vw+myq+7*16]
    707 .nextrow:
    708    movh      m0, [srcq+srcstrideq*0]
    709    movh      m1, [srcq+srcstrideq*1]
    710    movh      m3, [srcq+srcstrideq*2]
    711    punpcklbw m0, m6
    712    punpcklbw m1, m6
    713    punpcklbw m3, m6
    714    mova      m2, m1                       ; the middle row contributes to both output rows
    715    pmullw    m0, m4
    716    pmullw    m1, m5
    717    pmullw    m2, m4
    718    pmullw    m3, m5
    719    paddsw    m0, m1
    720    paddsw    m2, m3
    721    psraw     m0, 2
    722    psraw     m2, 2
    723    pavgw     m0, m6                       ; average with zero: (x + 1) >> 1
    724    pavgw     m2, m6
    725 %if mmsize == 8
    726    packuswb  m0, m0
    727    packuswb  m2, m2
    728    movh   [dstq+dststrideq*0], m0
    729    movh   [dstq+dststrideq*1], m2
    730 %else
    731    packuswb  m0, m2
    732    movh   [dstq+dststrideq*0], m0
    733    movhps [dstq+dststrideq*1], m0
    734 %endif
    735 %endif ; cpuflag(ssse3)
    736 
        ; shared loop tail for both vertical variants: advance two rows
    737    lea     dstq, [dstq+dststrideq*2]
    738    lea     srcq, [srcq+srcstrideq*2]
    739    sub  heightd, 2
    740    jg .nextrow
    741    RET
    742 
    743 %if cpuflag(ssse3)
        ; --- horizontal, SSSE3: m2 = adjacent-pixel shuffle, m3 = (8-mx, mx), m4 = zero ---
    744 cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride, height, mx, picreg
    745    shl      mxd, 4
    746 %if PIC
    747    lea  picregq, [bilinear_filter_vb_m]
    748 %endif
    749    pxor      m4, m4
    750    mova      m2, [filter_h2_shuf]
    751    mova      m3, [bilinear_filter_vb+mxq-16]
    752 .nextrow:
    753    movu      m0, [srcq+srcstrideq*0]
    754    movu      m1, [srcq+srcstrideq*1]
    755    pshufb    m0, m2                       ; pairs (pixel k, pixel k+1)
    756    pshufb    m1, m2
    757    pmaddubsw m0, m3
    758    pmaddubsw m1, m3
    759    psraw     m0, 2
    760    psraw     m1, 2
    761    pavgw     m0, m4                       ; average with zero: (x + 1) >> 1
    762    pavgw     m1, m4
    763 %if mmsize==8
    764    packuswb  m0, m0
    765    packuswb  m1, m1
    766    movh   [dstq+dststrideq*0], m0
    767    movh   [dstq+dststrideq*1], m1
    768 %else
    769    packuswb  m0, m1
    770    movh   [dstq+dststrideq*0], m0
    771    movhps [dstq+dststrideq*1], m0
    772 %endif
    773 %else ; cpuflag(ssse3)
        ; --- horizontal, non-SSSE3: m5 = weight mx, m4 = weight 8-mx, m6 = zero ---
    774 cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
    775    shl      mxd, 4
    776 %if PIC
    777    lea  picregq, [bilinear_filter_vw_m]
    778 %endif
    779    pxor      m6, m6
    780    mova      m5, [bilinear_filter_vw+mxq-1*16]
    781    neg      mxq                           ; index from the other end of the table for the complementary weight
    782    mova      m4, [bilinear_filter_vw+mxq+7*16]
    783 .nextrow:
    784    movh      m0, [srcq+srcstrideq*0+0]
    785    movh      m1, [srcq+srcstrideq*0+1]    ; same row, shifted by one pixel
    786    movh      m2, [srcq+srcstrideq*1+0]
    787    movh      m3, [srcq+srcstrideq*1+1]
    788    punpcklbw m0, m6
    789    punpcklbw m1, m6
    790    punpcklbw m2, m6
    791    punpcklbw m3, m6
    792    pmullw    m0, m4
    793    pmullw    m1, m5
    794    pmullw    m2, m4
    795    pmullw    m3, m5
    796    paddsw    m0, m1
    797    paddsw    m2, m3
    798    psraw     m0, 2
    799    psraw     m2, 2
    800    pavgw     m0, m6                       ; average with zero: (x + 1) >> 1
    801    pavgw     m2, m6
    802 %if mmsize == 8
    803    packuswb  m0, m0
    804    packuswb  m2, m2
    805    movh   [dstq+dststrideq*0], m0
    806    movh   [dstq+dststrideq*1], m2
    807 %else
    808    packuswb  m0, m2
    809    movh   [dstq+dststrideq*0], m0
    810    movhps [dstq+dststrideq*1], m0
    811 %endif
    812 %endif ; cpuflag(ssse3)
    813 
        ; shared loop tail for both horizontal variants: advance two rows
    814    lea     dstq, [dstq+dststrideq*2]
    815    lea     srcq, [srcq+srcstrideq*2]
    816    sub  heightd, 2
    817    jg .nextrow
    818    RET
    819 %endmacro
    820 
        ; Instantiate the word-multiply variants (mmxext/sse2) and the pmaddubsw
        ; variants (ssse3), each for 4-wide and 8-wide blocks.
    821 INIT_MMX mmxext
    822 FILTER_BILINEAR 4
    823 INIT_XMM sse2
    824 FILTER_BILINEAR 8
    825 INIT_MMX ssse3
    826 FILTER_BILINEAR 4
    827 INIT_XMM ssse3
    828 FILTER_BILINEAR 8
    829 
    830 INIT_MMX mmx
        ; Plain copy of 8-byte-wide rows from src to dst, two rows per iteration
        ; (the loop counter is decremented by 2).
    831 cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height
    832 .nextrow:
    833    movq    mm0, [srcq+srcstrideq*0]
    834    movq    mm1, [srcq+srcstrideq*1]
    835    lea    srcq, [srcq+srcstrideq*2]
    836    movq [dstq+dststrideq*0], mm0
    837    movq [dstq+dststrideq*1], mm1
    838    lea    dstq, [dstq+dststrideq*2]
    839    sub heightd, 2
    840    jg .nextrow
    841    RET
    842 
    843 INIT_XMM sse
        ; Plain copy of 16-byte-wide rows from src to dst, two rows per iteration.
        ; Loads are unaligned (movups) but stores use movaps, so every dst row
        ; must be 16-byte aligned.
    844 cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
    845 .nextrow:
    846    movups xmm0, [srcq+srcstrideq*0]
    847    movups xmm1, [srcq+srcstrideq*1]
    848    lea    srcq, [srcq+srcstrideq*2]
    849    movaps [dstq+dststrideq*0], xmm0
    850    movaps [dstq+dststrideq*1], xmm1
    851    lea    dstq, [dstq+dststrideq*2]
    852    sub heightd, 2
    853    jg .nextrow
    854    RET
    855 
    856 ;-----------------------------------------------------------------------------
    857 ; void ff_vp8_idct_dc_add_<opt>(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
    858 ;-----------------------------------------------------------------------------
    859 
    860 %macro ADD_DC 4
        ; ADD_DC add_reg, sub_reg, offset, mov_insn
        ; Updates four rows in place: dst1, dst1+stride, dst2, dst2+stride, each
        ; at byte offset %3, using move instruction %4 for loads and stores.
        ; Every row gets %1 added and then %2 subtracted, both with unsigned
        ; byte saturation. Clobbers m2-m5.
        ; NOTE(review): no caller is visible in this part of the file; presumably
        ; %1/%2 carry the positive and negative parts of the DC value - confirm.
    861    %4        m2, [dst1q+%3]
    862    %4        m3, [dst1q+strideq+%3]
    863    %4        m4, [dst2q+%3]
    864    %4        m5, [dst2q+strideq+%3]
    865    paddusb   m2, %1
    866    paddusb   m3, %1
    867    paddusb   m4, %1
    868    paddusb   m5, %1
    869    psubusb   m2, %2
    870    psubusb   m3, %2
    871    psubusb   m4, %2
    872    psubusb   m5, %2
    873    %4 [dst1q+%3], m2
    874    %4 [dst1q+strideq+%3], m3
    875    %4 [dst2q+%3], m4
    876    %4 [dst2q+strideq+%3], m5
    877 %endmacro
    878 
    879 %macro VP8_IDCT_DC_ADD 0
        ; Emits vp8_idct_dc_add for the current instruction set: computes
        ; (block[0] + 4) >> 3, adds it to a 4x4 pixel area at dst with unsigned
        ; saturation, and clears the start of block. The SSE4 build stores rows
        ; with pextrd; otherwise rows are shifted out with psrldq + movd.
    880 cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
    881    ; load data
    882    movd       m0, [blockq]                ; 32-bit load; only the low word (DC) is used below
    883    pxor       m1, m1
    884 
    885    ; calculate DC
    886    paddw      m0, [pw_4]
    887    movd [blockq], m1                      ; zero the first 4 bytes of block
    888    DEFINE_ARGS dst1, dst2, stride         ; block pointer no longer needed; its register becomes dst2
    889    lea     dst2q, [dst1q+strideq*2]
    890    movd       m2, [dst1q]
    891    movd       m3, [dst1q+strideq]
    892    movd       m4, [dst2q]
    893    movd       m5, [dst2q+strideq]
    894    psraw      m0, 3
    895    pshuflw    m0, m0, 0                   ; broadcast word 0 to the low four words
    896    punpcklqdq m0, m0                      ; ...and to all eight words
    897    punpckldq  m2, m3                      ; rows 0 and 1 side by side
    898    punpckldq  m4, m5                      ; rows 2 and 3 side by side
    899    punpcklbw  m2, m1                      ; byte->word (zero-extend)
    900    punpcklbw  m4, m1
    901    paddw      m2, m0
    902    paddw      m4, m0
    903    packuswb   m2, m4                      ; clip to bytes; dwords 0..3 = rows 0..3
    904    movd   [dst1q], m2
    905 %if cpuflag(sse4)
    906    pextrd [dst1q+strideq], m2, 1
    907    pextrd [dst2q], m2, 2
    908    pextrd [dst2q+strideq], m2, 3
    909 %else
    910    psrldq     m2, 4
    911    movd [dst1q+strideq], m2
    912    psrldq     m2, 4
    913    movd [dst2q], m2
    914    psrldq     m2, 4
    915    movd [dst2q+strideq], m2
    916 %endif
    917    RET
    918 %endmacro
    919 
    920 INIT_XMM sse2
    921 VP8_IDCT_DC_ADD
    922 INIT_XMM sse4
    923 VP8_IDCT_DC_ADD
    924 
    925 ;-----------------------------------------------------------------------------
    926 ; void ff_vp8_idct_dc_add4y_<opt>(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
    927 ;-----------------------------------------------------------------------------
    928 
    929 INIT_XMM sse2
        ; DC-only inverse transform for four coefficient blocks (A, B, C, D)
        ; stored 32 bytes apart. Each DC is computed as (block[0] + 4) >> 3
        ; (assuming pw_4 holds 4 in every word) and added, with unsigned
        ; saturation, to a 16-byte-wide by 4-row area at dst: block A covers
        ; bytes 0-3 of every row, B bytes 4-7, C bytes 8-11, D bytes 12-15.
        ; The first 32 bits of each block are zeroed.
        ; dst rows are accessed with mova (x86inc's aligned move), so with XMM
        ; registers each row must be 16-byte aligned.
    930 cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride
    931    ; load data
    932    movd      m0, [blockq+32*0] ; A
    933    movd      m1, [blockq+32*2] ; C
    934    punpcklwd m0, [blockq+32*1] ; A B
    935    punpcklwd m1, [blockq+32*3] ; C D
    936    punpckldq m0, m1        ; A B C D
           ; The low four words of m0 now hold the four DCs. The upper four words
           ; hold each block's second coefficient; they are dropped by the
           ; low-half unpacks below and never reach dst.
    937    pxor      m1, m1 ; m1 = 0: zero source for the stores, then base for the negation
    938 
    939    ; calculate DC
    940    paddw     m0, [pw_4] ; rounding bias ahead of the >> 3
    941    movd [blockq+32*0], m1 ; clear the first 32 bits of each of the four blocks
    942    movd [blockq+32*1], m1
    943    movd [blockq+32*2], m1
    944    movd [blockq+32*3], m1
    945    psraw     m0, 3
    946    psubw     m1, m0 ; m1 = -dc
    947    packuswb  m0, m0 ; bytes clamp(dc, 0, 255): non-negative part of each DC
    948    packuswb  m1, m1 ; bytes clamp(-dc, 0, 255): magnitude of each negative DC
    949    punpcklbw m0, m0 ; low 8 bytes: AABBCCDD
    950    punpcklbw m1, m1
    951    punpcklbw m0, m0 ; all 16 bytes: AAAABBBBCCCCDDDD
    952    punpcklbw m1, m1
    953 
    954    ; add DC
    955    DEFINE_ARGS dst1, dst2, stride ; block pointer is dead: its register is renamed dst2
    956    lea    dst2q, [dst1q+strideq*2] ; dst2 = address of row 2
    957    ADD_DC    m0, m1, 0, mova ; rows 0-3: saturating +m0 then -m1
    958    RET
    959 
    960 ;-----------------------------------------------------------------------------
    961 ; void ff_vp8_idct_dc_add4uv_<opt>(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
    962 ;-----------------------------------------------------------------------------
    963 
    964 INIT_MMX mmx
        ; DC-only inverse transform for four coefficient blocks (A, B, C, D)
        ; stored 32 bytes apart, applied to an 8-byte-wide by 8-row area at dst
        ; laid out as a 2x2 grid of 4x4 blocks: A and B side by side on rows
        ; 0-3, C and D side by side on rows 4-7. Each DC is computed as
        ; (block[0] + 4) >> 3 (assuming pw_4 holds 4 in every word) and added
        ; with unsigned saturation. The first 32 bits of each block are zeroed.
        ; Uses MMX registers m0-m7.
    965 cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride
    966    ; load data
    967    movd      m0, [blockq+32*0] ; A
    968    movd      m1, [blockq+32*2] ; C
    969    punpcklwd m0, [blockq+32*1] ; A B
    970    punpcklwd m1, [blockq+32*3] ; C D
    971    punpckldq m0, m1        ; A B C D
    972    pxor      m6, m6 ; m6 = 0: zero source for the stores, then base for the negation
    973 
    974    ; calculate DC
    975    paddw     m0, [pw_4] ; rounding bias ahead of the >> 3
    976    movd [blockq+32*0], m6 ; clear the first 32 bits of each of the four blocks
    977    movd [blockq+32*1], m6
    978    movd [blockq+32*2], m6
    979    movd [blockq+32*3], m6
    980    psraw     m0, 3
    981    psubw     m6, m0 ; m6 = -dc
    982    packuswb  m0, m0 ; bytes clamp(dc, 0, 255): ABCDABCD
    983    packuswb  m6, m6 ; bytes clamp(-dc, 0, 255), same layout
    984    punpcklbw m0, m0 ; AABBCCDD
    985    punpcklbw m6, m6 ; AABBCCDD
    986    movq      m1, m0 ; keep a copy so the C/D half can be expanded separately
    987    movq      m7, m6
    988    punpcklbw m0, m0 ; AAAABBBB
    989    punpckhbw m1, m1 ; CCCCDDDD
    990    punpcklbw m6, m6 ; AAAABBBB
    991    punpckhbw m7, m7 ; CCCCDDDD
    992 
    993    ; add DC
    994    DEFINE_ARGS dst1, dst2, stride ; block pointer is dead: its register is renamed dst2
    995    lea    dst2q, [dst1q+strideq*2] ; dst2 = address of row 2
    996    ADD_DC    m0, m6, 0, mova ; rows 0-3: blocks A and B
    997    lea    dst1q, [dst1q+strideq*4] ; move both row pointers down four rows
    998    lea    dst2q, [dst2q+strideq*4]
    999    ADD_DC    m1, m7, 0, mova ; rows 4-7: blocks C and D
   1000    RET
   1001 
   1002 ;-----------------------------------------------------------------------------
   1003 ; void ff_vp8_idct_add_<opt>(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
   1004 ;-----------------------------------------------------------------------------
   1005 
   1006 ; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
   1007 ;           this macro assumes that m6/m7 have words for 20091/17734 loaded
        ;           (the right-hand sides above refer to the input values of %1/%2)
        ;           mul_20091(x) is computed as x + ((x*20091) >> 16)
        ;           mul_35468(x) is computed as ((2*x)*17734) >> 16; 35468 does not
        ;           fit in a signed 16-bit word, so x is doubled and multiplied by
        ;           35468/2 = 17734 instead
        ;           %3/%4 are temporary registers and are clobbered
   1008 %macro VP8_MULTIPLY_SUMSUB 4
   1009    mova      %3, %1 ; work on copies so the inputs stay available
   1010    mova      %4, %2
   1011    pmulhw    %3, m6 ;20091(1)
   1012    pmulhw    %4, m6 ;20091(2)
   1013    paddw     %3, %1 ; %3 = mul_20091(%1)
   1014    paddw     %4, %2 ; %4 = mul_20091(%2)
   1015    paddw     %1, %1 ; double the inputs ahead of the multiply by 17734
   1016    paddw     %2, %2
   1017    pmulhw    %1, m7 ;35468(1)
   1018    pmulhw    %2, m7 ;35468(2)
   1019    psubw     %1, %4 ; %1 = mul_35468(%1) - mul_20091(%2)
   1020    paddw     %2, %3 ; %2 = mul_35468(%2) + mul_20091(%1)
   1021 %endmacro
   1022 
   1023 ; calculate x0=%1+%3; x1=%1-%3
   1024 ;           x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
   1025 ;           %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
   1026 ;           %5/%6 are temporary registers
   1027 ;           we assume m6/m7 have constant words 20091/17734 loaded in them
        ;           all six arguments are register numbers (the macro forms m%N itself)
        ;           the two SWAPs are assembly-time register renames that emit no
        ;           instructions; they put the results into %1..%4 in the order above
        ;           (the per-line notes below assume x86util's SUMSUB_BA leaves
        ;           a+b in its first register and b-a in its second)
   1028 %macro VP8_IDCT_TRANSFORM4x4_1D 6
   1029    SUMSUB_BA         w, %3,  %1,  %5     ;t0, t1
   1030    VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3
   1031    SUMSUB_BA         w, %4,  %3,  %5     ;tmp0, tmp3
   1032    SUMSUB_BA         w, %2,  %1,  %5     ;tmp1, tmp2
   1033    SWAP                 %4,  %1 ; tmp0 becomes %1; tmp2 is now named %4
   1034    SWAP                 %4,  %3 ; tmp3 becomes %4; tmp2 becomes %3
   1035 %endmacro
   1036 
   1037 INIT_MMX sse
        ; Full 4x4 inverse transform plus add to destination.
        ;   dst    - top-left byte of the 4x4 destination block
        ;   block  - 16 int16 coefficients (32 bytes); read, then zeroed
        ;   stride - byte distance between destination rows
        ; The arithmetic runs in MMX registers m0-m7. The sse flag is needed
        ; for the xorps/movaps pair that clears the block; movaps requires
        ; block to be 16-byte aligned.
   1038 cglobal vp8_idct_add, 3, 3, 0, dst, block, stride
   1039    ; load block data
   1040    movq         m0, [blockq+ 0] ; four int16 coefficients per register
   1041    movq         m1, [blockq+ 8]
   1042    movq         m2, [blockq+16]
   1043    movq         m3, [blockq+24]
   1044    movq         m6, [pw_20091] ; constants expected by VP8_MULTIPLY_SUMSUB
   1045    movq         m7, [pw_17734]
   1046    xorps      xmm0, xmm0 ; xmm0 is a separate register from mm0, so m0 is untouched
   1047    movaps [blockq+ 0], xmm0 ; zero all 32 bytes of the coefficient block
   1048    movaps [blockq+16], xmm0
   1049 
   1050    ; actual IDCT
   1051    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 ; first 1-D pass
   1052    TRANSPOSE4x4W            0, 1, 2, 3, 4 ; transpose so the second pass runs along the other axis
   1053    paddw        m0, [pw_4] ; rounding bias: m0 feeds x0 and x1, so it reaches all four outputs
   1054    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 ; second 1-D pass
   1055    TRANSPOSE4x4W            0, 1, 2, 3, 4
   1056 
   1057    ; store
   1058    pxor         m4, m4 ; zero register handed to STORE_DIFFx2
   1059    DEFINE_ARGS dst1, dst2, stride ; block pointer is dead: its register is renamed dst2
   1060    lea       dst2q, [dst1q+2*strideq] ; dst2 = address of row 2
           ; NOTE(review): STORE_DIFFx2 is defined outside this file (x86util);
           ; presumably it shifts the two residual rows right by 3, adds them to
           ; two dst rows and stores with unsigned saturation, using m6/m7 as
           ; scratch now that the constants are no longer needed - confirm.
   1061    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, dst1q, strideq
   1062    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, dst2q, strideq
   1063 
   1064    RET
   1065 
   1066 ;-----------------------------------------------------------------------------
   1067 ; void ff_vp8_luma_dc_wht_<opt>(int16_t block[4][4][16], int16_t dc[16]);
   1068 ;-----------------------------------------------------------------------------
   1069 
   1070 %macro SCATTER_WHT 3 ; %1, %2 = MMX register numbers, %3 = base block index
           ; Writes the four words of m%1 and m%2 into the first coefficient of
           ; separate 32-byte (2*16) blocks at blockq:
           ;   word k of m%1 -> block 4*k + %3
           ;   word k of m%2 -> block 4*k + %3 + 1
           ; dc1/dc2 are used as scratch integer registers and m%1/m%2 are
           ; shifted, so all four are clobbered.
   1071    movd dc1d, m%1 ; words 0 and 1 of each register
   1072    movd dc2d, m%2
   1073    mov [blockq+2*16*(0+%3)], dc1w ; word 0
   1074    mov [blockq+2*16*(1+%3)], dc2w
   1075    shr  dc1d, 16 ; bring word 1 into the low 16 bits
   1076    shr  dc2d, 16
   1077    psrlq m%1, 32 ; move words 2 and 3 down for the second movd
   1078    psrlq m%2, 32
   1079    mov [blockq+2*16*(4+%3)], dc1w ; word 1
   1080    mov [blockq+2*16*(5+%3)], dc2w
   1081    movd dc1d, m%1 ; words 2 and 3 of each register
   1082    movd dc2d, m%2
   1083    mov [blockq+2*16*(8+%3)], dc1w ; word 2
   1084    mov [blockq+2*16*(9+%3)], dc2w
   1085    shr  dc1d, 16 ; bring word 3 into the low 16 bits
   1086    shr  dc2d, 16
   1087    mov [blockq+2*16*(12+%3)], dc1w ; word 3
   1088    mov [blockq+2*16*(13+%3)], dc2w
   1089 %endmacro
   1090 
   1091 %macro HADAMARD4_1D 4 ; %1-%4 = numbers of the four registers transformed in place
           ; One 1-D pass over four word vectors, built from two butterfly stages.
           ; NOTE(review): SUMSUB_BADC comes from x86util; presumably it performs
           ; two sum/difference butterflies per invocation - confirm there.
   1092    SUMSUB_BADC w, %2, %1, %4, %3 ; stage 1: pairs (%1,%2) and (%3,%4)
   1093    SUMSUB_BADC w, %4, %2, %3, %1 ; stage 2: combines the stage-1 results
   1094    SWAP %1, %4, %3 ; assembly-time rename (no instructions); presumably restores output order
   1095 %endmacro
   1096 
   1097 INIT_MMX sse
        ; Transforms the 16 int16 values in dc with two HADAMARD4_1D passes and
        ; scatters the results into the first coefficient of 16 blocks.
        ;   block - destination; blocks are 32 bytes (16 int16) apart
        ;   dc    - 16 int16 inputs (named dc1 below); read, then zeroed
        ; Only two arguments are passed. dc2 is a third, scratch register, and
        ; SCATTER_WHT reuses dc1 as scratch once the input has been loaded.
        ; Word j of result register mk ends up in block 4*j + k.
        ; The sse flag is needed for the xorps/movaps pair that clears dc;
        ; movaps requires dc to be 16-byte aligned.
   1098 cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2
   1099    movq          m0, [dc1q] ; four int16 inputs per register
   1100    movq          m1, [dc1q+8]
   1101    movq          m2, [dc1q+16]
   1102    movq          m3, [dc1q+24]
   1103    xorps      xmm0, xmm0 ; xmm0 is a separate register from mm0, so m0 is untouched
   1104    movaps [dc1q+ 0], xmm0 ; zero all 32 bytes of dc
   1105    movaps [dc1q+16], xmm0
   1106    HADAMARD4_1D  0, 1, 2, 3 ; first 1-D pass
   1107    TRANSPOSE4x4W 0, 1, 2, 3, 4 ; transpose so the second pass runs along the other axis
   1108    paddw         m0, [pw_3] ; rounding bias ahead of the >> 3, added to m0 only
           ; NOTE(review): the bias is presumably propagated to every output by
           ; the second pass, as in vp8_idct_add - confirm against HADAMARD4_1D.
   1109    HADAMARD4_1D  0, 1, 2, 3 ; second 1-D pass
   1110    psraw         m0, 3 ; scale all four result registers
   1111    psraw         m1, 3
   1112    psraw         m2, 3
   1113    psraw         m3, 3
   1114    SCATTER_WHT   0, 1, 0 ; m0 -> blocks 0,4,8,12; m1 -> blocks 1,5,9,13
   1115    SCATTER_WHT   2, 3, 2 ; m2 -> blocks 2,6,10,14; m3 -> blocks 3,7,11,15
   1116    RET