tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

h264_intrapred.asm (55163B)


      1 ;******************************************************************************
      2 ;* H.264 intra prediction asm optimizations
      3 ;* Copyright (c) 2010 Fiona Glaser
      4 ;* Copyright (c) 2010 Holger Lubitz
      5 ;* Copyright (c) 2010 Loren Merritt
      6 ;* Copyright (c) 2010 Ronald S. Bultje
      7 ;*
      8 ;* This file is part of FFmpeg.
      9 ;*
     10 ;* FFmpeg is free software; you can redistribute it and/or
     11 ;* modify it under the terms of the GNU Lesser General Public
     12 ;* License as published by the Free Software Foundation; either
     13 ;* version 2.1 of the License, or (at your option) any later version.
     14 ;*
     15 ;* FFmpeg is distributed in the hope that it will be useful,
     16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
     17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     18 ;* Lesser General Public License for more details.
     19 ;*
     20 ;* You should have received a copy of the GNU Lesser General Public
     21 ;* License along with FFmpeg; if not, write to the Free Software
     22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     23 ;******************************************************************************
     24 
     25 %include "libavutil/x86/x86util.asm"
     26 
     27 SECTION_RODATA
     28 
      29 tm_shuf: times 8 db 0x03, 0x80  ; pshufb mask: word lanes = src byte 3, high byte zeroed (0x80) — TrueMotion left/topleft broadcast
      30 pw_ff00: times 8 dw 0xff00      ; 0xff00 word mask (no reference in this part of the file)
      31 plane_shuf:  db -8, -7, -6, -5, -4, -3, -2, -1  ; pmaddubsw weights for the 16x16 plane H sum
      32             db  1,  2,  3,  4,  5,  6,  7,  8
      33 plane8_shuf: db -4, -3, -2, -1,  0,  0,  0,  0  ; pmaddubsw weights for the 8x8 plane H sum
      34             db  1,  2,  3,  4,  0,  0,  0,  0
      35 pw_0to7:     dw  0,  1,  2,  3,  4,  5,  6,  7  ; per-column H multipliers for plane prediction
      36 pw_1to8:     dw  1,  2,  3,  4,  5,  6,  7,  8  ; right-half H weights (sse2 plane path)
      37 pw_m8tom1:   dw -8, -7, -6, -5, -4, -3, -2, -1  ; left-half H weights (sse2 plane path)
      38 pw_m4to4:    dw -4, -3, -2, -1,  1,  2,  3,  4  ; 8x8 H weights (sse2 plane path)
     39 
     40 SECTION .text
     41 
     42 cextern pb_1
     43 cextern pb_3
     44 cextern pw_4
     45 cextern pw_8
     46 
     47 ;-----------------------------------------------------------------------------
     48 ; void ff_pred16x16_vertical_8(uint8_t *src, ptrdiff_t stride)
     49 ;-----------------------------------------------------------------------------
     50 
      51 INIT_XMM sse
      52 cglobal pred16x16_vertical_8, 2,3
      53    sub   r0, r1                  ; r0 = row above the block (top edge)
      54    mov   r2, 4                   ; 4 iterations x 4 rows = 16 rows
      55    movaps xmm0, [r0]             ; load the 16 top-edge pixels once
      56 .loop:
      57    movaps [r0+r1*1], xmm0        ; copy the top row into every row below
      58    movaps [r0+r1*2], xmm0
      59    lea   r0, [r0+r1*2]
      60    movaps [r0+r1*1], xmm0
      61    movaps [r0+r1*2], xmm0
      62    lea   r0, [r0+r1*2]
      63    dec   r2
      64    jg .loop
      65    RET
     66 
     67 ;-----------------------------------------------------------------------------
     68 ; void ff_pred16x16_horizontal_8(uint8_t *src, ptrdiff_t stride)
     69 ;-----------------------------------------------------------------------------
     70 
      71 %macro PRED16x16_H 0
      72 cglobal pred16x16_horizontal_8, 2,3
      73    mov       r2, 8               ; 8 iterations x 2 rows = 16 rows
      74 %if cpuflag(ssse3)
      75    mova      m2, [pb_3]          ; pshufb index 3 in every lane -> selects the left-edge byte
      76 %endif
      77 .loop:
      78    movd      m0, [r0+r1*0-4]     ; dword ending at src[-1]; byte 3 is the left-edge pixel
      79    movd      m1, [r0+r1*1-4]
      80 
      81 %if cpuflag(ssse3)
      82    pshufb    m0, m2              ; replicate the left-edge byte across the whole row
      83    pshufb    m1, m2
      84 %else
      85    punpcklbw m0, m0              ; sse2 fallback: double the bytes, then splat word 3
      86    punpcklbw m1, m1
      87    SPLATW    m0, m0, 3
      88    SPLATW    m1, m1, 3
      89 %endif
      90 
      91    mova [r0+r1*0], m0
      92    mova [r0+r1*1], m1
      93    lea       r0, [r0+r1*2]
      94    dec       r2
      95    jg .loop
      96    RET
      97 %endmacro
      98 
      99 INIT_XMM sse2
     100 PRED16x16_H
     101 INIT_XMM ssse3
     102 PRED16x16_H
    103 
    104 ;-----------------------------------------------------------------------------
    105 ; void ff_pred16x16_dc_8(uint8_t *src, ptrdiff_t stride)
    106 ;-----------------------------------------------------------------------------
    107 
     108 %macro PRED16x16_DC 0
     109 cglobal pred16x16_dc_8, 2,7
     110    mov       r4, r0              ; keep src; r0 is reused as a read cursor
     111    sub       r0, r1              ; r0 = row above the block
     112    pxor      mm0, mm0
     113    pxor      mm1, mm1
     114    psadbw    mm0, [r0+0]         ; sum of top pixels 0..7
     115    psadbw    mm1, [r0+8]         ; sum of top pixels 8..15
     116    dec        r0                 ; step left to the left-edge column
     117    movzx     r5d, byte [r0+r1*1] ; first left-edge pixel
     118    paddw     mm0, mm1
     119    movd      r6d, mm0            ; r6d = sum of all 16 top pixels
     120    lea        r0, [r0+r1*2]
     121 %rep 7
     122    movzx     r2d, byte [r0+r1*0] ; accumulate the remaining left pixels, two per step
     123    movzx     r3d, byte [r0+r1*1]
     124    add       r5d, r2d
     125    add       r6d, r3d
     126    lea        r0, [r0+r1*2]
     127 %endrep
     128    movzx     r2d, byte [r0+r1*0] ; last (16th) left pixel
     129    add       r5d, r6d
     130    lea       r2d, [r2+r5+16]     ; total = top sum + left sum + 16
     131    shr       r2d, 5              ; dc = total >> 5
     132 %if cpuflag(ssse3)
     133    pxor       m1, m1             ; zero register used by SPLATB_REG's pshufb path
     134 %endif
     135    SPLATB_REG m0, r2, m1         ; broadcast the dc byte to all 16 lanes
     136 
     137    mov       r3d, 4              ; 4 iterations x 4 rows = 16 rows
     138 .loop:
     139    mova [r4+r1*0], m0
     140    mova [r4+r1*1], m0
     141    lea   r4, [r4+r1*2]
     142    mova [r4+r1*0], m0
     143    mova [r4+r1*1], m0
     144    lea   r4, [r4+r1*2]
     145    dec   r3d
     146    jg .loop
     147    RET
     148 %endmacro
     149 
     150 INIT_XMM sse2
     151 PRED16x16_DC
     152 INIT_XMM ssse3
     153 PRED16x16_DC
    154 
    155 ;-----------------------------------------------------------------------------
    156 ; void ff_pred16x16_tm_vp8_8(uint8_t *src, ptrdiff_t stride)
    157 ;-----------------------------------------------------------------------------
    158 
     159 INIT_XMM sse2
     160 cglobal pred16x16_tm_vp8_8, 2,6,6
     161    sub          r0, r1           ; r0 = row above the block
     162    pxor       xmm2, xmm2
     163    movdqa     xmm0, [r0]         ; the 16 top-edge pixels
     164    movdqa     xmm1, xmm0
     165    punpcklbw  xmm0, xmm2         ; top[0..7] as words
     166    punpckhbw  xmm1, xmm2         ; top[8..15] as words
     167    movzx       r4d, byte [r0-1]  ; topleft pixel
     168    mov         r5d, 8            ; 8 iterations x 2 rows = 16 rows
     169 .loop:
     170    movzx       r2d, byte [r0+r1*1-1]  ; left pixel of the pair's first row
     171    movzx       r3d, byte [r0+r1*2-1]  ; left pixel of the pair's second row
     172    sub         r2d, r4d          ; left - topleft (TrueMotion offset)
     173    sub         r3d, r4d
     174    movd       xmm2, r2d
     175    movd       xmm4, r3d
     176    pshuflw    xmm2, xmm2, 0      ; broadcast the diff to the low 4 words...
     177    pshuflw    xmm4, xmm4, 0
     178    punpcklqdq xmm2, xmm2         ; ...and to the high half
     179    punpcklqdq xmm4, xmm4
     180    movdqa     xmm3, xmm2
     181    movdqa     xmm5, xmm4
     182    paddw      xmm2, xmm0         ; top[0..7]  + (left - topleft)
     183    paddw      xmm3, xmm1         ; top[8..15] + (left - topleft)
     184    paddw      xmm4, xmm0
     185    paddw      xmm5, xmm1
     186    packuswb   xmm2, xmm3         ; saturate to 0..255 and pack to bytes
     187    packuswb   xmm4, xmm5
     188    movdqa [r0+r1*1], xmm2
     189    movdqa [r0+r1*2], xmm4
     190    lea          r0, [r0+r1*2]
     191    dec         r5d
     192    jg .loop
     193    RET
    194 
     195 %if HAVE_AVX2_EXTERNAL
     196 INIT_YMM avx2
     197 cglobal pred16x16_tm_vp8_8, 2, 4, 5, dst, stride, stride3, iteration
     198    sub                       dstq, strideq  ; dst = row above the block
     199    pmovzxbw                    m0, [dstq]   ; top[0..15] as 16 words
     200    vpbroadcastb               xm1, [r0-1]   ; topleft pixel
     201    pmovzxbw                    m1, xm1
     202    psubw                       m0, m1       ; top - topleft, hoisted out of the loop
     203    mov                 iterationd, 4        ; 4 iterations x 4 rows = 16 rows
     204    lea                   stride3q, [strideq*3]
     205 .loop:
     206    vpbroadcastb               xm1, [dstq+strideq*1-1]  ; left pixels of the 4 rows
     207    vpbroadcastb               xm2, [dstq+strideq*2-1]
     208    vpbroadcastb               xm3, [dstq+stride3q-1]
     209    vpbroadcastb               xm4, [dstq+strideq*4-1]
     210    pmovzxbw                    m1, xm1
     211    pmovzxbw                    m2, xm2
     212    pmovzxbw                    m3, xm3
     213    pmovzxbw                    m4, xm4
     214    paddw                       m1, m0       ; left + (top - topleft)
     215    paddw                       m2, m0
     216    paddw                       m3, m0
     217    paddw                       m4, m0
     218    vpackuswb                   m1, m1, m2   ; saturate and pack two rows per ymm
     219    vpackuswb                   m3, m3, m4
     220    vpermq                      m1, m1, q3120  ; undo pack's per-lane interleave
     221    vpermq                      m3, m3, q3120
     222    movdqa        [dstq+strideq*1], xm1
     223    vextracti128  [dstq+strideq*2], m1, 1
     224    movdqa       [dstq+stride3q*1], xm3
     225    vextracti128  [dstq+strideq*4], m3, 1
     226    lea                       dstq, [dstq+strideq*4]
     227    dec                 iterationd
     228    jg .loop
     229    RET
     230 %endif
    231 
    232 ;-----------------------------------------------------------------------------
    233 ; void ff_pred16x16_plane_*_8(uint8_t *src, ptrdiff_t stride)
    234 ;-----------------------------------------------------------------------------
    235 
     236 %macro H264_PRED16x16_PLANE 1
     237 cglobal pred16x16_plane_%1_8, 2,9,7
     238    mov          r2, r1           ; +stride
     239    neg          r1               ; -stride
     240 
     241    movh         m0, [r0+r1  -1]  ; top[-1..6]
     242 %if cpuflag(ssse3)
     243    movhps       m0, [r0+r1  +8]  ; top[8..15]
     244    pmaddubsw    m0, [plane_shuf] ; H coefficients
     245 %else ; sse2
     246    pxor         m2, m2
     247    movh         m1, [r0+r1  +8]
     248    punpcklbw    m0, m2
     249    punpcklbw    m1, m2
     250    pmullw       m0, [pw_m8tom1]  ; weights -8..-1 on the left half
     251    pmullw       m1, [pw_1to8]    ; weights  1..8  on the right half
     252    paddw        m0, m1
     253 %endif
     254    movhlps      m1, m0           ; horizontal reduction of the 8 weighted terms
     255    paddw        m0, m1
     256    PSHUFLW      m1, m0, 0xE
     257    paddw        m0, m1
     258    PSHUFLW      m1, m0, 0x1
     259    paddw        m0, m1           ; sum of H coefficients
     260 
     261    lea          r4, [r0+r2*8-1]  ; r3/r4 walk the left-edge column
     262    lea          r3, [r0+r2*4-1]
     263    add          r4, r2
     264 
     265 %if ARCH_X86_64
     266 %define e_reg r8
     267 %else
     268 %define e_reg r0
     269 %endif
     270 
     271    movzx     e_reg, byte [r3+r2*2   ]  ; V: weighted diffs of mirrored left pixels, built from lea/add chains
     272    movzx        r5, byte [r4+r1     ]
     273    sub          r5, e_reg
     274 
     275    movzx     e_reg, byte [r3+r2     ]
     276    movzx        r6, byte [r4        ]
     277    sub          r6, e_reg
     278    lea          r5, [r5+r6*2]
     279 
     280    movzx     e_reg, byte [r3+r1     ]
     281    movzx        r6, byte [r4+r2*2   ]
     282    sub          r6, e_reg
     283    lea          r5, [r5+r6*4]
     284 
     285    movzx     e_reg, byte [r3        ]
     286 %if ARCH_X86_64
     287    movzx        r7, byte [r4+r2     ]
     288    sub          r7, e_reg
     289 %else
     290    movzx        r6, byte [r4+r2     ]
     291    sub          r6, e_reg
     292    lea          r5, [r5+r6*4]
     293    sub          r5, r6
     294 %endif
     295 
     296    lea       e_reg, [r3+r1*4]    ; advance into the lower half of the left column
     297    lea          r3, [r4+r2*4]
     298 
     299    movzx        r4, byte [e_reg+r2  ]
     300    movzx        r6, byte [r3        ]
     301    sub          r6, r4
     302 %if ARCH_X86_64
     303    lea          r6, [r7+r6*2]
     304    lea          r5, [r5+r6*2]
     305    add          r5, r6
     306 %else
     307    lea          r5, [r5+r6*4]
     308    lea          r5, [r5+r6*2]
     309 %endif
     310 
     311    movzx        r4, byte [e_reg     ]
     312 %if ARCH_X86_64
     313    movzx        r7, byte [r3   +r2  ]
     314    sub          r7, r4
     315    sub          r5, r7
     316 %else
     317    movzx        r6, byte [r3   +r2  ]
     318    sub          r6, r4
     319    lea          r5, [r5+r6*8]
     320    sub          r5, r6
     321 %endif
     322 
     323    movzx        r4, byte [e_reg+r1  ]
     324    movzx        r6, byte [r3   +r2*2]
     325    sub          r6, r4
     326 %if ARCH_X86_64
     327    add          r6, r7
     328 %endif
     329    lea          r5, [r5+r6*8]
     330 
     331    movzx        r4, byte [e_reg+r2*2]
     332    movzx        r6, byte [r3   +r1  ]
     333    sub          r6, r4
     334    lea          r5, [r5+r6*4]
     335    add          r5, r6           ; sum of V coefficients
     336 
     337 %if ARCH_X86_64 == 0
     338    mov          r0, r0m          ; e_reg aliased r0 on x86-32; reload src
     339 %endif
     340 
     341 %ifidn %1, h264
     342    lea          r5, [r5*5+32]    ; V' = (5*V + 32) >> 6
     343    sar          r5, 6
     344 %elifidn %1, rv40
     345    lea          r5, [r5*5]       ; V' = (5*V) >> 6 (no rounding bias)
     346    sar          r5, 6
     347 %elifidn %1, svq3
     348    test         r5, r5           ; svq3: two truncating (round-toward-zero) divides
     349    lea          r6, [r5+3]
     350    cmovs        r5, r6           ; add bias before sar when negative
     351    sar          r5, 2            ; V/4
     352    lea          r5, [r5*5]       ; 5*(V/4)
     353    test         r5, r5
     354    lea          r6, [r5+15]
     355    cmovs        r5, r6
     356    sar          r5, 4            ; (5*(V/4))/16
     357 %endif
     358 
     359    movzx        r4, byte [r0+r1  +15]  ; top[15]
     360    movzx        r3, byte [r3+r2*2   ]  ; left[15]
     361    lea          r3, [r3+r4+1]
     362    shl          r3, 4                  ; 16 * (left[15] + top[15] + 1)
     363 
     364    movd        r1d, m0
     365    movsx       r1d, r1w          ; H sum to scalar (sign-extend the low word)
     366 %ifnidn %1, svq3
     367 %ifidn %1, h264
     368    lea         r1d, [r1d*5+32]
     369 %else ; rv40
     370    lea         r1d, [r1d*5]
     371 %endif
     372    sar         r1d, 6
     373 %else ; svq3
     374    test        r1d, r1d
     375    lea         r4d, [r1d+3]
     376    cmovs       r1d, r4d
     377    sar         r1d, 2           ; H/4
     378    lea         r1d, [r1d*5]     ; 5*(H/4)
     379    test        r1d, r1d
     380    lea         r4d, [r1d+15]
     381    cmovs       r1d, r4d
     382    sar         r1d, 4           ; (5*(H/4))/16
     383 %endif
     384    movd         m0, r1d         ; keep the scaled H for the splat below
     385 
     386    add         r1d, r5d         ; H + V
     387    add         r3d, r1d
     388    shl         r1d, 3
     389    sub         r3d, r1d          ; a = 16*(l+t+1) - 7*(H+V)
     390 
     391    movd         m1, r5d
     392    movd         m3, r3d
     393    SPLATW       m0, m0, 0        ; H
     394    SPLATW       m1, m1, 0        ; V
     395    SPLATW       m3, m3, 0        ; a
     396 %ifidn %1, svq3
     397    SWAP          0, 1            ; svq3 swaps the roles of H and V
     398 %endif
     399    mova         m2, m0
     400    pmullw       m0, [pw_0to7]    ; 0*H, 1*H, ..., 7*H  (words)
     401    psllw        m2, 3            ; 8*H
     402    paddw        m0, m3           ; a + {0,1,2,3,4,5,6,7}*H
     403    paddw        m2, m0           ; a + {8,9,10,11,12,13,14,15}*H
     404 
     405    mov          r4, 8            ; 8 iterations x 2 rows = 16 rows
     406 .loop:
     407    mova         m3, m0           ; b[0..7]
     408    mova         m4, m2           ; b[8..15]
     409    psraw        m3, 5            ; row pixel = clip(b >> 5)
     410    psraw        m4, 5
     411    packuswb     m3, m4
     412    mova       [r0], m3
     413    paddw        m0, m1           ; advance one row: b += V
     414    paddw        m2, m1
     415 
     416    mova         m3, m0           ; b[0..7]
     417    mova         m4, m2           ; b[8..15]
     418    psraw        m3, 5
     419    psraw        m4, 5
     420    packuswb     m3, m4
     421    mova    [r0+r2], m3
     422    paddw        m0, m1
     423    paddw        m2, m1
     424 
     425    lea          r0, [r0+r2*2]
     426    dec          r4
     427    jg .loop
     428    RET
     429 %endmacro
     430 
     431 INIT_XMM sse2
     432 H264_PRED16x16_PLANE h264
     433 H264_PRED16x16_PLANE rv40
     434 H264_PRED16x16_PLANE svq3
     435 INIT_XMM ssse3
     436 H264_PRED16x16_PLANE h264
     437 H264_PRED16x16_PLANE rv40
     438 H264_PRED16x16_PLANE svq3
    439 
    440 ;-----------------------------------------------------------------------------
    441 ; void ff_pred8x8_plane_8(uint8_t *src, ptrdiff_t stride)
    442 ;-----------------------------------------------------------------------------
    443 
     444 %macro H264_PRED8x8_PLANE 0
     445 cglobal pred8x8_plane_8, 2,9,7
     446    mov          r2, r1           ; +stride
     447    neg          r1               ; -stride
     448 
     449    movd         m0, [r0+r1  -1]   ; top[-1..2]
     450 %if cpuflag(ssse3)
     451    movhps       m0, [r0+r1  +4]   ; this reads 4 bytes more than necessary
     452    pmaddubsw    m0, [plane8_shuf] ; H coefficients
     453 %else ; sse2
     454    pxor         m2, m2
     455    movd         m1, [r0+r1  +4]   ; top[4..7]
     456    punpckldq    m0, m1
     457    punpcklbw    m0, m2
     458    pmullw       m0, [pw_m4to4]    ; weights -4..-1, 1..4
     459 %endif
     460    movhlps      m1, m0            ; horizontal reduction of the weighted terms
     461    paddw        m0, m1
     462 
     463 %if notcpuflag(ssse3)
     464    PSHUFLW      m1, m0, 0xE
     465    paddw        m0, m1
     466 %endif ; !ssse3
     467 
     468    PSHUFLW      m1, m0, 0x1
     469    paddw        m0, m1           ; sum of H coefficients
     470 
     471    lea          r4, [r0+r2*4-1]  ; r3/r4 walk the left-edge column
     472    lea          r3, [r0     -1]
     473    add          r4, r2
     474 
     475 %if ARCH_X86_64
     476 %define e_reg r8
     477 %else
     478 %define e_reg r0
     479 %endif
     480 
     481    movzx     e_reg, byte [r3+r2*2   ]  ; V: weighted diffs of mirrored left pixels
     482    movzx        r5, byte [r4+r1     ]
     483    sub          r5, e_reg
     484 
     485    movzx     e_reg, byte [r3        ]
     486 %if ARCH_X86_64
     487    movzx        r7, byte [r4+r2     ]
     488    sub          r7, e_reg
     489    sub          r5, r7
     490 %else
     491    movzx        r6, byte [r4+r2     ]
     492    sub          r6, e_reg
     493    lea          r5, [r5+r6*4]
     494    sub          r5, r6
     495 %endif
     496 
     497    movzx     e_reg, byte [r3+r1     ]
     498    movzx        r6, byte [r4+r2*2   ]
     499    sub          r6, e_reg
     500 %if ARCH_X86_64
     501    add          r6, r7
     502 %endif
     503    lea          r5, [r5+r6*4]
     504 
     505    movzx     e_reg, byte [r3+r2     ]
     506    movzx        r6, byte [r4        ]
     507    sub          r6, e_reg
     508    lea          r6, [r5+r6*2]     ; r6 = raw V sum
     509 
     510    lea          r5, [r6*9+16]     ; V' = (17*V + 16) >> 5, computed as (9*V + 16) + 8*V
     511    lea          r5, [r5+r6*8]
     512    sar          r5, 5
     513 
     514 %if ARCH_X86_64 == 0
     515    mov          r0, r0m          ; e_reg aliased r0 on x86-32; reload src
     516 %endif
     517 
     518    movzx        r3, byte [r4+r2*2  ]  ; left[7]
     519    movzx        r4, byte [r0+r1  +7]  ; top[7]
     520    lea          r3, [r3+r4+1]
     521    shl          r3, 4                 ; 16 * (left[7] + top[7] + 1)
     522    movd        r1d, m0
     523    movsx       r1d, r1w
     524    imul        r1d, 17              ; H' = (17*H + 16) >> 5
     525    add         r1d, 16
     526    sar         r1d, 5
     527    movd         m0, r1d
     528    add         r1d, r5d             ; H' + V'
     529    sub         r3d, r1d
     530    add         r1d, r1d
     531    sub         r3d, r1d          ; a = 16*(l+t+1) - 3*(H'+V')
     532 
     533    movd         m1, r5d
     534    movd         m3, r3d
     535    SPLATW       m0, m0, 0        ; H
     536    SPLATW       m1, m1, 0        ; V
     537    SPLATW       m3, m3, 0        ; a
     538    pmullw       m0, [pw_0to7]    ; 0*H, 1*H, ..., 7*H  (words)
     539    paddw        m0, m3           ; a + {0,1,2,3,4,5,6,7}*H
     540 
     541    mov          r4, 4            ; 4 iterations x 2 rows = 8 rows
     542 ALIGN 16
     543 .loop:
     544    mova         m3, m0           ; b[0..7]
     545    paddw        m0, m1           ; advance one row: b += V
     546    psraw        m3, 5
     547    mova         m4, m0           ; V+b[0..7]
     548    paddw        m0, m1
     549    psraw        m4, 5
     550    packuswb     m3, m4           ; two rows packed into one xmm
     551    movh       [r0], m3
     552    movhps  [r0+r2], m3
     553 
     554    lea          r0, [r0+r2*2]
     555    dec          r4
     556    jg .loop
     557    RET
     558 %endmacro
     559 
     560 INIT_XMM sse2
     561 H264_PRED8x8_PLANE
     562 INIT_XMM ssse3
     563 H264_PRED8x8_PLANE
    564 
    565 ;-----------------------------------------------------------------------------
    566 ; void ff_pred8x8_vertical_8(uint8_t *src, ptrdiff_t stride)
    567 ;-----------------------------------------------------------------------------
    568 
     569 INIT_XMM sse2
     570 cglobal pred8x8_vertical_8, 2,2
     571    sub    r0, r1                 ; r0 = row above the block
     572    movq   m0, [r0]               ; the 8 top-edge pixels
     573 %rep 3
     574    movq [r0+r1*1], m0            ; copy the top row into all 8 rows
     575    movq [r0+r1*2], m0
     576    lea    r0, [r0+r1*2]
     577 %endrep
     578    movq [r0+r1*1], m0
     579    movq [r0+r1*2], m0
     580    RET
    581 
    582 ;-----------------------------------------------------------------------------
    583 ; void ff_pred8x8_horizontal_8(uint8_t *src, ptrdiff_t stride)
    584 ;-----------------------------------------------------------------------------
    585 
     586 %macro PRED8x8_H 0
     587 cglobal pred8x8_horizontal_8, 2,3
     588    mov       r2, 4               ; 4 iterations x 2 rows = 8 rows
     589 %if cpuflag(ssse3)
     590    mova      m2, [pb_3]          ; shuffle mask consumed by SPLATB_LOAD on ssse3
     591 %endif
     592 .loop:
     593    SPLATB_LOAD m0, r0+r1*0-1, m2 ; broadcast each row's left-edge byte across the row
     594    SPLATB_LOAD m1, r0+r1*1-1, m2
     595    mova [r0+r1*0], m0
     596    mova [r0+r1*1], m1
     597    lea       r0, [r0+r1*2]
     598    dec       r2
     599    jg .loop
     600    RET
     601 %endmacro
     602 
     603 INIT_MMX mmxext
     604 PRED8x8_H
     605 INIT_MMX ssse3
     606 PRED8x8_H
    607 
    608 ;-----------------------------------------------------------------------------
    609 ; void ff_pred8x8_top_dc_8_mmxext(uint8_t *src, ptrdiff_t stride)
    610 ;-----------------------------------------------------------------------------
     611 INIT_MMX mmxext
     612 cglobal pred8x8_top_dc_8, 2,5
     613    sub         r0, r1            ; r0 = row above the block
     614    movq       mm0, [r0]          ; the 8 top-edge pixels
     615    pxor       mm1, mm1
     616    pxor       mm2, mm2
     617    lea         r2, [r0+r1*2]     ; row-pair pointers interleaved with the math
     618    punpckhbw  mm1, mm0           ; top[4..7] into the high bytes of mm1's words
     619    punpcklbw  mm0, mm2           ; top[0..3] zero-extended to words
     620    psadbw     mm1, mm2        ; s1
     621    lea         r3, [r2+r1*2]
     622    psadbw     mm0, mm2        ; s0
     623    psrlw      mm1, 1
     624    psrlw      mm0, 1
     625    pavgw      mm1, mm2           ; (s+2)>>2 via shift-then-round: avg((s>>1), 0)
     626    lea         r4, [r3+r1*2]
     627    pavgw      mm0, mm2
     628    pshufw     mm1, mm1, 0
     629    pshufw     mm0, mm0, 0     ; dc0 (w)
     630    packuswb   mm0, mm1        ; dc0,dc1 (b)
     631    movq [r0+r1*1], mm0
     632    movq [r0+r1*2], mm0
     633    lea         r0, [r3+r1*2]
     634    movq [r2+r1*1], mm0
     635    movq [r2+r1*2], mm0
     636    movq [r3+r1*1], mm0
     637    movq [r3+r1*2], mm0
     638    movq [r0+r1*1], mm0
     639    movq [r0+r1*2], mm0
     640    RET
    641 
    642 ;-----------------------------------------------------------------------------
    643 ; void ff_pred8x8_dc_8_mmxext(uint8_t *src, ptrdiff_t stride)
    644 ;-----------------------------------------------------------------------------
    645 
     646 INIT_MMX mmxext
     647 cglobal pred8x8_dc_8, 2,5
     648    sub       r0, r1              ; r0 = row above the block
     649    pxor      m7, m7
     650    movd      m0, [r0+0]          ; top[0..3]
     651    movd      m1, [r0+4]          ; top[4..7]
     652    psadbw    m0, m7            ; s0
     653    mov       r4, r0              ; keep the row-above pointer for the stores
     654    psadbw    m1, m7            ; s1
     655 
     656    movzx    r2d, byte [r0+r1*1-1]  ; s2: sum of the upper 4 left-edge pixels
     657    movzx    r3d, byte [r0+r1*2-1]
     658    lea       r0, [r0+r1*2]
     659    add      r2d, r3d
     660    movzx    r3d, byte [r0+r1*1-1]
     661    add      r2d, r3d
     662    movzx    r3d, byte [r0+r1*2-1]
     663    add      r2d, r3d
     664    lea       r0, [r0+r1*2]
     665    movd      m2, r2d            ; s2
     666    movzx    r2d, byte [r0+r1*1-1]  ; s3: sum of the lower 4 left-edge pixels
     667    movzx    r3d, byte [r0+r1*2-1]
     668    lea       r0, [r0+r1*2]
     669    add      r2d, r3d
     670    movzx    r3d, byte [r0+r1*1-1]
     671    add      r2d, r3d
     672    movzx    r3d, byte [r0+r1*2-1]
     673    add      r2d, r3d
     674    movd      m3, r2d            ; s3
     675 
     676    punpcklwd m0, m1
     677    mov       r0, r4
     678    punpcklwd m2, m3
     679    punpckldq m0, m2            ; s0, s1, s2, s3
     680    pshufw    m3, m0, 11110110b ; s2, s1, s3, s3
     681    lea       r2, [r0+r1*2]
     682    pshufw    m0, m0, 01110100b ; s0, s1, s3, s1
     683    paddw     m0, m3            ; per-quadrant sums for the four 4x4 DC values
     684    lea       r3, [r2+r1*2]
     685    psrlw     m0, 2
     686    pavgw     m0, m7            ; s0+s2, s1, s3, s1+s3
     687    lea       r4, [r3+r1*2]
     688    packuswb  m0, m0
     689    punpcklbw m0, m0            ; expand each DC value to 4 identical bytes
     690    movq      m1, m0
     691    punpcklbw m0, m0            ; m0 = row pattern for the top 4 rows
     692    punpckhbw m1, m1            ; m1 = row pattern for the bottom 4 rows
     693    movq [r0+r1*1], m0
     694    movq [r0+r1*2], m0
     695    movq [r2+r1*1], m0
     696    movq [r2+r1*2], m0
     697    movq [r3+r1*1], m1
     698    movq [r3+r1*2], m1
     699    movq [r4+r1*1], m1
     700    movq [r4+r1*2], m1
     701    RET
    702 
    703 ;-----------------------------------------------------------------------------
    704 ; void ff_pred8x8_dc_rv40_8(uint8_t *src, ptrdiff_t stride)
    705 ;-----------------------------------------------------------------------------
    706 
     707 INIT_MMX mmxext
     708 cglobal pred8x8_dc_rv40_8, 2,7
     709    mov       r4, r0              ; keep src for the store loop
     710    sub       r0, r1              ; r0 = row above the block
     711    pxor      mm0, mm0
     712    psadbw    mm0, [r0]           ; sum of the 8 top pixels
     713    dec        r0                 ; step left to the left-edge column
     714    movzx     r5d, byte [r0+r1*1] ; first left-edge pixel
     715    movd      r6d, mm0
     716    lea        r0, [r0+r1*2]
     717 %rep 3
     718    movzx     r2d, byte [r0+r1*0] ; accumulate the remaining left pixels, two per step
     719    movzx     r3d, byte [r0+r1*1]
     720    add       r5d, r2d
     721    add       r6d, r3d
     722    lea        r0, [r0+r1*2]
     723 %endrep
     724    movzx     r2d, byte [r0+r1*0] ; last (8th) left pixel
     725    add       r5d, r6d
     726    lea       r2d, [r2+r5+8]      ; total = top sum + left sum + 8
     727    shr       r2d, 4              ; dc = total >> 4
     728    movd      mm0, r2d
     729    punpcklbw mm0, mm0
     730    pshufw    mm0, mm0, 0         ; broadcast dc to all 8 bytes
     731    mov       r3d, 4              ; 4 iterations x 2 rows = 8 rows
     732 .loop:
     733    movq [r4+r1*0], mm0
     734    movq [r4+r1*1], mm0
     735    lea   r4, [r4+r1*2]
     736    dec   r3d
     737    jg .loop
     738    RET
    739 
    740 ;-----------------------------------------------------------------------------
    741 ; void ff_pred8x8_tm_vp8_8(uint8_t *src, ptrdiff_t stride)
    742 ;-----------------------------------------------------------------------------
    743 
     744 INIT_XMM sse2
     745 cglobal pred8x8_tm_vp8_8, 2,6,4
     746    sub          r0, r1           ; r0 = row above the block
     747    pxor       xmm1, xmm1
     748    movq       xmm0, [r0]
     749    punpcklbw  xmm0, xmm1         ; top[0..7] as words
     750    movzx       r4d, byte [r0-1]  ; topleft pixel
     751    mov         r5d, 4            ; 4 iterations x 2 rows = 8 rows
     752 .loop:
     753    movzx       r2d, byte [r0+r1*1-1]
     754    movzx       r3d, byte [r0+r1*2-1]
     755    sub         r2d, r4d          ; left - topleft (TrueMotion offset)
     756    sub         r3d, r4d
     757    movd       xmm2, r2d
     758    movd       xmm3, r3d
     759    pshuflw    xmm2, xmm2, 0      ; broadcast the diff to all 8 words
     760    pshuflw    xmm3, xmm3, 0
     761    punpcklqdq xmm2, xmm2
     762    punpcklqdq xmm3, xmm3
     763    paddw      xmm2, xmm0         ; top + (left - topleft)
     764    paddw      xmm3, xmm0
     765    packuswb   xmm2, xmm3         ; saturate; two rows in low/high halves
     766    movq   [r0+r1*1], xmm2
     767    movhps [r0+r1*2], xmm2
     768    lea          r0, [r0+r1*2]
     769    dec         r5d
     770    jg .loop
     771    RET
    772 
     773 INIT_XMM ssse3
     774 cglobal pred8x8_tm_vp8_8, 2,3,6
     775    sub          r0, r1           ; r0 = row above the block
     776    movdqa     xmm4, [tm_shuf]    ; pshufb: byte 3 into each word, high byte zeroed
     777    pxor       xmm1, xmm1
     778    movq       xmm0, [r0]
     779    punpcklbw  xmm0, xmm1         ; top[0..7] as words
     780    movd       xmm5, [r0-4]       ; dword whose byte 3 is the topleft pixel
     781    pshufb     xmm5, xmm4         ; topleft broadcast to 8 words
     782    mov         r2d, 4            ; 4 iterations x 2 rows = 8 rows
     783 .loop:
     784    movd       xmm2, [r0+r1*1-4]  ; dwords whose byte 3 is each row's left pixel
     785    movd       xmm3, [r0+r1*2-4]
     786    pshufb     xmm2, xmm4         ; left pixel broadcast to 8 words
     787    pshufb     xmm3, xmm4
     788    psubw      xmm2, xmm5         ; left - topleft
     789    psubw      xmm3, xmm5
     790    paddw      xmm2, xmm0         ; + top
     791    paddw      xmm3, xmm0
     792    packuswb   xmm2, xmm3         ; saturate; two rows in low/high halves
     793    movq   [r0+r1*1], xmm2
     794    movhps [r0+r1*2], xmm2
     795    lea          r0, [r0+r1*2]
     796    dec         r2d
     797    jg .loop
     798    RET
    799 
     800 ; dest, left, right, src, tmp   (%2 and %3 are clobbered; %5 is scratch)
     801 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
     802 %macro PRED4x4_LOWPASS 5
     803    mova    %5, %2                ; save left: pavgb below destroys %2
     804    pavgb   %2, %3                ; (l + r + 1) >> 1
     805    pxor    %3, %5                ; l ^ r: bit 0 set where the avg rounded up
     806    mova    %1, %4
     807    pand    %3, [pb_1]            ; isolate that rounding carry
     808    psubusb %2, %3                ; (l + r) >> 1, rounded down
     809    pavgb   %1, %2                ; (src + ((l+r)>>1) + 1) >> 1 = (l + 2*src + r + 2) >> 2
     810 %endmacro
    811 
    812 ;-----------------------------------------------------------------------------
    813 ; void ff_pred8x8l_top_dc_8(uint8_t *src, int has_topleft, int has_topright,
    814 ;                           ptrdiff_t stride)
    815 ;-----------------------------------------------------------------------------
     816 %macro PRED8x8L_TOP_DC 0
     817 cglobal pred8x8l_top_dc_8, 4,4
     818    sub          r0, r3           ; r0 = row above the block
     819    pxor        mm7, mm7
     820    movq        mm0, [r0-8]       ; 8 bytes left of the top row
     821    movq        mm3, [r0]         ; top[0..7]
     822    movq        mm1, [r0+8]       ; 8 bytes right of the top row
     823    movq        mm2, mm3
     824    movq        mm4, mm3
     825    PALIGNR     mm2, mm0, 7, mm0  ; mm2 = top[-1..6]
     826    PALIGNR     mm1, mm4, 1, mm4  ; mm1 = top[1..8]
     827    test        r1d, r1d ; top_left
     828    jz .fix_lt_2
     829    test        r2d, r2d ; top_right
     830    jz .fix_tr_1
     831    jmp .body
     832 .fix_lt_2:                       ; no topleft: replace byte 0 of mm2 with top[0]
     833    movq        mm5, mm3
     834    pxor        mm5, mm2          ; xor mask of differing bits...
     835    psllq       mm5, 56
     836    psrlq       mm5, 56           ; ...keep only byte 0 of it...
     837    pxor        mm2, mm5          ; ...flip mm2's byte 0 to mm3's value
     838    test        r2d, r2d ; top_right
     839    jnz .body
     840 .fix_tr_1:                       ; no topright: replace byte 7 of mm1 with top[7]
     841    movq        mm5, mm3
     842    pxor        mm5, mm1
     843    psrlq       mm5, 56
     844    psllq       mm5, 56
     845    pxor        mm1, mm5
     846 .body:
     847    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5  ; lowpass-filtered top row
     848    psadbw   mm7, mm0             ; sum of the 8 filtered pixels
     849    paddw    mm7, [pw_4]
     850    psrlw    mm7, 3               ; dc = (sum + 4) >> 3
     851    pshufw   mm7, mm7, 0
     852    packuswb mm7, mm7             ; broadcast dc to all 8 bytes
     853 %rep 3
     854    movq [r0+r3*1], mm7
     855    movq [r0+r3*2], mm7
     856    lea    r0, [r0+r3*2]
     857 %endrep
     858    movq [r0+r3*1], mm7
     859    movq [r0+r3*2], mm7
     860    RET
     861 %endmacro
     862 
     863 INIT_MMX mmxext
     864 PRED8x8L_TOP_DC
     865 INIT_MMX ssse3
     866 PRED8x8L_TOP_DC
    867 
    868 ;-----------------------------------------------------------------------------
    869 ; void ff_pred8x8l_dc_8(uint8_t *src, int has_topleft, int has_topright,
    870 ;                       ptrdiff_t stride)
    871 ;-----------------------------------------------------------------------------
    872 
    873 %macro PRED8x8L_DC 0
    874 cglobal pred8x8l_dc_8, 4,5
    875    sub          r0, r3
    876    lea          r4, [r0+r3*2]
    877    movq        mm0, [r0+r3*1-8]
    878    punpckhbw   mm0, [r0+r3*0-8]
    879    movq        mm1, [r4+r3*1-8]
    880    punpckhbw   mm1, [r0+r3*2-8]
    881    mov          r4, r0
    882    punpckhwd   mm1, mm0
    883    lea          r0, [r0+r3*4]
    884    movq        mm2, [r0+r3*1-8]
    885    punpckhbw   mm2, [r0+r3*0-8]
    886    lea          r0, [r0+r3*2]
    887    movq        mm3, [r0+r3*1-8]
    888    punpckhbw   mm3, [r0+r3*0-8]
    889    punpckhwd   mm3, mm2
    890    punpckhdq   mm3, mm1
    891    lea          r0, [r0+r3*2]
    892    movq        mm0, [r0+r3*0-8]
    893    movq        mm1, [r4]
    894    mov          r0, r4
    895    movq        mm4, mm3
    896    movq        mm2, mm3
    897    PALIGNR     mm4, mm0, 7, mm0
    898    PALIGNR     mm1, mm2, 1, mm2
    899    test        r1d, r1d
    900    jnz .do_left
    901 .fix_lt_1:
    902    movq        mm5, mm3
    903    pxor        mm5, mm4
    904    psrlq       mm5, 56
    905    psllq       mm5, 48
    906    pxor        mm1, mm5
    907    jmp .do_left
    908 .fix_lt_2:
    909    movq        mm5, mm3
    910    pxor        mm5, mm2
    911    psllq       mm5, 56
    912    psrlq       mm5, 56
    913    pxor        mm2, mm5
    914    test        r2d, r2d
    915    jnz .body
    916 .fix_tr_1:
    917    movq        mm5, mm3
    918    pxor        mm5, mm1
    919    psrlq       mm5, 56
    920    psllq       mm5, 56
    921    pxor        mm1, mm5
    922    jmp .body
    923 .do_left:
    924    movq        mm0, mm4
    925    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    926    movq        mm4, mm0
    927    movq        mm7, mm2
    928    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    929    psllq       mm1, 56
    930    PALIGNR     mm7, mm1, 7, mm3
    931    movq        mm0, [r0-8]
    932    movq        mm3, [r0]
    933    movq        mm1, [r0+8]
    934    movq        mm2, mm3
    935    movq        mm4, mm3
    936    PALIGNR     mm2, mm0, 7, mm0
    937    PALIGNR     mm1, mm4, 1, mm4
    938    test        r1d, r1d
    939    jz .fix_lt_2
    940    test        r2d, r2d
    941    jz .fix_tr_1
    942 .body:
    943    lea          r1, [r0+r3*2]
    944    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
    945    pxor        mm0, mm0
    946    pxor        mm1, mm1
    947    lea          r2, [r1+r3*2]
    948    psadbw      mm0, mm7
    949    psadbw      mm1, mm6
    950    paddw       mm0, [pw_8]
    951    paddw       mm0, mm1
    952    lea          r4, [r2+r3*2]
    953    psrlw       mm0, 4
    954    pshufw      mm0, mm0, 0
    955    packuswb    mm0, mm0
    956    movq [r0+r3*1], mm0
    957    movq [r0+r3*2], mm0
    958    movq [r1+r3*1], mm0
    959    movq [r1+r3*2], mm0
    960    movq [r2+r3*1], mm0
    961    movq [r2+r3*2], mm0
    962    movq [r4+r3*1], mm0
    963    movq [r4+r3*2], mm0
    964    RET
    965 %endmacro
    966 
; Instantiate pred8x8l_dc_8 once per SIMD flavor; helper macros such as
; PALIGNR (x86util.asm) expand differently depending on the active cpuflags.
INIT_MMX mmxext
PRED8x8L_DC
INIT_MMX ssse3
PRED8x8L_DC
    971 
    972 ;-----------------------------------------------------------------------------
    973 ; void ff_pred8x8l_horizontal_8(uint8_t *src, int has_topleft,
    974 ;                               int has_topright, ptrdiff_t stride)
    975 ;-----------------------------------------------------------------------------
    976 
%macro PRED8x8L_HORIZONTAL 0
; 8x8 luma horizontal prediction: each output row is its lowpass-filtered
; left-edge pixel replicated across the row.
; In: r0 = src, r1 = has_topleft, r3 = stride.
;     has_topright (r2) is never tested; r2 is reused as scratch immediately.
; PALIGNR and PRED4x4_LOWPASS are helper macros from x86util.asm;
; PRED4x4_LOWPASS implements the H.264 3-tap (a + 2*b + c + 2) >> 2 filter.
cglobal pred8x8l_horizontal_8, 4,4
   sub          r0, r3
   lea          r2, [r0+r3*2]
   movq        mm0, [r0+r3*1-8]
   ; point r1 at the row above the block when the top-left sample exists
   ; (r1 = r0); otherwise at the first block row (r1 = r0+r3) so the first
   ; left pixel is effectively duplicated
   test        r1d, r1d
   lea          r1, [r0+r3]
   cmovnz       r1, r0
   ; gather the 8 bytes immediately left of the block (one per row): each
   ; movq [...-8] + punpckhbw keeps the row's byte at offset -1, and the
   ; word/dword unpacks merge all eight into mm3
   punpckhbw   mm0, [r1+r3*0-8]
   movq        mm1, [r2+r3*1-8]
   punpckhbw   mm1, [r0+r3*2-8]
   mov          r2, r0
   punpckhwd   mm1, mm0
   lea          r0, [r0+r3*4]
   movq        mm2, [r0+r3*1-8]
   punpckhbw   mm2, [r0+r3*0-8]
   lea          r0, [r0+r3*2]
   movq        mm3, [r0+r3*1-8]
   punpckhbw   mm3, [r0+r3*0-8]
   punpckhwd   mm3, mm2
   punpckhdq   mm3, mm1
   lea          r0, [r0+r3*2]
   movq        mm0, [r0+r3*0-8]
   movq        mm1, [r1+r3*0-8]
   mov          r0, r2
   ; build byte-shifted copies of the left column and lowpass-filter it;
   ; mm7 ends up holding the 8 filtered left-edge pixels
   movq        mm4, mm3
   movq        mm2, mm3
   PALIGNR     mm4, mm0, 7, mm0
   PALIGNR     mm1, mm2, 1, mm2
   movq        mm0, mm4
   PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
   movq        mm4, mm0
   movq        mm7, mm2
   PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
   psllq       mm1, 56
   PALIGNR     mm7, mm1, 7, mm3
   ; broadcast each filtered left pixel across a full 8-byte row
   ; (bytes -> words via self-unpack, then pshufw splats) and store 8 rows
   movq        mm3, mm7
   lea         r1, [r0+r3*2]
   movq       mm7, mm3
   punpckhbw  mm3, mm3
   punpcklbw  mm7, mm7
   pshufw     mm0, mm3, 0xff
   pshufw     mm1, mm3, 0xaa
   lea         r2, [r1+r3*2]
   pshufw     mm2, mm3, 0x55
   pshufw     mm3, mm3, 0x00
   pshufw     mm4, mm7, 0xff
   pshufw     mm5, mm7, 0xaa
   pshufw     mm6, mm7, 0x55
   pshufw     mm7, mm7, 0x00
   movq [r0+r3*1], mm0
   movq [r0+r3*2], mm1
   movq [r1+r3*1], mm2
   movq [r1+r3*2], mm3
   movq [r2+r3*1], mm4
   movq [r2+r3*2], mm5
   lea         r0, [r2+r3*2]
   movq [r0+r3*1], mm6
   movq [r0+r3*2], mm7
   RET
%endmacro
   1038 
; Instantiate for both SIMD flavors (helper macros expand per cpuflag).
INIT_MMX mmxext
PRED8x8L_HORIZONTAL
INIT_MMX ssse3
PRED8x8L_HORIZONTAL
   1043 
   1044 ;-----------------------------------------------------------------------------
   1045 ; void ff_pred8x8l_vertical_8(uint8_t *src, int has_topleft, int has_topright,
   1046 ;                             ptrdiff_t stride)
   1047 ;-----------------------------------------------------------------------------
   1048 
%macro PRED8x8L_VERTICAL 0
; 8x8 luma vertical prediction: the lowpass-filtered top row is written to
; all eight output rows.
; In: r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride.
; PRED4x4_LOWPASS (x86util.asm) is the H.264 (a + 2*b + c + 2) >> 2 filter.
cglobal pred8x8l_vertical_8, 4,4
   sub          r0, r3
   ; mm3 = top row; mm2 = top row shifted so the top-left pixel enters at
   ; byte 0; mm1 = shifted so the first top-right pixel enters at byte 7
   movq        mm0, [r0-8]
   movq        mm3, [r0]
   movq        mm1, [r0+8]
   movq        mm2, mm3
   movq        mm4, mm3
   PALIGNR     mm2, mm0, 7, mm0
   PALIGNR     mm1, mm4, 1, mm4
   test        r1d, r1d ; top_left
   jz .fix_lt_2
   test        r2d, r2d ; top_right
   jz .fix_tr_1
   jmp .body
.fix_lt_2:
   ; top-left unavailable: xor-splice byte 0 of the top row (mm3) into the
   ; right-shifted copy (mm2), replacing the stale top-left byte
   movq        mm5, mm3
   pxor        mm5, mm2
   psllq       mm5, 56
   psrlq       mm5, 56
   pxor        mm2, mm5
   test        r2d, r2d ; top_right
   jnz .body
.fix_tr_1:
   ; top-right unavailable: same xor-splice for byte 7 of the left-shifted
   ; copy (mm1)
   movq        mm5, mm3
   pxor        mm5, mm1
   psrlq       mm5, 56
   psllq       mm5, 56
   pxor        mm1, mm5
.body:
   ; filter once, then store the same 8 bytes into all eight rows
   PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
%rep 3
   movq [r0+r3*1], mm0
   movq [r0+r3*2], mm0
   lea    r0, [r0+r3*2]
%endrep
   movq [r0+r3*1], mm0
   movq [r0+r3*2], mm0
   RET
%endmacro
   1089 
; Instantiate for both SIMD flavors (helper macros expand per cpuflag).
INIT_MMX mmxext
PRED8x8L_VERTICAL
INIT_MMX ssse3
PRED8x8L_VERTICAL
   1094 
   1095 ;-----------------------------------------------------------------------------
   1096 ; void ff_pred8x8l_down_left_8(uint8_t *src, int has_topleft,
   1097 ;                              int has_topright, ptrdiff_t stride)
   1098 ;-----------------------------------------------------------------------------
   1099 
%macro PRED8x8L_DOWN_LEFT 0
; 8x8 luma diagonal down-left prediction from the filtered top and
; top-right edges.
; In: r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride.
; Edge preparation runs in MMX registers; the combined 16-byte edge is then
; filtered and shifted out in XMM registers (see INIT_XMM below, which
; re-targets the helper macros to the xmm register set).
cglobal pred8x8l_down_left_8, 4,4
   sub          r0, r3
   ; mm3 = top row; mm2/mm1 = shifted copies bringing in top-left/top-right
   movq        mm0, [r0-8]
   movq        mm3, [r0]
   movq        mm1, [r0+8]
   movq        mm2, mm3
   movq        mm4, mm3
   PALIGNR     mm2, mm0, 7, mm0
   PALIGNR     mm1, mm4, 1, mm4
   test        r1d, r1d ; top_left
   jz .fix_lt_2
   test        r2d, r2d ; top_right
   jz .fix_tr_1
   jmp .do_top
.fix_lt_2:
   ; top-left unavailable: xor-splice byte 0 of mm3 into mm2
   movq        mm5, mm3
   pxor        mm5, mm2
   psllq       mm5, 56
   psrlq       mm5, 56
   pxor        mm2, mm5
   test        r2d, r2d ; top_right
   jnz .do_top
.fix_tr_1:
   ; top-right unavailable: xor-splice byte 7 of mm3 into mm1
   movq        mm5, mm3
   pxor        mm5, mm1
   psrlq       mm5, 56
   psllq       mm5, 56
   pxor        mm1, mm5
   jmp .do_top
.fix_tr_2:
   ; no top-right pixels at all: replicate the last top pixel as the whole
   ; "filtered top-right" half
   punpckhbw   mm3, mm3
   pshufw      mm1, mm3, 0xFF
   jmp .do_topright
.do_top:
   ; filter the 8 top pixels; result becomes the low half of the edge
   PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
   movq2dq    xmm3, mm4
   test        r2d, r2d ; top_right
   jz .fix_tr_2
   ; filter the 8 top-right pixels (rightmost pixel repeats at the far end)
   movq        mm0, [r0+8]
   movq        mm5, mm0
   movq        mm2, mm0
   movq        mm4, mm0
   psrlq       mm5, 56
   PALIGNR     mm2, mm3, 7, mm3
   PALIGNR     mm5, mm4, 1, mm4
   PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
   ; assemble the 16 filtered edge bytes (top | topright) in xmm3, build its
   ; one-byte-shifted neighbours in xmm1/xmm2, and lowpass once more
   movq2dq    xmm4, mm1
   psrlq       mm1, 56
   movq2dq    xmm5, mm1
   lea         r1, [r0+r3*2]
   pslldq    xmm4, 8
   por       xmm3, xmm4
   movdqa    xmm2, xmm3
   psrldq    xmm2, 1
   pslldq    xmm5, 15
   por       xmm2, xmm5
   lea         r2, [r1+r3*2]
   movdqa    xmm1, xmm3
   pslldq    xmm1, 1
INIT_XMM cpuname
   PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
   ; each successive output row is the filtered edge shifted one more byte
   psrldq    xmm0, 1
   movq [r0+r3*1], xmm0
   psrldq    xmm0, 1
   movq [r0+r3*2], xmm0
   psrldq    xmm0, 1
   lea         r0, [r2+r3*2]
   movq [r1+r3*1], xmm0
   psrldq    xmm0, 1
   movq [r1+r3*2], xmm0
   psrldq    xmm0, 1
   movq [r2+r3*1], xmm0
   psrldq    xmm0, 1
   movq [r2+r3*2], xmm0
   psrldq    xmm0, 1
   movq [r0+r3*1], xmm0
   psrldq    xmm0, 1
   movq [r0+r3*2], xmm0
   RET
%endmacro
   1182 
; sse2 and ssse3 instantiations (both switch to XMM inside the macro).
INIT_MMX sse2
PRED8x8L_DOWN_LEFT
INIT_MMX ssse3
PRED8x8L_DOWN_LEFT
   1187 
   1188 ;-----------------------------------------------------------------------------
   1189 ; void ff_pred8x8l_down_right_8(uint8_t *src, int has_topleft,
   1190 ;                               int has_topright, ptrdiff_t stride)
   1191 ;-----------------------------------------------------------------------------
   1192 
%macro PRED8x8L_DOWN_RIGHT 0
; 8x8 luma diagonal down-right prediction from the filtered left, top-left
; and top edges.
; In: r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride.
; r4 keeps a pointer to the row above the block across the left-edge gather.
; Edge prep runs in MMX; the combined edge is filtered/shifted in XMM
; (INIT_XMM below re-targets the helper macros).
cglobal pred8x8l_down_right_8, 4,5
   sub          r0, r3
   lea          r4, [r0+r3*2]
   ; transpose-gather the 8 bytes immediately left of the block into mm3
   ; (movq [...-8] + punpckhbw keeps each row's byte at offset -1)
   movq        mm0, [r0+r3*1-8]
   punpckhbw   mm0, [r0+r3*0-8]
   movq        mm1, [r4+r3*1-8]
   punpckhbw   mm1, [r0+r3*2-8]
   mov          r4, r0
   punpckhwd   mm1, mm0
   lea          r0, [r0+r3*4]
   movq        mm2, [r0+r3*1-8]
   punpckhbw   mm2, [r0+r3*0-8]
   lea          r0, [r0+r3*2]
   movq        mm3, [r0+r3*1-8]
   punpckhbw   mm3, [r0+r3*0-8]
   punpckhwd   mm3, mm2
   punpckhdq   mm3, mm1
   lea          r0, [r0+r3*2]
   movq        mm0, [r0+r3*0-8]
   movq        mm1, [r4]
   mov          r0, r4
   ; shifted copies of the left column for the lowpass filter
   movq        mm4, mm3
   movq        mm2, mm3
   PALIGNR     mm4, mm0, 7, mm0
   PALIGNR     mm1, mm2, 1, mm2
   test        r1d, r1d
   jz .fix_lt_1
   jmp .do_left
.fix_lt_1:
   ; top-left unavailable: patch the affected byte of the shifted copy mm1
   movq        mm5, mm3
   pxor        mm5, mm4
   psrlq       mm5, 56
   psllq       mm5, 48
   pxor        mm1, mm5
   jmp .do_left
.fix_lt_2:
   ; top-left unavailable (top-row pass): splice byte 0 of mm3 into mm2
   movq        mm5, mm3
   pxor        mm5, mm2
   psllq       mm5, 56
   psrlq       mm5, 56
   pxor        mm2, mm5
   test        r2d, r2d
   jnz .do_top
.fix_tr_1:
   ; top-right unavailable: splice byte 7 of mm3 into mm1
   movq        mm5, mm3
   pxor        mm5, mm1
   psrlq       mm5, 56
   psllq       mm5, 56
   pxor        mm1, mm5
   jmp .do_top
.do_left:
   ; lowpass-filter the left column; stash the filtered results in
   ; xmm3/xmm1 for the later 16-byte edge assembly
   movq        mm0, mm4
   PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
   movq        mm4, mm0
   movq        mm7, mm2
   movq2dq    xmm3, mm2
   PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
   psllq       mm1, 56
   PALIGNR     mm7, mm1, 7, mm3
   movq2dq    xmm1, mm7
   ; now load and prepare the top row (same fixups as above)
   movq        mm0, [r0-8]
   movq        mm3, [r0]
   movq        mm1, [r0+8]
   movq        mm2, mm3
   movq        mm4, mm3
   PALIGNR     mm2, mm0, 7, mm0
   PALIGNR     mm1, mm4, 1, mm4
   test        r1d, r1d
   jz .fix_lt_2
   test        r2d, r2d
   jz .fix_tr_1
.do_top:
   ; filter the top row, merge left+top into one 16-byte edge in xmm3 and
   ; its shifted neighbours in xmm1/xmm2, then lowpass across the edge
   PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
   movq2dq   xmm4, mm4
   lea         r1, [r0+r3*2]
   movdqa    xmm0, xmm3
   pslldq    xmm4, 8
   por       xmm3, xmm4
   lea         r2, [r1+r3*2]
   pslldq    xmm4, 1
   por       xmm1, xmm4
   psrldq    xmm0, 7
   pslldq    xmm0, 15
   psrldq    xmm0, 7
   por       xmm1, xmm0
   lea         r0, [r2+r3*2]
   movdqa    xmm2, xmm3
   psrldq    xmm2, 1
INIT_XMM cpuname
   PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
   ; write the 8 rows two at a time, walking up from the bottom; each pair
   ; uses the edge shifted two more bytes
   movdqa    xmm1, xmm0
   psrldq    xmm1, 1
   movq [r0+r3*2], xmm0
   movq [r0+r3*1], xmm1
   psrldq    xmm0, 2
   psrldq    xmm1, 2
   movq [r2+r3*2], xmm0
   movq [r2+r3*1], xmm1
   psrldq    xmm0, 2
   psrldq    xmm1, 2
   movq [r1+r3*2], xmm0
   movq [r1+r3*1], xmm1
   psrldq    xmm0, 2
   psrldq    xmm1, 2
   movq [r4+r3*2], xmm0
   movq [r4+r3*1], xmm1
   RET
%endmacro
   1302 
; sse2 and ssse3 instantiations (both switch to XMM inside the macro).
INIT_MMX sse2
PRED8x8L_DOWN_RIGHT
INIT_MMX ssse3
PRED8x8L_DOWN_RIGHT
   1307 
   1308 ;-----------------------------------------------------------------------------
   1309 ; void ff_pred8x8l_vertical_right_8(uint8_t *src, int has_topleft,
   1310 ;                                   int has_topright, ptrdiff_t stride)
   1311 ;-----------------------------------------------------------------------------
   1312 
%macro PRED8x8L_VERTICAL_RIGHT 0
; 8x8 luma vertical-right prediction from the filtered left, top-left and
; top edges.
; In: r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride.
; Edge prep runs in MMX; the combined 16-byte edge is processed in XMM
; (INIT_XMM below re-targets the helper macros); 6 xmm regs declared.
cglobal pred8x8l_vertical_right_8, 4,5,6
   sub          r0, r3
   lea          r4, [r0+r3*2]
   ; transpose-gather the 8 bytes immediately left of the block into mm3
   movq        mm0, [r0+r3*1-8]
   punpckhbw   mm0, [r0+r3*0-8]
   movq        mm1, [r4+r3*1-8]
   punpckhbw   mm1, [r0+r3*2-8]
   mov          r4, r0
   punpckhwd   mm1, mm0
   lea          r0, [r0+r3*4]
   movq        mm2, [r0+r3*1-8]
   punpckhbw   mm2, [r0+r3*0-8]
   lea          r0, [r0+r3*2]
   movq        mm3, [r0+r3*1-8]
   punpckhbw   mm3, [r0+r3*0-8]
   punpckhwd   mm3, mm2
   punpckhdq   mm3, mm1
   lea          r0, [r0+r3*2]
   movq        mm0, [r0+r3*0-8]
   movq        mm1, [r4]
   mov          r0, r4
   movq        mm4, mm3
   movq        mm2, mm3
   PALIGNR     mm4, mm0, 7, mm0
   PALIGNR     mm1, mm2, 1, mm2
   test        r1d, r1d
   jnz .do_left
.fix_lt_1:
   ; top-left unavailable: patch the affected byte of the shifted copy mm1
   movq        mm5, mm3
   pxor        mm5, mm4
   psrlq       mm5, 56
   psllq       mm5, 48
   pxor        mm1, mm5
   jmp .do_left
.fix_lt_2:
   ; top-left unavailable (top-row pass): splice byte 0 of mm3 into mm2
   movq        mm5, mm3
   pxor        mm5, mm2
   psllq       mm5, 56
   psrlq       mm5, 56
   pxor        mm2, mm5
   test        r2d, r2d
   jnz .do_top
.fix_tr_1:
   ; top-right unavailable: splice byte 7 of mm3 into mm1
   movq        mm5, mm3
   pxor        mm5, mm1
   psrlq       mm5, 56
   psllq       mm5, 56
   pxor        mm1, mm5
   jmp .do_top
.do_left:
   ; lowpass-filter the left column; keep the result in xmm0 (low half)
   movq        mm0, mm4
   PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
   movq2dq    xmm0, mm2
   ; load and prepare the top row
   movq        mm0, [r0-8]
   movq        mm3, [r0]
   movq        mm1, [r0+8]
   movq        mm2, mm3
   movq        mm4, mm3
   PALIGNR     mm2, mm0, 7, mm0
   PALIGNR     mm1, mm4, 1, mm4
   test        r1d, r1d
   jz .fix_lt_2
   test        r2d, r2d
   jz .fix_tr_1
.do_top:
   ; filter the top row, merge it above the filtered left column into the
   ; 16-byte edge xmm0; xmm2 = pavgb of adjacent edge pixels (even output
   ; rows), the PRED4x4_LOWPASS result supplies the odd output rows
   PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
   lea           r1, [r0+r3*2]
   movq2dq     xmm4, mm6
   pslldq      xmm4, 8
   por         xmm0, xmm4
   movdqa      xmm1, xmm0
   lea           r2, [r1+r3*2]
   movdqa      xmm2, xmm0
   movdqa      xmm3, xmm0
   pslldq      xmm0, 1
   pslldq      xmm1, 2
   pavgb       xmm2, xmm0
INIT_XMM cpuname
   PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5
   ; deinterleave the filtered words with the pw_ff00 mask, then emit the
   ; 8 rows (pairs share a shift amount), walking up from the bottom
   movdqa      xmm0, [pw_ff00]
   pandn       xmm0, xmm4
   movdqa      xmm5, xmm4
   psrlw       xmm4, 8
   packuswb    xmm0, xmm4
   movhlps     xmm4, xmm0
   movhps [r0+r3*2], xmm5
   movhps [r0+r3*1], xmm2
   psrldq      xmm5, 4
   movss       xmm5, xmm0
   psrldq      xmm2, 4
   movss       xmm2, xmm4
   lea           r0, [r2+r3*2]
   psrldq      xmm5, 1
   psrldq      xmm2, 1
   movq        [r0+r3*2], xmm5
   movq        [r0+r3*1], xmm2
   psrldq      xmm5, 1
   psrldq      xmm2, 1
   movq        [r2+r3*2], xmm5
   movq        [r2+r3*1], xmm2
   psrldq      xmm5, 1
   psrldq      xmm2, 1
   movq        [r1+r3*2], xmm5
   movq        [r1+r3*1], xmm2
   RET
%endmacro
   1420 
; sse2 and ssse3 instantiations (both switch to XMM inside the macro).
INIT_MMX sse2
PRED8x8L_VERTICAL_RIGHT
INIT_MMX ssse3
PRED8x8L_VERTICAL_RIGHT
   1425 
   1426 ;-----------------------------------------------------------------------------
   1427 ; void ff_pred8x8l_vertical_left_8(uint8_t *src, int has_topleft,
   1428 ;                                  int has_topright, ptrdiff_t stride)
   1429 ;-----------------------------------------------------------------------------
   1430 
%macro PRED8x8L_VERTICAL_LEFT 0
; 8x8 luma vertical-left prediction from the filtered top and top-right
; edges.
; In: r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride.
; Edge prep runs in MMX; the 16-byte edge is processed in XMM (INIT_XMM
; below re-targets the helper macros).
cglobal pred8x8l_vertical_left_8, 4,4
   sub          r0, r3
   ; mm3 = top row; mm2/mm1 = shifted copies bringing in top-left/top-right
   movq        mm0, [r0-8]
   movq        mm3, [r0]
   movq        mm1, [r0+8]
   movq        mm2, mm3
   movq        mm4, mm3
   PALIGNR     mm2, mm0, 7, mm0
   PALIGNR     mm1, mm4, 1, mm4
   test        r1d, r1d
   jz .fix_lt_2
   test        r2d, r2d
   jz .fix_tr_1
   jmp .do_top
.fix_lt_2:
   ; top-left unavailable: xor-splice byte 0 of mm3 into mm2
   movq        mm5, mm3
   pxor        mm5, mm2
   psllq       mm5, 56
   psrlq       mm5, 56
   pxor        mm2, mm5
   test        r2d, r2d
   jnz .do_top
.fix_tr_1:
   ; top-right unavailable: xor-splice byte 7 of mm3 into mm1
   movq        mm5, mm3
   pxor        mm5, mm1
   psrlq       mm5, 56
   psllq       mm5, 56
   pxor        mm1, mm5
   jmp .do_top
.fix_tr_2:
   ; no top-right pixels at all: replicate the last top pixel
   punpckhbw   mm3, mm3
   pshufw      mm1, mm3, 0xFF
   jmp .do_topright
.do_top:
   ; filter the 8 top pixels; result becomes the low half of the edge
   PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
   movq2dq    xmm4, mm4
   test        r2d, r2d
   jz .fix_tr_2
   ; filter the 8 top-right pixels
   movq        mm0, [r0+8]
   movq        mm5, mm0
   movq        mm2, mm0
   movq        mm4, mm0
   psrlq       mm5, 56
   PALIGNR     mm2, mm3, 7, mm3
   PALIGNR     mm5, mm4, 1, mm4
   PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
   ; xmm4 = 16 filtered edge bytes; xmm3 = pavgb of adjacent pixels (even
   ; rows), the second lowpass produces the odd rows; each row pair shifts
   ; one byte further along the edge
   movq2dq   xmm3, mm1
   lea         r1, [r0+r3*2]
   pslldq    xmm3, 8
   por       xmm4, xmm3
   movdqa    xmm2, xmm4
   movdqa    xmm1, xmm4
   movdqa    xmm3, xmm4
   psrldq    xmm2, 1
   pslldq    xmm1, 1
   pavgb     xmm3, xmm2
   lea         r2, [r1+r3*2]
INIT_XMM cpuname
   PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5
   psrldq    xmm0, 1
   movq [r0+r3*1], xmm3
   movq [r0+r3*2], xmm0
   lea         r0, [r2+r3*2]
   psrldq    xmm3, 1
   psrldq    xmm0, 1
   movq [r1+r3*1], xmm3
   movq [r1+r3*2], xmm0
   psrldq    xmm3, 1
   psrldq    xmm0, 1
   movq [r2+r3*1], xmm3
   movq [r2+r3*2], xmm0
   psrldq    xmm3, 1
   psrldq    xmm0, 1
   movq [r0+r3*1], xmm3
   movq [r0+r3*2], xmm0
   RET
%endmacro
   1510 
; sse2 and ssse3 instantiations (both switch to XMM inside the macro).
INIT_MMX sse2
PRED8x8L_VERTICAL_LEFT
INIT_MMX ssse3
PRED8x8L_VERTICAL_LEFT
   1515 
   1516 ;-----------------------------------------------------------------------------
   1517 ; void ff_pred8x8l_horizontal_up_8(uint8_t *src, int has_topleft,
   1518 ;                                  int has_topright, ptrdiff_t stride)
   1519 ;-----------------------------------------------------------------------------
   1520 
%macro PRED8x8L_HORIZONTAL_UP 0
; 8x8 luma horizontal-up prediction from the filtered left edge.
; In: r0 = src, r1 = has_topleft, r3 = stride.
;     has_topright (r2) is never tested; r2 is reused as scratch.
; Output pixels interleave pavgb pairs and 3-tap lowpass values of the
; left column (bottom pixels are replicated past the edge).
cglobal pred8x8l_horizontal_up_8, 4,4
   sub          r0, r3
   lea          r2, [r0+r3*2]
   movq        mm0, [r0+r3*1-8]
   ; r1 -> row above the block when top-left exists, else first block row
   test        r1d, r1d
   lea          r1, [r0+r3]
   cmovnz       r1, r0
   ; transpose-gather the 8 bytes immediately left of the block into mm3
   punpckhbw   mm0, [r1+r3*0-8]
   movq        mm1, [r2+r3*1-8]
   punpckhbw   mm1, [r0+r3*2-8]
   mov          r2, r0
   punpckhwd   mm1, mm0
   lea          r0, [r0+r3*4]
   movq        mm2, [r0+r3*1-8]
   punpckhbw   mm2, [r0+r3*0-8]
   lea          r0, [r0+r3*2]
   movq        mm3, [r0+r3*1-8]
   punpckhbw   mm3, [r0+r3*0-8]
   punpckhwd   mm3, mm2
   punpckhdq   mm3, mm1
   lea          r0, [r0+r3*2]
   movq        mm0, [r0+r3*0-8]
   movq        mm1, [r1+r3*0-8]
   mov          r0, r2
   ; lowpass-filter the left column; mm7 = 8 filtered left pixels l0..l7
   movq        mm4, mm3
   movq        mm2, mm3
   PALIGNR     mm4, mm0, 7, mm0
   PALIGNR     mm1, mm2, 1, mm2
   movq       mm0, mm4
   PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
   movq       mm4, mm0
   movq       mm7, mm2
   PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
   psllq      mm1, 56
   PALIGNR    mm7, mm1, 7, mm3
   ; byte-reverse the column, build copies shifted by 1 and 2 with l7
   ; repeated past the edge, then average / lowpass-filter them
   lea         r1, [r0+r3*2]
   pshufw     mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
   psllq      mm7, 56             ; l7 .. .. .. .. .. .. ..
   movq       mm2, mm0
   psllw      mm0, 8
   psrlw      mm2, 8
   por        mm2, mm0            ; l7 l6 l5 l4 l3 l2 l1 l0
   movq       mm3, mm2
   movq       mm4, mm2
   movq       mm5, mm2
   psrlq      mm2, 8
   psrlq      mm3, 16
   lea         r2, [r1+r3*2]
   por        mm2, mm7            ; l7 l7 l6 l5 l4 l3 l2 l1
   punpckhbw  mm7, mm7
   por        mm3, mm7            ; l7 l7 l7 l6 l5 l4 l3 l2
   pavgb      mm4, mm2
   PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6
   ; interleave averages (mm4) with lowpass values (mm1) into the pixel
   ; stream p1..p8, then slide a 2-byte window along it for each row
   movq       mm5, mm4
   punpcklbw  mm4, mm1            ; p4 p3 p2 p1
   punpckhbw  mm5, mm1            ; p8 p7 p6 p5
   movq       mm6, mm5
   movq       mm7, mm5
   movq       mm0, mm5
   PALIGNR    mm5, mm4, 2, mm1
   pshufw     mm1, mm6, 11111001b
   PALIGNR    mm6, mm4, 4, mm2
   pshufw     mm2, mm7, 11111110b
   PALIGNR    mm7, mm4, 6, mm3
   pshufw     mm3, mm0, 11111111b
   movq [r0+r3*1], mm4
   movq [r0+r3*2], mm5
   lea         r0, [r2+r3*2]
   movq [r1+r3*1], mm6
   movq [r1+r3*2], mm7
   movq [r2+r3*1], mm0
   movq [r2+r3*2], mm1
   movq [r0+r3*1], mm2
   movq [r0+r3*2], mm3
   RET
%endmacro
   1598 
; Instantiate for both SIMD flavors (helper macros expand per cpuflag).
INIT_MMX mmxext
PRED8x8L_HORIZONTAL_UP
INIT_MMX ssse3
PRED8x8L_HORIZONTAL_UP
   1603 
   1604 ;-----------------------------------------------------------------------------
   1605 ; void ff_pred8x8l_horizontal_down_8(uint8_t *src, int has_topleft,
   1606 ;                                    int has_topright, ptrdiff_t stride)
   1607 ;-----------------------------------------------------------------------------
   1608 
%macro PRED8x8L_HORIZONTAL_DOWN 0
; 8x8 luma horizontal-down prediction from the filtered left, top-left and
; top edges.
; In: r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride.
; r4 keeps a pointer to the row above the block. Edge prep runs in MMX; the
; combined edge is processed in XMM (INIT_XMM below re-targets the helpers).
cglobal pred8x8l_horizontal_down_8, 4,5
   sub          r0, r3
   lea          r4, [r0+r3*2]
   ; transpose-gather the 8 bytes immediately left of the block into mm3
   movq        mm0, [r0+r3*1-8]
   punpckhbw   mm0, [r0+r3*0-8]
   movq        mm1, [r4+r3*1-8]
   punpckhbw   mm1, [r0+r3*2-8]
   mov          r4, r0
   punpckhwd   mm1, mm0
   lea          r0, [r0+r3*4]
   movq        mm2, [r0+r3*1-8]
   punpckhbw   mm2, [r0+r3*0-8]
   lea          r0, [r0+r3*2]
   movq        mm3, [r0+r3*1-8]
   punpckhbw   mm3, [r0+r3*0-8]
   punpckhwd   mm3, mm2
   punpckhdq   mm3, mm1
   lea          r0, [r0+r3*2]
   movq        mm0, [r0+r3*0-8]
   movq        mm1, [r4]
   mov          r0, r4
   movq        mm4, mm3
   movq        mm2, mm3
   PALIGNR     mm4, mm0, 7, mm0
   PALIGNR     mm1, mm2, 1, mm2
   test        r1d, r1d
   jnz .do_left
.fix_lt_1:
   ; top-left unavailable: patch the affected byte of the shifted copy mm1
   movq        mm5, mm3
   pxor        mm5, mm4
   psrlq       mm5, 56
   psllq       mm5, 48
   pxor        mm1, mm5
   jmp .do_left
.fix_lt_2:
   ; top-left unavailable (top-row pass): splice byte 0 of mm3 into mm2
   movq        mm5, mm3
   pxor        mm5, mm2
   psllq       mm5, 56
   psrlq       mm5, 56
   pxor        mm2, mm5
   test        r2d, r2d
   jnz .do_top
.fix_tr_1:
   ; top-right unavailable: splice byte 7 of mm3 into mm1
   movq        mm5, mm3
   pxor        mm5, mm1
   psrlq       mm5, 56
   psllq       mm5, 56
   pxor        mm1, mm5
   jmp .do_top
.fix_tr_2:
   ; no top-right pixels at all: replicate the last top pixel
   punpckhbw   mm3, mm3
   pshufw      mm1, mm3, 0xFF
   jmp .do_topright
.do_left:
   ; lowpass-filter the left column; accumulate the filtered edge in xmm0
   movq        mm0, mm4
   PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
   movq2dq    xmm0, mm2
   pslldq     xmm0, 8
   movq        mm4, mm0
   PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
   movq2dq    xmm2, mm1
   pslldq     xmm2, 15
   psrldq     xmm2, 8
   por        xmm0, xmm2
   ; load and prepare the top row (same fixups as above)
   movq        mm0, [r0-8]
   movq        mm3, [r0]
   movq        mm1, [r0+8]
   movq        mm2, mm3
   movq        mm4, mm3
   PALIGNR     mm2, mm0, 7, mm0
   PALIGNR     mm1, mm4, 1, mm4
   test        r1d, r1d
   jz .fix_lt_2
   test        r2d, r2d
   jz .fix_tr_1
.do_top:
   ; filter the top row (and top-right if present) into xmm1
   PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
   movq2dq    xmm1, mm4
   test        r2d, r2d
   jz .fix_tr_2
   movq        mm0, [r0+8]
   movq        mm5, mm0
   movq        mm2, mm0
   movq        mm4, mm0
   psrlq       mm5, 56
   PALIGNR     mm2, mm3, 7, mm3
   PALIGNR     mm5, mm4, 1, mm4
   PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
   ; combine the filtered halves, build byte-shifted views of the full
   ; edge (PALIGNR by 7/8/9), then average and lowpass them; interleaving
   ; the two results yields the output pixel stream
   movq2dq    xmm5, mm1
   pslldq     xmm5, 8
   por        xmm1, xmm5
INIT_XMM cpuname
   lea         r2, [r4+r3*2]
   movdqa    xmm2, xmm1
   movdqa    xmm3, xmm1
   PALIGNR   xmm1, xmm0, 7, xmm4
   PALIGNR   xmm2, xmm0, 9, xmm5
   lea         r1, [r2+r3*2]
   PALIGNR   xmm3, xmm0, 8, xmm0
   movdqa    xmm4, xmm1
   pavgb     xmm4, xmm3
   lea         r0, [r1+r3*2]
   PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5
   punpcklbw xmm4, xmm0
   movhlps   xmm0, xmm4
   ; store 8 rows two at a time, sliding a 2-byte window, bottom-up
   movq   [r0+r3*2], xmm4
   movq   [r2+r3*2], xmm0
   psrldq xmm4, 2
   psrldq xmm0, 2
   movq   [r0+r3*1], xmm4
   movq   [r2+r3*1], xmm0
   psrldq xmm4, 2
   psrldq xmm0, 2
   movq   [r1+r3*2], xmm4
   movq   [r4+r3*2], xmm0
   psrldq xmm4, 2
   psrldq xmm0, 2
   movq   [r1+r3*1], xmm4
   movq   [r4+r3*1], xmm0
   RET
%endmacro
   1732 
; sse2 and ssse3 instantiations (both switch to XMM inside the macro).
INIT_MMX sse2
PRED8x8L_HORIZONTAL_DOWN
INIT_MMX ssse3
PRED8x8L_HORIZONTAL_DOWN
   1737 
   1738 ;-------------------------------------------------------------------------------
   1739 ; void ff_pred4x4_dc_8_mmxext(uint8_t *src, const uint8_t *topright,
   1740 ;                             ptrdiff_t stride)
   1741 ;-------------------------------------------------------------------------------
   1742 
INIT_MMX mmxext
; 4x4 DC prediction: dc = (sum of 4 top + sum of 4 left pixels + 4) >> 3,
; replicated over the whole block.
; In: r0 = src, r1 = topright pointer (unused; r1 reused as scratch),
;     r2 = stride. r3/r4 are scratch.
cglobal pred4x4_dc_8, 3,5
   pxor   mm7, mm7
   mov     r4, r0
   sub     r0, r2
   ; sum the four top bytes via psadbw against zero
   movd   mm0, [r0]
   psadbw mm0, mm7
   ; accumulate the four left-edge bytes into r3d
   movzx  r1d, byte [r0+r2*1-1]
   movd   r3d, mm0
   add    r3d, r1d
   movzx  r1d, byte [r0+r2*2-1]
   lea     r0, [r0+r2*2]
   add    r3d, r1d
   movzx  r1d, byte [r0+r2*1-1]
   add    r3d, r1d
   movzx  r1d, byte [r0+r2*2-1]
   add    r3d, r1d
   ; dc = (sum + 4) >> 3; splat into all four bytes of a dword and store
   ; the four output rows
   add    r3d, 4
   shr    r3d, 3
   imul   r3d, 0x01010101
   mov   [r4+r2*0], r3d
   mov   [r0+r2*0], r3d
   mov   [r0+r2*1], r3d
   mov   [r0+r2*2], r3d
   RET
   1768 
   1769 ;-----------------------------------------------------------------------------
   1770 ; void ff_pred4x4_tm_vp8_8_mmxext(uint8_t *src, const uint8_t *topright,
   1771 ;                                 ptrdiff_t stride)
   1772 ;-----------------------------------------------------------------------------
   1773 
INIT_MMX mmxext
; VP8 TrueMotion 4x4: pred[y][x] = clip(top[x] + left[y] - topleft),
; the clipping coming from the final packuswb saturation.
; In: r0 = src, r1 = topright pointer (unused as such; r1d is scratch),
;     r2 = stride.
cglobal pred4x4_tm_vp8_8, 3,6
   sub        r0, r2
   pxor      mm7, mm7
   ; mm0 = the four top pixels widened to words; r4d = topleft pixel
   movd      mm0, [r0]
   punpcklbw mm0, mm7
   movzx     r4d, byte [r0-1]
   mov       r5d, 2
.loop:
   ; per iteration: two rows; broadcast (left - topleft) as words, add the
   ; top row, and saturate back to bytes
   movzx     r1d, byte [r0+r2*1-1]
   movzx     r3d, byte [r0+r2*2-1]
   sub       r1d, r4d
   sub       r3d, r4d
   movd      mm2, r1d
   movd      mm4, r3d
   pshufw    mm2, mm2, 0
   pshufw    mm4, mm4, 0
   paddw     mm2, mm0
   paddw     mm4, mm0
   packuswb  mm2, mm2
   packuswb  mm4, mm4
   movd [r0+r2*1], mm2
   movd [r0+r2*2], mm4
   lea        r0, [r0+r2*2]
   dec       r5d
   jg .loop
   RET
   1801 
INIT_XMM ssse3
; VP8 TrueMotion 4x4, SSSE3 version: fully unrolled, no loop.
; tm_shuf (rodata above) = {3, 0x80} x8: pshufb with it broadcasts byte 3
; of a dword load (i.e. the pixel at offset -1) into the low byte of every
; word lane, zeroing the high bytes — a combined broadcast + zero-extend.
; In: r0 = src, r1 = topright pointer (unused; r1 reused as row pointer),
;     r2 = stride. Uses MMX registers with SSSE3 pshufb.
cglobal pred4x4_tm_vp8_8, 3,3
   sub         r0, r2
   movq       mm6, [tm_shuf]
   pxor       mm1, mm1
   ; mm0 = top row as words; mm7 = topleft broadcast as words
   movd       mm0, [r0]
   punpcklbw  mm0, mm1
   movd       mm7, [r0-4]
   pshufb     mm7, mm6
   ; broadcast each row's left pixel, then top - topleft + left per row
   lea         r1, [r0+r2*2]
   movd       mm2, [r0+r2*1-4]
   movd       mm3, [r0+r2*2-4]
   movd       mm4, [r1+r2*1-4]
   movd       mm5, [r1+r2*2-4]
   pshufb     mm2, mm6
   pshufb     mm3, mm6
   pshufb     mm4, mm6
   pshufb     mm5, mm6
   psubw      mm0, mm7
   paddw      mm2, mm0
   paddw      mm3, mm0
   paddw      mm4, mm0
   paddw      mm5, mm0
   ; saturate to bytes and store the four rows
   packuswb   mm2, mm2
   packuswb   mm3, mm3
   packuswb   mm4, mm4
   packuswb   mm5, mm5
   movd [r0+r2*1], mm2
   movd [r0+r2*2], mm3
   movd [r1+r2*1], mm4
   movd [r1+r2*2], mm5
   RET
   1834 
   1835 ;-----------------------------------------------------------------------------
   1836 ; void ff_pred4x4_vertical_vp8_8_mmxext(uint8_t *src, const uint8_t *topright,
   1837 ;                                       ptrdiff_t stride)
   1838 ;-----------------------------------------------------------------------------
   1839 
INIT_MMX mmxext
; VP8 4x4 vertical prediction: lowpass-filter the top row (using topleft
; and the first topright pixel as neighbours) and replicate it to 4 rows.
; In: r0 = src, r1 = topright pointer, r2 = stride.
cglobal pred4x4_vertical_vp8_8, 3,3
   sub       r0, r2
   ; m1 = unaligned load starting at topleft: tl t0 t1 t2
   movd      m1, [r0-1]
   movd      m0, [r0]
   mova      m2, m0   ;t0 t1 t2 t3
   punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
   lea       r1, [r0+r2*2]
   psrlq     m0, 8    ;t1 t2 t3 t4
   ; filtered row = lowpass(tl..t2, t1..t4, t0..t3), stored 4 times
   PRED4x4_LOWPASS m3, m1, m0, m2, m4
   movd [r0+r2*1], m3
   movd [r0+r2*2], m3
   movd [r1+r2*1], m3
   movd [r1+r2*2], m3
   RET
   1855 
   1856 ;-----------------------------------------------------------------------------
   1857 ; void ff_pred4x4_down_left_8_mmxext(uint8_t *src, const uint8_t *topright,
   1858 ;                                    ptrdiff_t stride)
   1859 ;-----------------------------------------------------------------------------
INIT_MMX mmxext
; 4x4 diagonal down-left prediction from the 8 top + top-right pixels.
; In: r0 = src, r1 = topright pointer, r2 = stride.
cglobal pred4x4_down_left_8, 3,3
   sub       r0, r2
   ; m1 = t0..t3 | t4..t7 (top then topright)
   movq      m1, [r0]
   punpckldq m1, [r1]
   movq      m2, m1
   movq      m3, m1
   ; m1 <<= one byte (shifted neighbour); the xor trick builds m2 = pixels
   ; shifted down one byte with the last top-right pixel repeated
   psllq     m1, 8
   pxor      m2, m1
   psrlq     m2, 8
   pxor      m2, m3
   ; 3-tap lowpass across the edge; each row is the result shifted one
   ; more byte (the diagonal)
   PRED4x4_LOWPASS m0, m1, m2, m3, m4
   lea       r1, [r0+r2*2]
   psrlq     m0, 8
   movd      [r0+r2*1], m0
   psrlq     m0, 8
   movd      [r0+r2*2], m0
   psrlq     m0, 8
   movd      [r1+r2*1], m0
   psrlq     m0, 8
   movd      [r1+r2*2], m0
   RET
   1882 
   1883 ;------------------------------------------------------------------------------
   1884 ; void ff_pred4x4_vertical_left_8_mmxext(uint8_t *src, const uint8_t *topright,
   1885 ;                                        ptrdiff_t stride)
   1886 ;------------------------------------------------------------------------------
   1887 
   1888 INIT_MMX mmxext
; H.264 vertical-left prediction: rows 0/2 come from a 2-tap average of the
; top pixels, rows 1/3 from the 3-tap lowpass, shifted one pixel for the
; lower pair.  In: r0 = src, r1 = topright, r2 = stride.
   1889 cglobal pred4x4_vertical_left_8, 3,3
   1890    sub       r0, r2             ; r0 -> row just above the block
   1891    movq      m1, [r0]           ; t0..t3
   1892    punpckldq m1, [r1]           ; t0 t1 t2 t3 t4 t5 t6 t7
   1893    movq      m3, m1
   1894    movq      m2, m1
   1895    psrlq     m3, 8              ; t1..t7 (right-shifted by one pixel)
   1896    psrlq     m2, 16             ; t2..t7 (right-shifted by two)
   1897    movq      m4, m3
   1898    pavgb     m4, m1             ; m4 = avg(t[i], t[i+1]) -> even rows
; odd rows: 3-tap lowpass of t[i], t[i+1], t[i+2] (macro defined earlier)
   1899    PRED4x4_LOWPASS m0, m1, m2, m3, m5
   1900    lea       r1, [r0+r2*2]      ; r1 -> two rows below r0
   1901    movh      [r0+r2*1], m4      ; row 0: averaged pixels
   1902    movh      [r0+r2*2], m0      ; row 1: lowpassed pixels
   1903    psrlq     m4, 8              ; shift one pixel left for lower rows
   1904    psrlq     m0, 8
   1905    movh      [r1+r2*1], m4      ; row 2
   1906    movh      [r1+r2*2], m0      ; row 3
   1907    RET
   1908 
   1909 ;------------------------------------------------------------------------------
   1910 ; void ff_pred4x4_horizontal_up_8_mmxext(uint8_t *src, const uint8_t *topright,
   1911 ;                                        ptrdiff_t stride)
   1912 ;------------------------------------------------------------------------------
   1913 
   1914 INIT_MMX mmxext
; H.264 horizontal-up prediction, built from the four left-column pixels
; l0..l3 only.  In: r0 = src, r1 = topright (unused as data; reused as a
; pointer), r2 = stride.
   1915 cglobal pred4x4_horizontal_up_8, 3,3
   1916    sub       r0, r2             ; r0 -> row just above the block
   1917    lea       r1, [r0+r2*2]      ; r1 -> two rows below r0
; gather the left column: each movd puts lN in byte 3, punpck interleaves,
; so after punpckhwd the high dword of m0 holds l0 l1 l2 l3
   1918    movd      m0, [r0+r2*1-4]
   1919    punpcklbw m0, [r0+r2*2-4]
   1920    movd      m1, [r1+r2*1-4]
   1921    punpcklbw m1, [r1+r2*2-4]
   1922    punpckhwd m0, m1
   1923    movq      m1, m0
   1924    punpckhbw m1, m1
   1925    pshufw    m1, m1, 0xFF       ; m1 = l3 broadcast to all 8 bytes
   1926    punpckhdq m0, m1             ; m0 = l0 l1 l2 l3 l3 l3 l3 l3
   1927    movq      m2, m0
   1928    movq      m3, m0
   1929    movq      m7, m0
   1930    psrlq     m2, 16             ; l2 l3 l3 ... (two ahead)
   1931    psrlq     m3, 8              ; l1 l2 l3 ... (one ahead)
   1932    pavgb     m7, m3             ; 2-tap averages avg(l[i], l[i+1])
; 3-tap lowpass of l[i], l[i+1], l[i+2] (macro defined earlier in this file)
   1933    PRED4x4_LOWPASS m4, m0, m2, m3, m5
   1934    punpcklbw m7, m4             ; interleave avg/lowpass output pairs
   1935    movd    [r0+r2*1], m7        ; row 0
   1936    psrlq    m7, 16              ; advance one avg/lowpass pair
   1937    movd    [r0+r2*2], m7        ; row 1
   1938    psrlq    m7, 16
   1939    movd    [r1+r2*1], m7        ; row 2
   1940    movd    [r1+r2*2], m1        ; row 3: l3 replicated
   1941    RET
   1942 
   1943 ;------------------------------------------------------------------------------
   1944 ; void ff_pred4x4_horizontal_down_8_mmxext(uint8_t *src,
   1945 ;                                          const uint8_t *topright,
   1946 ;                                          ptrdiff_t stride)
   1947 ;------------------------------------------------------------------------------
   1948 
   1949 INIT_MMX mmxext
; H.264 horizontal-down prediction from the top row, top-left corner and the
; left column.  In: r0 = src, r1 = topright (reused as a row pointer),
; r2 = stride.  Byte-order comments below read left = high byte.
   1950 cglobal pred4x4_horizontal_down_8, 3,3
   1951    sub       r0, r2             ; r0 -> row just above the block
   1952    lea       r1, [r0+r2*2]      ; r1 -> two rows below r0
   1953    movh      m0, [r0-4]      ; lt ..
   1954    punpckldq m0, [r0]        ; t3 t2 t1 t0 lt .. .. ..
   1955    psllq     m0, 8           ; t2 t1 t0 lt .. .. .. ..
   1956    movd      m1, [r1+r2*2-4] ; l3
   1957    punpcklbw m1, [r1+r2*1-4] ; l2 l3
   1958    movd      m2, [r0+r2*2-4] ; l1
   1959    punpcklbw m2, [r0+r2*1-4] ; l0 l1
   1960    punpckhwd m1, m2          ; l0 l1 l2 l3
   1961    punpckhdq m1, m0          ; t2 t1 t0 lt l0 l1 l2 l3
   1962    movq      m0, m1
   1963    movq      m2, m1
   1964    movq      m5, m1
   1965    psrlq     m0, 16          ; .. .. t2 t1 t0 lt l0 l1
   1966    psrlq     m2, 8           ; .. t2 t1 t0 lt l0 l1 l2
   1967    pavgb     m5, m2          ; 2-tap averages of adjacent edge pixels
; 3-tap lowpass over the same edge (macro defined earlier in this file)
   1968    PRED4x4_LOWPASS m3, m1, m0, m2, m4
   1969    punpcklbw m5, m3          ; interleave avg/lowpass pairs
   1970    psrlq     m3, 32
   1971    PALIGNR   m3, m5, 6, m4   ; assemble top row from lowpass + pairs
   1972    movh      [r1+r2*2], m5   ; rows are written bottom-up; each row
   1973    psrlq     m5, 16          ; above shifts one avg/lowpass pair
   1974    movh      [r1+r2*1], m5
   1975    psrlq     m5, 16
   1976    movh      [r0+r2*2], m5
   1977    movh      [r0+r2*1], m3
   1978    RET
   1979 
   1980 ;-----------------------------------------------------------------------------
   1981 ; void ff_pred4x4_vertical_right_8_mmxext(uint8_t *src,
   1982 ;                                         const uint8_t *topright,
   1983 ;                                         ptrdiff_t stride)
   1984 ;-----------------------------------------------------------------------------
   1985 
   1986 INIT_MMX mmxext
; H.264 vertical-right prediction: even rows from a 2-tap average of the top
; edge, odd rows from the 3-tap lowpass, with left-column pixels rotated in
; for the lower rows.  In: r0 = src, r1 = topright (reused as a row pointer),
; r2 = stride.
   1987 cglobal pred4x4_vertical_right_8, 3,3
   1988    sub     r0, r2               ; r0 -> row just above the block
   1989    lea     r1, [r0+r2*2]        ; r1 -> two rows below r0
   1990    movh    m0, [r0]                    ; ........t3t2t1t0
   1991    movq    m5, m0
   1992    PALIGNR m0, [r0-8], 7, m1           ; ......t3t2t1t0lt
   1993    pavgb   m5, m0               ; avg(top, top shifted by lt) -> even rows
   1994    PALIGNR m0, [r0+r2*1-8], 7, m1      ; ....t3t2t1t0ltl0
   1995    movq    m1, m0
   1996    PALIGNR m0, [r0+r2*2-8], 7, m2      ; ..t3t2t1t0ltl0l1
   1997    movq    m2, m0
   1998    PALIGNR m0, [r1+r2*1-8], 7, m3      ; t3t2t1t0ltl0l1l2
; 3-tap lowpass along the assembled edge (macro defined earlier in this file)
   1999    PRED4x4_LOWPASS m3, m1, m0, m2, m4
   2000    movq    m1, m3
   2001    psrlq   m3, 16               ; odd-row pixels aligned for rows 1 and 3
   2002    psllq   m1, 48               ; keep low filtered bytes to rotate in
   2003    movh    [r0+r2*1], m5        ; row 0: averages
   2004    movh    [r0+r2*2], m3        ; row 1: lowpass
   2005    PALIGNR m5, m1, 7, m2        ; row 2 = row 0 shifted, filtered l0 in
   2006    psllq   m1, 8
   2007    movh    [r1+r2*1], m5
   2008    PALIGNR m3, m1, 7, m1        ; row 3 = row 1 shifted, filtered pixel in
   2009    movh    [r1+r2*2], m3
   2010    RET
   2011 
   2012 ;-----------------------------------------------------------------------------
   2013 ; void ff_pred4x4_down_right_8_mmxext(uint8_t *src, const uint8_t *topright,
   2014 ;                                     ptrdiff_t stride)
   2015 ;-----------------------------------------------------------------------------
   2016 
   2017 INIT_MMX mmxext
; H.264 down-right (diagonal) prediction from the left column, the top-left
; corner and the top row.  In: r0 = src, r1 = topright (reused as a row
; pointer), r2 = stride.
   2018 cglobal pred4x4_down_right_8, 3,3
   2019    sub       r0, r2             ; r0 -> row just above the block
   2020    lea       r1, [r0+r2*2]      ; r1 -> two rows below r0
; gather left-column pixels and lt into the high bytes via interleaves
   2021    movq      m1, [r1-8]
   2022    movq      m2, [r0+r2*1-8]
   2023    punpckhbw m2, [r0-8]
   2024    movh      m3, [r0]           ; t0..t3
   2025    punpckhwd m1, m2
   2026    PALIGNR   m3, m1, 5, m1      ; splice top row onto the left/corner bytes
   2027    movq      m1, m3
   2028    PALIGNR   m3, [r1+r2*1-8], 7, m4   ; rotate in l2
   2029    movq      m2, m3
   2030    PALIGNR   m3, [r1+r2*2-8], 7, m4   ; rotate in l3
; 3-tap lowpass along the full edge (macro defined earlier in this file)
   2031    PRED4x4_LOWPASS m0, m3, m1, m2, m4
; rows are written bottom-up; each higher row is the filtered edge shifted
; one pixel along the down-right diagonal
   2032    movh      [r1+r2*2], m0
   2033    psrlq     m0, 8
   2034    movh      [r1+r2*1], m0
   2035    psrlq     m0, 8
   2036    movh      [r0+r2*2], m0
   2037    psrlq     m0, 8
   2038    movh      [r0+r2*1], m0
   2039    RET