tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

h264_intrapred_10bit.asm (31458B)


      1 ;*****************************************************************************
      2 ;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code
      3 ;*****************************************************************************
      4 ;* Copyright (C) 2005-2011 x264 project
      5 ;*
      6 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
      7 ;*
      8 ;* This file is part of FFmpeg.
      9 ;*
     10 ;* FFmpeg is free software; you can redistribute it and/or
     11 ;* modify it under the terms of the GNU Lesser General Public
     12 ;* License as published by the Free Software Foundation; either
     13 ;* version 2.1 of the License, or (at your option) any later version.
     14 ;*
     15 ;* FFmpeg is distributed in the hope that it will be useful,
     16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
     17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     18 ;* Lesser General Public License for more details.
     19 ;*
     20 ;* You should have received a copy of the GNU Lesser General Public
     21 ;* License along with FFmpeg; if not, write to the Free Software
     22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     23 ;******************************************************************************
     24 
      25 %include "libavutil/x86/x86util.asm"
      26 
      27 SECTION_RODATA
      28 
      29 cextern pw_1023
      30 %define pw_pixel_max pw_1023 ; 2^10-1: clamp ceiling for 10-bit pixels (used by CLIPW in pred8x8_plane)
      31 cextern pw_512
      32 cextern pw_16
      33 cextern pw_8
      34 cextern pw_4
      35 cextern pw_2
      36 cextern pw_1
      37 cextern pd_16
      38 
         ; local constants, all consumed by pred8x8_plane_10:
         ; pw_m32101234: per-column weights (x-3) for the H gradient / b term
         ; pw_m3:        multiplies c to seed the leftmost row offset (y-3 start)
         ; pd_17:        scale factor in b = (17*H + 16) >> 5 and c = (17*V + 16) >> 5
      39 pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4
      40 pw_m3:        times 8 dw -3
      41 pd_17:        times 4 dd 17
      42 
      43 SECTION .text
     44 
      45 ; dest, left, right, src
      46 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
         ; Implemented as pavgw(src, (left+right)>>1): the truncation of the
         ; psrlw is exactly compensated by pavgw's +1 rounding, so the result
         ; equals the H.264 3-tap lowpass for all word inputs.
         ; Clobbers %2; %3 and %4 are only read.
      47 %macro PRED4x4_LOWPASS 4
      48    paddw       %2, %3
      49    psrlw       %2, 1
      50    pavgw       %1, %4, %2
      51 %endmacro
     52 
      53 ;-----------------------------------------------------------------------------
      54 ; void ff_pred4x4_down_right_10(pixel *src, const pixel *topright,
      55 ;                               ptrdiff_t stride)
      56 ;-----------------------------------------------------------------------------
      57 %macro PRED4x4_DR 0
      58 cglobal pred4x4_down_right_10, 3, 3
         ; r0 = src, r2 = stride in bytes (pixels are 2 bytes).  The topright
         ; argument in r1 is never read; r1 is reused as a second row pointer.
         ; Gathers l3..l0, lt, t0..t3 into one xmm via the -8 loads (each puts a
         ; left-neighbor pixel in the high word), lowpass-filters once, then
         ; stores the diagonal shifted right by one pixel per row, bottom first.
      59    sub       r0, r2
      60    lea       r1, [r0+r2*2]
      61    movhps    m1, [r1-8]
      62    movhps    m2, [r0+r2*1-8]
      63    movhps    m4, [r0-8]
      64    punpckhwd m2, m4
      65    movq      m3, [r0]
      66    punpckhdq m1, m2
      67    PALIGNR   m3, m1, 10, m1
      68    movhps    m4, [r1+r2*1-8]
      69    PALIGNR   m0, m3, m4, 14, m4
      70    movhps    m4, [r1+r2*2-8]
      71    PALIGNR   m2, m0, m4, 14, m4
      72    PRED4x4_LOWPASS m0, m2, m3, m0
      73    movq      [r1+r2*2], m0
      74    psrldq    m0, 2
      75    movq      [r1+r2*1], m0
      76    psrldq    m0, 2
      77    movq      [r0+r2*2], m0
      78    psrldq    m0, 2
      79    movq      [r0+r2*1], m0
      80    RET
      81 %endmacro
      82 
      83 INIT_XMM sse2
      84 PRED4x4_DR
      85 INIT_XMM ssse3
      86 PRED4x4_DR
      87 %if HAVE_AVX_EXTERNAL
      88 INIT_XMM avx
      89 PRED4x4_DR
      90 %endif
     91 
      92 ;------------------------------------------------------------------------------
      93 ; void ff_pred4x4_vertical_right_10(pixel *src, const pixel *topright,
      94 ;                                   ptrdiff_t stride)
      95 ;------------------------------------------------------------------------------
      96 %macro PRED4x4_VR 0
cglobal pred4x4_vertical_right_10, 3, 3, 6
         ; r0 = src, r2 = stride in bytes; r1 (topright) is reused as a row
         ; pointer.  m5 = pavgw of top row with its left-shifted copy (the
         ; "vertical" rows); m1 = lowpass of lt/left column (the "right" rows).
      98    sub     r0, r2
      99    lea     r1, [r0+r2*2]
     100    movq    m5, [r0]            ; ........t3t2t1t0
     101    movhps  m1, [r0-8]
     102    PALIGNR m0, m5, m1, 14, m1  ; ......t3t2t1t0lt
     103    pavgw   m5, m0
     104    movhps  m1, [r0+r2*1-8]
     105    PALIGNR m0, m1, 14, m1      ; ....t3t2t1t0ltl0
     106    movhps  m2, [r0+r2*2-8]
     107    PALIGNR m1, m0, m2, 14, m2  ; ..t3t2t1t0ltl0l1
     108    movhps  m3, [r1+r2*1-8]
     109    PALIGNR m2, m1, m3, 14, m3  ; t3t2t1t0ltl0l1l2
     110    PRED4x4_LOWPASS m1, m0, m2, m1
     111    pslldq  m0, m1, 12
     112    psrldq  m1, 4
     113    movq    [r0+r2*1], m5
     114    movq    [r0+r2*2], m1
     115    PALIGNR m5, m0, 14, m2
     116    pslldq  m0, 2
     117    movq    [r1+r2*1], m5
     118    PALIGNR m1, m0, 14, m0
     119    movq    [r1+r2*2], m1
     120    RET
     121 %endmacro
     122 
     123 INIT_XMM sse2
     124 PRED4x4_VR
     125 INIT_XMM ssse3
     126 PRED4x4_VR
     127 %if HAVE_AVX_EXTERNAL
     128 INIT_XMM avx
     129 PRED4x4_VR
     130 %endif
    131 
     132 ;-------------------------------------------------------------------------------
     133 ; void ff_pred4x4_horizontal_down_10(pixel *src, const pixel *topright,
     134 ;                                    ptrdiff_t stride)
     135 ;-------------------------------------------------------------------------------
     136 %macro PRED4x4_HD 0
     137 cglobal pred4x4_horizontal_down_10, 3, 3
         ; r0 = src, r2 = stride in bytes; r1 (topright) is reused as a row
         ; pointer.  Builds the edge vector t2..t0,lt,l0..l3, then interleaves
         ; pavgw pairs (m5) with lowpass results (m3) to form the HD diagonals.
     138    sub        r0, r2
     139    lea        r1, [r0+r2*2]
     140    movq       m0, [r0-8]      ; lt ..
     141    movhps     m0, [r0]
     142    pslldq     m0, 2           ; t2 t1 t0 lt .. .. .. ..
     143    movq       m1, [r1+r2*2-8] ; l3
     144    movq       m3, [r1+r2*1-8]
     145    punpcklwd  m1, m3          ; l2 l3
     146    movq       m2, [r0+r2*2-8] ; l1
     147    movq       m3, [r0+r2*1-8]
     148    punpcklwd  m2, m3          ; l0 l1
     149    punpckhdq  m1, m2          ; l0 l1 l2 l3
     150    punpckhqdq m1, m0          ; t2 t1 t0 lt l0 l1 l2 l3
     151    psrldq     m0, m1, 4       ; .. .. t2 t1 t0 lt l0 l1
     152    psrldq     m3, m1, 2       ; .. t2 t1 t0 lt l0 l1 l2
     153    pavgw      m5, m1, m3
     154    PRED4x4_LOWPASS m3, m1, m0, m3
     155    punpcklwd  m5, m3
     156    psrldq     m3, 8
     157    PALIGNR    m3, m5, 12, m4
     158    movq       [r1+r2*2], m5
     159    movhps     [r0+r2*2], m5
     160    psrldq     m5, 4
     161    movq       [r1+r2*1], m5
     162    movq       [r0+r2*1], m3
     163    RET
     164 %endmacro
     165 
     166 INIT_XMM sse2
     167 PRED4x4_HD
     168 INIT_XMM ssse3
     169 PRED4x4_HD
     170 %if HAVE_AVX_EXTERNAL
     171 INIT_XMM avx
     172 PRED4x4_HD
     173 %endif
    174 
     175 ;-----------------------------------------------------------------------------
     176 ; void ff_pred4x4_dc_10(pixel *src, const pixel *topright, ptrdiff_t stride)
     177 ;-----------------------------------------------------------------------------
     178 
     179 INIT_MMX mmxext
     180 cglobal pred4x4_dc_10, 3, 3
         ; dc = (l0+l1+l2+l3 + t0+t1+t2+t3 + 4) >> 3, broadcast to all 16 pixels.
         ; Each [rX-8] movq puts a left-neighbor pixel in its high word; the
         ; element-wise paddw chain accumulates them there.
     181    sub    r0, r2
     182    lea    r1, [r0+r2*2]
     183    movq   m2, [r0+r2*1-8]
     184    paddw  m2, [r0+r2*2-8]
     185    paddw  m2, [r1+r2*1-8]
     186    paddw  m2, [r1+r2*2-8]
     187    psrlq  m2, 48              ; isolate the high word = sum of the 4 left pixels
     188    movq   m0, [r0]
     189    HADDW  m0, m1              ; sum of the 4 top pixels
     190    paddw  m0, [pw_4]
     191    paddw  m0, m2
     192    psrlw  m0, 3
     193    SPLATW m0, m0, 0
     194    movq   [r0+r2*1], m0
     195    movq   [r0+r2*2], m0
     196    movq   [r1+r2*1], m0
     197    movq   [r1+r2*2], m0
     198    RET
    199 
     200 ;-----------------------------------------------------------------------------
     201 ; void ff_pred4x4_down_left_10(pixel *src, const pixel *topright,
     202 ;                              ptrdiff_t stride)
     203 ;-----------------------------------------------------------------------------
     204 %macro PRED4x4_DL 0
     205 cglobal pred4x4_down_left_10, 3, 3
         ; m0 = t0..t3 (top) : t4..t7 (topright, loaded via r1 before it is
         ; repurposed).  pshufhw 10100100b duplicates t7 so the rightmost tap
         ; of the lowpass filter replicates the edge pixel.
     206    sub        r0, r2
     207    movq       m0, [r0]
     208    movhps     m0, [r1]
     209    psrldq     m2, m0, 2
     210    pslldq     m3, m0, 2
     211    pshufhw    m2, m2, 10100100b
     212    PRED4x4_LOWPASS m0, m3, m2, m0
     213    lea        r1, [r0+r2*2]
     214    movhps     [r1+r2*2], m0
     215    psrldq     m0, 2
     216    movq       [r0+r2*1], m0
     217    psrldq     m0, 2
     218    movq       [r0+r2*2], m0
     219    psrldq     m0, 2
     220    movq       [r1+r2*1], m0
     221    RET
     222 %endmacro
     223 
     224 INIT_XMM sse2
     225 PRED4x4_DL
     226 %if HAVE_AVX_EXTERNAL
     227 INIT_XMM avx
     228 PRED4x4_DL
     229 %endif
    230 
     231 ;-----------------------------------------------------------------------------
     232 ; void ff_pred4x4_vertical_left_10(pixel *src, const pixel *topright,
     233 ;                                  ptrdiff_t stride)
     234 ;-----------------------------------------------------------------------------
     235 %macro PRED4x4_VL 0
     236 cglobal pred4x4_vertical_left_10, 3, 3
         ; m1 = t0..t7 (top + topright via r1).  Even rows come from the pavgw
         ; pair average (m4), odd rows from the 3-tap lowpass (m0); the second
         ; row pair is the same data advanced one pixel (psrldq 2).
     237    sub        r0, r2
     238    movu       m1, [r0]
     239    movhps     m1, [r1]
     240    psrldq     m0, m1, 2
     241    psrldq     m2, m1, 4
     242    pavgw      m4, m0, m1
     243    PRED4x4_LOWPASS m0, m1, m2, m0
     244    lea        r1, [r0+r2*2]
     245    movq       [r0+r2*1], m4
     246    movq       [r0+r2*2], m0
     247    psrldq     m4, 2
     248    psrldq     m0, 2
     249    movq       [r1+r2*1], m4
     250    movq       [r1+r2*2], m0
     251    RET
     252 %endmacro
     253 
     254 INIT_XMM sse2
     255 PRED4x4_VL
     256 %if HAVE_AVX_EXTERNAL
     257 INIT_XMM avx
     258 PRED4x4_VL
     259 %endif
    260 
     261 ;-----------------------------------------------------------------------------
     262 ; void ff_pred4x4_horizontal_up_10(pixel *src, const pixel *topright,
     263 ;                                  ptrdiff_t stride)
     264 ;-----------------------------------------------------------------------------
     265 INIT_MMX mmxext
     266 cglobal pred4x4_horizontal_up_10, 3, 3
         ; MMX version: m0 = l0 l1 l2 l3 gathered from the high words of the
         ; -8 loads.  The last row and the tail of row 3 are pure l3
         ; replication (pshufw 0xFF); the rest interleaves pair averages (m2)
         ; with lowpass values (m1).
     267    sub       r0, r2
     268    lea       r1, [r0+r2*2]
     269    movq      m0, [r0+r2*1-8]
     270    punpckhwd m0, [r0+r2*2-8]
     271    movq      m1, [r1+r2*1-8]
     272    punpckhwd m1, [r1+r2*2-8]
     273    punpckhdq m0, m1
     274    pshufw    m1, m1, 0xFF
     275    movq      [r1+r2*2], m1
     276    movd      [r1+r2*1+4], m1
     277    pshufw    m2, m0, 11111001b
     278    movq      m1, m2
     279    pavgw     m2, m0
     280 
     281    pshufw    m5, m0, 11111110b
     282    PRED4x4_LOWPASS m1, m0, m5, m1
     283    movq      m6, m2
     284    punpcklwd m6, m1
     285    movq      [r0+r2*1], m6
     286    psrlq     m2, 16
     287    psrlq     m1, 16
     288    punpcklwd m2, m1
     289    movq      [r0+r2*2], m2
     290    psrlq     m2, 32
     291    movd      [r1+r2*1], m2
     292    RET
    293 
    294 
    295 
     296 ;-----------------------------------------------------------------------------
     297 ; void ff_pred8x8_vertical_10(pixel *src, ptrdiff_t stride)
     298 ;-----------------------------------------------------------------------------
     299 INIT_XMM sse2
     300 cglobal pred8x8_vertical_10, 2, 2
         ; Copy the 8-pixel top row (one 16-byte xmm load) into all 8 rows.
     301    sub  r0, r1
     302    mova m0, [r0]
     303 %rep 3
     304    mova [r0+r1*1], m0
     305    mova [r0+r1*2], m0
     306    lea  r0, [r0+r1*2]
     307 %endrep
     308    mova [r0+r1*1], m0
     309    mova [r0+r1*2], m0
     310    RET
    311 
     312 ;-----------------------------------------------------------------------------
     313 ; void ff_pred8x8_horizontal_10(pixel *src, ptrdiff_t stride)
     314 ;-----------------------------------------------------------------------------
     315 INIT_XMM sse2
     316 cglobal pred8x8_horizontal_10, 2, 3
         ; Two rows per iteration: the -8 load puts src[-1] in word 3, which
         ; pshuflw 0xff + punpcklqdq broadcasts across the whole row.
     317    mov         r2d, 4
     318 .loop:
     319    movq         m0, [r0+r1*0-8]
     320    movq         m1, [r0+r1*1-8]
     321    pshuflw      m0, m0, 0xff
     322    pshuflw      m1, m1, 0xff
     323    punpcklqdq   m0, m0
     324    punpcklqdq   m1, m1
     325    mova  [r0+r1*0], m0
     326    mova  [r0+r1*1], m1
     327    lea          r0, [r0+r1*2]
     328    dec          r2d
     329    jg .loop
     330    RET
    331 
     332 ;-----------------------------------------------------------------------------
     333 ; void ff_predict_8x8_dc_10(pixel *src, ptrdiff_t stride)
     334 ;-----------------------------------------------------------------------------
     335 %macro MOV8 2-3
     336 ; sort of a hack, but it works
         ; Stores one 8-pixel (16-byte) row.  In this xmm build a single movdqa
         ; suffices; the optional third register argument is ignored here
         ; (presumably kept so call sites match a two-register variant).
     337    movdqa    [%1], %2
     338 %endmacro
    339 
     340 %macro PRED8x8_DC 1
     341 cglobal pred8x8_dc_10, 2, 6
         ; %1 = pshuflw.  Quadrant DC prediction:
         ;   s0/s1 = sums of left/right top 4-pixel groups,
         ;   s2/s3 = sums of upper/lower left 4-pixel columns.
         ; Lanes are arranged as s0+s2, 2*s1, 2*s3, s1+s3 so that a single
         ; rounding (psrlw 2 then pavgw with 0 == (x+4)>>3) serves both the
         ; 8-sample corner DCs and the doubled 4-sample edge DCs.
     342    sub         r0, r1
     343    pxor        m4, m4            ; zero, for the pavgw rounding and as dummy MOV8 arg
     344    movq        m0, [r0+0]
     345    movq        m1, [r0+8]
     346    punpcklwd   m0, m1
     347    movhlps     m1, m0
     348    paddw       m0, m1
     349    %1          m2, m0, 00001110b
     350    paddw       m0, m2
     351 
     352    lea         r5, [r1*3]
     353    lea         r4, [r0+r1*4]
     354    movzx      r2d, word [r0+r1*1-2]
     355    movzx      r3d, word [r0+r1*2-2]
     356    add        r2d, r3d
     357    movzx      r3d, word [r0+r5*1-2]
     358    add        r2d, r3d
     359    movzx      r3d, word [r4-2]
     360    add        r2d, r3d
     361    movd        m2, r2d            ; s2
     362 
     363    movzx      r2d, word [r4+r1*1-2]
     364    movzx      r3d, word [r4+r1*2-2]
     365    add        r2d, r3d
     366    movzx      r3d, word [r4+r5*1-2]
     367    add        r2d, r3d
     368    movzx      r3d, word [r4+r1*4-2]
     369    add        r2d, r3d
     370    movd        m3, r2d            ; s3
     371 
     372    punpcklwd   m2, m3
     373    punpckldq   m0, m2            ; s0, s1, s2, s3
     374    %1          m3, m0, 11110110b ; s2, s1, s3, s3
     375    %1          m0, m0, 01110100b ; s0, s1, s3, s1
     376    paddw       m0, m3
     377    psrlw       m0, 2
     378    pavgw       m0, m4            ; s0+s2, s1, s3, s1+s3
     379    punpcklwd   m0, m0
     380    pshufd      m3, m0, 11111010b
     381    punpckldq   m0, m0
     382    SWAP         0,1
     383    MOV8   r0+r1*1, m1, m2
     384    MOV8   r0+r1*2, m1, m2
     385    MOV8   r0+r5*1, m1, m2
     386    MOV8   r0+r1*4, m1, m2
     387    MOV8   r4+r1*1, m3, m4
     388    MOV8   r4+r1*2, m3, m4
     389    MOV8   r4+r5*1, m3, m4
     390    MOV8   r4+r1*4, m3, m4
     391    RET
     392 %endmacro
     393 
     394 INIT_XMM sse2
     395 PRED8x8_DC pshuflw
    396 
     397 ;-----------------------------------------------------------------------------
     398 ; void ff_pred8x8_top_dc_10(pixel *src, ptrdiff_t stride)
     399 ;-----------------------------------------------------------------------------
     400 INIT_XMM sse2
     401 cglobal pred8x8_top_dc_10, 2, 4
         ; Two butterfly shuffle+add rounds (swap word pairs 0x4e, then
         ; neighbors 0xb1) leave every lane of each 4-word half holding that
         ; half's 4-pixel sum; (sum+2)>>2 gives the per-half DC.
     402    sub         r0, r1
     403    mova        m0, [r0]
     404    pshuflw     m1, m0, 0x4e
     405    pshufhw     m1, m1, 0x4e
     406    paddw       m0, m1
     407    pshuflw     m1, m0, 0xb1
     408    pshufhw     m1, m1, 0xb1
     409    paddw       m0, m1
     410    lea         r2, [r1*3]
     411    lea         r3, [r0+r1*4]
     412    paddw       m0, [pw_2]
     413    psrlw       m0, 2
     414    mova [r0+r1*1], m0
     415    mova [r0+r1*2], m0
     416    mova [r0+r2*1], m0
     417    mova [r0+r1*4], m0
     418    mova [r3+r1*1], m0
     419    mova [r3+r1*2], m0
     420    mova [r3+r2*1], m0
     421    mova [r3+r1*4], m0
     422    RET
    423 
    424 ;-----------------------------------------------------------------------------
    425 ; void ff_pred8x8_plane_10(pixel *src, ptrdiff_t stride)
    426 ;-----------------------------------------------------------------------------
    427 INIT_XMM sse2
    428 cglobal pred8x8_plane_10, 2, 7, 7
    429    sub       r0, r1
    430    lea       r2, [r1*3]
    431    lea       r3, [r0+r1*4]
    432    mova      m2, [r0]
    433    pmaddwd   m2, [pw_m32101234]
    434    HADDD     m2, m1
    435    movd      m0, [r0-4]
    436    psrld     m0, 14
    437    psubw     m2, m0               ; H
    438    movd      m0, [r3+r1*4-4]
    439    movd      m1, [r0+12]
    440    paddw     m0, m1
    441    psllw     m0, 4                ; 16*(src[7*stride-1] + src[-stride+7])
    442    movzx    r4d, word [r3+r1*1-2] ; src[4*stride-1]
    443    movzx    r5d, word [r0+r2*1-2] ; src[2*stride-1]
    444    sub      r4d, r5d
    445    movzx    r6d, word [r3+r1*2-2] ; src[5*stride-1]
    446    movzx    r5d, word [r0+r1*2-2] ; src[1*stride-1]
    447    sub      r6d, r5d
    448    lea      r4d, [r4+r6*2]
    449    movzx    r5d, word [r3+r2*1-2] ; src[6*stride-1]
    450    movzx    r6d, word [r0+r1*1-2] ; src[0*stride-1]
    451    sub      r5d, r6d
    452    lea      r5d, [r5*3]
    453    add      r4d, r5d
    454    movzx    r6d, word [r3+r1*4-2] ; src[7*stride-1]
    455    movzx    r5d, word [r0+r1*0-2] ; src[ -stride-1]
    456    sub      r6d, r5d
    457    lea      r4d, [r4+r6*4]
    458    movd      m3, r4d              ; V
    459    punpckldq m2, m3
    460    pmaddwd   m2, [pd_17]
    461    paddd     m2, [pd_16]
    462    psrad     m2, 5                ; b, c
    463 
    464    mova      m3, [pw_pixel_max]
    465    pxor      m1, m1
    466    SPLATW    m0, m0, 1
    467    SPLATW    m4, m2, 2
    468    SPLATW    m2, m2, 0
    469    pmullw    m2, [pw_m32101234]   ; b
    470    pmullw    m5, m4, [pw_m3]      ; c
    471    paddw     m5, [pw_16]
    472    mov      r2d, 8
    473    add       r0, r1
    474 .loop:
    475    paddsw    m6, m2, m5
    476    paddsw    m6, m0
    477    psraw     m6, 5
    478    CLIPW     m6, m1, m3
    479    mova    [r0], m6
    480    paddw     m5, m4
    481    add       r0, r1
    482    dec r2d
    483    jg .loop
    484    RET
    485 
    486 
    487 ;-----------------------------------------------------------------------------
    488 ; void ff_pred8x8l_128_dc_10(pixel *src, int has_topleft, int has_topright,
    489 ;                            ptrdiff_t stride)
    490 ;-----------------------------------------------------------------------------
    491 INIT_XMM sse2
    492 cglobal pred8x8l_128_dc_10, 4, 4
    493    mova      m0, [pw_512] ; (1<<(BIT_DEPTH-1))
    494    lea       r1, [r3*3]
    495    lea       r2, [r0+r3*4]
    496    MOV8 r0+r3*0, m0, m0
    497    MOV8 r0+r3*1, m0, m0
    498    MOV8 r0+r3*2, m0, m0
    499    MOV8 r0+r1*1, m0, m0
    500    MOV8 r2+r3*0, m0, m0
    501    MOV8 r2+r3*1, m0, m0
    502    MOV8 r2+r3*2, m0, m0
    503    MOV8 r2+r1*1, m0, m0
    504    RET
    505 
     506 ;-----------------------------------------------------------------------------
     507 ; void ff_pred8x8l_top_dc_10(pixel *src, int has_topleft, int has_topright,
     508 ;                            ptrdiff_t stride)
     509 ;-----------------------------------------------------------------------------
     510 %macro PRED8x8L_TOP_DC 0
     511 cglobal pred8x8l_top_dc_10, 4, 4, 6
         ; NOTE(review): the shr r1d,14 / shr r2d,13 pattern implies the caller
         ; passes has_topleft/has_topright as bit flags (0x8000 / 0x4000), not
         ; 0/1 — confirm against the C caller.  The derived byte offsets make
         ; pinsrw fetch the true topleft / topright pixel when available, or
         ; replicate the top row's edge pixel otherwise.  DC of the filtered
         ; top row = (sum + 4) >> 3, broadcast to all 8 rows.
     512    sub         r0, r3
     513    mova        m0, [r0]
     514    shr        r1d, 14
     515    shr        r2d, 13
     516    neg         r1
     517    pslldq      m1, m0, 2
     518    psrldq      m2, m0, 2
     519    pinsrw      m1, [r0+r1], 0
     520    pinsrw      m2, [r0+r2+14], 7
     521    lea         r1, [r3*3]
     522    lea         r2, [r0+r3*4]
     523    PRED4x4_LOWPASS m0, m2, m1, m0
     524    HADDW       m0, m1
     525    paddw       m0, [pw_4]
     526    psrlw       m0, 3
     527    SPLATW      m0, m0, 0
     528    mova [r0+r3*1], m0
     529    mova [r0+r3*2], m0
     530    mova [r0+r1*1], m0
     531    mova [r0+r3*4], m0
     532    mova [r2+r3*1], m0
     533    mova [r2+r3*2], m0
     534    mova [r2+r1*1], m0
     535    mova [r2+r3*4], m0
     536    RET
     537 %endmacro
     538 
     539 INIT_XMM sse2
     540 PRED8x8L_TOP_DC
     541 %if HAVE_AVX_EXTERNAL
     542 INIT_XMM avx
     543 PRED8x8L_TOP_DC
     544 %endif
    545 
     546 ;-------------------------------------------------------------------------------
     547 ; void ff_pred8x8l_dc_10(pixel *src, int has_topleft, int has_topright,
     548 ;                        ptrdiff_t stride)
     549 ;-------------------------------------------------------------------------------
     550 ;TODO: see if scalar is faster
     551 %macro PRED8x8L_DC 0
cglobal pred8x8l_dc_10, 4, 6, 6
         ; DC over the lowpass-filtered left column (m3) and top row (m0):
         ; (sum16 + 8) >> 4, broadcast to the whole 8x8 block.
         ; NOTE(review): as in PRED8x8L_TOP_DC, r1/r2 are treated as bit flags
         ; (shr 14 / shr 13) — confirm the caller's encoding.  The not/and-r3
         ; trick afterwards picks [r0-2] (topleft) when the flag was set, else
         ; [r0+stride-2] (first left pixel); relies on stride being even,
         ; which holds for 2-byte pixels.
     553    sub         r0, r3
     554    lea         r4, [r0+r3*4]
     555    lea         r5, [r3*3]
     556    mova        m0, [r0+r3*2-16]
     557    punpckhwd   m0, [r0+r3*1-16]
     558    mova        m1, [r4+r3*0-16]
     559    punpckhwd   m1, [r0+r5*1-16]
     560    punpckhdq   m1, m0
     561    mova        m2, [r4+r3*2-16]
     562    punpckhwd   m2, [r4+r3*1-16]
     563    mova        m3, [r4+r3*4-16]
     564    punpckhwd   m3, [r4+r5*1-16]
     565    punpckhdq   m3, m2
     566    punpckhqdq  m3, m1
     567    mova        m0, [r0]
     568    shr        r1d, 14
     569    shr        r2d, 13
     570    neg         r1
     571    pslldq      m1, m0, 2
     572    psrldq      m2, m0, 2
     573    pinsrw      m1, [r0+r1], 0
     574    pinsrw      m2, [r0+r2+14], 7
     575    not         r1
     576    and         r1, r3
     577    pslldq      m4, m3, 2
     578    psrldq      m5, m3, 2
     579    pshuflw     m4, m4, 11100101b
     580    pinsrw      m5, [r0+r1-2], 7
     581    PRED4x4_LOWPASS m3, m4, m5, m3
     582    PRED4x4_LOWPASS m0, m2, m1, m0
     583    paddw       m0, m3
     584    HADDW       m0, m1
     585    paddw       m0, [pw_8]
     586    psrlw       m0, 4
     587    SPLATW      m0, m0
     588    mova [r0+r3*1], m0
     589    mova [r0+r3*2], m0
     590    mova [r0+r5*1], m0
     591    mova [r0+r3*4], m0
     592    mova [r4+r3*1], m0
     593    mova [r4+r3*2], m0
     594    mova [r4+r5*1], m0
     595    mova [r4+r3*4], m0
     596    RET
     597 %endmacro
     598 
     599 INIT_XMM sse2
     600 PRED8x8L_DC
     601 %if HAVE_AVX_EXTERNAL
     602 INIT_XMM avx
     603 PRED8x8L_DC
     604 %endif
    605 
     606 ;-----------------------------------------------------------------------------
     607 ; void ff_pred8x8l_vertical_10(pixel *src, int has_topleft, int has_topright,
     608 ;                              ptrdiff_t stride)
     609 ;-----------------------------------------------------------------------------
     610 %macro PRED8x8L_VERTICAL 0
     611 cglobal pred8x8l_vertical_10, 4, 4, 6
         ; Lowpass-filter the top row (edge taps chosen by the has_topleft /
         ; has_topright flags, same shr 14/13 encoding as PRED8x8L_TOP_DC —
         ; see the NOTE(review) there) and copy it into all 8 rows.
     612    sub         r0, r3
     613    mova        m0, [r0]
     614    shr        r1d, 14
     615    shr        r2d, 13
     616    neg         r1
     617    pslldq      m1, m0, 2
     618    psrldq      m2, m0, 2
     619    pinsrw      m1, [r0+r1], 0
     620    pinsrw      m2, [r0+r2+14], 7
     621    lea         r1, [r3*3]
     622    lea         r2, [r0+r3*4]
     623    PRED4x4_LOWPASS m0, m2, m1, m0
     624    mova [r0+r3*1], m0
     625    mova [r0+r3*2], m0
     626    mova [r0+r1*1], m0
     627    mova [r0+r3*4], m0
     628    mova [r2+r3*1], m0
     629    mova [r2+r3*2], m0
     630    mova [r2+r1*1], m0
     631    mova [r2+r3*4], m0
     632    RET
     633 %endmacro
     634 
     635 INIT_XMM sse2
     636 PRED8x8L_VERTICAL
     637 %if HAVE_AVX_EXTERNAL
     638 INIT_XMM avx
     639 PRED8x8L_VERTICAL
     640 %endif
    641 
     642 ;-----------------------------------------------------------------------------
     643 ; void ff_pred8x8l_horizontal_10(uint8_t *src, int has_topleft,
     644 ;                                int has_topright, ptrdiff_t stride)
     645 ;-----------------------------------------------------------------------------
     646 %macro PRED8x8L_HORIZONTAL 0
     647 cglobal pred8x8l_horizontal_10, 4, 4, 5
         ; Gather the 8 left-column pixels, lowpass-filter them, then broadcast
         ; each filtered pixel across its row (pshufd of the word-duplicated
         ; halves).  NOTE(review): shr r1d,14 / dec / and r3 / sub r3 yields
         ; r1 = -stride when has_topleft is passed as bit 15 (0x8000), else 0,
         ; so the first punpckhwd reads the topleft row or re-reads the current
         ; row (edge replication) — confirm the flag encoding with the caller.
     648    mova        m0, [r0-16]
     649    shr        r1d, 14
     650    dec         r1
     651    and         r1, r3
     652    sub         r1, r3
     653    punpckhwd   m0, [r0+r1-16]
     654    mova        m1, [r0+r3*2-16]
     655    punpckhwd   m1, [r0+r3*1-16]
     656    lea         r2, [r0+r3*4]
     657    lea         r1, [r3*3]
     658    punpckhdq   m1, m0
     659    mova        m2, [r2+r3*0-16]
     660    punpckhwd   m2, [r0+r1-16]
     661    mova        m3, [r2+r3*2-16]
     662    punpckhwd   m3, [r2+r3*1-16]
     663    punpckhdq   m3, m2
     664    punpckhqdq  m3, m1
     665    PALIGNR     m4, m3, [r2+r1-16], 14, m0
     666    pslldq      m0, m4, 2
     667    pshuflw     m0, m0, 11100101b
     668    PRED4x4_LOWPASS m4, m3, m0, m4
     669    punpckhwd   m3, m4, m4
     670    punpcklwd   m4, m4
     671    pshufd      m0, m3, 0xff
     672    pshufd      m1, m3, 0xaa
     673    pshufd      m2, m3, 0x55
     674    pshufd      m3, m3, 0x00
     675    mova [r0+r3*0], m0
     676    mova [r0+r3*1], m1
     677    mova [r0+r3*2], m2
     678    mova [r0+r1*1], m3
     679    pshufd      m0, m4, 0xff
     680    pshufd      m1, m4, 0xaa
     681    pshufd      m2, m4, 0x55
     682    pshufd      m3, m4, 0x00
     683    mova [r2+r3*0], m0
     684    mova [r2+r3*1], m1
     685    mova [r2+r3*2], m2
     686    mova [r2+r1*1], m3
     687    RET
     688 %endmacro
     689 
     690 INIT_XMM sse2
     691 PRED8x8L_HORIZONTAL
     692 INIT_XMM ssse3
     693 PRED8x8L_HORIZONTAL
     694 %if HAVE_AVX_EXTERNAL
     695 INIT_XMM avx
     696 PRED8x8L_HORIZONTAL
     697 %endif
    698 
     699 ;-----------------------------------------------------------------------------
     700 ; void ff_pred8x8l_down_left_10(pixel *src, int has_topleft, int has_topright,
     701 ;                               ptrdiff_t stride)
     702 ;-----------------------------------------------------------------------------
     703 %macro PRED8x8L_DOWN_LEFT 0
     704 cglobal pred8x8l_down_left_10, 4, 4, 7
         ; Phase 1: filter the top row into m6 (edge taps from the flag-derived
         ; offsets — same shr 14/13 encoding as PRED8x8L_TOP_DC, see the
         ; NOTE(review) there).  Phase 2: filter the topright row into m1, or
         ; replicate t7 via .fix_tr when has_topright is zero.  Phase 3: a
         ; second lowpass across the m6|m1 seam, then store the 16-pixel
         ; diagonal one row at a time, shifting m6 left under m1.
     705    sub         r0, r3
     706    mova        m3, [r0]
     707    shr        r1d, 14
     708    neg         r1
     709    shr        r2d, 13
     710    pslldq      m1, m3, 2
     711    psrldq      m2, m3, 2
     712    pinsrw      m1, [r0+r1], 0
     713    pinsrw      m2, [r0+r2+14], 7
     714    PRED4x4_LOWPASS m6, m2, m1, m3
     715    jz .fix_tr ; flags from shr r2d
     716    mova        m1, [r0+16]
     717    psrldq      m5, m1, 2
     718    PALIGNR     m2, m1, m3, 14, m3
     719    pshufhw     m5, m5, 10100100b
     720    PRED4x4_LOWPASS m1, m2, m5, m1
     721 .do_topright:
     722    lea         r1, [r3*3]
     723    psrldq      m5, m1, 14
     724    lea         r2, [r0+r3*4]
     725    PALIGNR     m2, m1, m6,  2, m0
     726    PALIGNR     m3, m1, m6, 14, m0
     727    PALIGNR     m5, m1,  2, m0
     728    pslldq      m4, m6, 2
     729    PRED4x4_LOWPASS m6, m4, m2, m6
     730    PRED4x4_LOWPASS m1, m3, m5, m1
     731    mova [r2+r3*4], m1
     732    PALIGNR     m1, m6, 14, m2
     733    pslldq      m6, 2
     734    mova [r2+r1*1], m1
     735    PALIGNR     m1, m6, 14, m2
     736    pslldq      m6, 2
     737    mova [r2+r3*2], m1
     738    PALIGNR     m1, m6, 14, m2
     739    pslldq      m6, 2
     740    mova [r2+r3*1], m1
     741    PALIGNR     m1, m6, 14, m2
     742    pslldq      m6, 2
     743    mova [r0+r3*4], m1
     744    PALIGNR     m1, m6, 14, m2
     745    pslldq      m6, 2
     746    mova [r0+r1*1], m1
     747    PALIGNR     m1, m6, 14, m2
     748    pslldq      m6, 2
     749    mova [r0+r3*2], m1
     750    PALIGNR     m1, m6, 14, m6
     751    mova [r0+r3*1], m1
     752    RET
     753 .fix_tr:
     754    punpckhwd   m3, m3
     755    pshufd      m1, m3, 0xFF       ; broadcast t7 as the whole topright row
     756    jmp .do_topright
     757 %endmacro
     758 
     759 INIT_XMM sse2
     760 PRED8x8L_DOWN_LEFT
     761 INIT_XMM ssse3
     762 PRED8x8L_DOWN_LEFT
     763 %if HAVE_AVX_EXTERNAL
     764 INIT_XMM avx
     765 PRED8x8L_DOWN_LEFT
     766 %endif
    767 
     768 ;-----------------------------------------------------------------------------
     769 ; void ff_pred8x8l_down_right_10(pixel *src, int has_topleft,
     770 ;                                int has_topright, ptrdiff_t stride)
     771 ;-----------------------------------------------------------------------------
     772 %macro PRED8x8L_DOWN_RIGHT 0
     773 ; standard forbids this when has_topleft is false
     774 ; no need to check
     775 cglobal pred8x8l_down_right_10, 4, 5, 8
         ; Phase 1: gather the 8 left pixels (high words of the -16 loads) into
         ; m3 and filter them (m6 = left|lt seam, m4 = left column).  Phase 2:
         ; filter the top row into m3 (has_topright via the shr 13 bit-flag
         ; offset; topleft is read unconditionally at [r0-2], per the comment
         ; above).  Phase 3: one more lowpass across the seam, then store the
         ; diagonal, shifting m6 left one pixel under m3 per row.
     776    sub         r0, r3
     777    lea         r4, [r0+r3*4]
     778    lea         r1, [r3*3]
     779    mova        m0, [r0+r3*1-16]
     780    punpckhwd   m0, [r0+r3*0-16]
     781    mova        m1, [r0+r1*1-16]
     782    punpckhwd   m1, [r0+r3*2-16]
     783    punpckhdq   m1, m0
     784    mova        m2, [r4+r3*1-16]
     785    punpckhwd   m2, [r4+r3*0-16]
     786    mova        m3, [r4+r1*1-16]
     787    punpckhwd   m3, [r4+r3*2-16]
     788    punpckhdq   m3, m2
     789    punpckhqdq  m3, m1
     790    mova        m0, [r4+r3*4-16]
     791    mova        m1, [r0]
     792    PALIGNR     m4, m3, m0, 14, m0
     793    PALIGNR     m1, m3,  2, m2
     794    pslldq      m0, m4, 2
     795    pshuflw     m0, m0, 11100101b
     796    PRED4x4_LOWPASS m6, m1, m4, m3
     797    PRED4x4_LOWPASS m4, m3, m0, m4
     798    mova        m3, [r0]
     799    shr        r2d, 13
     800    pslldq      m1, m3, 2
     801    psrldq      m2, m3, 2
     802    pinsrw      m1, [r0-2], 0
     803    pinsrw      m2, [r0+r2+14], 7
     804    PRED4x4_LOWPASS m3, m2, m1, m3
     805    PALIGNR     m2, m3, m6,  2, m0
     806    PALIGNR     m5, m3, m6, 14, m0
     807    psrldq      m7, m3, 2
     808    PRED4x4_LOWPASS m6, m4, m2, m6
     809    PRED4x4_LOWPASS m3, m5, m7, m3
     810    mova [r4+r3*4], m6
     811    PALIGNR     m3, m6, 14, m2
     812    pslldq      m6, 2
     813    mova [r0+r3*1], m3
     814    PALIGNR     m3, m6, 14, m2
     815    pslldq      m6, 2
     816    mova [r0+r3*2], m3
     817    PALIGNR     m3, m6, 14, m2
     818    pslldq      m6, 2
     819    mova [r0+r1*1], m3
     820    PALIGNR     m3, m6, 14, m2
     821    pslldq      m6, 2
     822    mova [r0+r3*4], m3
     823    PALIGNR     m3, m6, 14, m2
     824    pslldq      m6, 2
     825    mova [r4+r3*1], m3
     826    PALIGNR     m3, m6, 14, m2
     827    pslldq      m6, 2
     828    mova [r4+r3*2], m3
     829    PALIGNR     m3, m6, 14, m6
     830    mova [r4+r1*1], m3
     831    RET
     832 %endmacro
     833 
     834 INIT_XMM sse2
     835 PRED8x8L_DOWN_RIGHT
     836 INIT_XMM ssse3
     837 PRED8x8L_DOWN_RIGHT
     838 %if HAVE_AVX_EXTERNAL
     839 INIT_XMM avx
     840 PRED8x8L_DOWN_RIGHT
     841 %endif
    842 
    843 ;-----------------------------------------------------------------------------
    844 ; void ff_pred8x8l_vertical_right_10(pixel *src, int has_topleft,
    845 ;                                    int has_topright, ptrdiff_t stride)
    846 ;-----------------------------------------------------------------------------
; 8x8 luma vertical-right intra prediction, 10-bit pixels (16-bit words).
; In: r0 = src (top-left of the 8x8 block), r2 = has_topright,
;     r3 = stride in bytes. Uses 5 GPRs / 7 XMM regs (cglobal declaration).
    847 %macro PRED8x8L_VERTICAL_RIGHT 0
    848 ; likewise with 8x8l_down_right
    849 cglobal pred8x8l_vertical_right_10, 4, 5, 7
    850    sub         r0, r3              ; r0 -> row above the block (top edge)
    851    lea         r4, [r0+r3*4]       ; r4 -> r0 + 4 rows
    852    lea         r1, [r3*3]          ; r1 = stride*3 (flag arg no longer needed)
; Gather the eight left-edge pixels (column -1, i.e. the high word of each
; 16-byte-left aligned load) into m3 via word/dword/qword interleaves.
    853    mova        m0, [r0+r3*1-16]
    854    punpckhwd   m0, [r0+r3*0-16]
    855    mova        m1, [r0+r1*1-16]
    856    punpckhwd   m1, [r0+r3*2-16]
    857    punpckhdq   m1, m0
    858    mova        m2, [r4+r3*1-16]
    859    punpckhwd   m2, [r4+r3*0-16]
    860    mova        m3, [r4+r1*1-16]
    861    punpckhwd   m3, [r4+r3*2-16]
    862    punpckhdq   m3, m2
    863    punpckhqdq  m3, m1
    864    mova        m0, [r4+r3*4-16]    ; bottom-most left-neighbour row
    865    mova        m1, [r0]            ; top edge t0..t7
    866    PALIGNR     m4, m3, m0, 14, m0
    867    PALIGNR     m1, m3,  2, m2
    868    PRED4x4_LOWPASS m3, m1, m4, m3  ; m3 = low-pass filtered left edge
    869    mova        m2, [r0]
; NOTE(review): r2d presumably carries has_topright as a high bit flag; the
; shift turns it into a byte offset (0 when top-right is unavailable, so the
; last top pixel is re-read in its place) -- confirm against the C caller.
    870    shr        r2d, 13
    871    pslldq      m1, m2, 2
    872    psrldq      m5, m2, 2
    873    pinsrw      m1, [r0-2], 0       ; prepend the top-left pixel
    874    pinsrw      m5, [r0+r2+14], 7   ; append t8 (or replicated t7)
    875    PRED4x4_LOWPASS m2, m5, m1, m2  ; m2 = low-pass filtered top edge
    876    PALIGNR     m6, m2, m3, 12, m1
    877    PALIGNR     m5, m2, m3, 14, m0
    878    PRED4x4_LOWPASS m0, m6, m2, m5  ; m0 = base for rows 1,3,5,7 (3-tap)
    879    pavgw       m2, m5              ; m2 = base for rows 0,2,4,6 (average)
    880    mova [r0+r3*2], m0
    881    mova [r0+r3*1], m2
; Each following row shifts the corresponding base row right by one pixel,
; feeding filtered left-edge pixels (accumulated in m1) in from the left.
    882    pslldq      m6, m3, 4
    883    pslldq      m1, m3, 2
    884    PRED4x4_LOWPASS m1, m3, m6, m1
    885    PALIGNR     m2, m1, 14, m4
    886    mova [r0+r1*1], m2
    887    pslldq      m1, 2
    888    PALIGNR     m0, m1, 14, m3
    889    mova [r0+r3*4], m0
    890    pslldq      m1, 2
    891    PALIGNR     m2, m1, 14, m4
    892    mova [r4+r3*1], m2
    893    pslldq      m1, 2
    894    PALIGNR     m0, m1, 14, m3
    895    mova [r4+r3*2], m0
    896    pslldq      m1, 2
    897    PALIGNR     m2, m1, 14, m4
    898    mova [r4+r1*1], m2
    899    pslldq      m1, 2
    900    PALIGNR     m0, m1, 14, m1
    901    mova [r4+r3*4], m0
    902    RET
    903 %endmacro
    904 
    905 INIT_XMM sse2
; Instantiate SSE2, SSSE3 and (when enabled at build time) AVX versions.
    906 PRED8x8L_VERTICAL_RIGHT
    907 INIT_XMM ssse3
    908 PRED8x8L_VERTICAL_RIGHT
    909 %if HAVE_AVX_EXTERNAL
    910 INIT_XMM avx
    911 PRED8x8L_VERTICAL_RIGHT
    912 %endif
    913 
    914 ;-----------------------------------------------------------------------------
    915 ; void ff_pred8x8l_horizontal_up_10(pixel *src, int has_topleft,
    916 ;                                   int has_topright, ptrdiff_t stride)
    917 ;-----------------------------------------------------------------------------
; 8x8 luma horizontal-up intra prediction, 10-bit pixels (16-bit words).
; Only the left-edge column (plus, when available, the pixel above it) is read.
    918 %macro PRED8x8L_HORIZONTAL_UP 0
    919 cglobal pred8x8l_horizontal_up_10, 4, 4, 6
    920    mova        m0, [r0+r3*0-16]
    921    punpckhwd   m0, [r0+r3*1-16]
; Branchless row select for the "pixel above the left edge": r1 ends up as
; -stride when has_topleft is set (load from row -1), 0 otherwise (row 0).
; NOTE(review): relies on has_topleft arriving as a specific bit flag so that
; shr 14 gives a small non-zero value -- confirm against the C-side caller.
    922    shr        r1d, 14
    923    dec         r1
    924    and         r1, r3
    925    sub         r1, r3
    926    mova        m4, [r0+r1*1-16]
    927    lea         r1, [r3*3]          ; r1 = stride*3 from here on
    928    lea         r2, [r0+r3*4]       ; r2 -> 4 rows below src
; Gather the eight left-edge pixels into m0 (word/dword/qword interleaves).
    929    mova        m1, [r0+r3*2-16]
    930    punpckhwd   m1, [r0+r1*1-16]
    931    punpckhdq   m0, m1
    932    mova        m2, [r2+r3*0-16]
    933    punpckhwd   m2, [r2+r3*1-16]
    934    mova        m3, [r2+r3*2-16]
    935    punpckhwd   m3, [r2+r1*1-16]
    936    punpckhdq   m2, m3
    937    punpckhqdq  m0, m2
    938    PALIGNR     m1, m0, m4, 14, m4
    939    psrldq      m2, m0, 2
    940    pshufhw     m2, m2, 10100100b   ; replicate the last pixel into the gap
    941    PRED4x4_LOWPASS m0, m1, m2, m0  ; m0 = low-pass filtered left edge
; Two interpolation flavours: pavgw = 2-tap averages, PRED4x4_LOWPASS =
; rounded 3-tap values; interleaving them yields the mode's zig-zag order.
    942    psrldq      m1, m0, 2
    943    psrldq      m2, m0, 4
    944    pshufhw     m1, m1, 10100100b
    945    pshufhw     m2, m2, 01010100b
    946    pavgw       m4, m0, m1
    947    PRED4x4_LOWPASS m1, m2, m0, m1
    948    punpckhwd   m5, m4, m1
    949    punpcklwd   m4, m1
    950    mova [r2+r3*0], m5
    951    mova [r0+r3*0], m4
; Lower rows slide the pattern along; the tail replicates the last pixel.
    952    pshufd      m0, m5, 11111001b
    953    pshufd      m1, m5, 11111110b
    954    pshufd      m2, m5, 11111111b
    955    mova [r2+r3*1], m0
    956    mova [r2+r3*2], m1
    957    mova [r2+r1*1], m2
    958    PALIGNR     m2, m5, m4, 4, m0
    959    PALIGNR     m3, m5, m4, 8, m1
    960    PALIGNR     m5, m5, m4, 12, m4
    961    mova [r0+r3*1], m2
    962    mova [r0+r3*2], m3
    963    mova [r0+r1*1], m5
    964    RET
    965 %endmacro
    966 
    967 INIT_XMM sse2
; Instantiate SSE2, SSSE3 and (when enabled at build time) AVX versions.
    968 PRED8x8L_HORIZONTAL_UP
    969 INIT_XMM ssse3
    970 PRED8x8L_HORIZONTAL_UP
    971 %if HAVE_AVX_EXTERNAL
    972 INIT_XMM avx
    973 PRED8x8L_HORIZONTAL_UP
    974 %endif
    975 
    976 
    977 ;-----------------------------------------------------------------------------
    978 ; void ff_pred16x16_vertical_10(pixel *src, ptrdiff_t stride)
    979 ;-----------------------------------------------------------------------------
    980 %macro MOV16 3-5
; Store one 16-pixel (32-byte) row at address %1 from two XMM registers.
; NOTE(review): declared as 3-5 args but %4/%5 are never used here --
; presumably kept so call sites stay source-compatible with a 4-register
; (smaller-mmsize) variant; confirm before tightening the arity.
    981    mova [%1+     0], %2
    982    mova [%1+mmsize], %3
    983 %endmacro
    984 
    985 INIT_XMM sse2
; 16x16 vertical prediction: copy the row above the block into all 16 rows.
; r0 = src, r1 = stride in bytes.
    986 cglobal pred16x16_vertical_10, 2, 3
    987    sub   r0, r1                    ; r0 -> the row above the block
    988    mov  r2d, 8                     ; 8 iterations x 2 rows = 16 rows
    989    mova  m0, [r0+ 0]
    990    mova  m1, [r0+mmsize]
    991 .loop:
    992    MOV16 r0+r1*1, m0, m1, m2, m3
    993    MOV16 r0+r1*2, m0, m1, m2, m3
    994    lea   r0, [r0+r1*2]
    995    dec   r2d
    996    jg .loop
    997    RET
    998 
    999 ;-----------------------------------------------------------------------------
   1000 ; void ff_pred16x16_horizontal_10(pixel *src, ptrdiff_t stride)
   1001 ;-----------------------------------------------------------------------------
   1002 INIT_XMM sse2
; 16x16 horizontal prediction: fill each row with its left neighbour
; (the pixel at column -1). r0 = src, r1 = stride in bytes.
   1003 cglobal pred16x16_horizontal_10, 2, 3
   1004    mov   r2d, 8                    ; 8 iterations x 2 rows = 16 rows
   1005 .vloop:
   1006    movd   m0, [r0+r1*0-4]          ; load 2 pixels; word 1 is column -1
   1007    movd   m1, [r0+r1*1-4]
   1008    SPLATW m0, m0, 1                ; broadcast that pixel to all 8 words
   1009    SPLATW m1, m1, 1
   1010    MOV16  r0+r1*0, m0, m0, m0, m0
   1011    MOV16  r0+r1*1, m1, m1, m1, m1
   1012    lea    r0, [r0+r1*2]
   1013    dec    r2d
   1014    jg .vloop
   1015    RET
   1016 
   1017 ;-----------------------------------------------------------------------------
   1018 ; void ff_pred16x16_dc_10(pixel *src, ptrdiff_t stride)
   1019 ;-----------------------------------------------------------------------------
   1020 INIT_XMM sse2
; 16x16 DC prediction: fill the block with
; (sum of 16 top pixels + sum of 16 left pixels + 16) >> 5.
   1021 cglobal pred16x16_dc_10, 2, 6
   1022    mov        r5, r0               ; keep the block origin for the store loop
   1023    sub        r0, r1
   1024    mova       m0, [r0+0]           ; sum the 16 pixels of the top row
   1025    paddw      m0, [r0+mmsize]
   1026    HADDW      m0, m2               ; m0(word 0) = top-row sum
   1027 
; Sum the 16 left-edge pixels with scalar loads, two rows per step
; (r3d/r4d accumulate alternating rows).
   1028    lea        r0, [r0+r1-2]        ; -> column -1, row 0
   1029    movzx     r3d, word [r0]
   1030    movzx     r4d, word [r0+r1]
   1031 %rep 7
   1032    lea        r0, [r0+r1*2]
   1033    movzx     r2d, word [r0]
   1034    add       r3d, r2d
   1035    movzx     r2d, word [r0+r1]
   1036    add       r4d, r2d
   1037 %endrep
   1038    lea       r3d, [r3+r4+16]       ; left sum + rounding bias
   1039 
   1040    movd       m1, r3d
   1041    paddw      m0, m1
   1042    psrlw      m0, 5                ; divide by the 32 summed samples
   1043    SPLATW     m0, m0               ; broadcast the DC value
   1044    mov       r3d, 8                ; 8 iterations x 2 rows
   1045 .loop:
   1046    MOV16 r5+r1*0, m0, m0, m0, m0
   1047    MOV16 r5+r1*1, m0, m0, m0, m0
   1048    lea        r5, [r5+r1*2]
   1049    dec       r3d
   1050    jg .loop
   1051    RET
   1052 
   1053 ;-----------------------------------------------------------------------------
   1054 ; void ff_pred16x16_top_dc_10(pixel *src, ptrdiff_t stride)
   1055 ;-----------------------------------------------------------------------------
   1056 INIT_XMM sse2
; 16x16 top-DC prediction: fill with (sum of the 16 top pixels + 8) >> 4.
   1057 cglobal pred16x16_top_dc_10, 2, 3
   1058    sub        r0, r1               ; r0 -> the row above the block
   1059    mova       m0, [r0+0]
   1060    paddw      m0, [r0+mmsize]
   1061    HADDW      m0, m2               ; m0(word 0) = top-row sum
   1062 
   1063    SPLATW     m0, m0
   1064    paddw      m0, [pw_8]           ; per-lane rounding bias
   1065    psrlw      m0, 4                ; divide by the 16 summed samples
   1066    mov       r2d, 8                ; 8 iterations x 2 rows
   1067 .loop:
   1068    MOV16 r0+r1*1, m0, m0, m0, m0
   1069    MOV16 r0+r1*2, m0, m0, m0, m0
   1070    lea        r0, [r0+r1*2]
   1071    dec       r2d
   1072    jg .loop
   1073    RET
   1074 
   1075 ;-----------------------------------------------------------------------------
   1076 ; void ff_pred16x16_left_dc_10(pixel *src, ptrdiff_t stride)
   1077 ;-----------------------------------------------------------------------------
   1078 INIT_XMM sse2
; 16x16 left-DC prediction: fill with (sum of the 16 left pixels + 8) >> 4.
   1079 cglobal pred16x16_left_dc_10, 2, 6
   1080    mov        r5, r0               ; keep the block origin for the store loop
   1081 
; Sum the 16 left-edge pixels with scalar loads, two rows per step.
   1082    sub        r0, 2                ; -> column -1, row 0
   1083    movzx     r3d, word [r0]
   1084    movzx     r4d, word [r0+r1]
   1085 %rep 7
   1086    lea        r0, [r0+r1*2]
   1087    movzx     r2d, word [r0]
   1088    add       r3d, r2d
   1089    movzx     r2d, word [r0+r1]
   1090    add       r4d, r2d
   1091 %endrep
   1092    lea       r3d, [r3+r4+8]        ; sum + rounding bias
   1093    shr       r3d, 4                ; divide by the 16 summed samples
   1094 
   1095    movd       m0, r3d
   1096    SPLATW     m0, m0               ; broadcast the DC value
   1097    mov       r3d, 8                ; 8 iterations x 2 rows
   1098 .loop:
   1099    MOV16 r5+r1*0, m0, m0, m0, m0
   1100    MOV16 r5+r1*1, m0, m0, m0, m0
   1101    lea        r5, [r5+r1*2]
   1102    dec       r3d
   1103    jg .loop
   1104    RET
   1105 
   1106 ;-----------------------------------------------------------------------------
   1107 ; void ff_pred16x16_128_dc_10(pixel *src, ptrdiff_t stride)
   1108 ;-----------------------------------------------------------------------------
   1109 INIT_XMM sse2
; 16x16 flat mid-grey fill: 512 = 1 << (10-1), the 10-bit analogue of 128.
   1110 cglobal pred16x16_128_dc_10, 2,3
   1111    mova       m0, [pw_512]
   1112    mov       r2d, 8                ; 8 iterations x 2 rows
   1113 .loop:
   1114    MOV16 r0+r1*0, m0, m0, m0, m0
   1115    MOV16 r0+r1*1, m0, m0, m0, m0
   1116    lea        r0, [r0+r1*2]
   1117    dec       r2d
   1118    jg .loop
   1119    RET