tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

aom_subpixel_8t_ssse3.asm (30367B)


      1 ;
      2 ; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 ;
      4 ; This source code is subject to the terms of the BSD 2 Clause License and
      5 ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 ; was not distributed with this source code in the LICENSE file, you can
      7 ; obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 ; Media Patent License 1.0 was not distributed with this source code in the
      9 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 ;
     11 
     12 ;
     13 
     14 %include "third_party/x86inc/x86inc.asm"
     15 
     16 SECTION_RODATA
        ; Rounding term (64 = 1 << 6) added to each filter sum before "psraw x, 7".
     17 pw_64:    times 8 dw 64
        ; When and-ed with a register, keeps the low byte of every 16-bit word.
     18 even_byte_mask: times 8 dw 0x00ff
     19 
     20 ; %define USE_PMULHRSW
     21 ; NOTE: pmulhrsw has a latency of 5 cycles.  Tests showed a performance loss
     22 ; when using this instruction.
     23 ;
     24 ; The add order below (based on ffav1) must be followed to prevent out-of-range sums.
     25 ; x = k0k1 + k4k5
     26 ; y = k2k3 + k6k7
     27 ; z = signed SAT(x + y)
     28 
     29 SECTION .text
        ; Stack scratch area shared by the macros below: six 16-byte slots.
        ; Slots 0-3 hold the broadcast tap pairs k0k1..k6k7 (see SETUP_LOCAL_VARS);
        ; slot 4 holds krd on 32-bit builds, slots 4-5 are tmp0/tmp1 on 64-bit builds.
     30 %define LOCAL_VARS_SIZE 16*6
     31 
     32 %macro SETUP_LOCAL_VARS 0
           ; Prepares the filter constants used by the 8- and 16-wide filters.
           ; In:   m4 = the eight 16-bit filter taps (callers load it from [filterq]).
           ; Out:  stack slots k0k1, k2k3, k4k5, k6k7 = one adjacent pair of taps each,
           ;       as signed bytes, repeated across the whole 16-byte slot;
           ;       krd = the pw_64 rounding constant (m12 on x86-64, a stack slot
           ;       otherwise); tmp0/tmp1 = two extra stack slots, x86-64 only.
           ; Clobbers m0-m4, plus m6 on 32-bit builds.
     33    ; TODO(slavarnway): using xmm registers for these on AOM_ARCH_X86_64 +
     34    ; pmaddubsw has a higher latency on some platforms, this might be eased by
     35    ; interleaving the instructions.
     36    %define    k0k1  [rsp + 16*0]
     37    %define    k2k3  [rsp + 16*1]
     38    %define    k4k5  [rsp + 16*2]
     39    %define    k6k7  [rsp + 16*3]
     40    packsswb     m4, m4                  ;16-bit taps -> signed bytes k0..k7 (in both halves)
     41    ; TODO(slavarnway): multiple pshufb instructions had a higher latency on
     42    ; some platforms.
     43    pshuflw      m0, m4, 0b              ;k0_k1
     44    pshuflw      m1, m4, 01010101b       ;k2_k3
     45    pshuflw      m2, m4, 10101010b       ;k4_k5
     46    pshuflw      m3, m4, 11111111b       ;k6_k7
     47    punpcklqdq   m0, m0                  ;replicate the low qword into the high qword
     48    punpcklqdq   m1, m1
     49    punpcklqdq   m2, m2
     50    punpcklqdq   m3, m3
     51    mova       k0k1, m0
     52    mova       k2k3, m1
     53    mova       k4k5, m2
     54    mova       k6k7, m3
     55 %if AOM_ARCH_X86_64
           ; 64-bit: the rounding constant stays in a register; slots 4-5 are scratch.
     56    %define     krd  m12
     57    %define    tmp0  [rsp + 16*4]
     58    %define    tmp1  [rsp + 16*5]
     59    mova        krd, [GLOBAL(pw_64)]
     60 %else
           ; 32-bit: the rounding constant is kept in stack slot 4.
     61    %define     krd  [rsp + 16*4]
     62 %if CONFIG_PIC=0
     63    mova         m6, [GLOBAL(pw_64)]
     64 %else
     65    ; build constants without accessing global memory
     66    pcmpeqb      m6, m6                  ;all ones
     67    psrlw        m6, 15                  ;each word = 1
     68    psllw        m6, 6                   ;aka pw_64
     69 %endif
     70    mova        krd, m6
     71 %endif
     72 %endm
     73 
     74 ;-------------------------------------------------------------------------------
     75 %if AOM_ARCH_X86_64
     76  %define LOCAL_VARS_SIZE_H4 0
     77 %else
     78  %define LOCAL_VARS_SIZE_H4 16*4
     79 %endif
        ; LOCAL_VARS_SIZE_H4 is the stack space reserved by SUBPIX_HFILTER4: none on
        ; x86-64 (its constants live in m8-m10), four 16-byte slots otherwise (the
        ; macro uses slots 0-2).
     80 
     81 %macro SUBPIX_HFILTER4 1
           ; SUBPIX_HFILTER4 <variant>
           ; Emits filter_block1d4_<variant>(src, sstride, dst, dstride, height, filter):
           ; an 8-tap horizontal filter that reads from [src - 3] and writes 4 bytes
           ; per output row.
           ;   <variant> = h8          plain filter
           ;               h8_avg      result is pavgb-averaged with the bytes already in dst
           ;               h8_add_src  the source pixels at [srcq] are added before packing
           ; Each output is (sum of the 8 tap products + 64) >> 7, saturated to an
           ; unsigned byte by packuswb.
           ; The main loop handles two rows per pass; one trailing row is handled after
           ; the loop when height is odd.
           ; NOTE(review): the loop body is entered unconditionally, so height == 1
           ; would still write two rows -- presumably callers never pass that; confirm.
     82 cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \
     83                            src, sstride, dst, dstride, height, filter
     84    mova                m4, [filterq]
     85    packsswb            m4, m4                  ;16-bit taps -> signed bytes, in both halves
     86 %if AOM_ARCH_X86_64
     87    %define       k0k1k4k5  m8
     88    %define       k2k3k6k7  m9
     89    %define            krd  m10
     90    mova               krd, [GLOBAL(pw_64)]
     91    pshuflw       k0k1k4k5, m4, 0b              ;k0_k1
     92    pshufhw       k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5
     93    pshuflw       k2k3k6k7, m4, 01010101b       ;k2_k3
     94    pshufhw       k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7
     95 %else
     96    %define       k0k1k4k5  [rsp + 16*0]
     97    %define       k2k3k6k7  [rsp + 16*1]
     98    %define            krd  [rsp + 16*2]
     99    pshuflw             m6, m4, 0b              ;k0_k1
    100    pshufhw             m6, m6, 10101010b       ;k0_k1_k4_k5
    101    pshuflw             m7, m4, 01010101b       ;k2_k3
    102    pshufhw             m7, m7, 11111111b       ;k2_k3_k6_k7
    103 %if CONFIG_PIC=0
    104    mova                m1, [GLOBAL(pw_64)]
    105 %else
    106    ; build constants without accessing global memory
    107    pcmpeqb             m1, m1                  ;all ones
    108    psrlw               m1, 15
    109    psllw               m1, 6                   ;aka pw_64
    110 %endif
    111    mova          k0k1k4k5, m6
    112    mova          k2k3k6k7, m7
    113    mova               krd, m1
    114 %endif
           ; Pre-decrement so that, after the "sub heightd, 2" passes, the counter ends
           ; at 0 when one odd row is left and at -1 when nothing is left.
    115    dec            heightd
    116 
    117 .loop:
    118    ;Do two rows at once
    119    movu                m4, [srcq - 3]
    120    movu                m5, [srcq + sstrideq - 3]
           ; Duplicate every byte, then palignr by an odd count so that each 16-bit
           ; lane holds two neighbouring pixels for pmaddubsw.
    121    punpckhbw           m1, m4, m4
    122    punpcklbw           m4, m4
    123    punpckhbw           m3, m5, m5
    124    punpcklbw           m5, m5
    125    palignr             m0, m1, m4, 1
    126    pmaddubsw           m0, k0k1k4k5            ;row 0: k0k1 terms (low half), k4k5 terms (high half)
    127    palignr             m1, m4, 5
    128    pmaddubsw           m1, k2k3k6k7            ;row 0: k2k3 terms (low half), k6k7 terms (high half)
    129    palignr             m2, m3, m5, 1
    130    pmaddubsw           m2, k0k1k4k5            ;row 1, same layout
    131    palignr             m3, m5, 5
    132    pmaddubsw           m3, k2k3k6k7
           ; Regroup so each register holds row 0 in its low half and row 1 in its
           ; high half, then add in the order required by the note at the top of the file.
    133    punpckhqdq          m4, m0, m2              ;k4k5 terms of both rows
    134    punpcklqdq          m0, m2                  ;k0k1 terms of both rows
    135    punpckhqdq          m5, m1, m3              ;k6k7 terms of both rows
    136    punpcklqdq          m1, m3                  ;k2k3 terms of both rows
    137    paddsw              m0, m4                  ;x = k0k1 + k4k5
    138    paddsw              m1, m5                  ;y = k2k3 + k6k7
    139 %ifidn %1, h8_avg
    140    movd                m4, [dstq]
    141    movd                m5, [dstq + dstrideq]
    142 %endif
    143    paddsw              m0, m1                  ;z = SAT(x + y)
    144    paddsw              m0, krd                 ;+ 64
    145    psraw               m0, 7
    146 %ifidn %1, h8_add_src
    147    pxor                 m3, m3
    148    movu                 m4, [srcq]
    149    movu                 m5, [srcq + sstrideq]
    150    punpckldq            m4, m5 ; Bytes 0,1,2,3 from row 0, then 0,1,2,3 from row 1
    151    punpcklbw            m4, m3
    152    paddsw               m0, m4
    153 %endif
    154    packuswb            m0, m0                  ;bytes 0-3 = row 0, bytes 4-7 = row 1
    155    psrldq              m1, m0, 4               ;m1 low dword = row 1
    156 
    157 %ifidn %1, h8_avg
    158    pavgb               m0, m4
    159    pavgb               m1, m5
    160 %endif
    161    movd            [dstq], m0
    162    movd [dstq + dstrideq], m1
    163 
    164    lea               srcq, [srcq + sstrideq        ]
    165    prefetcht0              [srcq + 4 * sstrideq - 3]
    166    lea               srcq, [srcq + sstrideq        ]
    167    lea               dstq, [dstq + 2 * dstrideq    ]
    168    prefetcht0              [srcq + 2 * sstrideq - 3]
    169 
    170    sub            heightd, 2
    171    jg               .loop
    172 
    173    ; Do last row if output_height is odd
           ; (the flags are still those of the "sub" above: non-zero means no row is left)
    174    jne              .done
    175 
    176    movu                m4, [srcq - 3]
    177    punpckhbw           m1, m4, m4
    178    punpcklbw           m4, m4
    179    palignr             m0, m1, m4, 1
    180    palignr             m1, m4, 5
    181    pmaddubsw           m0, k0k1k4k5
    182    pmaddubsw           m1, k2k3k6k7
    183    psrldq              m2, m0, 8               ;bring the k4k5 terms down to the low half
    184    psrldq              m3, m1, 8               ;bring the k6k7 terms down to the low half
    185    paddsw              m0, m2
    186    paddsw              m1, m3
    187    paddsw              m0, m1
    188    paddsw              m0, krd
    189    psraw               m0, 7
    190 %ifidn %1, h8_add_src
    191    pxor                m3, m3
    192    movu                m4, [srcq]
    193    punpcklbw           m4, m3
    194    paddsw              m0, m4
    195 %endif
    196    packuswb            m0, m0
    197 %ifidn %1, h8_avg
    198    movd                m4, [dstq]
    199    pavgb               m0, m4
    200 %endif
    201    movd            [dstq], m0
    202 .done:
    203    REP_RET
    204 %endm
    205 
    206 ;-------------------------------------------------------------------------------
    207 %macro SUBPIX_HFILTER8 1
           ; SUBPIX_HFILTER8 <variant>
           ; Emits filter_block1d8_<variant>(src, sstride, dst, dstride, height, filter):
           ; an 8-tap horizontal filter that reads from [src - 3] and writes 8 bytes
           ; per output row.
           ;   <variant> = h8          plain filter
           ;               h8_avg      result is pavgb-averaged with the bytes already in dst
           ;               h8_add_src  the source pixels at [srcq] are added before packing
           ; Tap pairs and the rounding constant come from SETUP_LOCAL_VARS.
           ; The main loop handles two rows per pass (and always runs at least once);
           ; one trailing row is handled after the loop when height is odd.
    208 cglobal filter_block1d8_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
    209                            src, sstride, dst, dstride, height, filter
    210    mova                 m4, [filterq]
    211    SETUP_LOCAL_VARS
           ; Pre-decrement: after the 2-row passes the counter is 0 if an odd row remains.
    212    dec             heightd
    213 
    214 .loop:
    215    ;Do two rows at once
    216    movu                 m0, [srcq - 3]
    217    movu                 m4, [srcq + sstrideq - 3]
           ; Row 0: duplicate each byte, then palignr by 1/5/9/13 to form the pixel
           ; pairs that start 0/2/4/6 pixels into the window.
    218    punpckhbw            m1, m0, m0
    219    punpcklbw            m0, m0
    220    palignr              m5, m1, m0, 13
    221    pmaddubsw            m5, k6k7
    222    palignr              m2, m1, m0, 5
    223    palignr              m3, m1, m0, 9
    224    palignr              m1, m0, 1
    225    pmaddubsw            m1, k0k1
           ; Row 1 byte duplication is interleaved with the row 0 multiplies.
    226    punpckhbw            m6, m4, m4
    227    punpcklbw            m4, m4
    228    pmaddubsw            m2, k2k3
    229    pmaddubsw            m3, k4k5
    230 
    231    palignr              m7, m6, m4, 13
    232    palignr              m0, m6, m4, 5
    233    pmaddubsw            m7, k6k7
    234    paddsw               m1, m3                  ;row 0: x = k0k1 + k4k5
    235    paddsw               m2, m5                  ;row 0: y = k2k3 + k6k7
    236    paddsw               m1, m2                  ;row 0: SAT(x + y)
    237 %ifidn %1, h8_avg
    238    movh                 m2, [dstq]
    239    movhps               m2, [dstq + dstrideq]
    240 %endif
    241    palignr              m5, m6, m4, 9
    242    palignr              m6, m4, 1
    243    pmaddubsw            m0, k2k3
    244    pmaddubsw            m6, k0k1
    245    paddsw               m1, krd
    246    pmaddubsw            m5, k4k5
    247    psraw                m1, 7
    248    paddsw               m0, m7                  ;row 1: y = k2k3 + k6k7
    249    paddsw               m6, m5                  ;row 1: x = k0k1 + k4k5
    250    paddsw               m6, m0                  ;row 1: SAT(x + y)
    251    paddsw               m6, krd
    252    psraw                m6, 7
    253 %ifidn %1, h8_add_src
    254    pxor                 m3, m3
    255    movu                 m4, [srcq]
    256    movu                 m5, [srcq + sstrideq]
    257    punpcklbw            m4, m3
    258    punpcklbw            m5, m3
    259    paddsw               m1, m4
    260    paddsw               m6, m5
    261 %endif
    262    packuswb             m1, m6                  ;low 8 bytes = row 0, high 8 bytes = row 1
    263 %ifidn %1, h8_avg
    264    pavgb                m1, m2
    265 %endif
    266    movh              [dstq], m1
    267    movhps [dstq + dstrideq], m1
    268 
    269    lea                srcq, [srcq + sstrideq        ]
    270    prefetcht0               [srcq + 4 * sstrideq - 3]
    271    lea                srcq, [srcq + sstrideq        ]
    272    lea                dstq, [dstq + 2 * dstrideq    ]
    273    prefetcht0               [srcq + 2 * sstrideq - 3]
    274    sub             heightd, 2
    275    jg                .loop
    276 
    277    ; Do last row if output_height is odd
           ; (the flags are still those of the "sub" above: non-zero means no row is left)
    278    jne               .done
    279 
    280    movu                 m0, [srcq - 3]
    281    punpckhbw            m3, m0, m0
    282    punpcklbw            m0, m0
    283    palignr              m1, m3, m0, 1
    284    palignr              m2, m3, m0, 5
    285    palignr              m4, m3, m0, 13
    286    palignr              m3, m0, 9
    287    pmaddubsw            m1, k0k1
    288    pmaddubsw            m2, k2k3
    289    pmaddubsw            m3, k4k5
    290    pmaddubsw            m4, k6k7
    291    paddsw               m1, m3
    292    paddsw               m4, m2
    293    paddsw               m1, m4
    294    paddsw               m1, krd
    295    psraw                m1, 7
    296 %ifidn %1, h8_add_src
    297    pxor                 m6, m6
    298    movu                 m5, [srcq]
    299    punpcklbw            m5, m6
    300    paddsw               m1, m5
    301 %endif
    302    packuswb             m1, m1
    303 %ifidn %1, h8_avg
    304    movh                 m0, [dstq]
    305    pavgb                m1, m0
    306 %endif
    307    movh             [dstq], m1
    308 .done:
    309    REP_RET
    310 %endm
    311 
    312 ;-------------------------------------------------------------------------------
    313 %macro SUBPIX_HFILTER16 1
           ; SUBPIX_HFILTER16 <variant>
           ; Emits filter_block1d16_<variant>(src, sstride, dst, dstride, height, filter):
           ; an 8-tap horizontal filter that writes 16 bytes per output row, one row
           ; per loop pass.
           ;   <variant> = h8          plain filter
           ;               h8_avg      result is pavgb-averaged with the bytes already in dst
           ;               h8_add_src  the source pixels at [srcq] are added before packing
           ; Loads starting at odd offsets (-3, -1, +1, +3) feed m0-m3 and give the
           ; even-numbered outputs; loads at even offsets (-2, 0, +2, +4) feed m4-m7
           ; and give the odd-numbered outputs.  The two sets are interleaved at the end.
           ; dst is accessed with mova / a pavgb memory operand.
           ; NOTE(review): that presumably requires dst to be 16-byte aligned -- confirm.
    314 cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
    315                             src, sstride, dst, dstride, height, filter
    316    mova          m4, [filterq]
    317    SETUP_LOCAL_VARS
    318 
    319 .loop:
    320    prefetcht0        [srcq + 2 * sstrideq -3]
    321 
    322    movu          m0, [srcq - 3]
    323    movu          m4, [srcq - 2]
    324    pmaddubsw     m0, k0k1
    325    pmaddubsw     m4, k0k1
    326    movu          m1, [srcq - 1]
    327    movu          m5, [srcq + 0]
    328    pmaddubsw     m1, k2k3
    329    pmaddubsw     m5, k2k3
    330    movu          m2, [srcq + 1]
    331    movu          m6, [srcq + 2]
    332    pmaddubsw     m2, k4k5
    333    pmaddubsw     m6, k4k5
    334    movu          m3, [srcq + 3]
    335    movu          m7, [srcq + 4]
    336    pmaddubsw     m3, k6k7
    337    pmaddubsw     m7, k6k7
    338    paddsw        m0, m2                  ;even outputs: x = k0k1 + k4k5
    339    paddsw        m1, m3                  ;even outputs: y = k2k3 + k6k7
    340    paddsw        m0, m1
    341    paddsw        m4, m6                  ;odd outputs: x = k0k1 + k4k5
    342    paddsw        m5, m7                  ;odd outputs: y = k2k3 + k6k7
    343    paddsw        m4, m5
    344    paddsw        m0, krd
    345    paddsw        m4, krd
    346    psraw         m0, 7
    347    psraw         m4, 7
    348 %ifidn %1, h8_add_src
    349 %if AOM_ARCH_X86=1 && CONFIG_PIC=1
    350    pcmpeqb       m2, m2                  ;all ones
    351    psrlw         m2, 8                   ;even_byte_mask
    352 %else
    353    mova          m2, [GLOBAL(even_byte_mask)]
    354 %endif
    355    movu          m5, [srcq]
    356    mova          m7, m5
    357    pand          m5, m2                  ;source pixels at even positions, as words
    358    psrlw         m7, 8                   ;source pixels at odd positions, as words
    359    paddsw        m0, m5
    360    paddsw        m4, m7
    361 %endif
    362    packuswb      m0, m0
    363    packuswb      m4, m4
    364    punpcklbw     m0, m4                  ;interleave even and odd outputs back into pixel order
    365 %ifidn %1, h8_avg
    366    pavgb         m0, [dstq]
    367 %endif
    368    lea         srcq, [srcq + sstrideq]
    369    mova      [dstq], m0
    370    lea         dstq, [dstq + dstrideq]
    371    dec      heightd
    372    jnz        .loop
    373    REP_RET
    374 %endm
    375 
    376 INIT_XMM ssse3
        ; Instantiate the plain (h8) horizontal filters for 16-, 8- and 4-byte-wide rows.
    377 SUBPIX_HFILTER16 h8
    378 SUBPIX_HFILTER8  h8
    379 SUBPIX_HFILTER4  h8
    380 
    381 ;-------------------------------------------------------------------------------
    382 
    383 ; TODO(Linfeng): Detect cpu type and choose the code with better performance.
        ; While this is 1, the vertical filter macros below always assemble their
        ; first (reload-every-row) implementation.
    384 %define X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON 1
    385 
        ; General-purpose register count requested from cglobal by the vertical
        ; filters: on x86-64 the first implementation also uses r7 and r8.
    386 %if AOM_ARCH_X86_64 && X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
    387    %define NUM_GENERAL_REG_USED 9
    388 %else
    389    %define NUM_GENERAL_REG_USED 6
    390 %endif
    391 
    392 %macro SUBPIX_VFILTER 2
           ; SUBPIX_VFILTER <variant>, <width>
           ; Emits filter_block1d<width>_<variant>(src, sstride, dst, dstride, height, filter):
           ; an 8-tap vertical filter; the first output row is built from the eight
           ; source rows src + 0..7 * sstride (labelled A..H below).
           ;   <width>   8 selects movh (8 bytes per row); anything else selects movd (4 bytes)
           ;   <variant> v8, v8_avg (pavgb with the bytes already in dst) or
           ;             v8_add_src (adds source pixels before packing)
           ; Two implementations follow.  The first reloads every source row on each
           ; pass; the second keeps interleaved row pairs in m0-m6 between passes and
           ; uses m8-m14.  With X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON defined as 1 the
           ; first one is always the one assembled.
           ; Both handle two output rows per pass (the loop body always runs at least
           ; once) and one trailing row when height is odd.
           ; NOTE(review): for v8_add_src the first implementation adds the pixels at
           ; [srcq] / [src1q] (rows A / B), whereas SUBPIX_VFILTER16 adds the row three
           ; rows below srcq; the second implementation has no v8_add_src handling at
           ; all.  Confirm the intended row before instantiating v8_add_src here.
    393 cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \
    394                             src, sstride, dst, dstride, height, filter
    395    mova          m4, [filterq]
    396    SETUP_LOCAL_VARS
    397 
    398 %ifidn %2, 8
    399    %define                movx  movh
    400 %else
    401    %define                movx  movd
    402 %endif
    403 
           ; Pre-decrement: after the 2-row passes the counter is 0 if an odd row remains.
    404    dec                 heightd
    405 
    406 %if AOM_ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
    407 
           ; src1q tracks srcq + sstride; sstride6q = 6 * sstride.  On 32-bit builds
           ; they reuse the filter and dstride registers (the taps are already on the
           ; stack), so the destination stride is taken from dstridemp instead.
    408 %if AOM_ARCH_X86_64
    409    %define               src1q  r7
    410    %define           sstride6q  r8
    411    %define          dst_stride  dstrideq
    412 %else
    413    %define               src1q  filterq
    414    %define           sstride6q  dstrideq
    415    %define          dst_stride  dstridemp
    416 %endif
    417    mov                   src1q, srcq
    418    add                   src1q, sstrideq
    419    lea               sstride6q, [sstrideq + sstrideq * 4]
    420    add               sstride6q, sstrideq                   ;pitch * 6
    421 
    422 .loop:
    423    ;Do two rows at once
           ; "next iter" in the comments below marks the operands of the second output
           ; row, which uses the same source rows shifted down by one.
    424    movx                     m0, [srcq                ]     ;A
    425    movx                     m1, [src1q               ]     ;B
    426    punpcklbw                m0, m1                         ;A B
    427    movx                     m2, [srcq + sstrideq * 2 ]     ;C
    428    pmaddubsw                m0, k0k1
    429    mova                     m6, m2
    430    movx                     m3, [src1q + sstrideq * 2]     ;D
    431    punpcklbw                m2, m3                         ;C D
    432    pmaddubsw                m2, k2k3
    433    movx                     m4, [srcq + sstrideq * 4 ]     ;E
    434    mova                     m7, m4
    435    movx                     m5, [src1q + sstrideq * 4]     ;F
    436    punpcklbw                m4, m5                         ;E F
    437    pmaddubsw                m4, k4k5
    438    punpcklbw                m1, m6                         ;A B next iter
    439    movx                     m6, [srcq + sstride6q    ]     ;G
    440    punpcklbw                m5, m6                         ;E F next iter
    441    punpcklbw                m3, m7                         ;C D next iter
    442    pmaddubsw                m5, k4k5
    443    movx                     m7, [src1q + sstride6q   ]     ;H
    444    punpcklbw                m6, m7                         ;G H
    445    pmaddubsw                m6, k6k7
    446    pmaddubsw                m3, k2k3
    447    pmaddubsw                m1, k0k1
    448    paddsw                   m0, m4
    449    paddsw                   m2, m6
    450    movx                     m6, [srcq + sstrideq * 8 ]     ;H next iter (the row after H)
    451    punpcklbw                m7, m6
    452    pmaddubsw                m7, k6k7
    453    paddsw                   m0, m2
    454    paddsw                   m0, krd
    455    psraw                    m0, 7
    456    paddsw                   m1, m5
    457 %ifidn %1, v8_add_src
    458    pxor                     m6, m6
    459    movu                     m4, [srcq]
    460    punpcklbw                m4, m6
    461    paddsw                   m0, m4
    462 %endif
    463    packuswb                 m0, m0
    464 
    465    paddsw                   m3, m7
    466    paddsw                   m1, m3
    467    paddsw                   m1, krd
    468    psraw                    m1, 7
    469 %ifidn %1, v8_add_src
    470    movu                     m4, [src1q]
    471    punpcklbw                m4, m6
    472    paddsw                   m1, m4
    473 %endif
    474    lea                    srcq, [srcq + sstrideq * 2 ]
    475    lea                   src1q, [src1q + sstrideq * 2]
    476    packuswb                 m1, m1
    477 
    478 %ifidn %1, v8_avg
    479    movx                     m2, [dstq]
    480    pavgb                    m0, m2
    481 %endif
    482    movx                 [dstq], m0
    483    add                    dstq, dst_stride
    484 %ifidn %1, v8_avg
    485    movx                     m3, [dstq]
    486    pavgb                    m1, m3
    487 %endif
    488    movx                 [dstq], m1
    489    add                    dstq, dst_stride
    490    sub                 heightd, 2
    491    jg                    .loop
    492 
    493    ; Do last row if output_height is odd
           ; (the flags are still those of the "sub" above: non-zero means no row is left)
    494    jne                   .done
    495 
    496    movx                     m0, [srcq                ]     ;A
    497    movx                     m1, [srcq + sstrideq     ]     ;B
    498    movx                     m6, [srcq + sstride6q    ]     ;G
    499    punpcklbw                m0, m1                         ;A B
    500    movx                     m7, [src1q + sstride6q   ]     ;H
    501    pmaddubsw                m0, k0k1
    502    movx                     m2, [srcq + sstrideq * 2 ]     ;C
    503    punpcklbw                m6, m7                         ;G H
    504    movx                     m3, [src1q + sstrideq * 2]     ;D
    505    pmaddubsw                m6, k6k7
    506    movx                     m4, [srcq + sstrideq * 4 ]     ;E
    507    punpcklbw                m2, m3                         ;C D
    508    movx                     m5, [src1q + sstrideq * 4]     ;F
    509    punpcklbw                m4, m5                         ;E F
    510    pmaddubsw                m2, k2k3
    511    pmaddubsw                m4, k4k5
    512    paddsw                   m2, m6
    513    paddsw                   m0, m4
    514    paddsw                   m0, m2
    515    paddsw                   m0, krd
    516    psraw                    m0, 7
    517 %ifidn %1, v8_add_src
    518    pxor                     m6, m6
    519    movu                     m4, [srcq]
    520    punpcklbw                m4, m6
    521    paddsw                   m0, m4
    522 %endif
    523    packuswb                 m0, m0
    524 %ifidn %1, v8_avg
    525    movx                     m1, [dstq]
    526    pavgb                    m0, m1
    527 %endif
    528    movx                 [dstq], m0
    529 
    530 %else
    531    ; AOM_ARCH_X86_64
           ; Second implementation: load rows A..G once and keep the interleaved pairs
           ; in m0-m5 (G alone in m6); each pass then loads only two new rows.
    532 
    533    movx                     m0, [srcq                ]     ;A
    534    movx                     m1, [srcq + sstrideq     ]     ;B
    535    lea                    srcq, [srcq + sstrideq * 2 ]
    536    movx                     m2, [srcq]                     ;C
    537    movx                     m3, [srcq + sstrideq]          ;D
    538    lea                    srcq, [srcq + sstrideq * 2 ]
    539    movx                     m4, [srcq]                     ;E
    540    movx                     m5, [srcq + sstrideq]          ;F
    541    lea                    srcq, [srcq + sstrideq * 2 ]
    542    movx                     m6, [srcq]                     ;G
    543    punpcklbw                m0, m1                         ;A B
    544    punpcklbw                m1, m2                         ;A B next iter
    545    punpcklbw                m2, m3                         ;C D
    546    punpcklbw                m3, m4                         ;C D next iter
    547    punpcklbw                m4, m5                         ;E F
    548    punpcklbw                m5, m6                         ;E F next iter
    549 
    550 .loop:
    551    ;Do two rows at once
    552    movx                     m7, [srcq + sstrideq]          ;H
    553    lea                    srcq, [srcq + sstrideq * 2 ]
    554    movx                    m14, [srcq]                     ;H next iter
    555    punpcklbw                m6, m7                         ;G H
    556    punpcklbw                m7, m14                        ;G H next iter
           ; Multiply each pair and, at the same time, slide the pairs down one slot
           ; (C D -> A B, E F -> C D, G H -> E F) ready for the next pass.
    557    pmaddubsw                m8, m0, k0k1
    558    pmaddubsw                m9, m1, k0k1
    559    mova                     m0, m2
    560    mova                     m1, m3
    561    pmaddubsw               m10, m2, k2k3
    562    pmaddubsw               m11, m3, k2k3
    563    mova                     m2, m4
    564    mova                     m3, m5
    565    pmaddubsw                m4, k4k5
    566    pmaddubsw                m5, k4k5
    567    paddsw                   m8, m4
    568    paddsw                   m9, m5
    569    mova                     m4, m6
    570    mova                     m5, m7
    571    pmaddubsw                m6, k6k7
    572    pmaddubsw                m7, k6k7
    573    paddsw                  m10, m6
    574    paddsw                  m11, m7
    575    paddsw                   m8, m10
    576    paddsw                   m9, m11
    577    mova                     m6, m14                        ;newest row becomes G of the next pass
    578    paddsw                   m8, krd
    579    paddsw                   m9, krd
    580    psraw                    m8, 7
    581    psraw                    m9, 7
    582 %ifidn %2, 4
    583    packuswb                 m8, m8
    584    packuswb                 m9, m9
    585 %else
    586    packuswb                 m8, m9                         ;low 8 bytes = first row, high 8 bytes = second row
    587 %endif
    588 
    589 %ifidn %1, v8_avg
    590    movx                     m7, [dstq]
    591 %ifidn %2, 4
    592    movx                    m10, [dstq + dstrideq]
    593    pavgb                    m9, m10
    594 %else
    595    movhpd                   m7, [dstq + dstrideq]
    596 %endif
    597    pavgb                    m8, m7
    598 %endif
    599    movx                 [dstq], m8
    600 %ifidn %2, 4
    601    movx      [dstq + dstrideq], m9
    602 %else
    603    movhpd    [dstq + dstrideq], m8
    604 %endif
    605 
    606    lea                    dstq, [dstq + dstrideq * 2 ]
    607    sub                 heightd, 2
    608    jg                    .loop
    609 
    610    ; Do last row if output_height is odd
           ; (the flags are still those of the "sub" above: non-zero means no row is left)
    611    jne                   .done
    612 
    613    movx                     m7, [srcq + sstrideq]          ;H
    614    punpcklbw                m6, m7                         ;G H
    615    pmaddubsw                m0, k0k1
    616    pmaddubsw                m2, k2k3
    617    pmaddubsw                m4, k4k5
    618    pmaddubsw                m6, k6k7
    619    paddsw                   m0, m4
    620    paddsw                   m2, m6
    621    paddsw                   m0, m2
    622    paddsw                   m0, krd
    623    psraw                    m0, 7
    624    packuswb                 m0, m0
    625 %ifidn %1, v8_avg
    626    movx                     m1, [dstq]
    627    pavgb                    m0, m1
    628 %endif
    629    movx                 [dstq], m0
    630 
    631 %endif ; AOM_ARCH_X86_64
    632 
    633 .done:
    634    REP_RET
    635 
    636 %endm
    637 
    638 ;-------------------------------------------------------------------------------
    639 %macro SUBPIX_VFILTER16 1
    640 cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \
    641                             src, sstride, dst, dstride, height, filter
    642    mova                     m4, [filterq]
    643    SETUP_LOCAL_VARS
    644 
    645 %if AOM_ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
    646 
    647 %if AOM_ARCH_X86_64
    648    %define               src1q  r7
    649    %define           sstride6q  r8
    650    %define          dst_stride  dstrideq
    651 %else
    652    %define               src1q  filterq
    653    %define           sstride6q  dstrideq
    654    %define          dst_stride  dstridemp
    655 %endif
    656    lea                   src1q, [srcq + sstrideq]
    657    lea               sstride6q, [sstrideq + sstrideq * 4]
    658    add               sstride6q, sstrideq                   ;pitch * 6
    659 
    660 .loop:
    661    movh                     m0, [srcq                ]     ;A
    662    movh                     m1, [src1q               ]     ;B
    663    movh                     m2, [srcq + sstrideq * 2 ]     ;C
    664    movh                     m3, [src1q + sstrideq * 2]     ;D
    665    movh                     m4, [srcq + sstrideq * 4 ]     ;E
    666    movh                     m5, [src1q + sstrideq * 4]     ;F
    667 
    668    punpcklbw                m0, m1                         ;A B
    669    movh                     m6, [srcq + sstride6q]         ;G
    670    punpcklbw                m2, m3                         ;C D
    671    movh                     m7, [src1q + sstride6q]        ;H
    672    punpcklbw                m4, m5                         ;E F
    673    pmaddubsw                m0, k0k1
    674    movh                     m3, [srcq + 8]                 ;A
    675    pmaddubsw                m2, k2k3
    676    punpcklbw                m6, m7                         ;G H
    677    movh                     m5, [srcq + sstrideq + 8]      ;B
    678    pmaddubsw                m4, k4k5
    679    punpcklbw                m3, m5                         ;A B
    680    movh                     m7, [srcq + sstrideq * 2 + 8]  ;C
    681    pmaddubsw                m6, k6k7
    682    movh                     m5, [src1q + sstrideq * 2 + 8] ;D
    683    punpcklbw                m7, m5                         ;C D
    684    paddsw                   m2, m6
    685    pmaddubsw                m3, k0k1
    686    movh                     m1, [srcq + sstrideq * 4 + 8]  ;E
    687    paddsw                   m0, m4
    688    pmaddubsw                m7, k2k3
    689    movh                     m6, [src1q + sstrideq * 4 + 8] ;F
    690    punpcklbw                m1, m6                         ;E F
    691    paddsw                   m0, m2
    692    paddsw                   m0, krd
    693    movh                     m2, [srcq + sstride6q + 8]     ;G
    694    pmaddubsw                m1, k4k5
    695    movh                     m5, [src1q + sstride6q + 8]    ;H
    696    psraw                    m0, 7
    697    punpcklbw                m2, m5                         ;G H
    698    pmaddubsw                m2, k6k7
    699    paddsw                   m7, m2
    700    paddsw                   m3, m1
    701    paddsw                   m3, m7
    702    paddsw                   m3, krd
    703    psraw                    m3, 7
    704 %ifidn %1, v8_add_src
    705    pxor                     m6, m6
    706    movu                     m4, [src1q + 2 * sstrideq] ; Fetch from 3 rows down
    707    mova                     m5, m4
    708    punpcklbw                m4, m6
    709    punpckhbw                m5, m6
    710    paddsw                   m0, m4
    711    paddsw                   m3, m5
    712 %endif
    713    packuswb                 m0, m3
    714 
    715    add                    srcq, sstrideq
    716    add                   src1q, sstrideq
    717 %ifidn %1, v8_avg
    718    pavgb                    m0, [dstq]
    719 %endif
    720    mova                 [dstq], m0
    721    add                    dstq, dst_stride
    722    dec                 heightd
    723    jnz                   .loop
    724    REP_RET
    725 
    726 %else
    727    ; AOM_ARCH_X86_64
    728    dec                 heightd
    729 
    730    movu                     m1, [srcq                ]     ;A
    731    movu                     m3, [srcq + sstrideq     ]     ;B
    732    lea                    srcq, [srcq + sstrideq * 2]
    733    punpcklbw                m0, m1, m3                     ;A B
    734    punpckhbw                m1, m3                         ;A B
    735    movu                     m5, [srcq]                     ;C
    736    punpcklbw                m2, m3, m5                     ;A B next iter
    737    punpckhbw                m3, m5                         ;A B next iter
    738    mova                   tmp0, m2                         ;store to stack
    739    mova                   tmp1, m3                         ;store to stack
    740    movu                     m7, [srcq + sstrideq]          ;D
    741    lea                    srcq, [srcq + sstrideq * 2]
    742    punpcklbw                m4, m5, m7                     ;C D
    743    punpckhbw                m5, m7                         ;C D
    744    movu                     m9, [srcq]                     ;E
    745    punpcklbw                m6, m7, m9                     ;C D next iter
    746    punpckhbw                m7, m9                         ;C D next iter
    747    movu                    m11, [srcq + sstrideq]          ;F
    748    lea                    srcq, [srcq + sstrideq * 2]
    749    punpcklbw                m8, m9, m11                    ;E F
    750    punpckhbw                m9, m11                        ;E F
    751    movu                     m2, [srcq]                     ;G
    752    punpcklbw               m10, m11, m2                    ;E F next iter
    753    punpckhbw               m11, m2                         ;E F next iter
    754 
    755 .loop:
    756    ;Do two rows at once
    757    pmaddubsw               m13, m0, k0k1
    758    mova                     m0, m4
    759    pmaddubsw               m14, m8, k4k5
    760    pmaddubsw               m15, m4, k2k3
    761    mova                     m4, m8
    762    paddsw                  m13, m14
    763    movu                     m3, [srcq + sstrideq]          ;H
    764    lea                    srcq, [srcq + sstrideq * 2]
    765    punpcklbw               m14, m2, m3                     ;G H
    766    mova                     m8, m14
    767    pmaddubsw               m14, k6k7
    768    paddsw                  m15, m14
    769    paddsw                  m13, m15
    770    paddsw                  m13, krd
    771    psraw                   m13, 7
    772 
    773    pmaddubsw               m14, m1, k0k1
    774    pmaddubsw                m1, m9, k4k5
    775    pmaddubsw               m15, m5, k2k3
    776    paddsw                  m14, m1
    777    mova                     m1, m5
    778    mova                     m5, m9
    779    punpckhbw                m2, m3                         ;G H
    780    mova                     m9, m2
    781    pmaddubsw                m2, k6k7
    782    paddsw                  m15, m2
    783    paddsw                  m14, m15
    784    paddsw                  m14, krd
    785    psraw                   m14, 7
    786    packuswb                m13, m14
    787 %ifidn %1, v8_avg
    788    pavgb                   m13, [dstq]
    789 %endif
    790    mova                 [dstq], m13
    791 
    792    ; next iter
    793    pmaddubsw               m15, tmp0, k0k1
    794    pmaddubsw               m14, m10, k4k5
    795    pmaddubsw               m13, m6, k2k3
    796    paddsw                  m15, m14
    797    mova                   tmp0, m6
    798    mova                     m6, m10
    799    movu                     m2, [srcq]                     ;G next iter
    800    punpcklbw               m14, m3, m2                     ;G H next iter
    801    mova                    m10, m14
    802    pmaddubsw               m14, k6k7
    803    paddsw                  m13, m14
    804    paddsw                  m15, m13
    805    paddsw                  m15, krd
    806    psraw                   m15, 7
    807 
    808    pmaddubsw               m14, tmp1, k0k1
    809    mova                   tmp1, m7
    810    pmaddubsw               m13, m7, k2k3
    811    mova                     m7, m11
    812    pmaddubsw               m11, k4k5
    813    paddsw                  m14, m11
    814    punpckhbw                m3, m2                         ;G H next iter
    815    mova                    m11, m3
    816    pmaddubsw                m3, k6k7
    817    paddsw                  m13, m3
    818    paddsw                  m14, m13
    819    paddsw                  m14, krd
    820    psraw                   m14, 7
    821    packuswb                m15, m14
    822 %ifidn %1, v8_avg
    823    pavgb                   m15, [dstq + dstrideq]
    824 %endif
    825    mova      [dstq + dstrideq], m15
    826    lea                    dstq, [dstq + dstrideq * 2]
    827    sub                 heightd, 2
    828    jg                    .loop
    829 
    830    ; Do last row if output_height is odd
    831    jne                   .done
    832 
    833    movu                     m3, [srcq + sstrideq]          ;H
    834    punpcklbw                m6, m2, m3                     ;G H
    835    punpckhbw                m2, m3                         ;G H
    836    pmaddubsw                m0, k0k1
    837    pmaddubsw                m1, k0k1
    838    pmaddubsw                m4, k2k3
    839    pmaddubsw                m5, k2k3
    840    pmaddubsw                m8, k4k5
    841    pmaddubsw                m9, k4k5
    842    pmaddubsw                m6, k6k7
    843    pmaddubsw                m2, k6k7
    844    paddsw                   m0, m8
    845    paddsw                   m1, m9
    846    paddsw                   m4, m6
    847    paddsw                   m5, m2
    848    paddsw                   m0, m4
    849    paddsw                   m1, m5
    850    paddsw                   m0, krd
    851    paddsw                   m1, krd
    852    psraw                    m0, 7
    853    psraw                    m1, 7
    854    packuswb                 m0, m1
    855 %ifidn %1, v8_avg
    856    pavgb                    m0, [dstq]
    857 %endif
    858    mova                 [dstq], m0
    859 
    860 .done:
    861    REP_RET
    862 
    863 %endif ; AOM_ARCH_X86_64
    864 
    865 %endm
    866 
    867 INIT_XMM ssse3                      ; x86inc setup for the expansions below; presumably maps m0..m15 to XMM registers and tags generated names with "ssse3" -- confirm in x86inc.asm
    868 SUBPIX_VFILTER16     v8             ; expand the SUBPIX_VFILTER16 macro with %1 = v8; since %1 is not v8_avg, its `%ifidn %1, v8_avg` pavgb lines are not assembled
    869 SUBPIX_VFILTER       v8, 8          ; SUBPIX_VFILTER is defined earlier in this file; the second argument is presumably the block width in pixels -- TODO confirm
    870 SUBPIX_VFILTER       v8, 4          ; same macro with second argument 4 (presumably the 4-wide variant) -- TODO confirm