tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

vp9lpf_16bpp.asm (24829B)


;******************************************************************************
;* VP9 loop filter SIMD optimizations
;*
;* Copyright (C) 2015 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pw_511: times 16 dw 511
pw_2047: times 16 dw 2047
pw_16384: times 16 dw 16384
pw_m512: times 16 dw -512
pw_m2048: times 16 dw -2048

cextern pw_1
cextern pw_3
cextern pw_4
cextern pw_8
cextern pw_16
cextern pw_256
cextern pw_1023
cextern pw_4095
cextern pw_m1

SECTION .text

%macro SCRATCH 3-4
%if ARCH_X86_64
    SWAP                %1, %2
%if %0 == 4
%define reg_%4 m%2
%endif
%else
    mova              [%3], m%1
%if %0 == 4
%define reg_%4 [%3]
%endif
%endif
%endmacro

%macro UNSCRATCH 3-4
%if ARCH_X86_64
    SWAP                %1, %2
%else
    mova               m%1, [%3]
%endif
%if %0 == 4
%undef reg_%4
%endif
%endmacro

%macro PRELOAD 2-3
%if ARCH_X86_64
    mova               m%1, [%2]
%if %0 == 3
%define reg_%3 m%1
%endif
%elif %0 == 3
%define reg_%3 [%2]
%endif
%endmacro
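
; The three helpers above hide the register-count difference between targets:
; on x86-64 there are enough XMM registers, so SCRATCH/UNSCRATCH are plain
; SWAPs and PRELOAD loads into a spare high register, while on x86-32 they
; fall back to stack/memory operands. Either way, reg_<name> (reg_E, reg_F,
; reg_Q0, ...) resolves to something usable as an instruction operand, so the
; filter body below is written once for both architectures.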

; calculate p or q portion of flat8out
%macro FLAT8OUT_HALF 0
    psubw               m4, m0                      ; q4-q0
    psubw               m5, m0                      ; q5-q0
    psubw               m6, m0                      ; q6-q0
    psubw               m7, m0                      ; q7-q0
    ABS2                m4, m5, m2, m3              ; abs(q4-q0) | abs(q5-q0)
    ABS2                m6, m7, m2, m3              ; abs(q6-q0) | abs(q7-q0)
    pcmpgtw             m4, reg_F                   ; abs(q4-q0) > F
    pcmpgtw             m5, reg_F                   ; abs(q5-q0) > F
    pcmpgtw             m6, reg_F                   ; abs(q6-q0) > F
    pcmpgtw             m7, reg_F                   ; abs(q7-q0) > F
    por                 m5, m4
    por                 m7, m6
    por                 m7, m5                      ; !flat8out, q portion
%endmacro

; calculate p or q portion of flat8in/hev/fm (excluding mb_edge condition)
%macro FLAT8IN_HALF 1
%if %1 > 4
    psubw               m4, m3, m0                  ; q3-q0
    psubw               m5, m2, m0                  ; q2-q0
    ABS2                m4, m5, m6, m7              ; abs(q3-q0) | abs(q2-q0)
    pcmpgtw             m4, reg_F                   ; abs(q3-q0) > F
    pcmpgtw             m5, reg_F                   ; abs(q2-q0) > F
%endif
    psubw               m3, m2                      ; q3-q2
    psubw               m2, m1                      ; q2-q1
    ABS2                m3, m2, m6, m7              ; abs(q3-q2) | abs(q2-q1)
    pcmpgtw             m3, reg_I                   ; abs(q3-q2) > I
    pcmpgtw             m2, reg_I                   ; abs(q2-q1) > I
%if %1 > 4
    por                 m4, m5
%endif
    por                 m2, m3
    psubw               m3, m1, m0                  ; q1-q0
    ABS1                m3, m5                      ; abs(q1-q0)
%if %1 > 4
    pcmpgtw             m6, m3, reg_F               ; abs(q1-q0) > F
%endif
    pcmpgtw             m7, m3, reg_H               ; abs(q1-q0) > H
    pcmpgtw             m3, reg_I                   ; abs(q1-q0) > I
%if %1 > 4
    por                 m4, m6
%endif
    por                 m2, m3
%endmacro
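
; Note that the per-line annotations in both macros above are written in
; terms of the q rows; the same macros are reused verbatim for the p half by
; loading p0-p3 (and p4-p7) into the same registers, with the p and q partial
; masks OR'ed together afterwards.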

; one step in filter_14/filter_6
;
; take sum $reg, downshift, apply mask and write into dst
;
; if sub2/add1-2 are present, add/sub as appropriate to prepare for the next
; step's sum $reg. This is omitted for the last row in each filter.
;
; if dont_store is set, don't write the result into memory, instead keep the
; values in register so we can write it out later
%macro FILTER_STEP 6-10 "", "", "", 0 ; tmp, reg, mask, shift, dst, \
                                      ; src/sub1, sub2, add1, add2, dont_store
    psrlw               %1, %2, %4
    psubw               %1, %6                      ; abs->delta
%ifnidn %7, ""
    psubw               %2, %6
    psubw               %2, %7
    paddw               %2, %8
    paddw               %2, %9
%endif
    pand                %1, reg_%3                  ; apply mask
%if %10 == 1
    paddw               %6, %1                      ; delta->abs
%else
    paddw               %1, %6                      ; delta->abs
    mova              [%5], %1
%endif
%endmacro
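
; Illustrative expansion (a reader's sketch, not an extra macro): with all
; optional arguments present, one FILTER_STEP performs roughly
;
;     tmp  = (reg >> shift) - src        ; filtered value, as a delta vs. src
;     reg += add1 + add2 - src - sub2    ; slide the running sum to the next row
;     tmp &= reg_<mask>                  ; keep the delta only where the filter applies
;     dst  = src + tmp                   ; back to absolute values, then store
;
; (with dont_store=1, the result is instead accumulated into the src register).
; The filter_14/filter_6 smoothers are thus evaluated as one running sum,
; updated with two adds and two subtracts per output row.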

; FIXME avx2 versions for 16_16 and mix2_{4,8}{4,8}

%macro LOOP_FILTER 3 ; dir[h/v], wd[4/8/16], bpp[10/12]

%if ARCH_X86_64
%if %2 == 16
%assign %%num_xmm_regs 16
%elif %2 == 8
%assign %%num_xmm_regs 15
%else ; %2 == 4
%assign %%num_xmm_regs 14
%endif ; %2
%assign %%bak_mem 0
%else ; ARCH_X86_32
%assign %%num_xmm_regs 8
%if %2 == 16
%assign %%bak_mem 7
%elif %2 == 8
%assign %%bak_mem 6
%else ; %2 == 4
%assign %%bak_mem 5
%endif ; %2
%endif ; ARCH_X86_64/32

%if %2 == 16
%ifidn %1, v
%assign %%num_gpr_regs 6
%else ; %1 == h
%assign %%num_gpr_regs 5
%endif ; %1
%assign %%wd_mem 6
%else ; %2 == 8/4
%assign %%num_gpr_regs 5
%if ARCH_X86_32 && %2 == 8
%assign %%wd_mem 2
%else ; ARCH_X86_64 || %2 == 4
%assign %%wd_mem 0
%endif ; ARCH_X86_64/32 etc.
%endif ; %2

%ifidn %1, v
%assign %%tsp_mem 0
%elif %2 == 16 ; && %1 == h
%assign %%tsp_mem 16
%else ; %1 == h && %2 == 8/4
%assign %%tsp_mem 8
%endif ; %1/%2

%assign %%off %%wd_mem
%assign %%tspoff %%bak_mem+%%wd_mem
%assign %%stack_mem ((%%bak_mem+%%wd_mem+%%tsp_mem)*mmsize)

%if %3 == 10
%define %%maxsgn 511
%define %%minsgn m512
%define %%maxusgn 1023
%define %%maxf 4
%else ; %3 == 12
%define %%maxsgn 2047
%define %%minsgn m2048
%define %%maxusgn 4095
%define %%maxf 16
%endif ; %3

cglobal vp9_loop_filter_%1_%2_%3, 5, %%num_gpr_regs, %%num_xmm_regs, %%stack_mem, dst, stride, E, I, H
    ; prepare E, I and H masks
    shl                 Ed, %3-8
    shl                 Id, %3-8
    shl                 Hd, %3-8
%if cpuflag(ssse3)
    mova                m0, [pw_256]
%endif
    movd                m1, Ed
    movd                m2, Id
    movd                m3, Hd
%if cpuflag(ssse3)
    pshufb              m1, m0                      ; E << (bit_depth - 8)
    pshufb              m2, m0                      ; I << (bit_depth - 8)
    pshufb              m3, m0                      ; H << (bit_depth - 8)
%else
    punpcklwd           m1, m1
    punpcklwd           m2, m2
    punpcklwd           m3, m3
    pshufd              m1, m1, q0000
    pshufd              m2, m2, q0000
    pshufd              m3, m3, q0000
%endif
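; (Note on the ssse3 path above: pw_256 is 0x0100 in every word, so used as a
; pshufb control it replicates source bytes 0-1, broadcasting the low word of
; E/I/H across the whole register in one instruction; the sse2 path needs the
; punpcklwd+pshufd pair to do the same.)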
    SCRATCH              1,  8, rsp+(%%off+0)*mmsize,  E
    SCRATCH              2,  9, rsp+(%%off+1)*mmsize,  I
    SCRATCH              3, 10, rsp+(%%off+2)*mmsize,  H
%if %2 > 4
    PRELOAD                 11, pw_ %+ %%maxf, F
%endif

    ; set up variables to load data
%ifidn %1, v
    DEFINE_ARGS dst8, stride, stride3, dst0, dst4, dst12
    lea           stride3q, [strideq*3]
    neg            strideq
%if %2 == 16
    lea              dst0q, [dst8q+strideq*8]
%else
    lea              dst4q, [dst8q+strideq*4]
%endif
    neg            strideq
%if %2 == 16
    lea             dst12q, [dst8q+strideq*4]
    lea              dst4q, [dst0q+strideq*4]
%endif

%if %2 == 16
%define %%p7 dst0q
%define %%p6 dst0q+strideq
%define %%p5 dst0q+strideq*2
%define %%p4 dst0q+stride3q
%endif
%define %%p3 dst4q
%define %%p2 dst4q+strideq
%define %%p1 dst4q+strideq*2
%define %%p0 dst4q+stride3q
%define %%q0 dst8q
%define %%q1 dst8q+strideq
%define %%q2 dst8q+strideq*2
%define %%q3 dst8q+stride3q
%if %2 == 16
%define %%q4 dst12q
%define %%q5 dst12q+strideq
%define %%q6 dst12q+strideq*2
%define %%q7 dst12q+stride3q
%endif
%else ; %1 == h
    DEFINE_ARGS dst0, stride, stride3, dst4
    lea           stride3q, [strideq*3]
    lea              dst4q, [dst0q+strideq*4]

%define %%p3 rsp+(%%tspoff+0)*mmsize
%define %%p2 rsp+(%%tspoff+1)*mmsize
%define %%p1 rsp+(%%tspoff+2)*mmsize
%define %%p0 rsp+(%%tspoff+3)*mmsize
%define %%q0 rsp+(%%tspoff+4)*mmsize
%define %%q1 rsp+(%%tspoff+5)*mmsize
%define %%q2 rsp+(%%tspoff+6)*mmsize
%define %%q3 rsp+(%%tspoff+7)*mmsize

%if %2 < 16
    movu                m0, [dst0q+strideq*0-8]
    movu                m1, [dst0q+strideq*1-8]
    movu                m2, [dst0q+strideq*2-8]
    movu                m3, [dst0q+stride3q -8]
    movu                m4, [dst4q+strideq*0-8]
    movu                m5, [dst4q+strideq*1-8]
    movu                m6, [dst4q+strideq*2-8]
    movu                m7, [dst4q+stride3q -8]

%if ARCH_X86_64
    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, 12
%else
    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, [%%p0], [%%q0]
%endif

    mova            [%%p3], m0
    mova            [%%p2], m1
    mova            [%%p1], m2
    mova            [%%p0], m3
%if ARCH_X86_64
    mova            [%%q0], m4
%endif
    mova            [%%q1], m5
    mova            [%%q2], m6
    mova            [%%q3], m7

    ; FIXME investigate if we can _not_ load q0-3 below if h, and adjust register
    ; order here accordingly
%else ; %2 == 16

%define %%p7 rsp+(%%tspoff+ 8)*mmsize
%define %%p6 rsp+(%%tspoff+ 9)*mmsize
%define %%p5 rsp+(%%tspoff+10)*mmsize
%define %%p4 rsp+(%%tspoff+11)*mmsize
%define %%q4 rsp+(%%tspoff+12)*mmsize
%define %%q5 rsp+(%%tspoff+13)*mmsize
%define %%q6 rsp+(%%tspoff+14)*mmsize
%define %%q7 rsp+(%%tspoff+15)*mmsize

    mova                m0, [dst0q+strideq*0-16]
    mova                m1, [dst0q+strideq*1-16]
    mova                m2, [dst0q+strideq*2-16]
    mova                m3, [dst0q+stride3q -16]
    mova                m4, [dst4q+strideq*0-16]
    mova                m5, [dst4q+strideq*1-16]
%if ARCH_X86_64
    mova                m6, [dst4q+strideq*2-16]
%endif
    mova                m7, [dst4q+stride3q -16]

%if ARCH_X86_64
    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, 12
%else
    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2-16], [%%p3], 1
%endif

    mova            [%%p7], m0
    mova            [%%p6], m1
    mova            [%%p5], m2
    mova            [%%p4], m3
%if ARCH_X86_64
    mova            [%%p3], m4
%endif
    mova            [%%p2], m5
    mova            [%%p1], m6
    mova            [%%p0], m7

    mova                m0, [dst0q+strideq*0]
    mova                m1, [dst0q+strideq*1]
    mova                m2, [dst0q+strideq*2]
    mova                m3, [dst0q+stride3q ]
    mova                m4, [dst4q+strideq*0]
    mova                m5, [dst4q+strideq*1]
%if ARCH_X86_64
    mova                m6, [dst4q+strideq*2]
%endif
    mova                m7, [dst4q+stride3q ]

%if ARCH_X86_64
    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, 12
%else
    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2], [%%q4], 1
%endif

    mova            [%%q0], m0
    mova            [%%q1], m1
    mova            [%%q2], m2
    mova            [%%q3], m3
%if ARCH_X86_64
    mova            [%%q4], m4
%endif
    mova            [%%q5], m5
    mova            [%%q6], m6
    mova            [%%q7], m7

    ; FIXME investigate if we can _not_ load q0|q4-7 below if h, and adjust register
    ; order here accordingly
%endif ; %2
%endif ; %1

    ; load q0|q4-7 data
    mova                m0, [%%q0]
%if %2 == 16
    mova                m4, [%%q4]
    mova                m5, [%%q5]
    mova                m6, [%%q6]
    mova                m7, [%%q7]

    ; flat8out q portion
    FLAT8OUT_HALF
    SCRATCH              7, 15, rsp+(%%off+6)*mmsize, F8O
%endif

    ; load q1-3 data
    mova                m1, [%%q1]
    mova                m2, [%%q2]
    mova                m3, [%%q3]

    ; r6-8|pw_4[m8-11]=reg_E/I/H/F
    ; r9[m15]=!flatout[q]
    ; m12-14=free
    ; m0-3=q0-q3
    ; m4-7=free

    ; flat8in|fm|hev q portion
    FLAT8IN_HALF        %2
    SCRATCH              7, 13, rsp+(%%off+4)*mmsize, HEV
%if %2 > 4
    SCRATCH              4, 14, rsp+(%%off+5)*mmsize, F8I
%endif

    ; r6-8|pw_4[m8-11]=reg_E/I/H/F
    ; r9[m15]=!flat8out[q]
    ; r10[m13]=hev[q]
    ; r11[m14]=!flat8in[q]
    ; m2=!fm[q]
    ; m0,1=q0-q1
    ; m2-7=free
    ; m12=free

    ; load p0-1
    mova                m3, [%%p0]
    mova                m4, [%%p1]

    ; fm mb_edge portion
    psubw               m5, m3, m0                  ; q0-p0
    psubw               m6, m4, m1                  ; q1-p1
%if ARCH_X86_64
    ABS2                m5, m6, m7, m12             ; abs(q0-p0) | abs(q1-p1)
%else
    ABS1                m5, m7                      ; abs(q0-p0)
    ABS1                m6, m7                      ; abs(q1-p1)
%endif
    paddw               m5, m5
    psraw               m6, 1
    paddw               m6, m5                      ; abs(q0-p0)*2+(abs(q1-p1)>>1)
    pcmpgtw             m6, reg_E
    por                 m2, m6
    SCRATCH              2, 12, rsp+(%%off+3)*mmsize, FM

    ; r6-8|pw_4[m8-11]=reg_E/I/H/F
    ; r9[m15]=!flat8out[q]
    ; r10[m13]=hev[q]
    ; r11[m14]=!flat8in[q]
    ; r12[m12]=!fm[q]
    ; m3-4=p0-1
    ; m0-2/5-7=free

    ; load p4-7 data
    SWAP                 3, 0                       ; p0
    SWAP                 4, 1                       ; p1
%if %2 == 16
    mova                m7, [%%p7]
    mova                m6, [%%p6]
    mova                m5, [%%p5]
    mova                m4, [%%p4]

    ; flat8out p portion
    FLAT8OUT_HALF
    por                 m7, reg_F8O
    SCRATCH              7, 15, rsp+(%%off+6)*mmsize, F8O
%endif

    ; r6-8|pw_4[m8-11]=reg_E/I/H/F
    ; r9[m15]=!flat8out
    ; r10[m13]=hev[q]
    ; r11[m14]=!flat8in[q]
    ; r12[m12]=!fm[q]
    ; m0=p0
    ; m1-7=free

    ; load p2-3 data
    mova                m2, [%%p2]
    mova                m3, [%%p3]

    ; flat8in|fm|hev p portion
    FLAT8IN_HALF        %2
    por                 m7, reg_HEV
%if %2 > 4
    por                 m4, reg_F8I
%endif
    por                 m2, reg_FM
%if %2 > 4
    por                 m4, m2                      ; !flat8|!fm
%if %2 == 16
    por                 m5, m4, reg_F8O             ; !flat16|!fm
    pandn               m2, m4                      ; filter4_mask
    pandn               m4, m5                      ; filter8_mask
    pxor                m5, [pw_m1]                 ; filter16_mask
    SCRATCH              5, 15, rsp+(%%off+6)*mmsize, F16M
%else
    pandn               m2, m4                      ; filter4_mask
    pxor                m4, [pw_m1]                 ; filter8_mask
%endif
    SCRATCH              4, 14, rsp+(%%off+5)*mmsize, F8M
%else
    pxor                m2, [pw_m1]                 ; filter4_mask
%endif
    SCRATCH              7, 13, rsp+(%%off+4)*mmsize, HEV
    SCRATCH              2, 12, rsp+(%%off+3)*mmsize, F4M

    ; r9[m15]=filter16_mask
    ; r10[m13]=hev
    ; r11[m14]=filter8_mask
    ; r12[m12]=filter4_mask
    ; m0,1=p0-p1
    ; m2-7=free
    ; m8-11=free

%if %2 > 4
%if %2 == 16
    ; filter_14
    mova                m2, [%%p7]
    mova                m3, [%%p6]
    mova                m6, [%%p5]
    mova                m7, [%%p4]
    PRELOAD              8, %%p3, P3
    PRELOAD              9, %%p2, P2
%endif
    PRELOAD             10, %%q0, Q0
    PRELOAD             11, %%q1, Q1
%if %2 == 16
    psllw               m4, m2, 3
    paddw               m5, m3, m3
    paddw               m4, m6
    paddw               m5, m7
    paddw               m4, reg_P3
    paddw               m5, reg_P2
    paddw               m4, m1
    paddw               m5, m0
    paddw               m4, reg_Q0                  ; q0+p1+p3+p5+p7*8
    psubw               m5, m2                      ; p0+p2+p4+p6*2-p7
    paddw               m4, [pw_8]
    paddw               m5, m4                      ; q0+p0+p1+p2+p3+p4+p5+p6*2+p7*7+8

    ; below, we use r0-5 for storing pre-filter pixels for subsequent subtraction
    ; at the end of the filter

    mova    [rsp+0*mmsize], m3
    FILTER_STEP         m4, m5, F16M, 4, %%p6, m3,     m2,             m6,     reg_Q1
%endif
    mova                m3, [%%q2]
%if %2 == 16
    mova    [rsp+1*mmsize], m6
    FILTER_STEP         m4, m5, F16M, 4, %%p5, m6,     m2,             m7,     m3
%endif
    mova                m6, [%%q3]
%if %2 == 16
    mova    [rsp+2*mmsize], m7
    FILTER_STEP         m4, m5, F16M, 4, %%p4, m7,     m2,             reg_P3, m6
    mova                m7, [%%q4]
%if ARCH_X86_64
    mova    [rsp+3*mmsize], reg_P3
%else
    mova                m4, reg_P3
    mova    [rsp+3*mmsize], m4
%endif
    FILTER_STEP         m4, m5, F16M, 4, %%p3, reg_P3, m2,             reg_P2, m7
    PRELOAD              8, %%q5, Q5
%if ARCH_X86_64
    mova    [rsp+4*mmsize], reg_P2
%else
    mova                m4, reg_P2
    mova    [rsp+4*mmsize], m4
%endif
    FILTER_STEP         m4, m5, F16M, 4, %%p2, reg_P2, m2,             m1,     reg_Q5
    PRELOAD              9, %%q6, Q6
    mova    [rsp+5*mmsize], m1
    FILTER_STEP         m4, m5, F16M, 4, %%p1, m1,     m2,             m0,     reg_Q6
    mova                m1, [%%q7]
    FILTER_STEP         m4, m5, F16M, 4, %%p0, m0,     m2,             reg_Q0, m1,     1
    FILTER_STEP         m4, m5, F16M, 4, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m1,     ARCH_X86_64
    FILTER_STEP         m4, m5, F16M, 4, %%q1, reg_Q1, [rsp+1*mmsize], m3,     m1,     ARCH_X86_64
    FILTER_STEP         m4, m5, F16M, 4, %%q2, m3,     [rsp+2*mmsize], m6,     m1,     1
    FILTER_STEP         m4, m5, F16M, 4, %%q3, m6,     [rsp+3*mmsize], m7,     m1
    FILTER_STEP         m4, m5, F16M, 4, %%q4, m7,     [rsp+4*mmsize], reg_Q5, m1
    FILTER_STEP         m4, m5, F16M, 4, %%q5, reg_Q5, [rsp+5*mmsize], reg_Q6, m1
    FILTER_STEP         m4, m5, F16M, 4, %%q6, reg_Q6

    mova                m7, [%%p1]
%else
    SWAP                 1, 7
%endif

    mova                m2, [%%p3]
    mova                m1, [%%p2]

    ; reg_Q0-1 (m10-m11)
    ; m0=p0
    ; m1=p2
    ; m2=p3
    ; m3=q2
    ; m4-5=free
    ; m6=q3
    ; m7=p1
    ; m8-9 unused

    ; filter_6
    psllw               m4, m2, 2
    paddw               m5, m1, m1
    paddw               m4, m7
    psubw               m5, m2
    paddw               m4, m0
    paddw               m5, reg_Q0
    paddw               m4, [pw_4]
    paddw               m5, m4

%if ARCH_X86_64
    mova                m8, m1
    mova                m9, m7
%else
    mova    [rsp+0*mmsize], m1
    mova    [rsp+1*mmsize], m7
%endif
%ifidn %1, v
    FILTER_STEP         m4, m5, F8M, 3, %%p2, m1,     m2,             m7,     reg_Q1
%else
    FILTER_STEP         m4, m5, F8M, 3, %%p2, m1,     m2,             m7,     reg_Q1, 1
%endif
    FILTER_STEP         m4, m5, F8M, 3, %%p1, m7,     m2,             m0,     m3, 1
    FILTER_STEP         m4, m5, F8M, 3, %%p0, m0,     m2,             reg_Q0, m6, 1
%if ARCH_X86_64
    FILTER_STEP         m4, m5, F8M, 3, %%q0, reg_Q0, m8,             reg_Q1, m6, ARCH_X86_64
    FILTER_STEP         m4, m5, F8M, 3, %%q1, reg_Q1, m9,             m3,     m6, ARCH_X86_64
%else
    FILTER_STEP         m4, m5, F8M, 3, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m6, ARCH_X86_64
    FILTER_STEP         m4, m5, F8M, 3, %%q1, reg_Q1, [rsp+1*mmsize], m3,     m6, ARCH_X86_64
%endif
    FILTER_STEP         m4, m5, F8M, 3, %%q2, m3

    UNSCRATCH            2, 10, %%q0
    UNSCRATCH            6, 11, %%q1
%else
    SWAP                 1, 7
    mova                m2, [%%q0]
    mova                m6, [%%q1]
%endif
    UNSCRATCH            3, 13, rsp+(%%off+4)*mmsize, HEV

    ; m0=p0
    ; m1=p2
    ; m2=q0
    ; m3=hev_mask
    ; m4-5=free
    ; m6=q1
    ; m7=p1

    ; filter_4
    psubw               m4, m7, m6              ; p1-q1
    psubw               m5, m2, m0              ; q0-p0
    pand                m4, m3
    pminsw              m4, [pw_ %+ %%maxsgn]
    pmaxsw              m4, [pw_ %+ %%minsgn]   ; clip_intp2(p1-q1, 9) -> f
    paddw               m4, m5
    paddw               m5, m5
    paddw               m4, m5                  ; 3*(q0-p0)+f
    pminsw              m4, [pw_ %+ %%maxsgn]
    pmaxsw              m4, [pw_ %+ %%minsgn]   ; clip_intp2(3*(q0-p0)+f, 9) -> f
    pand                m4, reg_F4M
    paddw               m5, m4, [pw_4]
    paddw               m4, [pw_3]
    pminsw              m5, [pw_ %+ %%maxsgn]
    pminsw              m4, [pw_ %+ %%maxsgn]
    psraw               m5, 3                   ; min_intp2(f+4, 9)>>3 -> f1
    psraw               m4, 3                   ; min_intp2(f+3, 9)>>3 -> f2
    psubw               m2, m5                  ; q0-f1
    paddw               m0, m4                  ; p0+f2
    pandn               m3, m5                  ; f1 & !hev (for p1/q1 adj)
    pxor                m4, m4
    mova                m5, [pw_ %+ %%maxusgn]
    pmaxsw              m2, m4
    pmaxsw              m0, m4
    pminsw              m2, m5
    pminsw              m0, m5
%if cpuflag(ssse3)
    pmulhrsw            m3, [pw_16384]          ; (f1+1)>>1
%else
    paddw               m3, [pw_1]
    psraw               m3, 1
%endif
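; (Why pmulhrsw works here: it computes (a*b + 0x4000) >> 15, so multiplying
; by 16384 yields (a+1) >> 1, the same rounded halving as the paddw/psraw
; pair on the sse2 path, in a single instruction.)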
    paddw               m7, m3                  ; p1+f
    psubw               m6, m3                  ; q1-f
    pmaxsw              m7, m4
    pmaxsw              m6, m4
    pminsw              m7, m5
    pminsw              m6, m5

    ; store
%ifidn %1, v
    mova            [%%p1], m7
    mova            [%%p0], m0
    mova            [%%q0], m2
    mova            [%%q1], m6
%else ; %1 == h
%if %2 == 4
    TRANSPOSE4x4W        7, 0, 2, 6, 1
    movh   [dst0q+strideq*0-4], m7
    movhps [dst0q+strideq*1-4], m7
    movh   [dst0q+strideq*2-4], m0
    movhps [dst0q+stride3q -4], m0
    movh   [dst4q+strideq*0-4], m2
    movhps [dst4q+strideq*1-4], m2
    movh   [dst4q+strideq*2-4], m6
    movhps [dst4q+stride3q -4], m6
%elif %2 == 8
    mova                m3, [%%p3]
    mova                m4, [%%q2]
    mova                m5, [%%q3]

%if ARCH_X86_64
    TRANSPOSE8x8W        3, 1, 7, 0, 2, 6, 4, 5, 8
%else
    TRANSPOSE8x8W        3, 1, 7, 0, 2, 6, 4, 5, [%%q2], [%%q0], 1
    mova                m2, [%%q0]
%endif

    movu [dst0q+strideq*0-8], m3
    movu [dst0q+strideq*1-8], m1
    movu [dst0q+strideq*2-8], m7
    movu [dst0q+stride3q -8], m0
    movu [dst4q+strideq*0-8], m2
    movu [dst4q+strideq*1-8], m6
    movu [dst4q+strideq*2-8], m4
    movu [dst4q+stride3q -8], m5
%else ; %2 == 16
    SCRATCH              2, 8, %%q0
    SCRATCH              6, 9, %%q1
    mova                m2, [%%p7]
    mova                m3, [%%p6]
    mova                m4, [%%p5]
    mova                m5, [%%p4]
    mova                m6, [%%p3]

%if ARCH_X86_64
    TRANSPOSE8x8W        2, 3, 4, 5, 6, 1, 7, 0, 10
%else
    mova            [%%p1], m7
    TRANSPOSE8x8W        2, 3, 4, 5, 6, 1, 7, 0, [%%p1], [dst4q+strideq*0-16], 1
%endif

    mova [dst0q+strideq*0-16], m2
    mova [dst0q+strideq*1-16], m3
    mova [dst0q+strideq*2-16], m4
    mova [dst0q+stride3q -16], m5
%if ARCH_X86_64
    mova [dst4q+strideq*0-16], m6
%endif
    mova [dst4q+strideq*1-16], m1
    mova [dst4q+strideq*2-16], m7
    mova [dst4q+stride3q -16], m0

    UNSCRATCH            2, 8, %%q0
    UNSCRATCH            6, 9, %%q1
    mova                m0, [%%q2]
    mova                m1, [%%q3]
    mova                m3, [%%q4]
    mova                m4, [%%q5]
%if ARCH_X86_64
    mova                m5, [%%q6]
%endif
    mova                m7, [%%q7]

%if ARCH_X86_64
    TRANSPOSE8x8W        2, 6, 0, 1, 3, 4, 5, 7, 8
%else
    TRANSPOSE8x8W        2, 6, 0, 1, 3, 4, 5, 7, [%%q6], [dst4q+strideq*0], 1
%endif

    mova [dst0q+strideq*0], m2
    mova [dst0q+strideq*1], m6
    mova [dst0q+strideq*2], m0
    mova [dst0q+stride3q ], m1
%if ARCH_X86_64
    mova [dst4q+strideq*0], m3
%endif
    mova [dst4q+strideq*1], m4
    mova [dst4q+strideq*2], m5
    mova [dst4q+stride3q ], m7
%endif ; %2
%endif ; %1
    RET
%endmacro

%macro LOOP_FILTER_CPUSETS 3
INIT_XMM sse2
LOOP_FILTER %1, %2, %3
INIT_XMM ssse3
LOOP_FILTER %1, %2, %3
INIT_XMM avx
LOOP_FILTER %1, %2, %3
%endmacro

%macro LOOP_FILTER_WDSETS 2
LOOP_FILTER_CPUSETS %1,  4, %2
LOOP_FILTER_CPUSETS %1,  8, %2
LOOP_FILTER_CPUSETS %1, 16, %2
%endmacro

LOOP_FILTER_WDSETS h, 10
LOOP_FILTER_WDSETS v, 10
LOOP_FILTER_WDSETS h, 12
LOOP_FILTER_WDSETS v, 12
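
; The instantiations above emit one function per (dir, wd, bpp, cpuset)
; combination. Assuming the default "ff_" symbol prefix applied by
; x86inc's cglobal, the exported names have the form
; ff_vp9_loop_filter_{h,v}_{4,8,16}_{10,12}_{sse2,ssse3,avx},
; e.g. ff_vp9_loop_filter_v_16_12_avx.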