tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

aom_high_subpixel_8t_sse2.asm (16444B)


      1 ;
      2 ; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 ;
      4 ; This source code is subject to the terms of the BSD 2 Clause License and
      5 ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 ; was not distributed with this source code in the LICENSE file, you can
      7 ; obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 ; Media Patent License 1.0 was not distributed with this source code in the
      9 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 ;
     11 
     12 ;
     13 
     14 
     15 %include "aom_ports/x86_abi_support.asm"
     16 
     17 ;Note: tap3 and tap4 have to be applied and added after other taps to avoid
     18 ;overflow.
     19 
     20 %macro HIGH_GET_FILTERS_4 0
        ; Builds the stack constants consumed by HIGH_APPLY_FILTER_4:
        ;   k0k6, k1k7, k2k5, k3k4 - coefficient pairs repeated in all four dwords
        ;   krd                    - rounding term 64 in all four dwords
        ;   max, min               - clamp bounds (1 << bd) - 1 and 0 in every word
        ; Reads arg(5) (filter pointer) and arg(6) (bit depth). The slot names
        ; are not defined here; the invoking function has to define them first.
        ; Clobbers rcx, rdx and xmm0-xmm7.
     21    mov         rdx, arg(5)                 ;filter ptr
     22    mov         rcx, 0x00000040             ;rounding term 64, half of 1 << 7
     23 
     24    movdqa      xmm7, [rdx]                 ;load all 8 coefficients (aligned load, so filter must be 16-byte aligned)
     25    pshuflw     xmm0, xmm7, 0b              ;k0 in the low four words
     26    pshuflw     xmm1, xmm7, 01010101b       ;k1
     27    pshuflw     xmm2, xmm7, 10101010b       ;k2
     28    pshuflw     xmm3, xmm7, 11111111b       ;k3
     29    psrldq      xmm7, 8                     ;bring k4..k7 down into the low four words
     30    pshuflw     xmm4, xmm7, 0b              ;k4
     31    pshuflw     xmm5, xmm7, 01010101b       ;k5
     32    pshuflw     xmm6, xmm7, 10101010b       ;k6
     33    pshuflw     xmm7, xmm7, 11111111b       ;k7
     34 
     35    punpcklwd   xmm0, xmm6                  ;(k0,k6) in every dword
     36    punpcklwd   xmm2, xmm5                  ;(k2,k5) in every dword
     37    punpcklwd   xmm3, xmm4                  ;(k3,k4) in every dword
     38    punpcklwd   xmm1, xmm7                  ;(k1,k7) in every dword
     39 
     40    movdqa      k0k6, xmm0                  ;store the coefficient pairs on the stack
     41    movdqa      k2k5, xmm2
     42    movdqa      k3k4, xmm3
     43    movdqa      k1k7, xmm1
     44 
     45    movq        xmm6, rcx
     46    pshufd      xmm6, xmm6, 0               ;broadcast 64 to all four dwords
     47    movdqa      krd, xmm6                   ;rounding
     48 
     49    ;Compute max and min values of a pixel
     50    mov         rdx, 0x00010001             ;two words, each equal to 1
     51    movsxd      rcx, DWORD PTR arg(6)      ;bd (bit depth), used as the shift count
     52    movq        xmm0, rdx
     53    movq        xmm1, rcx
     54    pshufd      xmm0, xmm0, 0b              ;every word = 1
     55    movdqa      xmm2, xmm0
     56    psllw       xmm0, xmm1                  ;every word = 1 << bd
     57    psubw       xmm0, xmm2                  ;every word = (1 << bd) - 1
     58    pxor        xmm1, xmm1                  ;all zero
     59    movdqa      max, xmm0                  ;max value (for clamping)
     60    movdqa      min, xmm1                  ;min value (for clamping)
     61 
     62 %endm
     63 
     64 %macro HIGH_APPLY_FILTER_4 1
        ; Filters four pixels and stores them (8 bytes) at [rdi].
        ; Parameter 1: when non-zero, the result is averaged (pavgw) with the
        ;              four words already at [rdi] before it is stored.
        ; In:  xmm0..xmm7 hold, in their low four words, the samples for taps
        ;      0..7; the k0k6, k1k7, k2k5, k3k4, krd, max and min slots are the
        ;      ones filled by HIGH_GET_FILTERS_4.
        ; Clobbers xmm0-xmm3.
     65    punpcklwd   xmm0, xmm6                  ;pair tap-0 and tap-6 samples in each dword
     66    punpcklwd   xmm1, xmm7                  ;tap 1 with tap 7
     67    punpcklwd   xmm2, xmm5                  ;tap 2 with tap 5
     68    punpcklwd   xmm3, xmm4                  ;tap 3 with tap 4
     69 
     70    pmaddwd     xmm0, k0k6                  ;multiply the filter factors, each dword = sum of one pair
     71    pmaddwd     xmm1, k1k7
     72    pmaddwd     xmm2, k2k5
     73    pmaddwd     xmm3, k3k4
     74 
     75    paddd       xmm0, xmm1                  ;sum of all eight taps, 32 bits per pixel
     76    paddd       xmm0, xmm2
     77    paddd       xmm0, xmm3
     78 
     79    paddd       xmm0, krd                   ;rounding
     80    psrad       xmm0, 7                     ;shift
     81    packssdw    xmm0, xmm0                  ;pack to word (signed saturation)
     82 
     83    ;clamp the values to [min, max]
     84    pminsw      xmm0, max
     85    pmaxsw      xmm0, min
     86 
     87 %if %1
     88    movq        xmm1, [rdi]                 ;existing destination pixels
     89    pavgw       xmm0, xmm1                  ;rounded average with the new result
     90 %endif
     91    movq        [rdi], xmm0                 ;store four pixels
     92 %endm
     93 
     94 %macro HIGH_GET_FILTERS 0
        ; Counterpart of HIGH_GET_FILTERS_4 for HIGH_APPLY_FILTER_8.
        ; Loads rsi = arg(0) (src_ptr) and rdi = arg(2) (output_ptr), then fills
        ; the stack slots
        ;   k0k1, k6k7, k2k5, k3k4 - coefficient pairs repeated in all four dwords
        ;   krd                    - rounding term 64 in all four dwords
        ;   max, min               - clamp bounds (1 << bd) - 1 and 0 in every word
        ; Reads arg(5) (filter pointer) and arg(6) (bit depth). The slot names
        ; are not defined here; the invoking function has to define them first.
        ; Clobbers rcx, rdx, rsi, rdi and xmm0-xmm7.
     95    mov         rdx, arg(5)                 ;filter ptr
     96    mov         rsi, arg(0)                 ;src_ptr
     97    mov         rdi, arg(2)                 ;output_ptr
     98    mov         rcx, 0x00000040             ;rounding term 64, half of 1 << 7
     99 
    100    movdqa      xmm7, [rdx]                 ;load all 8 coefficients (aligned load, so filter must be 16-byte aligned)
    101    pshuflw     xmm0, xmm7, 0b              ;k0 in the low four words
    102    pshuflw     xmm1, xmm7, 01010101b       ;k1
    103    pshuflw     xmm2, xmm7, 10101010b       ;k2
    104    pshuflw     xmm3, xmm7, 11111111b       ;k3
    105    pshufhw     xmm4, xmm7, 0b              ;k4 in the high four words
    106    pshufhw     xmm5, xmm7, 01010101b       ;k5
    107    pshufhw     xmm6, xmm7, 10101010b       ;k6
    108    pshufhw     xmm7, xmm7, 11111111b       ;k7
    109    punpcklqdq  xmm2, xmm2                  ;k2 in all eight words
    110    punpcklqdq  xmm3, xmm3                  ;k3 in all eight words
    111    punpcklwd   xmm0, xmm1                  ;(k0,k1) in every dword
    112    punpckhwd   xmm6, xmm7                  ;(k6,k7) in every dword
    113    punpckhwd   xmm2, xmm5                  ;(k2,k5) in every dword
    114    punpckhwd   xmm3, xmm4                  ;(k3,k4) in every dword
    115 
    116    movdqa      k0k1, xmm0                  ;store filter factors on stack
    117    movdqa      k6k7, xmm6
    118    movdqa      k2k5, xmm2
    119    movdqa      k3k4, xmm3
    120 
    121    movq        xmm6, rcx
    122    pshufd      xmm6, xmm6, 0               ;broadcast 64 to all four dwords
    123    movdqa      krd, xmm6                   ;rounding
    124 
    125    ;Compute max and min values of a pixel
    126    mov         rdx, 0x00010001             ;two words, each equal to 1
    127    movsxd      rcx, DWORD PTR arg(6)       ;bd (bit depth), used as the shift count
    128    movq        xmm0, rdx
    129    movq        xmm1, rcx
    130    pshufd      xmm0, xmm0, 0b              ;every word = 1
    131    movdqa      xmm2, xmm0
    132    psllw       xmm0, xmm1                  ;every word = 1 << bd
    133    psubw       xmm0, xmm2                  ;every word = (1 << bd) - 1
    134    pxor        xmm1, xmm1                  ;all zero
    135    movdqa      max, xmm0                  ;max value (for clamping)
    136    movdqa      min, xmm1                  ;min value (for clamping)
    137 %endm
    138 
    139 %macro LOAD_VERT_8 1
        ; Loads eight consecutive source rows (16 bytes each, unaligned loads)
        ; into xmm0..xmm7, row n going to register xmm<n>.
        ; Parameter 1: byte offset added to every row address.
        ; In:  rsi = address of row 0, rax = row stride in bytes. The row 4, 6
        ;      and 7 addresses also rely on rdx = 3 * rax, which the invoking
        ;      functions set up with lea rdx, [rax + rax * 2].
        ; Out: rsi has been advanced by one row (rsi += rax).
    140    movdqu      xmm0, [rsi + %1]            ;row 0
    141    movdqu      xmm1, [rsi + rax + %1]      ;row 1
    142    movdqu      xmm6, [rsi + rdx * 2 + %1]  ;row 6 (6 * rax)
    143    lea         rsi,  [rsi + rax]           ;rsi now points at row 1
    144    movdqu      xmm7, [rsi + rdx * 2 + %1]  ;row 7 (row 1 + 6 * rax)
    145    movdqu      xmm2, [rsi + rax + %1]      ;row 2
    146    movdqu      xmm3, [rsi + rax * 2 + %1]  ;row 3
    147    movdqu      xmm4, [rsi + rdx + %1]      ;row 4 (row 1 + 3 * rax)
    148    movdqu      xmm5, [rsi + rax * 4 + %1]  ;row 5
    149 %endm
    150 
    151 %macro HIGH_APPLY_FILTER_8 2
        ; Filters eight pixels and stores them (16 bytes, unaligned store) at
        ; [rdi + parameter 2].
        ; Parameter 1: when non-zero, the result is averaged (pavgw) with the
        ;              eight words already at the destination before the store.
        ; Parameter 2: byte offset added to rdi for the destination.
        ; In:  xmm0..xmm7 each hold eight samples, register n feeding tap n;
        ;      the k0k1, k6k7, k2k5, k3k4, krd, max and min slots are the ones
        ;      filled by HIGH_GET_FILTERS. The temp slot is used as spill space
        ;      because all eight xmm registers are live.
        ; Clobbers xmm0-xmm7 and temp.
    152    movdqu      temp, xmm4                  ;spill tap 4 to free xmm4
    153    movdqa      xmm4, xmm0
    154    punpcklwd   xmm0, xmm1                  ;taps 0,1 interleaved, pixels 0-3
    155    punpckhwd   xmm4, xmm1                  ;taps 0,1 interleaved, pixels 4-7
    156    movdqa      xmm1, xmm6
    157    punpcklwd   xmm6, xmm7                  ;taps 6,7 interleaved, pixels 0-3
    158    punpckhwd   xmm1, xmm7                  ;taps 6,7 interleaved, pixels 4-7
    159    movdqa      xmm7, xmm2
    160    punpcklwd   xmm2, xmm5                  ;taps 2,5 interleaved, pixels 0-3
    161    punpckhwd   xmm7, xmm5                  ;taps 2,5 interleaved, pixels 4-7
    162 
    163    movdqu      xmm5, temp                  ;reload tap 4
    164    movdqu      temp, xmm4                  ;spill taps 0,1 (pixels 4-7)
    165    movdqa      xmm4, xmm3
    166    punpcklwd   xmm3, xmm5                  ;taps 3,4 interleaved, pixels 0-3
    167    punpckhwd   xmm4, xmm5                  ;taps 3,4 interleaved, pixels 4-7
    168    movdqu      xmm5, temp                  ;xmm5 = taps 0,1 (pixels 4-7)
    169 
    170    pmaddwd     xmm0, k0k1                  ;multiply by the coefficient pairs, each dword = sum of one pair
    171    pmaddwd     xmm5, k0k1
    172    pmaddwd     xmm6, k6k7
    173    pmaddwd     xmm1, k6k7
    174    pmaddwd     xmm2, k2k5
    175    pmaddwd     xmm7, k2k5
    176    pmaddwd     xmm3, k3k4
    177    pmaddwd     xmm4, k3k4
    178 
    179    paddd       xmm0, xmm6                  ;xmm0 = 8-tap sums for pixels 0-3
    180    paddd       xmm0, xmm2
    181    paddd       xmm0, xmm3
    182    paddd       xmm5, xmm1                  ;xmm5 = 8-tap sums for pixels 4-7
    183    paddd       xmm5, xmm7
    184    paddd       xmm5, xmm4
    185 
    186    paddd       xmm0, krd                   ;rounding
    187    paddd       xmm5, krd
    188    psrad       xmm0, 7                     ;shift
    189    psrad       xmm5, 7
    190    packssdw    xmm0, xmm5                  ;pack back to word (signed saturation)
    191 
    192    ;clamp the values to [min, max]
    193    pminsw      xmm0, max
    194    pmaxsw      xmm0, min
    195 
    196 %if %1
    197    movdqu      xmm1, [rdi + %2]            ;existing destination pixels
    198    pavgw       xmm0, xmm1                  ;rounded average with the new result
    199 %endif
    200    movdqu      [rdi + %2], xmm0            ;store eight pixels
    201 %endm
    202 
    203 SECTION .text
    204 
    205 ;void aom_highbd_filter_block1d4_v8_sse2
    206 ;(
    207 ;    const uint16_t  *src_ptr,
    208 ;    const ptrdiff_t  src_pitch,
    209 ;    uint16_t        *output_ptr,
    210 ;    ptrdiff_t        out_pitch,
    211 ;    unsigned int     output_height,
    212 ;    const int16_t   *filter,
    213 ;    int              bd
    214 ;)
        ;
        ; Vertical 8-tap filter, four pixels wide. For every output row the
        ; eight source rows starting at the current source row are combined
        ; with the eight coefficients, rounded (+64, >> 7), clamped to
        ; [0, (1 << bd) - 1] and written as four 16-bit pixels.
        ; src_pitch and out_pitch are doubled before being used as byte
        ; strides. filter is read with an aligned 16-byte load.
        ; The loop tests its counter after the body, so the body always runs
        ; at least once.
        ; NOTE(review): the pitches are declared ptrdiff_t above but are read
        ; as sign-extended DWORDs - confirm callers stay within the int range.
    215 globalsym(aom_highbd_filter_block1d4_v8_sse2)
    216 sym(aom_highbd_filter_block1d4_v8_sse2):
    217    push        rbp
    218    mov         rbp, rsp
    219    SHADOW_ARGS_TO_STACK 7
    220    SAVE_XMM 7
    221    push        rsi
    222    push        rdi
    223    push        rbx
    224    ; end prolog
    225 
    226    ALIGN_STACK 16, rax
    227    sub         rsp, 16 * 7                 ;seven 16-byte slots for the constants below
    228    %define k0k6 [rsp + 16 * 0]
    229    %define k2k5 [rsp + 16 * 1]
    230    %define k3k4 [rsp + 16 * 2]
    231    %define k1k7 [rsp + 16 * 3]
    232    %define krd [rsp + 16 * 4]
    233    %define max [rsp + 16 * 5]
    234    %define min [rsp + 16 * 6]
    235 
    236    HIGH_GET_FILTERS_4                      ;fill the slots defined above
    237 
    238    mov         rsi, arg(0)                 ;src_ptr
    239    mov         rdi, arg(2)                 ;output_ptr
    240 
    241    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    242    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    243    lea         rax, [rax + rax]            ;bytes per line
    244    lea         rbx, [rbx + rbx]            ;output bytes per line
    245    lea         rdx, [rax + rax * 2]        ;rdx = 3 * source bytes per line
    246    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    247 
    248 .loop:
    249    movq        xmm0, [rsi]                 ;load src: row 0
    250    movq        xmm1, [rsi + rax]           ;1
    251    movq        xmm6, [rsi + rdx * 2]       ;6
    252    lea         rsi,  [rsi + rax]           ;advance source by one row
    253    movq        xmm7, [rsi + rdx * 2]       ;7
    254    movq        xmm2, [rsi + rax]           ;2
    255    movq        xmm3, [rsi + rax * 2]       ;3
    256    movq        xmm4, [rsi + rdx]           ;4
    257    movq        xmm5, [rsi + rax * 4]       ;5
    258 
    259    HIGH_APPLY_FILTER_4 0                   ;filter and store four pixels, no averaging
    260 
    261    lea         rdi, [rdi + rbx]            ;next output row
    262    dec         rcx
    263    jnz         .loop
    264 
    265    add rsp, 16 * 7                         ;release the constant slots
    266    pop rsp                                 ;NOTE(review): presumably the rsp saved by ALIGN_STACK - confirm in x86_abi_support.asm
    267    pop rbx
    268    ; begin epilog
    269    pop rdi
    270    pop rsi
    271    RESTORE_XMM
    272    UNSHADOW_ARGS
    273    pop         rbp
    274    ret
    275 
    276 ;void aom_highbd_filter_block1d8_v8_sse2
    277 ;(
    278 ;    const uint16_t  *src_ptr,
    279 ;    const ptrdiff_t  src_pitch,
    280 ;    uint16_t        *output_ptr,
    281 ;    ptrdiff_t        out_pitch,
    282 ;    unsigned int     output_height,
    283 ;    const int16_t   *filter,
    284 ;    int              bd
    285 ;)
        ;
        ; Vertical 8-tap filter, eight pixels wide. For every output row the
        ; eight source rows starting at the current source row are combined
        ; with the eight coefficients, rounded (+64, >> 7), clamped to
        ; [0, (1 << bd) - 1] and written as eight 16-bit pixels.
        ; src_pitch and out_pitch are doubled before being used as byte
        ; strides. filter is read with an aligned 16-byte load.
        ; The loop tests its counter after the body, so the body always runs
        ; at least once.
        ; NOTE(review): the pitches are declared ptrdiff_t above but are read
        ; as sign-extended DWORDs - confirm callers stay within the int range.
    286 globalsym(aom_highbd_filter_block1d8_v8_sse2)
    287 sym(aom_highbd_filter_block1d8_v8_sse2):
    288    push        rbp
    289    mov         rbp, rsp
    290    SHADOW_ARGS_TO_STACK 7
    291    SAVE_XMM 7
    292    push        rsi
    293    push        rdi
    294    push        rbx
    295    ; end prolog
    296 
    297    ALIGN_STACK 16, rax
    298    sub         rsp, 16 * 8                 ;eight 16-byte slots for the constants and temp
    299    %define k0k1 [rsp + 16 * 0]
    300    %define k6k7 [rsp + 16 * 1]
    301    %define k2k5 [rsp + 16 * 2]
    302    %define k3k4 [rsp + 16 * 3]
    303    %define krd [rsp + 16 * 4]
    304    %define temp [rsp + 16 * 5]
    305    %define max [rsp + 16 * 6]
    306    %define min [rsp + 16 * 7]
    307 
    308    HIGH_GET_FILTERS                        ;fill the slots, also loads rsi = src_ptr and rdi = output_ptr
    309 
    310    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    311    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    312    lea         rax, [rax + rax]            ;bytes per line
    313    lea         rbx, [rbx + rbx]            ;output bytes per line
    314    lea         rdx, [rax + rax * 2]        ;rdx = 3 * source bytes per line, needed by LOAD_VERT_8
    315    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    316 
    317 .loop:
    318    LOAD_VERT_8 0                           ;rows 0..7 into xmm0..xmm7, rsi advances one row
    319    HIGH_APPLY_FILTER_8 0, 0                ;filter and store eight pixels, no averaging
    320 
    321    lea         rdi, [rdi + rbx]            ;next output row
    322    dec         rcx
    323    jnz         .loop
    324 
    325    add rsp, 16 * 8                         ;release the constant slots
    326    pop rsp                                 ;NOTE(review): presumably the rsp saved by ALIGN_STACK - confirm in x86_abi_support.asm
    327    pop rbx
    328    ; begin epilog
    329    pop rdi
    330    pop rsi
    331    RESTORE_XMM
    332    UNSHADOW_ARGS
    333    pop         rbp
    334    ret
    335 
    336 ;void aom_highbd_filter_block1d16_v8_sse2
    337 ;(
    338 ;    const uint16_t  *src_ptr,
    339 ;    const ptrdiff_t  src_pitch,
    340 ;    uint16_t        *output_ptr,
    341 ;    ptrdiff_t        out_pitch,
    342 ;    unsigned int     output_height,
    343 ;    const int16_t   *filter,
    344 ;    int              bd
    345 ;)
        ;
        ; Vertical 8-tap filter, sixteen pixels wide, processed as two halves
        ; of eight pixels (byte offsets 0 and 16) per output row. Each half
        ; combines eight source rows with the eight coefficients, rounds
        ; (+64, >> 7) and clamps to [0, (1 << bd) - 1].
        ; src_pitch and out_pitch are doubled before being used as byte
        ; strides. filter is read with an aligned 16-byte load.
        ; The loop tests its counter after the body, so the body always runs
        ; at least once.
        ; NOTE(review): the pitches are declared ptrdiff_t above but are read
        ; as sign-extended DWORDs - confirm callers stay within the int range.
    346 globalsym(aom_highbd_filter_block1d16_v8_sse2)
    347 sym(aom_highbd_filter_block1d16_v8_sse2):
    348    push        rbp
    349    mov         rbp, rsp
    350    SHADOW_ARGS_TO_STACK 7
    351    SAVE_XMM 7
    352    push        rsi
    353    push        rdi
    354    push        rbx
    355    ; end prolog
    356 
    357    ALIGN_STACK 16, rax
    358    sub         rsp, 16 * 8                 ;eight 16-byte slots for the constants and temp
    359    %define k0k1 [rsp + 16 * 0]
    360    %define k6k7 [rsp + 16 * 1]
    361    %define k2k5 [rsp + 16 * 2]
    362    %define k3k4 [rsp + 16 * 3]
    363    %define krd [rsp + 16 * 4]
    364    %define temp [rsp + 16 * 5]
    365    %define max [rsp + 16 * 6]
    366    %define min [rsp + 16 * 7]
    367 
    368    HIGH_GET_FILTERS                        ;fill the slots, also loads rsi = src_ptr and rdi = output_ptr
    369 
    370    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    371    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    372    lea         rax, [rax + rax]            ;bytes per line
    373    lea         rbx, [rbx + rbx]            ;output bytes per line
    374    lea         rdx, [rax + rax * 2]        ;rdx = 3 * source bytes per line, needed by LOAD_VERT_8
    375    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    376 
    377 .loop:
    378    LOAD_VERT_8 0                           ;left half: rows 0..7, rsi advances one row
    379    HIGH_APPLY_FILTER_8 0, 0                ;store pixels 0-7
    380    sub         rsi, rax                    ;undo the row advance so the right half reads the same rows
    381 
    382    LOAD_VERT_8 16                          ;right half: same rows at byte offset 16, rsi advances one row
    383    HIGH_APPLY_FILTER_8 0, 16               ;store pixels 8-15
    384    add         rdi, rbx                    ;next output row
    385 
    386    dec         rcx
    387    jnz         .loop
    388 
    389    add rsp, 16 * 8                         ;release the constant slots
    390    pop rsp                                 ;NOTE(review): presumably the rsp saved by ALIGN_STACK - confirm in x86_abi_support.asm
    391    pop rbx
    392    ; begin epilog
    393    pop rdi
    394    pop rsi
    395    RESTORE_XMM
    396    UNSHADOW_ARGS
    397    pop         rbp
    398    ret
    399 
    400 ;void aom_highbd_filter_block1d4_h8_sse2
    401 ;(
    402 ;    const uint16_t  *src_ptr,
    403 ;    const ptrdiff_t  src_pitch,
    404 ;    uint16_t        *output_ptr,
    405 ;    ptrdiff_t        out_pitch,
    406 ;    unsigned int     output_height,
    407 ;    const int16_t   *filter,
    408 ;    int              bd
    409 ;)
        ;
        ; Horizontal 8-tap filter, four pixels wide. For every row the eight
        ; taps are taken from the samples at src - 3 .. src + 4 (relative to
        ; each output pixel), combined with the eight coefficients, rounded
        ; (+64, >> 7), clamped to [0, (1 << bd) - 1] and written as four
        ; 16-bit pixels.
        ; Each row is fetched with two unaligned 16-byte loads covering bytes
        ; rsi - 6 .. rsi + 17, so that whole range has to be readable.
        ; src_pitch and out_pitch are doubled before being used as byte
        ; strides. filter is read with an aligned 16-byte load.
        ; The loop tests its counter after the body, so the body always runs
        ; at least once.
        ; NOTE(review): the pitches are declared ptrdiff_t above but are read
        ; as sign-extended DWORDs - confirm callers stay within the int range.
    410 globalsym(aom_highbd_filter_block1d4_h8_sse2)
    411 sym(aom_highbd_filter_block1d4_h8_sse2):
    412    push        rbp
    413    mov         rbp, rsp
    414    SHADOW_ARGS_TO_STACK 7
    415    SAVE_XMM 7
    416    push        rsi
    417    push        rdi
    418    ; end prolog
    419 
    420    ALIGN_STACK 16, rax
    421    sub         rsp, 16 * 7                 ;seven 16-byte slots for the constants below
    422    %define k0k6 [rsp + 16 * 0]
    423    %define k2k5 [rsp + 16 * 1]
    424    %define k3k4 [rsp + 16 * 2]
    425    %define k1k7 [rsp + 16 * 3]
    426    %define krd [rsp + 16 * 4]
    427    %define max [rsp + 16 * 5]
    428    %define min [rsp + 16 * 6]
    429 
    430    HIGH_GET_FILTERS_4                      ;fill the slots defined above
    431 
    432    mov         rsi, arg(0)                 ;src_ptr
    433    mov         rdi, arg(2)                 ;output_ptr
    434 
    435    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    436    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    437    lea         rax, [rax + rax]            ;bytes per line
    438    lea         rdx, [rdx + rdx]            ;output bytes per line
    439    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    440 
    441 .loop:
    442    movdqu      xmm0,   [rsi - 6]           ;load src: tap 0 starts three samples to the left
    443    movdqu      xmm4,   [rsi + 2]           ;tap 4 starts one sample to the right
    444    movdqa      xmm1, xmm0
    445    movdqa      xmm6, xmm4
    446    movdqa      xmm7, xmm4
    447    movdqa      xmm2, xmm0
    448    movdqa      xmm3, xmm0
    449    movdqa      xmm5, xmm4
    450 
    451    psrldq      xmm1, 2                     ;tap 1 = samples from rsi - 4
    452    psrldq      xmm6, 4                     ;tap 6 = samples from rsi + 6
    453    psrldq      xmm7, 6                     ;tap 7 = samples from rsi + 8
    454    psrldq      xmm2, 4                     ;tap 2 = samples from rsi - 2
    455    psrldq      xmm3, 6                     ;tap 3 = samples from rsi
    456    psrldq      xmm5, 2                     ;tap 5 = samples from rsi + 4
    457 
    458    HIGH_APPLY_FILTER_4 0                   ;filter and store four pixels, no averaging
    459 
    460    lea         rsi, [rsi + rax]            ;next source row
    461    lea         rdi, [rdi + rdx]            ;next output row
    462    dec         rcx
    463    jnz         .loop
    464 
    465    add rsp, 16 * 7                         ;release the constant slots
    466    pop rsp                                 ;NOTE(review): presumably the rsp saved by ALIGN_STACK - confirm in x86_abi_support.asm
    467 
    468    ; begin epilog
    469    pop rdi
    470    pop rsi
    471    RESTORE_XMM
    472    UNSHADOW_ARGS
    473    pop         rbp
    474    ret
    475 
    476 ;void aom_highbd_filter_block1d8_h8_sse2
    477 ;(
    478 ;    const uint16_t  *src_ptr,
    479 ;    const ptrdiff_t  src_pitch,
    480 ;    uint16_t        *output_ptr,
    481 ;    ptrdiff_t        out_pitch,
    482 ;    unsigned int     output_height,
    483 ;    const int16_t   *filter,
    484 ;    int              bd
    485 ;)
        ;
        ; Horizontal 8-tap filter, eight pixels wide. For every row the eight
        ; taps are taken from the samples at src - 3 .. src + 4 (relative to
        ; each output pixel), combined with the eight coefficients, rounded
        ; (+64, >> 7), clamped to [0, (1 << bd) - 1] and written as eight
        ; 16-bit pixels.
        ; Each row is fetched with eight unaligned 16-byte loads covering
        ; bytes rsi - 6 .. rsi + 23, so that whole range has to be readable.
        ; src_pitch and out_pitch are doubled before being used as byte
        ; strides. filter is read with an aligned 16-byte load.
        ; The loop tests its counter after the body, so the body always runs
        ; at least once.
        ; NOTE(review): the pitches are declared ptrdiff_t above but are read
        ; as sign-extended DWORDs - confirm callers stay within the int range.
    486 globalsym(aom_highbd_filter_block1d8_h8_sse2)
    487 sym(aom_highbd_filter_block1d8_h8_sse2):
    488    push        rbp
    489    mov         rbp, rsp
    490    SHADOW_ARGS_TO_STACK 7
    491    SAVE_XMM 7
    492    push        rsi
    493    push        rdi
    494    ; end prolog
    495 
    496    ALIGN_STACK 16, rax
    497    sub         rsp, 16 * 8                 ;eight 16-byte slots for the constants and temp
    498    %define k0k1 [rsp + 16 * 0]
    499    %define k6k7 [rsp + 16 * 1]
    500    %define k2k5 [rsp + 16 * 2]
    501    %define k3k4 [rsp + 16 * 3]
    502    %define krd [rsp + 16 * 4]
    503    %define temp [rsp + 16 * 5]
    504    %define max [rsp + 16 * 6]
    505    %define min [rsp + 16 * 7]
    506 
    507    HIGH_GET_FILTERS                        ;fill the slots, also loads rsi = src_ptr and rdi = output_ptr
    508 
    509    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    510    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    511    lea         rax, [rax + rax]            ;bytes per line
    512    lea         rdx, [rdx + rdx]            ;output bytes per line
    513    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    514 
    515 .loop:
    516    movdqu      xmm0,   [rsi - 6]           ;load src: tap 0, three samples to the left
    517    movdqu      xmm1,   [rsi - 4]           ;tap 1
    518    movdqu      xmm2,   [rsi - 2]           ;tap 2
    519    movdqu      xmm3,   [rsi]               ;tap 3
    520    movdqu      xmm4,   [rsi + 2]           ;tap 4
    521    movdqu      xmm5,   [rsi + 4]           ;tap 5
    522    movdqu      xmm6,   [rsi + 6]           ;tap 6
    523    movdqu      xmm7,   [rsi + 8]           ;tap 7, four samples to the right
    524 
    525    HIGH_APPLY_FILTER_8 0, 0                ;filter and store eight pixels, no averaging
    526 
    527    lea         rsi, [rsi + rax]            ;next source row
    528    lea         rdi, [rdi + rdx]            ;next output row
    529    dec         rcx
    530    jnz         .loop
    531 
    532    add rsp, 16 * 8                         ;release the constant slots
    533    pop rsp                                 ;NOTE(review): presumably the rsp saved by ALIGN_STACK - confirm in x86_abi_support.asm
    534 
    535    ; begin epilog
    536    pop rdi
    537    pop rsi
    538    RESTORE_XMM
    539    UNSHADOW_ARGS
    540    pop         rbp
    541    ret
    542 
    543 ;void aom_highbd_filter_block1d16_h8_sse2
    544 ;(
    545 ;    const uint16_t  *src_ptr,
    546 ;    const ptrdiff_t  src_pitch,
    547 ;    uint16_t        *output_ptr,
    548 ;    ptrdiff_t        out_pitch,
    549 ;    unsigned int     output_height,
    550 ;    const int16_t   *filter,
    551 ;    int              bd
    552 ;)
        ;
        ; Horizontal 8-tap filter, sixteen pixels wide, processed as two
        ; halves of eight pixels per row. For each output pixel the eight taps
        ; are taken from the samples at src - 3 .. src + 4, combined with the
        ; eight coefficients, rounded (+64, >> 7) and clamped to
        ; [0, (1 << bd) - 1].
        ; Each row is fetched with sixteen unaligned 16-byte loads covering
        ; bytes rsi - 6 .. rsi + 39, so that whole range has to be readable.
        ; src_pitch and out_pitch are doubled before being used as byte
        ; strides. filter is read with an aligned 16-byte load.
        ; The loop tests its counter after the body, so the body always runs
        ; at least once.
        ; NOTE(review): the pitches are declared ptrdiff_t above but are read
        ; as sign-extended DWORDs - confirm callers stay within the int range.
    553 globalsym(aom_highbd_filter_block1d16_h8_sse2)
    554 sym(aom_highbd_filter_block1d16_h8_sse2):
    555    push        rbp
    556    mov         rbp, rsp
    557    SHADOW_ARGS_TO_STACK 7
    558    SAVE_XMM 7
    559    push        rsi
    560    push        rdi
    561    ; end prolog
    562 
    563    ALIGN_STACK 16, rax
    564    sub         rsp, 16 * 8                 ;eight 16-byte slots for the constants and temp
    565    %define k0k1 [rsp + 16 * 0]
    566    %define k6k7 [rsp + 16 * 1]
    567    %define k2k5 [rsp + 16 * 2]
    568    %define k3k4 [rsp + 16 * 3]
    569    %define krd [rsp + 16 * 4]
    570    %define temp [rsp + 16 * 5]
    571    %define max [rsp + 16 * 6]
    572    %define min [rsp + 16 * 7]
    573 
    574    HIGH_GET_FILTERS                        ;fill the slots, also loads rsi = src_ptr and rdi = output_ptr
    575 
    576    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    577    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    578    lea         rax, [rax + rax]            ;bytes per line
    579    lea         rdx, [rdx + rdx]            ;output bytes per line
    580    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    581 
    582 .loop:
    583    movdqu      xmm0,   [rsi - 6]           ;load src for pixels 0-7: tap 0
    584    movdqu      xmm1,   [rsi - 4]           ;tap 1
    585    movdqu      xmm2,   [rsi - 2]           ;tap 2
    586    movdqu      xmm3,   [rsi]               ;tap 3
    587    movdqu      xmm4,   [rsi + 2]           ;tap 4
    588    movdqu      xmm5,   [rsi + 4]           ;tap 5
    589    movdqu      xmm6,   [rsi + 6]           ;tap 6
    590    movdqu      xmm7,   [rsi + 8]           ;tap 7
    591 
    592    HIGH_APPLY_FILTER_8 0, 0                ;store pixels 0-7, no averaging
    593 
    594    movdqu      xmm0,   [rsi + 10]           ;load src for pixels 8-15: same taps, 16 bytes further on
    595    movdqu      xmm1,   [rsi + 12]          ;tap 1
    596    movdqu      xmm2,   [rsi + 14]          ;tap 2
    597    movdqu      xmm3,   [rsi + 16]          ;tap 3
    598    movdqu      xmm4,   [rsi + 18]          ;tap 4
    599    movdqu      xmm5,   [rsi + 20]          ;tap 5
    600    movdqu      xmm6,   [rsi + 22]          ;tap 6
    601    movdqu      xmm7,   [rsi + 24]          ;tap 7
    602 
    603    HIGH_APPLY_FILTER_8 0, 16               ;store pixels 8-15, no averaging
    604 
    605    lea         rsi, [rsi + rax]            ;next source row
    606    lea         rdi, [rdi + rdx]            ;next output row
    607    dec         rcx
    608    jnz         .loop
    609 
    610    add rsp, 16 * 8                         ;release the constant slots
    611    pop rsp                                 ;NOTE(review): presumably the rsp saved by ALIGN_STACK - confirm in x86_abi_support.asm
    612 
    613    ; begin epilog
    614    pop rdi
    615    pop rsi
    616    RESTORE_XMM
    617    UNSHADOW_ARGS
    618    pop         rbp
    619    ret