tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jdsample-avx2.asm (27485B)


      1 ;
      2 ; jdsample.asm - upsampling (64-bit AVX2)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ; Copyright (C) 2009, 2016, 2024, D. R. Commander.
      6 ; Copyright (C) 2015, Intel Corporation.
      7 ; Copyright (C) 2018, Matthias Räncker.
      8 ; Copyright (C) 2023, Aliaksiej Kandracienka.
      9 ;
     10 ; Based on the x86 SIMD extension for IJG JPEG library
     11 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
     12 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     13 ;
     14 ; This file should be assembled with NASM (Netwide Assembler) or Yasm.
     15 
     16 %include "jsimdext.inc"
     17 
     18 ; --------------------------------------------------------------------------
     19    SECTION     SEG_CONST
     20 
     21    ALIGNZ      32
     22    GLOBAL_DATA(jconst_fancy_upsample_avx2)
     23 
     24 EXTN(jconst_fancy_upsample_avx2):       ; start of the constant table used by the fancy upsamplers below
     25 
     26 PW_ONE   times 16 dw 1                  ; 16 words of 1: added to the left-neighbor term before >>2 (h2v1 fancy)
     27 PW_TWO   times 16 dw 2                  ; 16 words of 2: added to the right-neighbor term before >>2 (h2v1 fancy)
     28 PW_THREE times 16 dw 3                  ; 16 words of 3: multiplier for the current sample/row (h2v1 and h2v2 fancy)
     29 PW_SEVEN times 16 dw 7                  ; 16 words of 7: added to the right-neighbor term before >>4 (h2v2 fancy)
     30 PW_EIGHT times 16 dw 8                  ; 16 words of 8: added to the left-neighbor term before >>4 (h2v2 fancy)
     31 
     32    ALIGNZ      32
     33 
     34 ; --------------------------------------------------------------------------
     35    SECTION     SEG_TEXT
     36    BITS        64
     37 ;
     38 ; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
     39 ;
     40 ; The upsampling algorithm is linear interpolation between pixel centers,
     41 ; also known as a "triangle filter".  This is a good compromise between
     42 ; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
     43 ; of the way between input pixel centers.
     44 ; As coded: out[2i] = (3*s[i] + s[i-1] + 1) >> 2, out[2i+1] = (3*s[i] + s[i+1] + 2) >> 2.
     45 ; GLOBAL(void)
     46 ; jsimd_h2v1_fancy_upsample_avx2(int max_v_samp_factor,
     47 ;                                JDIMENSION downsampled_width,
     48 ;                                JSAMPARRAY input_data,
     49 ;                                JSAMPARRAY *output_data_ptr);
     50 ; Row edges are replicated: s[-1] = s[0] and s[width] = s[width-1].
     51 
     52 ; r10 = int max_v_samp_factor
     53 ; r11d = JDIMENSION downsampled_width
     54 ; r12 = JSAMPARRAY input_data
     55 ; r13 = JSAMPARRAY *output_data_ptr
     56 
     57    align       32
     58    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_avx2)
     59 
     60 EXTN(jsimd_h2v1_fancy_upsample_avx2):
     61    ENDBR64
     62    push        rbp
     63    mov         rbp, rsp
     64    PUSH_XMM    3                       ; macro from jsimdext.inc; presumably saves XMM registers (TODO confirm which)
     65    COLLECT_ARGS 4                      ; macro from jsimdext.inc; presumably loads the 4 args into r10-r13 as listed above
     66 
     67    mov         eax, r11d               ; colctr
     68    test        rax, rax                ; downsampled_width == 0 -> nothing to do
     69    jz          near .return
     70 
     71    mov         rcx, r10                ; rowctr
     72    test        rcx, rcx                ; max_v_samp_factor == 0 -> nothing to do
     73    jz          near .return
     74 
     75    mov         rsi, r12                ; input_data
     76    mov         rdi, r13                ; output_data_ptr
     77    mov         rdip, JSAMPARRAY [rdi]  ; output_data
     78 
     79    vpxor       ymm0, ymm0, ymm0                 ; ymm0=(all 0's)
     80    vpcmpeqb    xmm9, xmm9, xmm9                 ; xmm9=(all 1's)
     81    vpsrldq     xmm10, xmm9, (SIZEOF_XMMWORD-1)  ; (ff -- -- -- ... -- --) LSB is ff
     82 
     83    vpslldq     xmm9, xmm9, (SIZEOF_XMMWORD-1)
     84    vperm2i128  ymm9, ymm9, ymm9, 1              ; (---- ---- ... ---- ---- ff) MSB is ff
     85 
     86 .rowloop:
     87    push        rax                     ; colctr
     88    push        rdi                     ; save output_data row cursor
     89    push        rsi                     ; save input_data row cursor
     90 
     91    mov         rsip, JSAMPROW [rsi]    ; inptr
     92    mov         rdip, JSAMPROW [rdi]    ; outptr
     93 
     94    test        rax, SIZEOF_YMMWORD-1   ; is the width a whole number of YMM blocks?
     95    jz          short .skip
     96    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]  ; dl = last real sample of the row
     97    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
     98 .skip:
     99    vpand       ymm7, ymm10, YMMWORD [rsi+0*SIZEOF_YMMWORD]  ; ymm7=( 0 -- -- ... -- -- --), used as sample "-1" of the first block
    100 
    101    add         rax, byte SIZEOF_YMMWORD-1  ; round colctr up to
    102    and         rax, byte -SIZEOF_YMMWORD   ; a multiple of SIZEOF_YMMWORD
    103    cmp         rax, byte SIZEOF_YMMWORD    ; more than one block to process?
    104    ja          short .columnloop
    105 
    106 .columnloop_last:
    107    vpand       ymm6, ymm9, YMMWORD [rsi+0*SIZEOF_YMMWORD]  ; ymm6=(-- -- ... -- -- 31), used as sample "32" of the last block
    108    jmp         short .upsample
    109 
    110 .columnloop:
    111    vmovdqu     ymm6, YMMWORD [rsi+1*SIZEOF_YMMWORD]  ; load the next block
    112    vperm2i128  ymm6, ymm0, ymm6, 0x20  ; low lane = 0, high lane = low lane of the next block
    113    vpslldq     ymm6, ymm6, 15          ; ymm6=(-- -- ... -- -- 32)
    114 
    115 .upsample:
    116    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]  ; ymm1=( 0  1  2 ... 29 30 31)
    117 
    118    vperm2i128  ymm2, ymm0, ymm1, 0x20
    119    vpalignr    ymm2, ymm1, ymm2, 15            ; ymm2=(--  0  1 ... 28 29 30)
    120    vperm2i128  ymm4, ymm0, ymm1, 0x03
    121    vpalignr    ymm3, ymm4, ymm1, 1             ; ymm3=( 1  2  3 ... 30 31 --)
    122 
    123    vpor        ymm2, ymm2, ymm7                ; ymm2=(-1  0  1 ... 28 29 30)
    124    vpor        ymm3, ymm3, ymm6                ; ymm3=( 1  2  3 ... 30 31 32)
    125 
    126    vpsrldq     ymm7, ymm4, (SIZEOF_XMMWORD-1)  ; ymm7=(31 -- -- ... -- -- --)
    127 
    128    vpunpckhbw  ymm4, ymm1, ymm0                ; ymm4=( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    129    vpunpcklbw  ymm5, ymm1, ymm0                ; ymm5=( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    130    vperm2i128  ymm1, ymm5, ymm4, 0x20          ; ymm1=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    131    vperm2i128  ymm4, ymm5, ymm4, 0x31          ; ymm4=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
    132 
    133    vpunpckhbw  ymm5, ymm2, ymm0                ; ymm5=( 7  8  9 10 11 12 13 14 23 24 25 26 27 28 29 30)
    134    vpunpcklbw  ymm6, ymm2, ymm0                ; ymm6=(-1  0  1  2  3  4  5  6 15 16 17 18 19 20 21 22)
    135    vperm2i128  ymm2, ymm6, ymm5, 0x20          ; ymm2=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
    136    vperm2i128  ymm5, ymm6, ymm5, 0x31          ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
    137 
    138    vpunpckhbw  ymm6, ymm3, ymm0                ; ymm6=( 9 10 11 12 13 14 15 16 25 26 27 28 29 30 31 32)
    139    vpunpcklbw  ymm8, ymm3, ymm0                ; ymm8=( 1  2  3  4  5  6  7  8 17 18 19 20 21 22 23 24)
    140    vperm2i128  ymm3, ymm8, ymm6, 0x20          ; ymm3=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
    141    vperm2i128  ymm6, ymm8, ymm6, 0x31          ; ymm6=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
    142 
    143    vpmullw     ymm1, ymm1, [rel PW_THREE]      ; 3 * current sample (low half)
    144    vpmullw     ymm4, ymm4, [rel PW_THREE]      ; 3 * current sample (high half)
    145    vpaddw      ymm2, ymm2, [rel PW_ONE]        ; left neighbor + 1
    146    vpaddw      ymm5, ymm5, [rel PW_ONE]
    147    vpaddw      ymm3, ymm3, [rel PW_TWO]        ; right neighbor + 2
    148    vpaddw      ymm6, ymm6, [rel PW_TWO]
    149 
    150    vpaddw      ymm2, ymm2, ymm1
    151    vpaddw      ymm5, ymm5, ymm4
    152    vpsrlw      ymm2, ymm2, 2                   ; ymm2=OutLE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
    153    vpsrlw      ymm5, ymm5, 2                   ; ymm5=OutHE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
    154    vpaddw      ymm3, ymm3, ymm1
    155    vpaddw      ymm6, ymm6, ymm4
    156    vpsrlw      ymm3, ymm3, 2                   ; ymm3=OutLO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
    157    vpsrlw      ymm6, ymm6, 2                   ; ymm6=OutHO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
    158 
    159    vpsllw      ymm3, ymm3, BYTE_BIT            ; shift the odd outputs up so they interleave with the even ones
    160    vpsllw      ymm6, ymm6, BYTE_BIT
    161    vpor        ymm2, ymm2, ymm3                ; ymm2=OutL=( 0  1  2 ... 29 30 31)
    162    vpor        ymm5, ymm5, ymm6                ; ymm5=OutH=(32 33 34 ... 61 62 63)
    163 
    164    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm2
    165    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm5
    166 
    167    sub         rax, byte SIZEOF_YMMWORD    ; one input block consumed
    168    add         rsi, byte 1*SIZEOF_YMMWORD  ; inptr
    169    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr
    170    cmp         rax, byte SIZEOF_YMMWORD    ; more than one block left?
    171    ja          near .columnloop
    172    test        eax, eax                    ; exactly one block left?
    173    jnz         near .columnloop_last
    174 
    175    pop         rsi                     ; restore input_data row cursor
    176    pop         rdi                     ; restore output_data row cursor
    177    pop         rax                     ; restore colctr
    178 
    179    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
    180    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
    181    dec         rcx                        ; rowctr
    182    jg          near .rowloop
    183 
    184 .return:
    185    vzeroupper                          ; clear upper YMM halves before returning to the caller
    186    UNCOLLECT_ARGS 4
    187    POP_XMM     3
    188    pop         rbp
    189    ret
    190 
    191 ; --------------------------------------------------------------------------
    192 ;
    193 ; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
    194 ; Again a triangle filter; see comments for h2v1 case, above.
    195 ;
    196 ; GLOBAL(void)
    197 ; jsimd_h2v2_fancy_upsample_avx2(int max_v_samp_factor,
    198 ;                                JDIMENSION downsampled_width,
    199 ;                                JSAMPARRAY input_data,
    200 ;                                JSAMPARRAY *output_data_ptr);
    201 ;
    202 
    203 ; r10 = int max_v_samp_factor
    204 ; r11d = JDIMENSION downsampled_width
    205 ; r12 = JSAMPARRAY input_data
    206 ; r13 = JSAMPARRAY *output_data_ptr
    207 
    208 %define wk(i)   r15 - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
    209 %define WK_NUM  4
    210 
    211    align       32
    212    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_avx2)
    213 
    214 EXTN(jsimd_h2v2_fancy_upsample_avx2):
    215    ENDBR64
    216    push        rbp
    217    mov         rbp, rsp
    218    push        r15                          ; r15 is used below as the base of wk[]
    219    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
    220    ; Allocate stack space for wk array.  r15 is used to access it.
    221    mov         r15, rsp
    222    sub         rsp, (SIZEOF_YMMWORD * WK_NUM)
    223    PUSH_XMM    3                       ; macro from jsimdext.inc; presumably saves XMM registers (TODO confirm which)
    224    COLLECT_ARGS 4                      ; macro from jsimdext.inc; presumably loads the 4 args into r10-r13 as listed above
    225    push        rbx                     ; rbx holds inptr0 inside the row loop
    226 
    227    mov         eax, r11d               ; colctr
    228    test        rax, rax                ; downsampled_width == 0 -> nothing to do
    229    jz          near .return
    230 
    231    mov         rcx, r10                ; rowctr
    232    test        rcx, rcx                ; max_v_samp_factor == 0 -> nothing to do
    233    jz          near .return
    234 
    235    mov         rsi, r12                ; input_data
    236    mov         rdi, r13                ; output_data_ptr
    237    mov         rdip, JSAMPARRAY [rdi]  ; output_data
    238 .rowloop:
    239    push        rax                     ; colctr
    240    push        rcx                     ; rowctr (rcx is reused as inptr1(above) below)
    241    push        rdi                     ; save output_data row cursor
    242    push        rsi                     ; save input_data row cursor
    243 
    244    mov         rcxp, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
    245    mov         rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
    246    mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
    247    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
    248    mov         rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
    249 
    250    vpxor       ymm8, ymm8, ymm8                 ; ymm8=(all 0's)
    251    vpcmpeqb    xmm9, xmm9, xmm9                 ; xmm9=(all 1's)
    252    vpsrldq     xmm10, xmm9, (SIZEOF_XMMWORD-2)  ; (ffff ---- ---- ... ---- ----) LSB is ffff
    253    vpslldq     xmm9, xmm9, (SIZEOF_XMMWORD-2)
    254    vperm2i128  ymm9, ymm9, ymm9, 1              ; (---- ---- ... ---- ---- ffff) MSB is ffff
    255 
    256    test        rax, SIZEOF_YMMWORD-1   ; is the width a whole number of YMM blocks?
    257    jz          short .skip
    258    push        rdx                     ; dl is used as scratch below; keep outptr0
    259    mov         dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
    260    mov         JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
    261    mov         dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
    262    mov         JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
    263    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
    264    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
    265    pop         rdx
    266 .skip:
    267    ; -- process the first column block
    268 
    269    vmovdqu     ymm0, YMMWORD [rbx+0*SIZEOF_YMMWORD]  ; ymm0=row[ 0][0]
    270    vmovdqu     ymm1, YMMWORD [rcx+0*SIZEOF_YMMWORD]  ; ymm1=row[-1][0]
    271    vmovdqu     ymm2, YMMWORD [rsi+0*SIZEOF_YMMWORD]  ; ymm2=row[+1][0]
    272 
    273    vpunpckhbw  ymm4, ymm0, ymm8        ; ymm4=row[ 0]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    274    vpunpcklbw  ymm5, ymm0, ymm8        ; ymm5=row[ 0]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    275    vperm2i128  ymm0, ymm5, ymm4, 0x20  ; ymm0=row[ 0]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    276    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
    277 
    278    vpunpckhbw  ymm5, ymm1, ymm8        ; ymm5=row[-1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    279    vpunpcklbw  ymm6, ymm1, ymm8        ; ymm6=row[-1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    280    vperm2i128  ymm1, ymm6, ymm5, 0x20  ; ymm1=row[-1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    281    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
    282 
    283    vpunpckhbw  ymm6, ymm2, ymm8        ; ymm6=row[+1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    284    vpunpcklbw  ymm3, ymm2, ymm8        ; ymm3=row[+1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    285    vperm2i128  ymm2, ymm3, ymm6, 0x20  ; ymm2=row[+1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    286    vperm2i128  ymm6, ymm3, ymm6, 0x31  ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
    287 
    288    vpmullw     ymm0, ymm0, [rel PW_THREE]  ; 3 * row[0] (low half)
    289    vpmullw     ymm4, ymm4, [rel PW_THREE]  ; 3 * row[0] (high half)
    290 
    291    vpaddw      ymm1, ymm1, ymm0        ; ymm1=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    292    vpaddw      ymm5, ymm5, ymm4        ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
    293    vpaddw      ymm2, ymm2, ymm0        ; ymm2=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    294    vpaddw      ymm6, ymm6, ymm4        ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
    295 
    296    vmovdqu     YMMWORD [rdx+0*SIZEOF_YMMWORD], ymm1  ; temporarily save
    297    vmovdqu     YMMWORD [rdx+1*SIZEOF_YMMWORD], ymm5  ; the intermediate data
    298    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm2
    299    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm6
    300 
    301    vpand       ymm1, ymm1, ymm10       ; ymm1=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    302    vpand       ymm2, ymm2, ymm10       ; ymm2=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    303 
    304    vmovdqa     YMMWORD [wk(0)], ymm1   ; wk(0) = element "-1" for the upper row's first block
    305    vmovdqa     YMMWORD [wk(1)], ymm2   ; wk(1) = element "-1" for the lower row's first block
    306 
    307    add         rax, byte SIZEOF_YMMWORD-1  ; round colctr up to
    308    and         rax, byte -SIZEOF_YMMWORD   ; a multiple of SIZEOF_YMMWORD
    309    cmp         rax, byte SIZEOF_YMMWORD    ; more than one block to process?
    310    ja          short .columnloop
    311 
    312 .columnloop_last:
    313    ; -- process the last column block
    314 
    315    vpand       ymm1, ymm9, YMMWORD [rdx+1*SIZEOF_YMMWORD]
    316    vpand       ymm2, ymm9, YMMWORD [rdi+1*SIZEOF_YMMWORD]
    317 
    318    vmovdqa     YMMWORD [wk(2)], ymm1   ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
    319    vmovdqa     YMMWORD [wk(3)], ymm2   ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
    320 
    321    jmp         near .upsample
    322 
    323 .columnloop:
    324    ; -- process the next column block
    325 
    326    vmovdqu     ymm0, YMMWORD [rbx+1*SIZEOF_YMMWORD]  ; ymm0=row[ 0][1]
    327    vmovdqu     ymm1, YMMWORD [rcx+1*SIZEOF_YMMWORD]  ; ymm1=row[-1][1]
    328    vmovdqu     ymm2, YMMWORD [rsi+1*SIZEOF_YMMWORD]  ; ymm2=row[+1][1]
    329 
    330    vpunpckhbw  ymm4, ymm0, ymm8        ; ymm4=row[ 0]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    331    vpunpcklbw  ymm5, ymm0, ymm8        ; ymm5=row[ 0]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    332    vperm2i128  ymm0, ymm5, ymm4, 0x20  ; ymm0=row[ 0]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    333    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
    334 
    335    vpunpckhbw  ymm5, ymm1, ymm8        ; ymm5=row[-1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    336    vpunpcklbw  ymm6, ymm1, ymm8        ; ymm6=row[-1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    337    vperm2i128  ymm1, ymm6, ymm5, 0x20  ; ymm1=row[-1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    338    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
    339 
    340    vpunpckhbw  ymm6, ymm2, ymm8        ; ymm6=row[+1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    341    vpunpcklbw  ymm7, ymm2, ymm8        ; ymm7=row[+1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    342    vperm2i128  ymm2, ymm7, ymm6, 0x20  ; ymm2=row[+1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    343    vperm2i128  ymm6, ymm7, ymm6, 0x31  ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
    344 
    345    vpmullw     ymm0, ymm0, [rel PW_THREE]  ; 3 * row[0] (low half)
    346    vpmullw     ymm4, ymm4, [rel PW_THREE]  ; 3 * row[0] (high half)
    347 
    348    vpaddw      ymm1, ymm1, ymm0        ; ymm1=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    349    vpaddw      ymm5, ymm5, ymm4        ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
    350    vpaddw      ymm2, ymm2, ymm0        ; ymm2=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    351    vpaddw      ymm6, ymm6, ymm4        ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
    352 
    353    vmovdqu     YMMWORD [rdx+2*SIZEOF_YMMWORD], ymm1  ; temporarily save
    354    vmovdqu     YMMWORD [rdx+3*SIZEOF_YMMWORD], ymm5  ; the intermediate data
    355    vmovdqu     YMMWORD [rdi+2*SIZEOF_YMMWORD], ymm2
    356    vmovdqu     YMMWORD [rdi+3*SIZEOF_YMMWORD], ymm6
    357 
    358    vperm2i128  ymm1, ymm8, ymm1, 0x20
    359    vpslldq     ymm1, ymm1, 14          ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- --  0)
    360    vperm2i128  ymm2, ymm8, ymm2, 0x20
    361    vpslldq     ymm2, ymm2, 14          ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- --  0)
    362 
    363    vmovdqa     YMMWORD [wk(2)], ymm1   ; wk(2) = element "32" for the upper row (first element of the next block)
    364    vmovdqa     YMMWORD [wk(3)], ymm2   ; wk(3) = element "32" for the lower row (first element of the next block)
    365 
    366 .upsample:
    367    ; -- process the upper row
    368 
    369    vmovdqu     ymm7, YMMWORD [rdx+0*SIZEOF_YMMWORD]  ; ymm7=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    370    vmovdqu     ymm3, YMMWORD [rdx+1*SIZEOF_YMMWORD]  ; ymm3=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
    371 
    372    vperm2i128  ymm0, ymm8, ymm7, 0x03
    373    vpalignr    ymm0, ymm0, ymm7, 2     ; ymm0=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 --)
    374    vperm2i128  ymm4, ymm8, ymm3, 0x20
    375    vpslldq     ymm4, ymm4, 14          ; ymm4=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
    376 
    377    vperm2i128  ymm5, ymm8, ymm7, 0x03
    378    vpsrldq     ymm5, ymm5, 14          ; ymm5=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    379    vperm2i128  ymm6, ymm8, ymm3, 0x20
    380    vpalignr    ymm6, ymm3, ymm6, 14    ; ymm6=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
    381 
    382    vpor        ymm0, ymm0, ymm4        ; ymm0=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
    383    vpor        ymm5, ymm5, ymm6        ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
    384 
    385    vperm2i128  ymm2, ymm8, ymm3, 0x03
    386    vpalignr    ymm2, ymm2, ymm3, 2     ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
    387    vperm2i128  ymm4, ymm8, ymm3, 0x03
    388    vpsrldq     ymm4, ymm4, 14          ; ymm4=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    389    vperm2i128  ymm1, ymm8, ymm7, 0x20
    390    vpalignr    ymm1, ymm7, ymm1, 14    ; ymm1=(--  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
    391 
    392    vpor        ymm1, ymm1, YMMWORD [wk(0)]  ; ymm1=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
    393    vpor        ymm2, ymm2, YMMWORD [wk(2)]  ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
    394 
    395    vmovdqa     YMMWORD [wk(0)], ymm4   ; element 31 becomes element "-1" of the next block
    396 
    397    vpmullw     ymm7, ymm7, [rel PW_THREE]
    398    vpmullw     ymm3, ymm3, [rel PW_THREE]
    399    vpaddw      ymm1, ymm1, [rel PW_EIGHT]
    400    vpaddw      ymm5, ymm5, [rel PW_EIGHT]
    401    vpaddw      ymm0, ymm0, [rel PW_SEVEN]
    402    vpaddw      ymm2, ymm2, [rel PW_SEVEN]
    403 
    404    vpaddw      ymm1, ymm1, ymm7
    405    vpaddw      ymm5, ymm5, ymm3
    406    vpsrlw      ymm1, ymm1, 4           ; ymm1=Out0LE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
    407    vpsrlw      ymm5, ymm5, 4           ; ymm5=Out0HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
    408    vpaddw      ymm0, ymm0, ymm7
    409    vpaddw      ymm2, ymm2, ymm3
    410    vpsrlw      ymm0, ymm0, 4           ; ymm0=Out0LO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
    411    vpsrlw      ymm2, ymm2, 4           ; ymm2=Out0HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
    412 
    413    vpsllw      ymm0, ymm0, BYTE_BIT
    414    vpsllw      ymm2, ymm2, BYTE_BIT
    415    vpor        ymm1, ymm1, ymm0        ; ymm1=Out0L=( 0  1  2 ... 29 30 31)
    416    vpor        ymm5, ymm5, ymm2        ; ymm5=Out0H=(32 33 34 ... 61 62 63)
    417 
    418    vmovdqu     YMMWORD [rdx+0*SIZEOF_YMMWORD], ymm1
    419    vmovdqu     YMMWORD [rdx+1*SIZEOF_YMMWORD], ymm5
    420 
    421    ; -- process the lower row
    422 
    423    vmovdqu     ymm6, YMMWORD [rdi+0*SIZEOF_YMMWORD]  ; ymm6=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    424    vmovdqu     ymm4, YMMWORD [rdi+1*SIZEOF_YMMWORD]  ; ymm4=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
    425 
    426    vperm2i128  ymm7, ymm8, ymm6, 0x03
    427    vpalignr    ymm7, ymm7, ymm6, 2     ; ymm7=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 --)
    428    vperm2i128  ymm3, ymm8, ymm4, 0x20
    429    vpslldq     ymm3, ymm3, 14          ; ymm3=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
    430 
    431    vperm2i128  ymm0, ymm8, ymm6, 0x03
    432    vpsrldq     ymm0, ymm0, 14          ; ymm0=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    433    vperm2i128  ymm2, ymm8, ymm4, 0x20
    434    vpalignr    ymm2, ymm4, ymm2, 14    ; ymm2=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
    435 
    436    vpor        ymm7, ymm7, ymm3        ; ymm7=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
    437    vpor        ymm0, ymm0, ymm2        ; ymm0=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
    438 
    439    vperm2i128  ymm5, ymm8, ymm4, 0x03
    440    vpalignr    ymm5, ymm5, ymm4, 2     ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
    441    vperm2i128  ymm3, ymm8, ymm4, 0x03
    442    vpsrldq     ymm3, ymm3, 14          ; ymm3=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    443    vperm2i128  ymm1, ymm8, ymm6, 0x20
    444    vpalignr    ymm1, ymm6, ymm1, 14    ; ymm1=(--  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
    445 
    446    vpor        ymm1, ymm1, YMMWORD [wk(1)]  ; ymm1=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
    447    vpor        ymm5, ymm5, YMMWORD [wk(3)]  ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
    448 
    449    vmovdqa     YMMWORD [wk(1)], ymm3   ; element 31 becomes element "-1" of the next block
    450 
    451    vpmullw     ymm6, ymm6, [rel PW_THREE]
    452    vpmullw     ymm4, ymm4, [rel PW_THREE]
    453    vpaddw      ymm1, ymm1, [rel PW_EIGHT]
    454    vpaddw      ymm0, ymm0, [rel PW_EIGHT]
    455    vpaddw      ymm7, ymm7, [rel PW_SEVEN]
    456    vpaddw      ymm5, ymm5, [rel PW_SEVEN]
    457 
    458    vpaddw      ymm1, ymm1, ymm6
    459    vpaddw      ymm0, ymm0, ymm4
    460    vpsrlw      ymm1, ymm1, 4           ; ymm1=Out1LE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
    461    vpsrlw      ymm0, ymm0, 4           ; ymm0=Out1HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
    462    vpaddw      ymm7, ymm7, ymm6
    463    vpaddw      ymm5, ymm5, ymm4
    464    vpsrlw      ymm7, ymm7, 4           ; ymm7=Out1LO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
    465    vpsrlw      ymm5, ymm5, 4           ; ymm5=Out1HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
    466 
    467    vpsllw      ymm7, ymm7, BYTE_BIT
    468    vpsllw      ymm5, ymm5, BYTE_BIT
    469    vpor        ymm1, ymm1, ymm7        ; ymm1=Out1L=( 0  1  2 ... 29 30 31)
    470    vpor        ymm0, ymm0, ymm5        ; ymm0=Out1H=(32 33 34 ... 61 62 63)
    471 
    472    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm1
    473    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm0
    474 
    475    sub         rax, byte SIZEOF_YMMWORD    ; one input block consumed
    476    add         rcx, byte 1*SIZEOF_YMMWORD  ; inptr1(above)
    477    add         rbx, byte 1*SIZEOF_YMMWORD  ; inptr0
    478    add         rsi, byte 1*SIZEOF_YMMWORD  ; inptr1(below)
    479    add         rdx, byte 2*SIZEOF_YMMWORD  ; outptr0
    480    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr1
    481    cmp         rax, byte SIZEOF_YMMWORD    ; more than one block left?
    482    ja          near .columnloop
    483    test        rax, rax                    ; exactly one block left?
    484    jnz         near .columnloop_last
    485 
    486    pop         rsi                     ; restore input_data row cursor
    487    pop         rdi                     ; restore output_data row cursor
    488    pop         rcx                     ; restore rowctr
    489    pop         rax                     ; restore colctr
    490 
    491    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
    492    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
    493    sub         rcx, byte 2                  ; rowctr
    494    jg          near .rowloop
    495 
    496 .return:
    497    pop         rbx
    498    vzeroupper                          ; clear upper YMM halves before returning to the caller
    499    UNCOLLECT_ARGS 4
    500    POP_XMM     3
    501    lea         rsp, [rbp-8]            ; undo the alignment/wk[] allocation; rsp now points at the saved r15
    502    pop         r15
    503    pop         rbp
    504    ret
    505 
    506 ; --------------------------------------------------------------------------
    507 ;
    508 ; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
    509 ; It's still a box filter.
    510 ;
    511 ; GLOBAL(void)
    512 ; jsimd_h2v1_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
    513 ;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
    514 ;
    515 
    516 ; r10 = int max_v_samp_factor
    517 ; r11d = JDIMENSION output_width
    518 ; r12 = JSAMPARRAY input_data
    519 ; r13 = JSAMPARRAY *output_data_ptr
    520 
    521    align       32
    522    GLOBAL_FUNCTION(jsimd_h2v1_upsample_avx2)
    523 
    524 EXTN(jsimd_h2v1_upsample_avx2):
    525    ENDBR64
    526    push        rbp
    527    mov         rbp, rsp
    528    COLLECT_ARGS 4                      ; macro from jsimdext.inc; presumably loads the 4 args into r10-r13 as listed above
    529 
    530    mov         edx, r11d               ; output_width
    531    add         rdx, byte (SIZEOF_YMMWORD-1)  ; round output_width up to
    532    and         rdx, -SIZEOF_YMMWORD          ; a multiple of SIZEOF_YMMWORD
    533    jz          near .return            ; output_width == 0 -> nothing to do
    534 
    535    mov         rcx, r10                ; rowctr
    536    test        rcx, rcx                ; max_v_samp_factor == 0 -> nothing to do
    537    jz          short .return
    538 
    539    mov         rsi, r12                ; input_data
    540    mov         rdi, r13                ; output_data_ptr
    541    mov         rdip, JSAMPARRAY [rdi]  ; output_data
    542 .rowloop:
    543    push        rdi                     ; save output_data row cursor
    544    push        rsi                     ; save input_data row cursor
    545 
    546    mov         rsip, JSAMPROW [rsi]    ; inptr
    547    mov         rdip, JSAMPROW [rdi]    ; outptr
    548    mov         rax, rdx                ; colctr
    549 .columnloop:
    550 
    551    cmp         rax, byte SIZEOF_YMMWORD  ; more than one YMM word of output left?
    552    ja          near .above_16
    553 
    554    vmovdqu     xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]  ; last 16 input samples of the row
    555    vpunpckhbw  xmm1, xmm0, xmm0        ; duplicate each of samples 8-15
    556    vpunpcklbw  xmm0, xmm0, xmm0        ; duplicate each of samples 0-7
    557 
    558    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
    559    vmovdqu     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
    560 
    561    jmp         short .nextrow
    562 
    563 .above_16:
    564    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]  ; 32 input samples
    565 
    566    vpermq      ymm0, ymm0, 0xd8        ; swap the two middle qwords so the in-lane unpacks below emit samples in order
    567    vpunpckhbw  ymm1, ymm0, ymm0        ; duplicate each of samples 16-31
    568    vpunpcklbw  ymm0, ymm0, ymm0        ; duplicate each of samples 0-15
    569 
    570    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
    571    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm1
    572 
    573    sub         rax, byte 2*SIZEOF_YMMWORD  ; two YMM words of output were written
    574    jz          short .nextrow
    575 
    576    add         rsi, byte SIZEOF_YMMWORD    ; inptr
    577    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr
    578    jmp         short .columnloop
    579 
    580 .nextrow:
    581    pop         rsi                     ; restore input_data row cursor
    582    pop         rdi                     ; restore output_data row cursor
    583 
    584    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
    585    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
    586    dec         rcx                        ; rowctr
    587    jg          short .rowloop
    588 
    589 .return:
    590    vzeroupper                          ; clear upper YMM halves before returning to the caller
    591    UNCOLLECT_ARGS 4
    592    pop         rbp
    593    ret
    594 
    595 ; --------------------------------------------------------------------------
    596 ;
    597 ; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
    598 ; It's still a box filter.
    599 ;
    600 ; GLOBAL(void)
    601 ; jsimd_h2v2_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
    602 ;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
    603 ;
    604 
    605 ; r10 = int max_v_samp_factor
    606 ; r11d = JDIMENSION output_width
    607 ; r12 = JSAMPARRAY input_data
    608 ; r13 = JSAMPARRAY *output_data_ptr
    609 
    610    align       32
    611    GLOBAL_FUNCTION(jsimd_h2v2_upsample_avx2)
    612 
    613 EXTN(jsimd_h2v2_upsample_avx2):
    614    ENDBR64
    615    push        rbp
    616    mov         rbp, rsp
    617    COLLECT_ARGS 4                      ; macro from jsimdext.inc; presumably loads the 4 args into r10-r13 as listed above
    618    push        rbx                     ; rbx holds outptr0 inside the row loop
    619 
    620    mov         edx, r11d               ; output_width
    621    add         rdx, byte (SIZEOF_YMMWORD-1)  ; round output_width up to
    622    and         rdx, -SIZEOF_YMMWORD          ; a multiple of SIZEOF_YMMWORD
    623    jz          near .return            ; output_width == 0 -> nothing to do
    624 
    625    mov         rcx, r10                ; rowctr
    626    test        rcx, rcx                ; max_v_samp_factor == 0 -> nothing to do
    627    jz          near .return
    628 
    629    mov         rsi, r12                ; input_data
    630    mov         rdi, r13                ; output_data_ptr
    631    mov         rdip, JSAMPARRAY [rdi]  ; output_data
    632 .rowloop:
    633    push        rdi                     ; save output_data row cursor
    634    push        rsi                     ; save input_data row cursor
    635 
    636    mov         rsip, JSAMPROW [rsi]                   ; inptr
    637    mov         rbxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
    638    mov         rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
    639    mov         rax, rdx                               ; colctr
    640 .columnloop:
    641 
    642    cmp         rax, byte SIZEOF_YMMWORD  ; more than one YMM word of output left?
    643    ja          short .above_16
    644 
    645    vmovdqu     xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]  ; last 16 input samples of the row
    646    vpunpckhbw  xmm1, xmm0, xmm0        ; duplicate each of samples 8-15
    647    vpunpcklbw  xmm0, xmm0, xmm0        ; duplicate each of samples 0-7
    648 
    649    vmovdqu     XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0  ; the same data goes to both output rows
    650    vmovdqu     XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
    651    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
    652    vmovdqu     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
    653 
    654    jmp         near .nextrow
    655 
    656 .above_16:
    657    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]  ; 32 input samples
    658 
    659    vpermq      ymm0, ymm0, 0xd8        ; swap the two middle qwords so the in-lane unpacks below emit samples in order
    660    vpunpckhbw  ymm1, ymm0, ymm0        ; duplicate each of samples 16-31
    661    vpunpcklbw  ymm0, ymm0, ymm0        ; duplicate each of samples 0-15
    662 
    663    vmovdqu     YMMWORD [rbx+0*SIZEOF_YMMWORD], ymm0  ; the same data goes to both output rows
    664    vmovdqu     YMMWORD [rbx+1*SIZEOF_YMMWORD], ymm1
    665    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
    666    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm1
    667 
    668    sub         rax, byte 2*SIZEOF_YMMWORD  ; two YMM words of output were written per row
    669    jz          short .nextrow
    670 
    671    add         rsi, byte SIZEOF_YMMWORD    ; inptr
    672    add         rbx, byte 2*SIZEOF_YMMWORD  ; outptr0
    673    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr1
    674    jmp         short .columnloop
    675 
    676 .nextrow:
    677    pop         rsi                     ; restore input_data row cursor
    678    pop         rdi                     ; restore output_data row cursor
    679 
    680    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
    681    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
    682    sub         rcx, byte 2                  ; rowctr
    683    jg          near .rowloop
    684 
    685 .return:
    686    pop         rbx
    687    vzeroupper                          ; clear upper YMM halves before returning to the caller
    688    UNCOLLECT_ARGS 4
    689    pop         rbp
    690    ret
    691 
    692 ; For some reason, the OS X linker does not honor the request to align the
    693 ; segment unless we do this.
    694    align       32