tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jdsample-sse2.asm (23196B)


      1 ;
      2 ; jdsample.asm - upsampling (64-bit SSE2)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ; Copyright (C) 2009, 2016, 2024, D. R. Commander.
      6 ; Copyright (C) 2018, Matthias Räncker.
      7 ; Copyright (C) 2023, Aliaksiej Kandracienka.
      8 ;
      9 ; Based on the x86 SIMD extension for IJG JPEG library
     10 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
     11 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     12 ;
     13 ; This file should be assembled with NASM (Netwide Assembler) or Yasm.
     14 
     15 %include "jsimdext.inc"
     16 
     17 ; --------------------------------------------------------------------------
     18    SECTION     SEG_CONST
     19 ; Constant table shared by the two "fancy" (triangle filter) upsamplers in this file.
     20    ALIGNZ      32
     21    GLOBAL_DATA(jconst_fancy_upsample_sse2)
     22 
     23 EXTN(jconst_fancy_upsample_sse2):
     24 ; Each PW_* entry is one vector of eight identical 16-bit words.
     25 PW_ONE   times 8 dw 1   ; rounding term for even h2v1 outputs (added before >> 2)
     26 PW_TWO   times 8 dw 2   ; rounding term for odd h2v1 outputs (added before >> 2)
     27 PW_THREE times 8 dw 3   ; weight applied to the center sample in both fancy filters
     28 PW_SEVEN times 8 dw 7   ; rounding term for odd h2v2 outputs (added before >> 4)
     29 PW_EIGHT times 8 dw 8   ; rounding term for even h2v2 outputs (added before >> 4)
     30 
     31    ALIGNZ      32
     32 
     33 ; --------------------------------------------------------------------------
     34    SECTION     SEG_TEXT
     35    BITS        64
     36 ;
     37 ; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
     38 ;
     39 ; The upsampling algorithm is linear interpolation between pixel centers,
     40 ; also known as a "triangle filter".  This is a good compromise between
     41 ; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
     42 ; of the way between input pixel centers.
     43 ;
     44 ; GLOBAL(void)
     45 ; jsimd_h2v1_fancy_upsample_sse2(int max_v_samp_factor,
     46 ;                                JDIMENSION downsampled_width,
     47 ;                                JSAMPARRAY input_data,
     48 ;                                JSAMPARRAY *output_data_ptr);
     49 ; Each input sample s[i] yields out[2i] = (3*s[i] + s[i-1] + 1) >> 2 and
     50 ; out[2i+1] = (3*s[i] + s[i+1] + 2) >> 2; an edge sample acts as its own missing neighbor.
     51 ; r10 = int max_v_samp_factor
     52 ; r11d = JDIMENSION downsampled_width
     53 ; r12 = JSAMPARRAY input_data
     54 ; r13 = JSAMPARRAY *output_data_ptr
     55 ; (presumably set up by COLLECT_ARGS, defined outside this file.)  All row accesses use movdqa, so row buffers must be 16-byte aligned.
     56    align       32
     57    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_sse2)
     58 
     59 EXTN(jsimd_h2v1_fancy_upsample_sse2):
     60    ENDBR64
     61    push        rbp
     62    mov         rbp, rsp
     63    COLLECT_ARGS 4
     64 
     65    mov         eax, r11d               ; colctr
     66    test        rax, rax
     67    jz          near .return            ; zero width: nothing to do
     68 
     69    mov         rcx, r10                ; rowctr
     70    test        rcx, rcx
     71    jz          near .return            ; zero rows: nothing to do
     72 
     73    mov         rsi, r12                ; input_data
     74    mov         rdi, r13                ; output_data_ptr
     75    mov         rdip, JSAMPARRAY [rdi]  ; output_data
     76 .rowloop:                              ; one input row -> one output row per iteration
     77    push        rax                     ; colctr
     78    push        rdi                     ; save output_data cursor (rdi becomes outptr)
     79    push        rsi                     ; save input_data cursor (rsi becomes inptr)
     80 
     81    mov         rsip, JSAMPROW [rsi]    ; inptr
     82    mov         rdip, JSAMPROW [rdi]    ; outptr
     83 
     84    test        rax, SIZEOF_XMMWORD-1   ; is the width a whole number of vector blocks?
     85    jz          short .skip
     86    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]  ; dl = last real sample of the row
     87    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
     88 .skip:                                 ; NOTE(review): the dummy write lands one sample past the row width; assumes the input row is padded - confirm
     89    pxor        xmm0, xmm0              ; xmm0=(all 0's)
     90    pcmpeqb     xmm7, xmm7              ; xmm7=(all 1's)
     91    psrldq      xmm7, (SIZEOF_XMMWORD-1)  ; mask selecting byte 0 only
     92    pand        xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]  ; xmm7=( 0 -- -- ... -- -- --): sample 0 stands in for sample -1
     93 
     94    add         rax, byte SIZEOF_XMMWORD-1
     95    and         rax, byte -SIZEOF_XMMWORD  ; colctr rounded up to a multiple of SIZEOF_XMMWORD
     96    cmp         rax, byte SIZEOF_XMMWORD
     97    ja          short .columnloop       ; more than one block remains
     98 
     99 .columnloop_last:                      ; final block of the row: no block to the right
    100    pcmpeqb     xmm6, xmm6              ; xmm6=(all 1's)
    101    pslldq      xmm6, (SIZEOF_XMMWORD-1)  ; mask selecting the top byte only
    102    pand        xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]  ; xmm6=(-- -- ... -- -- 15): sample 15 stands in for sample 16
    103    jmp         short .upsample
    104 
    105 .columnloop:                           ; interior block: right neighbor comes from the next block
    106    movdqa      xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    107    pslldq      xmm6, (SIZEOF_XMMWORD-1)  ; xmm6=(-- -- ... -- -- 16): first sample of the next block in the top byte
    108 
    109 .upsample:                             ; here xmm7 holds sample -1 (byte 0) and xmm6 holds sample 16 (top byte)
    110    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    111    movdqa      xmm2, xmm1
    112    movdqa      xmm3, xmm1                ; xmm1=( 0  1  2 ... 13 14 15)
    113    pslldq      xmm2, 1                   ; xmm2=(--  0  1 ... 12 13 14)
    114    psrldq      xmm3, 1                   ; xmm3=( 1  2  3 ... 14 15 --)
    115 
    116    por         xmm2, xmm7                ; xmm2=(-1  0  1 ... 12 13 14)
    117    por         xmm3, xmm6                ; xmm3=( 1  2  3 ... 14 15 16)
    118 
    119    movdqa      xmm7, xmm1
    120    psrldq      xmm7, (SIZEOF_XMMWORD-1)  ; xmm7=(15 -- -- ... -- -- --)
    121 ; xmm7 now carries this block's last sample as the next block's left neighbor; below, bytes are widened to words.
    122    movdqa      xmm4, xmm1
    123    punpcklbw   xmm1, xmm0                ; xmm1=( 0  1  2  3  4  5  6  7)
    124    punpckhbw   xmm4, xmm0                ; xmm4=( 8  9 10 11 12 13 14 15)
    125    movdqa      xmm5, xmm2
    126    punpcklbw   xmm2, xmm0                ; xmm2=(-1  0  1  2  3  4  5  6)
    127    punpckhbw   xmm5, xmm0                ; xmm5=( 7  8  9 10 11 12 13 14)
    128    movdqa      xmm6, xmm3
    129    punpcklbw   xmm3, xmm0                ; xmm3=( 1  2  3  4  5  6  7  8)
    130    punpckhbw   xmm6, xmm0                ; xmm6=( 9 10 11 12 13 14 15 16)
    131 
    132    pmullw      xmm1, [rel PW_THREE]    ; xmm1 = 3 * center samples (low half)
    133    pmullw      xmm4, [rel PW_THREE]    ; xmm4 = 3 * center samples (high half)
    134    paddw       xmm2, [rel PW_ONE]      ; left neighbors + 1 (rounding for even outputs)
    135    paddw       xmm5, [rel PW_ONE]
    136    paddw       xmm3, [rel PW_TWO]      ; right neighbors + 2 (rounding for odd outputs)
    137    paddw       xmm6, [rel PW_TWO]
    138 
    139    paddw       xmm2, xmm1
    140    paddw       xmm5, xmm4
    141    psrlw       xmm2, 2                 ; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
    142    psrlw       xmm5, 2                 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
    143    paddw       xmm3, xmm1
    144    paddw       xmm6, xmm4
    145    psrlw       xmm3, 2                 ; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
    146    psrlw       xmm6, 2                 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
    147 
    148    psllw       xmm3, BYTE_BIT          ; shift odd outputs into the upper byte of each word
    149    psllw       xmm6, BYTE_BIT
    150    por         xmm2, xmm3              ; xmm2=OutL=( 0  1  2 ... 13 14 15)
    151    por         xmm5, xmm6              ; xmm5=OutH=(16 17 18 ... 29 30 31)
    152 
    153    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
    154    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5
    155 
    156    sub         rax, byte SIZEOF_XMMWORD    ; one input block consumed
    157    add         rsi, byte 1*SIZEOF_XMMWORD  ; inptr
    158    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
    159    cmp         rax, byte SIZEOF_XMMWORD
    160    ja          near .columnloop            ; more than one block left
    161    test        eax, eax
    162    jnz         near .columnloop_last       ; exactly one block left
    163 
    164    pop         rsi                     ; restore input_data cursor
    165    pop         rdi                     ; restore output_data cursor
    166    pop         rax                     ; restore colctr (unrounded width)
    167 
    168    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
    169    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
    170    dec         rcx                        ; rowctr
    171    jg          near .rowloop              ; continue while rowctr > 0
    172 
    173 .return:
    174    UNCOLLECT_ARGS 4
    175    pop         rbp
    176    ret
    177 
    178 ; --------------------------------------------------------------------------
    179 ;
    180 ; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
    181 ; Again a triangle filter; see comments for h2v1 case, above.
    182 ;
    183 ; GLOBAL(void)
    184 ; jsimd_h2v2_fancy_upsample_sse2(int max_v_samp_factor,
    185 ;                                JDIMENSION downsampled_width,
    186 ;                                JSAMPARRAY input_data,
    187 ;                                JSAMPARRAY *output_data_ptr);
    188 ; Vertical pass: Int0 = 3*row[0] + row[-1] and Int1 = 3*row[0] + row[+1], kept as 16-bit words.
    189 ; Horizontal pass: out[2i] = (3*Int[i] + Int[i-1] + 8) >> 4 and out[2i+1] = (3*Int[i] + Int[i+1] + 7) >> 4.
    190 ; r10 = int max_v_samp_factor
    191 ; r11d = JDIMENSION downsampled_width
    192 ; r12 = JSAMPARRAY input_data
    193 ; r13 = JSAMPARRAY *output_data_ptr
    194 ; wk(0)/wk(1) hold the left-neighbor word for the upper/lower output row; wk(2)/wk(3) hold the right-neighbor word.
    195 %define wk(i)   r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
    196 %define WK_NUM  4
    197 
    198    align       32
    199    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_sse2)
    200 
    201 EXTN(jsimd_h2v2_fancy_upsample_sse2):
    202    ENDBR64
    203    push        rbp
    204    mov         rbp, rsp
    205    push        r15                          ; saved at [rbp-8]; restored in the epilogue
    206    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    207    ; Allocate stack space for wk array.  r15 is used to access it.
    208    mov         r15, rsp                     ; wk[] occupies the WK_NUM xmmwords just below r15
    209    sub         rsp, byte (SIZEOF_XMMWORD * WK_NUM)
    210    COLLECT_ARGS 4
    211    push        rbx                     ; rbx is used as inptr0 below
    212 
    213    mov         eax, r11d               ; colctr
    214    test        rax, rax
    215    jz          near .return            ; zero width: nothing to do
    216 
    217    mov         rcx, r10                ; rowctr
    218    test        rcx, rcx
    219    jz          near .return            ; zero rows: nothing to do
    220 
    221    mov         rsi, r12                ; input_data
    222    mov         rdi, r13                ; output_data_ptr
    223    mov         rdip, JSAMPARRAY [rdi]  ; output_data
    224 .rowloop:                              ; one input row -> two output rows per iteration
    225    push        rax                     ; colctr
    226    push        rcx                     ; rowctr (rcx is reused as a row pointer below)
    227    push        rdi                     ; save output_data cursor
    228    push        rsi                     ; save input_data cursor
    229 
    230    mov         rcxp, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
    231    mov         rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
    232    mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
    233    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
    234    mov         rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
    235 ; The row pointers one slot before and one slot after the current input_data entry are both dereferenced above.
    236    test        rax, SIZEOF_XMMWORD-1   ; is the width a whole number of vector blocks?
    237    jz          short .skip
    238    push        rdx                     ; preserve outptr0 while dl is used as scratch
    239    mov         dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
    240    mov         JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
    241    mov         dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
    242    mov         JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
    243    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
    244    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
    245    pop         rdx                     ; NOTE(review): the dummy writes land one sample past the width in all three input rows; assumes padded rows - confirm
    246 .skip:
    247    ; -- process the first column block
    248 
    249    movdqa      xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD]  ; xmm0=row[ 0][0]
    250    movdqa      xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD]  ; xmm1=row[-1][0]
    251    movdqa      xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD]  ; xmm2=row[+1][0]
    252 
    253    pxor        xmm3, xmm3              ; xmm3=(all 0's)
    254    movdqa      xmm4, xmm0
    255    punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
    256    punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
    257    movdqa      xmm5, xmm1
    258    punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
    259    punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
    260    movdqa      xmm6, xmm2
    261    punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
    262    punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
    263 
    264    pmullw      xmm0, [rel PW_THREE]    ; 3 * row[0] (low half)
    265    pmullw      xmm4, [rel PW_THREE]    ; 3 * row[0] (high half)
    266 
    267    pcmpeqb     xmm7, xmm7              ; xmm7=(all 1's)
    268    psrldq      xmm7, (SIZEOF_XMMWORD-2)  ; mask selecting word 0 only
    269 
    270    paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
    271    paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
    272    paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
    273    paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
    274 
    275    movdqa      XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1  ; temporarily save
    276    movdqa      XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5  ; the intermediate data
    277    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2  ; (the output rows double as scratch space:
    278    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6  ;  16 words fill exactly this block's output span)
    279 
    280    pand        xmm1, xmm7              ; xmm1=( 0 -- -- -- -- -- -- --)
    281    pand        xmm2, xmm7              ; xmm2=( 0 -- -- -- -- -- -- --)
    282 
    283    movdqa      XMMWORD [wk(0)], xmm1   ; left neighbor for the first block (upper row) = its own word 0
    284    movdqa      XMMWORD [wk(1)], xmm2   ; left neighbor for the first block (lower row) = its own word 0
    285 
    286    add         rax, byte SIZEOF_XMMWORD-1
    287    and         rax, byte -SIZEOF_XMMWORD  ; colctr rounded up to a multiple of SIZEOF_XMMWORD
    288    cmp         rax, byte SIZEOF_XMMWORD
    289    ja          short .columnloop       ; more than one block remains
    290 
    291 .columnloop_last:
    292    ; -- process the last column block
    293 
    294    pcmpeqb     xmm1, xmm1              ; xmm1=(all 1's)
    295    pslldq      xmm1, (SIZEOF_XMMWORD-2)  ; mask selecting the top word only
    296    movdqa      xmm2, xmm1
    297 
    298    pand        xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]  ; top word of the saved Int0H
    299    pand        xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]  ; top word of the saved Int1H
    300 
    301    movdqa      XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
    302    movdqa      XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)
    303 
    304    jmp         near .upsample
    305 
    306 .columnloop:
    307    ; -- process the next column block
    308 
    309    movdqa      xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD]  ; xmm0=row[ 0][1]
    310    movdqa      xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD]  ; xmm1=row[-1][1]
    311    movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]  ; xmm2=row[+1][1]
    312 
    313    pxor        xmm3, xmm3              ; xmm3=(all 0's)
    314    movdqa      xmm4, xmm0
    315    punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
    316    punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
    317    movdqa      xmm5, xmm1
    318    punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
    319    punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
    320    movdqa      xmm6, xmm2
    321    punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
    322    punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
    323 
    324    pmullw      xmm0, [rel PW_THREE]    ; 3 * row[0] (low half)
    325    pmullw      xmm4, [rel PW_THREE]    ; 3 * row[0] (high half)
    326 
    327    paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
    328    paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
    329    paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
    330    paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
    331 
    332    movdqa      XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1  ; temporarily save
    333    movdqa      XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5  ; the intermediate data
    334    movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2  ; (stored in the next block's output span)
    335    movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6
    336 
    337    pslldq      xmm1, (SIZEOF_XMMWORD-2)  ; xmm1=(-- -- -- -- -- -- --  0)
    338    pslldq      xmm2, (SIZEOF_XMMWORD-2)  ; xmm2=(-- -- -- -- -- -- --  0)
    339 
    340    movdqa      XMMWORD [wk(2)], xmm1   ; next block's word 0 = right neighbor (upper row)
    341    movdqa      XMMWORD [wk(3)], xmm2   ; next block's word 0 = right neighbor (lower row)
    342 
    343 .upsample:
    344    ; -- process the upper row
    345 
    346    movdqa      xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    347    movdqa      xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]
    348 
    349    movdqa      xmm0, xmm7                ; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
    350    movdqa      xmm4, xmm3                ; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
    351    psrldq      xmm0, 2                   ; xmm0=( 1  2  3  4  5  6  7 --)
    352    pslldq      xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(-- -- -- -- -- -- --  8)
    353    movdqa      xmm5, xmm7
    354    movdqa      xmm6, xmm3
    355    psrldq      xmm5, (SIZEOF_XMMWORD-2)  ; xmm5=( 7 -- -- -- -- -- -- --)
    356    pslldq      xmm6, 2                   ; xmm6=(--  8  9 10 11 12 13 14)
    357 
    358    por         xmm0, xmm4                ; xmm0=( 1  2  3  4  5  6  7  8)
    359    por         xmm5, xmm6                ; xmm5=( 7  8  9 10 11 12 13 14)
    360 
    361    movdqa      xmm1, xmm7
    362    movdqa      xmm2, xmm3
    363    pslldq      xmm1, 2                   ; xmm1=(--  0  1  2  3  4  5  6)
    364    psrldq      xmm2, 2                   ; xmm2=( 9 10 11 12 13 14 15 --)
    365    movdqa      xmm4, xmm3
    366    psrldq      xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(15 -- -- -- -- -- -- --)
    367 
    368    por         xmm1, XMMWORD [wk(0)]     ; xmm1=(-1  0  1  2  3  4  5  6)
    369    por         xmm2, XMMWORD [wk(2)]     ; xmm2=( 9 10 11 12 13 14 15 16)
    370 
    371    movdqa      XMMWORD [wk(0)], xmm4     ; word 15 becomes the next block's left neighbor
    372 
    373    pmullw      xmm7, [rel PW_THREE]    ; 3 * center values (low half)
    374    pmullw      xmm3, [rel PW_THREE]    ; 3 * center values (high half)
    375    paddw       xmm1, [rel PW_EIGHT]    ; left neighbors + 8 (rounding for even outputs)
    376    paddw       xmm5, [rel PW_EIGHT]
    377    paddw       xmm0, [rel PW_SEVEN]    ; right neighbors + 7 (rounding for odd outputs)
    378    paddw       xmm2, [rel PW_SEVEN]
    379 
    380    paddw       xmm1, xmm7
    381    paddw       xmm5, xmm3
    382    psrlw       xmm1, 4                 ; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
    383    psrlw       xmm5, 4                 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
    384    paddw       xmm0, xmm7
    385    paddw       xmm2, xmm3
    386    psrlw       xmm0, 4                 ; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
    387    psrlw       xmm2, 4                 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
    388 
    389    psllw       xmm0, BYTE_BIT          ; shift odd outputs into the upper byte of each word
    390    psllw       xmm2, BYTE_BIT
    391    por         xmm1, xmm0              ; xmm1=Out0L=( 0  1  2 ... 13 14 15)
    392    por         xmm5, xmm2              ; xmm5=Out0H=(16 17 18 ... 29 30 31)
    393 
    394    movdqa      XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1  ; final samples overwrite the saved
    395    movdqa      XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5  ; intermediates of this block
    396 
    397    ; -- process the lower row
    398 
    399    movdqa      xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
    400    movdqa      xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]
    401 
    402    movdqa      xmm7, xmm6                ; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
    403    movdqa      xmm3, xmm4                ; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
    404    psrldq      xmm7, 2                   ; xmm7=( 1  2  3  4  5  6  7 --)
    405    pslldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(-- -- -- -- -- -- --  8)
    406    movdqa      xmm0, xmm6
    407    movdqa      xmm2, xmm4
    408    psrldq      xmm0, (SIZEOF_XMMWORD-2)  ; xmm0=( 7 -- -- -- -- -- -- --)
    409    pslldq      xmm2, 2                   ; xmm2=(--  8  9 10 11 12 13 14)
    410 
    411    por         xmm7, xmm3                ; xmm7=( 1  2  3  4  5  6  7  8)
    412    por         xmm0, xmm2                ; xmm0=( 7  8  9 10 11 12 13 14)
    413 
    414    movdqa      xmm1, xmm6
    415    movdqa      xmm5, xmm4
    416    pslldq      xmm1, 2                   ; xmm1=(--  0  1  2  3  4  5  6)
    417    psrldq      xmm5, 2                   ; xmm5=( 9 10 11 12 13 14 15 --)
    418    movdqa      xmm3, xmm4
    419    psrldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(15 -- -- -- -- -- -- --)
    420 
    421    por         xmm1, XMMWORD [wk(1)]     ; xmm1=(-1  0  1  2  3  4  5  6)
    422    por         xmm5, XMMWORD [wk(3)]     ; xmm5=( 9 10 11 12 13 14 15 16)
    423 
    424    movdqa      XMMWORD [wk(1)], xmm3     ; word 15 becomes the next block's left neighbor
    425 
    426    pmullw      xmm6, [rel PW_THREE]    ; 3 * center values (low half)
    427    pmullw      xmm4, [rel PW_THREE]    ; 3 * center values (high half)
    428    paddw       xmm1, [rel PW_EIGHT]    ; left neighbors + 8 (rounding for even outputs)
    429    paddw       xmm0, [rel PW_EIGHT]
    430    paddw       xmm7, [rel PW_SEVEN]    ; right neighbors + 7 (rounding for odd outputs)
    431    paddw       xmm5, [rel PW_SEVEN]
    432 
    433    paddw       xmm1, xmm6
    434    paddw       xmm0, xmm4
    435    psrlw       xmm1, 4                 ; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
    436    psrlw       xmm0, 4                 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
    437    paddw       xmm7, xmm6
    438    paddw       xmm5, xmm4
    439    psrlw       xmm7, 4                 ; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
    440    psrlw       xmm5, 4                 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
    441 
    442    psllw       xmm7, BYTE_BIT          ; shift odd outputs into the upper byte of each word
    443    psllw       xmm5, BYTE_BIT
    444    por         xmm1, xmm7              ; xmm1=Out1L=( 0  1  2 ... 13 14 15)
    445    por         xmm0, xmm5              ; xmm0=Out1H=(16 17 18 ... 29 30 31)
    446 
    447    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1  ; final samples overwrite the saved
    448    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0  ; intermediates of this block
    449 
    450    sub         rax, byte SIZEOF_XMMWORD    ; one input block consumed
    451    add         rcx, byte 1*SIZEOF_XMMWORD  ; inptr1(above)
    452    add         rbx, byte 1*SIZEOF_XMMWORD  ; inptr0
    453    add         rsi, byte 1*SIZEOF_XMMWORD  ; inptr1(below)
    454    add         rdx, byte 2*SIZEOF_XMMWORD  ; outptr0
    455    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr1
    456    cmp         rax, byte SIZEOF_XMMWORD
    457    ja          near .columnloop            ; more than one block left
    458    test        rax, rax
    459    jnz         near .columnloop_last       ; exactly one block left
    460 
    461    pop         rsi                     ; restore input_data cursor
    462    pop         rdi                     ; restore output_data cursor
    463    pop         rcx                     ; restore rowctr
    464    pop         rax                     ; restore colctr (unrounded width)
    465 
    466    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
    467    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
    468    sub         rcx, byte 2                  ; rowctr
    469    jg          near .rowloop                ; continue while rowctr > 0 (two output rows per pass)
    470 
    471 .return:
    472    pop         rbx
    473    UNCOLLECT_ARGS 4
    474    lea         rsp, [rbp-8]            ; discard wk[] and alignment padding; rsp -> saved r15
    475    pop         r15
    476    pop         rbp
    477    ret
    478 
    479 ; --------------------------------------------------------------------------
    480 ;
    481 ; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
    482 ; It's still a box filter.
    483 ;
    484 ; GLOBAL(void)
    485 ; jsimd_h2v1_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
    486 ;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
    487 ; Every input byte is written twice in a row to the output (punpck?bw of a register with itself).
    488 ; The column loop is unrolled twice; each half turns one input xmmword into two output xmmwords.
    489 ; r10 = int max_v_samp_factor
    490 ; r11d = JDIMENSION output_width
    491 ; r12 = JSAMPARRAY input_data
    492 ; r13 = JSAMPARRAY *output_data_ptr
    493 ; NOTE(review): output_width is rounded up to 2*SIZEOF_XMMWORD, so stores can extend past output_width; assumes padded rows - confirm
    494    align       32
    495    GLOBAL_FUNCTION(jsimd_h2v1_upsample_sse2)
    496 
    497 EXTN(jsimd_h2v1_upsample_sse2):
    498    ENDBR64
    499    push        rbp
    500    mov         rbp, rsp
    501    COLLECT_ARGS 4
    502 
    503    mov         edx, r11d                        ; rdx = output_width (32-bit move zero-extends)
    504    add         rdx, byte (2*SIZEOF_XMMWORD)-1
    505    and         rdx, byte -(2*SIZEOF_XMMWORD)    ; round up to a multiple of 2*SIZEOF_XMMWORD
    506    jz          near .return                     ; zero width: nothing to do
    507 
    508    mov         rcx, r10                ; rowctr
    509    test        rcx, rcx
    510    jz          short .return           ; zero rows: nothing to do
    511 
    512    mov         rsi, r12                ; input_data
    513    mov         rdi, r13                ; output_data_ptr
    514    mov         rdip, JSAMPARRAY [rdi]  ; output_data
    515 .rowloop:                              ; one input row -> one output row per iteration
    516    push        rdi                     ; save output_data cursor (rdi becomes outptr)
    517    push        rsi                     ; save input_data cursor (rsi becomes inptr)
    518 
    519    mov         rsip, JSAMPROW [rsi]    ; inptr
    520    mov         rdip, JSAMPROW [rdi]    ; outptr
    521    mov         rax, rdx                ; colctr
    522 .columnloop:
    523 
    524    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    525 
    526    movdqa      xmm1, xmm0
    527    punpcklbw   xmm0, xmm0              ; low 8 bytes, each duplicated
    528    punpckhbw   xmm1, xmm1              ; high 8 bytes, each duplicated
    529 
    530    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
    531    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
    532 
    533    sub         rax, byte 2*SIZEOF_XMMWORD  ; two output xmmwords written
    534    jz          short .nextrow
    535 
    536    movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]  ; second (unrolled) half
    537 
    538    movdqa      xmm3, xmm2
    539    punpcklbw   xmm2, xmm2              ; low 8 bytes, each duplicated
    540    punpckhbw   xmm3, xmm3              ; high 8 bytes, each duplicated
    541 
    542    movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
    543    movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
    544 
    545    sub         rax, byte 2*SIZEOF_XMMWORD  ; two more output xmmwords written
    546    jz          short .nextrow
    547 
    548    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
    549    add         rdi, byte 4*SIZEOF_XMMWORD  ; outptr
    550    jmp         short .columnloop
    551 
    552 .nextrow:
    553    pop         rsi                     ; restore input_data cursor
    554    pop         rdi                     ; restore output_data cursor
    555 
    556    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
    557    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
    558    dec         rcx                        ; rowctr
    559    jg          short .rowloop             ; continue while rowctr > 0
    560 
    561 .return:
    562    UNCOLLECT_ARGS 4
    563    pop         rbp
    564    ret
    565 
    566 ; --------------------------------------------------------------------------
    567 ;
    568 ; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
    569 ; It's still a box filter.
    570 ;
    571 ; GLOBAL(void)
    572 ; jsimd_h2v2_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
    573 ;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
    574 ; Every input byte is written twice in a row, and the same doubled data is stored to two
    575 ; consecutive output rows.  The column loop is unrolled twice.
    576 ; r10 = int max_v_samp_factor
    577 ; r11d = JDIMENSION output_width
    578 ; r12 = JSAMPARRAY input_data
    579 ; r13 = JSAMPARRAY *output_data_ptr
    580 ; NOTE(review): output_width is rounded up to 2*SIZEOF_XMMWORD, so stores can extend past output_width; assumes padded rows - confirm
    581    align       32
    582    GLOBAL_FUNCTION(jsimd_h2v2_upsample_sse2)
    583 
    584 EXTN(jsimd_h2v2_upsample_sse2):
    585    ENDBR64
    586    push        rbp
    587    mov         rbp, rsp
    588    COLLECT_ARGS 4
    589    push        rbx                     ; rbx is used as outptr0 below
    590 
    591    mov         edx, r11d                        ; rdx = output_width (32-bit move zero-extends)
    592    add         rdx, byte (2*SIZEOF_XMMWORD)-1
    593    and         rdx, byte -(2*SIZEOF_XMMWORD)    ; round up to a multiple of 2*SIZEOF_XMMWORD
    594    jz          near .return                     ; zero width: nothing to do
    595 
    596    mov         rcx, r10                ; rowctr
    597    test        rcx, rcx
    598    jz          near .return            ; zero rows: nothing to do
    599 
    600    mov         rsi, r12                ; input_data
    601    mov         rdi, r13                ; output_data_ptr
    602    mov         rdip, JSAMPARRAY [rdi]  ; output_data
    603 .rowloop:                              ; one input row -> two output rows per iteration
    604    push        rdi                     ; save output_data cursor (rdi becomes outptr1)
    605    push        rsi                     ; save input_data cursor (rsi becomes inptr)
    606 
    607    mov         rsip, JSAMPROW [rsi]                   ; inptr
    608    mov         rbxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
    609    mov         rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
    610    mov         rax, rdx                               ; colctr
    611 .columnloop:
    612 
    613    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    614 
    615    movdqa      xmm1, xmm0
    616    punpcklbw   xmm0, xmm0              ; low 8 bytes, each duplicated
    617    punpckhbw   xmm1, xmm1              ; high 8 bytes, each duplicated
    618 
    619    movdqa      XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0  ; upper output row
    620    movdqa      XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
    621    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0  ; lower output row gets the same data
    622    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
    623 
    624    sub         rax, byte 2*SIZEOF_XMMWORD  ; two output xmmwords written per row
    625    jz          short .nextrow
    626 
    627    movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]  ; second (unrolled) half
    628 
    629    movdqa      xmm3, xmm2
    630    punpcklbw   xmm2, xmm2              ; low 8 bytes, each duplicated
    631    punpckhbw   xmm3, xmm3              ; high 8 bytes, each duplicated
    632 
    633    movdqa      XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2  ; upper output row
    634    movdqa      XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
    635    movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2  ; lower output row gets the same data
    636    movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
    637 
    638    sub         rax, byte 2*SIZEOF_XMMWORD  ; two more output xmmwords written per row
    639    jz          short .nextrow
    640 
    641    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
    642    add         rbx, byte 4*SIZEOF_XMMWORD  ; outptr0
    643    add         rdi, byte 4*SIZEOF_XMMWORD  ; outptr1
    644    jmp         short .columnloop
    645 
    646 .nextrow:
    647    pop         rsi                     ; restore input_data cursor
    648    pop         rdi                     ; restore output_data cursor
    649 
    650    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
    651    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
    652    sub         rcx, byte 2                  ; rowctr
    653    jg          near .rowloop                ; continue while rowctr > 0 (two output rows per pass)
    654 
    655 .return:
    656    pop         rbx
    657    UNCOLLECT_ARGS 4
    658    pop         rbp
    659    ret
    660 
    661 ; For some reason, the OS X linker does not honor the request to align the
    662 ; segment unless we do this.
    663    align       32