tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jcsample-sse2.asm (8848B)


      1 ;
      2 ; jcsample-sse2.asm - downsampling (64-bit SSE2)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ; Copyright (C) 2009, 2016, 2024, D. R. Commander.
      6 ; Copyright (C) 2018, Matthias Räncker.
      7 ;
      8 ; Based on the x86 SIMD extension for IJG JPEG library
      9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
     10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     11 ;
     12 ; This file should be assembled with NASM (Netwide Assembler) or Yasm.
     13 
     14 %include "jsimdext.inc"
     15 
     16 ; --------------------------------------------------------------------------
     17    SECTION     SEG_TEXT
     18    BITS        64
     19 ;
     20 ; Downsample pixel values of a single component.
     21 ; This version handles the common case of 2:1 horizontal and 1:1 vertical,
     22 ; without smoothing.
     23 ;
     24 ; GLOBAL(void)
     25 ; jsimd_h2v1_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
     26 ;                            JDIMENSION v_samp_factor,
     27 ;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
     28 ;                            JSAMPARRAY output_data);
     29 ;
     30 
     31 ; r10d = JDIMENSION image_width
     32 ; r11 = int max_v_samp_factor
     33 ; r12d = JDIMENSION v_samp_factor
     34 ; r13d = JDIMENSION width_in_blocks
     35 ; r14 = JSAMPARRAY input_data
     36 ; r15 = JSAMPARRAY output_data
     37 
     38    align       32
     39    GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2)
     40 
     41 EXTN(jsimd_h2v1_downsample_sse2):
     42    ENDBR64
     43    push        rbp
     44    mov         rbp, rsp
     45    COLLECT_ARGS 6                      ; arguments are now in r10-r15 (listed above)
     46 
     47    mov         ecx, r13d               ; rcx = width_in_blocks, zero-extended
     48    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
     49    jz          near .return            ; zero output columns: nothing to do
     50 
     51    mov         edx, r10d               ; rdx = image_width, zero-extended
     52 
     53    ; -- expand_right_edge
     54    ; Pad every input row out to output_cols*2 samples with its last real sample.
     55    push        rcx                     ; save output_cols
     56    shl         rcx, 1                  ; output_cols * 2
     57    sub         rcx, rdx                ; rcx = padding samples needed per row
     58    jle         short .expand_end       ; rows are already wide enough
     59    ; max_v_samp_factor is a 32-bit int: only r11d is defined by the ABI.
     60    movsxd      rax, r11d               ; rax = rows to pad (sign-extended)
     61    test        rax, rax
     62    jle         short .expand_end       ; no rows to pad
     63 
     64    cld                                 ; rep stosb must move rdi upward
     65    mov         rsi, r14                ; input_data
     66 .expandloop:
     67    push        rax                     ; al is reused as the fill value
     68    push        rcx                     ; rep stosb counts rcx down to zero
     69 
     70    mov         rdip, JSAMPROW [rsi]    ; start of the current input row
     71    add         rdi, rdx                ; first sample past image_width
     72    mov         al, JSAMPLE [rdi-1]     ; last real sample of the row
     73 
     74    rep stosb                           ; store rcx copies of al at [rdi]
     75 
     76    pop         rcx
     77    pop         rax
     78 
     79    add         rsi, byte SIZEOF_JSAMPROW   ; advance to the next row pointer
     80    dec         rax
     81    jg          short .expandloop
     82 
     83 .expand_end:
     84    pop         rcx                     ; output_cols
     85 
     86    ; -- h2v1_downsample
     87    ; Each output sample is (left + right + bias) >> 1 of one horizontal pair.
     88    mov         eax, r12d               ; rowctr
     89    test        eax, eax
     90    jle         near .return
     91 
     92    mov         rdx, 0x00010000         ; bias pattern
     93    movd        xmm7, edx
     94    pcmpeqw     xmm6, xmm6
     95    pshufd      xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
     96    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
     97 
     98    mov         rsi, r14                ; input_data
     99    mov         rdi, r15                ; output_data
    100 .rowloop:
    101    push        rcx                     ; keep output_cols for the next row
    102    push        rdi                     ; keep position in output_data
    103    push        rsi                     ; keep position in input_data
    104 
    105    mov         rsip, JSAMPROW [rsi]    ; inptr
    106    mov         rdip, JSAMPROW [rdi]    ; outptr
    107 
    108    cmp         rcx, byte SIZEOF_XMMWORD
    109    jae         short .columnloop       ; a full output vector remains
    110    ; Tail: fewer than SIZEOF_XMMWORD output columns remain.
    111 .columnloop_r8:
    112    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    113    pxor        xmm1, xmm1              ; second input vector is taken as zero
    114    mov         rcx, SIZEOF_XMMWORD     ; makes the sub below leave rcx = 0
    115    jmp         short .downsample
    116 
    117 .columnloop:
    118    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    119    movdqa      xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    120 
    121 .downsample:
    122    movdqa      xmm2, xmm0
    123    movdqa      xmm3, xmm1
    124 
    125    pand        xmm0, xmm6              ; even-offset samples, widened to words
    126    psrlw       xmm2, BYTE_BIT          ; odd-offset samples, widened to words
    127    pand        xmm1, xmm6
    128    psrlw       xmm3, BYTE_BIT
    129 
    130    paddw       xmm0, xmm2              ; sum of each horizontal pair
    131    paddw       xmm1, xmm3
    132    paddw       xmm0, xmm7              ; add the alternating 0/1 bias
    133    paddw       xmm1, xmm7
    134    psrlw       xmm0, 1                 ; divide by 2
    135    psrlw       xmm1, 1
    136 
    137    packuswb    xmm0, xmm1              ; pack both word vectors back to bytes
    138 
    139    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
    140 
    141    sub         rcx, byte SIZEOF_XMMWORD    ; outcol
    142    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
    143    add         rdi, byte 1*SIZEOF_XMMWORD  ; outptr
    144    cmp         rcx, byte SIZEOF_XMMWORD
    145    jae         short .columnloop
    146    test        rcx, rcx
    147    jnz         short .columnloop_r8    ; partial vector left: take the tail path
    148 
    149    pop         rsi
    150    pop         rdi
    151    pop         rcx
    152 
    153    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
    154    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
    155    dec         rax                        ; rowctr
    156    jg          near .rowloop
    157 
    158 .return:
    159    UNCOLLECT_ARGS 6
    160    pop         rbp
    161    ret
    162 
    163 ; --------------------------------------------------------------------------
    164 ;
    165 ; Downsample pixel values of a single component.
    166 ; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
    167 ; without smoothing.
    168 ;
    169 ; GLOBAL(void)
    170 ; jsimd_h2v2_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
    171 ;                            JDIMENSION v_samp_factor,
    172 ;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
    173 ;                            JSAMPARRAY output_data);
    174 ;
    175 
    176 ; r10d = JDIMENSION image_width
    177 ; r11 = int max_v_samp_factor
    178 ; r12d = JDIMENSION v_samp_factor
    179 ; r13d = JDIMENSION width_in_blocks
    180 ; r14 = JSAMPARRAY input_data
    181 ; r15 = JSAMPARRAY output_data
    182 
    183    align       32
    184    GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2)
    185 
    186 EXTN(jsimd_h2v2_downsample_sse2):
    187    ENDBR64
    188    push        rbp
    189    mov         rbp, rsp
    190    COLLECT_ARGS 6                      ; arguments are now in r10-r15 (listed above)
    191 
    192    mov         ecx, r13d               ; rcx = width_in_blocks, zero-extended
    193    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
    194    jz          near .return            ; zero output columns: nothing to do
    195 
    196    mov         edx, r10d               ; rdx = image_width, zero-extended
    197 
    198    ; -- expand_right_edge
    199    ; Pad every input row out to output_cols*2 samples with its last real sample.
    200    push        rcx                     ; save output_cols
    201    shl         rcx, 1                  ; output_cols * 2
    202    sub         rcx, rdx                ; rcx = padding samples needed per row
    203    jle         short .expand_end       ; rows are already wide enough
    204    ; max_v_samp_factor is a 32-bit int: only r11d is defined by the ABI.
    205    movsxd      rax, r11d               ; rax = rows to pad (sign-extended)
    206    test        rax, rax
    207    jle         short .expand_end       ; no rows to pad
    208 
    209    cld                                 ; rep stosb must move rdi upward
    210    mov         rsi, r14                ; input_data
    211 .expandloop:
    212    push        rax                     ; al is reused as the fill value
    213    push        rcx                     ; rep stosb counts rcx down to zero
    214 
    215    mov         rdip, JSAMPROW [rsi]    ; start of the current input row
    216    add         rdi, rdx                ; first sample past image_width
    217    mov         al, JSAMPLE [rdi-1]     ; last real sample of the row
    218 
    219    rep stosb                           ; store rcx copies of al at [rdi]
    220 
    221    pop         rcx
    222    pop         rax
    223 
    224    add         rsi, byte SIZEOF_JSAMPROW   ; advance to the next row pointer
    225    dec         rax
    226    jg          short .expandloop
    227 
    228 .expand_end:
    229    pop         rcx                     ; output_cols
    230 
    231    ; -- h2v2_downsample
    232    ; Each output sample is (sum of a 2x2 input square + bias) >> 2.
    233    mov         eax, r12d               ; rowctr
    234    test        eax, eax                ; same 32-bit check as the h2v1 routine
    235    jle         near .return
    236 
    237    mov         rdx, 0x00020001         ; bias pattern
    238    movd        xmm7, edx
    239    pcmpeqw     xmm6, xmm6
    240    pshufd      xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
    241    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
    242 
    243    mov         rsi, r14                ; input_data
    244    mov         rdi, r15                ; output_data
    245 .rowloop:
    246    push        rcx                     ; keep output_cols for the next row
    247    push        rdi                     ; keep position in output_data
    248    push        rsi                     ; keep position in input_data
    249 
    250    mov         rdxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
    251    mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1
    252    mov         rdip, JSAMPROW [rdi]                    ; outptr
    253 
    254    cmp         rcx, byte SIZEOF_XMMWORD
    255    jae         short .columnloop       ; a full output vector remains
    256    ; Tail: fewer than SIZEOF_XMMWORD output columns remain.
    257 .columnloop_r8:
    258    movdqa      xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    259    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    260    pxor        xmm2, xmm2              ; second vector of row 0 taken as zero
    261    pxor        xmm3, xmm3              ; second vector of row 1 taken as zero
    262    mov         rcx, SIZEOF_XMMWORD     ; makes the sub below leave rcx = 0
    263    jmp         short .downsample
    264 
    265 .columnloop:
    266    movdqa      xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    267    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    268    movdqa      xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
    269    movdqa      xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    270 
    271 .downsample:
    272    movdqa      xmm4, xmm0
    273    movdqa      xmm5, xmm1
    274    pand        xmm0, xmm6              ; even-offset samples, widened to words
    275    psrlw       xmm4, BYTE_BIT          ; odd-offset samples, widened to words
    276    pand        xmm1, xmm6
    277    psrlw       xmm5, BYTE_BIT
    278    paddw       xmm0, xmm4              ; horizontal pair sums, row 0, first vector
    279    paddw       xmm1, xmm5              ; horizontal pair sums, row 1, first vector
    280 
    281    movdqa      xmm4, xmm2
    282    movdqa      xmm5, xmm3
    283    pand        xmm2, xmm6
    284    psrlw       xmm4, BYTE_BIT
    285    pand        xmm3, xmm6
    286    psrlw       xmm5, BYTE_BIT
    287    paddw       xmm2, xmm4              ; horizontal pair sums, row 0, second vector
    288    paddw       xmm3, xmm5              ; horizontal pair sums, row 1, second vector
    289 
    290    paddw       xmm0, xmm1              ; add the two rows: 2x2 sums
    291    paddw       xmm2, xmm3
    292    paddw       xmm0, xmm7              ; add the alternating 1/2 bias
    293    paddw       xmm2, xmm7
    294    psrlw       xmm0, 2                 ; divide by 4
    295    psrlw       xmm2, 2
    296 
    297    packuswb    xmm0, xmm2              ; pack both word vectors back to bytes
    298 
    299    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
    300 
    301    sub         rcx, byte SIZEOF_XMMWORD    ; outcol
    302    add         rdx, byte 2*SIZEOF_XMMWORD  ; inptr0
    303    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr1
    304    add         rdi, byte 1*SIZEOF_XMMWORD  ; outptr
    305    cmp         rcx, byte SIZEOF_XMMWORD
    306    jae         near .columnloop
    307    test        rcx, rcx
    308    jnz         near .columnloop_r8     ; partial vector left: take the tail path
    309 
    310    pop         rsi
    311    pop         rdi
    312    pop         rcx
    313 
    314    add         rsi, byte 2*SIZEOF_JSAMPROW  ; input_data
    315    add         rdi, byte 1*SIZEOF_JSAMPROW  ; output_data
    316    dec         rax                          ; rowctr
    317    jg          near .rowloop
    318 
    319 .return:
    320    UNCOLLECT_ARGS 6
    321    pop         rbp
    322    ret
    323 
    324 ; For some reason, the OS X linker does not honor the request to align the
    325 ; segment unless we do this.
    326    align       32